From ba15dfd0be8d08390fe29c88a8e82b1089af3a4c Mon Sep 17 00:00:00 2001
From: niansa <anton-sa@web.de>
Date: Thu, 22 Jun 2023 12:58:07 +0200
Subject: [PATCH 001/140] Nomic vulkan backend licensed under the Software for
 Open Models License (SOM), version 1.0.

---
 .gitmodules                                   |    0
 CMakeLists.txt                                |  123 ++
 LICENSE_SOM.txt                               |   30 +
 examples/main/main.cpp                        |    8 +
 ggml-vulkan.cpp                               | 1313 +++++++++++++++++
 ggml-vulkan.h                                 |   61 +
 ggml.c                                        |   32 +-
 kompute/.ccls                                 |   27 +
 kompute/.clang-format                         |    5 +
 kompute/.dockerignore                         |    4 +
 kompute/.github/workflows/cpp_examples.yml    |   58 +
 kompute/.github/workflows/cpp_tests.yml       |  104 ++
 kompute/.github/workflows/python_tests.yml    |   28 +
 kompute/CMakeLists.txt                        |  187 +++
 kompute/LICENSE                               |  203 +++
 kompute/Makefile                              |  210 +++
 kompute/README.md                             |  513 +++++++
 kompute/cmake/bin2h.cmake                     |  106 ++
 kompute/cmake/bin_file_to_header.cmake        |   19 +
 kompute/cmake/check_vulkan_version.cmake      |  139 ++
 kompute/cmake/code_coverage.cmake             |   35 +
 kompute/cmake/deprecation_warnings.cmake      |   15 +
 kompute/cmake/komputeConfig.cmake.in          |    8 +
 kompute/cmake/vulkan_shader_compiler.cmake    |   43 +
 kompute/config/FindSphinx.cmake               |   16 +
 kompute/external/bin/xxd.c                    |  819 ++++++++++
 kompute/kompute-config.cmake                  |   28 +
 kompute/op_add.comp                           |  145 ++
 kompute/op_addrow.comp                        |  145 ++
 kompute/op_cpy_f16_f16.comp                   |  176 +++
 kompute/op_cpy_f16_f32.comp                   |  176 +++
 kompute/op_cpy_f32_f16.comp                   |  176 +++
 kompute/op_cpy_f32_f32.comp                   |  168 +++
 kompute/op_diagmask.comp                      |  153 ++
 kompute/op_gelu.comp                          |  142 ++
 kompute/op_getrows_f16.comp                   |  150 ++
 kompute/op_getrows_q4_0.comp                  |  179 +++
 kompute/op_getrows_q4_1.comp                  |  181 +++
 kompute/op_mul.comp                           |  145 ++
 kompute/op_mul_mat_f16.comp                   |  177 +++
 kompute/op_mul_mat_q4_0.comp                  |  195 +++
 kompute/op_mul_mat_q4_1.comp                  |  218 +++
 kompute/op_mulrow.comp                        |  145 ++
 kompute/op_norm.comp                          |  209 +++
 kompute/op_relu.comp                          |  141 ++
 kompute/op_rmsnorm.comp                       |  178 +++
 kompute/op_rope.comp                          |  183 +++
 kompute/op_scale.comp                         |  142 ++
 kompute/op_silu.comp                          |  141 ++
 kompute/op_softmax.comp                       |  197 +++
 kompute/scripts/convert_shaders.py            |  148 ++
 kompute/scripts/requirements.txt              |   11 +
 kompute/setup.py                              |   93 ++
 kompute/src/Algorithm.cpp                     |  450 ++++++
 kompute/src/CMakeLists.txt                    |   82 +
 kompute/src/Core.cpp                          |   27 +
 kompute/src/Manager.cpp                       |  493 +++++++
 kompute/src/OpAlgoDispatch.cpp                |   65 +
 kompute/src/OpBufferSyncDevice.cpp            |   51 +
 kompute/src/OpBufferSyncLocal.cpp             |   51 +
 kompute/src/OpMemoryBarrier.cpp               |   74 +
 kompute/src/OpTensorCopy.cpp                  |   90 ++
 kompute/src/OpTensorSyncDevice.cpp            |   61 +
 kompute/src/OpTensorSyncLocal.cpp             |   76 +
 kompute/src/Sequence.cpp                      |  396 +++++
 kompute/src/Tensor.cpp                        |  451 ++++++
 kompute/src/include/CMakeLists.txt            |   46 +
 kompute/src/include/kompute/Algorithm.hpp     |  338 +++++
 kompute/src/include/kompute/Core.hpp          |   39 +
 kompute/src/include/kompute/Kompute.hpp       |   21 +
 kompute/src/include/kompute/Manager.hpp       |  267 ++++
 kompute/src/include/kompute/Sequence.hpp      |  313 ++++
 kompute/src/include/kompute/Tensor.hpp        |  306 ++++
 kompute/src/include/kompute/logger/Logger.hpp |  197 +++
 .../kompute/operations/OpAlgoDispatch.hpp     |   86 ++
 .../src/include/kompute/operations/OpBase.hpp |   62 +
 .../kompute/operations/OpBufferSyncDevice.hpp |   50 +
 .../kompute/operations/OpBufferSyncLocal.hpp  |   50 +
 .../kompute/operations/OpMemoryBarrier.hpp    |   81 +
 .../src/include/kompute/operations/OpMult.hpp |   58 +
 .../kompute/operations/OpTensorCopy.hpp       |   63 +
 .../kompute/operations/OpTensorSyncDevice.hpp |   66 +
 .../kompute/operations/OpTensorSyncLocal.hpp  |   66 +
 kompute/src/logger/CMakeLists.txt             |   69 +
 kompute/src/logger/Logger.cpp                 |  101 ++
 kompute/src/shaders/CMakeLists.txt            |    5 +
 kompute/src/shaders/glsl/CMakeLists.txt       |   26 +
 .../glsl/ShaderLogisticRegression.comp        |   52 +
 .../glsl/ShaderLogisticRegression.hpp.in      |  310 ++++
 kompute/src/shaders/glsl/ShaderOpMult.comp    |   28 +
 kompute/src/shaders/glsl/ShaderOpMult.hpp.in  |  101 ++
 kompute/src/shaders/hlsl/computeheadless.comp |   29 +
 llama.cpp                                     |   47 +-
 llama.h                                       |    2 +-
 undump.py                                     |   18 +
 95 files changed, 13489 insertions(+), 23 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 LICENSE_SOM.txt
 create mode 100644 ggml-vulkan.cpp
 create mode 100644 ggml-vulkan.h
 create mode 100644 kompute/.ccls
 create mode 100644 kompute/.clang-format
 create mode 100644 kompute/.dockerignore
 create mode 100644 kompute/.github/workflows/cpp_examples.yml
 create mode 100644 kompute/.github/workflows/cpp_tests.yml
 create mode 100644 kompute/.github/workflows/python_tests.yml
 create mode 100644 kompute/CMakeLists.txt
 create mode 100644 kompute/LICENSE
 create mode 100644 kompute/Makefile
 create mode 100644 kompute/README.md
 create mode 100644 kompute/cmake/bin2h.cmake
 create mode 100644 kompute/cmake/bin_file_to_header.cmake
 create mode 100644 kompute/cmake/check_vulkan_version.cmake
 create mode 100644 kompute/cmake/code_coverage.cmake
 create mode 100644 kompute/cmake/deprecation_warnings.cmake
 create mode 100644 kompute/cmake/komputeConfig.cmake.in
 create mode 100644 kompute/cmake/vulkan_shader_compiler.cmake
 create mode 100644 kompute/config/FindSphinx.cmake
 create mode 100644 kompute/external/bin/xxd.c
 create mode 100644 kompute/kompute-config.cmake
 create mode 100644 kompute/op_add.comp
 create mode 100644 kompute/op_addrow.comp
 create mode 100644 kompute/op_cpy_f16_f16.comp
 create mode 100644 kompute/op_cpy_f16_f32.comp
 create mode 100644 kompute/op_cpy_f32_f16.comp
 create mode 100644 kompute/op_cpy_f32_f32.comp
 create mode 100644 kompute/op_diagmask.comp
 create mode 100644 kompute/op_gelu.comp
 create mode 100644 kompute/op_getrows_f16.comp
 create mode 100644 kompute/op_getrows_q4_0.comp
 create mode 100644 kompute/op_getrows_q4_1.comp
 create mode 100644 kompute/op_mul.comp
 create mode 100644 kompute/op_mul_mat_f16.comp
 create mode 100644 kompute/op_mul_mat_q4_0.comp
 create mode 100644 kompute/op_mul_mat_q4_1.comp
 create mode 100644 kompute/op_mulrow.comp
 create mode 100644 kompute/op_norm.comp
 create mode 100644 kompute/op_relu.comp
 create mode 100644 kompute/op_rmsnorm.comp
 create mode 100644 kompute/op_rope.comp
 create mode 100644 kompute/op_scale.comp
 create mode 100644 kompute/op_silu.comp
 create mode 100644 kompute/op_softmax.comp
 create mode 100644 kompute/scripts/convert_shaders.py
 create mode 100644 kompute/scripts/requirements.txt
 create mode 100644 kompute/setup.py
 create mode 100644 kompute/src/Algorithm.cpp
 create mode 100644 kompute/src/CMakeLists.txt
 create mode 100644 kompute/src/Core.cpp
 create mode 100644 kompute/src/Manager.cpp
 create mode 100644 kompute/src/OpAlgoDispatch.cpp
 create mode 100644 kompute/src/OpBufferSyncDevice.cpp
 create mode 100644 kompute/src/OpBufferSyncLocal.cpp
 create mode 100644 kompute/src/OpMemoryBarrier.cpp
 create mode 100644 kompute/src/OpTensorCopy.cpp
 create mode 100644 kompute/src/OpTensorSyncDevice.cpp
 create mode 100644 kompute/src/OpTensorSyncLocal.cpp
 create mode 100644 kompute/src/Sequence.cpp
 create mode 100644 kompute/src/Tensor.cpp
 create mode 100644 kompute/src/include/CMakeLists.txt
 create mode 100644 kompute/src/include/kompute/Algorithm.hpp
 create mode 100644 kompute/src/include/kompute/Core.hpp
 create mode 100644 kompute/src/include/kompute/Kompute.hpp
 create mode 100644 kompute/src/include/kompute/Manager.hpp
 create mode 100644 kompute/src/include/kompute/Sequence.hpp
 create mode 100644 kompute/src/include/kompute/Tensor.hpp
 create mode 100644 kompute/src/include/kompute/logger/Logger.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpBase.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpMult.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpTensorCopy.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
 create mode 100644 kompute/src/logger/CMakeLists.txt
 create mode 100644 kompute/src/logger/Logger.cpp
 create mode 100644 kompute/src/shaders/CMakeLists.txt
 create mode 100644 kompute/src/shaders/glsl/CMakeLists.txt
 create mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.comp
 create mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
 create mode 100644 kompute/src/shaders/glsl/ShaderOpMult.comp
 create mode 100644 kompute/src/shaders/glsl/ShaderOpMult.hpp.in
 create mode 100644 kompute/src/shaders/hlsl/computeheadless.comp
 create mode 100644 undump.py

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4a649a976275..88585fb933495 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,6 +86,7 @@ option(LLAMA_HIPBLAS                         "llama: use hipBLAS"
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
+option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
@@ -412,6 +413,127 @@ if (LLAMA_HIPBLAS)
     endif()
 endif()
 
+if (LLAMA_KOMPUTE)
+    find_package(Vulkan COMPONENTS glslc REQUIRED)
+    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
+    if (NOT glslc_executable)
+        message(FATAL_ERROR "glslc not found")
+    endif()
+
+    function(compile_shader)
+      set(options)
+      set(oneValueArgs)
+      set(multiValueArgs SOURCES)
+      cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+      foreach(source ${compile_shader_SOURCES})
+        set(spv_file ${source}.spv)
+        add_custom_command(
+            OUTPUT ${spv_file}
+            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            COMMENT "Compiling ${source} to ${source}.spv"
+        )
+
+        get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
+        set(FILE_NAME "shader${RAW_FILE_NAME}")
+        string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
+        string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
+        string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
+        set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
+        message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
+        add_custom_command(
+          OUTPUT ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+          COMMAND xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          DEPENDS ${spv_file}
+          COMMENT "Converting to hpp: ${FILE_NAME}"
+        )
+      endforeach()
+    endfunction()
+
+    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
+        message(STATUS "Kompute found")
+        add_subdirectory(kompute)
+
+        # Compile our shaders
+        compile_shader(SOURCES
+          kompute/op_scale.comp
+          kompute/op_add.comp
+          kompute/op_addrow.comp
+          kompute/op_mul.comp
+          kompute/op_mulrow.comp
+          kompute/op_silu.comp
+          kompute/op_relu.comp
+          kompute/op_gelu.comp
+          kompute/op_softmax.comp
+          kompute/op_norm.comp
+          kompute/op_rmsnorm.comp
+          kompute/op_diagmask.comp
+          kompute/op_mul_mat_f16.comp
+          kompute/op_mul_mat_q4_0.comp
+          kompute/op_mul_mat_q4_1.comp
+          kompute/op_getrows_f16.comp
+          kompute/op_getrows_q4_0.comp
+          kompute/op_getrows_q4_1.comp
+          kompute/op_rope.comp
+          kompute/op_cpy_f16_f16.comp
+          kompute/op_cpy_f16_f32.comp
+          kompute/op_cpy_f32_f16.comp
+          kompute/op_cpy_f32_f32.comp
+        )
+
+        # Create a custom target for our generated shaders
+        add_custom_target(generated_shaders DEPENDS
+          shaderop_scale.h
+          shaderop_add.h
+          shaderop_addrow.h
+          shaderop_mul.h
+          shaderop_mulrow.h
+          shaderop_silu.h
+          shaderop_relu.h
+          shaderop_gelu.h
+          shaderop_softmax.h
+          shaderop_norm.h
+          shaderop_rmsnorm.h
+          shaderop_diagmask.h
+          shaderop_mul_mat_f16.h
+          shaderop_mul_mat_q4_0.h
+          shaderop_mul_mat_q4_1.h
+          shaderop_getrows_f16.h
+          shaderop_getrows_q4_0.h
+          shaderop_getrows_q4_1.h
+          shaderop_rope.h
+          shaderop_cpy_f16_f16.h
+          shaderop_cpy_f16_f32.h
+          shaderop_cpy_f32_f16.h
+          shaderop_cpy_f32_f32.h
+        )
+
+        # Create a custom command that depends on the generated_shaders
+        add_custom_command(
+            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
+            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
+            DEPENDS generated_shaders
+            COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
+        )
+
+        # Add the stamp to the main sources to ensure dependency tracking
+        set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
+        set(GGML_HEADERS_KOMPUTE ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
+        add_compile_definitions(GGML_USE_KOMPUTE)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
+    else()
+        message(WARNING "Kompute not found")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
@@ -648,6 +770,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
             )
 
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
diff --git a/LICENSE_SOM.txt b/LICENSE_SOM.txt
new file mode 100644
index 0000000000000..eb912c0fd9d30
--- /dev/null
+++ b/LICENSE_SOM.txt
@@ -0,0 +1,30 @@
+Software for Open Models License (SOM)
+Version 1.0 dated August 30th, 2023
+
+This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
+
+This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
+
+1. Definitions
+The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
+A “Model” is the output of a machine learning algorithm, and excludes the Software.
+“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
+“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
+
+2. Grant of Rights. Subject to the conditions and limitations in section 3:
+(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
+
+(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software.  No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
+
+3. Conditions and Limitations
+(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms. 
+
+(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
+
+(C) No Trademark License. This license does not grant you rights to use the Licensor’s name, logo, or trademarks.
+
+(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
+
+(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
+
+(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index d78112260de08..16f8fc72bf0d8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -33,6 +33,10 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#if defined(GGML_USE_KOMPUTE)
+#include "ggml-vulkan.h"
+#endif
+
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
 static gpt_params               * g_params;
@@ -171,6 +175,10 @@ int main(int argc, char ** argv) {
     g_model = &model;
     g_ctx = &ctx;
 
+#if defined(GGML_USE_KOMPUTE)
+    ggml_vk_init_device(0, "gpu");
+#endif
+
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
new file mode 100644
index 0000000000000..32590d03ec1ab
--- /dev/null
+++ b/ggml-vulkan.cpp
@@ -0,0 +1,1313 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "ggml-vulkan.h"
+#include "ggml.h"
+
+// These are generated at build time by cmake custom command
+#include "shaderop_scale.h"
+#include "shaderop_add.h"
+#include "shaderop_addrow.h"
+#include "shaderop_mul.h"
+#include "shaderop_mulrow.h"
+#include "shaderop_silu.h"
+#include "shaderop_relu.h"
+#include "shaderop_gelu.h"
+#include "shaderop_softmax.h"
+#include "shaderop_norm.h"
+#include "shaderop_rmsnorm.h"
+#include "shaderop_diagmask.h"
+#include "shaderop_mul_mat_f16.h"
+#include "shaderop_mul_mat_q4_0.h"
+#include "shaderop_mul_mat_q4_1.h"
+#include "shaderop_getrows_f16.h"
+#include "shaderop_getrows_q4_0.h"
+#include "shaderop_getrows_q4_1.h"
+#include "shaderop_rope.h"
+#include "shaderop_cpy_f16_f16.h"
+#include "shaderop_cpy_f16_f32.h"
+#include "shaderop_cpy_f32_f16.h"
+#include "shaderop_cpy_f32_f32.h"
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <memory>
+#include <vector>
+#include <utility>
+#include <fstream>
+#include <exception>
+#include <thread>
+#include <mutex>
+#include <atomic>
+#include <cstring>
+#include <immintrin.h>
+#include <kompute/Kompute.hpp>
+
+#ifndef __STDC_IEC_559__
+#warning Your C implementation does not seem to be IEC 559 compliant, which is required for proper Vulkan interop.
+#endif
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+typedef ggml_fp16_t half;
+
+struct ggml_kompute_context {
+    bool hasH2DAll = false;
+    std::vector<ggml_vk_memory> buffers;
+    std::shared_ptr<vk::DescriptorPool> pool;
+    static ggml_kompute_context *instance;
+    ggml_kompute_context() {
+        instance = this;
+    }
+};
+
+ggml_kompute_context *ggml_kompute_context::instance;
+
+kp::Manager mgr;
+
+#ifdef __linux__
+__attribute__((constructor))
+static void enable_sam() {
+    setenv("RADV_PERFTEST", "sam", false);
+}
+#endif
+
+static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physicalDevice) {
+    vk::PhysicalDeviceFeatures availableFeatures;
+    physicalDevice.getFeatures(&availableFeatures);
+
+    if (!availableFeatures.shaderInt16)
+        return false;
+
+    vk::PhysicalDeviceVulkan11Features availableFeatures11;
+    vk::PhysicalDeviceVulkan12Features availableFeatures12;
+
+    availableFeatures11.pNext = &availableFeatures12;
+    availableFeatures12.pNext = nullptr;
+
+    vk::PhysicalDeviceFeatures2 features2;
+    features2.pNext = &availableFeatures11;
+
+    physicalDevice.getFeatures2(&features2);
+
+    if (!availableFeatures11.uniformAndStorageBuffer16BitAccess ||
+        !availableFeatures11.storageBuffer16BitAccess) {
+        return false;
+    }
+
+    if (!availableFeatures12.storageBuffer8BitAccess ||
+        !availableFeatures12.uniformAndStorageBuffer8BitAccess ||
+        !availableFeatures12.shaderFloat16 ||
+        !availableFeatures12.shaderInt8) {
+        return false;
+    }
+
+    return true;
+}
+
+static std::string ggml_vk_getVendorName(uint32_t vendorID) {
+    switch (vendorID) {
+        case 0x10DE:
+            return "nvidia";
+        case 0x1002:
+            return "amd";
+        case 0x8086:
+            return "intel";
+        default:
+            return "unknown";
+    }
+}
+
+std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
+    std::vector<vk::PhysicalDevice> physicalDevices = mgr.listDevices();
+    uint32_t deviceCount = physicalDevices.size();
+
+    std::vector<ggml_vk_device> results;
+
+    if (deviceCount == 0)
+        return results;
+
+    for (uint32_t i = 0; i < deviceCount; i++) {
+        VkPhysicalDeviceProperties properties;
+        vkGetPhysicalDeviceProperties(physicalDevices.at(i), &properties);
+
+        VkPhysicalDeviceMemoryProperties memoryProperties;
+        vkGetPhysicalDeviceMemoryProperties(physicalDevices.at(i), &memoryProperties);
+
+        const uint32_t major = VK_VERSION_MAJOR(properties.apiVersion);
+        const uint32_t minor = VK_VERSION_MINOR(properties.apiVersion);
+        if (major < 1 || minor < 2)
+            continue;
+
+        if (!ggml_vk_checkPhysicalDeviceFeatures(physicalDevices.at(i)))
+            continue;
+
+        size_t heapSize = 0;
+        for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) {
+            VkMemoryHeap heap = memoryProperties.memoryHeaps[j];
+            if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
+                heapSize = heap.size;
+                break;
+            }
+        }
+
+        if (heapSize < memoryRequired)
+            continue;
+
+        ggml_vk_device d;
+        d.index = i;
+        d.type = properties.deviceType;
+        d.heapSize = heapSize;
+        d.name = properties.deviceName;
+        d.vendor = ggml_vk_getVendorName(properties.vendorID);
+        results.push_back(d);
+    }
+
+    std::stable_sort(results.begin(), results.end(),
+        [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool {
+            if (lhs.type != rhs.type) {
+                if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true;
+                if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false;
+
+                if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true;
+                if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false;
+            }
+            return lhs.heapSize < rhs.heapSize;
+        }
+    );
+
+    return results;
+}
+
+static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
+    devices.erase(
+        std::remove_if(devices.begin(), devices.end(),
+            [&targetVendor](const ggml_vk_device& device) {
+                return device.vendor != targetVendor;
+            }),
+        devices.end()
+    );
+}
+
+static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std::string& targetName) {
+    devices.erase(
+        std::remove_if(devices.begin(), devices.end(),
+            [&targetName](const ggml_vk_device& device) {
+                return device.name != targetName;
+            }),
+        devices.end()
+    );
+}
+
+bool ggml_vk_init_device(size_t memoryRequired, const std::string &device) {
+    if (device.empty())
+        return false;
+
+    std::vector<ggml_vk_device> devices = ggml_vk_available_devices(memoryRequired);
+    if (device == "gpu") {
+        if (devices.size() != 0)
+            return ggml_vk_init_device(devices.front());
+    } else if (device == "amd" || device == "nvidia" || device == "intel") {
+        ggml_vk_filterByVendor(devices, device);
+        if (devices.size() != 0)
+            return ggml_vk_init_device(devices.front());
+    } else {
+        ggml_vk_filterByName(devices, device);
+        if (devices.size() != 0)
+            return ggml_vk_init_device(devices.front());
+    }
+
+    return ggml_vk_has_device();
+}
+
+bool ggml_vk_init_device(const ggml_vk_device &device) {
+    return ggml_vk_init_device(device.index);
+}
+
+bool ggml_vk_init_device(int device) {
+    mgr.initializeDevice(device, {},
+                         {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
+                          "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
+    return ggml_vk_has_device();
+}
+
+bool ggml_vk_has_device() {
+    return mgr.hasDevice();
+}
+
+ggml_vk_device ggml_vk_current_device() {
+    if (!mgr.hasDevice())
+        return ggml_vk_device();
+
+    std::vector<ggml_vk_device> devices = ggml_vk_available_devices(0);
+    ggml_vk_filterByName(devices, mgr.physicalDevice()->getProperties().deviceName);
+    return devices.front();
+}
+
+ggml_kompute_context *ggml_vk_init() {
+    return new ggml_kompute_context;
+}
+
+bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) {
+    return ctx->hasH2DAll;
+}
+
+void ggml_vk_free(struct ggml_kompute_context * ctx) {
+    delete ctx;
+}
+
+static
+void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
+    std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
+        vk::DescriptorPoolSize(
+          vk::DescriptorType::eStorageBuffer,
+          3 * size // Descriptor count is number of possible tensors to pass into an algorithm
+          )
+    };
+
+    vk::DescriptorPoolCreateInfo descriptorPoolInfo(
+      vk::DescriptorPoolCreateFlags(),
+      size, // Max sets
+      static_cast<uint32_t>(descriptorPoolSizes.size()),
+      descriptorPoolSizes.data());
+
+    ctx->pool = std::make_shared<vk::DescriptorPool>();
+    vk::Result r = mgr.device()->createDescriptorPool(
+      &descriptorPoolInfo, nullptr, ctx->pool.get());
+    if (r != vk::Result::eSuccess)
+        std::cerr << "Error allocating descriptor pool" << vk::to_string(r);
+}
+
+static
+void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
+    if (ctx->pool) {
+        mgr.device()->destroy(
+          *ctx->pool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        ctx->pool = nullptr;
+    }
+}
+
+static
+vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
+    vk::BufferCreateInfo bufferCreateInfo;
+    bufferCreateInfo.size = size;
+    bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer |
+                             vk::BufferUsageFlagBits::eTransferSrc |
+                             vk::BufferUsageFlagBits::eTransferDst;
+    bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;
+
+    vk::Buffer *vkBuffer = new vk::Buffer;
+    vk::Result r = mgr.device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
+    if (r != vk::Result::eSuccess)
+        std::cerr << "Error allocating buffer" << vk::to_string(r);
+    return vkBuffer;
+}
+
+static
+vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {
+
+    uint32_t memoryTypeIndex = -1;
+    bool memoryTypeIndexFound = false;
+    vk::PhysicalDeviceMemoryProperties memoryProperties = mgr.physicalDevice()->getMemoryProperties();
+    for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
+        if (requirements.memoryTypeBits & (1 << i)) {
+            if (((memoryProperties.memoryTypes[i]).propertyFlags &
+                 flags) == flags) {
+                memoryTypeIndex = i;
+                memoryTypeIndexFound = true;
+                if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) {
+                    *isHostVisible = true;
+                }
+                break;
+            }
+        }
+    }
+    if (!memoryTypeIndexFound) {
+        throw std::runtime_error(
+          "Memory type index for buffer creation not found");
+    }
+
+    vk::MemoryAllocateInfo allocInfo;
+    allocInfo.allocationSize = size;
+    allocInfo.memoryTypeIndex = memoryTypeIndex;
+    vk::DeviceMemory *vkDeviceMemory =  new vk::DeviceMemory;
+    vk::Result r = mgr.device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
+    if (r != vk::Result::eSuccess)
+        std::cerr << "Error allocating memory" << vk::to_string(r);
+    return vkDeviceMemory;
+}
+
+size_t ggml_vk_aligned_offset(size_t offset) {
+
+    static size_t minStorageBufferOffsetAlignment = 0;
+    if (minStorageBufferOffsetAlignment == 0) {
+        vk::PhysicalDeviceProperties deviceProperties;
+        deviceProperties = mgr.physicalDevice()->getProperties();
+        vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits;
+        minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment;
+    }
+
+    // If offset is already aligned, return it directly
+    if (offset % minStorageBufferOffsetAlignment == 0) {
+        return offset;
+    }
+
+    // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset
+    return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment;
+}
+
+static void ggml_vk_h2d_buffer(const ggml_vk_memory &memory) {
+    if (memory.stagingBuffer)
+        mgr.sequence()->eval<kp::OpBufferSyncDevice>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
+}
+
+static void ggml_vk_d2h_buffer(const ggml_vk_memory &memory) {
+    if (memory.stagingBuffer)
+        mgr.sequence()->eval<kp::OpBufferSyncLocal>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
+}
+
+ggml_vk_memory ggml_vk_allocate(size_t size) {
+    ggml_vk_memory memory;
+    bool isHostVisible = false;
+    {
+        memory.primaryBuffer = ggml_vk_allocate_buffer(size);
+        vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.primaryBuffer);
+        vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
+        memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
+        mgr.device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
+        if (isHostVisible) {
+            vk::Result r = mgr.device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
+            if (r != vk::Result::eSuccess)
+                std::cerr << "Error mapping memory" << vk::to_string(r);
+        }
+    }
+
+    if (!isHostVisible) {
+        memory.stagingBuffer = ggml_vk_allocate_buffer(size);
+        vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.stagingBuffer);
+        vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
+                                                      vk::MemoryPropertyFlagBits::eHostCoherent |
+                                                      vk::MemoryPropertyFlagBits::eHostCached;
+        memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
+        mgr.device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
+        vk::Result r = mgr.device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
+        if (r != vk::Result::eSuccess)
+            std::cerr << "Error mapping memory" << vk::to_string(r);
+    }
+
+    memory.size = size;
+    return memory;
+}
+
+void ggml_vk_free_memory(ggml_vk_memory &memory)
+{
+    mgr.device()->destroy(
+      *memory.primaryBuffer,
+      (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+    if (memory.stagingBuffer) {
+        mgr.device()->destroy(
+          *memory.stagingBuffer,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+    }
+    mgr.device()->freeMemory(
+      *memory.primaryMemory,
+      (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+    if (memory.stagingMemory) {
+        mgr.device()->freeMemory(
+          *memory.stagingMemory,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+    }
+}
+
+static
+decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
+    for (auto it = ctx->buffers.begin(); ; it++) {
+        if (it == ctx->buffers.end()) {
+            fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data);
+            return it;
+        }
+        if (it->data <= t->data &&
+                reinterpret_cast<intptr_t>(it->data) + it->size >= (reinterpret_cast<intptr_t>(t->data) + ggml_nbytes(t))) {
+            offset = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(it->data);
+            return it;
+        }
+    }
+}
+
+static
+const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) {
+    uint64_t originalOffset = 0;
+    auto res = ggml_vk_find_tensor(ctx, t, originalOffset);
+    if (res == ctx->buffers.end()) {
+        static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
+        return nullTensor;
+    }
+
+    // Create a tensor whose memory will be composed of our buffers at the correct offset
+    const size_t nelements = ggml_nelements(t);
+    size_t nbytes = ggml_nbytes(t);
+
+    size_t vulkanOffset = ggml_vk_aligned_offset(originalOffset);
+    if (alignedOffset) {
+        *alignedOffset = originalOffset - vulkanOffset;
+        nbytes += *alignedOffset;
+    }
+
+    return mgr.tensor(
+        t->data,
+        nelements,
+        nbytes, kp::Tensor::TensorDataTypes::eFloat,
+        res->primaryMemory, res->primaryBuffer,
+        res->stagingMemory, res->stagingBuffer,
+        vulkanOffset);
+}
+
+void ggml_vk_add_buffer(
+        struct ggml_kompute_context * ctx,
+        const char * /*name*/,
+        const ggml_vk_memory &memory) {
+    ctx->buffers.emplace_back(memory);
+}
+
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+    const auto res = ggml_vk_get_tensor(ctx, t, nullptr);
+    GGML_ASSERT(res);
+    mgr.sequence()->eval<kp::OpTensorSyncDevice>({res});
+}
+
+void ggml_vk_h2d_all(struct ggml_kompute_context * ctx) {
+    for (auto& it : ctx->buffers) {
+        ggml_vk_h2d_buffer(it);
+    }
+    ctx->hasH2DAll = true;
+}
+
+void ggml_vk_d2h_all(struct ggml_kompute_context * ctx) {
+    for (auto& it : ctx->buffers) {
+        ggml_vk_d2h_buffer(it);
+    }
+}
+
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+    const auto res = ggml_vk_get_tensor(ctx, t, nullptr);
+
+    GGML_ASSERT(res);
+    mgr.sequence()->eval<kp::OpTensorSyncLocal>({res});
+}
+
+std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
+    if (size % sizeof(uint32_t) != 0) {
+        throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
+    }
+
+    const uint32_t* data_ptr = reinterpret_cast<const uint32_t*>(rawData);
+    size_t count = size / sizeof(uint32_t);
+    return std::vector<uint32_t>(data_ptr, data_ptr + count);
+}
+
+inline static
+uint32_t safe_divide(uint32_t a, uint32_t b) {
+    if (b <= 1) {
+        return a;
+    }
+    if ((a % b) != 0) {
+        fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
+        GGML_ASSERT(!"safe_divide result would've had remainder");
+    }
+    return a / b;
+}
+
+void ggml_vk_add(kp::Sequence& seq,
+                    const std::shared_ptr<kp::Tensor>& inA,
+                    const std::shared_ptr<kp::Tensor>& inB,
+                    const std::shared_ptr<kp::Tensor>& out,
+                    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                    uint32_t size) {
+
+    const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
+        kp::shader_data::op_add_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+    } const pushConsts {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4)
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_addrow(kp::Sequence& seq,
+                 const std::shared_ptr<kp::Tensor>& inA,
+                 const std::shared_ptr<kp::Tensor>& inB,
+                 const std::shared_ptr<kp::Tensor>& out,
+                 uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                 uint32_t size, uint32_t row = 0) {
+
+    const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv,
+        kp::shader_data::op_addrow_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        uint32_t row;
+    } const pushConsts {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        row
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_mul(kp::Sequence& seq,
+                    const std::shared_ptr<kp::Tensor>& inA,
+                    const std::shared_ptr<kp::Tensor>& inB,
+                    const std::shared_ptr<kp::Tensor>& out,
+                    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                    uint32_t size) {
+
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv,
+        kp::shader_data::op_mul_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+    } const pushConsts {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4)
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_mulrow(kp::Sequence& seq,
+                 const std::shared_ptr<kp::Tensor>& inA,
+                 const std::shared_ptr<kp::Tensor>& inB,
+                 const std::shared_ptr<kp::Tensor>& out,
+                 uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                 uint32_t size, uint32_t row = 0) {
+
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mulrow_comp_spv,
+        kp::shader_data::op_mulrow_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        uint32_t row;
+    } const pushConsts {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        row
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_scale(kp::Sequence& seq,
+                   const std::shared_ptr<kp::Tensor>& in,
+                   const std::shared_ptr<kp::Tensor>& out,
+                   uint32_t inOff, uint32_t outOff,
+                   uint32_t size, float scale) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_scale_comp_spv,
+        kp::shader_data::op_scale_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        float scale;
+    } const pushConsts {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        scale
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_xxlu(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
+                  const std::shared_ptr<kp::Tensor>& in,
+                  const std::shared_ptr<kp::Tensor>& out,
+                  uint32_t inOff, uint32_t outOff,
+                  uint32_t size) {
+    struct PushConstants {
+        uint32_t inOff, outOff;
+    } const pushConsts {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+template <typename... Args>
+void ggml_vk_silu(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
+        kp::shader_data::op_silu_comp_spv_len);
+
+    ggml_vk_xxlu(spirv, std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+void ggml_vk_relu(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
+        kp::shader_data::op_relu_comp_spv_len);
+
+    ggml_vk_xxlu(spirv, std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+void ggml_vk_gelu(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
+        kp::shader_data::op_gelu_comp_spv_len);
+
+    ggml_vk_xxlu(spirv, std::forward<Args>(args)...);
+}
+
+void ggml_vk_soft_max(kp::Sequence& seq,
+                      const std::shared_ptr<kp::Tensor>& in,
+                      const std::shared_ptr<kp::Tensor>& out,
+                      uint32_t inOff, uint32_t outOff,
+                      int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03) {
+
+    const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
+        kp::shader_data::op_softmax_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        int32_t ne00, ne01, ne02;
+    } pushConsts {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
+                   const std::shared_ptr<kp::Tensor>& in,
+                   const std::shared_ptr<kp::Tensor>& out,
+                   uint32_t inOff, uint32_t outOff,
+                   int32_t ne00, int32_t nb01,
+                   int32_t nrows) {
+    GGML_ASSERT(nb01%sizeof(float) == 0);
+    GGML_ASSERT(ne00%sizeof(float) == 0);
+
+    const float epsilon = 1e-6f; // this is what ggml.c uses for rms norm
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        uint32_t ne00, nb01;
+        float eps;
+    } pushConsts {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        (uint32_t)ne00, (uint32_t)nb01, epsilon
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({(uint32_t)nrows});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+template <typename... Args>
+void ggml_vk_norm(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
+        kp::shader_data::op_norm_comp_spv_len);
+
+    ggml_vk_norm_(spirv, std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+void ggml_vk_rms_norm(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
+        kp::shader_data::op_rmsnorm_comp_spv_len);
+
+    ggml_vk_norm_(spirv, std::forward<Args>(args)...);
+}
+
+void ggml_vk_diag_mask_inf(kp::Sequence& seq,
+                           const std::shared_ptr<kp::Tensor>& in,
+                           const std::shared_ptr<kp::Tensor>& out,
+                           uint32_t inOff, uint32_t outOff,
+                           uint32_t n_past,
+                           int32_t ne00, int32_t ne01, int32_t ne02) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv,
+        kp::shader_data::op_diagmask_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        uint32_t n_past;
+        int32_t ne00, ne01;
+    } pushConsts {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        n_past,
+        ne00, ne01
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_mul_mat_f16(kp::Sequence& seq,
+                         const std::shared_ptr<kp::Tensor>& inA,
+                         const std::shared_ptr<kp::Tensor>& inB,
+                         const std::shared_ptr<kp::Tensor>& out,
+                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                         int32_t ne00, int32_t ne01,
+                         uint32_t nb01, uint32_t nb02,
+                         int32_t ne11, int32_t ne12,
+                         uint32_t nb11, uint32_t nb12,
+                         int32_t ne0, int32_t ne1) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv,
+        kp::shader_data::op_mul_mat_f16_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        int32_t ne0, ne1;
+    } pushConsts {
+        safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, nb01, nb02, nb11, nb12, ne0, ne1,
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
+                          const std::shared_ptr<kp::Tensor>& inA,
+                          const std::shared_ptr<kp::Tensor>& inB,
+                          const std::shared_ptr<kp::Tensor>& out,
+                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                          int32_t ne00, int32_t ne10, int32_t ne0,
+                          int32_t ne01, int32_t ne11) {
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne10, ne0;
+    } pushConsts {
+        safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0,
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+template <typename... Args>
+void ggml_vk_mul_mat_q4_0(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
+        kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
+
+    ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+}
+
+// FIXME: This could be improved like was done in q4_0 version but needs testing...
+template <typename... Args>
+void ggml_vk_mul_mat_q4_1(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
+        kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
+
+    ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+}
+
+void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
+                      unsigned element_size, unsigned qk,
+                      kp::Sequence& seq,
+                      const std::shared_ptr<kp::Tensor>& inA,
+                      const std::shared_ptr<kp::Tensor>& inB,
+                      const std::shared_ptr<kp::Tensor>& out,
+                      uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                      int32_t ne00, int32_t nb01, int32_t nb1,
+                      uint32_t size) {
+    GGML_ASSERT(nb01%element_size == 0);
+    GGML_ASSERT(nb1%sizeof(float) == 0);
+    if (qk) GGML_ASSERT(ne00%qk == 0);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, nb01, nb1;
+    } pushConsts {
+        safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, nb01, nb1
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+template <typename... Args>
+void ggml_vk_get_rows_f16(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
+        kp::shader_data::op_getrows_f16_comp_spv_len);
+
+    ggml_vk_get_rows(spirv, sizeof(half), 0, std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+void ggml_vk_get_rows_q4_0(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
+        kp::shader_data::op_getrows_q4_0_comp_spv_len);
+
+    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+void ggml_vk_get_rows_q4_1(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
+        kp::shader_data::op_getrows_q4_1_comp_spv_len);
+
+    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
+}
+
+void ggml_vk_rope(kp::Sequence& seq,
+                  const std::shared_ptr<kp::Tensor>& in,
+                  const std::shared_ptr<kp::Tensor>& out,
+                  uint32_t inOff, uint32_t outOff,
+                  uint32_t n_past, int32_t n_dims, int32_t mode,
+                  float freq_base, float freq_scale,
+                  int32_t ne01, int32_t ne02, int32_t ne03,
+                  uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
+                  int32_t ne0,
+                  uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_rope_comp_spv,
+        kp::shader_data::op_rope_comp_spv_len);
+
+    GGML_ASSERT(nb03%sizeof(float) == 0);
+    GGML_ASSERT(nb02%sizeof(float) == 0);
+    GGML_ASSERT(nb01%sizeof(float) == 0);
+    GGML_ASSERT(nb00%sizeof(float) == 0);
+    GGML_ASSERT(nb3%sizeof(float) == 0);
+    GGML_ASSERT(nb2%sizeof(float) == 0);
+    GGML_ASSERT(nb1%sizeof(float) == 0);
+    GGML_ASSERT(nb0%sizeof(float) == 0);
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        uint32_t n_past;
+        int32_t n_dims, mode;
+        float freq_base, freq_scale;
+        uint32_t nb00, nb01, nb02, nb03;
+        int32_t ne0;
+        uint32_t nb0, nb1, nb2, nb3;
+    } pushConsts {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        n_past, n_dims, mode,
+        freq_base, freq_scale,
+        nb00, nb01, nb02, nb03,
+        ne0,
+        nb0, nb1, nb2, nb3
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+template<uint32_t in_element_size, uint32_t out_element_size>
+void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
+                 kp::Sequence& seq,
+                 const std::shared_ptr<kp::Tensor>& in,
+                 const std::shared_ptr<kp::Tensor>& out,
+                 uint32_t inOff, uint32_t outOff,
+                 int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
+                 uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
+                 int32_t ne0, int32_t ne1, int32_t ne2,
+                 uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) {
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        int32_t ne00, ne01, ne02;
+        uint32_t nb00, nb01, nb02, nb03;
+        int32_t ne0, ne1, ne2;
+        uint32_t nb0, nb1, nb2, nb3;
+    } pushConsts {
+        safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size),
+        ne00, ne01, ne02,
+        nb00, nb01, nb02, nb03,
+        ne0, ne1, ne2,
+        nb0, nb1, nb2, nb3
+    };
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!s_algo)
+        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    else {
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+template <typename... Args>
+void ggml_vk_cpy_f32_f16(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
+        kp::shader_data::op_cpy_f32_f16_comp_spv_len);
+    ggml_vk_cpy<4, 2>(spirv, std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+void ggml_vk_cpy_f32_f32(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
+        kp::shader_data::op_cpy_f32_f32_comp_spv_len);
+    ggml_vk_cpy<4, 4>(spirv, std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+void ggml_vk_cpy_f16_f16(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
+        kp::shader_data::op_cpy_f16_f16_comp_spv_len);
+    ggml_vk_cpy<2, 2>(spirv, std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+void ggml_vk_cpy_f16_f32(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
+        kp::shader_data::op_cpy_f16_f32_comp_spv_len);
+    ggml_vk_cpy<2, 4>(spirv, std::forward<Args>(args)...);
+}
+
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
+    const int n_seq = 8;
+
+    // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting
+    // it to the size of the graph, but I think it can be made smaller?
+    ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes);
+
+    std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);
+
+    for (auto& sequence : sequences) {
+        sequence = mgr.sequence();
+    }
+    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
+        const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
+
+        auto& seq = *sequences[seq_idx];
+
+        const int node_start = (seq_idx + 0) * n_nodes_per_seq;
+        const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq;
+
+        for (int i = node_start; i < node_end; ++i) {
+            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
+            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+            struct ggml_tensor * dst = gf->nodes[i];
+            GGML_ASSERT(dst->data != nullptr);
+
+            const int32_t ne00 = src0 ? src0->ne[0] : 0;
+            const int32_t ne01 = src0 ? src0->ne[1] : 0;
+            const int32_t ne02 = src0 ? src0->ne[2] : 0;
+            const int32_t ne03 = src0 ? src0->ne[3] : 0;
+
+            const uint32_t nb00 = src0 ? src0->nb[0] : 0;
+            const uint32_t nb01 = src0 ? src0->nb[1] : 0;
+            const uint32_t nb02 = src0 ? src0->nb[2] : 0;
+            const uint32_t nb03 = src0 ? src0->nb[3] : 0;
+
+            const int32_t ne10 = src1 ? src1->ne[0] : 0;
+            const int32_t ne11 = src1 ? src1->ne[1] : 0;
+            const int32_t ne12 = src1 ? src1->ne[2] : 0;
+//            const int32_t ne13 = src1 ? src1->ne[3] : 0;
+
+//            const uint32_t nb10 = src1 ? src1->nb[0] : 0;
+            const uint32_t nb11 = src1 ? src1->nb[1] : 0;
+            const uint32_t nb12 = src1 ? src1->nb[2] : 0;
+//            const uint32_t nb13 = src1 ? src1->nb[3] : 0;
+
+            const int32_t ne0 = dst ? dst->ne[0] : 0;
+            const int32_t ne1 = dst ? dst->ne[1] : 0;
+            const int32_t ne2 = dst ? dst->ne[2] : 0;
+//            const int32_t ne3 = dst ? dst->ne[3] : 0;
+
+            const uint32_t nb0 = dst ? dst->nb[0] : 0;
+            const uint32_t nb1 = dst ? dst->nb[1] : 0;
+            const uint32_t nb2 = dst ? dst->nb[2] : 0;
+            const uint32_t nb3 = dst ? dst->nb[3] : 0;
+
+            const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+//            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+            const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
+
+            const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
+            uint32_t off_src0 = 0;
+            uint32_t off_src1 = 0;
+            uint32_t off_dst = 0;
+            const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0, &off_src0) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1, &off_src1) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_dst  = dst ? ggml_vk_get_tensor(ctx, dst, &off_dst)  : nullTensor;
+
+            switch (dst->op) {
+                case GGML_OP_RESHAPE:
+                case GGML_OP_VIEW:
+                case GGML_OP_TRANSPOSE:
+                case GGML_OP_PERMUTE:
+                    {
+                        // noop
+                    } break;
+                case GGML_OP_ADD:
+                    {
+                        if (ggml_nelements(src1) == ne10) {
+                            // src1 is a row
+                            ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00);
+                        } else {
+                            ggml_vk_add(seq, id_src0, id_src1, id_dst,  off_src0, off_src1, off_dst, ggml_nelements(dst));
+                        }
+                    } break;
+                case GGML_OP_MUL:
+                    {
+                        if (ggml_nelements(src1) == ne10) {
+                            // src1 is a row
+                            ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00);
+                        } else {
+                            ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst));
+                        }
+                    } break;
+                case GGML_OP_SCALE:
+                    {
+                        const float scale = *(const float *) src1->data;
+                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
+                    } break;
+                case GGML_OP_UNARY:
+                    switch (ggml_get_unary_op(gf->nodes[i])) {
+                        case GGML_UNARY_OP_SILU:
+                            {
+                                ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                            } break;
+                        case GGML_UNARY_OP_RELU:
+                            {
+                                ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                            } break;
+                        case GGML_UNARY_OP_GELU:
+                            {
+                                ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                            } break;
+                        default:
+                            {
+                                fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                GGML_ASSERT(false);
+                            }
+                    } break;
+                case GGML_OP_SOFT_MAX:
+                    {
+                        ggml_vk_soft_max(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03);
+                    } break;
+                case GGML_OP_DIAG_MASK_INF:
+                    {
+                        const int n_past = ((int32_t *)(dst->op_params))[0];
+                        ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
+                    } break;
+                case GGML_OP_NORM:
+                    {
+                        ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0));
+                    } break;
+                case GGML_OP_RMS_NORM:
+                    {
+                        ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0));
+                    } break;
+                case GGML_OP_MUL_MAT:
+                    {
+                        if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32)
+                            && src1->type == GGML_TYPE_F32) {
+                            ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                        } else if (src0->type == GGML_TYPE_Q4_0
+                                   && src1->type == GGML_TYPE_F32) {
+                            ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
+                        } else if (src0->type == GGML_TYPE_Q4_1
+                                   && src1->type == GGML_TYPE_F32) {
+                            ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
+                        } else {
+                            fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
+                            goto not_implemented;
+                        }
+                    } break;
+                case GGML_OP_GET_ROWS:
+                    {
+                        if (src0->type == GGML_TYPE_F16) {
+                            ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else if (src0->type == GGML_TYPE_Q4_0) {
+                            ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else if (src0->type == GGML_TYPE_Q4_1) {
+                            ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else {
+                            fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type);
+                            goto not_implemented;
+                        }
+                    } break;
+                case GGML_OP_ROPE:
+                    {
+                        const int n_past = ((int32_t *) dst->op_params)[0];
+                        const int n_dims = ((int32_t *) dst->op_params)[1];
+                        const int mode   = ((int32_t *) dst->op_params)[2];
+                        float freq_base;
+                        float freq_scale;
+                        memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+                        memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+                        ggml_vk_rope(seq, id_src0, id_dst, off_src0, off_dst, n_past, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3);
+                    } break;
+                case GGML_OP_DUP:
+                case GGML_OP_CPY:
+                case GGML_OP_CONT:
+                    {
+                        switch (src0t) {
+                            case GGML_TYPE_F32:
+                                {
+                                    switch (dstt) {
+                                        case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        default: goto not_implemented;
+                                    }
+                                } break;
+                            case GGML_TYPE_F16:
+                                {
+                                    switch (dstt) {
+                                        case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                    default: goto not_implemented;
+                                } break;
+                            default: goto not_implemented;
+                            }
+                        }
+                    } break;
+                default: goto not_implemented;
+            }
+            continue;
+            not_implemented: {}
+            fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+            //GGML_ASSERT(false);
+        }
+
+        // Evaluate sequence
+        seq.evalAsync();
+    }
+
+    // Wait for all sequences to finish
+    for (auto& sequence : sequences) {
+        if (sequence->isRunning())
+            sequence->evalAwait();
+    }
+
+    ggml_vk_free_descriptor_pool(ctx);
+}
+
+template<>
+kp::Tensor::TensorDataTypes
+kp::TensorT<half>::dataType()
+{
+    return TensorDataTypes::eFloat;
+}
+
+template<>
+kp::Tensor::TensorDataTypes
+kp::TensorT<uint8_t>::dataType()
+{
+    return TensorDataTypes::eUnsignedInt;
+}
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
new file mode 100644
index 0000000000000..ad8b41e4d205e
--- /dev/null
+++ b/ggml-vulkan.h
@@ -0,0 +1,61 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+#include <string>
+
+struct ggml_kompute_context;
+
+namespace vk {
+    class DeviceMemory;
+    class Buffer;
+};
+
+struct ggml_vk_memory {
+    void *data = nullptr;
+    size_t size = 0;
+    vk::DeviceMemory *primaryMemory = nullptr;
+    vk::Buffer *primaryBuffer = nullptr;
+    vk::DeviceMemory *stagingMemory = nullptr;
+    vk::Buffer *stagingBuffer = nullptr;
+};
+
+struct ggml_vk_device {
+    int index = 0;
+    int type = 0;           // same as VkPhysicalDeviceType
+    size_t heapSize = 0;
+    std::string name;
+    std::string vendor;
+};
+
+std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
+bool ggml_vk_init_device(size_t memoryRequired, const std::string &device);
+bool ggml_vk_init_device(const ggml_vk_device &device);
+bool ggml_vk_init_device(int device);
+bool ggml_vk_has_device();
+ggml_vk_device ggml_vk_current_device();
+struct ggml_kompute_context * ggml_vk_init(void);
+bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx);
+void ggml_vk_free(struct ggml_kompute_context * ctx);
+size_t ggml_vk_aligned_offset(size_t offset);
+ggml_vk_memory ggml_vk_allocate(size_t size);
+void ggml_vk_free_memory(ggml_vk_memory &memory);
+
+void ggml_vk_add_buffer(
+    struct ggml_kompute_context * ctx,
+    const char * name,
+    const ggml_vk_memory &memory);
+
+void ggml_vk_h2d_all(struct ggml_kompute_context * ctx);
+void ggml_vk_d2h_all(struct ggml_kompute_context * ctx);
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
diff --git a/ggml.c b/ggml.c
index a0be068d6c9f7..cf9e056bad70a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9007,7 +9007,7 @@ static void ggml_compute_forward_add_q_f32(
     }
 }
 
-static void ggml_compute_forward_add(
+void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -9587,7 +9587,7 @@ static void ggml_compute_forward_mul_f32(
     }
 }
 
-static void ggml_compute_forward_mul(
+void ggml_compute_forward_mul(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -10510,7 +10510,7 @@ static void ggml_compute_forward_elu(
 
 // ggml_compute_forward_relu
 
-static void ggml_compute_forward_relu_f32(
+void ggml_compute_forward_relu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10534,7 +10534,7 @@ static void ggml_compute_forward_relu_f32(
     }
 }
 
-static void ggml_compute_forward_relu(
+void ggml_compute_forward_relu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10552,7 +10552,7 @@ static void ggml_compute_forward_relu(
 
 // ggml_compute_forward_gelu
 
-static void ggml_compute_forward_gelu_f32(
+void ggml_compute_forward_gelu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10593,7 +10593,7 @@ static void ggml_compute_forward_gelu_f32(
     }
 }
 
-static void ggml_compute_forward_gelu(
+void ggml_compute_forward_gelu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10670,7 +10670,7 @@ static void ggml_compute_forward_gelu_quick(
 
 // ggml_compute_forward_silu
 
-static void ggml_compute_forward_silu_f32(
+void ggml_compute_forward_silu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10711,7 +10711,7 @@ static void ggml_compute_forward_silu_f32(
     }
 }
 
-static void ggml_compute_forward_silu(
+void ggml_compute_forward_silu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10844,7 +10844,7 @@ static void ggml_compute_forward_norm_f32(
     }
 }
 
-static void ggml_compute_forward_norm(
+void ggml_compute_forward_norm(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10910,7 +10910,7 @@ static void ggml_compute_forward_rms_norm_f32(
     }
 }
 
-static void ggml_compute_forward_rms_norm(
+void ggml_compute_forward_rms_norm(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -11623,7 +11623,7 @@ static void ggml_compute_forward_scale_f32(
     }
 }
 
-static void ggml_compute_forward_scale(
+void ggml_compute_forward_scale(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -11744,7 +11744,7 @@ static void ggml_compute_forward_set(
 
 // ggml_compute_forward_cpy
 
-static void ggml_compute_forward_cpy(
+void ggml_compute_forward_cpy(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -11888,7 +11888,7 @@ static void ggml_compute_forward_get_rows_f32(
     }
 }
 
-static void ggml_compute_forward_get_rows(
+void ggml_compute_forward_get_rows(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12164,7 +12164,7 @@ static void ggml_compute_forward_diag_mask_f32(
     }
 }
 
-static void ggml_compute_forward_diag_mask_inf(
+void ggml_compute_forward_diag_mask_inf(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -12198,7 +12198,7 @@ static void ggml_compute_forward_diag_mask_zero(
 
 // ggml_compute_forward_soft_max
 
-static void ggml_compute_forward_soft_max_f32(
+void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -12887,7 +12887,7 @@ static void ggml_compute_forward_rope_f16(
     }
 }
 
-static void ggml_compute_forward_rope(
+void ggml_compute_forward_rope(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
diff --git a/kompute/.ccls b/kompute/.ccls
new file mode 100644
index 0000000000000..71d5d711e0797
--- /dev/null
+++ b/kompute/.ccls
@@ -0,0 +1,27 @@
+
+%clang
+
+-fdeclspec
+-fms-extensions
+-Wall
+-Wextra
+-std=c++17
+
+%h -x
+%h c++-header
+
+-DDEBUG=1
+-DKOMPUTE_INCLUDE_FOR_SYNTAX
+
+-I/usr/include/python3.6/
+-I./python/pybind11/include/
+
+-I./build/_deps/vulkan_header-src/include/
+-I./build/_deps/spdlog-src/include/
+-I./build/_deps/googletest-src/googletest/include/
+-I./build/_deps/fmt-src/include/
+
+-I./src/include/
+-I./build/src/shaders/glsl/
+-I./build/test/shaders/glsl/
+-I./test/utils/
diff --git a/kompute/.clang-format b/kompute/.clang-format
new file mode 100644
index 0000000000000..5191313a38a18
--- /dev/null
+++ b/kompute/.clang-format
@@ -0,0 +1,5 @@
+﻿---
+BasedOnStyle: Mozilla
+IndentWidth: 4
+
+...
diff --git a/kompute/.dockerignore b/kompute/.dockerignore
new file mode 100644
index 0000000000000..9498d9195f7b2
--- /dev/null
+++ b/kompute/.dockerignore
@@ -0,0 +1,4 @@
+build/*
+examples/*
+docker-builders/
+swiftshader/
diff --git a/kompute/.github/workflows/cpp_examples.yml b/kompute/.github/workflows/cpp_examples.yml
new file mode 100644
index 0000000000000..ad5306e9b29e9
--- /dev/null
+++ b/kompute/.github/workflows/cpp_examples.yml
@@ -0,0 +1,58 @@
+name: C++ Tests
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  array-multiplication-example:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Release g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/examples/array_multiplication/build
+        source-dir: ${{github.workspace}}/examples/array_multiplication
+        cc: gcc
+        cxx: g++
+        build-type: Debug
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
+        build-options: --parallel # Given we don't build too many resources we can leverage parallel
+    - name: Run tests
+      run: ./examples/array_multiplication/build/src/kompute_array_mult
+
+  logistc-regression-example:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Release g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/examples/logistic_regression/build
+        source-dir: ${{github.workspace}}/examples/logistic_regression
+        cc: gcc
+        cxx: g++
+        build-type: Debug
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
+        build-options: --parallel # Given we don't build too many resources we can leverage parallel
+    - name: Run tests
+      run: ./examples/logistic_regression/build/src/kompute_logistic_regression
diff --git a/kompute/.github/workflows/cpp_tests.yml b/kompute/.github/workflows/cpp_tests.yml
new file mode 100644
index 0000000000000..53a90a145d386
--- /dev/null
+++ b/kompute/.github/workflows/cpp_tests.yml
@@ -0,0 +1,104 @@
+name: C++ Tests
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  cpp-tests-debug-with-debug-layers:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Release g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/build
+        source-dir: ${{github.workspace}}
+        cc: gcc
+        cxx: g++
+        build-type: Debug
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
+    - name: Run tests
+      run: make mk_run_tests
+
+  cpp-tests-release-with-debug-layers:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Release g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/build
+        source-dir: ${{github.workspace}}
+        cc: gcc
+        cxx: g++
+        build-type: Release
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
+    - name: Run tests
+      run: make mk_run_tests
+
+  cpp-tests-debug-without-debug-layers:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Release g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/build
+        source-dir: ${{github.workspace}}
+        cc: gcc
+        cxx: g++
+        build-type: Debug
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
+    - name: Run tests
+      run: make mk_run_tests
+  
+  cpp-tests-release-without-debug-layers:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Release g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/build
+        source-dir: ${{github.workspace}}
+        cc: gcc
+        cxx: g++
+        build-type: Release
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
+    - name: Run tests
+      run: make mk_run_tests
diff --git a/kompute/.github/workflows/python_tests.yml b/kompute/.github/workflows/python_tests.yml
new file mode 100644
index 0000000000000..9f84d1e854178
--- /dev/null
+++ b/kompute/.github/workflows/python_tests.yml
@@ -0,0 +1,28 @@
+name: Python Tests
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  python-tests:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: Install Python Requirements
+      run: pip3 install --user -r python/test/requirements-dev.txt
+    - name: Python Build
+      env:
+        KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2
+        KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON
+      run: pip3 install --user . -v
+    - name: Python run Tests
+      run: |
+        export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json
+        make test_python
diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt
new file mode 100644
index 0000000000000..f89e13d1d7e6c
--- /dev/null
+++ b/kompute/CMakeLists.txt
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+project(kompute VERSION 0.8.1 LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 14)
+
+# Only change the folder behavior if kompute is not a subproject
+if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
+    set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+    set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake")
+    set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin)
+    set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
+endif()
+
+# Avoid the dll boilerplate code for windows
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")
+
+set(KOMPUTE_LIBRARIES kompute CACHE INTERNAL "")
+
+# ####################################################
+# Options
+# ####################################################
+macro(kompute_option OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
+    option(${OPTION_NAME} ${OPTION_TEXT} ${OPTION_DEFAULT})
+
+    if(DEFINED ENV{${OPTION_NAME}})
+        # Allow overriding the option through an environment variable
+        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
+    endif()
+
+    if(${OPTION_NAME})
+        add_definitions(-D${OPTION_NAME})
+    endif()
+
+    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
+endmacro()
+
+macro(kompute_log_level OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
+    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
+    set_property(CACHE ${OPTION_NAME} PROPERTY STRINGS "Trace" "Debug" "Info" "Warn" "Error" "Critical" "Default" "Off")
+
+    if(DEFINED ENV{${OPTION_NAME}})
+        # Allow setting the option through an environment variable
+        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
+    endif()
+
+    if(${OPTION_NAME})
+        add_definitions(-D${OPTION_NAME})
+    endif()
+
+    # Allow disabling logging completely and prevent linking against it:
+    if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off")
+        set(${OPTION_NAME}_DISABLED ON)
+        add_compile_definitions(${OPTION_NAME}_DISABLED=1)
+    endif()
+
+    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
+endmacro()
+
+macro(kompute_option_string OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
+    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
+
+    if(DEFINED ENV{${OPTION_NAME}})
+        # Allow setting the option through an environment variable
+        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
+    endif()
+
+    if(${OPTION_NAME})
+        add_definitions(-D${OPTION_NAME})
+    endif()
+
+    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
+endmacro()
+
+message(STATUS "General purpose GPU compute framework built on Vulkan")
+message(STATUS "=======================================================")
+
+# Build options
+kompute_log_level(KOMPUTE_OPT_LOG_LEVEL "Internally we use Spdlog or fmt for logging, depending on the value of 'KOMPUTE_OPT_USE_SPDLOG'. The log level used can be changed here. Possible values: 'Trace', 'Debug', 'Info', 'Warn', 'Error', 'Critical', 'Off', 'Default'. If set to 'Off' logging will be deactivated completely. If set to 'Default', the log level will be set to 'Info' for release builds and 'Debug' else." "Off")
+kompute_option(KOMPUTE_OPT_USE_SPDLOG "If enabled, logging via KP_LOG_<DEBUG, INFO, etc...> will happen through Spdlog instead of plan fmt." OFF)
+kompute_option(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS "Explicitly disable debug layers even on debug." ON)
+kompute_option(KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK "Whether to check if your driver supports the Vulkan Header version you are linking against. This might be useful in case you build shared on a different system than you run later." OFF)
+kompute_option(KOMPUTE_OPT_BUILD_SHADERS "Rebuilds all compute shaders during compilation and does not use the already precompiled versions. Requires glslangValidator to be installed on your system." OFF)
+
+# External components
+kompute_option(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG "Use the built-in version of Spdlog. Requires 'KOMPUTE_OPT_USE_SPDLOG' to be set to ON in order to have any effect." ON)
+kompute_option(KOMPUTE_OPT_SPDLOG_ASYNC_MODE "If spdlog is enabled this allows for selecting whether the default logger setup creates sync or async logger" OFF)
+kompute_option(KOMPUTE_OPT_USE_BUILT_IN_FMT "Use the built-in version of fmt." ON)
+kompute_option(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER "Use the built-in version of Vulkan Headers. This could be helpful in case your system Vulkan Headers are too new for your driver. If you set this to OFF, please make sure your system Vulkan Headers are supported by your driver." ON)
+kompute_option_string(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "The git tag used for the built-in Vulkan Headers when 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER' is enabled. A list of tags can be found here: https://github.com/KhronosGroup/Vulkan-Headers/tags" "v1.3.231")
+message(STATUS "=======================================================")
+
+# ####################################################
+# Deprecated Options
+# ####################################################
+include(cmake/deprecation_warnings.cmake)
+
+# ####################################################
+# Dependencies
+# ####################################################
+include(cmake/vulkan_shader_compiler.cmake)
+include(cmake/check_vulkan_version.cmake)
+include(FetchContent)
+
+# Vulkan Header
+if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
+    FetchContent_Declare(vulkan_header GIT_REPOSITORY https://github.com/KhronosGroup/Vulkan-Headers.git
+        GIT_TAG ${KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG}) # Source: https://github.com/KhronosGroup/Vulkan-Headers/tags
+    FetchContent_MakeAvailable(vulkan_header)
+
+    if(NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
+        # Ensure the driver supports this Vulkan version
+        check_vulkan_version(INCLUDE_DIR "${vulkan_header_SOURCE_DIR}/include")
+    endif()
+endif()
+
+find_package(Vulkan REQUIRED)
+
+if(Vulkan_FOUND AND NOT TARGET Vulkan::Headers)
+    add_library(Vulkan::Headers INTERFACE IMPORTED)
+    set_target_properties(Vulkan::Headers PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${Vulkan_INCLUDE_DIRS}")
+endif()
+
+if(NOT KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER AND NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
+    # Ensure the driver supports this Vulkan version
+    check_vulkan_version(INCLUDE_DIR ${Vulkan_INCLUDE_DIR})
+endif()
+
+# Spdlog
+if(KOMPUTE_OPT_USE_SPDLOG)
+    add_compile_definitions(KOMPUTE_OPT_USE_SPDLOG=1)
+
+    if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
+        if(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG)
+            set(SPDLOG_BUILD_SHARED ${BUILD_SHARED_LIBS})
+
+            FetchContent_Declare(spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git
+                GIT_TAG v1.10.0) # Source: https://github.com/gabime/spdlog/releases
+            FetchContent_MakeAvailable(spdlog)
+        else()
+            find_package(spdlog REQUIRED)
+        endif()
+    endif()
+endif()
+
+# fmt
+if(KOMPUTE_OPT_USE_BUILT_IN_FMT)
+    FetchContent_Declare(fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git
+        GIT_TAG 10.0.0) # Source: https://github.com/fmtlib/fmt/releases
+    FetchContent_MakeAvailable(fmt)
+else()
+    find_package(fmt REQUIRED)
+endif()
+
+# ####################################################
+# Preprocessor Macros
+# ####################################################
+if(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS)
+    add_compile_definitions(KOMPUTE_DISABLE_VK_DEBUG_LAYERS=1)
+endif()
+
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror")
+endif()
+
+# If glslang is cloned, then SPIRV/GlslangToSpv.h will be used instead of glslang/SPIRV/GlslangToSpv.h
+# As after installation, SPIRV/ header files will be found in glslang/SPIRV/ , more info in #193
+if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
+    add_definitions(-DUSE_EXTERNAL_GLSLANG)
+endif()
+
+# Allow scripts to call main kompute Makefile
+function(kompute_make KOMPUTE_MAKE_TARGET)
+    add_custom_target(${KOMPUTE_MAKE_TARGET}
+        COMMAND make -C ${PROJECT_SOURCE_DIR} ${KOMPUTE_MAKE_TARGET})
+endfunction()
+
+add_executable(xxd external/bin/xxd.c)
+
+add_subdirectory(src)
diff --git a/kompute/LICENSE b/kompute/LICENSE
new file mode 100644
index 0000000000000..821a2723e9a83
--- /dev/null
+++ b/kompute/LICENSE
@@ -0,0 +1,203 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2021 The Institute for Ethical AI & Machine Learning
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/kompute/Makefile b/kompute/Makefile
new file mode 100644
index 0000000000000..62ad68b46ab11
--- /dev/null
+++ b/kompute/Makefile
@@ -0,0 +1,210 @@
+# This makefile is optimized to be run from WSL and to interact with the 
+# Windows host as there are limitations when building GPU programs. This
+# makefile contains the commands for interacting with the visual studio
+# build via command line for faster iterations, as the intention is to 
+# support other editors (optimised for vim). There are also commands that
+# support the builds for linux-native compilations and these are the commands
+# starting with mk_.
+
+VERSION := $(shell cat ./VERSION)
+
+VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsystems\\vcpkg.cmake"
+VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake"
+
+# These are the tests that don't work with swiftshader but can be run directly with vulkan
+FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps:TestPushConstants.TestConstantsDouble"
+
+ifeq ($(OS),Windows_NT)     # is Windows_NT on XP, 2000, 7, Vista, 10...
+	CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe"
+	SCMP_BIN="C:\\VulkanSDK\\1.2.141.2\\Bin32\\glslangValidator.exe"
+	MSBUILD_BIN ?= "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\MSBuild\\Current\\Bin\\MSBuild.exe"
+else
+	CLANG_FORMAT_BIN ?= "/home/alejandro/Programming/lib/clang+llvm-10.0.0-x86_64-linux-gnu-ubuntu-18.04/bin/clang-format"
+	CMAKE_BIN ?= "/c/Program Files/CMake/bin/cmake.exe"
+	MSBUILD_BIN ?= "/c/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
+	# Choosing the binary based on whether it's on WSL or linux-native
+	KERNEL := $(shell uname -r)
+	IS_WSL := $(shell (if [[ "$(KERNEL)" =~ Microsoft$  ]]; then echo '0'; fi))
+	ifeq ($(IS_WSL),0)
+		SCMP_BIN ?= "/c/VulkanSDK/1.2.141.2/Bin32/glslangValidator.exe"
+	else
+		SCMP_BIN ?= "/usr/bin/glslangValidator"
+	endif
+endif
+
+
+####### Main Target Rules #######
+
+push_docs_to_ghpages:
+	GIT_DEPLOY_DIR="build/docs/sphinx/" \
+		GIT_DEPLOY_BRANCH="gh-pages" \
+		GIT_DEPLOY_REPO="origin" \
+			./scripts/push_folder_to_branch.sh
+
+####### CMAKE quickstart commands #######
+
+clean_cmake:
+	rm -rf build/
+
+####### Visual studio build shortcut commands #######
+
+MK_BUILD_TYPE ?= "Release"
+MK_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
+MK_CMAKE_EXTRA_FLAGS ?= ""
+MK_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
+
+mk_cmake:
+	cmake \
+		-Bbuild \
+		-DCMAKE_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \
+		-DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \
+		-DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \
+		-DKOMPUTE_OPT_INSTALL=ON \
+		-DKOMPUTE_OPT_BUILD_TESTS=ON \
+		-DKOMPUTE_OPT_BUILD_DOCS=ON \
+		-DKOMPUTE_OPT_BUILD_SHADERS=ON \
+		-DKOMPUTE_OPT_CODE_COVERAGE=ON \
+		-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+		-DKOMPUTE_OPT_LOG_LEVEL=Debug \
+		$(MK_CMAKE_EXTRA_FLAGS) \
+		-G "Unix Makefiles"
+
+mk_build_all:
+	cmake --build build/. --parallel
+
+mk_build_docs:
+	cmake --build build/. --target gendocsall --parallel
+
+mk_build_kompute:
+	cmake --build build/. --target kompute --parallel
+
+mk_build_tests:
+	cmake --build build/. --target kompute_tests --parallel
+
+mk_run_docs: mk_build_docs
+	(cd build/docs/sphinx && python2.7 -m SimpleHTTPServer)
+
+# An alternative would be: ctest -vv --test-dir build/.
+# But this is not possible since we need to filter specific tests, not complete executables, which is not possible with ctest.
+# https://gitlab.kitware.com/cmake/cmake/-/issues/13168 
+mk_run_tests: mk_build_tests
+	./build/bin/kompute_tests --gtest_filter=$(FILTER_TESTS)
+
+mk_build_swiftshader_library:
+	git clone https://github.com/google/swiftshader || echo "Assuming already cloned"
+	# GCC 8 or above is required otherwise error on "filesystem" lib will appear
+	CC="/usr/bin/gcc-8" CXX="/usr/bin/g++-8" cmake swiftshader/. -Bswiftshader/build/
+	cmake --build swiftshader/build/. --parallel
+
+mk_run_tests_cpu: export VK_ICD_FILENAMES=$(PWD)/swiftshader/build/vk_swiftshader_icd.json
+mk_run_tests_cpu: mk_build_swiftshader_library mk_build_tests mk_run_tests_cpu_only
+
+
+####### Visual studio build shortcut commands #######
+
+VS_BUILD_TYPE ?= "Debug"
+# Run with multiprocessin / parallel build by default
+VS_CMAKE_EXTRA_FLAGS ?= ""
+VS_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
+VS_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
+
+vs_cmake:
+	$(CMAKE_BIN) \
+		-Bbuild \
+		$(VS_CMAKE_EXTRA_FLAGS) \
+		-DCMAKE_TOOLCHAIN_FILE=$(VCPKG_WIN_PATH) \
+		-DCMAKE_CXX_FLAGS=$(VS_KOMPUTE_EXTRA_CXX_FLAGS) \
+		-DCMAKE_INSTALL_PREFIX=$(VS_INSTALL_PATH) \
+		-DKOMPUTE_OPT_INSTALL=ON \
+		-DKOMPUTE_OPT_BUILD_TESTS=ON \
+		-DKOMPUTE_OPT_BUILD_SHADERS=ON \
+		-DKOMPUTE_OPT_CODE_COVERAGE=OFF \
+		-DKOMPUTE_OPT_BUILD_DOCS=OFF \
+		-G "Visual Studio 16 2019" \
+		-DCMAKE_BUILD_TYPE=$(VS_BUILD_TYPE)
+
+vs_build_all:
+	cmake --build build/. --parallel
+
+vs_build_docs:
+	cmake --build build/. --target gendocsall --parallel
+
+vs_install_kompute:
+	cmake --build build/. --target install --parallel
+
+vs_build_kompute:
+	cmake --build build/. --target kompute --parallel
+
+vs_build_tests:
+	cmake --build build/. --target kompute_tests --parallel
+
+vs_run_docs: vs_build_docs
+	(cd build/docs/sphinx && python2.7 -m SimpleHTTPServer)
+
+vs_run_tests: vs_build_tests
+	./build/test/$(VS_BUILD_TYPE)/bin/kompute_tests.exe --gtest_filter=$(FILTER_TESTS)
+
+
+#### PYTHONG ####
+
+test_python:
+	python3 -m pytest -s --log-cli-level=DEBUG -v python/test/
+
+####### Run CI Commands #######
+
+# This command uses act to replicate github action
+# https://github.com/nektos/act
+run_ci:
+	act
+
+####### General project commands #######
+
+generate_python_docstrings:
+	python -m pybind11_mkdoc \
+		-o python/src/docstrings.hpp \
+		kompute/Kompute.hpp \
+		-Iexternal/fmt/include/ \
+		-Iexternal/spdlog/include/ \
+		-Iexternal/glslang/ \
+		-I/usr/include/c++/7.5.0/
+
+install_python_reqs:
+	python3 -m pip install -r scripts/requirements.txt
+
+install_lcov:
+	sudo apt install lcov -y
+
+build_shaders:
+	python3 scripts/convert_shaders.py \
+		--shader-path shaders/glsl \
+		--shader-binary $(SCMP_BIN) \
+		--header-path src/include/kompute/shaders/ \
+		-v
+	python3 scripts/convert_shaders.py \
+		--shader-path test/shaders/glsl \
+		--shader-binary $(SCMP_BIN) \
+		--header-path test/compiled_shaders_include/kompute_test/shaders/ \
+		-v
+
+build_single_header:
+	quom \
+		--include_directory \
+		"src/include/" \
+		"single_include/AggregateHeaders.cpp" \
+		"single_include/kompute/Kompute.hpp"
+
+win_build_xxd:
+	cd external/bin/ && gcc.exe -o xxd.exe xxd.c -DCYGWIN
+
+format:
+	for val in "examples single_include src test" ; do \
+    	find $$val -depth -iname *.h -or -iname *.c -or -iname *.hpp -or -iname *.cpp | grep -v "shaders" | xargs $(CLANG_FORMAT_BIN) -style=file -i; \
+	done
+
+static_scan:
+	cppcheck --project=build/compile_commands.json -iexternal/
+
+build_changelog:
+	docker run --rm -it -v "$(PWD)":/usr/local/src/your-app -e CHANGELOG_GITHUB_TOKEN=${CHANGELOG_GITHUB_TOKEN} ferrarimarco/github-changelog-generator:1.15.2 -u KomputeProject -p kompute
+	chmod 664 CHANGELOG.md # (Read+Write, Read+Write, Read)
+	sed -i -e 's/\(HEAD\|Unreleased\)/v${VERSION}/g' CHANGELOG.md # Replacing unreleased version with latest tag
diff --git a/kompute/README.md b/kompute/README.md
new file mode 100644
index 0000000000000..b169da254bcd8
--- /dev/null
+++ b/kompute/README.md
@@ -0,0 +1,513 @@
+
+![GitHub](https://img.shields.io/badge/Version-0.7.0-green.svg)
+![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
+![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
+![GitHub](https://img.shields.io/badge/Python-3.7—3.9-blue.svg)
+![GitHub](https://img.shields.io/badge/License-Apache-black.svg)
+[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/4834/badge)](https://bestpractices.coreinfrastructure.org/projects/4834)
+
+<table>
+<tr>
+
+<td width="20%">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute.jpg">
+</td>
+
+<td>
+
+<h1>Kompute</h1>
+<h3>The general purpose GPU compute framework for cross vendor graphics cards (AMD, Qualcomm, NVIDIA & friends)</h3>
+
+</td>
+
+</tr>
+</table>
+
+<h4>Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU acceleration usecases.</h4>
+
+💬 [Join the Discord & Community Calls](https://kompute.cc/overview/community.html) 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾
+
+<hr>
+
+##### Kompute is backed by the Linux Foundation as a <a href="https://lfaidata.foundation/blog/2021/08/26/kompute-joins-lf-ai-data-as-new-sandbox-project/">hosted project</a> by the LF AI & Data Foundation.
+
+<table>
+<tr>
+<td>
+<a href="https://www.linuxfoundation.org/projects/">
+<img src="https://upload.wikimedia.org/wikipedia/commons/b/b5/Linux_Foundation_logo.png">
+</a>
+</td>
+<td>
+<a href="https://lfaidata.foundation/projects/">
+<img src="https://raw.githubusercontent.com/lfai/artwork/main/lfaidata-assets/lfaidata/horizontal/color/lfaidata-horizontal-color.png">
+</a>
+</td>
+</tr>
+</table>
+
+
+## Principles & Features
+
+* [Flexible Python module](#your-first-kompute-python) with [C++ SDK](#your-first-kompute-c) for optimizations
+* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
+* [Mobile enabled](#mobile-enabled) with examples via Android NDK across several architectures
+* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
+* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
+* Robust codebase with [90% unit test code coverage](https://kompute.cc/codecov/)
+* Advanced use-cases on [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) and [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0).
+* Active community with [monthly calls, discord chat and more](https://kompute.cc/overview/community.html)
+
+![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/komputer-logos.gif)
+
+## Getting Started
+
+Below you can find a GPU multiplication example using the C++ and Python Kompute interfaces.
+
+You can [join the Discord](https://discord.gg/MaH5Jv5zwv) for questions / discussion, open a [github issue](https://github.com/KomputeProject/kompute/issues/new), or read [the documentation](https://kompute.cc/).
+
+### Your First Kompute (C++)
+
+The C++ interface provides low level access to the native components of Kompute, enabling for [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html).
+
+```c++
+
+void kompute(const std::string& shader) {
+
+    // 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
+    kp::Manager mgr; 
+
+    // 2. Create and initialise Kompute Tensors through manager
+
+    // Default tensor constructor simplifies creation of float values
+    auto tensorInA = mgr.tensor({ 2., 2., 2. });
+    auto tensorInB = mgr.tensor({ 1., 2., 3. });
+    // Explicit type constructor supports uint32, int32, double, float and bool
+    auto tensorOutA = mgr.tensorT<uint32_t>({ 0, 0, 0 });
+    auto tensorOutB = mgr.tensorT<uint32_t>({ 0, 0, 0 });
+
+    std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
+
+    // 3. Create algorithm based on shader (supports buffers & push/spec constants)
+    kp::Workgroup workgroup({3, 1, 1});
+    std::vector<float> specConsts({ 2 });
+    std::vector<float> pushConstsA({ 2.0 });
+    std::vector<float> pushConstsB({ 3.0 });
+
+    auto algorithm = mgr.algorithm(params,
+                                   // See documentation shader section for compileSource
+                                   compileSource(shader),
+                                   workgroup,
+                                   specConsts,
+                                   pushConstsA);
+
+    // 4. Run operation synchronously using sequence
+    mgr.sequence()
+        ->record<kp::OpTensorSyncDevice>(params)
+        ->record<kp::OpAlgoDispatch>(algorithm) // Binds default push consts
+        ->eval() // Evaluates the two recorded operations
+        ->record<kp::OpAlgoDispatch>(algorithm, pushConstsB) // Overrides push consts
+        ->eval(); // Evaluates only last recorded operation
+
+    // 5. Sync results from the GPU asynchronously
+    auto sq = mgr.sequence();
+    sq->evalAsync<kp::OpTensorSyncLocal>(params);
+
+    // ... Do other work asynchronously whilst GPU finishes
+
+    sq->evalAwait();
+
+    // Prints the first output which is: { 4, 8, 12 }
+    for (const float& elem : tensorOutA->vector()) std::cout << elem << "  ";
+    // Prints the second output which is: { 10, 10, 10 }
+    for (const float& elem : tensorOutB->vector()) std::cout << elem << "  ";
+
+} // Manages / releases all CPU and GPU memory resources
+
+int main() {
+
+    // Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
+    // files). This shader shows some of the main components including constants, buffers, etc
+    std::string shader = (R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
+        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
+
+        // Kompute supports push constants updated on dispatch
+        layout(push_constant) uniform PushConstants {
+            float val;
+        } push_const;
+
+        // Kompute also supports spec constants on initalization
+        layout(constant_id = 0) const float const_one = 0;
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            out_a[index] += uint( in_a[index] * in_b[index] );
+            out_b[index] += uint( const_one * push_const.val );
+        }
+    )");
+
+    // Run the function declared above with our raw string shader
+    kompute(shader);
+}
+
+```
+
+### Your First Kompute (Python)
+
+The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows.
+
+```python
+
+from .utils import compile_source # using util function from python/test/utils
+
+def kompute(shader):
+    # 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
+    mgr = kp.Manager()
+
+    # 2. Create and initialise Kompute Tensors through manager
+
+    # Default tensor constructor simplifies creation of float values
+    tensor_in_a = mgr.tensor([2, 2, 2])
+    tensor_in_b = mgr.tensor([1, 2, 3])
+    # Explicit type constructor supports uint32, int32, double, float and bool
+    tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
+    tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
+
+    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
+
+    # 3. Create algorithm based on shader (supports buffers & push/spec constants)
+    workgroup = (3, 1, 1)
+    spec_consts = [2]
+    push_consts_a = [2]
+    push_consts_b = [3]
+
+    # See documentation shader section for compile_source
+    spirv = compile_source(shader)
+
+    algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a)
+
+    # 4. Run operation synchronously using sequence
+    (mgr.sequence()
+        .record(kp.OpTensorSyncDevice(params))
+        .record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided
+        .eval() # evaluates the two recorded ops
+        .record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts
+        .eval()) # evaluates only the last recorded op
+
+    # 5. Sync results from the GPU asynchronously
+    sq = mgr.sequence()
+    sq.eval_async(kp.OpTensorSyncLocal(params))
+
+    # ... Do other work asynchronously whilst GPU finishes
+
+    sq.eval_await()
+
+    # Prints the first output which is: { 4, 8, 12 }
+    print(tensor_out_a)
+    # Prints the first output which is: { 10, 10, 10 }
+    print(tensor_out_b)
+
+if __name__ == "__main__":
+
+    # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
+    # files). This shader shows some of the main components including constants, buffers, etc
+    shader = """
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
+        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
+
+        // Kompute supports push constants updated on dispatch
+        layout(push_constant) uniform PushConstants {
+            float val;
+        } push_const;
+
+        // Kompute also supports spec constants on initalization
+        layout(constant_id = 0) const float const_one = 0;
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            out_a[index] += uint( in_a[index] * in_b[index] );
+            out_b[index] += uint( const_one * push_const.val );
+        }
+    """
+
+    kompute(shader)
+
+```
+
+### Interactive Notebooks & Hands on Videos
+
+You are able to try out the interactive Colab Notebooks which allow you to use a free GPU. The available examples are the Python and C++ examples below:
+
+<table>
+<tr>
+
+<td width="50%">
+<h5>Try the interactive <a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?usp=sharing">C++ Colab</a> from <a href="https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a">Blog Post</a></h5>
+</td>
+
+<td>
+<h5>Try the interactive <a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">Python Colab</a> from <a href="https://towardsdatascience.com/beyond-cuda-gpu-accelerated-python-for-machine-learning-in-cross-vendor-graphics-cards-made-simple-6cc828a45cc3">Blog Post</a></h5>
+</td>
+
+</tr>
+<tr>
+
+<td width="50%">
+<a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?authuser=1#scrollTo=1BipBsO-fQRD">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-cpp.jpg">
+</a>
+</td>
+
+<td>
+<a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-python.jpg">
+</a>
+</td>
+
+</tr>
+</table>
+
+
+You can also check out the two following talks presented at the FOSDEM 2021 conference. 
+
+Both videos have timestamps which will allow you to skip to the most relevant section for you - the intro & motivations for both is almost the same so you can skip to the more specific content.
+
+<table>
+<tr>
+
+<td width="50%">
+<h5>Watch the video for <a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">C++ Enthusiasts</a> </h5>
+</td>
+
+<td>
+<h5>Watch the video for <a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">Python & Machine Learning</a> Enthusiasts</h5>
+</td>
+
+</tr>
+<tr>
+
+<td width="50%">
+<a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-cpp-video.png">
+</a>
+</td>
+
+<td>
+<a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-python-video.png">
+</a>
+</td>
+
+</tr>
+</table>
+
+
+## Architectural Overview
+
+The core architecture of Kompute includes the following:
+* [Kompute Manager](https://kompute.cc/overview/reference.html#manager) - Base orchestrator which creates and manages device and child components
+* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
+* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
+* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
+* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed in the GPU
+
+To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).
+
+<table>
+<th>
+Full Architecture
+</th>
+<th>
+Simplified Kompute Components
+</th>
+<tr>
+<td width=30%>
+
+
+<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-vulkan-architecture.jpg">
+
+<br>
+<br>
+(very tiny, check the <a href="https://ethicalml.github.io/vulkan-kompute/overview/reference.html">full reference diagram in docs for details</a>)
+<br>
+<br>
+
+<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/suspicious.jfif">
+
+</td>
+<td>
+<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-architecture.jpg">
+</td>
+</tr>
+</table>
+
+
+## Asynchronous and Parallel Operations
+
+Kompute provides flexibility to run operations in an asynrchonous way through vk::Fences. Furthermore, Kompute enables for explicit allocation of queues, which allow for parallel execution of operations across queue families.
+
+The image below provides an intuition on how Kompute Sequences can be allocated to different queues to enable parallel execution based on hardware. You can see the [hands on example](https://kompute.cc/overview/advanced-examples.html#parallel-operations), as well as the [detailed documentation page](https://kompute.cc/overview/async-parallel.html) describing how it would work using an NVIDIA 1650 as an example. 
+
+![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/queue-allocation.jpg)
+
+## Mobile Enabled
+
+Kompute has been optimized to work in mobile environments. The [build system](#build-overview) enables for dynamic loading of the Vulkan shared library for Android environments, together with a working [Android NDK wrapper](https://github.com/KomputeProject/kompute/tree/master/vk_ndk_wrapper_include) for the CPP headers.
+
+<table>
+<tr>
+
+<td width="70%">
+<p>
+For a full deep dive you can read the blog post "<a href="https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617">Supercharging your Mobile Apps with On-Device GPU Accelerated Machine Learning</a>". 
+
+You can also access the <a href="https://github.com/KomputeProject/kompute/tree/v0.4.0/examples/android/android-simple">end-to-end example code</a> in the repository, which can be run using android studio.
+
+</p>
+
+
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-editor.jpg">
+
+</td>
+
+
+<td width="30%">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-kompute.jpg">
+</td>
+
+</tr>
+</table>
+
+## More examples
+
+### Simple examples
+
+* [Simple multiplication example](https://kompute.cc/overview/advanced-examples.html#simple-shader-example)
+* [Record batch commands with a Kompute Sequence](https://kompute.cc/overview/advanced-examples.html#record-batch-commands)
+* [Run Asynchronous Operations](https://kompute.cc/overview/advanced-examples.html#asynchronous-operations)
+* [Run Parallel Operations Across Multiple GPU Queues](https://kompute.cc/overview/advanced-examples.html#parallel-operations)
+* [Create your custom Kompute Operations](https://kompute.cc/overview/advanced-examples.html#your-custom-kompute-operation)
+* [Implementing logistic regression from scratch](https://kompute.cc/overview/advanced-examples.html#logistic-regression-example)
+
+### End-to-end examples
+
+* [Machine Learning Logistic Regression Implementation](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a)
+* [Parallelizing GPU-intensive Workloads via Multi-Queue Operations](https://towardsdatascience.com/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc)
+* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
+* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
+
+## Python Package
+
+Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc.
+
+The only dependencies are Python 3.5+ and Cmake 3.4.1+. You can install Kompute from the [Python pypi package](https://pypi.org/project/kp/) using the following command.
+
+```
+pip install kp
+```
+
+You can also install from master branch using:
+
+```
+pip install git+git://github.com/KomputeProject/kompute.git@master
+```
+
+For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
+
+## C++ Build Overview
+
+The build system provided uses `cmake`, which allows for cross platform builds.
+
+The top level `Makefile` provides a set of optimized configurations for development as well as the docker image build, but you can start a build with the following command:
+
+```
+   cmake -Bbuild
+```
+
+You also are able to add Kompute in your repo with `add_subdirectory` - the [Android example CMakeLists.txt file](https://github.com/KomputeProject/kompute/blob/7c8c0eeba2cdc098349fcd999102bb2cca1bf711/examples/android/android-simple/app/src/main/cpp/CMakeLists.txt#L3) shows how this would be done.
+
+For a more advanced overview of the build configuration check out the [Build System Deep Dive](https://kompute.cc/overview/build-system.html) documentation.
+
+## Kompute Development
+
+We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Kompute and reporting issues is a great contribution!
+
+### Contributing
+
+#### Dev Dependencies
+
+* Testing
+    + GTest
+* Documentation
+    + Doxygen (with Dot)
+    + Sphynx
+
+#### Development
+
+* Follows Mozilla C++ Style Guide https://www-archive.mozilla.org/hacking/mozilla-style-guide.html
+    + Uses post-commit hook to run the linter, you can set it up so it runs the linter before commit
+    + All dependencies are defined in vcpkg.json 
+* Uses cmake as build system, and provides a top level makefile with recommended command
+* Uses xxd (or xxd.exe windows 64bit port) to convert shader spirv to header files
+* Uses doxygen and sphinx for documentation and autodocs
+* Uses vcpkg for finding the dependencies, it's the recommended set up to retrieve the libraries
+
+If you want to run with debug layers you can add them with the `KOMPUTE_ENV_DEBUG_LAYERS` parameter as:
+
+```
+export KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_LUNARG_api_dump"
+```
+
+##### Updating documentation
+
+To update the documentation you will need to:
+* Run the gendoxygen target in the build system
+* Run the gensphynx target in the build-system 
+* Push to github pages with `make push_docs_to_ghpages`
+
+##### Running tests
+
+Running the unit tests has been significantly simplified for contributors.
+
+The tests run on CPU, and can be triggered using the ACT command line interface (https://github.com/nektos/act) - once you install the command line (And start the Docker daemon) you just have to type:
+
+```
+$ act
+
+[Python Tests/python-tests] 🚀  Start image=axsauze/kompute-builder:0.2
+[C++ Tests/cpp-tests      ] 🚀  Start image=axsauze/kompute-builder:0.2
+[C++ Tests/cpp-tests      ]   🐳  docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
+[Python Tests/python-tests]   🐳  docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
+...
+```
+
+The repository contains unit tests for the C++ and Python code, and can be found under the `test/` and `python/test` folder.
+
+The tests are currently run through the CI using Github Actions. It uses the images found in `docker-builders/`.
+
+In order to minimise hardware requirements the tests can run without a GPU, directly in the CPU using [Swiftshader](https://github.com/google/swiftshader).
+
+For more information on how the CI and tests are setup, you can go to the [CI, Docker and Tests Section](https://kompute.cc/overview/ci-tests.html) in the documentation.
+
+## Motivations
+
+This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
+
+The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of the Vulkan SDK. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
+
+We are currently developing Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on the Vulkan SDK's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Kompute architecture.
diff --git a/kompute/cmake/bin2h.cmake b/kompute/cmake/bin2h.cmake
new file mode 100644
index 0000000000000..21ad56cb11cd5
--- /dev/null
+++ b/kompute/cmake/bin2h.cmake
@@ -0,0 +1,106 @@
+##################################################################################
+# Based on: https://github.com/sivachandran/cmake-bin2h
+#
+# Copyright 2020 Sivachandran Paramasivam
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+##################################################################################
+
+include(CMakeParseArguments)
+
+# Function to wrap a given string into multiple lines at the given column position.
+# Parameters:
+#   VARIABLE    - The name of the CMake variable holding the string.
+#   AT_COLUMN   - The column position at which string will be wrapped.
+function(WRAP_STRING)
+    set(oneValueArgs VARIABLE AT_COLUMN)
+    cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})
+
+    string(LENGTH ${${WRAP_STRING_VARIABLE}} stringLength)
+    math(EXPR offset "0")
+
+    while(stringLength GREATER 0)
+
+        if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
+            math(EXPR length "${WRAP_STRING_AT_COLUMN}")
+        else()
+            math(EXPR length "${stringLength}")
+        endif()
+
+        string(SUBSTRING ${${WRAP_STRING_VARIABLE}} ${offset} ${length} line)
+        set(lines "${lines}\n${line}")
+
+        math(EXPR stringLength "${stringLength} - ${length}")
+        math(EXPR offset "${offset} + ${length}")
+    endwhile()
+
+    set(${WRAP_STRING_VARIABLE} "${lines}" PARENT_SCOPE)
+endfunction()
+
+# Function to embed contents of a file as byte array in C/C++ header file(.h). The header file
+# will contain a byte array and integer variable holding the size of the array.
+# Parameters
+#   SOURCE_FILE      - The path of source file whose contents will be embedded in the header file.
+#   VARIABLE_NAME    - The name of the variable for the byte array. The string "_SIZE" will be append
+#                      to this name and will be used a variable name for size variable.
+#   HEADER_FILE      - The path of header file.
+#   APPEND           - If specified appends to the header file instead of overwriting it
+#   NULL_TERMINATE   - If specified a null byte(zero) will be append to the byte array. This will be
+#                      useful if the source file is a text file and we want to use the file contents
+#                      as string. But the size variable holds size of the byte array without this
+#                      null byte.
+#   HEADER_NAMESPACE - The namespace, where the array should be located in.
+#   IS_BIG_ENDIAN    - If set to true, will not revers the byte order for the uint32_t to match the
+#                      big endian system architecture
+# Usage:
+#   bin2h(SOURCE_FILE "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
+function(BIN2H)
+    set(options APPEND NULL_TERMINATE)
+    set(oneValueArgs SOURCE_FILE VARIABLE_NAME HEADER_FILE)
+    cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}" "" ${ARGN})
+
+    # reads source file contents as hex string
+    file(READ ${BIN2H_SOURCE_FILE} hexString HEX)
+    string(LENGTH ${hexString} hexStringLength)
+
+    # appends null byte if asked
+    if(BIN2H_NULL_TERMINATE)
+        set(hexString "${hexString}00")
+    endif()
+
+    # wraps the hex string into multiple lines at column 32(i.e. 16 bytes per line)
+    wrap_string(VARIABLE hexString AT_COLUMN 32)
+    math(EXPR arraySize "${hexStringLength} / 8")
+
+    # adds '0x' prefix and comma suffix before and after every byte respectively
+    if(IS_BIG_ENDIAN)
+        message(STATUS "Interpreting shader in big endian...")
+        string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\1\\2\\3\\4, " arrayValues ${hexString})
+    else()
+        message(STATUS "Interpreting shader in little endian...")
+        string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\4\\3\\2\\1, " arrayValues ${hexString})
+    endif()
+    # removes trailing comma
+    string(REGEX REPLACE ", $" "" arrayValues ${arrayValues})
+
+    # converts the variable name into proper C identifier
+    string(MAKE_C_IDENTIFIER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
+    string(TOUPPER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
+
+    # declares byte array and the length variables
+    set(namespaceStart "namespace ${HEADER_NAMESPACE} {")
+    set(namespaceEnd "} // namespace ${HEADER_NAMESPACE}")
+    set(arrayIncludes "#pragma once\n#include <array>\n#include <cstdint>")
+    set(arrayDefinition "const std::array<uint32_t, ${arraySize}> ${BIN2H_VARIABLE_NAME} = { ${arrayValues} };")
+
+    set(declarations "${arrayIncludes}\n\n${namespaceStart}\n${arrayDefinition}\n${namespaceEnd}\n\n")
+    if(BIN2H_APPEND)
+        file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
+    else()
+        file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
+    endif()
+endfunction()
\ No newline at end of file
diff --git a/kompute/cmake/bin_file_to_header.cmake b/kompute/cmake/bin_file_to_header.cmake
new file mode 100644
index 0000000000000..b47b3613939e9
--- /dev/null
+++ b/kompute/cmake/bin_file_to_header.cmake
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.20)
+
+if(${INPUT_SHADER_FILE} STREQUAL "")
+    message(FATAL_ERROR "No input file path provided via 'INPUT_SHADER_FILE'.")
+endif()
+
+if(${OUTPUT_HEADER_FILE} STREQUAL "")
+    message(FATAL_ERROR "No output file path provided via 'OUTPUT_HEADER_FILE'.")
+endif()
+
+if(${HEADER_NAMESPACE} STREQUAL "")
+    message(FATAL_ERROR "No header namespace provided via 'HEADER_NAMESPACE'.")
+endif()
+
+include(bin2h.cmake)
+
+get_filename_component(BINARY_FILE_CONTENT ${INPUT_SHADER_FILE} NAME)
+bin2h(SOURCE_FILE ${INPUT_SHADER_FILE} HEADER_FILE ${OUTPUT_HEADER_FILE} VARIABLE_NAME ${BINARY_FILE_CONTENT} HEADER_NAMESPACE ${HEADER_NAMESPACE})
+file(APPEND ${OUTPUT_HEADER_FILE} "\n")
\ No newline at end of file
diff --git a/kompute/cmake/check_vulkan_version.cmake b/kompute/cmake/check_vulkan_version.cmake
new file mode 100644
index 0000000000000..0372d32060d8b
--- /dev/null
+++ b/kompute/cmake/check_vulkan_version.cmake
@@ -0,0 +1,139 @@
+# Current issue: Only checks the result of GPU0
+function(check_vulkan_version)
+    cmake_parse_arguments(VULKAN_CHECK_VERSION "" "INCLUDE_DIR" "" ${ARGN})
+    message(STATUS "Ensuring the currently installed driver supports the Vulkan version requested by the Vulkan Header.")
+
+    # Get the current Vulkan Header version (e.g. 1.2.189).
+    # This snippet is based on: https://gitlab.kitware.com/cmake/cmake/-/blob/v3.23.1/Modules/FindVulkan.cmake#L140-156
+    if(VULKAN_CHECK_VERSION_INCLUDE_DIR)
+        set(VULKAN_CORE_H ${VULKAN_CHECK_VERSION_INCLUDE_DIR}/vulkan/vulkan_core.h)
+        if(EXISTS ${VULKAN_CORE_H})
+            file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE REGEX "^#define VK_HEADER_VERSION ")
+            string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION "${VULKAN_HEADER_VERSION_LINE}")
+            file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE2 REGEX "^#define VK_HEADER_VERSION_COMPLETE ")
+            if(NOT ${VULKAN_HEADER_VERSION_LINE2} STREQUAL "")
+                string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION2 "${VULKAN_HEADER_VERSION_LINE2}")
+                list(LENGTH VULKAN_HEADER_VERSION2 _len)
+                # Versions >= 1.2.175 have an additional numbers in front of e.g. '0, 1, 2' instead of '1, 2'
+                if(_len EQUAL 3)
+                    list(REMOVE_AT VULKAN_HEADER_VERSION2 0)
+                endif()
+                list(APPEND VULKAN_HEADER_VERSION2 ${VULKAN_HEADER_VERSION})
+                list(JOIN VULKAN_HEADER_VERSION2 "." VULKAN_HEADER_VERSION)
+            else()
+                file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_2 REGEX "^#define VK_API_VERSION_1_2.*")
+                if(NOT ${VULKAN_HEADER_API_VERSION_1_2} STREQUAL "")
+                    set(VULKAN_HEADER_VERSION "1.2.${VULKAN_HEADER_VERSION}")
+                else()
+                    file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_1 REGEX "^#define VK_API_VERSION_1_1.*")
+                    if(NOT ${VULKAN_HEADER_API_VERSION_1_1} STREQUAL "")
+                        set(VULKAN_HEADER_VERSION "1.1.${VULKAN_HEADER_VERSION}")
+                    else()
+                        message(FATAL_ERROR "'${VULKAN_CORE_H}' does not contain a supported Vulkan version. Probably because its < 1.2.0.")
+                    endif()
+                endif()
+            endif()
+        else()
+            message(FATAL_ERROR "'${VULKAN_CORE_H}' does not exist. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
+            return()
+        endif()
+    else()
+        message(FATAL_ERROR "Invalid Vulkan include directory given. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
+        return()
+    endif()
+    message(STATUS "Found Vulkan Header version: ${VULKAN_HEADER_VERSION}")
+
+    # Get Vulkan version supported by driver
+    find_program(VULKAN_INFO_PATH NAMES vulkaninfo)
+    if(VULKAN_INFO_PATH STREQUAL "VULKAN_INFO_PATH-NOTFOUND")
+        message(FATAL_ERROR "vulkaninfo not found. The Vulkan SDK might not be installed properly. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
+        return()
+    endif()
+
+    execute_process(COMMAND "vulkaninfo"
+                    OUTPUT_VARIABLE VULKAN_INFO_OUTPUT
+                    RESULT_VARIABLE VULKAN_INFO_RETURN)
+    if(NOT ${VULKAN_INFO_RETURN} EQUAL 0)
+        message(FATAL_ERROR "Running vulkaninfo failed with return code ${VULKAN_INFO_RETURN}. Make sure you have 'vulkan-tools' installed. Result:\n${VULKAN_INFO_OUTPUT}?")
+        return()
+    else()
+        message(STATUS "Running vulkaninfo was successful. Parsing the output...")
+    endif()
+
+    # Check if running vulkaninfo was successfully
+    string(FIND "${VULKAN_INFO_OUTPUT}" "Vulkan Instance Version" VULKAN_INFO_SUCCESSFUL)
+    if(VULKAN_INFO_SUCCESSFUL LESS 0)
+        message(FATAL_ERROR "Running vulkaninfo failed. Make sure you have 'vulkan-tools' installed and DISPLAY is configured. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON). Result:\n${VULKAN_INFO_OUTPUT}?")
+    endif()
+
+    string(REGEX MATCHALL "(GPU[0-9]+)" GPU_IDS "${VULKAN_INFO_OUTPUT}")
+    if(NOT GPU_IDS)
+        message(FATAL_ERROR "No GPU supporting Vulkan found in vulkaninfo. Does your GPU (driver) support Vulkan?")
+    endif()
+
+    string(REGEX MATCHALL "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*" GPU_API_VERSIONS ${VULKAN_INFO_OUTPUT})
+    if(NOT GPU_API_VERSIONS)
+        message(FATAL_ERROR "No valid Vulkan API version found in vulkaninfo. Does your GPU (driver) support Vulkan?")
+    endif()
+
+    # Check length
+    # message(FATAL_ERROR "GPUS: ${GPU_IDS}")
+    list(LENGTH GPU_IDS GPU_IDS_LENGTH)
+    list(LENGTH GPU_API_VERSIONS GPU_API_VERSIONS_LENGTH)
+    if(NOT ${GPU_IDS_LENGTH} EQUAL ${GPU_API_VERSIONS_LENGTH})
+        message(FATAL_ERROR "Found ${GPU_IDS_LENGTH} GPUs, but ${GPU_API_VERSIONS_LENGTH} API versions in vulkaninfo. We expected to find an equal amount of them.")
+    endif()
+
+    # Compare versions
+    set(VALID_GPU "")
+    set(VALID_VULKAN_VERSION "")
+    math(EXPR ITER_LEN "${GPU_IDS_LENGTH} - 1")
+    foreach(INDEX RANGE ${ITER_LEN})
+        list(GET GPU_IDS ${INDEX} GPU)
+        list(GET GPU_API_VERSIONS ${INDEX} API_VERSION)
+
+        # Extract API version
+        if(${API_VERSION} MATCHES "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*")
+            set(VULKAN_DRIVER_VERSION ${CMAKE_MATCH_1})
+        else()
+            message(FATAL_ERROR "API version match failed. This should not have happened...")
+        endif()
+
+        message(STATUS "${GPU} supports Vulkan API version '${VULKAN_DRIVER_VERSION}'.")
+
+        # Compare driver and header version
+        if(${VULKAN_DRIVER_VERSION} VERSION_LESS ${VULKAN_HEADER_VERSION})
+        # Version missmatch. Let us check if the minor version is the same.
+            if(${VULKAN_DRIVER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
+                set(VULKAN_DRIVER_MINOR_VERSION ${CMAKE_MATCH_1})
+            else()
+                message(FATAL_ERROR "Invalid Vulkan driver version '${VULKAN_DRIVER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
+            endif()
+            if(${VULKAN_HEADER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
+                set(VULKAN_HEADER_MINOR_VERSION ${CMAKE_MATCH_1})
+            else()
+                message(FATAL_ERROR "Invalid Vulkan Header version '${VULKAN_HEADER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
+            endif()
+
+            if(${VULKAN_DRIVER_MINOR_VERSION} EQUAL ${VULKAN_HEADER_MINOR_VERSION})
+                message(WARNING "Your GPU driver does not support Vulkan > ${VULKAN_DRIVER_VERSION}, but you try to use Vulkan Header ${VULKAN_HEADER_VERSION}. At least your driver supports the same minor version (${VULKAN_DRIVER_MINOR_VERSION}), so this should be fine but keep it in mind in case you encounter any strange behavior.")
+                set(VALID_GPU ${GPU})
+                set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
+                break()
+            else()
+                message(STATUS "${GPU} does not support Vulkan > ${VULKAN_DRIVER_VERSION}.")
+            endif()
+        else()
+            set(VALID_GPU ${GPU})
+            set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
+            break()
+        endif()
+    endforeach()
+
+    if("${VALID_GPU}" STREQUAL "")
+        message(FATAL_ERROR "None of your GPUs supports Vulkan Header ${VULKAN_HEADER_VERSION}. Please try updating your driver, or downgrade your Vulkan headers. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
+    else()
+        message("Valid GPU (${VALID_GPU}) for Vulkan header version ${VULKAN_HEADER_VERSION} found. ${VALID_GPU} supports up to Vulkan ${VALID_VULKAN_VERSION}.")
+    endif()
+
+endfunction()
diff --git a/kompute/cmake/code_coverage.cmake b/kompute/cmake/code_coverage.cmake
new file mode 100644
index 0000000000000..7fb6ce264b6ab
--- /dev/null
+++ b/kompute/cmake/code_coverage.cmake
@@ -0,0 +1,35 @@
+# Code coverage
+set(CMAKE_BUILD_TYPE COVERAGE CACHE INTERNAL "Coverage build enabled")
+message(STATUS "Enabling gcov support")
+
+if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    set(COVERAGE_FLAG "--coverage")
+endif()
+
+set(CMAKE_CXX_FLAGS_COVERAGE
+    "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
+    CACHE STRING "Flags used by the C++ compiler during coverage builds."
+    FORCE)
+set(CMAKE_C_FLAGS_COVERAGE
+    "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
+    CACHE STRING "Flags used by the C compiler during coverage builds."
+    FORCE)
+set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
+    ""
+    CACHE STRING "Flags used for linking binaries during coverage builds."
+    FORCE)
+set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
+    ""
+    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
+    FORCE)
+
+set(CODECOV_DIR ${CMAKE_CURRENT_BINARY_DIR}/codecov/)
+set(CODECOV_DIR_LCOV ${CODECOV_DIR}lcov/)
+set(CODECOV_FILENAME_LCOV_INFO lcov.info)
+set(CODECOV_FILENAME_LCOV_INFO_FULL lcov_full.info)
+set(CODECOV_DIR_HTML ${CODECOV_DIR}html/)
+
+mark_as_advanced(CMAKE_CXX_FLAGS_COVERAGE
+    CMAKE_C_FLAGS_COVERAGE
+    CMAKE_EXE_LINKER_FLAGS_COVERAGE
+    CMAKE_SHARED_LINKER_FLAGS_COVERAGE)
diff --git a/kompute/cmake/deprecation_warnings.cmake b/kompute/cmake/deprecation_warnings.cmake
new file mode 100644
index 0000000000000..1ed1f455507a8
--- /dev/null
+++ b/kompute/cmake/deprecation_warnings.cmake
@@ -0,0 +1,15 @@
+if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
+    message(FATAL_ERROR "'KOMPUTE_OPT_REPO_SUBMODULE_BUILD' got replaced by 'KOMPUTE_OPT_USE_BUILT_IN_SPDLOG', 'KOMPUTE_OPT_USE_BUILT_IN_FMT', 'KOMPUTE_OPT_USE_BUILT_IN_GOOGLE_TEST', 'KOMPUTE_OPT_USE_BUILT_IN_PYBIND11' and 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER'. Please use them instead.")
+endif()
+
+if(KOMPUTE_OPT_BUILD_AS_SHARED_LIB)
+    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_AS_SHARED_LIB' is deprecated and should not be used. Instead use the default 'BUILD_SHARED_LIBS' CMake switch.")
+endif()
+
+if(KOMPUTE_OPT_BUILD_SINGLE_HEADER)
+    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_SINGLE_HEADER' is deprecated and should not be used. The single header will now always be build and can be included via '#include<kompute/kompute.h>'.")
+endif()
+
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    message(FATAL_ERROR "'KOMPUTE_OPT_ENABLE_SPDLOG' is deprecated and should not be used. It got replaced by 'KOMPUTE_OPT_LOG_LEVEL'. This option can be set to a variety of log levels (e.g. 'Off', 'Trace', 'Debug', 'Default', ...).")
+endif()
\ No newline at end of file
diff --git a/kompute/cmake/komputeConfig.cmake.in b/kompute/cmake/komputeConfig.cmake.in
new file mode 100644
index 0000000000000..87e8a99e23e99
--- /dev/null
+++ b/kompute/cmake/komputeConfig.cmake.in
@@ -0,0 +1,8 @@
+include(CMakeFindDependencyMacro)
+@PACKAGE_INIT@
+
+find_dependency(VULKAN REQUIRED)
+
+include(${CMAKE_CURRENT_LIST_DIR}/komputeTargets.cmake)
+
+check_required_components(kompute)
\ No newline at end of file
diff --git a/kompute/cmake/vulkan_shader_compiler.cmake b/kompute/cmake/vulkan_shader_compiler.cmake
new file mode 100644
index 0000000000000..acc27b57c2acc
--- /dev/null
+++ b/kompute/cmake/vulkan_shader_compiler.cmake
@@ -0,0 +1,43 @@
+function(vulkan_compile_shader)
+     find_program(GLS_LANG_VALIDATOR_PATH NAMES glslangValidator)
+     if(GLS_LANG_VALIDATOR_PATH STREQUAL "GLS_LANG_VALIDATOR_PATH-NOTFOUND")
+          message(FATAL_ERROR "glslangValidator not found.")
+          return()
+     endif()
+
+     cmake_parse_arguments(SHADER_COMPILE "" "INFILE;OUTFILE;NAMESPACE;RELATIVE_PATH" "" ${ARGN})
+     set(SHADER_COMPILE_INFILE_FULL "${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_COMPILE_INFILE}")
+     set(SHADER_COMPILE_SPV_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_INFILE}.spv")
+     set(SHADER_COMPILE_HEADER_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_OUTFILE}")
+
+     if(NOT SHADER_COMPILE_RELATIVE_PATH)
+          set(SHADER_COMPILE_RELATIVE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+     endif()
+    
+     # .comp -> .spv
+     add_custom_command(OUTPUT "${SHADER_COMPILE_SPV_FILE_FULL}"
+                        COMMAND "${GLS_LANG_VALIDATOR_PATH}"
+                        ARGS "-V"
+                             "${SHADER_COMPILE_INFILE_FULL}"
+                             "-o"
+                             "${SHADER_COMPILE_SPV_FILE_FULL}"
+                        COMMENT "Compile vulkan compute shader from file '${SHADER_COMPILE_INFILE_FULL}' to '${SHADER_COMPILE_SPV_FILE_FULL}'."
+                        MAIN_DEPENDENCY "${SHADER_COMPILE_INFILE_FULL}")
+
+     # Check if big or little endian
+     include (TestBigEndian)
+     TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
+
+     # .spv -> .hpp
+     add_custom_command(OUTPUT "${SHADER_COMPILE_HEADER_FILE_FULL}"
+                        COMMAND ${CMAKE_COMMAND}
+                        ARGS "-DINPUT_SHADER_FILE=${SHADER_COMPILE_SPV_FILE_FULL}"
+                             "-DOUTPUT_HEADER_FILE=${SHADER_COMPILE_HEADER_FILE_FULL}"
+                             "-DHEADER_NAMESPACE=${SHADER_COMPILE_NAMESPACE}"
+                             "-DIS_BIG_ENDIAN=${IS_BIG_ENDIAN}"
+                             "-P"
+                             "${SHADER_COMPILE_RELATIVE_PATH}/bin_file_to_header.cmake"
+                        WORKING_DIRECTORY "${SHADER_COMPILE_RELATIVE_PATH}"
+                        COMMENT "Converting compiled shader '${SHADER_COMPILE_SPV_FILE_FULL}' to header file '${SHADER_COMPILE_HEADER_FILE_FULL}'."
+                        MAIN_DEPENDENCY "${SHADER_COMPILE_SPV_FILE_FULL}")
+endfunction()
diff --git a/kompute/config/FindSphinx.cmake b/kompute/config/FindSphinx.cmake
new file mode 100644
index 0000000000000..c645ccc9ff366
--- /dev/null
+++ b/kompute/config/FindSphinx.cmake
@@ -0,0 +1,16 @@
+# Look for an executable called sphinx-build
+find_program(SPHINX_EXECUTABLE
+    NAMES sphinx-build
+    DOC "Path to sphinx-build executable")
+
+if(SPHINX_EXECUTABLE STREQUAL "SPHINX_EXECUTABLE-NOTFOUND")
+    message(FATAL_ERROR "sphinx-build not found.")
+endif()
+
+include(FindPackageHandleStandardArgs)
+
+# Handle standard arguments to find_package like REQUIRED and QUIET
+find_package_handle_standard_args(
+    Sphinx
+    "Failed to find sphinx-build executable"
+    SPHINX_EXECUTABLE)
diff --git a/kompute/external/bin/xxd.c b/kompute/external/bin/xxd.c
new file mode 100644
index 0000000000000..60ed3f712a766
--- /dev/null
+++ b/kompute/external/bin/xxd.c
@@ -0,0 +1,819 @@
+/*
+As indicated at https://lists.debian.org/debian-legal/2015/01/msg00037.html,
+the author has permitted redistribution of xxd under the MIT license, as follows:
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * xxd: my hexdump facility. jw
+ *
+ *  2.10.90 changed to word output
+ *  3.03.93 new indent style, dumb bug inserted and fixed.
+ *	    -c option, mls
+ * 26.04.94 better option parser, -ps, -l, -s added.
+ *  1.07.94 -r badly needs - as input file.  Per default autoskip over
+ *	       consecutive lines of zeroes, as unix od does.
+ *	    -a shows them too.
+ *	    -i dump as c-style #include "file.h"
+ *  1.11.95 if "xxd -i" knows the filename, an 'unsigned char filename_bits[]'
+ *	    array is written in correct c-syntax.
+ *	    -s improved, now defaults to absolute seek, relative requires a '+'.
+ *	    -r improved, now -r -s -0x... is supported.
+ *	       change/suppress leading '\0' bytes.
+ *	    -l n improved: stops exactly after n bytes.
+ *	    -r improved, better handling of partial lines with trailing garbage.
+ *	    -r improved, now -r -p works again!
+ *	    -r improved, less flushing, much faster now! (that was silly)
+ *  3.04.96 Per repeated request of a single person: autoskip defaults to off.
+ * 15.05.96 -v added. They want to know the version.
+ *	    -a fixed, to show last line inf file ends in all zeros.
+ *	    -u added: Print upper case hex-letters, as preferred by unix bc.
+ *	    -h added to usage message. Usage message extended.
+ *	    Now using outfile if specified even in normal mode, aehem.
+ *	    No longer mixing of ints and longs. May help doze people.
+ *	    Added binify ioctl for same reason. (Enough Doze stress for 1996!)
+ * 16.05.96 -p improved, removed occasional superfluous linefeed.
+ * 20.05.96 -l 0 fixed. tried to read anyway.
+ * 21.05.96 -i fixed. now honours -u, and prepends __ to numeric filenames.
+ *	    compile -DWIN32 for NT or W95. George V. Reilly, * -v improved :-)
+ *	    support --gnuish-longhorn-options
+ * 25.05.96 MAC support added: CodeWarrior already uses ``outline'' in Types.h
+ *	    which is included by MacHeaders (Axel Kielhorn). Renamed to
+ *	    xxdline().
+ *  7.06.96 -i printed 'int' instead of 'char'. *blush*
+ *	    added Bram's OS2 ifdefs...
+ * 18.07.96 gcc -Wall @ SunOS4 is now slient.
+ *	    Added osver for MSDOS/DJGPP/WIN32.
+ * 29.08.96 Added size_t to strncmp() for Amiga.
+ * 24.03.97 Windows NT support (Phil Hanna). Clean exit for Amiga WB (Bram)
+ * 02.04.97 Added -E option, to have EBCDIC translation instead of ASCII
+ *	    (azc10@yahoo.com)
+ * 22.05.97 added -g (group octets) option (jcook@namerica.kla.com).
+ * 23.09.98 nasty -p -r misfeature fixed: slightly wrong output, when -c was
+ *	    missing or wrong.
+ * 26.09.98 Fixed: 'xxd -i infile outfile' did not truncate outfile.
+ * 27.10.98 Fixed: -g option parser required blank.
+ *	    option -b added: 01000101 binary output in normal format.
+ * 16.05.00 Added VAXC changes by Stephen P. Wall
+ * 16.05.00 Improved MMS file and merge for VMS by Zoltan Arpadffy
+ *
+ * (c) 1990-1998 by Juergen Weigert (jnweiger@informatik.uni-erlangen.de)
+ *
+ * Small changes made afterwards by Bram Moolenaar et al.
+ *
+ * Distribute freely and credit me,
+ * make money and share with me,
+ * lose money and don't ask me.
+ *
+ *
+ */
+
+/* Visual Studio 2005 has 'deprecated' many of the standard CRT functions */
+#if _MSC_VER >= 1400
+# define _CRT_SECURE_NO_DEPRECATE
+# define _CRT_NONSTDC_NO_DEPRECATE
+#endif
+
+#include <stdio.h>
+#ifdef VAXC
+# include <file.h>
+#else
+# include <fcntl.h>
+#endif
+#ifdef __TSC__
+# define MSDOS
+#endif
+#if !defined(OS2) && defined(__EMX__)
+# define OS2
+#endif
+#if defined(MSDOS) || defined(WIN32) || defined(OS2) || defined(__BORLANDC__) || defined(CYGWIN)
+# include <io.h>	/* for setmode() */
+#else
+# ifdef UNIX
+#  include <unistd.h>
+# endif
+#endif
+#include <stdlib.h>
+#include <string.h>	/* for strncmp() */
+#include <ctype.h>	/* for isalnum() */
+#if __MWERKS__ && !defined(BEBOX)
+# include <unix.h>	/* for fdopen() on MAC */
+#endif
+
+#if defined(__BORLANDC__) && __BORLANDC__ <= 0x0410 && !defined(fileno)
+/* Missing define and prototype grabbed from the BC 4.0 <stdio.h> */
+# define fileno(f)       ((f)->fd)
+FILE   _FAR *_Cdecl _FARFUNC fdopen(int __handle, char _FAR *__type);
+#endif
+
+
+/*  This corrects the problem of missing prototypes for certain functions
+ *  in some GNU installations (e.g. SunOS 4.1.x).
+ *  Darren Hiebert <darren@hmi.com> (sparc-sun-sunos4.1.3_U1/2.7.2.2)
+ */
+#if defined(__GNUC__) && defined(__STDC__)
+# ifndef __USE_FIXED_PROTOTYPES__
+#  define __USE_FIXED_PROTOTYPES__
+# endif
+#endif
+
+#ifndef __USE_FIXED_PROTOTYPES__
+/*
+ * This is historic and works only if the compiler really has no prototypes:
+ *
+ * Include prototypes for Sun OS 4.x, when using an ANSI compiler.
+ * FILE is defined on OS 4.x, not on 5.x (Solaris).
+ * if __SVR4 is defined (some Solaris versions), don't include this.
+ */
+#if defined(sun) && defined(FILE) && !defined(__SVR4) && defined(__STDC__)
+#  define __P(a) a
+/* excerpt from my sun_stdlib.h */
+extern int fprintf __P((FILE *, char *, ...));
+extern int fputs   __P((char *, FILE *));
+extern int _flsbuf __P((unsigned char, FILE *));
+extern int _filbuf __P((FILE *));
+extern int fflush  __P((FILE *));
+extern int fclose  __P((FILE *));
+extern int fseek   __P((FILE *, long, int));
+extern int rewind  __P((FILE *));
+
+extern void perror __P((char *));
+# endif
+#endif
+
+extern long int strtol();
+extern long int ftell();
+
+char version[] = "xxd V1.10 27oct98 by Juergen Weigert";
+#ifdef WIN32
+char osver[] = " (Win32)";
+#else
+# ifdef DJGPP
+char osver[] = " (dos 32 bit)";
+# else
+#  ifdef MSDOS
+char osver[] = " (dos 16 bit)";
+#  else
+char osver[] = "";
+#  endif
+# endif
+#endif
+
+#if !defined(CYGWIN) && (defined(CYGWIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__))
+# define CYGWIN
+#endif
+#if defined(MSDOS) || defined(WIN32) || defined(OS2)
+# define BIN_READ(yes)  ((yes) ? "rb" : "rt")
+# define BIN_WRITE(yes) ((yes) ? "wb" : "wt")
+# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
+# define BIN_ASSIGN(fp, yes) setmode(fileno(fp), (yes) ? O_BINARY : O_TEXT)
+# define PATH_SEP '\\'
+#elif defined(CYGWIN)
+# define BIN_READ(yes)  ((yes) ? "rb" : "rt")
+# define BIN_WRITE(yes) ((yes) ? "wb" : "w")
+# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
+# define BIN_ASSIGN(fp, yes) ((yes) ? (void) setmode(fileno(fp), O_BINARY) : (void) (fp))
+# define PATH_SEP '/'
+#else
+# ifdef VMS
+#  define BIN_READ(dummy)  "r"
+#  define BIN_WRITE(dummy) "w"
+#  define BIN_CREAT(dummy) O_CREAT
+#  define BIN_ASSIGN(fp, dummy) fp
+#  define PATH_SEP ']'
+#  define FILE_SEP '.'
+# else
+#  define BIN_READ(dummy)  "r"
+#  define BIN_WRITE(dummy) "w"
+#  define BIN_CREAT(dummy) O_CREAT
+#  define BIN_ASSIGN(fp, dummy) fp
+#  define PATH_SEP '/'
+# endif
+#endif
+
+/* open has only to arguments on the Mac */
+#if __MWERKS__
+# define OPEN(name, mode, umask) open(name, mode)
+#else
+# define OPEN(name, mode, umask) open(name, mode, umask)
+#endif
+
+#ifdef AMIGA
+# define STRNCMP(s1, s2, l) strncmp(s1, s2, (size_t)l)
+#else
+# define STRNCMP(s1, s2, l) strncmp(s1, s2, l)
+#endif
+
+#ifndef __P
+# if defined(__STDC__) || defined(MSDOS) || defined(WIN32) || defined(OS2) \
+        || defined(__BORLANDC__)
+#  define __P(a) a
+# else
+#  define __P(a) ()
+# endif
+#endif
+
+/* Let's collect some prototypes */
+/* CodeWarrior is really picky about missing prototypes */
+static void exit_with_usage __P((char *));
+static int huntype __P((FILE *, FILE *, FILE *, char *, int, int, long));
+static void xxdline __P((FILE *, char *, int));
+
+#define TRY_SEEK	/* attempt to use lseek, or skip forward by reading */
+#define COLS 256	/* change here, if you ever need more columns */
+#define LLEN (11 + (9*COLS-1)/1 + COLS + 2)
+
+char hexxa[] = "0123456789abcdef0123456789ABCDEF", *hexx = hexxa;
+
+/* the different hextypes known by this program: */
+#define HEX_NORMAL 0
+#define HEX_POSTSCRIPT 1
+#define HEX_CINCLUDE 2
+#define HEX_BITS 3		/* not hex a dump, but bits: 01111001 */
+
+static void
+exit_with_usage(pname)
+char *pname;
+{
+  fprintf(stderr, "Usage:\n       %s [options] [infile [outfile]]\n", pname);
+  fprintf(stderr, "    or\n       %s -r [-s [-]offset] [-c cols] [-ps] [infile [outfile]]\n", pname);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "    -a          toggle autoskip: A single '*' replaces nul-lines. Default off.\n");
+  fprintf(stderr, "    -b          binary digit dump (incompatible with -p,-i,-r). Default hex.\n");
+  fprintf(stderr, "    -c cols     format <cols> octets per line. Default 16 (-i: 12, -ps: 30).\n");
+  fprintf(stderr, "    -E          show characters in EBCDIC. Default ASCII.\n");
+  fprintf(stderr, "    -g          number of octets per group in normal output. Default 2.\n");
+  fprintf(stderr, "    -h          print this summary.\n");
+  fprintf(stderr, "    -i          output in C include file style.\n");
+  fprintf(stderr, "    -l len      stop after <len> octets.\n");
+  fprintf(stderr, "    -ps         output in postscript plain hexdump style.\n");
+  fprintf(stderr, "    -r          reverse operation: convert (or patch) hexdump into binary.\n");
+  fprintf(stderr, "    -r -s off   revert with <off> added to file positions found in hexdump.\n");
+  fprintf(stderr, "    -s %sseek  start at <seek> bytes abs. %sinfile offset.\n",
+#ifdef TRY_SEEK
+      "[+][-]", "(or +: rel.) ");
+#else
+      "", "");
+#endif
+  fprintf(stderr, "    -u          use upper case hex letters.\n");
+  fprintf(stderr, "    -v          show version: \"%s%s\".\n", version, osver);
+  exit(1);
+}
+
+/*
+ * Max. cols binary characters are decoded from the input stream per line.
+ * Two adjacent garbage characters after evaluated data delimit valid data.
+ * Everything up to the next newline is discarded.
+ *
+ * The name is historic and came from 'undo type opt h'.
+ */
+static int
+huntype(fpi, fpo, fperr, pname, cols, hextype, base_off)
+FILE *fpi, *fpo, *fperr;
+char *pname;
+int cols, hextype;
+long base_off;
+{
+  int c, ign_garb = 1, n1 = -1, n2 = 0, n3, p = cols;
+  long have_off = 0, want_off = 0;
+
+  rewind(fpi);
+
+  while ((c = getc(fpi)) != EOF)
+    {
+      if (c == '\r')	/* Doze style input file? */
+    continue;
+
+#if 0	/* this doesn't work when there is normal text after the hex codes in
+       the last line that looks like hex */
+      if (c == ' ' || c == '\n' || c == '\t')  /* allow multiple spaces */
+    continue;
+#endif
+
+      n3 = n2;
+      n2 = n1;
+
+      if (c >= '0' && c <= '9')
+    n1 = c - '0';
+      else if (c >= 'a' && c <= 'f')
+    n1 = c - 'a' + 10;
+      else if (c >= 'A' && c <= 'F')
+    n1 = c - 'A' + 10;
+      else
+    {
+      n1 = -1;
+      if (ign_garb)
+        continue;
+    }
+
+      ign_garb = 0;
+
+      if (p >= cols)
+    {
+      if (!hextype)
+        {
+          if (n1 < 0)
+        {
+          p = 0;
+          continue;
+        }
+          want_off = (want_off << 4) | n1;
+          continue;
+        }
+      else
+        p = 0;
+    }
+
+      if (base_off + want_off != have_off)
+    {
+      fflush(fpo);
+#ifdef TRY_SEEK
+      c = fseek(fpo, base_off + want_off - have_off, 1);
+      if (c >= 0)
+        have_off = base_off + want_off;
+#endif
+      if (base_off + want_off < have_off)
+        {
+          fprintf(fperr, "%s: sorry, cannot seek backwards.\n", pname);
+          return 5;
+        }
+      for (; have_off < base_off + want_off; have_off++)
+        putc(0, fpo);
+    }
+
+      if (n2 >= 0 && n1 >= 0)
+    {
+      putc((n2 << 4) | n1, fpo);
+      have_off++;
+      want_off++;
+      n1 = -1;
+      if ((++p >= cols) && !hextype)
+        {
+          /* skip rest of line as garbage */
+          want_off = 0;
+          while ((c = getc(fpi)) != '\n' && c != EOF)
+        ;
+          ign_garb = 1;
+        }
+    }
+      else if (n1 < 0 && n2 < 0 && n3 < 0)
+    {
+      /* already stumbled into garbage, skip line, wait and see */
+      if (!hextype)
+        want_off = 0;
+      while ((c = getc(fpi)) != '\n' && c != EOF)
+        ;
+      ign_garb = 1;
+    }
+    }
+  fflush(fpo);
+#ifdef TRY_SEEK
+  fseek(fpo, 0L, 2);
+#endif
+  fclose(fpo);
+  fclose(fpi);
+  return 0;
+}
+
+/*
+ * Print line l. If nz is false, xxdline regards the line a line of
+ * zeroes. If there are three or more consecutive lines of zeroes,
+ * they are replaced by a single '*' character.
+ *
+ * If the output ends with more than two lines of zeroes, you
+ * should call xxdline again with l being the last line and nz
+ * negative. This ensures that the last line is shown even when
+ * it is all zeroes.
+ *
+ * If nz is always positive, lines are never suppressed.
+ */
+static void
+xxdline(fp, l, nz)
+FILE *fp;
+char *l;
+int nz;
+{
+  static char z[LLEN+1];
+  static int zero_seen = 0;
+
+  if (!nz && zero_seen == 1)
+    strcpy(z, l);
+
+  if (nz || !zero_seen++)
+    {
+      if (nz)
+    {
+      if (nz < 0)
+        zero_seen--;
+      if (zero_seen == 2)
+        fputs(z, fp);
+      if (zero_seen > 2)
+        fputs("*\n", fp);
+    }
+      if (nz >= 0 || zero_seen > 0)
+    fputs(l, fp);
+      if (nz)
+    zero_seen = 0;
+    }
+}
+
+/* This is an EBCDIC to ASCII conversion table */
+/* from a proposed BTL standard April 16, 1979 */
+static unsigned char etoa64[] =
+{
+    0040,0240,0241,0242,0243,0244,0245,0246,
+    0247,0250,0325,0056,0074,0050,0053,0174,
+    0046,0251,0252,0253,0254,0255,0256,0257,
+    0260,0261,0041,0044,0052,0051,0073,0176,
+    0055,0057,0262,0263,0264,0265,0266,0267,
+    0270,0271,0313,0054,0045,0137,0076,0077,
+    0272,0273,0274,0275,0276,0277,0300,0301,
+    0302,0140,0072,0043,0100,0047,0075,0042,
+    0303,0141,0142,0143,0144,0145,0146,0147,
+    0150,0151,0304,0305,0306,0307,0310,0311,
+    0312,0152,0153,0154,0155,0156,0157,0160,
+    0161,0162,0136,0314,0315,0316,0317,0320,
+    0321,0345,0163,0164,0165,0166,0167,0170,
+    0171,0172,0322,0323,0324,0133,0326,0327,
+    0330,0331,0332,0333,0334,0335,0336,0337,
+    0340,0341,0342,0343,0344,0135,0346,0347,
+    0173,0101,0102,0103,0104,0105,0106,0107,
+    0110,0111,0350,0351,0352,0353,0354,0355,
+    0175,0112,0113,0114,0115,0116,0117,0120,
+    0121,0122,0356,0357,0360,0361,0362,0363,
+    0134,0237,0123,0124,0125,0126,0127,0130,
+    0131,0132,0364,0365,0366,0367,0370,0371,
+    0060,0061,0062,0063,0064,0065,0066,0067,
+    0070,0071,0372,0373,0374,0375,0376,0377
+};
+
+const char* extract_filename(const char* path) {
+    const char* filename = strrchr(path, '/');
+    if (filename) {
+        return filename + 1;
+    }
+    return path;
+}
+
+int
+main(argc, argv)
+int argc;
+char *argv[];
+{
+  FILE *fp, *fpo;
+  int c, e, p = 0, relseek = 1, negseek = 0, revert = 0;
+  int cols = 0, nonzero = 0, autoskip = 0, hextype = HEX_NORMAL;
+  int ebcdic = 0;
+  int octspergrp = -1;	/* number of octets grouped in output */
+  int grplen;		/* total chars per octet group */
+  long length = -1, n = 0, seekoff = 0;
+  char l[LLEN+1];
+  char *pname, *pp;
+
+#ifdef AMIGA
+  /* This program doesn't work when started from the Workbench */
+  if (argc == 0)
+    exit(1);
+#endif
+
+  pname = argv[0];
+  for (pp = pname; *pp; )
+    if (*pp++ == PATH_SEP)
+      pname = pp;
+#ifdef FILE_SEP
+  for (pp = pname; *pp; pp++)
+    if (*pp == FILE_SEP)
+      {
+    *pp = '\0';
+    break;
+      }
+#endif
+
+  while (argc >= 2)
+    {
+      pp = argv[1] + (!STRNCMP(argv[1], "--", 2) && argv[1][2]);
+       if (!STRNCMP(pp, "-a", 2)) autoskip = 1 - autoskip;
+      else if (!STRNCMP(pp, "-b", 2)) hextype = HEX_BITS;
+      else if (!STRNCMP(pp, "-u", 2)) hexx = hexxa + 16;
+      else if (!STRNCMP(pp, "-p", 2)) hextype = HEX_POSTSCRIPT;
+      else if (!STRNCMP(pp, "-i", 2)) hextype = HEX_CINCLUDE;
+      else if (!STRNCMP(pp, "-r", 2)) revert++;
+      else if (!STRNCMP(pp, "-E", 2)) ebcdic++;
+      else if (!STRNCMP(pp, "-v", 2))
+    {
+      fprintf(stderr, "%s%s\n", version, osver);
+      exit(0);
+    }
+      else if (!STRNCMP(pp, "-c", 2))
+    {
+      if (pp[2] && STRNCMP("ols", pp + 2, 3))
+        cols = (int)strtol(pp + 2, NULL, 0);
+      else
+        {
+          if (!argv[2])
+        exit_with_usage(pname);
+          cols = (int)strtol(argv[2], NULL, 0);
+          argv++;
+          argc--;
+        }
+    }
+      else if (!STRNCMP(pp, "-g", 2))
+    {
+      if (pp[2] && STRNCMP("roupsize", pp + 2, 8))
+        octspergrp = (int)strtol(pp + 2, NULL, 0);
+      else
+        {
+          if (!argv[2])
+        exit_with_usage(pname);
+          octspergrp = (int)strtol(argv[2], NULL, 0);
+          argv++;
+          argc--;
+        }
+    }
+      else if (!STRNCMP(pp, "-s", 2))
+    {
+      relseek = 0;
+      negseek = 0;
+      if (pp[2] && STRNCMP("kip", pp+2, 3) && STRNCMP("eek", pp+2, 3))
+        {
+#ifdef TRY_SEEK
+          if (pp[2] == '+')
+        relseek++;
+          if (pp[2+relseek] == '-')
+        negseek++;
+#endif
+          seekoff = strtol(pp + 2+relseek+negseek, (char **)NULL, 0);
+        }
+      else
+        {
+          if (!argv[2])
+        exit_with_usage(pname);
+#ifdef TRY_SEEK
+          if (argv[2][0] == '+')
+        relseek++;
+          if (argv[2][relseek] == '-')
+        negseek++;
+#endif
+          seekoff = strtol(argv[2] + relseek+negseek, (char **)NULL, 0);
+          argv++;
+          argc--;
+        }
+    }
+      else if (!STRNCMP(pp, "-l", 2))
+    {
+      if (pp[2] && STRNCMP("en", pp + 2, 2))
+        length = strtol(pp + 2, (char **)NULL, 0);
+      else
+        {
+          if (!argv[2])
+        exit_with_usage(pname);
+          length = strtol(argv[2], (char **)NULL, 0);
+          argv++;
+          argc--;
+        }
+    }
+      else if (!strcmp(pp, "--"))	/* end of options */
+    {
+      argv++;
+      argc--;
+      break;
+    }
+      else if (pp[0] == '-' && pp[1])	/* unknown option */
+    exit_with_usage(pname);
+      else
+    break;				/* not an option */
+
+      argv++;				/* advance to next argument */
+      argc--;
+    }
+
+  if (!cols)
+    switch (hextype)
+      {
+      case HEX_POSTSCRIPT:	cols = 30; break;
+      case HEX_CINCLUDE:	cols = 12; break;
+      case HEX_BITS:		cols = 6; break;
+      case HEX_NORMAL:
+      default:			cols = 16; break;
+      }
+
+  if (octspergrp < 0)
+    switch (hextype)
+      {
+      case HEX_BITS:		octspergrp = 1; break;
+      case HEX_NORMAL:		octspergrp = 2; break;
+      case HEX_POSTSCRIPT:
+      case HEX_CINCLUDE:
+      default:			octspergrp = 0; break;
+      }
+
+  if (cols < 1 || ((hextype == HEX_NORMAL || hextype == HEX_BITS)
+                                && (cols > COLS)))
+    {
+      fprintf(stderr, "%s: invalid number of columns (max. %d).\n", pname, COLS);
+      exit(1);
+    }
+
+  if (octspergrp < 1)
+    octspergrp = cols;
+
+  if (argc > 3)
+    exit_with_usage(pname);
+
+  if (argc == 1 || (argv[1][0] == '-' && !argv[1][1]))
+    BIN_ASSIGN(fp = stdin, !revert);
+  else
+    {
+      if ((fp = fopen(argv[1], BIN_READ(!revert))) == NULL)
+    {
+      fprintf(stderr,"%s: ", pname);
+      perror(argv[1]);
+      return 2;
+    }
+    }
+
+  if (argc < 3 || (argv[2][0] == '-' && !argv[2][1]))
+    BIN_ASSIGN(fpo = stdout, revert);
+  else
+    {
+      int fd;
+      int mode = revert ? O_WRONLY : (O_TRUNC|O_WRONLY);
+
+      if (((fd = OPEN(argv[2], mode | BIN_CREAT(revert), 0666)) < 0) ||
+      (fpo = fdopen(fd, BIN_WRITE(revert))) == NULL)
+    {
+      fprintf(stderr, "%s: ", pname);
+      perror(argv[2]);
+      return 3;
+    }
+      rewind(fpo);
+    }
+
+  if (revert)
+    {
+      if (hextype && (hextype != HEX_POSTSCRIPT))
+    {
+      fprintf(stderr, "%s: sorry, cannot revert this type of hexdump\n", pname);
+      return -1;
+    }
+      return huntype(fp, fpo, stderr, pname, cols, hextype,
+        negseek ? -seekoff : seekoff);
+    }
+
+  if (seekoff || negseek || !relseek)
+    {
+#ifdef TRY_SEEK
+      if (relseek)
+    e = fseek(fp, negseek ? -seekoff : seekoff, 1);
+      else
+    e = fseek(fp, negseek ? -seekoff : seekoff, negseek ? 2 : 0);
+      if (e < 0 && negseek)
+    {
+      fprintf(stderr, "%s: sorry cannot seek.\n", pname);
+      return 4;
+    }
+      if (e >= 0)
+    seekoff = ftell(fp);
+      else
+#endif
+    {
+      long s = seekoff;
+
+      while (s--)
+        (void)getc(fp);
+    }
+    }
+
+  if (hextype == HEX_CINCLUDE)
+    {
+      const char* filename = extract_filename(argv[1]);
+
+      if (fp != stdin)
+    {
+      fprintf(fpo, "unsigned char %s", isdigit((int)filename[0]) ? "__" : "");
+      for (e = 0; (c = filename[e]) != 0; e++)
+        putc(isalnum(c) ? c : '_', fpo);
+      fputs("[] = {\n", fpo);
+    }
+
+      p = 0;
+      while ((length < 0 || p < length) && (c = getc(fp)) != EOF)
+    {
+      fprintf(fpo, (hexx == hexxa) ? "%s0x%02x" : "%s0X%02X",
+        (p % cols) ? ", " : ",\n  "+2*!p,  c);
+      p++;
+    }
+
+      if (p)
+    fputs("\n};\n"+3*(fp == stdin), fpo);
+
+      if (fp != stdin)
+    {
+      fprintf(fpo, "unsigned int %s", isdigit((int)filename[0]) ? "__" : "");
+      for (e = 0; (c = filename[e]) != 0; e++)
+        putc(isalnum(c) ? c : '_', fpo);
+      fprintf(fpo, "_len = %d;\n", p);
+    }
+
+      fclose(fp);
+      fclose(fpo);
+      return 0;
+    }
+
+  if (hextype == HEX_POSTSCRIPT)
+    {
+      p = cols;
+      while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
+    {
+      putchar(hexx[(e >> 4) & 0xf]);
+      putchar(hexx[(e     ) & 0xf]);
+      n++;
+      if (!--p)
+        {
+          putchar('\n');
+          p = cols;
+        }
+    }
+      if (p < cols)
+    putchar('\n');
+      fclose(fp);
+      fclose(fpo);
+      return 0;
+    }
+
+  /* hextype: HEX_NORMAL or HEX_BITS */
+
+  if (hextype == HEX_NORMAL)
+    grplen = octspergrp + octspergrp + 1;	/* chars per octet group */
+  else	/* hextype == HEX_BITS */
+    grplen = 8 * octspergrp + 1;
+
+  while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
+    {
+      if (p == 0)
+    {
+      sprintf(l, "%07lx: ", n + seekoff);
+      for (c = 9; c < LLEN; l[c++] = ' ');
+    }
+      if (hextype == HEX_NORMAL)
+    {
+      l[c = (9 + (grplen * p) / octspergrp)] = hexx[(e >> 4) & 0xf];
+      l[++c]			       = hexx[ e       & 0xf];
+    }
+      else /* hextype == HEX_BITS */
+    {
+      int i;
+
+      c = (9 + (grplen * p) / octspergrp) - 1;
+      for (i = 7; i >= 0; i--)
+        l[++c] = (e & (1 << i)) ? '1' : '0';
+    }
+      if (ebcdic)
+    e = (e < 64) ? '.' : etoa64[e-64];
+      /* When changing this update definition of LLEN above. */
+      l[11 + (grplen * cols - 1)/octspergrp + p] =
+#ifdef __MVS__
+      (e >= 64)
+#else
+      (e > 31 && e < 127)
+#endif
+      ? e : '.';
+      if (e)
+    nonzero++;
+      n++;
+      if (++p == cols)
+    {
+      l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
+      xxdline(fpo, l, autoskip ? nonzero : 1);
+      nonzero = 0;
+      p = 0;
+    }
+    }
+  if (p)
+    {
+      l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
+      xxdline(fpo, l, 1);
+    }
+  else if (autoskip)
+    xxdline(fpo, l, -1);	/* last chance to flush out suppressed lines */
+
+  fclose(fp);
+  fclose(fpo);
+  return 0;
+}
diff --git a/kompute/kompute-config.cmake b/kompute/kompute-config.cmake
new file mode 100644
index 0000000000000..10425252ce476
--- /dev/null
+++ b/kompute/kompute-config.cmake
@@ -0,0 +1,28 @@
+# General purpose GPU compute framework built on Vulkan to
+# support 1000s of cross vendor graphics cards
+# (AMD, Qualcomm, NVIDIA & friends). Blazing fast, mobile-enabled,
+# asynchronous and optimized for advanced GPU data processing use cases.
+# Backed by the Linux Foundation. 
+#
+# Finding this module will define the following variables:
+#  KOMPUTE_FOUND - True if the core library has been found
+#  KOMPUTE_LIBRARIES - Path to the core library archive
+#  KOMPUTE_INCLUDE_DIRS - Path to the include directories. Gives access
+#                     to kompute.h, as a single include which must be included in every
+#                     file that uses this interface. Else it also points to the
+#                     directory for individual includes.
+
+find_path(KOMPUTE_INCLUDE_DIR
+          NAMES kompute.h)
+
+find_library(KOMPUTE_LIBRARY
+             NAMES kompute
+             HINTS ${KOMPUTE_LIBRARY_ROOT})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(KOMPUTE REQUIRED_VARS KOMPUTE_LIBRARY KOMPUTE_INCLUDE_DIR)
+
+if(KOMPUTE_FOUND)
+    set(KOMPUTE_LIBRARIES ${KOMPUTE_LIBRARY})
+    set(KOMPUTE_INCLUDE_DIRS ${KOMPUTE_INCLUDE_DIR})
+endif()
diff --git a/kompute/op_add.comp b/kompute/op_add.comp
new file mode 100644
index 0000000000000..7e4e43d7547a1
--- /dev/null
+++ b/kompute/op_add.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+
+    out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i) + pcs.inBOff];
+}
\ No newline at end of file
diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp
new file mode 100644
index 0000000000000..492f672e5612a
--- /dev/null
+++ b/kompute/op_addrow.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+
+    out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
+}
\ No newline at end of file
diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp
new file mode 100644
index 0000000000000..40d756ae57ded
--- /dev/null
+++ b/kompute/op_cpy_f16_f16.comp
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+#define IN_TYPE float16_t
+#define IN_TYPE_SIZE 2
+#define OUT_TYPE float16_t
+#define OUT_TYPE_SIZE 2
+
+layout(local_size_x = nth) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+
+    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
+    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
+    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
+    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
+
+    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
+        out_[dst_data+i00] = OUT_TYPE(in_[src]);
+    }
+}
diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp
new file mode 100644
index 0000000000000..309c48aed2a8f
--- /dev/null
+++ b/kompute/op_cpy_f16_f32.comp
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+#define IN_TYPE float16_t
+#define IN_TYPE_SIZE 2
+#define OUT_TYPE float
+#define OUT_TYPE_SIZE 4
+
+layout(local_size_x = nth) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+
+    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
+    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
+    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
+    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
+
+    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
+        out_[dst_data+i00] = OUT_TYPE(in_[src]);
+    }
+}
diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp
new file mode 100644
index 0000000000000..fb0e00d677940
--- /dev/null
+++ b/kompute/op_cpy_f32_f16.comp
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+#define IN_TYPE float
+#define IN_TYPE_SIZE 4
+#define OUT_TYPE float16_t
+#define OUT_TYPE_SIZE 2
+
+layout(local_size_x = nth) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+
+    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
+    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
+    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
+    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
+
+    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
+        out_[dst_data+i00] = OUT_TYPE(in_[src]);
+    }
+}
diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp
new file mode 100644
index 0000000000000..f43480b8d5254
--- /dev/null
+++ b/kompute/op_cpy_f32_f32.comp
@@ -0,0 +1,168 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+#define IN_TYPE float
+#define IN_TYPE_SIZE 4
+#define OUT_TYPE float
+#define OUT_TYPE_SIZE 4
+
+layout(local_size_x = nth) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+
+    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
+    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
+    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
+    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
+
+    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
+        out_[dst_data+i00] = OUT_TYPE(in_[src]);
+    }
+}
diff --git a/kompute/op_diagmask.comp b/kompute/op_diagmask.comp
new file mode 100644
index 0000000000000..18b0192d720ac
--- /dev/null
+++ b/kompute/op_diagmask.comp
@@ -0,0 +1,153 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    uint n_past;
+    int ne00;
+    int ne01;
+} pcs;
+
+void main() {
+    const uint i02 = gl_WorkGroupID.z;
+    const uint i01 = gl_WorkGroupID.y;
+    const uint i00 = gl_WorkGroupID.x;
+
+    const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
+
+    if (i00 > pcs.n_past + i01) {
+        out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
+    } else {
+        out_[index + pcs.outOff] = in_[index + pcs.inOff];
+    }
+}
diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp
new file mode 100644
index 0000000000000..8079b8ef28766
--- /dev/null
+++ b/kompute/op_gelu.comp
@@ -0,0 +1,142 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+} pcs;
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const float x = in_[i + pcs.inOff];
+
+    out_[i + pcs.outOff] = 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
+}
diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp
new file mode 100644
index 0000000000000..e0f5bb16ec70e
--- /dev/null
+++ b/kompute/op_getrows_f16.comp
@@ -0,0 +1,150 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff];
+
+    for (int j = 0; j < pcs.ne00; j++) {
+        out_[i*pcs.nb1 + j + pcs.outOff] = inA[r*pcs.nb01/2+j + pcs.inAOff];
+    }
+}
diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp
new file mode 100644
index 0000000000000..cddba929b5701
--- /dev/null
+++ b/kompute/op_getrows_q4_0.comp
@@ -0,0 +1,179 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+#define UNALIGNED_INPUT inA
+
+block_q4_0 get_unaligned_block_q4_0(uint index) {
+    block_q4_0 fres;
+    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
+    [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
+        fres.qs[it] = UNALIGNED_INPUT[index+2+it];
+    }
+    return fres;
+}
+
+void dequantize_row_q4_0(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
+    const uint qk = QK4_0;
+
+    const uint nb = k / qk;
+
+    for (uint i = 0; i < nb; i++) {
+        const block_q4_0 block = get_unaligned_block_q4_0(x + i*sizeof_block_q4_0);
+
+        const float16_t d = block.d;
+
+        for (uint j = 0; j < qk/2; ++j) {
+            const int x0 = (block.qs[j] & 0x0F) - 8;
+            const int x1 = (block.qs[j] >>   4) - 8;
+
+            out_[y+i*qk + j + 0   ] = float(x0)*d;
+            out_[y+i*qk + j + qk/2] = float(x1)*d;
+        }
+    }
+}
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff];
+
+    dequantize_row_q4_0(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
+}
diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp
new file mode 100644
index 0000000000000..151848a9d0468
--- /dev/null
+++ b/kompute/op_getrows_q4_1.comp
@@ -0,0 +1,181 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+#define UNALIGNED_INPUT inA
+
+block_q4_1 get_unaligned_block_q4_1(uint index) {
+    block_q4_1 fres;
+    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
+    fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
+    [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
+        fres.qs[it] = UNALIGNED_INPUT[index+4+it];
+    }
+    return fres;
+}
+
+void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
+    const uint qk = QK4_1;
+
+    const uint nb = k / qk;
+
+    for (uint i = 0; i < nb; i++) {
+        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0);
+
+        const float16_t d = block.d;
+        const float16_t m = block.m;
+
+        for (uint j = 0; j < qk/2; ++j) {
+            const int x0 = (block.qs[j] & 0x0F);
+            const int x1 = (block.qs[j] >>   4);
+
+            out_[y+i*qk + j + 0   ] = float(x0)*d + m;
+            out_[y+i*qk + j + qk/2] = float(x1)*d + m;
+        }
+    }
+}
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff];
+
+    dequantize_row_q4_1(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
+}
diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp
new file mode 100644
index 0000000000000..4907015d8ca37
--- /dev/null
+++ b/kompute/op_mul.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+
+    out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i) + pcs.inBOff];
+}
\ No newline at end of file
diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp
new file mode 100644
index 0000000000000..f1198b59384f2
--- /dev/null
+++ b/kompute/op_mul_mat_f16.comp
@@ -0,0 +1,177 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 64) in;
+
+layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    uint nb01;
+    uint nb02;
+    uint nb11;
+    uint nb12;
+    int ne0;
+    int ne1;
+} pcs;
+
+shared float sum[gl_WorkGroupSize.x];
+
+void main() {
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint im = gl_WorkGroupID.z;
+
+    const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
+    const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+
+    sum[gl_LocalInvocationID.x] = 0.0;
+
+    for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
+        sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
+    }
+
+    // accumulate the sum from all threads in the threadgroup
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    if (gl_LocalInvocationID.x == 0) {
+        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
+    }
+}
diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp
new file mode 100644
index 0000000000000..206aea7d5a512
--- /dev/null
+++ b/kompute/op_mul_mat_q4_0.comp
@@ -0,0 +1,195 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+} pcs;
+
+shared float sum[64];
+
+void main() {
+    const uint nb = uint(pcs.ne00/QK4_0);
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+
+    const uint x = r0*nb; // Based from inA without base offset
+    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
+
+    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
+    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
+
+    const uint ix = gl_LocalInvocationID.y/4;           // 0 or 1
+    const uint iy = gl_LocalInvocationID.y - 4*ix;      // 0...3
+
+    const uint first = 4 * iy;
+
+    float sumf = 0.0;
+
+    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
+        const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
+        const float d = float(u8BufToFloat16(inA, index));
+
+        const uint xl = first; // Based from bl->qs
+        const uint yl = y + i * QK4_0 + first; // Based from inB
+
+        vec2 acc = vec2(0.0, 0.0);
+
+        for (int j = 0; j < 4; ++j) {
+            const uint8_t b = inA[index+2+xl+j];
+            acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
+            acc.y += inB[yl+j] + inB[yl+j+16];
+        }
+
+        sumf += d * (acc.x - 8.*acc.y);
+    }
+
+    sum[ith] = sumf;
+
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    barrier();
+    if (ith == 0) {
+        float sumTotal = 0.0;
+        for (uint i = 0; i < nth; ++i) {
+            sumTotal += sum[i];
+        }
+        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
+    }
+}
diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp
new file mode 100644
index 0000000000000..8bdf810a1fa6b
--- /dev/null
+++ b/kompute/op_mul_mat_q4_1.comp
@@ -0,0 +1,218 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+} pcs;
+
+shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];
+
+#define UNALIGNED_INPUT inA
+
+block_q4_1 get_unaligned_block_q4_1(uint index) {
+    block_q4_1 fres;
+    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
+    fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
+    [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
+        fres.qs[it] = UNALIGNED_INPUT[index+4+it];
+    }
+    return fres;
+}
+
+void main() {
+    const uint nb = uint(pcs.ne00/QK4_1);
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+
+    const uint x = r0*nb; // Based from inA without base offset
+    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
+
+    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
+    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
+
+    const uint ix = gl_LocalInvocationID.y/4;           // 0 or 1
+    const uint iy = gl_LocalInvocationID.y - 4*ix;      // 0...3
+
+    const uint first = 4 * iy;
+
+    float sumf = 0.0;
+
+    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
+        //TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
+
+        const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
+
+        const float d = float(block.d);
+        const float m = float(block.m);
+
+        const uint xl = first; // Based from bl->qs
+        const uint yl = y + i * QK4_1 + first; // Based from inB
+
+        vec2 acc = vec2(0.0, 0.0);
+
+        for (int j = 0; j < 4; ++j) {
+            acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
+            acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
+        }
+
+        sumf += d * (acc.x - acc.y);
+    }
+
+    sum[ith] = sumf;
+
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    barrier();
+    memoryBarrierShared();
+    if (ith%4 == 0) {
+        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
+    }
+    barrier();
+    memoryBarrierShared();
+    if (ith%16 == 0) {
+        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
+    }
+    barrier();
+    memoryBarrierShared();
+    if (ith == 0) {
+        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
+        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
+    }
+}
diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp
new file mode 100644
index 0000000000000..3defd0a5f492f
--- /dev/null
+++ b/kompute/op_mulrow.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+
+    out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i % pcs.row) + pcs.inBOff];
+}
\ No newline at end of file
diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp
new file mode 100644
index 0000000000000..ec0a8568d0a14
--- /dev/null
+++ b/kompute/op_norm.comp
@@ -0,0 +1,209 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 256
+
+layout(local_size_x = nth) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    uint ne00;
+    uint nb01;
+    float eps;
+} pcs;
+
+shared float sum[nth];
+
+void main() {
+    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
+    // MEAN
+    // parallel sum
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        sum[gl_LocalInvocationID.x] += in_[x+i00];
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    if (gl_LocalInvocationID.x == 0) {
+        sum[0] /= float(pcs.ne00);
+    }
+    barrier();
+    memoryBarrierShared();
+    const float mean = sum[0];
+
+    // recenter
+    const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        out_[y+i00] = in_[x+i00] - mean;
+    }
+
+    // VARIANCE
+    // parallel sum
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    if (gl_LocalInvocationID.x == 0) {
+        sum[0] /= float(pcs.ne00);
+    }
+    barrier();
+    memoryBarrierShared();
+    const float variance = sum[0];
+
+    const float scale = 1.0f/sqrt(variance + pcs.eps);
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        out_[y+i00] *= scale;
+    }
+}
diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp
new file mode 100644
index 0000000000000..bc2c31f4368db
--- /dev/null
+++ b/kompute/op_relu.comp
@@ -0,0 +1,141 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+} pcs;
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+
+    out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
+}
diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp
new file mode 100644
index 0000000000000..784713c36ef26
--- /dev/null
+++ b/kompute/op_rmsnorm.comp
@@ -0,0 +1,178 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 256
+
+layout(local_size_x = nth) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    uint ne00;
+    uint nb01;
+    float eps;
+} pcs;
+
+shared float sum[nth];
+
+void main() {
+    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
+
+    // parallel sum
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    if (gl_LocalInvocationID.x == 0) {
+        sum[0] /= float(pcs.ne00);
+    }
+    barrier();
+    memoryBarrierShared();
+
+    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
+
+    const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        out_[y+i00] = in_[x+i00] * scale;
+    }
+}
diff --git a/kompute/op_rope.comp b/kompute/op_rope.comp
new file mode 100644
index 0000000000000..ca6bb6831e06c
--- /dev/null
+++ b/kompute/op_rope.comp
@@ -0,0 +1,183 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorIn { float in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    uint n_past;
+    int n_dims;
+    int mode;
+    float freq_base;
+    float freq_scale;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    const bool is_neox = (pcs.mode & 2) != 0;
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    const uint p = ((pcs.mode & 1) == 0 ? pcs.n_past + i2 : i2);
+
+    float theta = pcs.freq_scale * float(p);
+
+    if (!is_neox) {
+        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
+            const float cos_theta = cos(theta);
+            const float sin_theta = sin(theta);
+
+            theta *= theta_scale;
+
+            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = in_[src];
+            const float x1 = in_[src+1];
+
+            out_[dst_data] = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
+        }
+    } else {
+        // TODO: implement
+    }
+}
diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp
new file mode 100644
index 0000000000000..f537121a4945f
--- /dev/null
+++ b/kompute/op_scale.comp
@@ -0,0 +1,142 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    float scale;
+} pcs;
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+
+    out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
+}
\ No newline at end of file
diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp
new file mode 100644
index 0000000000000..90c034ac7c341
--- /dev/null
+++ b/kompute/op_silu.comp
@@ -0,0 +1,141 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+} pcs;
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const float x = in_[i + pcs.inOff];
+
+    out_[i + pcs.outOff] = x / (1.0 + exp(-x));
+}
diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
new file mode 100644
index 0000000000000..ce0e71924b4c9
--- /dev/null
+++ b/kompute/op_softmax.comp
@@ -0,0 +1,197 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+
+layout(local_size_x = nth) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+} pcs;
+
+shared float buf[nth];
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
+    const uint psrc0 = extra_off + pcs.inOff; // Based from in_
+    const uint pdst = extra_off + pcs.outOff; // Based from out_
+
+    // parallel max
+    buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    const float max_ = buf[0];
+
+    // parallel sum
+    buf[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    const float sum = buf[0];
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
+    }
+}
diff --git a/kompute/scripts/convert_shaders.py b/kompute/scripts/convert_shaders.py
new file mode 100644
index 0000000000000..9375b6701461e
--- /dev/null
+++ b/kompute/scripts/convert_shaders.py
@@ -0,0 +1,148 @@
+"""
+    Script to handle conversion of compute shaders to spirv and to headers
+"""
+import os
+import sys
+import logging
+import click
+import subprocess
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler())
+
+is_windows = sys.platform.startswith('win')
+
+CWD=os.path.dirname(os.path.abspath(__file__))
+XXD_LINUX_CMD="xxd"
+XXD_WINDOWS_CMD=os.path.abspath(os.path.join(CWD, "..\\external\\bin\\", "xxd.exe"))
+
+SHADER_GENERATED_NOTICE = """/*
+    THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT
+
+    ---
+
+    Copyright 2020 The Institute for Ethical AI & Machine Learning
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+"""
+
+@click.command()
+@click.option(
+    "--shader-path",
+    "-p",
+    envvar="KOMPUTE_SHADER_PATH",
+    required=True,
+    help="The path for the directory to build and convert shaders",
+)
+@click.option(
+    "--shader-binary",
+    "-s",
+    envvar="KOMPUTE_SHADER_BINARY",
+    required=True,
+    help="The path for the directory to build and convert shaders",
+)
+@click.option(
+    "--header-path",
+    "-c",
+    envvar="KOMPUTE_HEADER_PATH",
+    default="",
+    required=False,
+    help="The (optional) output file for the cpp header files",
+)
+@click.option(
+    "--verbose",
+    "-v",
+    envvar="KOMPUTE_HEADER_PATH",
+    default=False,
+    is_flag=True,
+    help="Enable versbosity if flag is provided",
+)
+def run_cli(
+    shader_path: str = None,
+    shader_binary: str = None,
+    header_path: bool = None,
+    verbose: bool = None,
+):
+    """
+    CLI function for shader generation
+    """
+
+    if verbose:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.WARNING)
+
+    logger.debug(f"Starting script with variables: {locals()}")
+
+    if is_windows:
+        logger.debug(f"Running on windows, converting input paths")
+        shader_path = shader_path.replace("/", "\\")
+        header_path = header_path.replace("/", "\\")
+
+    shader_files = []
+    for root, directory, files in os.walk(shader_path):
+        for file in files:
+            if file.endswith(".comp"):
+                shader_files.append(os.path.join(root, file))
+
+    run_cmd = lambda *args: subprocess.check_output([*args]).decode()
+
+    logger.debug(f"Output spirv path: {shader_path}")
+    logger.debug(f"Converting files to spirv: {shader_files}")
+
+    spirv_files = []
+    for file in shader_files:
+        logger.debug(f"Converting to spirv: {file}")
+        spirv_file = f"{file}.spv"
+        run_cmd(shader_binary, "-V", file, "-o", spirv_file)
+        spirv_files.append(spirv_file)
+
+    # Create cpp files if header_path provided
+    if header_path:
+        logger.debug(f"Header path provided. Converting bin files to hpp.")
+        logger.debug(f"Output header path: {shader_path}")
+
+        # Check if xxd command options are available
+        if is_windows:
+            xxd_cmd = XXD_WINDOWS_CMD
+        else:
+            xxd_cmd = XXD_LINUX_CMD
+
+        for file in spirv_files:
+            print(xxd_cmd)
+            header_data = str(run_cmd(xxd_cmd, "-i", file))
+            # Ensuring the variable is a static const unsigned
+            header_data = header_data.replace("unsigned", "static const unsigned")
+            if is_windows:
+                raw_file_name = file.split("\\")[-1]
+            else:
+                raw_file_name = file.split("/")[-1]
+            file_name = f"shader{raw_file_name}"
+            header_file = file_name.replace(".comp.spv", ".hpp")
+            header_file_define = "SHADEROP_" + header_file.replace(".", "_").upper()
+            logger.debug(f"Converting to hpp: {file_name}")
+            with open(os.path.join(header_path, header_file), "w+", newline='\n') as fstream:
+                fstream.write(f"{SHADER_GENERATED_NOTICE}\n")
+                fstream.write(f"#ifndef {header_file_define}\n")
+                fstream.write(f"#define {header_file_define}\n\n")
+                fstream.write("namespace kp {\n")
+                fstream.write("namespace shader_data {\n")
+                fstream.write(f"{header_data}")
+                fstream.write("}\n")
+                fstream.write("}\n")
+                fstream.write(f"#endif // define {header_file_define}\n")
+
+
+if __name__ == "__main__":
+    run_cli()
diff --git a/kompute/scripts/requirements.txt b/kompute/scripts/requirements.txt
new file mode 100644
index 0000000000000..4da0425044e90
--- /dev/null
+++ b/kompute/scripts/requirements.txt
@@ -0,0 +1,11 @@
+# CLI dependencies
+click==7.1.2
+
+# Dev dependencies
+black==19.10b0
+quom==1.2.0
+Sphinx==3.2.1
+sphinx_material==0.0.30
+breathe==4.20.0
+m2r2==0.2.5
+git+git://github.com/pybind/pybind11_mkdoc.git@master
diff --git a/kompute/setup.py b/kompute/setup.py
new file mode 100644
index 0000000000000..09faa8d1a7d32
--- /dev/null
+++ b/kompute/setup.py
@@ -0,0 +1,93 @@
+import os
+import re
+import platform
+import sys
+import sysconfig
+import subprocess
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+from distutils.version import LooseVersion
+
+curr_dir = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(curr_dir, 'README.md'), encoding='utf-8') as f:
+    long_description = f.read()
+
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir=''):
+        Extension.__init__(self, name, sources=[])
+        self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+    def run(self):
+        try:
+            out = subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
+        if cmake_version < '3.15':
+            raise RuntimeError("CMake >= 3.15 is required")
+
+        for ext in self.extensions:
+            self.build_extension(ext)
+
+    def build_extension(self, ext):
+        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+        # required for auto-detection of auxiliary "native" libs
+        if not extdir.endswith(os.path.sep):
+            extdir += os.path.sep
+
+        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                      '-DKOMPUTE_OPT_BUILD_PYTHON=ON',
+                      '-DKOMPUTE_OPT_LOG_LEVEL=Off',
+                      '-DKOMPUTE_OPT_USE_SPDLOG=Off',
+                      '-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
+                      '-DPYTHON_EXECUTABLE=' + sys.executable,
+                      '-DPYTHON_INCLUDE_DIR=' + sysconfig.get_path('include'),
+                      '-DPYTHON_LIBRARY=' + sysconfig.get_path('stdlib'),
+        ]
+
+        cfg = 'Debug' if self.debug else 'Release'
+        build_args = ['--config', cfg]
+
+        env = os.environ.copy()
+        oldCxxFlags = env.get('CXXFLAGS', '')
+        env['CXXFLAGS'] = f'{oldCxxFlags} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"'
+
+        if platform.system() == "Windows":
+            cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}']
+            if sys.maxsize > 2**32:
+                cmake_args += ['-A', 'x64']
+            build_args += ['--', '/m']
+        else:
+            env['CXXFLAGS'] += ' -fPIC'
+            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+            build_args += ['--', '-j']
+            # Optional environment variable to limit the number of parallel jobs for GitHub actions to reduce RAM usage
+            if 'KOMPUTE_PYTHON_NUM_PARALLEL_THREADS' in env:
+                build_args += env['KOMPUTE_PYTHON_NUM_PARALLEL_THREADS']
+
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+
+        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+
+setup(
+    name='kp',
+    version='0.8.1',
+    author='Alejandro Saucedo',
+    description='Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    ext_modules=[CMakeExtension('kp')],
+    install_requires=[
+        "numpy<2.0.0"
+    ],
+    cmdclass=dict(build_ext=CMakeBuild),
+    zip_safe=False,
+    include_package_data=True,
+)
diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp
new file mode 100644
index 0000000000000..9c41ec90f1f24
--- /dev/null
+++ b/kompute/src/Algorithm.cpp
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include <fstream>
+
+#include "kompute/Algorithm.hpp"
+
+namespace kp {
+
+Algorithm::~Algorithm()
+{
+    KP_LOG_DEBUG("Kompute Algorithm Destructor started");
+
+    this->destroy();
+}
+
+bool
+Algorithm::isInit()
+{
+    return this->mPipeline && this->mPipelineCache && this->mPipelineLayout &&
+           this->mDescriptorPool && this->mDescriptorSet &&
+           this->mDescriptorSetLayout && this->mShaderModule;
+}
+
+void
+Algorithm::destroy()
+{
+    // We don't have to free memory on destroy as it's freed by the
+    // commandBuffer destructor if (this->mPushConstantsData) {
+    //     free(this->mPushConstantsData);
+    // }
+    // if (this->mSpecializationConstantsData) {
+    //     free(this->mSpecializationConstantsData);
+    // }
+
+    if (!this->mDevice) {
+        KP_LOG_WARN("Kompute Algorithm destroy function reached with null "
+                    "Device pointer");
+        return;
+    }
+
+    if (this->mFreePipeline && this->mPipeline) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline");
+        if (!this->mPipeline) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
+                        "pipeline but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mPipeline,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mPipeline = nullptr;
+    }
+
+    if (this->mFreePipelineCache && this->mPipelineCache) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline cache");
+        if (!this->mPipelineCache) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
+                        "pipeline cache but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mPipelineCache,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mPipelineCache = nullptr;
+    }
+
+    if (this->mFreePipelineLayout && this->mPipelineLayout) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout");
+        if (!this->mPipelineLayout) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
+                        "pipeline layout but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mPipelineLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mPipelineLayout = nullptr;
+    }
+
+    if (this->mFreeShaderModule && this->mShaderModule) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying shader module");
+        if (!this->mShaderModule) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy shader "
+                        "module but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mShaderModule,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mShaderModule = nullptr;
+    }
+
+    freeParameters();
+}
+
+void
+Algorithm::freeParameters()
+{
+    if (this->mFreeDescriptorSetLayout && this->mDescriptorSetLayout) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Set Layout");
+        if (!this->mDescriptorSetLayout) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
+                        "descriptor set layout but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mDescriptorSetLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDescriptorSetLayout = nullptr;
+    }
+}
+
+void
+Algorithm::createParameters()
+{
+    KP_LOG_DEBUG("Kompute Algorithm createParameters started");
+    if (!*this->mDescriptorPool) {
+        KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
+        return;
+    }
+
+    std::vector<vk::DescriptorSetLayoutBinding> descriptorSetBindings;
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        descriptorSetBindings.push_back(
+          vk::DescriptorSetLayoutBinding(i, // Binding index
+                                         vk::DescriptorType::eStorageBuffer,
+                                         1, // Descriptor count
+                                         vk::ShaderStageFlagBits::eCompute));
+    }
+
+    // This is the component that is fed into the pipeline
+    vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo(
+      vk::DescriptorSetLayoutCreateFlags(),
+      static_cast<uint32_t>(descriptorSetBindings.size()),
+      descriptorSetBindings.data());
+
+    KP_LOG_DEBUG("Kompute Algorithm creating descriptor set layout");
+    this->mDescriptorSetLayout = std::make_shared<vk::DescriptorSetLayout>();
+    vk::Result result = this->mDevice->createDescriptorSetLayout(
+      &descriptorSetLayoutInfo, nullptr, this->mDescriptorSetLayout.get());
+
+   if (result != vk::Result::eSuccess) {
+        KP_LOG_ERROR("Failed to create descriptor set layout. Error code: {}", vk::to_string(result));
+    } else {
+        this->mFreeDescriptorSetLayout = true;
+        KP_LOG_DEBUG("Successfully allocated descriptor set layout.");
+    }
+
+    vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
+      *this->mDescriptorPool,
+      1, // Descriptor set layout count
+      this->mDescriptorSetLayout.get());
+
+    KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
+    this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
+    result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
+                                          this->mDescriptorSet.get());
+
+    if (result != vk::Result::eSuccess) {
+        KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
+    } else {
+        this->mFreeDescriptorSet = true;
+        KP_LOG_DEBUG("Successfully allocated descriptor sets.");
+    }
+
+    this->mFreeDescriptorSet = true;
+
+    KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
+
+        vk::DescriptorBufferInfo descriptorBufferInfo =
+          this->mTensors[i]->constructDescriptorBufferInfo();
+
+        computeWriteDescriptorSets.push_back(
+          vk::WriteDescriptorSet(*this->mDescriptorSet,
+                                 i, // Destination binding
+                                 0, // Destination array element
+                                 1, // Descriptor count
+                                 vk::DescriptorType::eStorageBuffer,
+                                 nullptr, // Descriptor image info
+                                 &descriptorBufferInfo));
+
+        this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
+                                            nullptr);
+    }
+
+    KP_LOG_DEBUG("Kompute Algorithm successfully run init");
+}
+
+void
+Algorithm::updateParameters()
+{
+    KP_LOG_DEBUG("Kompute Algorithm updateParameters started");
+    if (!*this->mDescriptorPool) {
+        KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
+        return;
+    }
+
+    vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
+      *this->mDescriptorPool,
+      1, // Descriptor set layout count
+      this->mDescriptorSetLayout.get());
+
+    KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
+    this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
+    vk::Result result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
+                                          this->mDescriptorSet.get());
+
+    if (result != vk::Result::eSuccess) {
+        KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
+    } else {
+        this->mFreeDescriptorSet = true;
+        KP_LOG_DEBUG("Successfully allocated descriptor sets.");
+    }
+
+    this->mFreeDescriptorSet = true;
+
+    KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
+
+        vk::DescriptorBufferInfo descriptorBufferInfo =
+          this->mTensors[i]->constructDescriptorBufferInfo();
+
+        computeWriteDescriptorSets.push_back(
+          vk::WriteDescriptorSet(*this->mDescriptorSet,
+                                 i, // Destination binding
+                                 0, // Destination array element
+                                 1, // Descriptor count
+                                 vk::DescriptorType::eStorageBuffer,
+                                 nullptr, // Descriptor image info
+                                 &descriptorBufferInfo));
+
+        this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
+                                            nullptr);
+    }
+
+    KP_LOG_DEBUG("Kompute Algorithm successfully run init");
+}
+
+void
+Algorithm::createShaderModule()
+{
+    KP_LOG_DEBUG("Kompute Algorithm createShaderModule started");
+
+    vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(),
+                                                sizeof(uint32_t) *
+                                                  this->mSpirv.size(),
+                                                this->mSpirv.data());
+
+    KP_LOG_DEBUG("Kompute Algorithm Creating shader module. ShaderFileSize: {}",
+                 this->mSpirv.size());
+    this->mFreeShaderModule = true;
+    this->mShaderModule = std::make_shared<vk::ShaderModule>();
+    this->mDevice->createShaderModule(
+      &shaderModuleInfo, nullptr, this->mShaderModule.get());
+    this->mFreeShaderModule = true;
+
+    KP_LOG_DEBUG("Kompute Algorithm create shader module success");
+}
+
+void
+Algorithm::createPipeline()
+{
+    KP_LOG_DEBUG("Kompute Algorithm calling create Pipeline");
+
+    vk::PipelineLayoutCreateInfo pipelineLayoutInfo(
+      vk::PipelineLayoutCreateFlags(),
+      1, // Set layout count
+      this->mDescriptorSetLayout.get());
+
+    vk::PushConstantRange pushConstantRange;
+    if (this->mPushConstantsSize) {
+        pushConstantRange.setStageFlags(vk::ShaderStageFlagBits::eCompute);
+        pushConstantRange.setOffset(0);
+        pushConstantRange.setSize(this->mPushConstantsDataTypeMemorySize *
+                                  this->mPushConstantsSize);
+
+        pipelineLayoutInfo.setPushConstantRangeCount(1);
+        pipelineLayoutInfo.setPPushConstantRanges(&pushConstantRange);
+    }
+
+    this->mPipelineLayout = std::make_shared<vk::PipelineLayout>();
+    this->mDevice->createPipelineLayout(
+      &pipelineLayoutInfo, nullptr, this->mPipelineLayout.get());
+    this->mFreePipelineLayout = true;
+
+    std::vector<vk::SpecializationMapEntry> specializationEntries;
+
+    for (uint32_t i = 0; i < this->mSpecializationConstantsSize; i++) {
+        vk::SpecializationMapEntry specializationEntry(
+          static_cast<uint32_t>(i),
+          static_cast<uint32_t>(
+            this->mSpecializationConstantsDataTypeMemorySize * i),
+          this->mSpecializationConstantsDataTypeMemorySize);
+
+        specializationEntries.push_back(specializationEntry);
+    }
+
+    // This passes ownership of the memory so we remove ownership from
+    // specialization container by using "transferDataOwnership"
+    vk::SpecializationInfo specializationInfo(
+      static_cast<uint32_t>(specializationEntries.size()),
+      specializationEntries.data(),
+      this->mSpecializationConstantsDataTypeMemorySize *
+        this->mSpecializationConstantsSize,
+      this->mSpecializationConstantsData);
+
+    vk::PipelineShaderStageCreateInfo shaderStage(
+      vk::PipelineShaderStageCreateFlags(),
+      vk::ShaderStageFlagBits::eCompute,
+      *this->mShaderModule,
+      "main",
+      &specializationInfo);
+
+    static std::shared_ptr<vk::PipelineCache> globalPipelineCache = std::make_shared<vk::PipelineCache>();
+    if(!*globalPipelineCache) {
+       vk::PipelineCacheCreateInfo pipelineCacheInfo =
+         vk::PipelineCacheCreateInfo();
+      this->mPipelineCache = globalPipelineCache;
+      this->mFreePipelineCache = true;
+      this->mDevice->createPipelineCache(
+        &pipelineCacheInfo, nullptr, globalPipelineCache.get());
+    }
+
+    vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(),
+                                               shaderStage,
+                                               *this->mPipelineLayout,
+                                               vk::Pipeline(),
+                                               0);
+
+#ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
+    vk::ResultValue<vk::Pipeline> pipelineResult =
+      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo);
+
+    if (pipelineResult.result != vk::Result::eSuccess) {
+        throw std::runtime_error("Failed to create pipeline result: " +
+                                 vk::to_string(pipelineResult.result));
+    }
+
+    vk::Pipeline& pipeline = pipelineResult.value;
+    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
+    this->mFreePipeline = true;
+#else
+    vk::Pipeline pipeline =
+      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo)
+        .value;
+    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
+    this->mFreePipeline = true;
+#endif
+
+    // TODO: Update to consistent
+    // this->mPipeline = std::make_shared<vk::Pipeline>();
+    // this->mDevice->createComputePipelines(
+    //         *this->mPipelineCache, 1, &pipelineInfo, nullptr,
+    //         this->mPipeline.get());
+
+    KP_LOG_DEBUG("Kompute Algorithm Create Pipeline Success");
+}
+
+void
+Algorithm::recordBindCore(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute Algorithm binding pipeline");
+
+    commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute,
+                               *this->mPipeline);
+
+    KP_LOG_DEBUG("Kompute Algorithm binding descriptor sets");
+
+    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                                     *this->mPipelineLayout,
+                                     0, // First set
+                                     *this->mDescriptorSet,
+                                     nullptr // Dispatcher
+    );
+}
+
+void
+Algorithm::recordBindPush(const vk::CommandBuffer& commandBuffer)
+{
+    if (this->mPushConstantsSize) {
+        KP_LOG_DEBUG("Kompute Algorithm binding push constants memory size: {}",
+                     this->mPushConstantsSize *
+                       this->mPushConstantsDataTypeMemorySize);
+
+        commandBuffer.pushConstants(*this->mPipelineLayout,
+                                    vk::ShaderStageFlagBits::eCompute,
+                                    0,
+                                    this->mPushConstantsSize *
+                                      this->mPushConstantsDataTypeMemorySize,
+                                    this->mPushConstantsData);
+    }
+}
+
+void
+Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute Algorithm recording dispatch");
+
+    commandBuffer.dispatch(
+      this->mWorkgroup[0], this->mWorkgroup[1], this->mWorkgroup[2]);
+}
+
+void
+Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize)
+{
+
+    KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size");
+
+    // The dispatch size is set up based on either explicitly provided template
+    // parameters or by default it would take the shape and size of the tensors
+    if (workgroup[0] > 0) {
+        // If at least the x value is provided we use mainly the parameters
+        // provided
+        this->mWorkgroup = { workgroup[0],
+                             workgroup[1] > 0 ? workgroup[1] : 1,
+                             workgroup[2] > 0 ? workgroup[2] : 1 };
+    } else {
+        this->mWorkgroup = { minSize, 1, 1 };
+    }
+
+    KP_LOG_INFO("Kompute OpAlgoCreate set dispatch size X: {}, Y: {}, Z: {}",
+                this->mWorkgroup[0],
+                this->mWorkgroup[1],
+                this->mWorkgroup[2]);
+}
+
+const Workgroup&
+Algorithm::getWorkgroup()
+{
+    return this->mWorkgroup;
+}
+
+const std::vector<std::shared_ptr<Tensor>>&
+Algorithm::getTensors()
+{
+    return this->mTensors;
+}
+
+void Algorithm::setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    this->mTensors = tensors;
+}
+
+}
diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
new file mode 100644
index 0000000000000..f4f8440f4ffdb
--- /dev/null
+++ b/kompute/src/CMakeLists.txt
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+if(KOMPUTE_OPT_ANDROID_BUILD)
+    find_library(android android)
+endif()
+
+cmake_minimum_required(VERSION 3.20)
+
+add_library(kompute Algorithm.cpp
+    Manager.cpp
+    OpAlgoDispatch.cpp
+    OpMemoryBarrier.cpp
+    OpTensorCopy.cpp
+    OpTensorSyncDevice.cpp
+    OpTensorSyncLocal.cpp
+    OpBufferSyncDevice.cpp
+    OpBufferSyncLocal.cpp
+    Sequence.cpp
+    Tensor.cpp
+    Core.cpp)
+
+add_library(kompute::kompute ALIAS kompute)
+
+# Set version for shared libraries.
+set_target_properties(kompute
+    PROPERTIES
+    VERSION ${${PROJECT_NAME}_VERSION}
+    SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR})
+
+# Import GNU common install directory variables
+include(GNUInstallDirs)
+
+install(TARGETS kompute
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+# Include CMake helpers for package config files
+# Follow this installation guideline: https://cmake.org/cmake/help/latest/manual/cmake-packages.7.html
+include(CMakePackageConfigHelpers)
+
+configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in
+    "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake"
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
+
+install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake
+    ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
+
+# ####################################################
+# Linking
+# ####################################################
+if(KOMPUTE_OPT_ANDROID_BUILD)
+    target_link_libraries(kompute PUBLIC vulkanAndroid
+        android
+        kp_logger
+        kp_shader
+        fmt::fmt)
+else()
+    target_link_libraries(kompute PUBLIC Vulkan::Vulkan
+        kp_logger
+        kp_shader
+        fmt::fmt)
+endif()
+
+if(KOMPUTE_OPT_BUILD_PYTHON)
+    include_directories(${PYTHON_INCLUDE_DIRS})
+
+    target_link_libraries(kompute PRIVATE pybind11::headers ${PYTHON_LIBRARIES})
+endif()
+
+if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
+    target_link_libraries(kompute PUBLIC Vulkan-Headers)
+endif()
+
+# ####################################################
+# Misc
+# ####################################################
+add_subdirectory(logger)
+add_subdirectory(shaders)
+add_subdirectory(include)
diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp
new file mode 100644
index 0000000000000..60849a3ecd940
--- /dev/null
+++ b/kompute/src/Core.cpp
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Core.hpp"
+
+#if VK_USE_PLATFORM_ANDROID_KHR
+#ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+#define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+/**
+ * Ensures support for dynamic loading of Vulkan functions on Android.
+ * Acts as a default store for loaded functions.
+ * More information:
+ * https://github.com/KhronosGroup/Vulkan-Hpp#vulkan_hpp_default_dispatcher
+ **/
+VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+#endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+
+namespace kp {
+} // namespace kp
diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
new file mode 100644
index 0000000000000..07514ed9a10c2
--- /dev/null
+++ b/kompute/src/Manager.cpp
@@ -0,0 +1,493 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Manager.hpp"
+#include "fmt/format.h"
+#include "kompute/logger/Logger.hpp"
+#include <fmt/core.h>
+#include <iterator>
+#include <set>
+#include <sstream>
+#include <string>
+
+namespace kp {
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+static VKAPI_ATTR VkBool32 VKAPI_CALL
+debugMessageCallback(VkDebugReportFlagsEXT /*flags*/,
+                     VkDebugReportObjectTypeEXT /*objectType*/,
+                     uint64_t /*object*/,
+                     size_t /*location*/,
+                     int32_t /*messageCode*/,
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
+                     const char* pLayerPrefix,
+                     const char* pMessage,
+#else
+                     const char* /*pLayerPrefix*/,
+                     const char* /*pMessage*/,
+#endif
+                     void* /*pUserData*/)
+{
+    KP_LOG_DEBUG("[VALIDATION]: {} - {}", pLayerPrefix, pMessage);
+    return VK_FALSE;
+}
+#endif
+
+Manager::Manager()
+{
+    this->mManageResources = true;
+
+// Make sure the logger is setup
+#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
+    logger::setupLogger();
+#endif
+    this->createInstance();
+}
+
+void Manager::initializeDevice(uint32_t physicalDeviceIndex,
+                               const std::vector<uint32_t>& familyQueueIndices,
+                               const std::vector<std::string>& desiredExtensions)
+{
+    this->createDevice(
+      familyQueueIndices, physicalDeviceIndex, desiredExtensions);
+}
+
+Manager::~Manager()
+{
+    KP_LOG_DEBUG("Kompute Manager Destructor started");
+    this->destroy();
+}
+
+void
+Manager::destroy()
+{
+
+    KP_LOG_DEBUG("Kompute Manager destroy() started");
+
+    if (this->mDevice == nullptr) {
+        KP_LOG_ERROR(
+          "Kompute Manager destructor reached with null Device pointer");
+        return;
+    }
+
+    if (this->mManageResources && this->mManagedSequences.size()) {
+        KP_LOG_DEBUG("Kompute Manager explicitly running destructor for "
+                     "managed sequences");
+        for (const std::weak_ptr<Sequence>& weakSq : this->mManagedSequences) {
+            if (std::shared_ptr<Sequence> sq = weakSq.lock()) {
+                sq->destroy();
+            }
+        }
+        this->mManagedSequences.clear();
+    }
+
+    if (this->mManageResources && this->mManagedAlgorithms.size()) {
+        KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms");
+        for (const std::weak_ptr<Algorithm>& weakAlgorithm :
+             this->mManagedAlgorithms) {
+            if (std::shared_ptr<Algorithm> algorithm = weakAlgorithm.lock()) {
+                algorithm->destroy();
+            }
+        }
+        this->mManagedAlgorithms.clear();
+    }
+
+    if (this->mManageResources && this->mManagedTensors.size()) {
+        KP_LOG_DEBUG("Kompute Manager explicitly freeing tensors");
+        for (const std::weak_ptr<Tensor>& weakTensor : this->mManagedTensors) {
+            if (std::shared_ptr<Tensor> tensor = weakTensor.lock()) {
+                tensor->destroy();
+            }
+        }
+        this->mManagedTensors.clear();
+    }
+
+    if (this->mFreeDevice) {
+        KP_LOG_INFO("Destroying device");
+        this->mDevice->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice = nullptr;
+        KP_LOG_DEBUG("Kompute Manager Destroyed Device");
+    }
+
+    if (this->mInstance == nullptr) {
+        KP_LOG_ERROR(
+          "Kompute Manager destructor reached with null Instance pointer");
+        return;
+    }
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    if (this->mDebugReportCallback) {
+        this->mInstance->destroyDebugReportCallbackEXT(
+          this->mDebugReportCallback, nullptr, this->mDebugDispatcher);
+        KP_LOG_DEBUG("Kompute Manager Destroyed Debug Report Callback");
+    }
+#endif
+
+    if (this->mFreeInstance) {
+        this->mInstance->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mInstance = nullptr;
+        KP_LOG_DEBUG("Kompute Manager Destroyed Instance");
+    }
+}
+
+void
+Manager::createInstance()
+{
+
+    KP_LOG_DEBUG("Kompute Manager creating instance");
+
+    this->mFreeInstance = true;
+
+    vk::ApplicationInfo applicationInfo;
+    applicationInfo.pApplicationName = "Kompute";
+    applicationInfo.pEngineName = "Kompute";
+    applicationInfo.apiVersion = KOMPUTE_VK_API_VERSION;
+    applicationInfo.engineVersion = KOMPUTE_VK_API_VERSION;
+    applicationInfo.applicationVersion = KOMPUTE_VK_API_VERSION;
+
+    std::vector<const char*> applicationExtensions;
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME);
+#endif
+
+    vk::InstanceCreateInfo computeInstanceCreateInfo;
+    computeInstanceCreateInfo.pApplicationInfo = &applicationInfo;
+    if (!applicationExtensions.empty()) {
+        computeInstanceCreateInfo.enabledExtensionCount =
+          (uint32_t)applicationExtensions.size();
+        computeInstanceCreateInfo.ppEnabledExtensionNames =
+          applicationExtensions.data();
+    }
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    KP_LOG_DEBUG("Kompute Manager adding debug validation layers");
+    // We'll identify the layers that are supported
+    std::vector<const char*> validLayerNames;
+    std::vector<const char*> desiredLayerNames = {
+        "VK_LAYER_LUNARG_assistant_layer",
+        "VK_LAYER_LUNARG_standard_validation",
+        "VK_LAYER_KHRONOS_validation",
+    };
+    std::vector<std::string> envLayerNames;
+    const char* envLayerNamesVal = std::getenv("KOMPUTE_ENV_DEBUG_LAYERS");
+    if (envLayerNamesVal != nullptr && *envLayerNamesVal != '\0') {
+        KP_LOG_DEBUG("Kompute Manager adding environment layers: {}",
+                     envLayerNamesVal);
+        std::istringstream iss(envLayerNamesVal);
+        std::istream_iterator<std::string> beg(iss);
+        std::istream_iterator<std::string> end;
+        envLayerNames = std::vector<std::string>(beg, end);
+        for (const std::string& layerName : envLayerNames) {
+            desiredLayerNames.push_back(layerName.c_str());
+        }
+        KP_LOG_DEBUG("Desired layers: {}", fmt::join(desiredLayerNames, ", "));
+    }
+
+    // Identify the valid layer names based on the desiredLayerNames
+    {
+        std::set<std::string> uniqueLayerNames;
+        std::vector<vk::LayerProperties> availableLayerProperties =
+          vk::enumerateInstanceLayerProperties();
+        for (vk::LayerProperties layerProperties : availableLayerProperties) {
+            std::string layerName(layerProperties.layerName.data());
+            uniqueLayerNames.insert(layerName);
+        }
+        KP_LOG_DEBUG("Available layers: {}", fmt::join(uniqueLayerNames, ", "));
+        for (const char* desiredLayerName : desiredLayerNames) {
+            if (uniqueLayerNames.count(desiredLayerName) != 0) {
+                validLayerNames.push_back(desiredLayerName);
+            }
+        }
+    }
+
+    if (!validLayerNames.empty()) {
+        KP_LOG_DEBUG(
+          "Kompute Manager Initializing instance with valid layers: {}",
+          fmt::join(validLayerNames, ", "));
+        computeInstanceCreateInfo.enabledLayerCount =
+          static_cast<uint32_t>(validLayerNames.size());
+        computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data();
+    } else {
+        KP_LOG_WARN("Kompute Manager no valid layer names found from desired "
+                    "layer names");
+    }
+#endif
+
+#if VK_USE_PLATFORM_ANDROID_KHR
+    vk::DynamicLoader dl;
+    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
+      dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+
+    this->mInstance = std::make_shared<vk::Instance>();
+    vk::createInstance(
+      &computeInstanceCreateInfo, nullptr, this->mInstance.get());
+
+#if VK_USE_PLATFORM_ANDROID_KHR
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance);
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+
+    KP_LOG_DEBUG("Kompute Manager Instance Created");
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    KP_LOG_DEBUG("Kompute Manager adding debug callbacks");
+    if (validLayerNames.size() > 0) {
+        vk::DebugReportFlagsEXT debugFlags =
+          vk::DebugReportFlagBitsEXT::eError |
+          vk::DebugReportFlagBitsEXT::eWarning;
+        vk::DebugReportCallbackCreateInfoEXT debugCreateInfo = {};
+        debugCreateInfo.pfnCallback =
+          (PFN_vkDebugReportCallbackEXT)debugMessageCallback;
+        debugCreateInfo.flags = debugFlags;
+
+        this->mDebugDispatcher.init(*this->mInstance, &vkGetInstanceProcAddr);
+        this->mDebugReportCallback =
+          this->mInstance->createDebugReportCallbackEXT(
+            debugCreateInfo, nullptr, this->mDebugDispatcher);
+    }
+#endif
+}
+
+void
+Manager::clear()
+{
+    if (this->mManageResources) {
+        this->mManagedTensors.erase(
+          std::remove_if(begin(this->mManagedTensors),
+                         end(this->mManagedTensors),
+                         [](std::weak_ptr<Tensor> t) { return t.expired(); }),
+          end(this->mManagedTensors));
+        this->mManagedAlgorithms.erase(
+          std::remove_if(
+            begin(this->mManagedAlgorithms),
+            end(this->mManagedAlgorithms),
+            [](std::weak_ptr<Algorithm> t) { return t.expired(); }),
+          end(this->mManagedAlgorithms));
+        this->mManagedSequences.erase(
+          std::remove_if(begin(this->mManagedSequences),
+                         end(this->mManagedSequences),
+                         [](std::weak_ptr<Sequence> t) { return t.expired(); }),
+          end(this->mManagedSequences));
+    }
+}
+
+void
+Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
+                      uint32_t physicalDeviceIndex,
+                      const std::vector<std::string>& desiredExtensions)
+{
+
+    KP_LOG_DEBUG("Kompute Manager creating Device");
+
+    if (this->mInstance == nullptr) {
+        throw std::runtime_error("Kompute Manager instance is null");
+    }
+
+    this->mFreeDevice = true;
+
+    // Getting an integer that says how many vuklan devices we have
+    std::vector<vk::PhysicalDevice> physicalDevices =
+      this->mInstance->enumeratePhysicalDevices();
+    uint32_t deviceCount = physicalDevices.size();
+
+    // This means there are no devices at all
+    if (deviceCount == 0) {
+        throw std::runtime_error("Failed to find GPUs with Vulkan support! "
+                                 "Maybe you haven't installed vulkan drivers?");
+    }
+
+    // This means that we're exceeding our device limit, for
+    // example if we have 2 devices, just physicalDeviceIndex
+    // 0 and 1 are acceptable. Hence, physicalDeviceIndex should
+    // always be less than deviceCount, else we raise an error
+    if (!(deviceCount > physicalDeviceIndex)) {
+        throw std::runtime_error("There is no such physical index or device, "
+                                 "please use your existing device");
+    }
+
+    vk::PhysicalDevice physicalDevice = physicalDevices[physicalDeviceIndex];
+
+    this->mPhysicalDevice =
+      std::make_shared<vk::PhysicalDevice>(physicalDevice);
+
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
+    vk::PhysicalDeviceProperties physicalDeviceProperties =
+      physicalDevice.getProperties();
+#endif
+
+    KP_LOG_INFO("Using physical device index {} found {}",
+                physicalDeviceIndex,
+                physicalDeviceProperties.deviceName);
+
+    if (familyQueueIndices.empty()) {
+        // Find compute queue
+        std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
+          physicalDevice.getQueueFamilyProperties();
+
+        uint32_t computeQueueFamilyIndex = 0;
+        bool computeQueueSupported = false;
+        for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
+            vk::QueueFamilyProperties queueFamilyProperties =
+              allQueueFamilyProperties[i];
+
+            if (queueFamilyProperties.queueFlags &
+                vk::QueueFlagBits::eCompute) {
+                computeQueueFamilyIndex = i;
+                computeQueueSupported = true;
+                break;
+            }
+        }
+
+        if (!computeQueueSupported) {
+            throw std::runtime_error("Compute queue is not supported");
+        }
+
+        this->mComputeQueueFamilyIndices.push_back(computeQueueFamilyIndex);
+    } else {
+        this->mComputeQueueFamilyIndices = familyQueueIndices;
+    }
+
+    std::unordered_map<uint32_t, uint32_t> familyQueueCounts;
+    std::unordered_map<uint32_t, std::vector<float>> familyQueuePriorities;
+    for (const auto& value : this->mComputeQueueFamilyIndices) {
+        familyQueueCounts[value]++;
+        familyQueuePriorities[value].push_back(1.0f);
+    }
+
+    std::unordered_map<uint32_t, uint32_t> familyQueueIndexCount;
+    std::vector<vk::DeviceQueueCreateInfo> deviceQueueCreateInfos;
+    for (const auto& familyQueueInfo : familyQueueCounts) {
+        // Setting the device count to 0
+        familyQueueIndexCount[familyQueueInfo.first] = 0;
+
+        // Creating the respective device queue
+        vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
+          vk::DeviceQueueCreateFlags(),
+          familyQueueInfo.first,
+          familyQueueInfo.second,
+          familyQueuePriorities[familyQueueInfo.first].data());
+        deviceQueueCreateInfos.push_back(deviceQueueCreateInfo);
+    }
+
+    KP_LOG_DEBUG("Kompute Manager desired extension layers {}",
+                 fmt::join(desiredExtensions, ", "));
+
+    std::vector<vk::ExtensionProperties> deviceExtensions =
+      this->mPhysicalDevice->enumerateDeviceExtensionProperties();
+
+    std::set<std::string> uniqueExtensionNames;
+    for (const vk::ExtensionProperties& ext : deviceExtensions) {
+        uniqueExtensionNames.insert(ext.extensionName);
+    }
+    KP_LOG_DEBUG("Kompute Manager available extensions {}",
+                 fmt::join(uniqueExtensionNames, ", "));
+    std::vector<const char*> validExtensions;
+    for (const std::string& ext : desiredExtensions) {
+        if (uniqueExtensionNames.count(ext) != 0) {
+            validExtensions.push_back(ext.c_str());
+        }
+    }
+    if (desiredExtensions.size() != validExtensions.size()) {
+        KP_LOG_ERROR("Kompute Manager not all extensions were added: {}",
+                     fmt::join(validExtensions, ", "));
+    }
+
+    vk::PhysicalDeviceFeatures features;
+    features.shaderInt16 = true;
+
+    vk::PhysicalDeviceVulkan11Features features11;
+    features11.uniformAndStorageBuffer16BitAccess = true;
+    features11.storageBuffer16BitAccess = true;
+    features11.pNext = nullptr;
+
+    vk::PhysicalDeviceVulkan12Features features12;
+    features12.storageBuffer8BitAccess = true;
+    features12.uniformAndStorageBuffer8BitAccess = true;
+    features12.shaderFloat16 = true;
+    features12.shaderInt8 = true;
+    features12.pNext = &features11;
+
+    vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(),
+                                          deviceQueueCreateInfos.size(),
+                                          deviceQueueCreateInfos.data(),
+                                          {},
+                                          {},
+                                          validExtensions.size(),
+                                          validExtensions.data(),
+                                          &features);
+
+    deviceCreateInfo.pNext = &features12;
+
+    this->mDevice = std::make_shared<vk::Device>();
+    vk::Result r = physicalDevice.createDevice(
+      &deviceCreateInfo, nullptr, this->mDevice.get());
+    if (r != vk::Result::eSuccess) {
+        KP_LOG_ERROR("Kompute Manager could not create device");
+    }
+
+    KP_LOG_DEBUG("Kompute Manager device created");
+
+    for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndices) {
+        std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();
+
+        this->mDevice->getQueue(familyQueueIndex,
+                                familyQueueIndexCount[familyQueueIndex],
+                                currQueue.get());
+
+        familyQueueIndexCount[familyQueueIndex]++;
+
+        this->mComputeQueues.push_back(currQueue);
+    }
+
+    KP_LOG_DEBUG("Kompute Manager compute queue obtained");
+}
+
+std::shared_ptr<Sequence>
+Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps)
+{
+    KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex);
+
+    std::shared_ptr<Sequence> sq{ new kp::Sequence(
+      this->mPhysicalDevice,
+      this->mDevice,
+      this->mComputeQueues[queueIndex],
+      this->mComputeQueueFamilyIndices[queueIndex],
+      totalTimestamps) };
+
+    if (this->mManageResources) {
+        this->mManagedSequences.push_back(sq);
+    }
+
+    return sq;
+}
+
+vk::PhysicalDeviceProperties
+Manager::getDeviceProperties() const
+{
+    return this->mPhysicalDevice->getProperties();
+}
+
+std::vector<vk::PhysicalDevice>
+Manager::listDevices() const
+{
+    return this->mInstance->enumeratePhysicalDevices();
+}
+
+std::shared_ptr<vk::Instance>
+Manager::getVkInstance() const
+{
+    return this->mInstance;
+}
+
+}
diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp
new file mode 100644
index 0000000000000..cad334f0c5d21
--- /dev/null
+++ b/kompute/src/OpAlgoDispatch.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpAlgoDispatch.hpp"
+
+namespace kp {
+
+OpAlgoDispatch::~OpAlgoDispatch()
+{
+    KP_LOG_DEBUG("Kompute OpAlgoDispatch destructor started");
+
+    if (this->mPushConstantsData) {
+        KP_LOG_DEBUG("Kompute freeing push constants data");
+        free(this->mPushConstantsData);
+    }
+}
+
+void
+OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpAlgoDispatch record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    for (const std::shared_ptr<Tensor>& tensor :
+         this->mAlgorithm->getTensors()) {
+        tensor->recordPrimaryBufferMemoryBarrier(
+          commandBuffer,
+          vk::AccessFlagBits::eTransferWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eTransfer,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    if (this->mPushConstantsSize) {
+        this->mAlgorithm->setPushConstants(
+          this->mPushConstantsData,
+          this->mPushConstantsSize,
+          this->mPushConstantsDataTypeMemorySize);
+    }
+
+    this->mAlgorithm->recordBindCore(commandBuffer);
+    this->mAlgorithm->recordBindPush(commandBuffer);
+    this->mAlgorithm->recordDispatch(commandBuffer);
+}
+
+void
+OpAlgoDispatch::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpAlgoDispatch preEval called");
+}
+
+void
+OpAlgoDispatch::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpAlgoDispatch postSubmit called");
+}
+
+}
diff --git a/kompute/src/OpBufferSyncDevice.cpp b/kompute/src/OpBufferSyncDevice.cpp
new file mode 100644
index 0000000000000..baaafda0fa386
--- /dev/null
+++ b/kompute/src/OpBufferSyncDevice.cpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpBufferSyncDevice.hpp"
+
+namespace kp {
+
+OpBufferSyncDevice::OpBufferSyncDevice(
+        vk::Buffer *primaryBuffer,
+        vk::Buffer *stagingBuffer,
+        vk::DeviceSize size)
+  : mPrimaryBuffer(primaryBuffer)
+  , mStagingBuffer(stagingBuffer)
+  , mSize(size)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice constructor with params");
+}
+
+OpBufferSyncDevice::~OpBufferSyncDevice()
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice destructor started");
+}
+
+void
+OpBufferSyncDevice::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice record called");
+    vk::BufferCopy copyRegion(0, 0, mSize);
+    commandBuffer.copyBuffer(*mStagingBuffer, *mPrimaryBuffer, copyRegion);
+}
+
+void
+OpBufferSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice preEval called");
+}
+
+void
+OpBufferSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice postEval called");
+}
+
+}
diff --git a/kompute/src/OpBufferSyncLocal.cpp b/kompute/src/OpBufferSyncLocal.cpp
new file mode 100644
index 0000000000000..63739a351e07c
--- /dev/null
+++ b/kompute/src/OpBufferSyncLocal.cpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpBufferSyncLocal.hpp"
+
+namespace kp {
+
+OpBufferSyncLocal::OpBufferSyncLocal(
+        vk::Buffer *primaryBuffer,
+        vk::Buffer *stagingBuffer,
+        vk::DeviceSize size)
+  : mPrimaryBuffer(primaryBuffer)
+  , mStagingBuffer(stagingBuffer)
+  , mSize(size)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal constructor with params");
+}
+
+OpBufferSyncLocal::~OpBufferSyncLocal()
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal destructor started");
+}
+
+void
+OpBufferSyncLocal::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal record called");
+    vk::BufferCopy copyRegion(0, 0, mSize);
+    commandBuffer.copyBuffer(*mPrimaryBuffer, *mStagingBuffer, copyRegion);
+}
+
+void
+OpBufferSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal preEval called");
+}
+
+void
+OpBufferSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal postEval called");
+}
+
+}
diff --git a/kompute/src/OpMemoryBarrier.cpp b/kompute/src/OpMemoryBarrier.cpp
new file mode 100644
index 0000000000000..89d44d85e6599
--- /dev/null
+++ b/kompute/src/OpMemoryBarrier.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpMemoryBarrier.hpp"
+
+namespace kp {
+
+OpMemoryBarrier::OpMemoryBarrier(
+  const std::vector<std::shared_ptr<Tensor>>& tensors,
+  const vk::AccessFlagBits& srcAccessMask,
+  const vk::AccessFlagBits& dstAccessMask,
+  const vk::PipelineStageFlagBits& srcStageMask,
+  const vk::PipelineStageFlagBits& dstStageMask,
+  bool barrierOnPrimary)
+  : mSrcAccessMask(srcAccessMask)
+  , mDstAccessMask(dstAccessMask)
+  , mSrcStageMask(srcStageMask)
+  , mDstStageMask(dstStageMask)
+  , mBarrierOnPrimary(barrierOnPrimary)
+  , mTensors(tensors)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor");
+}
+
+OpMemoryBarrier::~OpMemoryBarrier()
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started");
+}
+
+void
+OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    if (this->mBarrierOnPrimary) {
+        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+            tensor->recordPrimaryBufferMemoryBarrier(commandBuffer,
+                                                     this->mSrcAccessMask,
+                                                     this->mDstAccessMask,
+                                                     this->mSrcStageMask,
+                                                     this->mDstStageMask);
+        }
+    } else {
+        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+            tensor->recordStagingBufferMemoryBarrier(commandBuffer,
+                                                     this->mSrcAccessMask,
+                                                     this->mDstAccessMask,
+                                                     this->mSrcStageMask,
+                                                     this->mDstStageMask);
+        }
+    }
+}
+
+void
+OpMemoryBarrier::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called");
+}
+
+void
+OpMemoryBarrier::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier postSubmit called");
+}
+
+}
diff --git a/kompute/src/OpTensorCopy.cpp b/kompute/src/OpTensorCopy.cpp
new file mode 100644
index 0000000000000..e732cc4137c00
--- /dev/null
+++ b/kompute/src/OpTensorCopy.cpp
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpTensorCopy.hpp"
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+OpTensorCopy::OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    KP_LOG_DEBUG("Kompute OpTensorCopy constructor with params");
+
+    this->mTensors = tensors;
+
+    if (this->mTensors.size() < 2) {
+        throw std::runtime_error(
+          "Kompute OpTensorCopy called with less than 2 tensor");
+    }
+
+    kp::Tensor::TensorDataTypes dataType = this->mTensors[0]->dataType();
+    uint32_t size = this->mTensors[0]->size();
+    for (const std::shared_ptr<Tensor>& tensor : tensors) {
+        if (tensor->dataType() != dataType) {
+            throw std::runtime_error(fmt::format(
+              "Attempting to copy tensors of different types from {} to {}",
+              Tensor::toString(dataType),
+              Tensor::toString(tensor->dataType())));
+        }
+        if (tensor->size() != size) {
+            throw std::runtime_error(fmt::format(
+              "Attempting to copy tensors of different sizes from {} to {}",
+              size,
+              tensor->size()));
+        }
+    }
+}
+
+OpTensorCopy::~OpTensorCopy()
+{
+    KP_LOG_DEBUG("Kompute OpTensorCopy destructor started");
+}
+
+void
+OpTensorCopy::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorCopy record called");
+
+    // We iterate from the second tensor onwards and record a copy to all
+    for (size_t i = 1; i < this->mTensors.size(); i++) {
+        this->mTensors[i]->recordCopyFrom(commandBuffer, this->mTensors[0]);
+    }
+}
+
+void
+OpTensorCopy::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorCopy preEval called");
+}
+
+void
+OpTensorCopy::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorCopy postEval called");
+
+    // Do not copy on CPU side if source is storage tensor
+    if (this->mTensors[0]->tensorType() == kp::Tensor::TensorTypes::eStorage)
+    {
+        KP_LOG_DEBUG("Kompute OpTensorCopy not copying tensor source given it's of eStorage type");
+        return;
+    }
+    void* data = this->mTensors[0]->rawData();
+
+    // Copy the data from the first tensor into all the tensors
+    for (size_t i = 1; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == kp::Tensor::TensorTypes::eStorage) {
+            KP_LOG_DEBUG("Kompute OpTensorCopy not copying to tensor dest given it's of eStorage type");
+            continue;
+        }
+        this->mTensors[i]->setRawData(data);
+    }
+}
+
+}
diff --git a/kompute/src/OpTensorSyncDevice.cpp b/kompute/src/OpTensorSyncDevice.cpp
new file mode 100644
index 0000000000000..4cc6abf71d08a
--- /dev/null
+++ b/kompute/src/OpTensorSyncDevice.cpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpTensorSyncDevice.hpp"
+
+namespace kp {
+
+OpTensorSyncDevice::OpTensorSyncDevice(
+  const std::vector<std::shared_ptr<Tensor>>& tensors)
+  : mPrimaryBuffer(nullptr)
+  , mStagingBuffer(nullptr)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice constructor with params");
+
+    if (tensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpTensorSyncDevice called with less than 1 tensor");
+    }
+
+    this->mTensors = tensors;
+}
+
+OpTensorSyncDevice::~OpTensorSyncDevice()
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice destructor started");
+
+    this->mTensors.clear();
+}
+
+void
+OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice record called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer);
+        }
+    }
+}
+
+void
+OpTensorSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice preEval called");
+}
+
+void
+OpTensorSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice postEval called");
+}
+
+}
diff --git a/kompute/src/OpTensorSyncLocal.cpp b/kompute/src/OpTensorSyncLocal.cpp
new file mode 100644
index 0000000000000..1aa091b733c6b
--- /dev/null
+++ b/kompute/src/OpTensorSyncLocal.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpTensorSyncLocal.hpp"
+
+namespace kp {
+
+OpTensorSyncLocal::OpTensorSyncLocal(
+  const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal constructor with params");
+
+    if (tensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpTensorSyncLocal called with less than 1 tensor");
+    }
+
+    this->mTensors = tensors;
+}
+
+OpTensorSyncLocal::~OpTensorSyncLocal()
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal destructor started");
+}
+
+void
+OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal record called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+
+            this->mTensors[i]->recordPrimaryBufferMemoryBarrier(
+              commandBuffer,
+              vk::AccessFlagBits::eShaderWrite,
+              vk::AccessFlagBits::eTransferRead,
+              vk::PipelineStageFlagBits::eComputeShader,
+              vk::PipelineStageFlagBits::eTransfer);
+
+            this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer);
+
+            this->mTensors[i]->recordPrimaryBufferMemoryBarrier(
+              commandBuffer,
+              vk::AccessFlagBits::eTransferWrite,
+              vk::AccessFlagBits::eHostRead,
+              vk::PipelineStageFlagBits::eTransfer,
+              vk::PipelineStageFlagBits::eHost);
+        }
+    }
+}
+
+void
+OpTensorSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal preEval called");
+}
+
+void
+OpTensorSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal postEval called");
+
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
+}
+
+}
diff --git a/kompute/src/Sequence.cpp b/kompute/src/Sequence.cpp
new file mode 100644
index 0000000000000..3b5fb5fb59b4d
--- /dev/null
+++ b/kompute/src/Sequence.cpp
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Sequence.hpp"
+
+namespace kp {
+
+Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::Queue> computeQueue,
+                   uint32_t queueIndex,
+                   uint32_t totalTimestamps)
+{
+    KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue");
+
+    this->mPhysicalDevice = physicalDevice;
+    this->mDevice = device;
+    this->mComputeQueue = computeQueue;
+    this->mQueueIndex = queueIndex;
+
+    this->createCommandPool();
+    this->createCommandBuffer();
+    if (totalTimestamps > 0)
+        this->createTimestampQueryPool(totalTimestamps +
+                                       1); //+1 for the first one
+}
+
+Sequence::~Sequence()
+{
+    KP_LOG_DEBUG("Kompute Sequence Destructor started");
+
+    if (this->mDevice) {
+        this->destroy();
+    }
+}
+
+void
+Sequence::begin()
+{
+    KP_LOG_DEBUG("Kompute sequence called BEGIN");
+
+    if (this->isRecording()) {
+        KP_LOG_DEBUG("Kompute Sequence begin called when already recording");
+        return;
+    }
+
+    if (this->isRunning()) {
+        throw std::runtime_error(
+          "Kompute Sequence begin called when sequence still running");
+    }
+
+    KP_LOG_INFO("Kompute Sequence command now started recording");
+    this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
+    this->mRecording = true;
+
+    // latch the first timestamp before any commands are submitted
+    if (this->timestampQueryPool)
+        this->mCommandBuffer->writeTimestamp(
+          vk::PipelineStageFlagBits::eAllCommands,
+          *this->timestampQueryPool,
+          0);
+}
+
+void
+Sequence::end()
+{
+    KP_LOG_DEBUG("Kompute Sequence calling END");
+
+    if (this->isRunning()) {
+        throw std::runtime_error(
+          "Kompute Sequence begin called when sequence still running");
+    }
+
+    if (!this->isRecording()) {
+        KP_LOG_WARN("Kompute Sequence end called when not recording");
+        return;
+    } else {
+        KP_LOG_INFO("Kompute Sequence command recording END");
+        this->mCommandBuffer->end();
+        this->mRecording = false;
+    }
+}
+
+void
+Sequence::clear()
+{
+    KP_LOG_DEBUG("Kompute Sequence calling clear");
+    if (this->isRecording()) {
+        this->end();
+    }
+}
+
+std::shared_ptr<Sequence>
+Sequence::eval()
+{
+    KP_LOG_DEBUG("Kompute sequence EVAL BEGIN");
+
+    return this->evalAsync()->evalAwait();
+}
+
+std::shared_ptr<Sequence>
+Sequence::eval(std::shared_ptr<OpBase> op)
+{
+    this->clear();
+    return this->record(op)->eval();
+}
+
+std::shared_ptr<Sequence>
+Sequence::evalAsync()
+{
+    if (this->isRecording()) {
+        this->end();
+    }
+
+    if (this->mIsRunning) {
+        throw std::runtime_error(
+          "Kompute Sequence evalAsync called when an eval async was "
+          "called without successful wait");
+    }
+
+    this->mIsRunning = true;
+
+    for (size_t i = 0; i < this->mOperations.size(); i++) {
+        this->mOperations[i]->preEval(*this->mCommandBuffer);
+    }
+
+    vk::SubmitInfo submitInfo(
+      0, nullptr, nullptr, 1, this->mCommandBuffer.get());
+
+    this->mFence = this->mDevice->createFence(vk::FenceCreateInfo());
+
+    KP_LOG_DEBUG(
+      "Kompute sequence submitting command buffer into compute queue");
+
+    this->mComputeQueue->submit(1, &submitInfo, this->mFence);
+
+    return shared_from_this();
+}
+
+std::shared_ptr<Sequence>
+Sequence::evalAsync(std::shared_ptr<OpBase> op)
+{
+    this->clear();
+    this->record(op);
+    this->evalAsync();
+    return shared_from_this();
+}
+
+std::shared_ptr<Sequence>
+Sequence::evalAwait(uint64_t waitFor)
+{
+    if (!this->mIsRunning) {
+        KP_LOG_WARN("Kompute Sequence evalAwait called without existing eval");
+        return shared_from_this();
+    }
+
+    vk::Result result =
+      this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
+    this->mDevice->destroy(
+      this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+
+    this->mIsRunning = false;
+
+    if (result == vk::Result::eTimeout) {
+        KP_LOG_WARN("Kompute Sequence evalAwait reached timeout of {}",
+                    waitFor);
+        return shared_from_this();
+    }
+
+    for (size_t i = 0; i < this->mOperations.size(); i++) {
+        this->mOperations[i]->postEval(*this->mCommandBuffer);
+    }
+
+    return shared_from_this();
+}
+
+bool
+Sequence::isRunning() const
+{
+    return this->mIsRunning;
+}
+
+bool
+Sequence::isRecording() const
+{
+    return this->mRecording;
+}
+
+bool
+Sequence::isInit() const
+{
+    return this->mDevice && this->mCommandPool && this->mCommandBuffer &&
+           this->mComputeQueue;
+}
+
+void
+Sequence::rerecord()
+{
+    this->end();
+    std::vector<std::shared_ptr<OpBase>> ops = this->mOperations;
+    this->mOperations.clear();
+    for (const std::shared_ptr<kp::OpBase>& op : ops) {
+        this->record(op);
+    }
+}
+
+void
+Sequence::destroy()
+{
+    KP_LOG_DEBUG("Kompute Sequence destroy called");
+
+    if (!this->mDevice) {
+        KP_LOG_WARN("Kompute Sequence destroy called "
+                    "with null Device pointer");
+        return;
+    }
+
+    if (this->mFreeCommandBuffer) {
+        KP_LOG_INFO("Freeing CommandBuffer");
+        if (!this->mCommandBuffer) {
+            KP_LOG_WARN("Kompute Sequence destroy called with null "
+                        "CommandPool pointer");
+            return;
+        }
+        this->mDevice->freeCommandBuffers(
+          *this->mCommandPool, 1, this->mCommandBuffer.get());
+
+        this->mCommandBuffer = nullptr;
+        this->mFreeCommandBuffer = false;
+
+        KP_LOG_DEBUG("Kompute Sequence Freed CommandBuffer");
+    }
+
+    if (this->mFreeCommandPool) {
+        KP_LOG_INFO("Destroying CommandPool");
+        if (this->mCommandPool == nullptr) {
+            KP_LOG_WARN("Kompute Sequence destroy called with null "
+                        "CommandPool pointer");
+            return;
+        }
+        this->mDevice->destroy(
+          *this->mCommandPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+
+        this->mCommandPool = nullptr;
+        this->mFreeCommandPool = false;
+
+        KP_LOG_DEBUG("Kompute Sequence Destroyed CommandPool");
+    }
+
+    if (this->mOperations.size()) {
+        KP_LOG_INFO("Kompute Sequence clearing operations buffer");
+        this->mOperations.clear();
+    }
+
+    if (this->timestampQueryPool) {
+        KP_LOG_INFO("Destroying QueryPool");
+        this->mDevice->destroy(
+          *this->timestampQueryPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+
+        this->timestampQueryPool = nullptr;
+        KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool");
+    }
+
+    if (this->mDevice) {
+        this->mDevice = nullptr;
+    }
+    if (this->mPhysicalDevice) {
+        this->mPhysicalDevice = nullptr;
+    }
+    if (this->mComputeQueue) {
+        this->mComputeQueue = nullptr;
+    }
+}
+
+std::shared_ptr<Sequence>
+Sequence::record(std::shared_ptr<OpBase> op)
+{
+    KP_LOG_DEBUG("Kompute Sequence record function started");
+
+    this->begin();
+
+    KP_LOG_DEBUG(
+      "Kompute Sequence running record on OpBase derived class instance");
+
+    op->record(*this->mCommandBuffer);
+
+    this->mOperations.push_back(op);
+
+    if (this->timestampQueryPool)
+        this->mCommandBuffer->writeTimestamp(
+          vk::PipelineStageFlagBits::eAllCommands,
+          *this->timestampQueryPool,
+          this->mOperations.size());
+
+    return shared_from_this();
+}
+
+void
+Sequence::createCommandPool()
+{
+    KP_LOG_DEBUG("Kompute Sequence creating command pool");
+
+    if (!this->mDevice) {
+        throw std::runtime_error("Kompute Sequence device is null");
+    }
+
+    this->mFreeCommandPool = true;
+
+    vk::CommandPoolCreateInfo commandPoolInfo(vk::CommandPoolCreateFlags(),
+                                              this->mQueueIndex);
+    this->mCommandPool = std::make_shared<vk::CommandPool>();
+    this->mDevice->createCommandPool(
+      &commandPoolInfo, nullptr, this->mCommandPool.get());
+    KP_LOG_DEBUG("Kompute Sequence Command Pool Created");
+}
+
+void
+Sequence::createCommandBuffer()
+{
+    KP_LOG_DEBUG("Kompute Sequence creating command buffer");
+    if (!this->mDevice) {
+        throw std::runtime_error("Kompute Sequence device is null");
+    }
+    if (!this->mCommandPool) {
+        throw std::runtime_error("Kompute Sequence command pool is null");
+    }
+
+    this->mFreeCommandBuffer = true;
+
+    vk::CommandBufferAllocateInfo commandBufferAllocateInfo(
+      *this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1);
+
+    this->mCommandBuffer = std::make_shared<vk::CommandBuffer>();
+    this->mDevice->allocateCommandBuffers(&commandBufferAllocateInfo,
+                                          this->mCommandBuffer.get());
+    KP_LOG_DEBUG("Kompute Sequence Command Buffer Created");
+}
+
+void
+Sequence::createTimestampQueryPool(uint32_t totalTimestamps)
+{
+    KP_LOG_DEBUG("Kompute Sequence creating query pool");
+    if (!this->isInit()) {
+        throw std::runtime_error(
+          "createTimestampQueryPool() called on uninitialized Sequence");
+    }
+    if (!this->mPhysicalDevice) {
+        throw std::runtime_error("Kompute Sequence physical device is null");
+    }
+
+    vk::PhysicalDeviceProperties physicalDeviceProperties =
+      this->mPhysicalDevice->getProperties();
+
+    if (physicalDeviceProperties.limits.timestampComputeAndGraphics) {
+        vk::QueryPoolCreateInfo queryPoolInfo;
+        queryPoolInfo.setQueryCount(totalTimestamps);
+        queryPoolInfo.setQueryType(vk::QueryType::eTimestamp);
+        this->timestampQueryPool = std::make_shared<vk::QueryPool>(
+          this->mDevice->createQueryPool(queryPoolInfo));
+
+        KP_LOG_DEBUG("Query pool for timestamps created");
+    } else {
+        throw std::runtime_error("Device does not support timestamps");
+    }
+}
+
+std::vector<std::uint64_t>
+Sequence::getTimestamps()
+{
+    if (!this->timestampQueryPool)
+        throw std::runtime_error("Timestamp latching not enabled");
+
+    const auto n = this->mOperations.size() + 1;
+    std::vector<std::uint64_t> timestamps(n, 0);
+    this->mDevice->getQueryPoolResults(
+      *this->timestampQueryPool,
+      0,
+      n,
+      timestamps.size() * sizeof(std::uint64_t),
+      timestamps.data(),
+      sizeof(uint64_t),
+      vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
+
+    return timestamps;
+}
+
+}
diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp
new file mode 100644
index 0000000000000..9c343ff139181
--- /dev/null
+++ b/kompute/src/Tensor.cpp
@@ -0,0 +1,451 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+std::string
+Tensor::toString(Tensor::TensorDataTypes dt)
+{
+    switch (dt) {
+        case TensorDataTypes::eBool:
+            return "eBool";
+        case TensorDataTypes::eInt:
+            return "eInt";
+        case TensorDataTypes::eUnsignedInt:
+            return "eUnsignedInt";
+        case TensorDataTypes::eFloat:
+            return "eFloat";
+        case TensorDataTypes::eDouble:
+            return "eDouble";
+        default:
+            return "unknown";
+    }
+}
+
+std::string
+Tensor::toString(Tensor::TensorTypes dt)
+{
+    switch (dt) {
+        case TensorTypes::eDevice:
+            return "eDevice";
+        case TensorTypes::eHost:
+            return "eHost";
+        case TensorTypes::eStorage:
+            return "eStorage";
+        default:
+            return "unknown";
+    }
+}
+
+Tensor::Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+               std::shared_ptr<vk::Device> device,
+               void* data,
+               uint32_t elementTotalCount,
+               uint32_t elementMemorySize,
+               const TensorDataTypes& dataType,
+               vk::DeviceMemory *primaryMemory,
+               vk::Buffer *primaryBuffer,
+               vk::DeviceMemory *stagingMemory,
+               vk::Buffer *stagingBuffer,
+               vk::DeviceSize offset,
+               const TensorTypes& tensorType)
+{
+    KP_LOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
+                 elementTotalCount,
+                 Tensor::toString(tensorType));
+
+    this->mPhysicalDevice = physicalDevice;
+    this->mDevice = device;
+    this->mDataType = dataType;
+    this->mTensorType = tensorType;
+
+    this->rebuild(data, elementTotalCount, elementMemorySize, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
+}
+
+Tensor::~Tensor()
+{
+    KP_LOG_DEBUG("Kompute Tensor destructor started. Type: {}",
+                 Tensor::toString(this->tensorType()));
+
+    if (this->mDevice) {
+        this->destroy();
+    }
+
+    KP_LOG_DEBUG("Kompute Tensor destructor success");
+}
+
+void
+Tensor::rebuild(void* /*data*/,
+                uint32_t elementTotalCount,
+                uint64_t memorySize,
+                vk::DeviceMemory *primaryMemory,
+                vk::Buffer *primaryBuffer,
+                vk::DeviceMemory *stagingMemory,
+                vk::Buffer *stagingBuffer,
+                vk::DeviceSize offset)
+{
+    KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", elementTotalCount);
+
+    this->mSize = elementTotalCount;
+    this->mMemorySize = memorySize;
+    this->mOffset = offset;
+
+    if (this->mPrimaryBuffer || this->mPrimaryMemory) {
+        KP_LOG_DEBUG(
+          "Kompute Tensor destroying existing resources before rebuild");
+        this->destroy();
+    }
+
+    this->setGPUResources(primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
+}
+
+Tensor::TensorTypes
+Tensor::tensorType()
+{
+    return this->mTensorType;
+}
+
+bool
+Tensor::isInit()
+{
+    return this->mDevice && this->mPrimaryBuffer && this->mPrimaryMemory &&
+           this->mRawData;
+}
+
+uint32_t
+Tensor::size()
+{
+    return this->mSize;
+}
+
+uint64_t
+Tensor::memorySize()
+{
+    return this->mMemorySize;
+}
+
+kp::Tensor::TensorDataTypes
+Tensor::dataType()
+{
+    return this->mDataType;
+}
+
+void*
+Tensor::rawData()
+{
+    return this->mRawData;
+}
+
+void
+Tensor::setRawData(const void* data)
+{
+    memcpy(this->mRawData, data, this->memorySize());
+}
+
+void
+Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer,
+                       std::shared_ptr<Tensor> copyFromTensor)
+{
+
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);
+
+    KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize);
+
+    this->recordCopyBuffer(commandBuffer,
+                           copyFromTensor->mPrimaryBuffer,
+                           this->mPrimaryBuffer,
+                           bufferSize,
+                           copyRegion);
+}
+
+void
+Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer)
+{
+    if (!this->mStagingBuffer)
+        return;
+
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);
+
+    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
+
+    this->recordCopyBuffer(commandBuffer,
+                           this->mStagingBuffer,
+                           this->mPrimaryBuffer,
+                           bufferSize,
+                           copyRegion);
+}
+
+void
+Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer)
+{
+    if (!this->mStagingBuffer)
+        return;
+
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);
+
+    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
+
+    this->recordCopyBuffer(commandBuffer,
+                           this->mPrimaryBuffer,
+                           this->mStagingBuffer,
+                           bufferSize,
+                           copyRegion);
+}
+
+void
+Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
+                         vk::Buffer *bufferFrom,
+                         vk::Buffer *bufferTo,
+                         vk::DeviceSize /*bufferSize*/,
+                         vk::BufferCopy copyRegion)
+{
+
+    commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
+}
+
+void
+Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                         vk::AccessFlagBits srcAccessMask,
+                                         vk::AccessFlagBits dstAccessMask,
+                                         vk::PipelineStageFlagBits srcStageMask,
+                                         vk::PipelineStageFlagBits dstStageMask)
+{
+    KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier");
+
+    this->recordBufferMemoryBarrier(commandBuffer,
+                                    *this->mPrimaryBuffer,
+                                    srcAccessMask,
+                                    dstAccessMask,
+                                    srcStageMask,
+                                    dstStageMask);
+}
+
+void
+Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                         vk::AccessFlagBits srcAccessMask,
+                                         vk::AccessFlagBits dstAccessMask,
+                                         vk::PipelineStageFlagBits srcStageMask,
+                                         vk::PipelineStageFlagBits dstStageMask)
+{
+    if (!this->mStagingBuffer)
+        return;
+
+    KP_LOG_DEBUG("Kompute Tensor recording STAGING buffer memory barrier");
+
+    this->recordBufferMemoryBarrier(commandBuffer,
+                                    *this->mStagingBuffer,
+                                    srcAccessMask,
+                                    dstAccessMask,
+                                    srcStageMask,
+                                    dstStageMask);
+}
+
+void
+Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                  const vk::Buffer& buffer,
+                                  vk::AccessFlagBits srcAccessMask,
+                                  vk::AccessFlagBits dstAccessMask,
+                                  vk::PipelineStageFlagBits srcStageMask,
+                                  vk::PipelineStageFlagBits dstStageMask)
+{
+    KP_LOG_DEBUG("Kompute Tensor recording buffer memory barrier");
+
+    vk::DeviceSize bufferSize = this->memorySize();
+
+    vk::BufferMemoryBarrier bufferMemoryBarrier;
+    bufferMemoryBarrier.buffer = buffer;
+    bufferMemoryBarrier.size = bufferSize;
+    bufferMemoryBarrier.srcAccessMask = srcAccessMask;
+    bufferMemoryBarrier.dstAccessMask = dstAccessMask;
+    bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+
+    commandBuffer.pipelineBarrier(srcStageMask,
+                                  dstStageMask,
+                                  vk::DependencyFlags(),
+                                  nullptr,
+                                  bufferMemoryBarrier,
+                                  nullptr);
+}
+
+vk::DescriptorBufferInfo
+Tensor::constructDescriptorBufferInfo()
+{
+    KP_LOG_DEBUG("Kompute Tensor construct descriptor buffer info size {}",
+                 this->memorySize());
+    vk::DeviceSize bufferSize = this->memorySize();
+    return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
+                                    mOffset, // offset
+                                    bufferSize);
+}
+
+vk::BufferUsageFlags
+Tensor::getPrimaryBufferUsageFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::BufferUsageFlagBits::eStorageBuffer |
+                   vk::BufferUsageFlagBits::eTransferSrc |
+                   vk::BufferUsageFlagBits::eTransferDst;
+            break;
+        case TensorTypes::eHost:
+            return vk::BufferUsageFlagBits::eStorageBuffer |
+                   vk::BufferUsageFlagBits::eTransferSrc |
+                   vk::BufferUsageFlagBits::eTransferDst;
+            break;
+        case TensorTypes::eStorage:
+            return vk::BufferUsageFlagBits::eStorageBuffer;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+vk::MemoryPropertyFlags
+Tensor::getPrimaryMemoryPropertyFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::MemoryPropertyFlagBits::eDeviceLocal;
+            break;
+        case TensorTypes::eHost:
+            return vk::MemoryPropertyFlagBits::eHostVisible |
+                   vk::MemoryPropertyFlagBits::eHostCoherent;
+            break;
+        case TensorTypes::eStorage:
+            return vk::MemoryPropertyFlagBits::eDeviceLocal;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+vk::BufferUsageFlags
+Tensor::getStagingBufferUsageFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::BufferUsageFlagBits::eTransferSrc |
+                   vk::BufferUsageFlagBits::eTransferDst;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+vk::MemoryPropertyFlags
+Tensor::getStagingMemoryPropertyFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::MemoryPropertyFlagBits::eHostVisible |
+                   vk::MemoryPropertyFlagBits::eHostCoherent;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+void
+Tensor::setGPUResources(vk::DeviceMemory *primaryMemory,
+                        vk::Buffer *primaryBuffer,
+                        vk::DeviceMemory *stagingMemory,
+                        vk::Buffer *stagingBuffer,
+                        vk::DeviceSize /*offset*/)
+{
+    KP_LOG_DEBUG("Kompute Tensor creating buffer");
+
+    if (!this->mPhysicalDevice) {
+        throw std::runtime_error("Kompute Tensor phyisical device is null");
+    }
+    if (!this->mDevice) {
+        throw std::runtime_error("Kompute Tensor device is null");
+    }
+
+    KP_LOG_DEBUG("Kompute Tensor creating primary buffer and memory");
+
+    this->mPrimaryBuffer = primaryBuffer;
+    this->mPrimaryMemory = primaryMemory;
+
+    if (this->mTensorType == TensorTypes::eDevice) {
+        KP_LOG_DEBUG("Kompute Tensor creating staging buffer and memory");
+
+        this->mStagingBuffer = stagingBuffer;
+        this->mStagingMemory = stagingMemory;
+    }
+
+    KP_LOG_DEBUG("Kompute Tensor buffer & memory creation successful");
+}
+
+void
+Tensor::destroy()
+{
+    KP_LOG_DEBUG("Kompute Tensor started destroy()");
+
+    // Setting raw data to null regardless whether device is available to
+    // invalidate Tensor
+    this->mRawData = nullptr;
+    this->mSize = 0;
+    this->mMemorySize = 0;
+
+    if (!this->mDevice) {
+        KP_LOG_WARN(
+          "Kompute Tensor destructor reached with null Device pointer");
+        return;
+    }
+
+    if (this->mDevice) {
+        this->mDevice = nullptr;
+    }
+
+    KP_LOG_DEBUG("Kompute Tensor successful destroy()");
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<bool>::dataType()
+{
+    return Tensor::TensorDataTypes::eBool;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<int32_t>::dataType()
+{
+    return Tensor::TensorDataTypes::eInt;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<uint32_t>::dataType()
+{
+    return Tensor::TensorDataTypes::eUnsignedInt;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<float>::dataType()
+{
+    return Tensor::TensorDataTypes::eFloat;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<double>::dataType()
+{
+    return Tensor::TensorDataTypes::eDouble;
+}
+
+}
diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt
new file mode 100644
index 0000000000000..05e1ed5e15532
--- /dev/null
+++ b/kompute/src/include/CMakeLists.txt
@@ -0,0 +1,46 @@
+cmake_minimum_required(VERSION 3.20)
+
+# ####################################################
+# Kompute
+# ####################################################
+target_include_directories(kompute PUBLIC $<INSTALL_INTERFACE:include>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+
+target_sources(kompute PRIVATE
+
+    # Header files (useful in IDEs)
+    kompute/Algorithm.hpp
+    kompute/Core.hpp
+    kompute/Kompute.hpp
+    kompute/Manager.hpp
+    kompute/Sequence.hpp
+    kompute/Tensor.hpp
+
+    kompute/operations/OpAlgoDispatch.hpp
+    kompute/operations/OpBase.hpp
+    kompute/operations/OpMemoryBarrier.hpp
+    kompute/operations/OpMult.hpp
+    kompute/operations/OpTensorCopy.hpp
+    kompute/operations/OpTensorSyncDevice.hpp
+    kompute/operations/OpTensorSyncLocal.hpp
+    kompute/operations/OpBufferSyncDevice.hpp
+    kompute/operations/OpBufferSyncLocal.hpp
+
+    kompute/logger/Logger.hpp
+)
+
+install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+# ####################################################
+# Logger
+# ####################################################
+target_include_directories(kp_logger PUBLIC $<INSTALL_INTERFACE:include>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+
+target_sources(kp_logger PRIVATE
+
+    # Header files (useful in IDEs)
+    kompute/logger/Logger.hpp
+)
+
+install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
\ No newline at end of file
diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp
new file mode 100644
index 0000000000000..90fe48fef8637
--- /dev/null
+++ b/kompute/src/include/kompute/Algorithm.hpp
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "fmt/format.h"
+#include "kompute/Tensor.hpp"
+#include "logger/Logger.hpp"
+
+namespace kp {
+
+/**
+    Abstraction for compute shaders that are run on top of tensors grouped via
+   ParameterGroups (which group descriptorsets)
+*/
+class Algorithm
+{
+  public:
+    /**
+     *  Main constructor for algorithm with configuration parameters to create
+     *  the underlying resources.
+     *
+     *  @param device The Vulkan device to use for creating resources
+     *  @param tensors (optional) The tensors to use to create the descriptor
+     * resources
+     *  @param spirv (optional) The spirv code to use to create the algorithm
+     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch
+     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
+     *  @param specializationConstants (optional) The templatable param is to be
+     * used to initialize the specialization constants which cannot be changed
+     * once set.
+     *  @param pushConstants (optional) This templatable param is to be used
+     * when initializing the pipeline, which set the size of the push constants
+     * - these can be modified but all new values must have the same data type
+     * and length as otherwise it will result in errors.
+     */
+    template<typename S = float, typename P = float>
+    Algorithm(std::shared_ptr<vk::Device> device,
+              vk::DescriptorPool *pool,
+              const std::vector<std::shared_ptr<Tensor>>& tensors = {},
+              const std::vector<uint32_t>& spirv = {},
+              const Workgroup& workgroup = {},
+              const std::vector<S>& specializationConstants = {},
+              const std::vector<P>& pushConstants = {})
+    {
+        KP_LOG_DEBUG("Kompute Algorithm Constructor with device");
+
+        this->mDevice = device;
+        this->mDescriptorPool = pool;
+
+        if (tensors.size() && spirv.size()) {
+            KP_LOG_INFO(
+              "Kompute Algorithm initialising with tensor size: {} and "
+              "spirv size: {}",
+              tensors.size(),
+              spirv.size());
+            this->rebuild(tensors,
+                          spirv,
+                          workgroup,
+                          specializationConstants,
+                          pushConstants);
+        } else {
+            KP_LOG_INFO(
+              "Kompute Algorithm constructor with empty tensors and or "
+              "spirv so not rebuilding vulkan components");
+        }
+    }
+
+    /**
+     *  Rebuild function to reconstruct algorithm with configuration parameters
+     * to create the underlying resources.
+     *
+     *  @param tensors The tensors to use to create the descriptor resources
+     *  @param spirv The spirv code to use to create the algorithm
+     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch
+     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
+     *  @param specializationConstants (optional) The std::vector<float> to use
+     * to initialize the specialization constants which cannot be changed once
+     * set.
+     *  @param pushConstants (optional) The std::vector<float> to use when
+     * initializing the pipeline, which set the size of the push constants -
+     * these can be modified but all new values must have the same vector size
+     * as this initial value.
+     */
+    template<typename S = float, typename P = float>
+    void rebuild(const std::vector<std::shared_ptr<Tensor>>& tensors,
+                 const std::vector<uint32_t>& spirv,
+                 const Workgroup& workgroup = {},
+                 const std::vector<S>& specializationConstants = {},
+                 const std::vector<P>& pushConstants = {})
+    {
+        KP_LOG_DEBUG("Kompute Algorithm rebuild started");
+
+        this->mTensors = tensors;
+        this->mSpirv = spirv;
+
+        if (specializationConstants.size()) {
+            if (this->mSpecializationConstantsData) {
+                free(this->mSpecializationConstantsData);
+            }
+            uint32_t memorySize =
+              sizeof(decltype(specializationConstants.back()));
+            uint32_t size = specializationConstants.size();
+            uint32_t totalSize = size * memorySize;
+            this->mSpecializationConstantsData = malloc(totalSize);
+            memcpy(this->mSpecializationConstantsData,
+                   specializationConstants.data(),
+                   totalSize);
+            this->mSpecializationConstantsDataTypeMemorySize = memorySize;
+            this->mSpecializationConstantsSize = size;
+        }
+
+        if (pushConstants.size()) {
+            if (this->mPushConstantsData) {
+                free(this->mPushConstantsData);
+            }
+            uint32_t memorySize = sizeof(decltype(pushConstants.back()));
+            uint32_t size = pushConstants.size();
+            uint32_t totalSize = size * memorySize;
+            this->mPushConstantsData = malloc(totalSize);
+            memcpy(this->mPushConstantsData, pushConstants.data(), totalSize);
+            this->mPushConstantsDataTypeMemorySize = memorySize;
+            this->mPushConstantsSize = size;
+        }
+
+        this->setWorkgroup(
+          workgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);
+
+        // Descriptor pool is created first so if available then destroy all
+        // before rebuild
+        if (this->isInit()) {
+            this->destroy();
+        }
+
+        this->createParameters();
+        this->createShaderModule();
+        this->createPipeline();
+    }
+
+    /**
+     * Destructor for Algorithm which is responsible for freeing and desroying
+     * respective pipelines and owned parameter groups.
+     */
+    ~Algorithm();
+
+    /**
+     * Records the dispatch function with the provided template parameters or
+     * alternatively using the size of the tensor by default.
+     *
+     * @param commandBuffer Command buffer to record the algorithm resources to
+     */
+    void recordDispatch(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * Records command that binds the "core" algorithm components which consist
+     * of binding the pipeline and binding the descriptorsets.
+     *
+     * @param commandBuffer Command buffer to record the algorithm resources to
+     */
+    void recordBindCore(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * Records command that binds the push constants to the command buffer
+     * provided
+     * - it is required that the pushConstants provided are of the same size as
+     * the ones provided during initialization.
+     *
+     * @param commandBuffer Command buffer to record the algorithm resources to
+     */
+    void recordBindPush(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * function that checks all the gpu resource components to verify if these
+     * have been created and returns true if all are valid.
+     *
+     * @returns returns true if the algorithm is currently initialized.
+     */
+    bool isInit();
+
+    /**
+     * Sets the work group to use in the recordDispatch
+     *
+     * @param workgroup The kp::Workgroup value to use to update the algorithm.
+     * It must have a value greater than 1 on the x value (index 1) otherwise it
+     * will be initialized on the size of the first tensor (ie.
+     * this->mTensor[0]->size())
+     */
+    void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1);
+    /**
+     * Sets the push constants to the new value provided to use in the next
+     * bindPush()
+     *
+     * @param pushConstants The templatable vector is to be used to set the push
+     * constants to use in the next bindPush(...) calls. The constants provided
+     * must be of the same size as the ones created during initialization.
+     */
+    template<typename T>
+    void setPushConstants(const std::vector<T>& pushConstants)
+    {
+        uint32_t memorySize = sizeof(decltype(pushConstants.back()));
+        uint32_t size = pushConstants.size();
+        this->setPushConstants(pushConstants.data(), size, memorySize);
+    }
+
+    void updateDescriptors(vk::DescriptorPool *pool)
+    {
+        this->mDescriptorPool = pool;
+        this->setWorkgroup(
+          this->mWorkgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);
+
+        this->updateParameters(); // TODO: See if we can reduce this
+    }
+
+    /**
+     * Sets the push constants to the new value provided to use in the next
+     * bindPush() with the raw memory block location and memory size to be used.
+     *
+     * @param data The raw data point to copy the data from, without modifying
+     * the pointer.
+     * @param size The number of data elements provided in the data
+     * @param memorySize The memory size of each of the data elements in bytes.
+     */
+    void setPushConstants(const void* data, uint32_t size, uint32_t memorySize)
+    {
+
+        uint32_t totalSize = memorySize * size;
+        uint32_t previousTotalSize =
+          this->mPushConstantsDataTypeMemorySize * this->mPushConstantsSize;
+
+        if (totalSize != previousTotalSize) {
+            throw std::runtime_error(fmt::format(
+              "Kompute Algorithm push "
+              "constant total memory size provided is {} but expected {} bytes",
+              totalSize,
+              previousTotalSize));
+        }
+        if (this->mPushConstantsData) {
+            free(this->mPushConstantsData);
+        }
+
+        this->mPushConstantsData = malloc(totalSize);
+        memcpy(this->mPushConstantsData, data, totalSize);
+        this->mPushConstantsDataTypeMemorySize = memorySize;
+        this->mPushConstantsSize = size;
+    }
+
+    /**
+     * Gets the current workgroup from the algorithm.
+     *
+     * @param The kp::Constant to use to set the push constants to use in the
+     * next bindPush(...) calls. The constants provided must be of the same size
+     * as the ones created during initialization.
+     */
+    const Workgroup& getWorkgroup();
+    /**
+     * Gets the specialization constants of the current algorithm.
+     *
+     * @returns The std::vector<float> currently set for specialization
+     * constants
+     */
+    template<typename T>
+    const std::vector<T> getSpecializationConstants()
+    {
+        return { (T*)this->mSpecializationConstantsData,
+                 ((T*)this->mSpecializationConstantsData) +
+                   this->mSpecializationConstantsSize };
+    }
+    /**
+     * Gets the specialization constants of the current algorithm.
+     *
+     * @returns The std::vector<float> currently set for push constants
+     */
+    template<typename T>
+    const std::vector<T> getPushConstants()
+    {
+        return { (T*)this->mPushConstantsData,
+                 ((T*)this->mPushConstantsData) + this->mPushConstantsSize };
+    }
+    /**
+     * Gets the current tensors that are used in the algorithm.
+     *
+     * @returns The list of tensors used in the algorithm.
+     */
+    const std::vector<std::shared_ptr<Tensor>>& getTensors();
+    void setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    void destroy();
+
+  private:
+    // -------------- NEVER OWNED RESOURCES
+    std::shared_ptr<vk::Device> mDevice;
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+
+    // -------------- OPTIONALLY OWNED RESOURCES
+    std::shared_ptr<vk::DescriptorSetLayout> mDescriptorSetLayout;
+    bool mFreeDescriptorSetLayout = false;
+    vk::DescriptorPool *mDescriptorPool = nullptr;
+    std::shared_ptr<vk::DescriptorSet> mDescriptorSet;
+    bool mFreeDescriptorSet = false;
+    std::shared_ptr<vk::ShaderModule> mShaderModule;
+    bool mFreeShaderModule = false;
+    std::shared_ptr<vk::PipelineLayout> mPipelineLayout;
+    bool mFreePipelineLayout = false;
+    std::shared_ptr<vk::PipelineCache> mPipelineCache;
+    bool mFreePipelineCache = false;
+    std::shared_ptr<vk::Pipeline> mPipeline;
+    bool mFreePipeline = false;
+
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<uint32_t> mSpirv;
+    void* mSpecializationConstantsData = nullptr;
+    uint32_t mSpecializationConstantsDataTypeMemorySize = 0;
+    uint32_t mSpecializationConstantsSize = 0;
+    void* mPushConstantsData = nullptr;
+    uint32_t mPushConstantsDataTypeMemorySize = 0;
+    uint32_t mPushConstantsSize = 0;
+    Workgroup mWorkgroup;
+
+    // Create util functions
+    void createShaderModule();
+    void createPipeline();
+
+    // Parameters
+    void freeParameters();
+    void createParameters();
+    void updateParameters();
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/Core.hpp b/kompute/src/include/kompute/Core.hpp
new file mode 100644
index 0000000000000..99222cbde9f8d
--- /dev/null
+++ b/kompute/src/include/kompute/Core.hpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include <vulkan/vulkan.hpp>
+
+// Typedefs to simplify interaction with core types
+namespace kp {
+typedef std::array<uint32_t, 3> Workgroup;
+typedef std::vector<float> Constants;
+}
+
+// Must be after vulkan is included
+#ifndef KOMPUTE_VK_API_VERSION
+#ifndef KOMPUTE_VK_API_MAJOR_VERSION
+#define KOMPUTE_VK_API_MAJOR_VERSION 1
+#endif // KOMPUTE_VK_API_MAJOR_VERSION
+#ifndef KOMPUTE_VK_API_MINOR_VERSION
+#define KOMPUTE_VK_API_MINOR_VERSION 2
+#endif // KOMPUTE_VK_API_MINOR_VERSION
+#define KOMPUTE_VK_API_VERSION                                                 \
+    VK_MAKE_VERSION(                                                           \
+      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
+#endif // KOMPUTE_VK_API_VERSION
+
+#if defined(KOMPUTE_BUILD_PYTHON)
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+// from python/src/main.cpp
+extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
+#endif
diff --git a/kompute/src/include/kompute/Kompute.hpp b/kompute/src/include/kompute/Kompute.hpp
new file mode 100644
index 0000000000000..f59a63b50ba44
--- /dev/null
+++ b/kompute/src/include/kompute/Kompute.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "Algorithm.hpp"
+#include "Core.hpp"
+#include "Manager.hpp"
+#include "Sequence.hpp"
+#include "Tensor.hpp"
+
+#include "operations/OpAlgoDispatch.hpp"
+#include "operations/OpBase.hpp"
+#include "operations/OpMemoryBarrier.hpp"
+#include "operations/OpMult.hpp"
+#include "operations/OpTensorCopy.hpp"
+#include "operations/OpTensorSyncDevice.hpp"
+#include "operations/OpTensorSyncLocal.hpp"
+#include "operations/OpBufferSyncDevice.hpp"
+#include "operations/OpBufferSyncLocal.hpp"
+
+// Will be build by CMake and placed inside the build directory
+#include "ShaderLogisticRegression.hpp"
+#include "ShaderOpMult.hpp"
diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp
new file mode 100644
index 0000000000000..8fda58f84b909
--- /dev/null
+++ b/kompute/src/include/kompute/Manager.hpp
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include <set>
+#include <unordered_map>
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Sequence.hpp"
+#include "logger/Logger.hpp"
+
+#define KP_DEFAULT_SESSION "DEFAULT"
+
+namespace kp {
+
+/**
+    Base orchestrator which creates and manages device and child components
+*/
+class Manager
+{
+  public:
+    /**
+        Base constructor.
+    */
+    Manager();
+
+    /**
+     * Manager destructor which would ensure all owned resources are destroyed
+     * unless explicitly stated that resources should not be destroyed or freed.
+     */
+    ~Manager();
+
+    bool hasDevice() const {
+        return this->mDevice.get();
+    }
+
+    /**
+     * Initialize a device.
+     *
+     * @param physicalDeviceIndex The index of the physical device to use
+     * @param familyQueueIndices (Optional) List of queue indices to add for
+     * explicit allocation
+     * @param desiredExtensions The desired extensions to load from
+     * physicalDevice
+     */
+    void initializeDevice(uint32_t physicalDeviceIndex,
+            const std::vector<uint32_t>& familyQueueIndices = {},
+            const std::vector<std::string>& desiredExtensions = {});
+
+    /**
+     * Create a managed sequence that will be destroyed by this manager
+     * if it hasn't been destroyed by its reference count going to zero.
+     *
+     * @param queueIndex The queue to use from the available queues
+     * @param nrOfTimestamps The maximum number of timestamps to allocate.
+     * If zero (default), disables latching of timestamps.
+     * @returns Shared pointer with initialised sequence
+     */
+    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0,
+                                       uint32_t totalTimestamps = 0);
+
+    /**
+     * Create a managed tensor that will be destroyed by this manager
+     * if it hasn't been destroyed by its reference count going to zero.
+     *
+     * @param data The data to initialize the tensor with
+     * @param tensorType The type of tensor to initialize
+     * @returns Shared pointer with initialised tensor
+     */
+    template<typename T>
+    std::shared_ptr<TensorT<T>> tensorT(
+      const std::vector<T>& data,
+       vk::DeviceMemory *primaryMemory,
+       vk::Buffer *primaryBuffer,
+       vk::DeviceMemory *stagingMemory,
+       vk::Buffer *stagingBuffer,
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        KP_LOG_DEBUG("Kompute Manager tensor creation triggered");
+
+        std::shared_ptr<TensorT<T>> tensor{ new kp::TensorT<T>(
+          this->mPhysicalDevice, this->mDevice, data, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, tensorType) };
+
+        if (this->mManageResources) {
+            this->mManagedTensors.push_back(tensor);
+        }
+
+        return tensor;
+    }
+
+    std::shared_ptr<Tensor> tensor(
+      void* data,
+      uint32_t elementTotalCount,
+      uint64_t memorySize,
+      const Tensor::TensorDataTypes& dataType,
+      vk::DeviceMemory *primaryMemory,
+      vk::Buffer *primaryBuffer,
+      vk::DeviceMemory *stagingMemory,
+      vk::Buffer *stagingBuffer,
+      vk::DeviceSize offset,
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        std::shared_ptr<Tensor> tensor{ new kp::Tensor(this->mPhysicalDevice,
+                                                       this->mDevice,
+                                                       data,
+                                                       elementTotalCount,
+                                                       memorySize,
+                                                       dataType,
+                                                       primaryMemory,
+                                                       primaryBuffer,
+                                                       stagingMemory,
+                                                       stagingBuffer,
+                                                       offset,
+                                                       tensorType) };
+
+        if (this->mManageResources) {
+            this->mManagedTensors.push_back(tensor);
+        }
+
+        return tensor;
+    }
+
+    /**
+     * Default non-template function that can be used to create algorithm
+     * objects which provides default types to the push and spec constants as
+     * floats.
+     *
+     * @param tensors (optional) The tensors to initialise the algorithm with
+     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
+     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
+     * defaults to (tensor[0].size(), 1, 1)
+     * @param specializationConstants (optional) float vector to use for
+     * specialization constants, and defaults to an empty constant
+     * @param pushConstants (optional) float vector to use for push constants,
+     * and defaults to an empty constant
+     * @returns Shared pointer with initialised algorithm
+     */
+    std::shared_ptr<Algorithm> algorithm(
+      vk::DescriptorPool *pool,
+      const std::vector<std::shared_ptr<Tensor>>& tensors = {},
+      const std::vector<uint32_t>& spirv = {},
+      const Workgroup& workgroup = {},
+      const std::vector<float>& specializationConstants = {},
+      const std::vector<float>& pushConstants = {})
+    {
+        return this->algorithm<>(
+          pool, tensors, spirv, workgroup, specializationConstants, pushConstants);
+    }
+
+    /**
+     * Create a managed algorithm that will be destroyed by this manager
+     * if it hasn't been destroyed by its reference count going to zero.
+     *
+     * @param tensors (optional) The tensors to initialise the algorithm with
+     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
+     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
+     * defaults to (tensor[0].size(), 1, 1)
+     * @param specializationConstants (optional) templatable vector parameter to
+     * use for specialization constants, and defaults to an empty constant
+     * @param pushConstants (optional) templatable vector parameter to use for
+     * push constants, and defaults to an empty constant
+     * @returns Shared pointer with initialised algorithm
+     */
+    template<typename S = float, typename P = float>
+    std::shared_ptr<Algorithm> algorithm(
+      vk::DescriptorPool *pool,
+      const std::vector<std::shared_ptr<Tensor>>& tensors,
+      const std::vector<uint32_t>& spirv,
+      const Workgroup& workgroup,
+      const std::vector<S>& specializationConstants,
+      const std::vector<P>& pushConstants)
+    {
+
+        KP_LOG_DEBUG("Kompute Manager algorithm creation triggered");
+
+        std::shared_ptr<Algorithm> algorithm{ new kp::Algorithm(
+          this->mDevice,
+          pool,
+          tensors,
+          spirv,
+          workgroup,
+          specializationConstants,
+          pushConstants) };
+
+        if (this->mManageResources) {
+            this->mManagedAlgorithms.push_back(algorithm);
+        }
+
+        return algorithm;
+    }
+
+    /**
+     * Destroy the GPU resources and all managed resources by manager.
+     **/
+    void destroy();
+    /**
+     * Run a pseudo-garbage collection to release all the managed resources
+     * that have been already freed due to these reaching to zero ref count.
+     **/
+    void clear();
+
+    /**
+     * Information about the current device.
+     *
+     * @return vk::PhysicalDeviceProperties containing information about the
+     *device
+     **/
+    vk::PhysicalDeviceProperties getDeviceProperties() const;
+
+    /**
+     * List the devices available in the current vulkan instance.
+     *
+     * @return vector of physical devices containing their respective properties
+     **/
+    std::vector<vk::PhysicalDevice> listDevices() const;
+
+    /**
+     * The current Vulkan instance.
+     *
+     * @return a shared pointer to the current Vulkan instance held by this
+     *object
+     **/
+    std::shared_ptr<vk::Instance> getVkInstance() const;
+
+    std::shared_ptr<vk::Device> device() const { return mDevice; }
+    std::shared_ptr<vk::PhysicalDevice> physicalDevice() const { return mPhysicalDevice; }
+
+  private:
+    // -------------- OPTIONALLY OWNED RESOURCES
+    std::shared_ptr<vk::Instance> mInstance = nullptr;
+    bool mFreeInstance = false;
+    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
+    std::shared_ptr<vk::Device> mDevice = nullptr;
+    bool mFreeDevice = false;
+
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::weak_ptr<Tensor>> mManagedTensors;
+    std::vector<std::weak_ptr<Sequence>> mManagedSequences;
+    std::vector<std::weak_ptr<Algorithm>> mManagedAlgorithms;
+
+    std::vector<uint32_t> mComputeQueueFamilyIndices;
+    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
+
+    bool mManageResources = false;
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    vk::DebugReportCallbackEXT mDebugReportCallback;
+    vk::DispatchLoaderDynamic mDebugDispatcher;
+#endif
+
+    // Create functions
+    void createInstance();
+    void createDevice(const std::vector<uint32_t>& familyQueueIndices = {},
+                      uint32_t physicalDeviceIndex = 0,
+                      const std::vector<std::string>& desiredExtensions = {});
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/Sequence.hpp b/kompute/src/include/kompute/Sequence.hpp
new file mode 100644
index 0000000000000..e282242f1d991
--- /dev/null
+++ b/kompute/src/include/kompute/Sequence.hpp
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/operations/OpAlgoDispatch.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ *  Container of operations that can be sent to GPU as batch
+ */
+class Sequence : public std::enable_shared_from_this<Sequence>
+{
+  public:
+    /**
+     * Main constructor for sequence which requires core vulkan components to
+     * generate all dependent resources.
+     *
+     * @param physicalDevice Vulkan physical device
+     * @param device Vulkan logical device
+     * @param computeQueue Vulkan compute queue
+     * @param queueIndex Vulkan compute queue index in device
+     * @param totalTimestamps Maximum number of timestamps to allocate
+     */
+    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+             std::shared_ptr<vk::Device> device,
+             std::shared_ptr<vk::Queue> computeQueue,
+             uint32_t queueIndex,
+             uint32_t totalTimestamps = 0);
+    /**
+     * Destructor for sequence which is responsible for cleaning all subsequent
+     * owned operations.
+     */
+    ~Sequence();
+
+    /**
+     * Record function for operation to be added to the GPU queue in batch. This
+     * template requires classes to be derived from the OpBase class. This
+     * function also requires the Sequence to be recording, otherwise it will
+     * not be able to add the operation.
+     *
+     * @param op Object derived from kp::BaseOp that will be recoreded by the
+     * sequence which will be used when the operation is evaluated.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
+
+    /**
+     * Record function for operation to be added to the GPU queue in batch. This
+     * template requires classes to be derived from the OpBase class. This
+     * function also requires the Sequence to be recording, otherwise it will
+     * not be able to add the operation.
+     *
+     * @param tensors Vector of tensors to use for the operation
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> record(
+      std::vector<std::shared_ptr<Tensor>> tensors,
+      TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
+        return this->record(op);
+    }
+    /**
+     * Record function for operation to be added to the GPU queue in batch. This
+     * template requires classes to be derived from the OpBase class. This
+     * function also requires the Sequence to be recording, otherwise it will
+     * not be able to add the operation.
+     *
+     * @param algorithm Algorithm to use for the record often used for OpAlgo
+     * operations
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm,
+                                     TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(algorithm,
+                                     std::forward<TArgs>(params)...) };
+        return this->record(op);
+    }
+
+    /**
+     * Eval sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job synchronously (with a barrier).
+     *
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> eval();
+
+    /**
+     * Resets all the recorded and stored operations, records the operation
+     * provided and submits into the gpu as a submit job synchronously (with a
+     * barrier).
+     *
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);
+
+    /**
+     * Eval sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier.
+     *
+     * @param tensors Vector of tensors to use for the operation
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors,
+                                   TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
+        return this->eval(op);
+    }
+
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> eval(vk::Buffer *primaryBuffer,
+                                   vk::Buffer *stagingBuffer,
+                                   vk::DeviceSize size,
+                                   TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(primaryBuffer, stagingBuffer, size, std::forward<TArgs>(params)...) };
+        return this->eval(op);
+    }
+
+    /**
+     * Eval sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier.
+     *
+     * @param algorithm Algorithm to use for the record often used for OpAlgo
+     * operations
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm,
+                                   TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(algorithm,
+                                     std::forward<TArgs>(params)...) };
+        return this->eval(op);
+    }
+
+    /**
+     * Eval Async sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job without a barrier. EvalAwait()
+     * must ALWAYS be called after to ensure the sequence is terminated
+     * correctly.
+     *
+     * @return Boolean stating whether execution was successful.
+     */
+    std::shared_ptr<Sequence> evalAsync();
+    /**
+     * Clears currnet operations to record provided one in the vector of
+     * operations into the gpu as a submit job without a barrier. EvalAwait()
+     * must ALWAYS be called after to ensure the sequence is terminated
+     * correctly.
+     *
+     * @return Boolean stating whether execution was successful.
+     */
+    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);
+    /**
+     * Eval sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier.
+     *
+     * @param tensors Vector of tensors to use for the operation
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> evalAsync(
+      std::vector<std::shared_ptr<Tensor>> tensors,
+      TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
+        return this->evalAsync(op);
+    }
+    /**
+     * Eval sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier.
+     *
+     * @param algorithm Algorithm to use for the record often used for OpAlgo
+     * operations
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm,
+                                        TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(algorithm,
+                                     std::forward<TArgs>(params)...) };
+        return this->evalAsync(op);
+    }
+
+    /**
+     * Eval Await waits for the fence to finish processing and then once it
+     * finishes, it runs the postEval of all operations.
+     *
+     * @param waitFor Number of milliseconds to wait before timing out.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
+
+    /**
+     * Clear function clears all operations currently recorded and starts
+     * recording again.
+     */
+    void clear();
+
+    /**
+     * Return the timestamps that were latched at the beginning and
+     * after each operation during the last eval() call.
+     */
+    std::vector<std::uint64_t> getTimestamps();
+
+    /**
+     * Begins recording commands for commands to be submitted into the command
+     * buffer.
+     */
+    void begin();
+
+    /**
+     * Ends the recording and stops recording commands when the record command
+     * is sent.
+     */
+    void end();
+
+    /**
+     * Returns true if the sequence is currently in recording activated.
+     *
+     * @return Boolean stating if recording ongoing.
+     */
+    bool isRecording() const;
+
+    /**
+     * Returns true if the sequence has been initialised, and it's based on the
+     * GPU resources being referenced.
+     *
+     * @return Boolean stating if is initialized
+     */
+    bool isInit() const;
+
+    /**
+     * Clears command buffer and triggers re-record of all the current
+     * operations saved, which is useful if the underlying kp::Tensors or
+     * kp::Algorithms are modified and need to be re-recorded.
+     */
+    void rerecord();
+
+    /**
+     * Returns true if the sequence is currently running - mostly used for async
+     * workloads.
+     *
+     * @return Boolean stating if currently running.
+     */
+    bool isRunning() const;
+
+    /**
+     * Destroys and frees the GPU resources which include the buffer and memory
+     * and sets the sequence as init=False.
+     */
+    void destroy();
+
+  private:
+    // -------------- NEVER OWNED RESOURCES
+    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
+    std::shared_ptr<vk::Device> mDevice = nullptr;
+    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
+    uint32_t mQueueIndex = -1;
+
+    // -------------- OPTIONALLY OWNED RESOURCES
+    std::shared_ptr<vk::CommandPool> mCommandPool = nullptr;
+    bool mFreeCommandPool = false;
+    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
+    bool mFreeCommandBuffer = false;
+
+    // -------------- ALWAYS OWNED RESOURCES
+    vk::Fence mFence;
+    std::vector<std::shared_ptr<OpBase>> mOperations{};
+    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;
+
+    // State
+    bool mRecording = false;
+    bool mIsRunning = false;
+
+    // Create functions
+    void createCommandPool();
+    void createCommandBuffer();
+    void createTimestampQueryPool(uint32_t totalTimestamps);
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp
new file mode 100644
index 0000000000000..4c260ce6b9c63
--- /dev/null
+++ b/kompute/src/include/kompute/Tensor.hpp
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+#pragma once
+
+#include "kompute/Core.hpp"
+#include "logger/Logger.hpp"
+#include <memory>
+#include <string>
+
+namespace kp {
+
+/**
+ * Structured data used in GPU operations.
+ *
+ * Tensors are the base building block in Kompute to perform operations across
+ * GPUs. Each tensor would have a respective Vulkan memory and buffer, which
+ * would be used to store their respective data. The tensors can be used for GPU
+ * data storage or transfer.
+ */
+class Tensor
+{
+  public:
+    /**
+     * Type for tensors created: Device allows memory to be transferred from
+     * staging buffers. Staging are host memory visible. Storage are device
+     * visible but are not set up to transfer or receive data (only for shader
+     * storage).
+     */
+    enum class TensorTypes
+    {
+        eDevice = 0,  ///< Type is device memory, source and destination
+        eHost = 1,    ///< Type is host memory, source and destination
+        eStorage = 2, ///< Type is Device memory (only)
+    };
+    enum class TensorDataTypes
+    {
+        eBool = 0,
+        eInt = 1,
+        eUnsignedInt = 2,
+        eFloat = 3,
+        eDouble = 4,
+    };
+
+    static std::string toString(TensorDataTypes dt);
+    static std::string toString(TensorTypes dt);
+
+    /**
+     *  Constructor with data provided which would be used to create the
+     * respective vulkan buffer and memory.
+     *
+     *  @param physicalDevice The physical device to use to fetch properties
+     *  @param device The device to use to create the buffer and memory from
+     *  @param data Non-zero-sized vector of data that will be used by the
+     * tensor
+     *  @param tensorTypes Type for the tensor which is of type TensorTypes
+     */
+    Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+           std::shared_ptr<vk::Device> device,
+           void* data,
+           uint32_t elementTotalCount,
+           uint32_t memorySize,
+           const TensorDataTypes& dataType,
+           vk::DeviceMemory *primaryMemory,
+           vk::Buffer *primaryBuffer,
+           vk::DeviceMemory *stagingMemory,
+           vk::Buffer *stagingBuffer,
+           vk::DeviceSize offset,
+           const TensorTypes& tensorType = TensorTypes::eDevice);
+
+    /**
+     * Destructor which is in charge of freeing vulkan resources unless they
+     * have been provided externally.
+     */
+    virtual ~Tensor();
+
+    /**
+     * Function to trigger reinitialisation of the tensor buffer and memory with
+     * new data as well as new potential device type.
+     *
+     * @param data Vector of data to use to initialise vector from
+     * @param tensorType The type to use for the tensor
+     */
+    void rebuild(void* data,
+                 uint32_t elementTotalCount,
+                 uint64_t memorySize,
+                 vk::DeviceMemory *primaryMemory,
+                 vk::Buffer *primaryBuffer,
+                 vk::DeviceMemory *stagingMemory,
+                 vk::Buffer *stagingBuffer,
+                 vk::DeviceSize offset);
+
+    /**
+     * Destroys and frees the GPU resources which include the buffer and memory.
+     */
+    void destroy();
+
+    /**
+     * Check whether tensor is initialized based on the created gpu resources.
+     *
+     * @returns Boolean stating whether tensor is initialized
+     */
+    bool isInit();
+
+    /**
+     * Retrieve the tensor type of the Tensor
+     *
+     * @return Tensor type of tensor
+     */
+    TensorTypes tensorType();
+
+    /**
+     * Records a copy from the memory of the tensor provided to the current
+     * thensor. This is intended to pass memory into a processing, to perform
+     * a staging buffer transfer, or to gather output (between others).
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param copyFromTensor Tensor to copy the data from
+     */
+    void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
+                        std::shared_ptr<Tensor> copyFromTensor);
+
+    /**
+     * Records a copy from the internal staging memory to the device memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     */
+    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * Records a copy from the internal device memory to the staging memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     */
+    void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * Records the buffer memory barrier into the primary buffer and command
+     * buffer which ensures that relevant data transfers are carried out
+     * correctly.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param srcAccessMask Access flags for source access mask
+     * @param dstAccessMask Access flags for destination access mask
+     * @param scrStageMask Pipeline stage flags for source stage mask
+     * @param dstStageMask Pipeline stage flags for destination stage mask
+     */
+    void recordPrimaryBufferMemoryBarrier(
+      const vk::CommandBuffer& commandBuffer,
+      vk::AccessFlagBits srcAccessMask,
+      vk::AccessFlagBits dstAccessMask,
+      vk::PipelineStageFlagBits srcStageMask,
+      vk::PipelineStageFlagBits dstStageMask);
+    /**
+     * Records the buffer memory barrier into the staging buffer and command
+     * buffer which ensures that relevant data transfers are carried out
+     * correctly.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param srcAccessMask Access flags for source access mask
+     * @param dstAccessMask Access flags for destination access mask
+     * @param scrStageMask Pipeline stage flags for source stage mask
+     * @param dstStageMask Pipeline stage flags for destination stage mask
+     */
+    void recordStagingBufferMemoryBarrier(
+      const vk::CommandBuffer& commandBuffer,
+      vk::AccessFlagBits srcAccessMask,
+      vk::AccessFlagBits dstAccessMask,
+      vk::PipelineStageFlagBits srcStageMask,
+      vk::PipelineStageFlagBits dstStageMask);
+
+    /**
+     * Constructs a vulkan descriptor buffer info which can be used to specify
+     * and reference the underlying buffer component of the tensor without
+     * exposing it.
+     *
+     * @return Descriptor buffer info with own buffer
+     */
+    vk::DescriptorBufferInfo constructDescriptorBufferInfo();
+
+    /**
+     * Returns the size/magnitude of the Tensor, which will be the total number
+     * of elements across all dimensions
+     *
+     * @return Unsigned integer representing the total number of elements
+     */
+    uint32_t size();
+
+    /**
+     * Returns the total memory size of the data contained by the Tensor object
+     *
+     * @return Unsigned integer representing the memory of the tensor in bytes.
+     */
+    uint64_t memorySize();
+
+    /**
+     * Retrieve the data type of the tensor (host, device, storage)
+     *
+     * @return Data type of tensor of type kp::Tensor::TensorDataTypes
+     */
+    TensorDataTypes dataType();
+
+    /**
+     * Retrieve the raw data via the pointer to the memory that contains the raw
+     * memory of this current tensor. This tensor gets changed to a nullptr when
+     * the Tensor is removed.
+     *
+     * @return Pointer to raw memory containing raw bytes data of Tensor.
+     */
+    void* rawData();
+
+    /**
+     * Sets / resets the data of the tensor which is directly done on the GPU
+     * host visible memory available by the tensor.
+     */
+    void setRawData(const void* data);
+
+    /**
+     * Template to return the pointer data converted by specific type, which
+     * would be any of the supported types including float, double, int32,
+     * uint32 and bool.
+     *
+     * @return Pointer to raw memory containing raw bytes data of Tensor.
+     */
+    template<typename T>
+    T* data()
+    {
+        return (T*)this->mRawData;
+    }
+
+    /**
+     * Template to get the data of the current tensor as a vector of specific
+     * type, which would be any of the supported types including float, double,
+     * int32, uint32 and bool.
+     *
+     * @return Vector of type provided by template.
+     */
+    template<typename T>
+    std::vector<T> vector()
+    {
+        return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
+    }
+
+  protected:
+    // -------------- ALWAYS OWNED RESOURCES
+    TensorTypes mTensorType;
+    TensorDataTypes mDataType;
+    uint32_t mSize = 0;
+    uint64_t mMemorySize = 0;
+    vk::DeviceSize mOffset = 0;
+    void* mRawData = nullptr;
+
+  private:
+    // -------------- NEVER OWNED RESOURCES
+    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
+    std::shared_ptr<vk::Device> mDevice;
+    vk::Buffer *mPrimaryBuffer = nullptr;
+    vk::Buffer *mStagingBuffer = nullptr;
+    vk::DeviceMemory *mPrimaryMemory = nullptr;
+    vk::DeviceMemory *mStagingMemory = nullptr;
+
+    void setGPUResources(vk::DeviceMemory *primaryMemory,
+                         vk::Buffer *primaryBuffer,
+                         vk::DeviceMemory *stagingMemory,
+                         vk::Buffer *stagingBuffer,
+                         vk::DeviceSize offset);
+    void recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
+                          vk::Buffer *bufferFrom,
+                          vk::Buffer *bufferTo,
+                          vk::DeviceSize bufferSize,
+                          vk::BufferCopy copyRegion);
+    void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                   const vk::Buffer& buffer,
+                                   vk::AccessFlagBits srcAccessMask,
+                                   vk::AccessFlagBits dstAccessMask,
+                                   vk::PipelineStageFlagBits srcStageMask,
+                                   vk::PipelineStageFlagBits dstStageMask);
+
+    // Private util functions
+    vk::BufferUsageFlags getPrimaryBufferUsageFlags();
+    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
+    vk::BufferUsageFlags getStagingBufferUsageFlags();
+    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
+};
+
+template<typename T>
+class TensorT : public Tensor
+{
+
+  public:
+    ~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); }
+
+    TensorDataTypes dataType();
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/logger/Logger.hpp b/kompute/src/include/kompute/logger/Logger.hpp
new file mode 100644
index 0000000000000..f97e95cf06c4c
--- /dev/null
+++ b/kompute/src/include/kompute/logger/Logger.hpp
@@ -0,0 +1,197 @@
+#pragma once
+
+#define KOMPUTE_LOG_LEVEL_TRACE 0
+#define KOMPUTE_LOG_LEVEL_DEBUG 1
+#define KOMPUTE_LOG_LEVEL_INFO 2
+#define KOMPUTE_LOG_LEVEL_WARN 3
+#define KOMPUTE_LOG_LEVEL_ERROR 4
+#define KOMPUTE_LOG_LEVEL_CRITICAL 5
+#define KOMPUTE_LOG_LEVEL_OFF 6
+
+// Logging is disabled entirely.
+#if KOMPUTE_OPT_LOG_LEVEL_DISABLED
+#define KP_LOG_TRACE(...)
+#define KP_LOG_DEBUG(...)
+#define KP_LOG_INFO(...)
+#define KP_LOG_WARN(...)
+#define KP_LOG_ERROR(...)
+#else
+
+#if !KOMPUTE_OPT_USE_SPDLOG
+#if VK_USE_PLATFORM_ANDROID_KHR
+#include <android/log.h>
+#include <fmt/core.h>
+static const char* KOMPUTE_LOG_TAG = "KomputeLog";
+#else
+#if KOMPUTE_BUILD_PYTHON
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+// from python/src/main.cpp
+extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
+#else
+#include <fmt/core.h>
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else
+#include <spdlog/spdlog.h>
+#endif // !KOMPUTE_OPT_USE_SPDLOG
+#include <set>
+#include <string>
+#include <vector>
+namespace logger {
+// Setup the logger, note the loglevel can not be set below the CMake log level
+// (To change this use -DKOMPUTE_OPT_LOG_LEVEL=...)
+void
+setupLogger();
+
+// Logging is enabled, but we do not use Spdlog. So we use fmt in case nothing
+// else is defined, overriding logging.
+#if !KOMPUTE_OPT_USE_SPDLOG
+
+#ifndef KP_LOG_TRACE
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_TRACE
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_TRACE(...)                                                      \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_VERBOSE, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON
+#define KP_LOG_DEBUG(...) kp_trace(fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_TRACE(...)                                                      \
+    fmt::print("[{} {}] [trace] [{}:{}] {}\n",                                 \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else
+#define KP_LOG_TRACE(...)
+#endif
+#endif // !KP_LOG_TRACE
+
+#ifndef KP_LOG_DEBUG
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_DEBUG(...)                                                      \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON
+#define KP_LOG_DEBUG(...) kp_debug(fmt::format(__VA_ARGS__))
+#else
+#ifdef __FILE_NAME__ // gcc 12 provides only file name without path
+#define KP_LOG_DEBUG(...)                                                      \
+    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE_NAME__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_DEBUG(...)                                                      \
+    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // __FILE__NAME__
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else
+#define KP_LOG_DEBUG(...)
+#endif
+#endif // !KP_LOG_DEBUG
+
+#ifndef KP_LOG_INFO
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_INFO(...)                                                       \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON
+#define KP_LOG_DEBUG(...) kp_info(fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_INFO(...)                                                       \
+    fmt::print("[{} {}] [info] [{}:{}] {}\n",                                  \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else
+#define KP_LOG_INFO(...)
+#endif
+#endif // !KP_LOG_INFO
+
+#ifndef KP_LOG_WARN
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_WARN
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_WARN(...)                                                       \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_WARN, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON
+#define KP_LOG_DEBUG(...) kp_warning(fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_WARN(...)                                                       \
+    fmt::print("[{} {}] [warn] [{}:{}] {}\n",                                  \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else
+#define KP_LOG_WARN(...)
+#endif
+#endif // !KP_LOG_WARN
+
+#ifndef KP_LOG_ERROR
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_ERROR
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_ERROR(...)                                                      \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_ERROR, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON
+#define KP_LOG_DEBUG(...) kp_error(fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_ERROR(...)                                                      \
+    fmt::print("[{} {}] [error] [{}:{}] {}\n",                                 \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else
+#define KP_LOG_ERROR(...)
+#endif
+#endif // !KP_LOG_ERROR
+#else
+
+#define KP_LOG_TRACE(...) SPDLOG_TRACE(__VA_ARGS__)
+#define KP_LOG_DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__)
+#define KP_LOG_INFO(...) SPDLOG_INFO(__VA_ARGS__)
+#define KP_LOG_WARN(...) SPDLOG_WARN(__VA_ARGS__)
+#define KP_LOG_ERROR(...) SPDLOG_ERROR(__VA_ARGS__)
+
+void
+setLogLevel(spdlog::level::level_enum level);
+
+spdlog::level::level_enum
+getLogLevel();
+
+#endif // !KOMPUTE_OPT_USE_SPDLOG
+} // namespace logger
+
+#endif // KOMPUTE_OPT_LOG_LEVEL_DISABLED
diff --git a/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp b/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
new file mode 100644
index 0000000000000..e91598f0562c2
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Algorithm.hpp"
+#include "kompute/Core.hpp"
+#include "kompute/Tensor.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that provides a general abstraction that simplifies the use of
+ * algorithm and parameter components which can be used with shaders.
+ * By default it enables the user to provide a dynamic number of tensors
+ * which are then passed as inputs.
+ */
+class OpAlgoDispatch : public OpBase
+{
+  public:
+    /**
+     * Constructor that stores the algorithm to use as well as the relevant
+     * push constants to override when recording.
+     *
+     * @param algorithm The algorithm object to use for dispatch
+     * @param pushConstants The push constants to use for override
+     */
+    template<typename T = float>
+    OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm,
+                   const std::vector<T>& pushConstants = {})
+    {
+        KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor");
+
+        this->mAlgorithm = algorithm;
+
+        if (pushConstants.size()) {
+            uint32_t memorySize = sizeof(decltype(pushConstants.back()));
+            uint32_t size = pushConstants.size();
+            uint32_t totalSize = size * memorySize;
+            this->mPushConstantsData = malloc(totalSize);
+            memcpy(this->mPushConstantsData, pushConstants.data(), totalSize);
+            this->mPushConstantsDataTypeMemorySize = memorySize;
+            this->mPushConstantsSize = size;
+        }
+    }
+
+    /**
+     * Default destructor, which is in charge of destroying the algorithm
+     * components but does not destroy the underlying tensors
+     */
+    virtual ~OpAlgoDispatch() override;
+
+    /**
+     * This records the commands that are to be sent to the GPU. This includes
+     * the barriers that ensure the memory has been copied before going in and
+     * out of the shader, as well as the dispatch operation that sends the
+     * shader processing to the gpu. This function also records the GPU memory
+     * copy of the output data for the staging buffer so it can be read by the
+     * host.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::shared_ptr<Algorithm> mAlgorithm;
+    void* mPushConstantsData = nullptr;
+    uint32_t mPushConstantsDataTypeMemorySize = 0;
+    uint32_t mPushConstantsSize = 0;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBase.hpp b/kompute/src/include/kompute/operations/OpBase.hpp
new file mode 100644
index 0000000000000..737670846350d
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpBase.hpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Algorithm.hpp"
+#include "kompute/Core.hpp"
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+/**
+ *  Base Operation which provides the high level interface that Kompute
+ *  operations implement in order to perform a set of actions in the GPU.
+ *
+ *  Operations can perform actions on tensors, and optionally can also own an
+ *  Algorithm with respective parameters. kp::Operations with kp::Algorithms
+ *  would inherit from kp::OpBaseAlgo.
+ */
+class OpBase
+{
+  public:
+    /**
+     * Default destructor for OpBase class. This OpBase destructor class should
+     * always be called to destroy and free owned resources unless it is
+     * intended to destroy the resources in the parent class.
+     */
+    virtual ~OpBase() { KP_LOG_DEBUG("Kompute OpBase destructor started"); }
+
+    /**
+     * The record function is intended to only send a record command or run
+     * commands that are expected to record operations that are to be submitted
+     * as a batch into the GPU.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void record(const vk::CommandBuffer& commandBuffer) = 0;
+
+    /**
+     * Pre eval is called before the Sequence has called eval and submitted the
+     * commands to the GPU for processing, and can be used to perform any
+     * per-eval setup steps required as the computation iteration begins. It's
+     * worth noting that there are situations where eval can be called multiple
+     * times, so the resources that are created should be idempotent in case
+     * it's called multiple times in a row.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0;
+
+    /**
+     * Post eval is called after the Sequence has called eval and submitted the
+     * commands to the GPU for processing, and can be used to perform any
+     * tear-down steps required as the computation iteration finishes. It's
+     * worth noting that there are situations where eval can be called multiple
+     * times, so the resources that are destroyed should not require a re-init
+     * unless explicitly provided by the user.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp b/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
new file mode 100644
index 0000000000000..50d8e97072412
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+class OpBufferSyncDevice : public OpBase
+{
+  public:
+    OpBufferSyncDevice(
+        vk::Buffer *primaryBuffer,
+        vk::Buffer *stagingBuffer,
+        vk::DeviceSize size);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpBufferSyncDevice() override;
+
+    /**
+     * For device buffers, it records the copy command for the buffer to copy
+     * the data from its staging to device memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    vk::Buffer *mPrimaryBuffer;
+    vk::Buffer *mStagingBuffer;
+    vk::DeviceSize mSize;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp b/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
new file mode 100644
index 0000000000000..7db9971991c59
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+class OpBufferSyncLocal : public OpBase
+{
+  public:
+    OpBufferSyncLocal(
+        vk::Buffer *primaryBuffer,
+        vk::Buffer *stagingBuffer,
+        vk::DeviceSize size);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpBufferSyncLocal() override;
+
+    /**
+     * For device buffers, it records the copy command for the buffer to copy
+     * the data from its staging to device memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    vk::Buffer *mPrimaryBuffer;
+    vk::Buffer *mStagingBuffer;
+    vk::DeviceSize mSize;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp b/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
new file mode 100644
index 0000000000000..4a232232397cf
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Algorithm.hpp"
+#include "kompute/Core.hpp"
+#include "kompute/Tensor.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that provides a general abstraction that simplifies the use of
+ * algorithm and parameter components which can be used with shaders.
+ * It exposes the pipeline barrier functionality specifically for memory
+ * barriers that can be configured through the respective source and destination
+ * masks
+ */
+class OpMemoryBarrier : public OpBase
+{
+  public:
+    /**
+     * Constructor that stores tensors as well as memory barrier parameters to
+     * be used to create a pipeline barrier on the respective primary or staging
+     * tensor.
+     *
+     * @param tensors The tensors to apply the memory barriers on
+     * @param srcAccessMask The kp::AccessFlagBits for the source access mask
+     * @param dstAccessMask The kp::AccessFlagBits for the destination access
+     * mask
+     * @param srcStageMask The kp::PipelineStageFlagBits for the source stage
+     * mask
+     * @param dstStageMask The kp::PipelineStageFlagBits for the destination
+     * stage mask
+     * @param barrierOnPrimary Boolean to select primary or secondary buffers on
+     * tensors
+     */
+    OpMemoryBarrier(const std::vector<std::shared_ptr<Tensor>>& tensors,
+                    const vk::AccessFlagBits& srcAccessMask,
+                    const vk::AccessFlagBits& dstAccessMask,
+                    const vk::PipelineStageFlagBits& srcStageMask,
+                    const vk::PipelineStageFlagBits& dstStageMask,
+                    bool barrierOnPrimary = true);
+
+    /**
+     * Default destructor, which is in charge of destroying the reference to the
+     * tensors and all the relevant access / stage masks created
+     */
+    virtual ~OpMemoryBarrier() override;
+
+    /**
+     * This records the memory barrier with the access and stage masks provided
+     * across all relevant tensors.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    const vk::AccessFlagBits mSrcAccessMask;
+    const vk::AccessFlagBits mDstAccessMask;
+    const vk::PipelineStageFlagBits mSrcStageMask;
+    const vk::PipelineStageFlagBits mDstStageMask;
+    const bool mBarrierOnPrimary;
+    const std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpMult.hpp b/kompute/src/include/kompute/operations/OpMult.hpp
new file mode 100644
index 0000000000000..f75ccc4fbb763
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpMult.hpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include <fstream>
+
+#include "kompute/Core.hpp"
+
+#include "ShaderOpMult.hpp"
+
+#include "kompute/Algorithm.hpp"
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpAlgoDispatch.hpp"
+
+namespace kp {
+
+/**
+ * Operation that performs multiplication on two tensors and outpus on third
+ * tensor.
+ */
+class OpMult : public OpAlgoDispatch
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the bare minimum
+     * requirements for the operations to be able to create and manage their
+     * sub-components.
+     *
+     * @param tensors Tensors that are to be used in this operation
+     * @param algorithm An algorithm that will be overridden with the OpMult
+     * shader data and the tensors provided which are expected to be 3
+     */
+    OpMult(std::vector<std::shared_ptr<Tensor>> tensors,
+           std::shared_ptr<Algorithm> algorithm)
+      : OpAlgoDispatch(algorithm)
+    {
+        KP_LOG_DEBUG("Kompute OpMult constructor with params");
+
+        if (tensors.size() != 3) {
+            throw std::runtime_error(
+              "Kompute OpMult expected 3 tensors but got " +
+              std::to_string(tensors.size()));
+        }
+
+        const std::vector<uint32_t> spirv = std::vector<uint32_t>(
+          SHADEROPMULT_COMP_SPV.begin(), SHADEROPMULT_COMP_SPV.end());
+
+        algorithm->rebuild<>(tensors, spirv);
+    }
+
+    /**
+     * Default destructor, which is in charge of destroying the algorithm
+     * components but does not destroy the underlying tensors
+     */
+    ~OpMult() override { KP_LOG_DEBUG("Kompute OpMult destructor started"); }
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorCopy.hpp b/kompute/src/include/kompute/operations/OpTensorCopy.hpp
new file mode 100644
index 0000000000000..968c1065a3388
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpTensorCopy.hpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that copies the data from the first tensor to the rest of the
+ * tensors provided, using a record command for all the vectors. This operation
+ * does not own/manage the memory of the tensors passed to it. The operation
+ * must only receive tensors of type
+ */
+class OpTensorCopy : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation.
+     *
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorCopy() override;
+
+    /**
+     * Records the copy commands from the first tensor into all the other
+     * tensors provided. Also optionally records a barrier.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Copies the local vectors for all the tensors to sync the data with the
+     * gpu.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp b/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
new file mode 100644
index 0000000000000..9b39e490f774e
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+#include "kompute/Tensor.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that syncs tensor's device by mapping local data into the device
+ * memory. For TensorTypes::eDevice it will use a record operation for the
+ * memory to be syncd into GPU memory which means that the operation will be
+ * done in sync with GPU commands. For TensorTypes::eHost it will only map the
+ * data into host memory which will happen during preEval before the recorded
+ * commands are dispatched.
+ */
+class OpTensorSyncDevice : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation. The tensos
+     * provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorSyncDevice() override;
+
+    /**
+     * For device tensors, it records the copy command for the tensor to copy
+     * the data from its staging to device memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+    vk::Buffer *mPrimaryBuffer;
+    vk::Buffer *mStagingBuffer;
+    vk::DeviceSize mSize;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp b/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
new file mode 100644
index 0000000000000..4216003e530c5
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that syncs tensor's local memory by mapping device data into the
+ * local CPU memory. For TensorTypes::eDevice it will use a record operation
+ * for the memory to be syncd into GPU memory which means that the operation
+ * will be done in sync with GPU commands. For TensorTypes::eHost it will
+ * only map the data into host memory which will happen during preEval before
+ * the recorded commands are dispatched.
+ */
+class OpTensorSyncLocal : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation. The tensors
+     * provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorSyncLocal(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorSyncLocal() override;
+
+    /**
+     * For device tensors, it records the copy command for the tensor to copy
+     * the data from its device to staging memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * For host tensors it performs the map command from the host memory into
+     * local memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
diff --git a/kompute/src/logger/CMakeLists.txt b/kompute/src/logger/CMakeLists.txt
new file mode 100644
index 0000000000000..1dcc1e6b5a9c9
--- /dev/null
+++ b/kompute/src/logger/CMakeLists.txt
@@ -0,0 +1,69 @@
+cmake_minimum_required(VERSION 3.20)
+
+set(LOGGER_SOURCES Logger.cpp)
+
+add_library(kp_logger ${LOGGER_SOURCES})
+
+# Define log levels in code
+add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_DEBUG=1)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_INFO=2)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_WARN=3)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_ERROR=4)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_CRITICAL=5)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_OFF=6)
+
+if(KOMPUTE_OPT_BUILD_PYTHON AND KOMPUTE_OPT_USE_SPDLOG)
+    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_PYTHON' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
+endif()
+
+if(KOMPUTE_OPT_ANDROID_BUILD AND KOMPUTE_OPT_USE_SPDLOG)
+    message(FATAL_ERROR "'KOMPUTE_OPT_ANDROID_BUILD' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
+endif()
+
+if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Trace")
+    set(KOMPUTE_OPT_LOG_LEVEL TRACE)
+    message(STATUS "Using log level Trace")
+elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Debug")
+    set(KOMPUTE_OPT_LOG_LEVEL DEBUG)
+    message(STATUS "Using log level Debug")
+elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Info")
+    set(KOMPUTE_OPT_LOG_LEVEL INFO)
+    message(STATUS "Using log level Info")
+elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Warn")
+    set(KOMPUTE_OPT_LOG_LEVEL WARN)
+    message(STATUS "Using log level Warn")
+elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Error")
+    set(KOMPUTE_OPT_LOG_LEVEL ERROR)
+    message(STATUS "Using log level Error")
+elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Critical")
+    set(KOMPUTE_OPT_LOG_LEVEL CRITICAL)
+    message(STATUS "Using log level Critical")
+elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off")
+    set(KOMPUTE_OPT_LOG_LEVEL OFF)
+    message(STATUS "Using log level Off")
+elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Default")
+    set(KOMPUTE_OPT_LOG_LEVEL $<IF:$<CONFIG:Debug>,DEBUG,INFO>)
+    message(STATUS "Setting KOMPUTE_OPT_LOG_LEVEL to according to the build type")
+else()
+    message(FATAL_ERROR "Log level '${KOMPUTE_OPT_LOG_LEVEL}' unknown, use -DKOMPUTE_OPT_LOG_LEVEL={Trace, Debug, Info, Warn, Error, Critical, Off, Default} to set it to a correct value.")
+endif()
+
+# Always make sure we define the Kompute log level independent of the Spdlog log level
+target_compile_definitions(kp_logger INTERFACE KOMPUTE_OPT_ACTIVE_LOG_LEVEL=KOMPUTE_LOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
+
+# Link depending on how the logger should be setup
+if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
+    if(KOMPUTE_OPT_USE_SPDLOG)
+        target_link_libraries(kp_logger PUBLIC spdlog::spdlog)
+        target_compile_definitions(spdlog INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
+        target_compile_definitions(kp_logger INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
+        message(STATUS "setting SPDLOG_ACTIVE_LEVEL to SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}")
+
+        if(KOMPUTE_OPT_SPDLOG_ASYNC_MODE)
+            target_compile_definitions(kp_logger INTERFACE KOMPUTE_SPDLOG_ASYNC_LOGGING=1)
+        endif()
+    else()
+        target_link_libraries(kp_logger PUBLIC fmt::fmt)
+    endif()
+endif()
diff --git a/kompute/src/logger/Logger.cpp b/kompute/src/logger/Logger.cpp
new file mode 100644
index 0000000000000..69df2b609610c
--- /dev/null
+++ b/kompute/src/logger/Logger.cpp
@@ -0,0 +1,101 @@
+#include "kompute/logger/Logger.hpp"
+
+#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
+#if !KOMPUTE_OPT_USE_SPDLOG
+#else
+#include <cassert>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <spdlog/async.h>
+#include <spdlog/common.h>
+#include <spdlog/logger.h>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+#include <string>
+#endif // !KOMPUTE_OPT_USE_SPDLOG
+
+namespace logger {
+#if !KOMPUTE_OPT_USE_SPDLOG
+
+void
+setupLogger()
+{
+}
+
+#else
+constexpr int THREAD_QUEUE_LENGTH = 8192;
+
+void
+setupLogger()
+{
+    // Ensure we setup the logger only once
+    static bool setup = false;
+    static std::mutex setupMutex{};
+    setupMutex.lock();
+    if (setup) {
+        setupMutex.unlock();
+        return;
+    }
+    setup = true;
+    setupMutex.unlock();
+
+    spdlog::init_thread_pool(THREAD_QUEUE_LENGTH, 1);
+    spdlog::sink_ptr console_sink =
+      std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
+#if SPDLOG_ACTIVE_LEVEL < SPDLOG_LEVEL_INFO
+    console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=21s] %v");
+#else
+    console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=15s] %v");
+#endif
+    std::vector<spdlog::sink_ptr> sinks{ console_sink };
+    // TODO: Add flag in compile flags
+    std::shared_ptr<spdlog::logger> logger =
+#if KOMPUTE_SPDLOG_ASYNC_LOGGING
+          std::make_shared<spdlog::async_logger>(
+            "",
+            sinks.begin(),
+            sinks.end(),
+            spdlog::thread_pool(),
+            spdlog::async_overflow_policy::block);
+#else
+          std::make_shared<spdlog::logger>(
+            "",
+            sinks.begin(),
+            sinks.end());
+#endif
+
+    logger->set_level(getLogLevel());
+
+    spdlog::set_default_logger(logger);
+}
+
+spdlog::level::level_enum
+getLogLevel()
+{
+#if SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_TRACE
+    return spdlog::level::trace;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_DEBUG
+    return spdlog::level::debug;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_INFO
+    return spdlog::level::info;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_WARN
+    return spdlog::level::warn;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_ERROR
+    return spdlog::level::error;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_CRITICAL
+    return spdlog::level::critical;
+#else
+    return spdlog::level::off;
+#endif
+}
+
+void
+setLogLevel(const spdlog::level::level_enum level)
+{
+    spdlog::default_logger()->set_level(level);
+}
+#endif // !KOMPUTE_OPT_USE_SPDLOG
+} // namespace logger
+
+#endif
diff --git a/kompute/src/shaders/CMakeLists.txt b/kompute/src/shaders/CMakeLists.txt
new file mode 100644
index 0000000000000..901bf3e8a8af2
--- /dev/null
+++ b/kompute/src/shaders/CMakeLists.txt
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+# ######################
+cmake_minimum_required(VERSION 3.20)
+
+add_subdirectory(glsl)
\ No newline at end of file
diff --git a/kompute/src/shaders/glsl/CMakeLists.txt b/kompute/src/shaders/glsl/CMakeLists.txt
new file mode 100644
index 0000000000000..3101a2b17b751
--- /dev/null
+++ b/kompute/src/shaders/glsl/CMakeLists.txt
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+# ######################
+cmake_minimum_required(VERSION 3.20)
+
+# Check if build shaders from source is enabled
+if(KOMPUTE_OPT_BUILD_SHADERS)
+    vulkan_compile_shader(INFILE ShaderOpMult.comp
+        OUTFILE ShaderOpMult.hpp
+        NAMESPACE "kp")
+
+    vulkan_compile_shader(INFILE ShaderLogisticRegression.comp
+        OUTFILE ShaderLogisticRegression.hpp
+        NAMESPACE "kp")
+else() # Else we will use our precompiled versions
+    add_custom_command(OUTPUT $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderOpMult.hpp.in $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp)
+    add_custom_command(OUTPUT $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderLogisticRegression.hpp.in $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp)
+endif()
+
+add_library(kp_shader INTERFACE "${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp"
+    "${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp")
+
+target_include_directories(kp_shader INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
+
+# Make sure we install shaders:
+install(FILES $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(FILES $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.comp b/kompute/src/shaders/glsl/ShaderLogisticRegression.comp
new file mode 100644
index 0000000000000..5a1c5d9486754
--- /dev/null
+++ b/kompute/src/shaders/glsl/ShaderLogisticRegression.comp
@@ -0,0 +1,52 @@
+#version 450
+
+layout (constant_id = 0) const float m = 0;
+
+layout (local_size_x = 1) in;
+
+layout(set = 0, binding = 0) buffer bxi { float xi[]; };
+layout(set = 0, binding = 1) buffer bxj { float xj[]; };
+layout(set = 0, binding = 2) buffer by { float y[]; };
+layout(set = 0, binding = 3) buffer bwin { float win[]; };
+layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
+layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
+layout(set = 0, binding = 6) buffer bbin { float bin[]; };
+layout(set = 0, binding = 7) buffer bbout { float bout[]; };
+layout(set = 0, binding = 8) buffer blout { float lout[]; };
+
+float sigmoid(float z) {
+    return 1.0 / (1.0 + exp(-z));
+}
+
+float inference(vec2 x, vec2 w, float b) {
+    // Compute the linear mapping function
+    float z = dot(w, x) + b;
+    // Calculate the y-hat with sigmoid
+    float yHat = sigmoid(z);
+    return yHat;
+}
+
+float calculateLoss(float yHat, float y) {
+    return -(y * log(yHat)  +  (1.0 - y) * log(1.0 - yHat));
+}
+
+void main() {
+    uint idx = gl_GlobalInvocationID.x;
+
+    vec2 wCurr = vec2(win[0], win[1]);
+    float bCurr = bin[0];
+
+    vec2 xCurr = vec2(xi[idx], xj[idx]);
+    float yCurr = y[idx];
+
+    float yHat = inference(xCurr, wCurr, bCurr);
+
+    float dZ = yHat - yCurr;
+    vec2 dW = (1. / m) * xCurr * dZ;
+    float dB = (1. / m) * dZ;
+    wouti[idx] = dW.x;
+    woutj[idx] = dW.y;
+    bout[idx] = dB;
+
+    lout[idx] = calculateLoss(yHat, yCurr);
+}
diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in b/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
new file mode 100644
index 0000000000000..bfe7792c6c8d9
--- /dev/null
+++ b/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
@@ -0,0 +1,310 @@
+#pragma once
+#include <array>
+#include <cstdint>
+
+namespace kp {
+const std::array<uint32_t, 1204> SHADERLOGISTICREGRESSION_COMP_SPV = { 
+0x07230203, 0x00010000, 0x0008000a, 0x000000ae, 
+0x00000000, 0x00020011, 0x00000001, 0x0006000b, 
+0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 
+0x00000000, 0x0003000e, 0x00000000, 0x00000001, 
+0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, 
+0x00000000, 0x00000041, 0x00060010, 0x00000004, 
+0x00000011, 0x00000001, 0x00000001, 0x00000001, 
+0x00030003, 0x00000002, 0x000001c2, 0x00040005, 
+0x00000004, 0x6e69616d, 0x00000000, 0x00050005, 
+0x0000000a, 0x6d676973, 0x2864696f, 0x003b3166, 
+0x00030005, 0x00000009, 0x0000007a, 0x00080005, 
+0x00000012, 0x65666e69, 0x636e6572, 0x66762865, 
+0x66763b32, 0x31663b32, 0x0000003b, 0x00030005, 
+0x0000000f, 0x00000078, 0x00030005, 0x00000010, 
+0x00000077, 0x00030005, 0x00000011, 0x00000062, 
+0x00080005, 0x00000017, 0x636c6163, 0x74616c75, 
+0x736f4c65, 0x31662873, 0x3b31663b, 0x00000000, 
+0x00040005, 0x00000015, 0x74614879, 0x00000000, 
+0x00030005, 0x00000016, 0x00000079, 0x00030005, 
+0x00000021, 0x0000007a, 0x00040005, 0x00000027, 
+0x74614879, 0x00000000, 0x00040005, 0x00000028, 
+0x61726170, 0x0000006d, 0x00030005, 0x0000003e, 
+0x00786469, 0x00080005, 0x00000041, 0x475f6c67, 
+0x61626f6c, 0x766e496c, 0x7461636f, 0x496e6f69, 
+0x00000044, 0x00040005, 0x00000046, 0x72754377, 
+0x00000072, 0x00040005, 0x00000048, 0x6e697762, 
+0x00000000, 0x00040006, 0x00000048, 0x00000000, 
+0x006e6977, 0x00030005, 0x0000004a, 0x00000000, 
+0x00040005, 0x00000054, 0x72754362, 0x00000072, 
+0x00040005, 0x00000056, 0x6e696262, 0x00000000, 
+0x00040006, 0x00000056, 0x00000000, 0x006e6962, 
+0x00030005, 0x00000058, 0x00000000, 0x00040005, 
+0x0000005b, 0x72754378, 0x00000072, 0x00030005, 
+0x0000005d, 0x00697862, 0x00040006, 0x0000005d, 
+0x00000000, 0x00006978, 0x00030005, 0x0000005f, 
+0x00000000, 0x00030005, 0x00000064, 0x006a7862, 
+0x00040006, 0x00000064, 0x00000000, 0x00006a78, 
+0x00030005, 0x00000066, 0x00000000, 0x00040005, 
+0x0000006b, 0x72754379, 0x00000072, 0x00030005, 
+0x0000006d, 0x00007962, 0x00040006, 0x0000006d, 
+0x00000000, 0x00000079, 0x00030005, 0x0000006f, 
+0x00000000, 0x00040005, 0x00000073, 0x74614879, 
+0x00000000, 0x00040005, 0x00000074, 0x61726170, 
+0x0000006d, 0x00040005, 0x00000076, 0x61726170, 
+0x0000006d, 0x00040005, 0x00000078, 0x61726170, 
+0x0000006d, 0x00030005, 0x0000007b, 0x00005a64, 
+0x00030005, 0x0000007f, 0x00005764, 0x00030005, 
+0x00000080, 0x0000006d, 0x00030005, 0x00000086, 
+0x00004264, 0x00040005, 0x0000008b, 0x756f7762, 
+0x00006974, 0x00050006, 0x0000008b, 0x00000000, 
+0x74756f77, 0x00000069, 0x00030005, 0x0000008d, 
+0x00000000, 0x00040005, 0x00000093, 0x756f7762, 
+0x00006a74, 0x00050006, 0x00000093, 0x00000000, 
+0x74756f77, 0x0000006a, 0x00030005, 0x00000095, 
+0x00000000, 0x00040005, 0x0000009c, 0x756f6262, 
+0x00000074, 0x00050006, 0x0000009c, 0x00000000, 
+0x74756f62, 0x00000000, 0x00030005, 0x0000009e, 
+0x00000000, 0x00040005, 0x000000a3, 0x756f6c62, 
+0x00000074, 0x00050006, 0x000000a3, 0x00000000, 
+0x74756f6c, 0x00000000, 0x00030005, 0x000000a5, 
+0x00000000, 0x00040005, 0x000000a7, 0x61726170, 
+0x0000006d, 0x00040005, 0x000000a9, 0x61726170, 
+0x0000006d, 0x00040047, 0x00000041, 0x0000000b, 
+0x0000001c, 0x00040047, 0x00000047, 0x00000006, 
+0x00000004, 0x00050048, 0x00000048, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x00000048, 
+0x00000003, 0x00040047, 0x0000004a, 0x00000022, 
+0x00000000, 0x00040047, 0x0000004a, 0x00000021, 
+0x00000003, 0x00040047, 0x00000055, 0x00000006, 
+0x00000004, 0x00050048, 0x00000056, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x00000056, 
+0x00000003, 0x00040047, 0x00000058, 0x00000022, 
+0x00000000, 0x00040047, 0x00000058, 0x00000021, 
+0x00000006, 0x00040047, 0x0000005c, 0x00000006, 
+0x00000004, 0x00050048, 0x0000005d, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x0000005d, 
+0x00000003, 0x00040047, 0x0000005f, 0x00000022, 
+0x00000000, 0x00040047, 0x0000005f, 0x00000021, 
+0x00000000, 0x00040047, 0x00000063, 0x00000006, 
+0x00000004, 0x00050048, 0x00000064, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x00000064, 
+0x00000003, 0x00040047, 0x00000066, 0x00000022, 
+0x00000000, 0x00040047, 0x00000066, 0x00000021, 
+0x00000001, 0x00040047, 0x0000006c, 0x00000006, 
+0x00000004, 0x00050048, 0x0000006d, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x0000006d, 
+0x00000003, 0x00040047, 0x0000006f, 0x00000022, 
+0x00000000, 0x00040047, 0x0000006f, 0x00000021, 
+0x00000002, 0x00040047, 0x00000080, 0x00000001, 
+0x00000000, 0x00040047, 0x0000008a, 0x00000006, 
+0x00000004, 0x00050048, 0x0000008b, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x0000008b, 
+0x00000003, 0x00040047, 0x0000008d, 0x00000022, 
+0x00000000, 0x00040047, 0x0000008d, 0x00000021, 
+0x00000004, 0x00040047, 0x00000092, 0x00000006, 
+0x00000004, 0x00050048, 0x00000093, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x00000093, 
+0x00000003, 0x00040047, 0x00000095, 0x00000022, 
+0x00000000, 0x00040047, 0x00000095, 0x00000021, 
+0x00000005, 0x00040047, 0x0000009b, 0x00000006, 
+0x00000004, 0x00050048, 0x0000009c, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x0000009c, 
+0x00000003, 0x00040047, 0x0000009e, 0x00000022, 
+0x00000000, 0x00040047, 0x0000009e, 0x00000021, 
+0x00000007, 0x00040047, 0x000000a2, 0x00000006, 
+0x00000004, 0x00050048, 0x000000a3, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x000000a3, 
+0x00000003, 0x00040047, 0x000000a5, 0x00000022, 
+0x00000000, 0x00040047, 0x000000a5, 0x00000021, 
+0x00000008, 0x00040047, 0x000000ad, 0x0000000b, 
+0x00000019, 0x00020013, 0x00000002, 0x00030021, 
+0x00000003, 0x00000002, 0x00030016, 0x00000006, 
+0x00000020, 0x00040020, 0x00000007, 0x00000007, 
+0x00000006, 0x00040021, 0x00000008, 0x00000006, 
+0x00000007, 0x00040017, 0x0000000c, 0x00000006, 
+0x00000002, 0x00040020, 0x0000000d, 0x00000007, 
+0x0000000c, 0x00060021, 0x0000000e, 0x00000006, 
+0x0000000d, 0x0000000d, 0x00000007, 0x00050021, 
+0x00000014, 0x00000006, 0x00000007, 0x00000007, 
+0x0004002b, 0x00000006, 0x00000019, 0x3f800000, 
+0x00040015, 0x0000003c, 0x00000020, 0x00000000, 
+0x00040020, 0x0000003d, 0x00000007, 0x0000003c, 
+0x00040017, 0x0000003f, 0x0000003c, 0x00000003, 
+0x00040020, 0x00000040, 0x00000001, 0x0000003f, 
+0x0004003b, 0x00000040, 0x00000041, 0x00000001, 
+0x0004002b, 0x0000003c, 0x00000042, 0x00000000, 
+0x00040020, 0x00000043, 0x00000001, 0x0000003c, 
+0x0003001d, 0x00000047, 0x00000006, 0x0003001e, 
+0x00000048, 0x00000047, 0x00040020, 0x00000049, 
+0x00000002, 0x00000048, 0x0004003b, 0x00000049, 
+0x0000004a, 0x00000002, 0x00040015, 0x0000004b, 
+0x00000020, 0x00000001, 0x0004002b, 0x0000004b, 
+0x0000004c, 0x00000000, 0x00040020, 0x0000004d, 
+0x00000002, 0x00000006, 0x0004002b, 0x0000004b, 
+0x00000050, 0x00000001, 0x0003001d, 0x00000055, 
+0x00000006, 0x0003001e, 0x00000056, 0x00000055, 
+0x00040020, 0x00000057, 0x00000002, 0x00000056, 
+0x0004003b, 0x00000057, 0x00000058, 0x00000002, 
+0x0003001d, 0x0000005c, 0x00000006, 0x0003001e, 
+0x0000005d, 0x0000005c, 0x00040020, 0x0000005e, 
+0x00000002, 0x0000005d, 0x0004003b, 0x0000005e, 
+0x0000005f, 0x00000002, 0x0003001d, 0x00000063, 
+0x00000006, 0x0003001e, 0x00000064, 0x00000063, 
+0x00040020, 0x00000065, 0x00000002, 0x00000064, 
+0x0004003b, 0x00000065, 0x00000066, 0x00000002, 
+0x0003001d, 0x0000006c, 0x00000006, 0x0003001e, 
+0x0000006d, 0x0000006c, 0x00040020, 0x0000006e, 
+0x00000002, 0x0000006d, 0x0004003b, 0x0000006e, 
+0x0000006f, 0x00000002, 0x00040032, 0x00000006, 
+0x00000080, 0x00000000, 0x0003001d, 0x0000008a, 
+0x00000006, 0x0003001e, 0x0000008b, 0x0000008a, 
+0x00040020, 0x0000008c, 0x00000002, 0x0000008b, 
+0x0004003b, 0x0000008c, 0x0000008d, 0x00000002, 
+0x0003001d, 0x00000092, 0x00000006, 0x0003001e, 
+0x00000093, 0x00000092, 0x00040020, 0x00000094, 
+0x00000002, 0x00000093, 0x0004003b, 0x00000094, 
+0x00000095, 0x00000002, 0x0004002b, 0x0000003c, 
+0x00000097, 0x00000001, 0x0003001d, 0x0000009b, 
+0x00000006, 0x0003001e, 0x0000009c, 0x0000009b, 
+0x00040020, 0x0000009d, 0x00000002, 0x0000009c, 
+0x0004003b, 0x0000009d, 0x0000009e, 0x00000002, 
+0x0003001d, 0x000000a2, 0x00000006, 0x0003001e, 
+0x000000a3, 0x000000a2, 0x00040020, 0x000000a4, 
+0x00000002, 0x000000a3, 0x0004003b, 0x000000a4, 
+0x000000a5, 0x00000002, 0x0006002c, 0x0000003f, 
+0x000000ad, 0x00000097, 0x00000097, 0x00000097, 
+0x00050036, 0x00000002, 0x00000004, 0x00000000, 
+0x00000003, 0x000200f8, 0x00000005, 0x0004003b, 
+0x0000003d, 0x0000003e, 0x00000007, 0x0004003b, 
+0x0000000d, 0x00000046, 0x00000007, 0x0004003b, 
+0x00000007, 0x00000054, 0x00000007, 0x0004003b, 
+0x0000000d, 0x0000005b, 0x00000007, 0x0004003b, 
+0x00000007, 0x0000006b, 0x00000007, 0x0004003b, 
+0x00000007, 0x00000073, 0x00000007, 0x0004003b, 
+0x0000000d, 0x00000074, 0x00000007, 0x0004003b, 
+0x0000000d, 0x00000076, 0x00000007, 0x0004003b, 
+0x00000007, 0x00000078, 0x00000007, 0x0004003b, 
+0x00000007, 0x0000007b, 0x00000007, 0x0004003b, 
+0x0000000d, 0x0000007f, 0x00000007, 0x0004003b, 
+0x00000007, 0x00000086, 0x00000007, 0x0004003b, 
+0x00000007, 0x000000a7, 0x00000007, 0x0004003b, 
+0x00000007, 0x000000a9, 0x00000007, 0x00050041, 
+0x00000043, 0x00000044, 0x00000041, 0x00000042, 
+0x0004003d, 0x0000003c, 0x00000045, 0x00000044, 
+0x0003003e, 0x0000003e, 0x00000045, 0x00060041, 
+0x0000004d, 0x0000004e, 0x0000004a, 0x0000004c, 
+0x0000004c, 0x0004003d, 0x00000006, 0x0000004f, 
+0x0000004e, 0x00060041, 0x0000004d, 0x00000051, 
+0x0000004a, 0x0000004c, 0x00000050, 0x0004003d, 
+0x00000006, 0x00000052, 0x00000051, 0x00050050, 
+0x0000000c, 0x00000053, 0x0000004f, 0x00000052, 
+0x0003003e, 0x00000046, 0x00000053, 0x00060041, 
+0x0000004d, 0x00000059, 0x00000058, 0x0000004c, 
+0x0000004c, 0x0004003d, 0x00000006, 0x0000005a, 
+0x00000059, 0x0003003e, 0x00000054, 0x0000005a, 
+0x0004003d, 0x0000003c, 0x00000060, 0x0000003e, 
+0x00060041, 0x0000004d, 0x00000061, 0x0000005f, 
+0x0000004c, 0x00000060, 0x0004003d, 0x00000006, 
+0x00000062, 0x00000061, 0x0004003d, 0x0000003c, 
+0x00000067, 0x0000003e, 0x00060041, 0x0000004d, 
+0x00000068, 0x00000066, 0x0000004c, 0x00000067, 
+0x0004003d, 0x00000006, 0x00000069, 0x00000068, 
+0x00050050, 0x0000000c, 0x0000006a, 0x00000062, 
+0x00000069, 0x0003003e, 0x0000005b, 0x0000006a, 
+0x0004003d, 0x0000003c, 0x00000070, 0x0000003e, 
+0x00060041, 0x0000004d, 0x00000071, 0x0000006f, 
+0x0000004c, 0x00000070, 0x0004003d, 0x00000006, 
+0x00000072, 0x00000071, 0x0003003e, 0x0000006b, 
+0x00000072, 0x0004003d, 0x0000000c, 0x00000075, 
+0x0000005b, 0x0003003e, 0x00000074, 0x00000075, 
+0x0004003d, 0x0000000c, 0x00000077, 0x00000046, 
+0x0003003e, 0x00000076, 0x00000077, 0x0004003d, 
+0x00000006, 0x00000079, 0x00000054, 0x0003003e, 
+0x00000078, 0x00000079, 0x00070039, 0x00000006, 
+0x0000007a, 0x00000012, 0x00000074, 0x00000076, 
+0x00000078, 0x0003003e, 0x00000073, 0x0000007a, 
+0x0004003d, 0x00000006, 0x0000007c, 0x00000073, 
+0x0004003d, 0x00000006, 0x0000007d, 0x0000006b, 
+0x00050083, 0x00000006, 0x0000007e, 0x0000007c, 
+0x0000007d, 0x0003003e, 0x0000007b, 0x0000007e, 
+0x00050088, 0x00000006, 0x00000081, 0x00000019, 
+0x00000080, 0x0004003d, 0x0000000c, 0x00000082, 
+0x0000005b, 0x0005008e, 0x0000000c, 0x00000083, 
+0x00000082, 0x00000081, 0x0004003d, 0x00000006, 
+0x00000084, 0x0000007b, 0x0005008e, 0x0000000c, 
+0x00000085, 0x00000083, 0x00000084, 0x0003003e, 
+0x0000007f, 0x00000085, 0x00050088, 0x00000006, 
+0x00000087, 0x00000019, 0x00000080, 0x0004003d, 
+0x00000006, 0x00000088, 0x0000007b, 0x00050085, 
+0x00000006, 0x00000089, 0x00000087, 0x00000088, 
+0x0003003e, 0x00000086, 0x00000089, 0x0004003d, 
+0x0000003c, 0x0000008e, 0x0000003e, 0x00050041, 
+0x00000007, 0x0000008f, 0x0000007f, 0x00000042, 
+0x0004003d, 0x00000006, 0x00000090, 0x0000008f, 
+0x00060041, 0x0000004d, 0x00000091, 0x0000008d, 
+0x0000004c, 0x0000008e, 0x0003003e, 0x00000091, 
+0x00000090, 0x0004003d, 0x0000003c, 0x00000096, 
+0x0000003e, 0x00050041, 0x00000007, 0x00000098, 
+0x0000007f, 0x00000097, 0x0004003d, 0x00000006, 
+0x00000099, 0x00000098, 0x00060041, 0x0000004d, 
+0x0000009a, 0x00000095, 0x0000004c, 0x00000096, 
+0x0003003e, 0x0000009a, 0x00000099, 0x0004003d, 
+0x0000003c, 0x0000009f, 0x0000003e, 0x0004003d, 
+0x00000006, 0x000000a0, 0x00000086, 0x00060041, 
+0x0000004d, 0x000000a1, 0x0000009e, 0x0000004c, 
+0x0000009f, 0x0003003e, 0x000000a1, 0x000000a0, 
+0x0004003d, 0x0000003c, 0x000000a6, 0x0000003e, 
+0x0004003d, 0x00000006, 0x000000a8, 0x00000073, 
+0x0003003e, 0x000000a7, 0x000000a8, 0x0004003d, 
+0x00000006, 0x000000aa, 0x0000006b, 0x0003003e, 
+0x000000a9, 0x000000aa, 0x00060039, 0x00000006, 
+0x000000ab, 0x00000017, 0x000000a7, 0x000000a9, 
+0x00060041, 0x0000004d, 0x000000ac, 0x000000a5, 
+0x0000004c, 0x000000a6, 0x0003003e, 0x000000ac, 
+0x000000ab, 0x000100fd, 0x00010038, 0x00050036, 
+0x00000006, 0x0000000a, 0x00000000, 0x00000008, 
+0x00030037, 0x00000007, 0x00000009, 0x000200f8, 
+0x0000000b, 0x0004003d, 0x00000006, 0x0000001a, 
+0x00000009, 0x0004007f, 0x00000006, 0x0000001b, 
+0x0000001a, 0x0006000c, 0x00000006, 0x0000001c, 
+0x00000001, 0x0000001b, 0x0000001b, 0x00050081, 
+0x00000006, 0x0000001d, 0x00000019, 0x0000001c, 
+0x00050088, 0x00000006, 0x0000001e, 0x00000019, 
+0x0000001d, 0x000200fe, 0x0000001e, 0x00010038, 
+0x00050036, 0x00000006, 0x00000012, 0x00000000, 
+0x0000000e, 0x00030037, 0x0000000d, 0x0000000f, 
+0x00030037, 0x0000000d, 0x00000010, 0x00030037, 
+0x00000007, 0x00000011, 0x000200f8, 0x00000013, 
+0x0004003b, 0x00000007, 0x00000021, 0x00000007, 
+0x0004003b, 0x00000007, 0x00000027, 0x00000007, 
+0x0004003b, 0x00000007, 0x00000028, 0x00000007, 
+0x0004003d, 0x0000000c, 0x00000022, 0x00000010, 
+0x0004003d, 0x0000000c, 0x00000023, 0x0000000f, 
+0x00050094, 0x00000006, 0x00000024, 0x00000022, 
+0x00000023, 0x0004003d, 0x00000006, 0x00000025, 
+0x00000011, 0x00050081, 0x00000006, 0x00000026, 
+0x00000024, 0x00000025, 0x0003003e, 0x00000021, 
+0x00000026, 0x0004003d, 0x00000006, 0x00000029, 
+0x00000021, 0x0003003e, 0x00000028, 0x00000029, 
+0x00050039, 0x00000006, 0x0000002a, 0x0000000a, 
+0x00000028, 0x0003003e, 0x00000027, 0x0000002a, 
+0x0004003d, 0x00000006, 0x0000002b, 0x00000027, 
+0x000200fe, 0x0000002b, 0x00010038, 0x00050036, 
+0x00000006, 0x00000017, 0x00000000, 0x00000014, 
+0x00030037, 0x00000007, 0x00000015, 0x00030037, 
+0x00000007, 0x00000016, 0x000200f8, 0x00000018, 
+0x0004003d, 0x00000006, 0x0000002e, 0x00000016, 
+0x0004003d, 0x00000006, 0x0000002f, 0x00000015, 
+0x0006000c, 0x00000006, 0x00000030, 0x00000001, 
+0x0000001c, 0x0000002f, 0x00050085, 0x00000006, 
+0x00000031, 0x0000002e, 0x00000030, 0x0004003d, 
+0x00000006, 0x00000032, 0x00000016, 0x00050083, 
+0x00000006, 0x00000033, 0x00000019, 0x00000032, 
+0x0004003d, 0x00000006, 0x00000034, 0x00000015, 
+0x00050083, 0x00000006, 0x00000035, 0x00000019, 
+0x00000034, 0x0006000c, 0x00000006, 0x00000036, 
+0x00000001, 0x0000001c, 0x00000035, 0x00050085, 
+0x00000006, 0x00000037, 0x00000033, 0x00000036, 
+0x00050081, 0x00000006, 0x00000038, 0x00000031, 
+0x00000037, 0x0004007f, 0x00000006, 0x00000039, 
+0x00000038, 0x000200fe, 0x00000039, 0x00010038 };
+} // namespace kp
+
+
diff --git a/kompute/src/shaders/glsl/ShaderOpMult.comp b/kompute/src/shaders/glsl/ShaderOpMult.comp
new file mode 100644
index 0000000000000..d5486503760c1
--- /dev/null
+++ b/kompute/src/shaders/glsl/ShaderOpMult.comp
@@ -0,0 +1,28 @@
+#version 450
+
+layout(set = 0, binding = 0) buffer tensorLhs {
+   float valuesLhs[ ];
+};
+
+layout(set = 0, binding = 1) buffer tensorRhs {
+   float valuesRhs[ ];
+};
+
+layout(set = 0, binding = 2) buffer tensorOutput {
+   float valuesOutput[ ];
+};
+
+layout (constant_id = 0) const uint LEN_LHS = 0;
+layout (constant_id = 1) const uint LEN_RHS = 0;
+layout (constant_id = 2) const uint LEN_OUT = 0;
+
+layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+void main() 
+{
+	uint index = gl_GlobalInvocationID.x;
+
+    valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
+}
+
+
diff --git a/kompute/src/shaders/glsl/ShaderOpMult.hpp.in b/kompute/src/shaders/glsl/ShaderOpMult.hpp.in
new file mode 100644
index 0000000000000..5af29c66d1214
--- /dev/null
+++ b/kompute/src/shaders/glsl/ShaderOpMult.hpp.in
@@ -0,0 +1,101 @@
+#pragma once
+#include <array>
+#include <cstdint>
+
+namespace kp {
+const std::array<uint32_t, 366> SHADEROPMULT_COMP_SPV = { 
+0x07230203, 0x00010000, 0x0008000a, 0x0000002e, 
+0x00000000, 0x00020011, 0x00000001, 0x0006000b, 
+0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 
+0x00000000, 0x0003000e, 0x00000000, 0x00000001, 
+0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, 
+0x00000000, 0x0000000b, 0x00060010, 0x00000004, 
+0x00000011, 0x00000001, 0x00000001, 0x00000001, 
+0x00030003, 0x00000002, 0x000001c2, 0x00040005, 
+0x00000004, 0x6e69616d, 0x00000000, 0x00040005, 
+0x00000008, 0x65646e69, 0x00000078, 0x00080005, 
+0x0000000b, 0x475f6c67, 0x61626f6c, 0x766e496c, 
+0x7461636f, 0x496e6f69, 0x00000044, 0x00060005, 
+0x00000012, 0x736e6574, 0x754f726f, 0x74757074, 
+0x00000000, 0x00070006, 0x00000012, 0x00000000, 
+0x756c6176, 0x754f7365, 0x74757074, 0x00000000, 
+0x00030005, 0x00000014, 0x00000000, 0x00050005, 
+0x00000019, 0x736e6574, 0x684c726f, 0x00000073, 
+0x00060006, 0x00000019, 0x00000000, 0x756c6176, 
+0x684c7365, 0x00000073, 0x00030005, 0x0000001b, 
+0x00000000, 0x00050005, 0x00000021, 0x736e6574, 
+0x6852726f, 0x00000073, 0x00060006, 0x00000021, 
+0x00000000, 0x756c6176, 0x68527365, 0x00000073, 
+0x00030005, 0x00000023, 0x00000000, 0x00040005, 
+0x00000029, 0x5f4e454c, 0x0053484c, 0x00040005, 
+0x0000002a, 0x5f4e454c, 0x00534852, 0x00040005, 
+0x0000002b, 0x5f4e454c, 0x0054554f, 0x00040047, 
+0x0000000b, 0x0000000b, 0x0000001c, 0x00040047, 
+0x00000011, 0x00000006, 0x00000004, 0x00050048, 
+0x00000012, 0x00000000, 0x00000023, 0x00000000, 
+0x00030047, 0x00000012, 0x00000003, 0x00040047, 
+0x00000014, 0x00000022, 0x00000000, 0x00040047, 
+0x00000014, 0x00000021, 0x00000002, 0x00040047, 
+0x00000018, 0x00000006, 0x00000004, 0x00050048, 
+0x00000019, 0x00000000, 0x00000023, 0x00000000, 
+0x00030047, 0x00000019, 0x00000003, 0x00040047, 
+0x0000001b, 0x00000022, 0x00000000, 0x00040047, 
+0x0000001b, 0x00000021, 0x00000000, 0x00040047, 
+0x00000020, 0x00000006, 0x00000004, 0x00050048, 
+0x00000021, 0x00000000, 0x00000023, 0x00000000, 
+0x00030047, 0x00000021, 0x00000003, 0x00040047, 
+0x00000023, 0x00000022, 0x00000000, 0x00040047, 
+0x00000023, 0x00000021, 0x00000001, 0x00040047, 
+0x00000029, 0x00000001, 0x00000000, 0x00040047, 
+0x0000002a, 0x00000001, 0x00000001, 0x00040047, 
+0x0000002b, 0x00000001, 0x00000002, 0x00040047, 
+0x0000002d, 0x0000000b, 0x00000019, 0x00020013, 
+0x00000002, 0x00030021, 0x00000003, 0x00000002, 
+0x00040015, 0x00000006, 0x00000020, 0x00000000, 
+0x00040020, 0x00000007, 0x00000007, 0x00000006, 
+0x00040017, 0x00000009, 0x00000006, 0x00000003, 
+0x00040020, 0x0000000a, 0x00000001, 0x00000009, 
+0x0004003b, 0x0000000a, 0x0000000b, 0x00000001, 
+0x0004002b, 0x00000006, 0x0000000c, 0x00000000, 
+0x00040020, 0x0000000d, 0x00000001, 0x00000006, 
+0x00030016, 0x00000010, 0x00000020, 0x0003001d, 
+0x00000011, 0x00000010, 0x0003001e, 0x00000012, 
+0x00000011, 0x00040020, 0x00000013, 0x00000002, 
+0x00000012, 0x0004003b, 0x00000013, 0x00000014, 
+0x00000002, 0x00040015, 0x00000015, 0x00000020, 
+0x00000001, 0x0004002b, 0x00000015, 0x00000016, 
+0x00000000, 0x0003001d, 0x00000018, 0x00000010, 
+0x0003001e, 0x00000019, 0x00000018, 0x00040020, 
+0x0000001a, 0x00000002, 0x00000019, 0x0004003b, 
+0x0000001a, 0x0000001b, 0x00000002, 0x00040020, 
+0x0000001d, 0x00000002, 0x00000010, 0x0003001d, 
+0x00000020, 0x00000010, 0x0003001e, 0x00000021, 
+0x00000020, 0x00040020, 0x00000022, 0x00000002, 
+0x00000021, 0x0004003b, 0x00000022, 0x00000023, 
+0x00000002, 0x00040032, 0x00000006, 0x00000029, 
+0x00000000, 0x00040032, 0x00000006, 0x0000002a, 
+0x00000000, 0x00040032, 0x00000006, 0x0000002b, 
+0x00000000, 0x0004002b, 0x00000006, 0x0000002c, 
+0x00000001, 0x0006002c, 0x00000009, 0x0000002d, 
+0x0000002c, 0x0000002c, 0x0000002c, 0x00050036, 
+0x00000002, 0x00000004, 0x00000000, 0x00000003, 
+0x000200f8, 0x00000005, 0x0004003b, 0x00000007, 
+0x00000008, 0x00000007, 0x00050041, 0x0000000d, 
+0x0000000e, 0x0000000b, 0x0000000c, 0x0004003d, 
+0x00000006, 0x0000000f, 0x0000000e, 0x0003003e, 
+0x00000008, 0x0000000f, 0x0004003d, 0x00000006, 
+0x00000017, 0x00000008, 0x0004003d, 0x00000006, 
+0x0000001c, 0x00000008, 0x00060041, 0x0000001d, 
+0x0000001e, 0x0000001b, 0x00000016, 0x0000001c, 
+0x0004003d, 0x00000010, 0x0000001f, 0x0000001e, 
+0x0004003d, 0x00000006, 0x00000024, 0x00000008, 
+0x00060041, 0x0000001d, 0x00000025, 0x00000023, 
+0x00000016, 0x00000024, 0x0004003d, 0x00000010, 
+0x00000026, 0x00000025, 0x00050085, 0x00000010, 
+0x00000027, 0x0000001f, 0x00000026, 0x00060041, 
+0x0000001d, 0x00000028, 0x00000014, 0x00000016, 
+0x00000017, 0x0003003e, 0x00000028, 0x00000027, 
+0x000100fd, 0x00010038 };
+} // namespace kp
+
+
diff --git a/kompute/src/shaders/hlsl/computeheadless.comp b/kompute/src/shaders/hlsl/computeheadless.comp
new file mode 100644
index 0000000000000..ee3cd024f0466
--- /dev/null
+++ b/kompute/src/shaders/hlsl/computeheadless.comp
@@ -0,0 +1,29 @@
+// Copyright 2020 Google LLC
+
+RWStructuredBuffer<uint> values : register(u0);
+[[vk::constant_id(0)]] const uint BUFFER_ELEMENTS = 32;
+
+uint fibonacci(uint n) {
+	if(n <= 1){
+		return n;
+	}
+	uint curr = 1;
+	uint prev = 1;
+	for(uint i = 2; i < n; ++i) {
+		uint temp = curr;
+		curr += prev;
+		prev = temp;
+	}
+	return curr;
+}
+
+[numthreads(1, 1, 1)]
+void main(uint3 GlobalInvocationID : SV_DispatchThreadID)
+{
+	uint index = GlobalInvocationID.x;
+	if (index >= BUFFER_ELEMENTS)
+		return;
+	values[index] = fibonacci(values[index]);
+}
+
+
diff --git a/llama.cpp b/llama.cpp
index 6e23a0772325d..c835c6fd407d9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9,6 +9,8 @@
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
+#elif defined(GGML_USE_KOMPUTE)
+#   include "ggml-vulkan.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -1182,11 +1184,14 @@ struct llama_context {
 
 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
+#elif defined(GGML_USE_KOMPUTE)
+    ggml_kompute_context * ctx_kompute = NULL;
 #endif
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
+
 };
 
 //
@@ -2474,6 +2479,9 @@ static struct ggml_cgraph * llm_build_llama(
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
+#if defined(GGML_USE_KOMPUTE)
+    struct ggml_tensor * toDeviceTensor = nullptr;
+#endif
 
     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -2483,6 +2491,9 @@ static struct ggml_cgraph * llm_build_llama(
             memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inp_tokens;
+#endif
 
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
@@ -2491,6 +2502,9 @@ static struct ggml_cgraph * llm_build_llama(
 #endif
 
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inpL;
+#endif
 
         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
@@ -2693,7 +2707,6 @@ static struct ggml_cgraph * llm_build_llama(
                 offload_func(cur);
                 ggml_set_name(cur, "ffn_norm");
             }
-
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
@@ -2752,6 +2765,16 @@ static struct ggml_cgraph * llm_build_llama(
 
     ggml_free(ctx0);
 
+#if defined(GGML_USE_KOMPUTE)
+    if (lctx.ctx_kompute && N == 1) {
+        if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
+            ggml_vk_h2d_all(lctx.ctx_kompute);
+        } else {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
+        }
+    }
+#endif
+
     return gf;
 }
 
@@ -3792,6 +3815,17 @@ static bool llama_eval_internal(
     } else {
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
+#elif defined(GGML_USE_KOMPUTE)
+    if (lctx.ctx_kompute && N == 1) {
+        ggml_vk_graph_compute(lctx.ctx_kompute, gf);
+        ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
+    } else {
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+        if (lctx.ctx_kompute) {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v);
+        }
+    }
 #else
     ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
@@ -3833,12 +3867,12 @@ static bool llama_eval_internal(
     }
 
     // extract embeddings
-    if (!lctx.embedding.empty()) {
-        auto & embedding_out = lctx.embedding;
+    //if (!lctx.embedding.empty()) {
+    //    auto & embedding_out = lctx.embedding;
 
-        embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
-    }
+    //    embedding_out.resize(n_embd);
+    //    memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+    //}
 
     // measure the performance only for the single-token evals
     if (N == 1) {
@@ -5904,6 +5938,7 @@ static int llama_apply_lora_from_file_internal(
 ) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
+
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
diff --git a/llama.h b/llama.h
index 350268b9a94aa..3d911adcac948 100644
--- a/llama.h
+++ b/llama.h
@@ -42,7 +42,7 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
diff --git a/undump.py b/undump.py
new file mode 100644
index 0000000000000..db19ffe695dab
--- /dev/null
+++ b/undump.py
@@ -0,0 +1,18 @@
+import struct
+import numpy as np
+from pathlib import Path
+
+def undump(fn):
+    with open(fn, 'rb') as df:
+        dims = struct.unpack('=QQQQ', df.read(8*4))
+        (dsz,) = struct.unpack('=Q', df.read(8))
+        ## assume f32
+        data = df.read(dsz)
+        data = [i for (i,) in struct.iter_unpack('=f', data)]
+        return np.array(data).reshape(dims).squeeze()
+
+if __name__ == '__main__':
+    for dfn in sorted(Path('.').glob('*.dump')):
+        darr = undump(dfn)
+        print(f'{dfn}: {darr.shape}\n{darr}')
+

From 48a45ea435d091a8465d0b4daf5e9ebdcebf0802 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Wed, 30 Aug 2023 14:33:31 -0400
Subject: [PATCH 002/140] Remove warning which fails on windows.

---
 ggml-vulkan.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 32590d03ec1ab..9b5c01f68c212 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -49,10 +49,6 @@
 #include <immintrin.h>
 #include <kompute/Kompute.hpp>
 
-#ifndef __STDC_IEC_559__
-#warning Your C implementation does not seem to be IEC 559 compliant, which is required for proper Vulkan interop.
-#endif
-
 #define QK4_0 32
 #define QR4_0 2
 #define QK4_1 32

From 8563fa001f20f3d292778c39f4288bd6b06d2460 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Tue, 5 Sep 2023 13:42:27 -0700
Subject: [PATCH 003/140] remove dynamic deps from kompute build

should no longer have new external deps other than libvulkan

```
ubuntu@ip-172-31-1-24:~/repo/gpt4all/gpt4all-backend/build$ ldd ./libllamamodel-mainline-avxonly.so
        linux-vdso.so.1 (0x00007ffcb53bb000)
        libvulkan.so.1 => /lib/x86_64-linux-gnu/libvulkan.so.1 (0x00007f239dab5000)
        libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f239d800000)
        libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f239d719000)
        libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f239da95000)
        libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f239d400000)
        /lib64/ld-linux-x86-64.so.2 (0x00007f239dd1d000)
```
---
 kompute/src/CMakeLists.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
index f4f8440f4ffdb..5f02ce12c1f9d 100644
--- a/kompute/src/CMakeLists.txt
+++ b/kompute/src/CMakeLists.txt
@@ -8,7 +8,7 @@ endif()
 
 cmake_minimum_required(VERSION 3.20)
 
-add_library(kompute Algorithm.cpp
+add_library(kompute STATIC Algorithm.cpp
     Manager.cpp
     OpAlgoDispatch.cpp
     OpMemoryBarrier.cpp
@@ -27,7 +27,8 @@ add_library(kompute::kompute ALIAS kompute)
 set_target_properties(kompute
     PROPERTIES
     VERSION ${${PROJECT_NAME}_VERSION}
-    SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR})
+    SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR}
+    POSITION_INDEPENDENT_CODE TRUE)
 
 # Import GNU common install directory variables
 include(GNUInstallDirs)
@@ -56,12 +57,12 @@ if(KOMPUTE_OPT_ANDROID_BUILD)
         android
         kp_logger
         kp_shader
-        fmt::fmt)
+        fmt::fmt-header-only)
 else()
     target_link_libraries(kompute PUBLIC Vulkan::Vulkan
         kp_logger
         kp_shader
-        fmt::fmt)
+        fmt::fmt-header-only)
 endif()
 
 if(KOMPUTE_OPT_BUILD_PYTHON)

From 45c8778b49184c60946718dc67cdf935c0031585 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Tue, 12 Sep 2023 12:39:38 -0400
Subject: [PATCH 004/140] Switch to a dynamic dispatch table instead of linking
 hard against libvulkan.

---
 ggml-vulkan.cpp                         | 15 +++++++--------
 ggml-vulkan.h                           |  1 +
 kompute/CMakeLists.txt                  |  2 ++
 kompute/src/CMakeLists.txt              |  2 +-
 kompute/src/Core.cpp                    |  2 --
 kompute/src/Manager.cpp                 | 13 +++++++------
 kompute/src/include/kompute/Manager.hpp |  5 +++++
 7 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 9b5c01f68c212..055b1124d1715 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -123,21 +123,20 @@ static std::string ggml_vk_getVendorName(uint32_t vendorID) {
 }
 
 std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
-    std::vector<vk::PhysicalDevice> physicalDevices = mgr.listDevices();
-    uint32_t deviceCount = physicalDevices.size();
 
     std::vector<ggml_vk_device> results;
+    if (!mgr.hasVulkan())
+        return results;
+
+    std::vector<vk::PhysicalDevice> physicalDevices = mgr.listDevices();
+    uint32_t deviceCount = physicalDevices.size();
 
     if (deviceCount == 0)
         return results;
 
     for (uint32_t i = 0; i < deviceCount; i++) {
-        VkPhysicalDeviceProperties properties;
-        vkGetPhysicalDeviceProperties(physicalDevices.at(i), &properties);
-
-        VkPhysicalDeviceMemoryProperties memoryProperties;
-        vkGetPhysicalDeviceMemoryProperties(physicalDevices.at(i), &memoryProperties);
-
+        VkPhysicalDeviceProperties properties = physicalDevices.at(i).getProperties();
+        VkPhysicalDeviceMemoryProperties memoryProperties = physicalDevices.at(i).getMemoryProperties();
         const uint32_t major = VK_VERSION_MAJOR(properties.apiVersion);
         const uint32_t minor = VK_VERSION_MINOR(properties.apiVersion);
         if (major < 1 || minor < 2)
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index ad8b41e4d205e..d13ed41844b77 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -40,6 +40,7 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
 bool ggml_vk_init_device(size_t memoryRequired, const std::string &device);
 bool ggml_vk_init_device(const ggml_vk_device &device);
 bool ggml_vk_init_device(int device);
+bool ggml_vk_has_vulkan();
 bool ggml_vk_has_device();
 ggml_vk_device ggml_vk_current_device();
 struct ggml_kompute_context * ggml_vk_init(void);
diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt
index f89e13d1d7e6c..aa228653aa86e 100644
--- a/kompute/CMakeLists.txt
+++ b/kompute/CMakeLists.txt
@@ -158,6 +158,8 @@ else()
     find_package(fmt REQUIRED)
 endif()
 
+add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
+
 # ####################################################
 # Preprocessor Macros
 # ####################################################
diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
index 5f02ce12c1f9d..4179a81f27c9b 100644
--- a/kompute/src/CMakeLists.txt
+++ b/kompute/src/CMakeLists.txt
@@ -59,7 +59,7 @@ if(KOMPUTE_OPT_ANDROID_BUILD)
         kp_shader
         fmt::fmt-header-only)
 else()
-    target_link_libraries(kompute PUBLIC Vulkan::Vulkan
+    target_link_libraries(kompute PUBLIC
         kp_logger
         kp_shader
         fmt::fmt-header-only)
diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp
index 60849a3ecd940..9b0483232cda8 100644
--- a/kompute/src/Core.cpp
+++ b/kompute/src/Core.cpp
@@ -10,7 +10,6 @@
 
 #include "kompute/Core.hpp"
 
-#if VK_USE_PLATFORM_ANDROID_KHR
 #ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
 #define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
 /**
@@ -21,7 +20,6 @@
  **/
 VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
 #endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
-#endif // VK_USE_PLATFORM_ANDROID_KHR
 
 namespace kp {
 } // namespace kp
diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
index 07514ed9a10c2..2c86b6e104a3a 100644
--- a/kompute/src/Manager.cpp
+++ b/kompute/src/Manager.cpp
@@ -223,20 +223,21 @@ Manager::createInstance()
     }
 #endif
 
-#if VK_USE_PLATFORM_ANDROID_KHR
-    vk::DynamicLoader dl;
+    try {
+        mDynamicLoader = std::make_shared<vk::DynamicLoader>();
+    } catch (const std::exception & err) {
+        return;
+    }
+
     PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
-      dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
+      mDynamicLoader->getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
     VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
-#endif // VK_USE_PLATFORM_ANDROID_KHR
 
     this->mInstance = std::make_shared<vk::Instance>();
     vk::createInstance(
       &computeInstanceCreateInfo, nullptr, this->mInstance.get());
 
-#if VK_USE_PLATFORM_ANDROID_KHR
     VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance);
-#endif // VK_USE_PLATFORM_ANDROID_KHR
 
     KP_LOG_DEBUG("Kompute Manager Instance Created");
 
diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp
index 8fda58f84b909..42336f4e8e141 100644
--- a/kompute/src/include/kompute/Manager.hpp
+++ b/kompute/src/include/kompute/Manager.hpp
@@ -43,6 +43,10 @@ class Manager
         return this->mDevice.get();
     }
 
+    bool hasVulkan() const {
+        return this->mDynamicLoader.get();
+    }
+
     /**
      * Initialize a device.
      *
@@ -240,6 +244,7 @@ class Manager
     bool mFreeInstance = false;
     std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
     std::shared_ptr<vk::Device> mDevice = nullptr;
+    std::shared_ptr<vk::DynamicLoader> mDynamicLoader = nullptr;
     bool mFreeDevice = false;
 
     // -------------- ALWAYS OWNED RESOURCES

From b7e2e691d40ca0a6e8e1e1a9186e16eafde599ae Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Tue, 12 Sep 2023 13:04:55 -0400
Subject: [PATCH 005/140] Completely revamp how we do object management with
 the vulkan backend and stop using so many static objects so we can tear down
 and bring up vulkan on new devices in the same runtime.

---
 ggml-vulkan.cpp                           | 185 +++++++++++++---------
 ggml-vulkan.h                             |   1 +
 kompute/src/Algorithm.cpp                 |  26 +--
 kompute/src/Manager.cpp                   |  41 +++--
 kompute/src/include/kompute/Algorithm.hpp |   5 +-
 kompute/src/include/kompute/Manager.hpp   |  27 +++-
 6 files changed, 172 insertions(+), 113 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 055b1124d1715..89de70fa4fdb5 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -65,9 +65,21 @@ struct ggml_kompute_context {
     }
 };
 
+// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
+// and consolidate the init functions and simplify object lifetime management. As it currently stands,
+// we *have* to have the kompute manager no matter what for device discovery, but the kompute context
+// is only created when a device is set and vulkan is explicitly turned on.
 ggml_kompute_context *ggml_kompute_context::instance;
-
-kp::Manager mgr;
+kp::Manager *komputeManager() {
+    static kp::Manager *s_mgr = nullptr;
+    if (s_mgr && !s_mgr->hasInstance()) {
+        delete s_mgr;
+        s_mgr = nullptr;
+    }
+    if (!s_mgr)
+        s_mgr = new kp::Manager;
+    return s_mgr;
+}
 
 #ifdef __linux__
 __attribute__((constructor))
@@ -123,12 +135,11 @@ static std::string ggml_vk_getVendorName(uint32_t vendorID) {
 }
 
 std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
-
     std::vector<ggml_vk_device> results;
-    if (!mgr.hasVulkan())
+    if (!komputeManager()->hasVulkan())
         return results;
 
-    std::vector<vk::PhysicalDevice> physicalDevices = mgr.listDevices();
+    std::vector<vk::PhysicalDevice> physicalDevices = komputeManager()->listDevices();
     uint32_t deviceCount = physicalDevices.size();
 
     if (deviceCount == 0)
@@ -228,22 +239,33 @@ bool ggml_vk_init_device(const ggml_vk_device &device) {
 }
 
 bool ggml_vk_init_device(int device) {
-    mgr.initializeDevice(device, {},
+    komputeManager()->initializeDevice(device, {},
                          {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
                           "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
     return ggml_vk_has_device();
 }
 
+bool ggml_vk_free_device() {
+    if (!ggml_vk_has_device())
+        return false;
+    komputeManager()->destroy();
+    return true;
+}
+
+bool ggml_vk_has_vulkan() {
+    return komputeManager()->hasVulkan();
+}
+
 bool ggml_vk_has_device() {
-    return mgr.hasDevice();
+    return komputeManager()->hasDevice();
 }
 
 ggml_vk_device ggml_vk_current_device() {
-    if (!mgr.hasDevice())
+    if (!komputeManager()->hasDevice())
         return ggml_vk_device();
 
     std::vector<ggml_vk_device> devices = ggml_vk_available_devices(0);
-    ggml_vk_filterByName(devices, mgr.physicalDevice()->getProperties().deviceName);
+    ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName);
     return devices.front();
 }
 
@@ -275,7 +297,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
       descriptorPoolSizes.data());
 
     ctx->pool = std::make_shared<vk::DescriptorPool>();
-    vk::Result r = mgr.device()->createDescriptorPool(
+    vk::Result r = komputeManager()->device()->createDescriptorPool(
       &descriptorPoolInfo, nullptr, ctx->pool.get());
     if (r != vk::Result::eSuccess)
         std::cerr << "Error allocating descriptor pool" << vk::to_string(r);
@@ -284,7 +306,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
 static
 void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
     if (ctx->pool) {
-        mgr.device()->destroy(
+        komputeManager()->device()->destroy(
           *ctx->pool,
           (vk::Optional<const vk::AllocationCallbacks>)nullptr);
         ctx->pool = nullptr;
@@ -301,7 +323,7 @@ vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
     bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;
 
     vk::Buffer *vkBuffer = new vk::Buffer;
-    vk::Result r = mgr.device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
+    vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
     if (r != vk::Result::eSuccess)
         std::cerr << "Error allocating buffer" << vk::to_string(r);
     return vkBuffer;
@@ -312,7 +334,7 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v
 
     uint32_t memoryTypeIndex = -1;
     bool memoryTypeIndexFound = false;
-    vk::PhysicalDeviceMemoryProperties memoryProperties = mgr.physicalDevice()->getMemoryProperties();
+    vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties();
     for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
         if (requirements.memoryTypeBits & (1 << i)) {
             if (((memoryProperties.memoryTypes[i]).propertyFlags &
@@ -335,7 +357,7 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v
     allocInfo.allocationSize = size;
     allocInfo.memoryTypeIndex = memoryTypeIndex;
     vk::DeviceMemory *vkDeviceMemory =  new vk::DeviceMemory;
-    vk::Result r = mgr.device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
+    vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
     if (r != vk::Result::eSuccess)
         std::cerr << "Error allocating memory" << vk::to_string(r);
     return vkDeviceMemory;
@@ -346,7 +368,7 @@ size_t ggml_vk_aligned_offset(size_t offset) {
     static size_t minStorageBufferOffsetAlignment = 0;
     if (minStorageBufferOffsetAlignment == 0) {
         vk::PhysicalDeviceProperties deviceProperties;
-        deviceProperties = mgr.physicalDevice()->getProperties();
+        deviceProperties = komputeManager()->physicalDevice()->getProperties();
         vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits;
         minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment;
     }
@@ -362,12 +384,12 @@ size_t ggml_vk_aligned_offset(size_t offset) {
 
 static void ggml_vk_h2d_buffer(const ggml_vk_memory &memory) {
     if (memory.stagingBuffer)
-        mgr.sequence()->eval<kp::OpBufferSyncDevice>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
+        komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
 }
 
 static void ggml_vk_d2h_buffer(const ggml_vk_memory &memory) {
     if (memory.stagingBuffer)
-        mgr.sequence()->eval<kp::OpBufferSyncLocal>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
+        komputeManager()->sequence()->eval<kp::OpBufferSyncLocal>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
 }
 
 ggml_vk_memory ggml_vk_allocate(size_t size) {
@@ -375,12 +397,12 @@ ggml_vk_memory ggml_vk_allocate(size_t size) {
     bool isHostVisible = false;
     {
         memory.primaryBuffer = ggml_vk_allocate_buffer(size);
-        vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.primaryBuffer);
+        vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer);
         vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
         memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
-        mgr.device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
+        komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
         if (isHostVisible) {
-            vk::Result r = mgr.device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
+            vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
             if (r != vk::Result::eSuccess)
                 std::cerr << "Error mapping memory" << vk::to_string(r);
         }
@@ -388,13 +410,13 @@ ggml_vk_memory ggml_vk_allocate(size_t size) {
 
     if (!isHostVisible) {
         memory.stagingBuffer = ggml_vk_allocate_buffer(size);
-        vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.stagingBuffer);
+        vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer);
         vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
                                                       vk::MemoryPropertyFlagBits::eHostCoherent |
                                                       vk::MemoryPropertyFlagBits::eHostCached;
         memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
-        mgr.device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
-        vk::Result r = mgr.device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
+        komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
+        vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
         if (r != vk::Result::eSuccess)
             std::cerr << "Error mapping memory" << vk::to_string(r);
     }
@@ -405,19 +427,19 @@ ggml_vk_memory ggml_vk_allocate(size_t size) {
 
 void ggml_vk_free_memory(ggml_vk_memory &memory)
 {
-    mgr.device()->destroy(
+    komputeManager()->device()->destroy(
       *memory.primaryBuffer,
       (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     if (memory.stagingBuffer) {
-        mgr.device()->destroy(
+        komputeManager()->device()->destroy(
           *memory.stagingBuffer,
           (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
-    mgr.device()->freeMemory(
+    komputeManager()->device()->freeMemory(
       *memory.primaryMemory,
       (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     if (memory.stagingMemory) {
-        mgr.device()->freeMemory(
+        komputeManager()->device()->freeMemory(
           *memory.stagingMemory,
           (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
@@ -457,7 +479,7 @@ const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context
         nbytes += *alignedOffset;
     }
 
-    return mgr.tensor(
+    return komputeManager()->tensor(
         t->data,
         nelements,
         nbytes, kp::Tensor::TensorDataTypes::eFloat,
@@ -476,7 +498,7 @@ void ggml_vk_add_buffer(
 void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
     const auto res = ggml_vk_get_tensor(ctx, t, nullptr);
     GGML_ASSERT(res);
-    mgr.sequence()->eval<kp::OpTensorSyncDevice>({res});
+    komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res});
 }
 
 void ggml_vk_h2d_all(struct ggml_kompute_context * ctx) {
@@ -496,7 +518,7 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
     const auto res = ggml_vk_get_tensor(ctx, t, nullptr);
 
     GGML_ASSERT(res);
-    mgr.sequence()->eval<kp::OpTensorSyncLocal>({res});
+    komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
 }
 
 std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
@@ -537,10 +559,11 @@ void ggml_vk_add(kp::Sequence& seq,
         safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4)
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -567,10 +590,11 @@ void ggml_vk_addrow(kp::Sequence& seq,
         row
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -595,10 +619,11 @@ void ggml_vk_mul(kp::Sequence& seq,
         safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4)
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -625,10 +650,11 @@ void ggml_vk_mulrow(kp::Sequence& seq,
         row
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -653,10 +679,11 @@ void ggml_vk_scale(kp::Sequence& seq,
         scale
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -676,10 +703,11 @@ void ggml_vk_xxlu(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
         safe_divide(inOff, 4), safe_divide(outOff, 4),
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -729,10 +757,11 @@ void ggml_vk_soft_max(kp::Sequence& seq,
         ne00, ne01, ne02
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -761,10 +790,11 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
         (uint32_t)ne00, (uint32_t)nb01, epsilon
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({(uint32_t)nrows});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -808,10 +838,11 @@ void ggml_vk_diag_mask_inf(kp::Sequence& seq,
         ne00, ne01
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -844,10 +875,11 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
         ne00, nb01, nb02, nb11, nb12, ne0, ne1,
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -871,10 +903,11 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
         ne00, ne10, ne0,
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -921,10 +954,11 @@ void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
         ne00, nb01, nb1
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -996,10 +1030,11 @@ void ggml_vk_rope(kp::Sequence& seq,
         nb0, nb1, nb2, nb3
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -1032,10 +1067,14 @@ void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
         nb0, nb1, nb2, nb3
     };
 
-    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!s_algo)
-        s_algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    static std::string unique_name = std::string(__func__) +
+                                     "_i_" + std::to_string(in_element_size) +
+                                     "_o_" + std::to_string(out_element_size);
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(unique_name))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(unique_name, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     else {
+        s_algo = komputeManager()->getAlgorithm(unique_name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -1082,7 +1121,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
     std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);
 
     for (auto& sequence : sequences) {
-        sequence = mgr.sequence();
+        sequence = komputeManager()->sequence();
     }
     for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
         const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index d13ed41844b77..e1d20e3885c71 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -40,6 +40,7 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
 bool ggml_vk_init_device(size_t memoryRequired, const std::string &device);
 bool ggml_vk_init_device(const ggml_vk_device &device);
 bool ggml_vk_init_device(int device);
+bool ggml_vk_free_device();
 bool ggml_vk_has_vulkan();
 bool ggml_vk_has_device();
 ggml_vk_device ggml_vk_current_device();
diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp
index 9c41ec90f1f24..ea81fd97b1a6f 100644
--- a/kompute/src/Algorithm.cpp
+++ b/kompute/src/Algorithm.cpp
@@ -58,18 +58,6 @@ Algorithm::destroy()
         this->mPipeline = nullptr;
     }
 
-    if (this->mFreePipelineCache && this->mPipelineCache) {
-        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline cache");
-        if (!this->mPipelineCache) {
-            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
-                        "pipeline cache but it is null");
-        }
-        this->mDevice->destroy(
-          *this->mPipelineCache,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mPipelineCache = nullptr;
-    }
-
     if (this->mFreePipelineLayout && this->mPipelineLayout) {
         KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout");
         if (!this->mPipelineLayout) {
@@ -317,16 +305,6 @@ Algorithm::createPipeline()
       "main",
       &specializationInfo);
 
-    static std::shared_ptr<vk::PipelineCache> globalPipelineCache = std::make_shared<vk::PipelineCache>();
-    if(!*globalPipelineCache) {
-       vk::PipelineCacheCreateInfo pipelineCacheInfo =
-         vk::PipelineCacheCreateInfo();
-      this->mPipelineCache = globalPipelineCache;
-      this->mFreePipelineCache = true;
-      this->mDevice->createPipelineCache(
-        &pipelineCacheInfo, nullptr, globalPipelineCache.get());
-    }
-
     vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(),
                                                shaderStage,
                                                *this->mPipelineLayout,
@@ -335,7 +313,7 @@ Algorithm::createPipeline()
 
 #ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
     vk::ResultValue<vk::Pipeline> pipelineResult =
-      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo);
+      this->mDevice->createComputePipeline(*mPipelineCache, pipelineInfo);
 
     if (pipelineResult.result != vk::Result::eSuccess) {
         throw std::runtime_error("Failed to create pipeline result: " +
@@ -347,7 +325,7 @@ Algorithm::createPipeline()
     this->mFreePipeline = true;
 #else
     vk::Pipeline pipeline =
-      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo)
+      this->mDevice->createComputePipeline(*mPipelineCache, pipelineInfo)
         .value;
     this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
     this->mFreePipeline = true;
diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
index 2c86b6e104a3a..2a02b7b10e146 100644
--- a/kompute/src/Manager.cpp
+++ b/kompute/src/Manager.cpp
@@ -88,15 +88,14 @@ Manager::destroy()
         this->mManagedSequences.clear();
     }
 
-    if (this->mManageResources && this->mManagedAlgorithms.size()) {
+    if (this->mManageResources && !this->mManagedAlgorithmsMap.empty()) {
         KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms");
-        for (const std::weak_ptr<Algorithm>& weakAlgorithm :
-             this->mManagedAlgorithms) {
-            if (std::shared_ptr<Algorithm> algorithm = weakAlgorithm.lock()) {
+        for (const auto& kv : this->mManagedAlgorithmsMap) {
+            if (std::shared_ptr<Algorithm> algorithm = kv.second) {
                 algorithm->destroy();
             }
         }
-        this->mManagedAlgorithms.clear();
+        this->mManagedAlgorithmsMap.clear();
     }
 
     if (this->mManageResources && this->mManagedTensors.size()) {
@@ -109,6 +108,18 @@ Manager::destroy()
         this->mManagedTensors.clear();
     }
 
+    if (this->mPipelineCache) {
+        KP_LOG_DEBUG("Kompute Manager Destroying pipeline cache");
+        if (!this->mPipelineCache) {
+            KP_LOG_WARN("Kompute Manager Error requested to destroy "
+                        "pipeline cache but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mPipelineCache,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mPipelineCache = nullptr;
+    }
+
     if (this->mFreeDevice) {
         KP_LOG_INFO("Destroying device");
         this->mDevice->destroy(
@@ -269,12 +280,14 @@ Manager::clear()
                          end(this->mManagedTensors),
                          [](std::weak_ptr<Tensor> t) { return t.expired(); }),
           end(this->mManagedTensors));
-        this->mManagedAlgorithms.erase(
-          std::remove_if(
-            begin(this->mManagedAlgorithms),
-            end(this->mManagedAlgorithms),
-            [](std::weak_ptr<Algorithm> t) { return t.expired(); }),
-          end(this->mManagedAlgorithms));
+        for (auto it = this->mManagedAlgorithmsMap.begin();
+             it != this->mManagedAlgorithmsMap.end();) {
+            if (it->second) {
+                it = this->mManagedAlgorithmsMap.erase(it);
+            } else {
+                ++it;
+            }
+        }
         this->mManagedSequences.erase(
           std::remove_if(begin(this->mManagedSequences),
                          end(this->mManagedSequences),
@@ -452,6 +465,12 @@ Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
     }
 
     KP_LOG_DEBUG("Kompute Manager compute queue obtained");
+
+    mPipelineCache = std::make_shared<vk::PipelineCache>();
+    vk::PipelineCacheCreateInfo pipelineCacheInfo =
+        vk::PipelineCacheCreateInfo();
+    this->mDevice->createPipelineCache(
+        &pipelineCacheInfo, nullptr, mPipelineCache.get());
 }
 
 std::shared_ptr<Sequence>
diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp
index 90fe48fef8637..ef11234eeb621 100644
--- a/kompute/src/include/kompute/Algorithm.hpp
+++ b/kompute/src/include/kompute/Algorithm.hpp
@@ -45,6 +45,7 @@ class Algorithm
      */
     template<typename S = float, typename P = float>
     Algorithm(std::shared_ptr<vk::Device> device,
+              vk::PipelineCache *pipelineCache,
               vk::DescriptorPool *pool,
               const std::vector<std::shared_ptr<Tensor>>& tensors = {},
               const std::vector<uint32_t>& spirv = {},
@@ -55,6 +56,7 @@ class Algorithm
         KP_LOG_DEBUG("Kompute Algorithm Constructor with device");
 
         this->mDevice = device;
+        this->mPipelineCache = pipelineCache;
         this->mDescriptorPool = pool;
 
         if (tensors.size() && spirv.size()) {
@@ -310,8 +312,7 @@ class Algorithm
     bool mFreeShaderModule = false;
     std::shared_ptr<vk::PipelineLayout> mPipelineLayout;
     bool mFreePipelineLayout = false;
-    std::shared_ptr<vk::PipelineCache> mPipelineCache;
-    bool mFreePipelineCache = false;
+    vk::PipelineCache *mPipelineCache = nullptr;
     std::shared_ptr<vk::Pipeline> mPipeline;
     bool mFreePipeline = false;
 
diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp
index 42336f4e8e141..e910b2b81838c 100644
--- a/kompute/src/include/kompute/Manager.hpp
+++ b/kompute/src/include/kompute/Manager.hpp
@@ -39,6 +39,10 @@ class Manager
      */
     ~Manager();
 
+    bool hasInstance() const {
+        return this->mInstance.get();
+    }
+
     bool hasDevice() const {
         return this->mDevice.get();
     }
@@ -149,6 +153,7 @@ class Manager
      * @returns Shared pointer with initialised algorithm
      */
     std::shared_ptr<Algorithm> algorithm(
+      const std::string &name,
       vk::DescriptorPool *pool,
       const std::vector<std::shared_ptr<Tensor>>& tensors = {},
       const std::vector<uint32_t>& spirv = {},
@@ -157,7 +162,7 @@ class Manager
       const std::vector<float>& pushConstants = {})
     {
         return this->algorithm<>(
-          pool, tensors, spirv, workgroup, specializationConstants, pushConstants);
+          name, pool, tensors, spirv, workgroup, specializationConstants, pushConstants);
     }
 
     /**
@@ -176,6 +181,7 @@ class Manager
      */
     template<typename S = float, typename P = float>
     std::shared_ptr<Algorithm> algorithm(
+      const std::string &name,
       vk::DescriptorPool *pool,
       const std::vector<std::shared_ptr<Tensor>>& tensors,
       const std::vector<uint32_t>& spirv,
@@ -188,6 +194,7 @@ class Manager
 
         std::shared_ptr<Algorithm> algorithm{ new kp::Algorithm(
           this->mDevice,
+          mPipelineCache.get(),
           pool,
           tensors,
           spirv,
@@ -196,12 +203,24 @@ class Manager
           pushConstants) };
 
         if (this->mManageResources) {
-            this->mManagedAlgorithms.push_back(algorithm);
+            this->mManagedAlgorithmsMap.insert({name, algorithm});
         }
 
         return algorithm;
     }
 
+    bool hasAlgorithm(const std::string &name) const {
+        return mManagedAlgorithmsMap.find(name) != mManagedAlgorithmsMap.end();
+    }
+
+    std::shared_ptr<Algorithm> getAlgorithm(const std::string &name) const {
+        auto it = mManagedAlgorithmsMap.find(name);
+        if (it != mManagedAlgorithmsMap.end()) {
+            return it->second;
+        }
+        return nullptr;
+    }
+
     /**
      * Destroy the GPU resources and all managed resources by manager.
      **/
@@ -237,6 +256,7 @@ class Manager
 
     std::shared_ptr<vk::Device> device() const { return mDevice; }
     std::shared_ptr<vk::PhysicalDevice> physicalDevice() const { return mPhysicalDevice; }
+    std::shared_ptr<vk::PipelineCache> pipelineCache() const { return mPipelineCache; }
 
   private:
     // -------------- OPTIONALLY OWNED RESOURCES
@@ -250,10 +270,11 @@ class Manager
     // -------------- ALWAYS OWNED RESOURCES
     std::vector<std::weak_ptr<Tensor>> mManagedTensors;
     std::vector<std::weak_ptr<Sequence>> mManagedSequences;
-    std::vector<std::weak_ptr<Algorithm>> mManagedAlgorithms;
+    std::unordered_map<std::string, std::shared_ptr<Algorithm>> mManagedAlgorithmsMap;
 
     std::vector<uint32_t> mComputeQueueFamilyIndices;
     std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
+    std::shared_ptr<vk::PipelineCache> mPipelineCache;
 
     bool mManageResources = false;
 

From beee57266f701ac75d41c25198e05a7d40a6dfd5 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Tue, 12 Sep 2023 12:36:13 -0700
Subject: [PATCH 006/140] Make kompute actually include external SDK headers
 when requested

---
 kompute/src/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
index 4179a81f27c9b..329f9bf93818a 100644
--- a/kompute/src/CMakeLists.txt
+++ b/kompute/src/CMakeLists.txt
@@ -73,6 +73,8 @@ endif()
 
 if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
     target_link_libraries(kompute PUBLIC Vulkan-Headers)
+else()
+    target_link_libraries(kompute PUBLIC Vulkan::Headers)
 endif()
 
 # ####################################################

From 68cf1df6fba8a6f0ef4a8751133ac37b0963dd30 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Wed, 13 Sep 2023 10:32:43 -0400
Subject: [PATCH 007/140] Throw an exception when allocation fails for vulkan.

---
 ggml-vulkan.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 89de70fa4fdb5..c7bb3ed2bdb92 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -325,7 +325,7 @@ vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
     vk::Buffer *vkBuffer = new vk::Buffer;
     vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
     if (r != vk::Result::eSuccess)
-        std::cerr << "Error allocating buffer" << vk::to_string(r);
+        std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl;
     return vkBuffer;
 }
 
@@ -358,8 +358,10 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v
     allocInfo.memoryTypeIndex = memoryTypeIndex;
     vk::DeviceMemory *vkDeviceMemory =  new vk::DeviceMemory;
     vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
-    if (r != vk::Result::eSuccess)
-        std::cerr << "Error allocating memory" << vk::to_string(r);
+    if (r != vk::Result::eSuccess) {
+        std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl;
+        throw std::runtime_error("Error allocating vulkan memory.");
+    }
     return vkDeviceMemory;
 }
 

From 8bea7198792206f283a652bafe5b73686490ce01 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Wed, 13 Sep 2023 09:51:40 -0700
Subject: [PATCH 008/140] vulkan: disambiguate gpus with the same name

---
 ggml-vulkan.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index c7bb3ed2bdb92..378f1d6e67394 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -145,6 +145,8 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
     if (deviceCount == 0)
         return results;
 
+    std::unordered_map<std::string, size_t> count_by_name;
+
     for (uint32_t i = 0; i < deviceCount; i++) {
         VkPhysicalDeviceProperties properties = physicalDevices.at(i).getProperties();
         VkPhysicalDeviceMemoryProperties memoryProperties = physicalDevices.at(i).getMemoryProperties();
@@ -173,6 +175,10 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
         d.type = properties.deviceType;
         d.heapSize = heapSize;
         d.name = properties.deviceName;
+        size_t n_idx = ++count_by_name[d.name];
+        if (n_idx > 1) {
+            d.name += " (" + std::to_string(n_idx) + ")";
+        }
         d.vendor = ggml_vk_getVendorName(properties.vendorID);
         results.push_back(d);
     }

From bd5f6399bb7ae8068e83895356f125a6d8ee513b Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Wed, 13 Sep 2023 17:04:47 -0400
Subject: [PATCH 009/140] Don't try and install kompute artifacts.

---
 kompute/src/CMakeLists.txt         | 4 ++--
 kompute/src/include/CMakeLists.txt | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
index 329f9bf93818a..b5c3879afaba6 100644
--- a/kompute/src/CMakeLists.txt
+++ b/kompute/src/CMakeLists.txt
@@ -46,8 +46,8 @@ configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in
     "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake"
     INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
 
-install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake
-    ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
+#install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake
+#    ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
 
 # ####################################################
 # Linking
diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt
index 05e1ed5e15532..313f4831191bd 100644
--- a/kompute/src/include/CMakeLists.txt
+++ b/kompute/src/include/CMakeLists.txt
@@ -29,7 +29,7 @@ target_sources(kompute PRIVATE
     kompute/logger/Logger.hpp
 )
 
-install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+#install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
 # ####################################################
 # Logger
@@ -43,4 +43,4 @@ target_sources(kp_logger PRIVATE
     kompute/logger/Logger.hpp
 )
 
-install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
\ No newline at end of file
+#install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
\ No newline at end of file

From 4ed25b2f88e49b48677c100c03cc3d7159782075 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Wed, 13 Sep 2023 20:47:40 -0400
Subject: [PATCH 010/140] Sync from device back to host at begin of new prompt.

---
 llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index c835c6fd407d9..45db293be5131 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3820,6 +3820,10 @@ static bool llama_eval_internal(
         ggml_vk_graph_compute(lctx.ctx_kompute, gf);
         ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
     } else {
+        if (lctx.ctx_kompute) {
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
+        }
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
         if (lctx.ctx_kompute) {
             ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);

From 68aca6be08f05e6a3b66f58fd3c6eb69a0bbb0ca Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Thu, 14 Sep 2023 09:58:28 -0400
Subject: [PATCH 011/140] Only use vulkan with known quant that work.

---
 ggml-vulkan.cpp | 72 +++++++++++++++++++++++++------------------------
 ggml-vulkan.h   |  1 +
 2 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 378f1d6e67394..36cf0b8ae940e 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -54,22 +54,17 @@
 #define QK4_1 32
 
 typedef ggml_fp16_t half;
-
 struct ggml_kompute_context {
     bool hasH2DAll = false;
     std::vector<ggml_vk_memory> buffers;
     std::shared_ptr<vk::DescriptorPool> pool;
-    static ggml_kompute_context *instance;
-    ggml_kompute_context() {
-        instance = this;
-    }
 };
 
 // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
 // and consolidate the init functions and simplify object lifetime management. As it currently stands,
 // we *have* to have the kompute manager no matter what for device discovery, but the kompute context
 // is only created when a device is set and vulkan is explicitly turned on.
-ggml_kompute_context *ggml_kompute_context::instance;
+ggml_kompute_context *s_kompute_context = nullptr;
 kp::Manager *komputeManager() {
     static kp::Manager *s_mgr = nullptr;
     if (s_mgr && !s_mgr->hasInstance()) {
@@ -266,6 +261,10 @@ bool ggml_vk_has_device() {
     return komputeManager()->hasDevice();
 }
 
+bool ggml_vk_using_vulkan() {
+    return s_kompute_context != nullptr;
+}
+
 ggml_vk_device ggml_vk_current_device() {
     if (!komputeManager()->hasDevice())
         return ggml_vk_device();
@@ -276,7 +275,8 @@ ggml_vk_device ggml_vk_current_device() {
 }
 
 ggml_kompute_context *ggml_vk_init() {
-    return new ggml_kompute_context;
+    s_kompute_context = new ggml_kompute_context;
+    return s_kompute_context;
 }
 
 bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) {
@@ -284,6 +284,8 @@ bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) {
 }
 
 void ggml_vk_free(struct ggml_kompute_context * ctx) {
+    assert(ctx == s_kompute_context);
+    s_kompute_context = nullptr;
     delete ctx;
 }
 
@@ -569,13 +571,13 @@ void ggml_vk_add(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -600,13 +602,13 @@ void ggml_vk_addrow(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -629,13 +631,13 @@ void ggml_vk_mul(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -660,13 +662,13 @@ void ggml_vk_mulrow(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -689,13 +691,13 @@ void ggml_vk_scale(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -713,13 +715,13 @@ void ggml_vk_xxlu(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -767,13 +769,13 @@ void ggml_vk_soft_max(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -800,13 +802,13 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({(uint32_t)nrows});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -848,13 +850,13 @@ void ggml_vk_diag_mask_inf(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -885,13 +887,13 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -913,13 +915,13 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -964,13 +966,13 @@ void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -1040,13 +1042,13 @@ void ggml_vk_rope(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
@@ -1080,13 +1082,13 @@ void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
                                      "_o_" + std::to_string(out_element_size);
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(unique_name))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(unique_name, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(unique_name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     else {
         s_algo = komputeManager()->getAlgorithm(unique_name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index e1d20e3885c71..614959ba86392 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -43,6 +43,7 @@ bool ggml_vk_init_device(int device);
 bool ggml_vk_free_device();
 bool ggml_vk_has_vulkan();
 bool ggml_vk_has_device();
+bool ggml_vk_using_vulkan();
 ggml_vk_device ggml_vk_current_device();
 struct ggml_kompute_context * ggml_vk_init(void);
 bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx);

From addac252939a6e03e6d2b9fe8f840b5da66c89d4 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Thu, 14 Sep 2023 16:38:28 -0400
Subject: [PATCH 012/140] Set the singleton to nullptr here.

---
 ggml-vulkan.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 36cf0b8ae940e..a008ed3fb4c4a 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -250,6 +250,10 @@ bool ggml_vk_free_device() {
     if (!ggml_vk_has_device())
         return false;
     komputeManager()->destroy();
+    // FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact
+    // the llama_free(ctx) destroys this memory and we just set the singleton to nullptr here which
+    // is very brittle
+    s_kompute_context = nullptr;
     return true;
 }
 

From 2c24d67e7b78e07390c247340f67300523033194 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Sat, 16 Sep 2023 12:17:29 -0400
Subject: [PATCH 013/140] Don't crash on available devices if we can't even
 create an instance.

---
 ggml-vulkan.cpp         | 2 +-
 kompute/src/Manager.cpp | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index a008ed3fb4c4a..c64fde83230f7 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -131,7 +131,7 @@ static std::string ggml_vk_getVendorName(uint32_t vendorID) {
 
 std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
     std::vector<ggml_vk_device> results;
-    if (!komputeManager()->hasVulkan())
+    if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
         return results;
 
     std::vector<vk::PhysicalDevice> physicalDevices = komputeManager()->listDevices();
diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
index 2a02b7b10e146..2a3ad2cc9ee45 100644
--- a/kompute/src/Manager.cpp
+++ b/kompute/src/Manager.cpp
@@ -245,8 +245,15 @@ Manager::createInstance()
     VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
 
     this->mInstance = std::make_shared<vk::Instance>();
-    vk::createInstance(
+    vk::Result r = vk::createInstance(
       &computeInstanceCreateInfo, nullptr, this->mInstance.get());
+    if (r != vk::Result::eSuccess) {
+        KP_LOG_ERROR(
+          "Kompute Manager Error allocating vulkan instance", vk::to_string(r));
+        this->mInstance = nullptr;
+        this->mFreeInstance = false;
+        return;
+    }
 
     VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance);
 

From 1b1416d7b73f4e857ed931eac7445d259b861fb2 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Thu, 21 Sep 2023 12:39:33 -0400
Subject: [PATCH 014/140] Support for gguf.

---
 llama.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 45db293be5131..e8ca52d5fd07f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -514,6 +514,9 @@ static std::string llama_format_win_err(DWORD err) {
 struct llama_buffer {
     void * data = NULL;
     size_t size = 0;
+#if defined(GGML_USE_KOMPUTE)
+    ggml_vk_memory memory;
+#endif
 
     // fallback to malloc / free
     // useful in cases where CUDA can try to allocate PINNED memory
@@ -522,6 +525,14 @@ struct llama_buffer {
     void resize(size_t n) {
         llama_host_free(data);
 
+#if defined(GGML_USE_KOMPUTE)
+        if (ggml_vk_has_device()) {
+            this->memory = ggml_vk_allocate(n);
+            this->data = (uint8_t*)memory.data;
+            this->size = n;
+            return;
+        }
+#endif
         data = llama_host_malloc(n);
         if (!data) {
             fallback = true;
@@ -536,6 +547,13 @@ struct llama_buffer {
 
     ~llama_buffer() {
         if (data) {
+#if defined(GGML_USE_KOMPUTE)
+            if (ggml_vk_has_device()) {
+                ggml_vk_free_memory(memory);
+                data = NULL;
+                return;
+            }
+#endif
             if (fallback) { // NOLINT
                 free(data);
             } else {
@@ -1398,6 +1416,9 @@ struct llama_model_loader {
             use_mmap = false;
         }
 
+#if defined(GGML_USE_KOMPUTE)
+        use_mmap = false;
+#endif
         this->use_mmap = use_mmap;
     }
 
@@ -6470,6 +6491,23 @@ struct llama_context * llama_new_context_with_model(
             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
         }
+#elif defined(GGML_USE_KOMPUTE)
+    if (ggml_vk_has_device() && params.n_gpu_layers > 0
+        && (model->ftype == LLAMA_FTYPE_ALL_F32
+            || model->ftype == LLAMA_FTYPE_MOSTLY_F16
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) {
+        // this allocates all Vulkan resources and memory buffers
+        ctx->ctx_kompute = ggml_vk_init();
+
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
+        ggml_vk_add_buffer(ctx->ctx_kompute, "data", ctx->model.buf.memory);
+        ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.memory);
+        ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->kv_self.buf.memory);
+        ggml_vk_add_buffer(ctx->ctx_kompute, "alloc", ctx->buf_alloc.memory);
+    }
 #endif
     }
 
@@ -6503,7 +6541,13 @@ static struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
+#ifdef GGML_USE_KOMPUTE
+    ggml_vk_free(ctx->ctx_kompute);
+#endif
     delete ctx;
+#ifdef GGML_USE_KOMPUTE
+    ggml_vk_free_device();
+#endif
 }
 
 int llama_n_vocab(const struct llama_context * ctx) {

From 6b6c73a9e3b299227cd5b51552f10e5d102810d4 Mon Sep 17 00:00:00 2001
From: Cebtenzzre <cebtenzzre@gmail.com>
Date: Tue, 26 Sep 2023 10:35:05 -0400
Subject: [PATCH 015/140] kompute : don't fail build because of -Warray-bounds

There are some warnings in debug builds that are likely to be false
positives.
---
 kompute/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt
index aa228653aa86e..1bd84d7ede7b8 100644
--- a/kompute/CMakeLists.txt
+++ b/kompute/CMakeLists.txt
@@ -169,7 +169,7 @@ endif()
 
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
 else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror -Wno-error=array-bounds")
 endif()
 
 # If glslang is cloned, then SPIRV/GlslangToSpv.h will be used instead of glslang/SPIRV/GlslangToSpv.h

From 9e4f8b4acc387f3c0f0cdb62c2582dc01a67caad Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Tue, 26 Sep 2023 11:58:39 -0400
Subject: [PATCH 016/140] Upload immediately to device.

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index e8ca52d5fd07f..1432696bde53b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2787,7 +2787,7 @@ static struct ggml_cgraph * llm_build_llama(
     ggml_free(ctx0);
 
 #if defined(GGML_USE_KOMPUTE)
-    if (lctx.ctx_kompute && N == 1) {
+    if (lctx.ctx_kompute) {
         if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
             ggml_vk_h2d_all(lctx.ctx_kompute);
         } else {

From 77135a3bf506d4ed782f5ea93ae6f3f61b056117 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Thu, 21 Sep 2023 13:00:10 -0400
Subject: [PATCH 017/140] Add a common boilerplate code via include and elim
 copy pasta

---
 CMakeLists.txt               |   2 +-
 kompute/common.comp          | 124 +++++++++++++++++++++++++++++++++++
 kompute/op_add.comp          | 117 +--------------------------------
 kompute/op_addrow.comp       | 117 +--------------------------------
 kompute/op_cpy_f16_f16.comp  | 117 +--------------------------------
 kompute/op_cpy_f16_f32.comp  | 117 +--------------------------------
 kompute/op_cpy_f32_f16.comp  | 117 +--------------------------------
 kompute/op_cpy_f32_f32.comp  | 117 +--------------------------------
 kompute/op_diagmask.comp     | 117 +--------------------------------
 kompute/op_gelu.comp         | 117 +--------------------------------
 kompute/op_getrows_f16.comp  | 117 +--------------------------------
 kompute/op_getrows_q4_0.comp | 117 +--------------------------------
 kompute/op_getrows_q4_1.comp | 117 +--------------------------------
 kompute/op_mul.comp          | 117 +--------------------------------
 kompute/op_mul_mat_f16.comp  | 117 +--------------------------------
 kompute/op_mul_mat_q4_0.comp | 117 +--------------------------------
 kompute/op_mul_mat_q4_1.comp | 117 +--------------------------------
 kompute/op_mulrow.comp       | 117 +--------------------------------
 kompute/op_norm.comp         | 117 +--------------------------------
 kompute/op_relu.comp         | 117 +--------------------------------
 kompute/op_rmsnorm.comp      | 117 +--------------------------------
 kompute/op_rope.comp         | 117 +--------------------------------
 kompute/op_scale.comp        | 116 +-------------------------------
 kompute/op_silu.comp         | 117 +--------------------------------
 kompute/op_softmax.comp      | 117 +--------------------------------
 25 files changed, 148 insertions(+), 2668 deletions(-)
 create mode 100644 kompute/common.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 88585fb933495..31532df919793 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -429,7 +429,7 @@ if (LLAMA_KOMPUTE)
         set(spv_file ${source}.spv)
         add_custom_command(
             OUTPUT ${spv_file}
-            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp
             COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
             COMMENT "Compiling ${source} to ${source}.spv"
         )
diff --git a/kompute/common.comp b/kompute/common.comp
new file mode 100644
index 0000000000000..12fc7d8b5c267
--- /dev/null
+++ b/kompute/common.comp
@@ -0,0 +1,124 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
diff --git a/kompute/op_add.comp b/kompute/op_add.comp
index 7e4e43d7547a1..019a68449e3c3 100644
--- a/kompute/op_add.comp
+++ b/kompute/op_add.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp
index 492f672e5612a..926c929e4253a 100644
--- a/kompute/op_addrow.comp
+++ b/kompute/op_addrow.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp
index 40d756ae57ded..5f425ae28798c 100644
--- a/kompute/op_cpy_f16_f16.comp
+++ b/kompute/op_cpy_f16_f16.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 #define nth 32
 #define IN_TYPE float16_t
diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp
index 309c48aed2a8f..4298bebdd729c 100644
--- a/kompute/op_cpy_f16_f32.comp
+++ b/kompute/op_cpy_f16_f32.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 #define nth 32
 #define IN_TYPE float16_t
diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp
index fb0e00d677940..2d763edfd3d43 100644
--- a/kompute/op_cpy_f32_f16.comp
+++ b/kompute/op_cpy_f32_f16.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 #define nth 32
 #define IN_TYPE float
diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp
index f43480b8d5254..4e5b1d39303fd 100644
--- a/kompute/op_cpy_f32_f32.comp
+++ b/kompute/op_cpy_f32_f32.comp
@@ -1,121 +1,6 @@
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 #define nth 32
 #define IN_TYPE float
diff --git a/kompute/op_diagmask.comp b/kompute/op_diagmask.comp
index 18b0192d720ac..8dc2cc60a7942 100644
--- a/kompute/op_diagmask.comp
+++ b/kompute/op_diagmask.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp
index 8079b8ef28766..c9f8ce3cf2012 100644
--- a/kompute/op_gelu.comp
+++ b/kompute/op_gelu.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp
index e0f5bb16ec70e..17b478b5e1934 100644
--- a/kompute/op_getrows_f16.comp
+++ b/kompute/op_getrows_f16.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp
index cddba929b5701..590f218e68367 100644
--- a/kompute/op_getrows_q4_0.comp
+++ b/kompute/op_getrows_q4_0.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp
index 151848a9d0468..44718c6af5a48 100644
--- a/kompute/op_getrows_q4_1.comp
+++ b/kompute/op_getrows_q4_1.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp
index 4907015d8ca37..348eae7b363c5 100644
--- a/kompute/op_mul.comp
+++ b/kompute/op_mul.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp
index f1198b59384f2..1390c00cf4e93 100644
--- a/kompute/op_mul_mat_f16.comp
+++ b/kompute/op_mul_mat_f16.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 64) in;
 
diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp
index 206aea7d5a512..9b6dd72dc6296 100644
--- a/kompute/op_mul_mat_q4_0.comp
+++ b/kompute/op_mul_mat_q4_0.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 8, local_size_y = 8) in;
 
diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp
index 8bdf810a1fa6b..fb7b051b8d2de 100644
--- a/kompute/op_mul_mat_q4_1.comp
+++ b/kompute/op_mul_mat_q4_1.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 8, local_size_y = 8) in;
 
diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp
index 3defd0a5f492f..498dbdfcd6af5 100644
--- a/kompute/op_mulrow.comp
+++ b/kompute/op_mulrow.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp
index ec0a8568d0a14..4b2db25e3593c 100644
--- a/kompute/op_norm.comp
+++ b/kompute/op_norm.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 #define nth 256
 
diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp
index bc2c31f4368db..41f46be961a87 100644
--- a/kompute/op_relu.comp
+++ b/kompute/op_relu.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp
index 784713c36ef26..dd2c5cddef670 100644
--- a/kompute/op_rmsnorm.comp
+++ b/kompute/op_rmsnorm.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 #define nth 256
 
diff --git a/kompute/op_rope.comp b/kompute/op_rope.comp
index ca6bb6831e06c..3fa84f5798851 100644
--- a/kompute/op_rope.comp
+++ b/kompute/op_rope.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp
index f537121a4945f..8530aaf3e6999 100644
--- a/kompute/op_scale.comp
+++ b/kompute/op_scale.comp
@@ -8,122 +8,8 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
+#include "common.comp"
 
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
 layout(local_size_x = 1) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp
index 90c034ac7c341..c5acac281902a 100644
--- a/kompute/op_silu.comp
+++ b/kompute/op_silu.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 layout(local_size_x = 1) in;
 
diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
index ce0e71924b4c9..e936d8f68972e 100644
--- a/kompute/op_softmax.comp
+++ b/kompute/op_softmax.comp
@@ -8,122 +8,7 @@
 
 #version 450
 
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
+#include "common.comp"
 
 #define nth 32
 

From 93306f16d046831a750d9971be3b720ef0ef8136 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Fri, 29 Sep 2023 10:02:22 -0400
Subject: [PATCH 018/140] Consolidate code for mat x vec kernels and use
 subgroups more extensively.

---
 ggml-vulkan.cpp                               |  91 ++++++++++------
 ggml-vulkan.h                                 |   1 +
 kompute/op_getrows_q4_1.comp                  |   2 +-
 kompute/op_mul_mat_f16.comp                   |  29 ++---
 kompute/op_mul_mat_q4_0.comp                  |  77 +++++---------
 kompute/op_mul_mat_q4_1.comp                  | 100 +++++-------------
 kompute/op_mul_mv_q_n.comp                    |  49 +++++++++
 kompute/op_softmax.comp                       |  54 +++-------
 kompute/src/CMakeLists.txt                    |   1 +
 kompute/src/OpTensorFill.cpp                  |  55 ++++++++++
 kompute/src/Tensor.cpp                        |   7 ++
 kompute/src/include/CMakeLists.txt            |   1 +
 kompute/src/include/kompute/Kompute.hpp       |   1 +
 kompute/src/include/kompute/Tensor.hpp        |   4 +
 .../kompute/operations/OpTensorFill.hpp       |  58 ++++++++++
 llama.cpp                                     |   3 +-
 16 files changed, 320 insertions(+), 213 deletions(-)
 create mode 100644 kompute/op_mul_mv_q_n.comp
 create mode 100644 kompute/src/OpTensorFill.cpp
 create mode 100644 kompute/src/include/kompute/operations/OpTensorFill.hpp

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index c64fde83230f7..74dd0f00f3892 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -165,11 +165,20 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
         if (heapSize < memoryRequired)
             continue;
 
+        vk::PhysicalDeviceSubgroupProperties subgroupProperties;
+        vk::PhysicalDeviceProperties2 deviceProperties2;
+        deviceProperties2.pNext = &subgroupProperties;
+        physicalDevices.at(i).getProperties2(&deviceProperties2);
+
+        if (subgroupProperties.subgroupSize < 32)
+            continue;
+
         ggml_vk_device d;
         d.index = i;
         d.type = properties.deviceType;
         d.heapSize = heapSize;
         d.name = properties.deviceName;
+        d.subgroupSize = subgroupProperties.subgroupSize;
         size_t n_idx = ++count_by_name[d.name];
         if (n_idx > 1) {
             d.name += " (" + std::to_string(n_idx) + ")";
@@ -242,7 +251,7 @@ bool ggml_vk_init_device(const ggml_vk_device &device) {
 bool ggml_vk_init_device(int device) {
     komputeManager()->initializeDevice(device, {},
                          {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
-                          "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
+                          "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
     return ggml_vk_has_device();
 }
 
@@ -772,9 +781,10 @@ void ggml_vk_soft_max(kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
@@ -890,9 +900,10 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
@@ -907,26 +918,28 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
                           const std::shared_ptr<kp::Tensor>& inB,
                           const std::shared_ptr<kp::Tensor>& out,
                           uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                          int32_t ne00, int32_t ne10, int32_t ne0,
-                          int32_t ne01, int32_t ne11) {
+                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0;
+        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0,
+        ne00, ne10, ne0, ne1, ne01, ne12/ne02
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
+        s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
+    seq.record<kp::OpTensorFill>({out});
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
@@ -1182,7 +1195,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
             const uint32_t nb3 = dst ? dst->nb[3] : 0;
 
             const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-//            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
             const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
 
             const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
@@ -1263,30 +1276,46 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_MUL_MAT:
                     {
-                        if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32)
-                            && src1->type == GGML_TYPE_F32) {
-                            ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
-                        } else if (src0->type == GGML_TYPE_Q4_0
-                                   && src1->type == GGML_TYPE_F32) {
-                            ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
-                        } else if (src0->type == GGML_TYPE_Q4_1
-                                   && src1->type == GGML_TYPE_F32) {
-                            ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
-                        } else {
-                            fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
+                        if (src1t != GGML_TYPE_F32) {
+                            fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                             goto not_implemented;
                         }
+
+                        if (!ggml_is_transposed(src0)
+                            && !ggml_is_transposed(src1)
+                            && ne00%32 == 0
+                            && ne11 > 1) {
+                            fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                            goto not_implemented;
+                        } else {
+                            switch (src0t) {
+                                case GGML_TYPE_F16:
+                                case GGML_TYPE_F32:
+                                    ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                                    break;
+                                case GGML_TYPE_Q4_0:
+                                    ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                    break;
+                                case GGML_TYPE_Q4_1:
+                                    ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                    break;
+                                default: {
+                                    fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                                    goto not_implemented;
+                                }
+                            }
+                        }
                     } break;
                 case GGML_OP_GET_ROWS:
                     {
-                        if (src0->type == GGML_TYPE_F16) {
+                        if (src0t == GGML_TYPE_F16) {
                             ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
-                        } else if (src0->type == GGML_TYPE_Q4_0) {
+                        } else if (src0t == GGML_TYPE_Q4_0) {
                             ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
-                        } else if (src0->type == GGML_TYPE_Q4_1) {
+                        } else if (src0t == GGML_TYPE_Q4_1) {
                             ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else {
-                            fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type);
+                            fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
                             goto not_implemented;
                         }
                     } break;
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index 614959ba86392..7989cfc1fa7fb 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -34,6 +34,7 @@ struct ggml_vk_device {
     size_t heapSize = 0;
     std::string name;
     std::string vendor;
+    int subgroupSize = 0;
 };
 
 std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp
index 44718c6af5a48..3d00928d356e9 100644
--- a/kompute/op_getrows_q4_1.comp
+++ b/kompute/op_getrows_q4_1.comp
@@ -43,7 +43,7 @@ void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based fro
     const uint nb = k / qk;
 
     for (uint i = 0; i < nb; i++) {
-        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0);
+        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
 
         const float16_t d = block.d;
         const float16_t m = block.m;
diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp
index 1390c00cf4e93..72a667f925f4a 100644
--- a/kompute/op_mul_mat_f16.comp
+++ b/kompute/op_mul_mat_f16.comp
@@ -10,7 +10,9 @@
 
 #include "common.comp"
 
-layout(local_size_x = 64) in;
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(local_size_x_id = 0) in;
 
 layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -29,8 +31,6 @@ layout (push_constant) uniform parameter {
     int ne1;
 } pcs;
 
-shared float sum[gl_WorkGroupSize.x];
-
 void main() {
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
@@ -39,24 +39,13 @@ void main() {
     const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
     const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
 
-    sum[gl_LocalInvocationID.x] = 0.0;
-
-    for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
-        sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
-    }
-
-    // accumulate the sum from all threads in the threadgroup
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
-        }
-        barrier();
-        memoryBarrierShared();
+    float sumf = 0.0f;
+    for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
+        sumf += float(inA[x+i]) * float(inB[y+i]);
     }
 
-    if (gl_LocalInvocationID.x == 0) {
-        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
+    const float all_sum = subgroupAdd(sumf);
+    if (subgroupElect()) {
+        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
     }
 }
diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp
index 9b6dd72dc6296..165df3c376163 100644
--- a/kompute/op_mul_mat_q4_0.comp
+++ b/kompute/op_mul_mat_q4_0.comp
@@ -10,7 +10,13 @@
 
 #include "common.comp"
 
-layout(local_size_x = 8, local_size_y = 8) in;
+#define BLOCKS_IN_QUANT QK4_0
+#define SIZE_OF_BLOCK sizeof_block_q4_0
+#define N_ROWS 4
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y = 1) in;
+layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,58 +29,31 @@ layout (push_constant) uniform parameter {
     int ne00;
     int ne10;
     int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
 } pcs;
 
-shared float sum[64];
-
-void main() {
-    const uint nb = uint(pcs.ne00/QK4_0);
-
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-
-    const uint x = r0*nb; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
-
-    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
-    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
-
-    const uint ix = gl_LocalInvocationID.y/4;           // 0 or 1
-    const uint iy = gl_LocalInvocationID.y - 4*ix;      // 0...3
-
-    const uint first = 4 * iy;
+// The q4_0 version of this function
+float block_q_n_dot_y(uint block_index, uint yb, uint il) {
+    vec2 acc = vec2(0.0, 0.0);
+    const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
+    float d = float(u8BufToFloat16(inA, index));
+    float sumy = 0.0f;
+    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
+        const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
 
-    float sumf = 0.0;
+        const float yl0 = inB[yb + i];
+        const float yl1 = inB[yb + i + 1];
+        const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
+        const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
 
-    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
-        const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
-        const float d = float(u8BufToFloat16(inA, index));
+        sumy += yl0 + yl1 + yl8 + yl9;
 
-        const uint xl = first; // Based from bl->qs
-        const uint yl = y + i * QK4_0 + first; // Based from inB
-
-        vec2 acc = vec2(0.0, 0.0);
-
-        for (int j = 0; j < 4; ++j) {
-            const uint8_t b = inA[index+2+xl+j];
-            acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
-            acc.y += inB[yl+j] + inB[yl+j+16];
-        }
-
-        sumf += d * (acc.x - 8.*acc.y);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    barrier();
-    if (ith == 0) {
-        float sumTotal = 0.0;
-        for (uint i = 0; i < nth; ++i) {
-            sumTotal += sum[i];
-        }
-        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
+        acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
+        acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
     }
+    return d * (sumy * -8.f + acc[0] + acc[1]);
 }
+
+#include "op_mul_mv_q_n.comp"
diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp
index fb7b051b8d2de..683b695caf95d 100644
--- a/kompute/op_mul_mat_q4_1.comp
+++ b/kompute/op_mul_mat_q4_1.comp
@@ -10,7 +10,13 @@
 
 #include "common.comp"
 
-layout(local_size_x = 8, local_size_y = 8) in;
+#define BLOCKS_IN_QUANT QK4_1
+#define SIZE_OF_BLOCK sizeof_block_q4_1
+#define N_ROWS 4
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y = 1) in;
+layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,81 +29,33 @@ layout (push_constant) uniform parameter {
     int ne00;
     int ne10;
     int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
 } pcs;
 
-shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];
-
-#define UNALIGNED_INPUT inA
-
-block_q4_1 get_unaligned_block_q4_1(uint index) {
-    block_q4_1 fres;
-    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
-    fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
-    [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
-        fres.qs[it] = UNALIGNED_INPUT[index+4+it];
-    }
-    return fres;
-}
-
-void main() {
-    const uint nb = uint(pcs.ne00/QK4_1);
-
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-
-    const uint x = r0*nb; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
-
-    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
-    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
-
-    const uint ix = gl_LocalInvocationID.y/4;           // 0 or 1
-    const uint iy = gl_LocalInvocationID.y - 4*ix;      // 0...3
-
-    const uint first = 4 * iy;
+// The q4_1 version of this function
+float block_q_n_dot_y(uint block_index, uint yb, uint il) {
+    vec2 acc = vec2(0.0, 0.0);
+    const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
+    float d = float(u8BufToFloat16(inA, index));
+    float m = float(u8BufToFloat16(inA, index+2));
 
-    float sumf = 0.0;
+    float sumy = 0.0f;
+    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
+        const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
 
-    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
-        //TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
+        const float yl0 = inB[yb + i];
+        const float yl1 = inB[yb + i + 1];
+        const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
+        const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
 
-        const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
+        sumy += yl0 + yl1 + yl8 + yl9;
 
-        const float d = float(block.d);
-        const float m = float(block.m);
-
-        const uint xl = first; // Based from bl->qs
-        const uint yl = y + i * QK4_1 + first; // Based from inB
-
-        vec2 acc = vec2(0.0, 0.0);
-
-        for (int j = 0; j < 4; ++j) {
-            acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
-            acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
-        }
-
-        sumf += d * (acc.x - acc.y);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    barrier();
-    memoryBarrierShared();
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    barrier();
-    memoryBarrierShared();
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    barrier();
-    memoryBarrierShared();
-    if (ith == 0) {
-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
-        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
+        acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
+        acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
     }
+    return d * (acc[0] + acc[1]) + sumy * m;
 }
+
+#include "op_mul_mv_q_n.comp"
diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp
new file mode 100644
index 0000000000000..83de952dde473
--- /dev/null
+++ b/kompute/op_mul_mv_q_n.comp
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+void main() {
+    const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint im = gl_WorkGroupID.z;
+    const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
+    const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0);
+
+    const uint x = offset0; // Based from inA without base offset
+    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+    const uint ix = gl_SubgroupInvocationID/2;
+    const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
+
+    uint yb = y + ix * BLOCKS_IN_QUANT + il;
+
+    debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
+        gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
+        gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
+
+    for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
+        for (int row = 0; row < N_ROWS; row++) {
+            const uint block_index = x + ib + row * nb;
+            sumf[row] += block_q_n_dot_y(block_index, yb, il);
+        }
+
+        yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2;
+    }
+
+    for (int row = 0; row < N_ROWS; ++row) {
+        const float tot = subgroupAdd(sumf[row]);
+        if (first_row + row < pcs.ne01 && subgroupElect()) {
+            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
+        }
+    }
+}
diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
index e936d8f68972e..60456a3bb0102 100644
--- a/kompute/op_softmax.comp
+++ b/kompute/op_softmax.comp
@@ -10,9 +10,9 @@
 
 #include "common.comp"
 
-#define nth 32
+#extension GL_KHR_shader_subgroup_arithmetic : require
 
-layout(local_size_x = nth) in;
+layout(local_size_x_id = 0) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
@@ -25,8 +25,6 @@ layout(push_constant) uniform PushConstants {
     int ne02;
 } pcs;
 
-shared float buf[nth];
-
 void main() {
     const uint i03 = gl_WorkGroupID.z;
     const uint i02 = gl_WorkGroupID.y;
@@ -37,46 +35,22 @@ void main() {
     const uint pdst = extra_off + pcs.outOff; // Based from out_
 
     // parallel max
-    buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
-    }
-
-    // reduce
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
-        }
-        barrier();
-        memoryBarrierShared();
+    float localMax = uintBitsToFloat(0xFF800000);
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        localMax = max(localMax, in_[psrc0 + i00]);
     }
-
-    // broadcast
-    const float max_ = buf[0];
+    float max_ = subgroupMax(localMax);
 
     // parallel sum
-    buf[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
+    float localSum = 0.0f;
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
+        localSum += exp_psrc0;
+        out_[pdst + i00] = exp_psrc0;
     }
 
-    // reduce
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
-        }
-        barrier();
-        memoryBarrierShared();
-    }
-
-    // broadcast
-    const float sum = buf[0];
-
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
+    const float sum = subgroupAdd(localSum);
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        out_[pdst + i00] /= sum;
     }
 }
diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
index b5c3879afaba6..42b7d07f5e5b9 100644
--- a/kompute/src/CMakeLists.txt
+++ b/kompute/src/CMakeLists.txt
@@ -13,6 +13,7 @@ add_library(kompute STATIC Algorithm.cpp
     OpAlgoDispatch.cpp
     OpMemoryBarrier.cpp
     OpTensorCopy.cpp
+    OpTensorFill.cpp
     OpTensorSyncDevice.cpp
     OpTensorSyncLocal.cpp
     OpBufferSyncDevice.cpp
diff --git a/kompute/src/OpTensorFill.cpp b/kompute/src/OpTensorFill.cpp
new file mode 100644
index 0000000000000..da477dcc7f6ee
--- /dev/null
+++ b/kompute/src/OpTensorFill.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpTensorFill.hpp"
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+OpTensorFill::OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill constructor with params");
+
+    if (tensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpTensorFill called with less than 1 tensor");
+    }
+
+    this->mTensors = tensors;
+}
+
+OpTensorFill::~OpTensorFill()
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill destructor started");
+}
+
+void
+OpTensorFill::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill record called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        this->mTensors[i]->recordFill(commandBuffer, 0);
+    }
+}
+
+void
+OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill preEval called");
+}
+
+void
+OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill postEval called");
+}
+
+}
diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp
index 9c343ff139181..65279206d017e 100644
--- a/kompute/src/Tensor.cpp
+++ b/kompute/src/Tensor.cpp
@@ -215,6 +215,13 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
     commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
 }
 
+void
+Tensor::recordFill(const vk::CommandBuffer &commandBuffer,
+                   uint32_t fill)
+{
+    commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill);
+}
+
 void
 Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                          vk::AccessFlagBits srcAccessMask,
diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt
index 313f4831191bd..53e9d8ae616be 100644
--- a/kompute/src/include/CMakeLists.txt
+++ b/kompute/src/include/CMakeLists.txt
@@ -21,6 +21,7 @@ target_sources(kompute PRIVATE
     kompute/operations/OpMemoryBarrier.hpp
     kompute/operations/OpMult.hpp
     kompute/operations/OpTensorCopy.hpp
+    kompute/operations/OpTensorFill.hpp
     kompute/operations/OpTensorSyncDevice.hpp
     kompute/operations/OpTensorSyncLocal.hpp
     kompute/operations/OpBufferSyncDevice.hpp
diff --git a/kompute/src/include/kompute/Kompute.hpp b/kompute/src/include/kompute/Kompute.hpp
index f59a63b50ba44..70e0dd433c44f 100644
--- a/kompute/src/include/kompute/Kompute.hpp
+++ b/kompute/src/include/kompute/Kompute.hpp
@@ -15,6 +15,7 @@
 #include "operations/OpTensorSyncLocal.hpp"
 #include "operations/OpBufferSyncDevice.hpp"
 #include "operations/OpBufferSyncLocal.hpp"
+#include "operations/OpTensorFill.hpp"
 
 // Will be build by CMake and placed inside the build directory
 #include "ShaderLogisticRegression.hpp"
diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp
index 4c260ce6b9c63..2ab88eb308178 100644
--- a/kompute/src/include/kompute/Tensor.hpp
+++ b/kompute/src/include/kompute/Tensor.hpp
@@ -126,6 +126,9 @@ class Tensor
     void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
                         std::shared_ptr<Tensor> copyFromTensor);
 
+    void recordFill(const vk::CommandBuffer &commandBuffer,
+                    uint32_t fill);
+
     /**
      * Records a copy from the internal staging memory to the device memory
      * using an optional barrier to wait for the operation. This function would
@@ -279,6 +282,7 @@ class Tensor
                           vk::Buffer *bufferTo,
                           vk::DeviceSize bufferSize,
                           vk::BufferCopy copyRegion);
+
     void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                    const vk::Buffer& buffer,
                                    vk::AccessFlagBits srcAccessMask,
diff --git a/kompute/src/include/kompute/operations/OpTensorFill.hpp b/kompute/src/include/kompute/operations/OpTensorFill.hpp
new file mode 100644
index 0000000000000..9a6bf131e88f0
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpTensorFill.hpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that fills the tensor
+ */
+class OpTensorFill : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation.
+     *
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorFill() override;
+
+    /**
+     * Records the fill command for tensor.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
diff --git a/llama.cpp b/llama.cpp
index 1432696bde53b..245174898046f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6495,7 +6495,8 @@ struct llama_context * llama_new_context_with_model(
     if (ggml_vk_has_device() && params.n_gpu_layers > 0
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) {
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
         // this allocates all Vulkan resources and memory buffers
         ctx->ctx_kompute = ggml_vk_init();
 

From 601905e75ee6cbacec0ee5aa523c96fb0258bd63 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Mon, 2 Oct 2023 09:00:55 -0400
Subject: [PATCH 019/140] Move the subgroups and printf into common.

---
 kompute/common.comp        | 2 ++
 kompute/op_mul_mv_q_n.comp | 9 +++------
 kompute/op_softmax.comp    | 2 --
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/kompute/common.comp b/kompute/common.comp
index 12fc7d8b5c267..2e843a87815cf 100644
--- a/kompute/common.comp
+++ b/kompute/common.comp
@@ -12,6 +12,8 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
 #extension GL_EXT_control_flow_attributes: enable
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
 
 #define QK4_0 32
 #define QR4_0 2
diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp
index 83de952dde473..15bcbf765875b 100644
--- a/kompute/op_mul_mv_q_n.comp
+++ b/kompute/op_mul_mv_q_n.comp
@@ -6,9 +6,6 @@
  * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
  */
 
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-
 void main() {
     const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
     const uint r0 = gl_WorkGroupID.x;
@@ -27,9 +24,9 @@ void main() {
 
     uint yb = y + ix * BLOCKS_IN_QUANT + il;
 
-    debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
-        gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
-        gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
+    //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
+    //    gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
+    //    gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
 
     for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
         for (int row = 0; row < N_ROWS; row++) {
diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
index 60456a3bb0102..d21577ac0f59c 100644
--- a/kompute/op_softmax.comp
+++ b/kompute/op_softmax.comp
@@ -10,8 +10,6 @@
 
 #include "common.comp"
 
-#extension GL_KHR_shader_subgroup_arithmetic : require
-
 layout(local_size_x_id = 0) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };

From 5509f743187f69624fc617faeefc82c175d33e57 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Mon, 2 Oct 2023 09:01:45 -0400
Subject: [PATCH 020/140] Minor cleanup.

---
 ggml-vulkan.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 74dd0f00f3892..f770a2d0c1142 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -939,7 +939,6 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
-    seq.record<kp::OpTensorFill>({out});
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
@@ -951,7 +950,6 @@ void ggml_vk_mul_mat_q4_0(Args&&... args) {
     ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
-// FIXME: This could be improved like was done in q4_0 version but needs testing...
 template <typename... Args>
 void ggml_vk_mul_mat_q4_1(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,

From 4b223ec4329a24f3b932ea1a9c0456ef11b851ea Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Mon, 2 Oct 2023 09:04:02 -0400
Subject: [PATCH 021/140] Refactor getrows to use common code and get ready for
 q6_k.

---
 kompute/common.comp          | 138 +++++++++++++++--------------------
 kompute/op_getrows.comp      |  25 +++++++
 kompute/op_getrows_f16.comp  |  10 ++-
 kompute/op_getrows_q4_0.comp |  38 +++-------
 kompute/op_getrows_q4_1.comp |  41 +++--------
 5 files changed, 111 insertions(+), 141 deletions(-)
 create mode 100644 kompute/op_getrows.comp

diff --git a/kompute/common.comp b/kompute/common.comp
index 2e843a87815cf..040b87375ecd2 100644
--- a/kompute/common.comp
+++ b/kompute/common.comp
@@ -16,27 +16,12 @@
 #extension GL_EXT_debug_printf : enable
 
 #define QK4_0 32
-#define QR4_0 2
 #define QK4_1 32
 
 #define GELU_COEF_A 0.044715
 #define SQRT_2_OVER_PI 0.79788456080286535587989211986876
 
-#ifndef QK_K
 #define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
 
 #define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
 #define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
@@ -44,83 +29,76 @@
 #define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
 
 #define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
 struct block_q4_0 {
     float16_t d;
     uint8_t qs[QK4_0 / 2];
 };
+mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
+    const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
+    const float d2 = d1 / 256.f;
+    const float md = -8.f * xb.d;
+    const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
+    const uint16_t mask1 = mask0 << 8;
+
+    mat4 reg;
+    for (int i=0;i<8;i++) {
+        uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
+        reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
+        reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
+    }
+    return reg;
+}
+
+#define sizeof_block_q4_1 0x14
 struct block_q4_1 {
     float16_t d;
     float16_t m;
     uint8_t qs[QK4_1 / 2];
 };
+mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
+    const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
+    const float d2 = d1 / 256.f;
+    const float  m = xb.m;
+    const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
+    const uint16_t mask1 = mask0 << 8;
 
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
+    mat4 reg;
+    for (int i=0;i<8;i++) {
+        uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
+        reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
+        reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
+    }
+    return reg;
+}
 
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
+#define sizeof_block_q6_k 210
+struct block_q6_k {
     uint8_t ql[QK_K/2];      // quants, lower 4 bits
     uint8_t qh[QK_K/4];      // quants, upper 2 bits
     int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
+    float16_t d;             // super-block scale
 };
-// 210 bytes / block
+mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
+    const float16_t d_all = xb.d;
+    uint8_t ql[QK_K/2];
+    uint8_t qh[QK_K/4];
+    int8_t  scales[QK_K/16];
+
+    const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
+    const uint qhIndex = 32*(il/8) + 16*(il&1);
+    float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
+    il = (il/2) & 3;
+
+    const uint16_t  kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
+    const uint16_t  kmask2 = il>1 ? uint8_t(0xF0)             : uint8_t(0x0F);
+    const float16_t coef   = il>1 ? float16_t(1.f/16.f)       : float16_t(1.f);
+    const float16_t ml = float16_t(d_all * sc * 32.f);
+    const float16_t dl = float16_t(d_all * sc * coef);
+    mat4 reg;
+    for (int i = 0; i < 16; ++i) {
+        const float16_t q = (il&1) != 0 ? ((ql[qlIndex + i] & kmask2) | ((qh[qhIndex + i] & kmask1) << 2))
+                                        : ((ql[qlIndex + i] & kmask2) | ((qh[qhIndex + i] & kmask1) << 4));
+        reg[i/4][i%4] = dl * q - ml;
+    }
+    return reg;
+}
diff --git a/kompute/op_getrows.comp b/kompute/op_getrows.comp
new file mode 100644
index 0000000000000..a4d8bb9a0ad5d
--- /dev/null
+++ b/kompute/op_getrows.comp
@@ -0,0 +1,25 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff];
+
+    int z = 0;
+    for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
+        const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
+        const mat4 result = dequantize_block(inIndex, ind%NL);
+        for (uint j = 0; j < 4; ++j) {
+            for (uint k = 0; k < 4; ++k) {
+                const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
+                out_[outIndex] = result[j][k];
+                ++z;
+            }
+        }
+    }
+}
diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp
index 17b478b5e1934..3f2b167243f39 100644
--- a/kompute/op_getrows_f16.comp
+++ b/kompute/op_getrows_f16.comp
@@ -25,11 +25,15 @@ layout (push_constant) uniform parameter {
     int nb1;
 } pcs;
 
+void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
+    for (int j = 0; j < k; j++) {
+        out_[y + j] = inA[x + j];
+    }
+}
+
 void main() {
     const uint i = gl_WorkGroupID.x;
     const int r = inB[i + pcs.inBOff];
 
-    for (int j = 0; j < pcs.ne00; j++) {
-        out_[i*pcs.nb1 + j + pcs.outOff] = inA[r*pcs.nb01/2+j + pcs.inAOff];
-    }
+    dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1 + pcs.outOff, pcs.ne00);
 }
diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp
index 590f218e68367..0449b19877bdc 100644
--- a/kompute/op_getrows_q4_0.comp
+++ b/kompute/op_getrows_q4_0.comp
@@ -10,6 +10,10 @@
 
 #include "common.comp"
 
+#define NL 2
+#define BYTES_FOR_TYPE 4 /*bytes for float*/
+#define SIZE_OF_BLOCK sizeof_block_q4_0
+
 layout(local_size_x = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
@@ -25,40 +29,18 @@ layout (push_constant) uniform parameter {
     int nb1;
 } pcs;
 
-#define UNALIGNED_INPUT inA
-
 block_q4_0 get_unaligned_block_q4_0(uint index) {
     block_q4_0 fres;
-    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
+    fres.d = u8BufToFloat16(inA, index);
     [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
-        fres.qs[it] = UNALIGNED_INPUT[index+2+it];
+        fres.qs[it] = inA[index+2+it];
     }
     return fres;
 }
 
-void dequantize_row_q4_0(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
-    const uint qk = QK4_0;
-
-    const uint nb = k / qk;
-
-    for (uint i = 0; i < nb; i++) {
-        const block_q4_0 block = get_unaligned_block_q4_0(x + i*sizeof_block_q4_0);
-
-        const float16_t d = block.d;
-
-        for (uint j = 0; j < qk/2; ++j) {
-            const int x0 = (block.qs[j] & 0x0F) - 8;
-            const int x1 = (block.qs[j] >>   4) - 8;
-
-            out_[y+i*qk + j + 0   ] = float(x0)*d;
-            out_[y+i*qk + j + qk/2] = float(x1)*d;
-        }
-    }
+mat4 dequantize_block(uint index, uint il) {
+    const block_q4_0 block = get_unaligned_block_q4_0(index);
+    return dequantize_q4_0(block, il);
 }
 
-void main() {
-    const uint i = gl_WorkGroupID.x;
-    const int r = inB[i + pcs.inBOff];
-
-    dequantize_row_q4_0(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
-}
+#include "op_getrows.comp"
diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp
index 3d00928d356e9..64586cdc9c788 100644
--- a/kompute/op_getrows_q4_1.comp
+++ b/kompute/op_getrows_q4_1.comp
@@ -10,6 +10,10 @@
 
 #include "common.comp"
 
+#define NL 2
+#define BYTES_FOR_TYPE 4 /*bytes for float*/
+#define SIZE_OF_BLOCK sizeof_block_q4_1
+
 layout(local_size_x = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
@@ -25,42 +29,19 @@ layout (push_constant) uniform parameter {
     int nb1;
 } pcs;
 
-#define UNALIGNED_INPUT inA
-
 block_q4_1 get_unaligned_block_q4_1(uint index) {
     block_q4_1 fres;
-    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
-    fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
+    fres.d = u8BufToFloat16(inA, index);
+    fres.m = u8BufToFloat16(inA, index+2);
     [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
-        fres.qs[it] = UNALIGNED_INPUT[index+4+it];
+        fres.qs[it] = inA[index+4+it];
     }
     return fres;
 }
 
-void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
-    const uint qk = QK4_1;
-
-    const uint nb = k / qk;
-
-    for (uint i = 0; i < nb; i++) {
-        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
-
-        const float16_t d = block.d;
-        const float16_t m = block.m;
-
-        for (uint j = 0; j < qk/2; ++j) {
-            const int x0 = (block.qs[j] & 0x0F);
-            const int x1 = (block.qs[j] >>   4);
-
-            out_[y+i*qk + j + 0   ] = float(x0)*d + m;
-            out_[y+i*qk + j + qk/2] = float(x1)*d + m;
-        }
-    }
+mat4 dequantize_block(uint index, uint il) {
+    const block_q4_1 block = get_unaligned_block_q4_1(index);
+    return dequantize_q4_1(block, il);
 }
 
-void main() {
-    const uint i = gl_WorkGroupID.x;
-    const int r = inB[i + pcs.inBOff];
-
-    dequantize_row_q4_1(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
-}
+#include "op_getrows.comp"

From f1c9bc18216606b992a4b13b4154ddf97e443a92 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Mon, 2 Oct 2023 09:05:22 -0400
Subject: [PATCH 022/140] Add q6_k getrows and mul*vec kernel.

---
 ggml-vulkan.cpp              |  47 ++++++++++++++
 kompute/op_getrows_q6_k.comp |  52 ++++++++++++++++
 kompute/op_mul_mat_q6_k.comp | 117 +++++++++++++++++++++++++++++++++++
 llama.cpp                    |   3 +-
 4 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 kompute/op_getrows_q6_k.comp
 create mode 100644 kompute/op_mul_mat_q6_k.comp

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index f770a2d0c1142..1dd504127d6ee 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -25,9 +25,11 @@
 #include "shaderop_mul_mat_f16.h"
 #include "shaderop_mul_mat_q4_0.h"
 #include "shaderop_mul_mat_q4_1.h"
+#include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
+#include "shaderop_getrows_q6_k.h"
 #include "shaderop_rope.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
@@ -52,6 +54,7 @@
 #define QK4_0 32
 #define QR4_0 2
 #define QK4_1 32
+#define QK_NL 16
 
 typedef ggml_fp16_t half;
 struct ggml_kompute_context {
@@ -958,6 +961,38 @@ void ggml_vk_mul_mat_q4_1(Args&&... args) {
     ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
+void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
+                          const std::shared_ptr<kp::Tensor>& inA,
+                          const std::shared_ptr<kp::Tensor>& inB,
+                          const std::shared_ptr<kp::Tensor>& out,
+                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
+        kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+//        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {2,32}, {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
 void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
                       unsigned element_size, unsigned qk,
                       kp::Sequence& seq,
@@ -1016,6 +1051,13 @@ void ggml_vk_get_rows_q4_1(Args&&... args) {
     ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
 }
 
+template <typename... Args>
+void ggml_vk_get_rows_q6_k(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
+        kp::shader_data::op_getrows_q6_k_comp_spv_len);
+    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
+}
+
 void ggml_vk_rope(kp::Sequence& seq,
                   const std::shared_ptr<kp::Tensor>& in,
                   const std::shared_ptr<kp::Tensor>& out,
@@ -1297,6 +1339,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                                 case GGML_TYPE_Q4_1:
                                     ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
                                     break;
+                                case GGML_TYPE_Q6_K:
+                                    ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                    break;
                                 default: {
                                     fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                                     goto not_implemented;
@@ -1312,6 +1357,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else if (src0t == GGML_TYPE_Q4_1) {
                             ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else if (src0t == GGML_TYPE_Q6_K) {
+                            ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else {
                             fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
                             goto not_implemented;
diff --git a/kompute/op_getrows_q6_k.comp b/kompute/op_getrows_q6_k.comp
new file mode 100644
index 0000000000000..95817b4871a40
--- /dev/null
+++ b/kompute/op_getrows_q6_k.comp
@@ -0,0 +1,52 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define NL 16
+#define BYTES_FOR_TYPE 4 /*bytes for float*/
+#define SIZE_OF_BLOCK sizeof_block_q6_k
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+block_q6_k get_unaligned_block_q6_k(uint index) {
+    block_q6_k fres;
+    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
+        fres.ql[it] = inA[index + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
+        fres.qh[it] = inA[index + QK_K/2 + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
+        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
+    }
+    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
+    return fres;
+}
+
+mat4 dequantize_block(uint index, uint il) {
+    const block_q6_k block = get_unaligned_block_q6_k(index);
+    return dequantize_q6_k(block, il);
+}
+
+#include "op_getrows.comp"
diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp
new file mode 100644
index 0000000000000..1e4ea37f8a87a
--- /dev/null
+++ b/kompute/op_mul_mat_q6_k.comp
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define SIZE_OF_BLOCK sizeof_block_q6_k
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y_id = 1) in;
+layout(local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
+} pcs;
+
+block_q6_k get_unaligned_block_q6_k(uint index) {
+    block_q6_k fres;
+    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
+        fres.ql[it] = inA[index + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
+        fres.qh[it] = inA[index + QK_K/2 + it];
+    }
+    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
+        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
+    }
+    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
+    return fres;
+}
+
+void main() {
+    const uint8_t kmask1 = uint8_t(0x03);
+    const uint8_t kmask2 = uint8_t(0x0C);
+    const uint8_t kmask3 = uint8_t(0x30);
+    const uint8_t kmask4 = uint8_t(0xC0);
+
+    const int nb = pcs.ne00/QK_K;
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint r2 = gl_WorkGroupID.z;
+
+    const uint row = 2 * r0 + gl_SubgroupID;
+    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
+    const uint x = row * nb + offset0; // Based from inA without base offset
+    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    float sumf = 0;
+
+    const uint tid  = gl_SubgroupInvocationID/2;
+    const uint ix   = gl_SubgroupInvocationID%2;
+    const uint ip   = tid/8;         // 0 or 1
+    const uint il   = tid%8;
+    const uint n    = 4;
+    const uint l0   = n*il;
+    const uint is   = 8*ip + l0/16;
+
+    const uint y_offset = 128*ip + l0;
+    const uint q_offset_l = 64*ip + l0;
+    const uint q_offset_h = 32*ip + l0;
+
+    for (uint i = ix; i < nb; i += 2) {
+
+        const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
+//        const uint index = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
+//        const block_q6_k block = get_unaligned_block_q6_k(index);
+
+        const uint qlIndex = q_offset_l;
+        const uint q2Index = qlIndex + 32;
+        const uint qhIndex = q_offset_h;
+        const uint y = yy + i * QK_K + y_offset;
+
+        float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        for (uint l = 0; l < n; ++l) {
+
+//            const uint8_t currentQ1 = block.ql[qlIndex + l];
+//            const uint8_t currentQ2 = block.ql[q2Index + l];
+//            const uint8_t currentQh = block.qh[qhIndex + l];
+            const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
+            const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
+            const uint8_t currentQh = inA[baseIndex + qhIndex + l];
+
+            sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
+            sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
+            sums[2] += inB[y+l+64] * (int8_t((currentQ1  >> 4) | ((currentQh & kmask3) << 0)) - 32);
+            sums[3] += inB[y+l+96] * (int8_t((currentQ2  >> 4) | ((currentQh & kmask4) >> 2)) - 32);
+        }
+
+//        sumf += block.d * (sums[0] * block.scales[0+is] + sums[1] * block.scales[2+is] + sums[2] * block.scales[4+is] + sums[3] * block.scales[6+is]);
+        float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
+        sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
+    }
+
+    const float tot = subgroupAdd(sumf);
+    if (subgroupElect()) {
+        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
+    }
+}
diff --git a/llama.cpp b/llama.cpp
index 245174898046f..603f7cc64270f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6496,7 +6496,8 @@ struct llama_context * llama_new_context_with_model(
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
             || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q6_K)) {
         // this allocates all Vulkan resources and memory buffers
         ctx->ctx_kompute = ggml_vk_init();
 

From 06d4b21598da0162999b35429cfb567ed962d7ec Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Mon, 2 Oct 2023 11:30:10 -0400
Subject: [PATCH 023/140] Fix offset into the qh and now we have working vulkan
 accelerated for gguff'd llama.

---
 kompute/op_mul_mat_q6_k.comp | 26 ++------------------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp
index 1e4ea37f8a87a..c7b9aa753c656 100644
--- a/kompute/op_mul_mat_q6_k.comp
+++ b/kompute/op_mul_mat_q6_k.comp
@@ -32,28 +32,13 @@ layout (push_constant) uniform parameter {
     int gqa;
 } pcs;
 
-block_q6_k get_unaligned_block_q6_k(uint index) {
-    block_q6_k fres;
-    [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
-        fres.ql[it] = inA[index + it];
-    }
-    [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
-        fres.qh[it] = inA[index + QK_K/2 + it];
-    }
-    [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
-        fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
-    }
-    fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
-    return fres;
-}
-
 void main() {
     const uint8_t kmask1 = uint8_t(0x03);
     const uint8_t kmask2 = uint8_t(0x0C);
     const uint8_t kmask3 = uint8_t(0x30);
     const uint8_t kmask4 = uint8_t(0xC0);
 
-    const int nb = pcs.ne00/QK_K;
+    const uint nb = pcs.ne00/QK_K;
 
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
@@ -81,8 +66,6 @@ void main() {
     for (uint i = ix; i < nb; i += 2) {
 
         const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
-//        const uint index = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
-//        const block_q6_k block = get_unaligned_block_q6_k(index);
 
         const uint qlIndex = q_offset_l;
         const uint q2Index = qlIndex + 32;
@@ -91,13 +74,9 @@ void main() {
 
         float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
         for (uint l = 0; l < n; ++l) {
-
-//            const uint8_t currentQ1 = block.ql[qlIndex + l];
-//            const uint8_t currentQ2 = block.ql[q2Index + l];
-//            const uint8_t currentQh = block.qh[qhIndex + l];
             const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
             const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
-            const uint8_t currentQh = inA[baseIndex + qhIndex + l];
+            const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
 
             sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
             sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
@@ -105,7 +84,6 @@ void main() {
             sums[3] += inB[y+l+96] * (int8_t((currentQ2  >> 4) | ((currentQh & kmask4) >> 2)) - 32);
         }
 
-//        sumf += block.d * (sums[0] * block.scales[0+is] + sums[1] * block.scales[2+is] + sums[2] * block.scales[4+is] + sums[3] * block.scales[6+is]);
         float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
         sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
     }

From 32289aa447344fa8a5a8d9f6289af41fb15fd910 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Mon, 2 Oct 2023 21:00:48 -0400
Subject: [PATCH 024/140] Fixes for norm.

---
 kompute/op_norm.comp    | 2 +-
 kompute/op_rmsnorm.comp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp
index 4b2db25e3593c..5aafeaac53aaf 100644
--- a/kompute/op_norm.comp
+++ b/kompute/op_norm.comp
@@ -56,7 +56,7 @@ void main() {
     const float mean = sum[0];
 
     // recenter
-    const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_
+    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
     for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
         out_[y+i00] = in_[x+i00] - mean;
     }
diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp
index dd2c5cddef670..8d6c0fa6a5e48 100644
--- a/kompute/op_rmsnorm.comp
+++ b/kompute/op_rmsnorm.comp
@@ -10,7 +10,7 @@
 
 #include "common.comp"
 
-#define nth 256
+#define nth 512
 
 layout(local_size_x = nth) in;
 
@@ -56,7 +56,7 @@ void main() {
 
     const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
 
-    const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_
+    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
     for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
         out_[y+i00] = in_[x+i00] * scale;
     }

From 6ac39752bf8f1e3596386238fd3d0e68aaf2dfd5 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Tue, 3 Oct 2023 12:40:24 -0400
Subject: [PATCH 025/140] Fixup the upstream CMakelists.txt so we can build
 just llama.cpp with our branch.

---
 CMakeLists.txt | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31532df919793..2445d177cc15f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -414,6 +414,7 @@ if (LLAMA_HIPBLAS)
 endif()
 
 if (LLAMA_KOMPUTE)
+    add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
     find_package(Vulkan COMPONENTS glslc REQUIRED)
     find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
     if (NOT glslc_executable)
@@ -429,8 +430,11 @@ if (LLAMA_KOMPUTE)
         set(spv_file ${source}.spv)
         add_custom_command(
             OUTPUT ${spv_file}
-            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp
-            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute/op_getrows.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute/op_mul_mv_q_n.comp
+              COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
             COMMENT "Compiling ${source} to ${source}.spv"
         )
 
@@ -478,9 +482,11 @@ if (LLAMA_KOMPUTE)
           kompute/op_mul_mat_f16.comp
           kompute/op_mul_mat_q4_0.comp
           kompute/op_mul_mat_q4_1.comp
+          kompute/op_mul_mat_q6_k.comp
           kompute/op_getrows_f16.comp
           kompute/op_getrows_q4_0.comp
           kompute/op_getrows_q4_1.comp
+          kompute/op_getrows_q6_k.comp
           kompute/op_rope.comp
           kompute/op_cpy_f16_f16.comp
           kompute/op_cpy_f16_f32.comp
@@ -505,9 +511,11 @@ if (LLAMA_KOMPUTE)
           shaderop_mul_mat_f16.h
           shaderop_mul_mat_q4_0.h
           shaderop_mul_mat_q4_1.h
+          shaderop_mul_mat_q6_k.h
           shaderop_getrows_f16.h
           shaderop_getrows_q4_0.h
           shaderop_getrows_q4_1.h
+          shaderop_getrows_q6_k.h
           shaderop_rope.h
           shaderop_cpy_f16_f16.h
           shaderop_cpy_f16_f32.h

From de589ced7cea1e9d5a352668e905986a92efc866 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Tue, 3 Oct 2023 13:30:23 -0400
Subject: [PATCH 026/140] Change this back to be in agreement with metal and
 our previous softmax kernel.

---
 ggml-vulkan.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 1dd504127d6ee..2326f56b5067f 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -785,7 +785,7 @@ void ggml_vk_soft_max(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize;
         s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);

From bc4b5ed1cb2ea9bdf71c0ea4356bfcc7f4a988b3 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Wed, 4 Oct 2023 14:24:35 -0400
Subject: [PATCH 027/140] Fixes for subgroup size to bring AMD and NVIDIA
 inline with eachother for all kernels.

---
 ggml-vulkan.cpp              |  7 ++++---
 kompute/op_mul_mat_q6_k.comp | 27 +++++++++++++++++----------
 kompute/op_mul_mv_q_n.comp   |  7 +++++--
 kompute/op_softmax.comp      |  9 ++++++---
 4 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 2326f56b5067f..86794e8865142 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -785,7 +785,8 @@ void ggml_vk_soft_max(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize;
+        // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device
+        const uint32_t local_x = 32;
         s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
@@ -981,8 +982,8 @@ void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-//        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {2,32}, {pushConsts});
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp
index c7b9aa753c656..6148053b279e2 100644
--- a/kompute/op_mul_mat_q6_k.comp
+++ b/kompute/op_mul_mat_q6_k.comp
@@ -44,31 +44,38 @@ void main() {
     const uint r1 = gl_WorkGroupID.y;
     const uint r2 = gl_WorkGroupID.z;
 
-    const uint row = 2 * r0 + gl_SubgroupID;
+    const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
     const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
     const uint x = row * nb + offset0; // Based from inA without base offset
     const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
 
     float sumf = 0;
 
-    const uint tid  = gl_SubgroupInvocationID/2;
-    const uint ix   = gl_SubgroupInvocationID%2;
-    const uint ip   = tid/8;         // 0 or 1
-    const uint il   = tid%8;
-    const uint n    = 4;
-    const uint l0   = n*il;
-    const uint is   = 8*ip + l0/16;
+    // bits of invocation ID for gl_SubgroupSize=32:
+    //  x   x   x   x   x
+    //  4   3   2   1   0
+    // (     tid     ) ix
+    //  ip (   il    )
+
+    const uint block_stride = gl_SubgroupSize / 16;         // number of blocks each subgroup processes
+    const uint tid  = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
+    const uint ix   = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
+    const uint ip   = tid/8;        // first or second half of block (0 or 1)
+    const uint il   = tid%8;        // each half has 8 parts, one per scale
+    const uint n    = 4;            // 4 scales at a time (and 4 sums)
+    const uint l0   = n*il;         // offset into half-block, 0..28
+    const uint is   = 8*ip + l0/16; // 0, 1, 8, 9
 
     const uint y_offset = 128*ip + l0;
     const uint q_offset_l = 64*ip + l0;
     const uint q_offset_h = 32*ip + l0;
 
-    for (uint i = ix; i < nb; i += 2) {
+    for (uint i = ix; i < nb; i += block_stride) {
 
         const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
 
         const uint qlIndex = q_offset_l;
-        const uint q2Index = qlIndex + 32;
+        const uint q2Index = qlIndex + QK_K/8;
         const uint qhIndex = q_offset_h;
         const uint y = yy + i * QK_K + y_offset;
 
diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp
index 15bcbf765875b..a9b64fe167a29 100644
--- a/kompute/op_mul_mv_q_n.comp
+++ b/kompute/op_mul_mv_q_n.comp
@@ -7,6 +7,9 @@
  */
 
 void main() {
+    if (gl_SubgroupInvocationID > 31)
+        return;
+
     const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
@@ -28,13 +31,13 @@ void main() {
     //    gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
     //    gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
 
-    for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
+    for (uint ib = ix; ib < nb; ib += 16) {
         for (int row = 0; row < N_ROWS; row++) {
             const uint block_index = x + ib + row * nb;
             sumf[row] += block_q_n_dot_y(block_index, yb, il);
         }
 
-        yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2;
+        yb += BLOCKS_IN_QUANT * 16;
     }
 
     for (int row = 0; row < N_ROWS; ++row) {
diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
index d21577ac0f59c..30b6f0260e7d6 100644
--- a/kompute/op_softmax.comp
+++ b/kompute/op_softmax.comp
@@ -24,6 +24,9 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
+    if (gl_SubgroupInvocationID > 31)
+        return;
+
     const uint i03 = gl_WorkGroupID.z;
     const uint i02 = gl_WorkGroupID.y;
     const uint i01 = gl_WorkGroupID.x;
@@ -34,21 +37,21 @@ void main() {
 
     // parallel max
     float localMax = uintBitsToFloat(0xFF800000);
-    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
         localMax = max(localMax, in_[psrc0 + i00]);
     }
     float max_ = subgroupMax(localMax);
 
     // parallel sum
     float localSum = 0.0f;
-    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
         const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
         localSum += exp_psrc0;
         out_[pdst + i00] = exp_psrc0;
     }
 
     const float sum = subgroupAdd(localSum);
-    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
         out_[pdst + i00] /= sum;
     }
 }

From 24a4a5956af130148d6cee6bdb5397bf3e5ce824 Mon Sep 17 00:00:00 2001
From: Cebtenzzre <cebtenzzre@gmail.com>
Date: Wed, 4 Oct 2023 16:16:04 -0400
Subject: [PATCH 028/140] kompute : only try to use Vulkan for LLaMA itself

---
 llama.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 603f7cc64270f..6e7a53407ed9e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6492,7 +6492,9 @@ struct llama_context * llama_new_context_with_model(
 #undef LLAMA_METAL_CHECK_BUF
         }
 #elif defined(GGML_USE_KOMPUTE)
+    // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported
     if (ggml_vk_has_device() && params.n_gpu_layers > 0
+        && model->arch == LLM_ARCH_LLAMA
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
             || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0

From 3d850db7671a48dd290ea543859f3b594dc4e0a0 Mon Sep 17 00:00:00 2001
From: Cebtenzzre <cebtenzzre@gmail.com>
Date: Wed, 4 Oct 2023 16:19:19 -0400
Subject: [PATCH 029/140] kompute : remove Q6_K from list of supported quant
 types

---
 llama.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 6e7a53407ed9e..e792511947c89 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6498,8 +6498,7 @@ struct llama_context * llama_new_context_with_model(
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
             || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q6_K)) {
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
         // this allocates all Vulkan resources and memory buffers
         ctx->ctx_kompute = ggml_vk_init();
 

From 9db90cbe1215b7850c1b3cbc10508931f55a3141 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Wed, 4 Oct 2023 21:49:55 -0700
Subject: [PATCH 030/140] f16 mv broadcasting fix (gqa fix)

---
 ggml-vulkan.cpp             | 11 ++++++-----
 kompute/op_mul_mat_f16.comp |  9 +++++++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 86794e8865142..bf732be3215c4 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -884,7 +884,7 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
                          const std::shared_ptr<kp::Tensor>& inB,
                          const std::shared_ptr<kp::Tensor>& out,
                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                         int32_t ne00, int32_t ne01,
+                         int32_t ne00, int32_t ne01, int32_t ne02,
                          uint32_t nb01, uint32_t nb02,
                          int32_t ne11, int32_t ne12,
                          uint32_t nb11, uint32_t nb12,
@@ -897,20 +897,21 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
         int32_t ne00;
         uint32_t nb01, nb02;
         uint32_t nb11, nb12;
+        int32_t ne02, ne12;
         int32_t ne0, ne1;
     } pushConsts {
         safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, nb01, nb02, nb11, nb12, ne0, ne1,
+        ne00, nb01, nb02, nb11, nb12, ne02, ne12, ne0, ne1,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
         const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1332,7 +1333,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             switch (src0t) {
                                 case GGML_TYPE_F16:
                                 case GGML_TYPE_F32:
-                                    ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                                    ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
                                     break;
                                 case GGML_TYPE_Q4_0:
                                     ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp
index 72a667f925f4a..b56d14f770456 100644
--- a/kompute/op_mul_mat_f16.comp
+++ b/kompute/op_mul_mat_f16.comp
@@ -27,6 +27,8 @@ layout (push_constant) uniform parameter {
     uint nb02;
     uint nb11;
     uint nb12;
+    uint ne02;
+    uint ne12;
     int ne0;
     int ne1;
 } pcs;
@@ -36,8 +38,11 @@ void main() {
     const uint r1 = gl_WorkGroupID.y;
     const uint im = gl_WorkGroupID.z;
 
-    const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
-    const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+    uint bc_ab = pcs.ne12 > pcs.ne02 ? im / (pcs.ne12 / pcs.ne02) : im;
+    uint bc_ba = pcs.ne02 > pcs.ne12 ? im / (pcs.ne02 / pcs.ne12) : im;
+
+    const uint x = (r0*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
+    const uint y = (r1*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
 
     float sumf = 0.0f;
     for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {

From ff4212d20fcbc675106efb19c5278af60e18e97d Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Wed, 4 Oct 2023 21:02:17 -0700
Subject: [PATCH 031/140] q8 mat*vec

---
 CMakeLists.txt               |  2 ++
 ggml-vulkan.cpp              | 41 +++++++++++++++++++++++
 kompute/op_mul_mat_q8_0.comp | 64 ++++++++++++++++++++++++++++++++++++
 3 files changed, 107 insertions(+)
 create mode 100644 kompute/op_mul_mat_q8_0.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2445d177cc15f..c0538eb8869f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -480,6 +480,7 @@ if (LLAMA_KOMPUTE)
           kompute/op_rmsnorm.comp
           kompute/op_diagmask.comp
           kompute/op_mul_mat_f16.comp
+          kompute/op_mul_mat_q8_0.comp
           kompute/op_mul_mat_q4_0.comp
           kompute/op_mul_mat_q4_1.comp
           kompute/op_mul_mat_q6_k.comp
@@ -509,6 +510,7 @@ if (LLAMA_KOMPUTE)
           shaderop_rmsnorm.h
           shaderop_diagmask.h
           shaderop_mul_mat_f16.h
+          shaderop_mul_mat_q8_0.h
           shaderop_mul_mat_q4_0.h
           shaderop_mul_mat_q4_1.h
           shaderop_mul_mat_q6_k.h
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index bf732be3215c4..59852c6491ac3 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -23,6 +23,7 @@
 #include "shaderop_rmsnorm.h"
 #include "shaderop_diagmask.h"
 #include "shaderop_mul_mat_f16.h"
+#include "shaderop_mul_mat_q8_0.h"
 #include "shaderop_mul_mat_q4_0.h"
 #include "shaderop_mul_mat_q4_1.h"
 #include "shaderop_mul_mat_q6_k.h"
@@ -918,6 +919,43 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
+void ggml_vk_mul_mat_q8_0(kp::Sequence& seq,
+                         const std::shared_ptr<kp::Tensor>& inA,
+                         const std::shared_ptr<kp::Tensor>& inB,
+                         const std::shared_ptr<kp::Tensor>& out,
+                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                         int32_t ne00, int32_t ne01,
+                         uint32_t nb01, uint32_t nb02,
+                         int32_t ne11, int32_t ne12,
+                         uint32_t nb11, uint32_t nb12,
+                         int32_t ne0, int32_t ne1) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
+        kp::shader_data::op_mul_mat_q8_0_comp_spv_len);
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        int32_t ne0, ne1;
+    } pushConsts {
+        safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, nb01, nb02, nb11, nb12, ne0, ne1,
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
 void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
                           const std::shared_ptr<kp::Tensor>& inA,
                           const std::shared_ptr<kp::Tensor>& inB,
@@ -1335,6 +1373,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                                 case GGML_TYPE_F32:
                                     ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
                                     break;
+                                case GGML_TYPE_Q8_0:
+                                    ggml_vk_mul_mat_q8_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                                    break;
                                 case GGML_TYPE_Q4_0:
                                     ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
                                     break;
diff --git a/kompute/op_mul_mat_q8_0.comp b/kompute/op_mul_mat_q8_0.comp
new file mode 100644
index 0000000000000..2ba48127b7576
--- /dev/null
+++ b/kompute/op_mul_mat_q8_0.comp
@@ -0,0 +1,64 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#define BLOCKS_IN_QUANT QK8_0
+#define SIZE_OF_BLOCK sizeof_block_q8_0
+#define N_ROWS 4
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y = 1) in;
+layout(local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
+} pcs;
+
+#define ELS_PER_BLOCK 32
+#define SIZE_OF_D 2
+#define BLOCK_SIZE (ELS_PER_BLOCK + SIZE_OF_D)
+
+void main() {
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint im = gl_WorkGroupID.z;
+
+    const uint x = r0 * (pcs.ne00/ELS_PER_BLOCK) * BLOCK_SIZE + pcs.inAOff; // Based from inA
+    const uint y = r1 * pcs.ne10 + pcs.inBOff; // based from inB
+
+    float sumf = 0.0f;
+    for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
+        const uint block_number = i / ELS_PER_BLOCK;
+        const uint block_offset = block_number * BLOCK_SIZE;
+        const float d = u8BufToFloat16(inA, x + block_offset);
+        const uint position_in_block = i % ELS_PER_BLOCK;
+        const int q = int8_t(inA[x+block_offset+SIZE_OF_D+position_in_block]);
+        const float dq = d * q;
+        sumf += dq * float(inB[y+i]);
+    }
+
+    const float all_sum = subgroupAdd(sumf);
+    if (subgroupElect()) {
+        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
+    }
+}

From 020b1745a02e255fb059b575e0ca63248c84dd31 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Wed, 4 Oct 2023 23:36:24 -0700
Subject: [PATCH 032/140] vulkan: implement neox mode for rope

---
 kompute/op_rope.comp | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/kompute/op_rope.comp b/kompute/op_rope.comp
index 3fa84f5798851..8c28546369b26 100644
--- a/kompute/op_rope.comp
+++ b/kompute/op_rope.comp
@@ -63,6 +63,25 @@ void main() {
             out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
         }
     } else {
-        // TODO: implement
+        const float inv_ndims = -1.f/pcs.n_dims;
+        for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) {
+            for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
+                const float cos_theta = cos(theta);
+                const float sin_theta = sin(theta);
+
+                theta *= theta_scale;
+
+                const uint i0 = ib*pcs.n_dims + ic/2;
+
+                const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in
+                const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
+
+                const float x0 = in_[src];
+                const float x1 = in_[src+pcs.n_dims/2];
+
+                out_[dst_data] = x0*cos_theta - x1*sin_theta;
+                out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
+            }
+        }
     }
 }

From 8564f79036c724615f1677138d5e6ed5f61075ae Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Wed, 4 Oct 2023 21:03:27 -0700
Subject: [PATCH 033/140] falcon h2d + reenable vulkan

---
 llama.cpp | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e792511947c89..858494244543e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3210,6 +3210,9 @@ static struct ggml_cgraph * llm_build_falcon(
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
+#if defined(GGML_USE_KOMPUTE)
+    struct ggml_tensor * toDeviceTensor = nullptr;
+#endif
 
     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -3219,7 +3222,9 @@ static struct ggml_cgraph * llm_build_falcon(
             memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
-
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inp_tokens;
+#endif
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
 #ifdef GGML_USE_MPI
@@ -3232,6 +3237,9 @@ static struct ggml_cgraph * llm_build_falcon(
         if (!ggml_allocr_is_measure(lctx.alloc)) {
             memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
         }
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inpL;
+#endif
     }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3463,6 +3471,16 @@ static struct ggml_cgraph * llm_build_falcon(
     ggml_build_forward_expand(gf, cur);
 
     ggml_free(ctx0);
+ 
+#if defined(GGML_USE_KOMPUTE)
+    if (lctx.ctx_kompute) {
+        if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
+            ggml_vk_h2d_all(lctx.ctx_kompute);
+        } else {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
+        }
+    }
+#endif
 
     return gf;
 }
@@ -6494,7 +6512,7 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_KOMPUTE)
     // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported
     if (ggml_vk_has_device() && params.n_gpu_layers > 0
-        && model->arch == LLM_ARCH_LLAMA
+        && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
             || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0

From 09d83f04013f9e8551c3ff54449cf28e1ca00784 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Thu, 5 Oct 2023 10:52:04 -0400
Subject: [PATCH 034/140] Delete TODO now that we have q8_0.

---
 llama.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 858494244543e..f5e0eac81b5fa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6510,7 +6510,6 @@ struct llama_context * llama_new_context_with_model(
 #undef LLAMA_METAL_CHECK_BUF
         }
 #elif defined(GGML_USE_KOMPUTE)
-    // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported
     if (ggml_vk_has_device() && params.n_gpu_layers > 0
         && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
         && (model->ftype == LLAMA_FTYPE_ALL_F32

From f0cd38b9adfa2105c2a19c4fd02edf71e1d1135a Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Tue, 10 Oct 2023 21:37:07 -0700
Subject: [PATCH 035/140] add mat*mat ops

---
 CMakeLists.txt                   |   8 +
 ggml-vulkan.cpp                  | 266 ++++++++++++++++++++++++++++++-
 kompute/op_mul_mat_mat_f16.comp  |  56 +++++++
 kompute/op_mul_mat_mat_f32.comp  |  53 ++++++
 kompute/op_mul_mat_mat_q4_0.comp |  77 +++++++++
 kompute/op_mul_mat_mat_q8_0.comp |  66 ++++++++
 llama.cpp                        |   2 +-
 7 files changed, 521 insertions(+), 7 deletions(-)
 create mode 100644 kompute/op_mul_mat_mat_f16.comp
 create mode 100644 kompute/op_mul_mat_mat_f32.comp
 create mode 100644 kompute/op_mul_mat_mat_q4_0.comp
 create mode 100644 kompute/op_mul_mat_mat_q8_0.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0538eb8869f6..cf4042ea3fa8c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -479,6 +479,10 @@ if (LLAMA_KOMPUTE)
           kompute/op_norm.comp
           kompute/op_rmsnorm.comp
           kompute/op_diagmask.comp
+          kompute/op_mul_mat_mat_f16.comp
+          kompute/op_mul_mat_mat_f32.comp
+          kompute/op_mul_mat_mat_q4_0.comp
+          kompute/op_mul_mat_mat_q8_0.comp
           kompute/op_mul_mat_f16.comp
           kompute/op_mul_mat_q8_0.comp
           kompute/op_mul_mat_q4_0.comp
@@ -509,6 +513,10 @@ if (LLAMA_KOMPUTE)
           shaderop_norm.h
           shaderop_rmsnorm.h
           shaderop_diagmask.h
+          shaderop_mul_mat_mat_f16.h
+          shaderop_mul_mat_mat_f32.h
+          shaderop_mul_mat_mat_q4_0.h
+          shaderop_mul_mat_mat_q8_0.h
           shaderop_mul_mat_f16.h
           shaderop_mul_mat_q8_0.h
           shaderop_mul_mat_q4_0.h
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 59852c6491ac3..6ae1a8fc3098b 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -27,6 +27,10 @@
 #include "shaderop_mul_mat_q4_0.h"
 #include "shaderop_mul_mat_q4_1.h"
 #include "shaderop_mul_mat_q6_k.h"
+#include "shaderop_mul_mat_mat_f32.h"
+#include "shaderop_mul_mat_mat_f16.h"
+#include "shaderop_mul_mat_mat_q4_0.h"
+#include "shaderop_mul_mat_mat_q8_0.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
@@ -938,7 +942,7 @@ void ggml_vk_mul_mat_q8_0(kp::Sequence& seq,
         uint32_t nb11, nb12;
         int32_t ne0, ne1;
     } pushConsts {
-        safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, nb01, nb02, nb11, nb12, ne0, ne1,
     };
 
@@ -956,6 +960,211 @@ void ggml_vk_mul_mat_q8_0(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
+
+void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
+                         const std::shared_ptr<kp::Tensor>& inA,
+                         const std::shared_ptr<kp::Tensor>& inB,
+                         const std::shared_ptr<kp::Tensor>& out,
+                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                         int32_t ne00, int32_t ne01, int32_t ne02,
+                         uint32_t nb01, uint32_t nb02,
+                         int32_t ne11, int32_t ne12,
+                         uint32_t nb11, uint32_t nb12,
+                         uint32_t nb1, uint32_t nb2) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv,
+        kp::shader_data::op_mul_mat_mat_f32_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne01, ne02, ne11, ne12;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        uint32_t nb1, nb2;
+    } pushConsts {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02, ne11, ne12,
+        nb01, nb02, nb11, nb12,
+        nb1, nb2
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        //std::cerr << "init f32 matmat shader" << std::endl;
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
+        {inA, inB, out}, spirv,
+        {unsigned(ne01),
+         unsigned(ne11),
+         unsigned(ne12)},
+        {},
+        {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01),
+                              unsigned(ne11),
+                              unsigned(std::max(ne12, ne02))});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    //seq.record<kp::OpTensorFill>({out});
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+void ggml_vk_mul_mat_mat_f16(kp::Sequence& seq,
+                          const std::shared_ptr<kp::Tensor>& inA,
+                          const std::shared_ptr<kp::Tensor>& inB,
+                          const std::shared_ptr<kp::Tensor>& out,
+                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                         int32_t ne00, int32_t ne01, int32_t ne02,
+                         uint32_t nb01, uint32_t nb02,
+                         int32_t ne11, int32_t ne12,
+                         uint32_t nb11, uint32_t nb12,
+                         uint32_t nb1, uint32_t nb2) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f16_comp_spv,
+        kp::shader_data::op_mul_mat_mat_f16_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne01, ne02, ne11, ne12;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        uint32_t nb1, nb2;
+    } pushConsts {
+        safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02, ne11, ne12,
+        nb01, nb02, nb11, nb12,
+        nb1, nb2
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
+        {inA, inB, out}, spirv,
+        {unsigned(ne01),
+         unsigned(ne11),
+         unsigned(std::max(ne12, ne02))
+         },
+        {},
+        {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01),
+                              unsigned(ne11),
+                              unsigned(std::max(ne12, ne02)),
+                              });
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+
+void ggml_vk_mul_mat_mat_q8_0(
+                         kp::Sequence& seq,
+                         const std::shared_ptr<kp::Tensor>& inA,
+                         const std::shared_ptr<kp::Tensor>& inB,
+                         const std::shared_ptr<kp::Tensor>& out,
+                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                         int32_t ne00, int32_t ne01, int32_t ne02,
+                         uint32_t nb01, uint32_t nb02,
+                         int32_t ne11, int32_t ne12,
+                         uint32_t nb11, uint32_t nb12,
+                         uint32_t nb1, uint32_t nb2) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q8_0_comp_spv,
+        kp::shader_data::op_mul_mat_mat_q8_0_comp_spv_len);
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne01, ne02, ne11, ne12;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        uint32_t nb1, nb2;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02, ne11, ne12,
+        nb01, nb02, nb11, nb12,
+        nb1, nb2
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
+        {inA, inB, out}, spirv,
+        {unsigned(ne01),
+         unsigned(ne11),
+         unsigned(std::max(ne12, ne02))
+         },
+        {},
+        {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01),
+                              unsigned(ne11),
+                              unsigned(std::max(ne12, ne02)),
+                              });
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+
+void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
+                         kp::Sequence& seq,
+                         const std::shared_ptr<kp::Tensor>& inA,
+                         const std::shared_ptr<kp::Tensor>& inB,
+                         const std::shared_ptr<kp::Tensor>& out,
+                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                         int32_t ne00, int32_t ne01, int32_t ne02,
+                         uint32_t nb01, uint32_t nb02,
+                         int32_t ne11, int32_t ne12,
+                         uint32_t nb11, uint32_t nb12,
+                         uint32_t nb1, uint32_t nb2) {
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne01, ne02, ne11, ne12;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        uint32_t nb1, nb2;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02, ne11, ne12,
+        nb01, nb02, nb11, nb12,
+        nb1, nb2
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
+        {inA, inB, out}, spirv,
+        {unsigned(ne01),
+         unsigned(ne11),
+         unsigned(std::max(ne12, ne02))},
+        {},
+        {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01),
+                              unsigned(ne11),
+                              unsigned(std::max(ne12, ne02)),
+                              });
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+
+template <typename... Args>
+void ggml_vk_mul_mat_mat_q4_0(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q4_0_comp_spv,
+        kp::shader_data::op_mul_mat_mat_q4_0_comp_spv_len);
+
+    ggml_vk_mul_mat_mat_q4_x(spirv, std::forward<Args>(args)...);
+}
+
 void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
                           const std::shared_ptr<kp::Tensor>& inA,
                           const std::shared_ptr<kp::Tensor>& inB,
@@ -1357,16 +1566,61 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 case GGML_OP_MUL_MAT:
                     {
                         if (src1t != GGML_TYPE_F32) {
-                            fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                            fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                             goto not_implemented;
                         }
 
                         if (!ggml_is_transposed(src0)
                             && !ggml_is_transposed(src1)
-                            && ne00%32 == 0
-                            && ne11 > 1) {
-                            fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
-                            goto not_implemented;
+                            //&& ne00%32 == 0
+                            && ne11 > 1
+                            ) {
+                            switch (src0t) {
+                                case GGML_TYPE_F32:
+                                    ggml_vk_mul_mat_mat_f32(seq,
+                                        id_src0, id_src1, id_dst,
+                                        off_src0, off_src1, off_dst,
+                                        ne00, ne01, ne02,
+                                        nb01, nb02,
+                                        ne11, ne12,
+                                        nb11, nb12,
+                                        nb1, nb2);
+                                    break;
+                                case GGML_TYPE_F16:
+                                    ggml_vk_mul_mat_mat_f16(seq,
+                                        id_src0, id_src1, id_dst,
+                                        off_src0, off_src1, off_dst,
+                                        ne00, ne01, ne02,
+                                        nb01, nb02,
+                                        ne11, ne12,
+                                        nb11, nb12,
+                                        nb1, nb2);
+                                    break;
+                                case GGML_TYPE_Q4_0:
+                                    ggml_vk_mul_mat_mat_q4_0(seq,
+                                        id_src0, id_src1, id_dst,
+                                        off_src0, off_src1, off_dst,
+                                        ne00, ne01, ne02,
+                                        nb01, nb02,
+                                        ne11, ne12,
+                                        nb11, nb12,
+                                        nb1, nb2);
+                                    break;
+                                case GGML_TYPE_Q8_0:
+                                    ggml_vk_mul_mat_mat_q8_0(seq,
+                                        id_src0, id_src1, id_dst,
+                                        off_src0, off_src1, off_dst,
+                                        ne00, ne01, ne02,
+                                        nb01, nb02,
+                                        ne11, ne12,
+                                        nb11, nb12,
+                                        nb1, nb2);
+                                    break;
+                                default: {
+                                    fprintf(stderr, "%s: %s: Unsupported quantization for M*M: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                                    goto not_implemented;
+                                }
+                            }
                         } else {
                             switch (src0t) {
                                 case GGML_TYPE_F16:
diff --git a/kompute/op_mul_mat_mat_f16.comp b/kompute/op_mul_mat_mat_f16.comp
new file mode 100644
index 0000000000000..b62f06d109945
--- /dev/null
+++ b/kompute/op_mul_mat_mat_f16.comp
@@ -0,0 +1,56 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models
+ * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
+ * of this license should accompany this software. Except as expressly granted
+ * in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+// layout(local_size_x = 8) in;
+
+layout(binding = 0) readonly buffer tensorInA { float16_t inA[]; };
+layout(binding = 1) readonly buffer tensorInB { float inB[]; };
+layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout(push_constant) uniform parameter {
+  uint inAOff;
+  uint inBOff;
+  uint outOff;
+  int ne00;
+  int ne01;
+  int ne02;
+  int ne11;
+  int ne12;
+  uint nb01;
+  uint nb02;
+  uint nb11;
+  uint nb12;
+  uint nb1;
+  uint nb2;
+}
+pcs;
+
+
+void main() {
+  uvec3 gid = gl_GlobalInvocationID;
+
+  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
+  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
+
+  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
+  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+  float sum = 0.0f;
+  for (uint i = 0; i < pcs.ne00; i ++) {
+      sum += float(inA[x+i]) * float(inB[y+i]);
+  }
+
+  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+}
\ No newline at end of file
diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute/op_mul_mat_mat_f32.comp
new file mode 100644
index 0000000000000..6234322ca4b4e
--- /dev/null
+++ b/kompute/op_mul_mat_mat_f32.comp
@@ -0,0 +1,53 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models
+ * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
+ * of this license should accompany this software. Except as expressly granted
+ * in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+// layout(local_size_x = 8) in;
+
+layout(binding = 0) readonly buffer tensorInA { float inA[]; };
+layout(binding = 1) readonly buffer tensorInB { float inB[]; };
+layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout(push_constant) uniform parameter {
+  uint inAOff;
+  uint inBOff;
+  uint outOff;
+  int ne00;
+  int ne01;
+  int ne02;
+  int ne11;
+  int ne12;
+  uint nb01;
+  uint nb02;
+  uint nb11;
+  uint nb12;
+  uint nb1;
+  uint nb2;
+}
+pcs;
+
+
+void main() {
+  uvec3 gid = gl_GlobalInvocationID;
+
+  const uint x = (gid.x*pcs.nb01 + gid.z/(pcs.ne12/pcs.ne02)*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
+  const uint y = (gid.y*pcs.nb11 + gid.z/(pcs.ne02/pcs.ne12)*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+  float sum = 0.0f;
+  for (uint i = 0; i < pcs.ne00; i ++) {
+      sum += float(inA[x+i]) * float(inB[y+i]);
+  }
+
+  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+}
diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp
new file mode 100644
index 0000000000000..93dcfdaedd9bb
--- /dev/null
+++ b/kompute/op_mul_mat_mat_q4_0.comp
@@ -0,0 +1,77 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models
+ * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
+ * of this license should accompany this software. Except as expressly granted
+ * in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+// layout(local_size_x = 8) in;
+
+layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout(binding = 1) readonly buffer tensorInB { float inB[]; };
+layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout(push_constant) uniform parameter {
+  uint inAOff;
+  uint inBOff;
+  uint outOff;
+  int ne00;
+  int ne01;
+  int ne02;
+  int ne11;
+  int ne12;
+  uint nb01;
+  uint nb02;
+  uint nb11;
+  uint nb12;
+  uint nb1;
+  uint nb2;
+}
+pcs;
+
+#define ELS_PER_BLOCK 32
+#define QS_OFFSET 2
+#define BLOCK_SIZE ((ELS_PER_BLOCK / 2) + QS_OFFSET)
+
+void main() {
+  uvec3 gid = gl_GlobalInvocationID;
+
+  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
+  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
+
+
+  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
+  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+  float sum = 0.0f;
+  for (uint i = 0; i < pcs.ne00; i+=ELS_PER_BLOCK) {
+    for (uint j = 0; j < ELS_PER_BLOCK / 2; j++) {
+      const uint block_number = i / ELS_PER_BLOCK;
+      const uint block_offset = block_number * BLOCK_SIZE;
+      const float d = u8BufToFloat16(inA, x + block_offset);
+      const uint byte_position_in_block = j;
+      const int q0 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] & 0x0F) - 8;
+      const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >>   4) - 8;
+      const float dq0 = d * q0;
+      const float dq1 = d * q1;
+      // if (gid.x == 0 && gid.y == 0 && gid.z == 0 && i < 4 && j < 4) {
+      //   debugPrintfEXT("shp=%d,%d,%d gid=%d,%d,%d i=%d, d=%f, q0=%d, q1=%d, dqs=%f,%f\n",
+      //     pcs.ne01, pcs.ne11, pcs.ne12,
+      //     gid.x, gid.y, gid.z, i, d, q0, q1, dq0, dq1
+      //   );
+      // }
+      sum += (dq0 * float(inB[y+i+j])) + \
+             (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)]));
+    }
+  }
+
+  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+}
\ No newline at end of file
diff --git a/kompute/op_mul_mat_mat_q8_0.comp b/kompute/op_mul_mat_mat_q8_0.comp
new file mode 100644
index 0000000000000..715e533e215b3
--- /dev/null
+++ b/kompute/op_mul_mat_mat_q8_0.comp
@@ -0,0 +1,66 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models
+ * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
+ * of this license should accompany this software. Except as expressly granted
+ * in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+// layout(local_size_x = 8) in;
+
+layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout(binding = 1) readonly buffer tensorInB { float inB[]; };
+layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout(push_constant) uniform parameter {
+  uint inAOff;
+  uint inBOff;
+  uint outOff;
+  int ne00;
+  int ne01;
+  int ne02;
+  int ne11;
+  int ne12;
+  uint nb01;
+  uint nb02;
+  uint nb11;
+  uint nb12;
+  uint nb1;
+  uint nb2;
+}
+pcs;
+
+#define ELS_PER_BLOCK 32
+#define QS_OFFSET 2 // d
+#define BLOCK_SIZE (ELS_PER_BLOCK + 2)
+
+void main() {
+  uvec3 gid = gl_GlobalInvocationID;
+
+  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
+  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
+
+
+  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
+  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+  float sum = 0.0f;
+  for (uint i = 0; i < pcs.ne00; i++) {
+      const uint block_number = i / ELS_PER_BLOCK;
+      const uint block_offset = block_number * BLOCK_SIZE;
+      const float d = u8BufToFloat16(inA, x + block_offset);
+      const uint position_in_block = i % ELS_PER_BLOCK;
+      const int q0 = int8_t(inA[x+block_offset+QS_OFFSET+position_in_block]);
+      const float dq0 = d * q0;
+      sum += (dq0 * float(inB[y+i]));
+  }
+
+  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+}
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index f5e0eac81b5fa..0ff459ba53891 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3855,7 +3855,7 @@ static bool llama_eval_internal(
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #elif defined(GGML_USE_KOMPUTE)
-    if (lctx.ctx_kompute && N == 1) {
+    if (lctx.ctx_kompute) { // && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, gf);
         ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
     } else {

From 46385ee0d52f38fc7db2a0ec3a071ae8d1bd6511 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Tue, 10 Oct 2023 21:38:18 -0700
Subject: [PATCH 036/140] misc vulkan cleanup

make pushconts consistent w/ dispatch, avoid a double free
---
 ggml-vulkan.cpp     | 4 +++-
 kompute/op_add.comp | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 6ae1a8fc3098b..a0a2a9b0ef6fb 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -307,7 +307,9 @@ bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) {
 void ggml_vk_free(struct ggml_kompute_context * ctx) {
     assert(ctx == s_kompute_context);
     s_kompute_context = nullptr;
-    delete ctx;
+    if (ctx != nullptr) {
+        delete ctx;
+    }
 }
 
 static
diff --git a/kompute/op_add.comp b/kompute/op_add.comp
index 019a68449e3c3..f242864ddf933 100644
--- a/kompute/op_add.comp
+++ b/kompute/op_add.comp
@@ -20,7 +20,6 @@ layout(push_constant) uniform PushConstants {
     uint inAOff;
     uint inBOff;
     uint outOff;
-    uint row;
 } pcs;
 
 void main() {

From 3327d84a7fba14ad0b2778982013a88c808a1132 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Wed, 11 Oct 2023 16:02:53 -0700
Subject: [PATCH 037/140] perf: use bigger threadgroups in mm

---
 ggml-vulkan.cpp                  | 2 +-
 kompute/op_mul_mat_mat_q4_0.comp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index a0a2a9b0ef6fb..57813cb3d4563 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1148,7 +1148,7 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01),
+        s_algo->setWorkgroup({unsigned(ne01)/32,
                               unsigned(ne11),
                               unsigned(std::max(ne12, ne02)),
                               });
diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp
index 93dcfdaedd9bb..994aadc8a674a 100644
--- a/kompute/op_mul_mat_mat_q4_0.comp
+++ b/kompute/op_mul_mat_mat_q4_0.comp
@@ -14,7 +14,7 @@
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 
-// layout(local_size_x = 8) in;
+layout(local_size_x = 32) in;
 
 layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout(binding = 1) readonly buffer tensorInB { float inB[]; };

From d5741c07a53f86f4d987b7e22f87a72e1da70e46 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Wed, 11 Oct 2023 18:40:07 -0700
Subject: [PATCH 038/140] use op param epsilon for norms

---
 ggml-vulkan.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 57813cb3d4563..f2320f3cc119b 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -810,12 +810,10 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
                    const std::shared_ptr<kp::Tensor>& out,
                    uint32_t inOff, uint32_t outOff,
                    int32_t ne00, int32_t nb01,
-                   int32_t nrows) {
+                   int32_t nrows, float epsilon) {
     GGML_ASSERT(nb01%sizeof(float) == 0);
     GGML_ASSERT(ne00%sizeof(float) == 0);
 
-    const float epsilon = 1e-6f; // this is what ggml.c uses for rms norm
-
     struct PushConstants {
         uint32_t inOff, outOff;
         uint32_t ne00, nb01;
@@ -1559,11 +1557,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_NORM:
                     {
-                        ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0));
+                        float eps;
+                        memcpy(&eps, dst->op_params, sizeof(float));
+                        ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
                     } break;
                 case GGML_OP_RMS_NORM:
                     {
-                        ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0));
+                        float eps;
+                        memcpy(&eps, dst->op_params, sizeof(float));
+                        ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
                     } break;
                 case GGML_OP_MUL_MAT:
                     {

From b78a94bc6d72c42bf1f1ac9a867ef232ddc26b04 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Wed, 11 Oct 2023 17:10:42 -0700
Subject: [PATCH 039/140] q6k mm works

---
 CMakeLists.txt                   |  2 +
 ggml-vulkan.cpp                  | 61 +++++++++++++++++++++-
 kompute/op_mul_mat_mat_q6_k.comp | 88 ++++++++++++++++++++++++++++++++
 3 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 kompute/op_mul_mat_mat_q6_k.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf4042ea3fa8c..fbbb46bbf4d60 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -483,6 +483,7 @@ if (LLAMA_KOMPUTE)
           kompute/op_mul_mat_mat_f32.comp
           kompute/op_mul_mat_mat_q4_0.comp
           kompute/op_mul_mat_mat_q8_0.comp
+          kompute/op_mul_mat_mat_q6_k.comp
           kompute/op_mul_mat_f16.comp
           kompute/op_mul_mat_q8_0.comp
           kompute/op_mul_mat_q4_0.comp
@@ -517,6 +518,7 @@ if (LLAMA_KOMPUTE)
           shaderop_mul_mat_mat_f32.h
           shaderop_mul_mat_mat_q4_0.h
           shaderop_mul_mat_mat_q8_0.h
+          shaderop_mul_mat_mat_q6_k.h
           shaderop_mul_mat_f16.h
           shaderop_mul_mat_q8_0.h
           shaderop_mul_mat_q4_0.h
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index f2320f3cc119b..488683ec3cd4a 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -31,6 +31,7 @@
 #include "shaderop_mul_mat_mat_f16.h"
 #include "shaderop_mul_mat_mat_q4_0.h"
 #include "shaderop_mul_mat_mat_q8_0.h"
+#include "shaderop_mul_mat_mat_q6_k.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
@@ -1109,6 +1110,54 @@ void ggml_vk_mul_mat_mat_q8_0(
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
+void ggml_vk_mul_mat_mat_q6_k(
+                         kp::Sequence& seq,
+                         const std::shared_ptr<kp::Tensor>& inA,
+                         const std::shared_ptr<kp::Tensor>& inB,
+                         const std::shared_ptr<kp::Tensor>& out,
+                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                         int32_t ne00, int32_t ne01, int32_t ne02,
+                         uint32_t nb01, uint32_t nb02,
+                         int32_t ne11, int32_t ne12,
+                         uint32_t nb11, uint32_t nb12,
+                         uint32_t nb1, uint32_t nb2) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q6_k_comp_spv,
+        kp::shader_data::op_mul_mat_mat_q6_k_comp_spv_len);
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne01, ne02, ne11, ne12;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        uint32_t nb1, nb2;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02, ne11, ne12,
+        nb01, nb02, nb11, nb12,
+        nb1, nb2
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
+        {inA, inB, out}, spirv,
+        {unsigned(ne01)/32,
+         unsigned(ne11),
+         unsigned(std::max(ne12, ne02))
+         },
+        {},
+        {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01)/32,
+                              unsigned(ne11),
+                              unsigned(std::max(ne12, ne02)),
+                              });
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
 
 void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
                          kp::Sequence& seq,
@@ -1138,7 +1187,7 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
     if (!komputeManager()->hasAlgorithm(__func__)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
         {inA, inB, out}, spirv,
-        {unsigned(ne01),
+        {unsigned(ne01)/32,
          unsigned(ne11),
          unsigned(std::max(ne12, ne02))},
         {},
@@ -1619,6 +1668,16 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                                         ne11, ne12,
                                         nb11, nb12,
                                         nb1, nb2);
+                                break;
+                                case GGML_TYPE_Q6_K:
+                                    ggml_vk_mul_mat_mat_q6_k(seq,
+                                        id_src0, id_src1, id_dst,
+                                        off_src0, off_src1, off_dst,
+                                        ne00, ne01, ne02,
+                                        nb01, nb02,
+                                        ne11, ne12,
+                                        nb11, nb12,
+                                        nb1, nb2);
                                     break;
                                 default: {
                                     fprintf(stderr, "%s: %s: Unsupported quantization for M*M: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
diff --git a/kompute/op_mul_mat_mat_q6_k.comp b/kompute/op_mul_mat_mat_q6_k.comp
new file mode 100644
index 0000000000000..127f17df669c2
--- /dev/null
+++ b/kompute/op_mul_mat_mat_q6_k.comp
@@ -0,0 +1,88 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models
+ * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
+ * of this license should accompany this software. Except as expressly granted
+ * in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+layout(local_size_x = 32) in;
+
+layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout(binding = 1) readonly buffer tensorInB { float inB[]; };
+layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout(push_constant) uniform parameter {
+  uint inAOff;
+  uint inBOff;
+  uint outOff;
+  int ne00;
+  int ne01;
+  int ne02;
+  int ne11;
+  int ne12;
+  uint nb01;
+  uint nb02;
+  uint nb11;
+  uint nb12;
+  uint nb1;
+  uint nb2;
+}
+pcs;
+
+
+#define ELS_PER_BLOCK 256  //QK_K
+#define QH_OFFSET (ELS_PER_BLOCK / 2)
+#define QSCALES_OFFSET (QH_OFFSET + (ELS_PER_BLOCK / 4))
+#define SCALE_SCALE_OFFSET (QSCALES_OFFSET + (ELS_PER_BLOCK / 16))
+#define BLOCK_SIZE (SCALE_SCALE_OFFSET + 2)
+
+void main() {
+  uvec3 gid = gl_GlobalInvocationID;
+
+  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
+  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
+
+  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
+  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+
+  float sum = 0.0f;
+  const uint n_blocks = pcs.ne00 / ELS_PER_BLOCK;
+  // this is pretty much all lifted right from dequantize_row_q6_K
+  uint outoff = 0;
+  for (uint i = 0; i < n_blocks; i++) {
+    const uint block_number = i;
+    const uint block_offset = block_number * BLOCK_SIZE;
+    const float scales_d = u8BufToFloat16(inA, x + block_offset + SCALE_SCALE_OFFSET);
+    uint qloff = block_offset;
+    uint qhoff = block_offset + QH_OFFSET;
+    uint scoff = block_offset + QSCALES_OFFSET;
+    for (int n = 0; n < 256; n += 128) {
+        for (int l = 0; l < 32; ++l) {
+            int is = l/16;
+            const int q1 = int((inA[x + qloff + l +  0] & 0xF) | (((inA[x + qhoff + l] >> 0) & 3) << 4)) - 32;
+            const int q2 = int((inA[x + qloff + l + 32] & 0xF) | (((inA[x + qhoff + l] >> 2) & 3) << 4)) - 32;
+            const int q3 = int((inA[x + qloff + l +  0]  >> 4) | (((inA[x + qhoff + l] >> 4) & 3) << 4)) - 32;
+            const int q4 = int((inA[x + qloff + l + 32]  >> 4) | (((inA[x + qhoff + l] >> 6) & 3) << 4)) - 32;
+            sum += inB[y + outoff + l +  0] * scales_d * int8_t(inA[x + scoff + is + 0]) * q1;
+            sum += inB[y + outoff + l + 32] * scales_d * int8_t(inA[x + scoff + is + 2]) * q2;
+            sum += inB[y + outoff + l + 64] * scales_d * int8_t(inA[x + scoff + is + 4]) * q3;
+            sum += inB[y + outoff + l + 96] * scales_d * int8_t(inA[x + scoff + is + 6]) * q4;
+        }
+        outoff += 128;
+        qloff += 64;
+        qhoff += 32;
+        scoff += 8;
+    }
+  }
+
+  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+}
\ No newline at end of file

From 4809890d805ff27752fd344a281250888a86acdd Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Thu, 12 Oct 2023 10:23:09 -0700
Subject: [PATCH 040/140] rm commented dbg print

---
 kompute/op_mul_mat_mat_q4_0.comp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp
index 994aadc8a674a..aecd04cca6a92 100644
--- a/kompute/op_mul_mat_mat_q4_0.comp
+++ b/kompute/op_mul_mat_mat_q4_0.comp
@@ -62,12 +62,6 @@ void main() {
       const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >>   4) - 8;
       const float dq0 = d * q0;
       const float dq1 = d * q1;
-      // if (gid.x == 0 && gid.y == 0 && gid.z == 0 && i < 4 && j < 4) {
-      //   debugPrintfEXT("shp=%d,%d,%d gid=%d,%d,%d i=%d, d=%f, q0=%d, q1=%d, dqs=%f,%f\n",
-      //     pcs.ne01, pcs.ne11, pcs.ne12,
-      //     gid.x, gid.y, gid.z, i, d, q0, q1, dq0, dq1
-      //   );
-      // }
       sum += (dq0 * float(inB[y+i+j])) + \
              (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)]));
     }

From cd0257ed0d748465d5753eeff74dffea92d91641 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Thu, 12 Oct 2023 11:22:31 -0700
Subject: [PATCH 041/140] q4_1 mat*mat

---
 CMakeLists.txt                   |  2 +
 ggml-vulkan.cpp                  | 19 +++++++++
 kompute/op_mul_mat_mat_q4_1.comp | 73 ++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 kompute/op_mul_mat_mat_q4_1.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fbbb46bbf4d60..df6b53dce7be6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -482,6 +482,7 @@ if (LLAMA_KOMPUTE)
           kompute/op_mul_mat_mat_f16.comp
           kompute/op_mul_mat_mat_f32.comp
           kompute/op_mul_mat_mat_q4_0.comp
+          kompute/op_mul_mat_mat_q4_1.comp
           kompute/op_mul_mat_mat_q8_0.comp
           kompute/op_mul_mat_mat_q6_k.comp
           kompute/op_mul_mat_f16.comp
@@ -517,6 +518,7 @@ if (LLAMA_KOMPUTE)
           shaderop_mul_mat_mat_f16.h
           shaderop_mul_mat_mat_f32.h
           shaderop_mul_mat_mat_q4_0.h
+          shaderop_mul_mat_mat_q4_1.h
           shaderop_mul_mat_mat_q8_0.h
           shaderop_mul_mat_mat_q6_k.h
           shaderop_mul_mat_f16.h
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 488683ec3cd4a..56f15310dc8c6 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -30,6 +30,7 @@
 #include "shaderop_mul_mat_mat_f32.h"
 #include "shaderop_mul_mat_mat_f16.h"
 #include "shaderop_mul_mat_mat_q4_0.h"
+#include "shaderop_mul_mat_mat_q4_1.h"
 #include "shaderop_mul_mat_mat_q8_0.h"
 #include "shaderop_mul_mat_mat_q6_k.h"
 #include "shaderop_getrows_f16.h"
@@ -1214,6 +1215,14 @@ void ggml_vk_mul_mat_mat_q4_0(Args&&... args) {
     ggml_vk_mul_mat_mat_q4_x(spirv, std::forward<Args>(args)...);
 }
 
+template <typename... Args>
+void ggml_vk_mul_mat_mat_q4_1(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q4_1_comp_spv,
+        kp::shader_data::op_mul_mat_mat_q4_1_comp_spv_len);
+
+    ggml_vk_mul_mat_mat_q4_x(spirv, std::forward<Args>(args)...);
+}
+
 void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
                           const std::shared_ptr<kp::Tensor>& inA,
                           const std::shared_ptr<kp::Tensor>& inB,
@@ -1659,6 +1668,16 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                                         nb11, nb12,
                                         nb1, nb2);
                                     break;
+                                case GGML_TYPE_Q4_1:
+                                    ggml_vk_mul_mat_mat_q4_1(seq,
+                                        id_src0, id_src1, id_dst,
+                                        off_src0, off_src1, off_dst,
+                                        ne00, ne01, ne02,
+                                        nb01, nb02,
+                                        ne11, ne12,
+                                        nb11, nb12,
+                                        nb1, nb2);
+                                    break;
                                 case GGML_TYPE_Q8_0:
                                     ggml_vk_mul_mat_mat_q8_0(seq,
                                         id_src0, id_src1, id_dst,
diff --git a/kompute/op_mul_mat_mat_q4_1.comp b/kompute/op_mul_mat_mat_q4_1.comp
new file mode 100644
index 0000000000000..d7fbc96db4d58
--- /dev/null
+++ b/kompute/op_mul_mat_mat_q4_1.comp
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models
+ * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
+ * of this license should accompany this software. Except as expressly granted
+ * in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+layout(local_size_x = 32) in;
+
+layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout(binding = 1) readonly buffer tensorInB { float inB[]; };
+layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout(push_constant) uniform parameter {
+  uint inAOff;
+  uint inBOff;
+  uint outOff;
+  int ne00;
+  int ne01;
+  int ne02;
+  int ne11;
+  int ne12;
+  uint nb01;
+  uint nb02;
+  uint nb11;
+  uint nb12;
+  uint nb1;
+  uint nb2;
+}
+pcs;
+
+#define ELS_PER_BLOCK 32
+#define M_OFFSET 2
+#define QS_OFFSET 4
+#define BLOCK_SIZE ((ELS_PER_BLOCK / 2) + QS_OFFSET)
+
+void main() {
+  uvec3 gid = gl_GlobalInvocationID;
+
+  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
+  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
+
+
+  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
+  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+  float sum = 0.0f;
+  for (uint i = 0; i < pcs.ne00; i+=ELS_PER_BLOCK) {
+    for (uint j = 0; j < ELS_PER_BLOCK / 2; j++) {
+      const uint block_number = i / ELS_PER_BLOCK;
+      const uint block_offset = block_number * BLOCK_SIZE;
+      const float d = u8BufToFloat16(inA, x + block_offset);
+      const float m = u8BufToFloat16(inA, x + block_offset + M_OFFSET);
+      const uint byte_position_in_block = j;
+      const int q0 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] & 0x0F);
+      const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >>   4);
+      const float dq0 = (d * q0) + m;
+      const float dq1 = (d * q1) + m;
+      sum += (dq0 * float(inB[y+i+j])) + \
+             (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)]));
+    }
+  }
+
+  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+}

From 8dc79ac380942a8a0006ff7123d1b126130cba3c Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Thu, 12 Oct 2023 11:46:30 -0700
Subject: [PATCH 042/140] clean up vulkan/cpu switch

---
 llama.cpp | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 0ff459ba53891..3afbebe2ab320 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3855,19 +3855,11 @@ static bool llama_eval_internal(
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #elif defined(GGML_USE_KOMPUTE)
-    if (lctx.ctx_kompute) { // && N == 1) {
+    if (lctx.ctx_kompute) {
         ggml_vk_graph_compute(lctx.ctx_kompute, gf);
         ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
     } else {
-        if (lctx.ctx_kompute) {
-            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
-            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
-        }
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
-        if (lctx.ctx_kompute) {
-            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);
-            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v);
-        }
     }
 #else
     ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);

From 9bc52ebae313c028c2293c260d12d0d0049c5ea1 Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Fri, 13 Oct 2023 11:10:02 -0700
Subject: [PATCH 043/140] attempted speedups

---
 ggml-vulkan.cpp                  |  9 ++++----
 kompute/op_mul_mat_mat_q4_0.comp | 38 ++++++++++++++++++--------------
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 56f15310dc8c6..67270a3c77f30 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1186,17 +1186,18 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
         {inA, inB, out}, spirv,
-        {unsigned(ne01)/32,
+        {unsigned(ne01),
          unsigned(ne11),
          unsigned(std::max(ne12, ne02))},
-        {},
+        {local_x, 4},
         {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01)/32,
+        s_algo->setWorkgroup({unsigned(ne01),
                               unsigned(ne11),
                               unsigned(std::max(ne12, ne02)),
                               });
diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp
index aecd04cca6a92..80a1ff6270b50 100644
--- a/kompute/op_mul_mat_mat_q4_0.comp
+++ b/kompute/op_mul_mat_mat_q4_0.comp
@@ -14,7 +14,9 @@
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 
-layout(local_size_x = 32) in;
+layout (local_size_x_id = 0) in;
+layout (local_size_y_id = 1) in;
+layout (constant_id = 1) const uint nsg = 2;
 
 layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout(binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -38,34 +40,38 @@ layout(push_constant) uniform parameter {
 }
 pcs;
 
-#define ELS_PER_BLOCK 32
-#define QS_OFFSET 2
-#define BLOCK_SIZE ((ELS_PER_BLOCK / 2) + QS_OFFSET)
+const uint els_per_block = 32;
+const uint qs_offset = 2;
+const uint block_size = (els_per_block / 2) + qs_offset;
+
 
 void main() {
-  uvec3 gid = gl_GlobalInvocationID;
+  uvec3 gid = gl_WorkGroupID;
+  uvec3 lid = gl_LocalInvocationID;
+  gid.y = gid.y * nsg + lid.y;
 
   uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
   uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
 
-
   const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
   const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
   float sum = 0.0f;
-  for (uint i = 0; i < pcs.ne00; i+=ELS_PER_BLOCK) {
-    for (uint j = 0; j < ELS_PER_BLOCK / 2; j++) {
-      const uint block_number = i / ELS_PER_BLOCK;
-      const uint block_offset = block_number * BLOCK_SIZE;
+  for (uint i = gl_SubgroupInvocationID * 2; i < pcs.ne00; i+=gl_SubgroupSize * 2) {
+      const uint block_number = i / els_per_block;
+      const uint block_offset = block_number * block_size;
       const float d = u8BufToFloat16(inA, x + block_offset);
+      const uint j = (i % els_per_block) / 2;
       const uint byte_position_in_block = j;
-      const int q0 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] & 0x0F) - 8;
-      const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >>   4) - 8;
+      const int q0 = (inA[x+block_offset+qs_offset+byte_position_in_block] & 0x0F) - 8;
+      const int q1 = (inA[x+block_offset+qs_offset+byte_position_in_block] >>   4) - 8;
       const float dq0 = d * q0;
       const float dq1 = d * q1;
-      sum += (dq0 * float(inB[y+i+j])) + \
-             (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)]));
-    }
+      const uint block_base = block_number * els_per_block;
+      sum += (dq0 * float(inB[y+block_base+j])) + \
+             (dq1 * float(inB[y+block_base+j+(els_per_block/2)]));
   }
 
-  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+  const float all_sum = subgroupAdd(sum);
+  if (subgroupElect())
+    out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
 }
\ No newline at end of file

From c1fd64548d2c8d42eaedae940c619a6cf2d9741f Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Fri, 13 Oct 2023 13:14:36 -0700
Subject: [PATCH 044/140] attempted speedups 2

---
 ggml-vulkan.cpp                  | 24 +++++++++++++-----------
 kompute/op_mul_mat_mat_f16.comp  | 12 ++++++++----
 kompute/op_mul_mat_mat_f32.comp  | 21 ++++++++++++++-------
 kompute/op_mul_mat_mat_q6_k.comp |  2 +-
 4 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 67270a3c77f30..010f49226e2ff 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -989,26 +989,27 @@ void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
         nb1, nb2
     };
 
+    const uint32_t local_x = ggml_vk_current_device().subgroupSize;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        //std::cerr << "init f32 matmat shader" << std::endl;
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
         {inA, inB, out}, spirv,
         {unsigned(ne01),
          unsigned(ne11),
-         unsigned(ne12)},
-        {},
+         unsigned(std::max(ne12, ne02))
+         },
+        {local_x},
         {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01),
                               unsigned(ne11),
-                              unsigned(std::max(ne12, ne02))});
+                              unsigned(std::max(ne12, ne02)),
+                              });
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
-    //seq.record<kp::OpTensorFill>({out});
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
@@ -1038,15 +1039,16 @@ void ggml_vk_mul_mat_mat_f16(kp::Sequence& seq,
         nb1, nb2
     };
 
+    const uint32_t local_x = ggml_vk_current_device().subgroupSize;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
         {inA, inB, out}, spirv,
         {unsigned(ne01),
          unsigned(ne11),
          unsigned(std::max(ne12, ne02))
          },
-        {},
+        {local_x},
         {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
@@ -1141,7 +1143,7 @@ void ggml_vk_mul_mat_mat_q6_k(
     if (!komputeManager()->hasAlgorithm(__func__)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
         {inA, inB, out}, spirv,
-        {unsigned(ne01)/32,
+        {unsigned(ne01)/256,
          unsigned(ne11),
          unsigned(std::max(ne12, ne02))
          },
@@ -1150,7 +1152,7 @@ void ggml_vk_mul_mat_mat_q6_k(
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01)/32,
+        s_algo->setWorkgroup({unsigned(ne01)/256,
                               unsigned(ne11),
                               unsigned(std::max(ne12, ne02)),
                               });
@@ -1192,7 +1194,7 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
         {unsigned(ne01),
          unsigned(ne11),
          unsigned(std::max(ne12, ne02))},
-        {local_x, 4},
+        {local_x, 1},
         {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
diff --git a/kompute/op_mul_mat_mat_f16.comp b/kompute/op_mul_mat_mat_f16.comp
index b62f06d109945..03872fed5e709 100644
--- a/kompute/op_mul_mat_mat_f16.comp
+++ b/kompute/op_mul_mat_mat_f16.comp
@@ -14,7 +14,8 @@
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 
-// layout(local_size_x = 8) in;
+// device subgroup size
+layout (local_size_x_id = 0) in;
 
 layout(binding = 0) readonly buffer tensorInA { float16_t inA[]; };
 layout(binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -40,7 +41,7 @@ pcs;
 
 
 void main() {
-  uvec3 gid = gl_GlobalInvocationID;
+  uvec3 gid = gl_WorkGroupID;
 
   uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
   uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
@@ -48,9 +49,12 @@ void main() {
   const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
   const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
   float sum = 0.0f;
-  for (uint i = 0; i < pcs.ne00; i ++) {
+  for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
       sum += float(inA[x+i]) * float(inB[y+i]);
   }
 
-  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+  const float all_sum = subgroupAdd(sum);
+  if (subgroupElect()) {
+    out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
+  }
 }
\ No newline at end of file
diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute/op_mul_mat_mat_f32.comp
index 6234322ca4b4e..a2dba05608fc7 100644
--- a/kompute/op_mul_mat_mat_f32.comp
+++ b/kompute/op_mul_mat_mat_f32.comp
@@ -14,7 +14,8 @@
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 
-// layout(local_size_x = 8) in;
+// device subgroup size
+layout (local_size_x_id = 0) in;
 
 layout(binding = 0) readonly buffer tensorInA { float inA[]; };
 layout(binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -40,14 +41,20 @@ pcs;
 
 
 void main() {
-  uvec3 gid = gl_GlobalInvocationID;
+  uvec3 gid = gl_WorkGroupID;
 
-  const uint x = (gid.x*pcs.nb01 + gid.z/(pcs.ne12/pcs.ne02)*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
-  const uint y = (gid.y*pcs.nb11 + gid.z/(pcs.ne02/pcs.ne12)*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
+  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
+
+  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
+  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
   float sum = 0.0f;
-  for (uint i = 0; i < pcs.ne00; i ++) {
+  for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
       sum += float(inA[x+i]) * float(inB[y+i]);
   }
 
-  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
-}
+  const float all_sum = subgroupAdd(sum);
+  if (subgroupElect()) {
+    out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
+  }
+}
\ No newline at end of file
diff --git a/kompute/op_mul_mat_mat_q6_k.comp b/kompute/op_mul_mat_mat_q6_k.comp
index 127f17df669c2..8e3e44d7de732 100644
--- a/kompute/op_mul_mat_mat_q6_k.comp
+++ b/kompute/op_mul_mat_mat_q6_k.comp
@@ -14,7 +14,7 @@
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 
-layout(local_size_x = 32) in;
+layout(local_size_x = 256) in;
 
 layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout(binding = 1) readonly buffer tensorInB { float inB[]; };

From cc05a602d6e20d514330cd424cb45438ff87f9ea Mon Sep 17 00:00:00 2001
From: Aaron Miller <apage43@ninjawhale.com>
Date: Mon, 16 Oct 2023 10:00:25 -0700
Subject: [PATCH 045/140] use mat*vec shaders for mat*mat

I wrote the mat*mat shaders from scratch so I understand them better but
they are currently not faster than just multiply-invoking the mat*vec
shaders, by a significant degree - so, except for f32 which needed a new
shader, revert to the m*v ones here.
---
 CMakeLists.txt                   |  10 -
 ggml-vulkan.cpp                  | 330 +++----------------------------
 kompute/op_mul_mat_mat_f16.comp  |  60 ------
 kompute/op_mul_mat_mat_q4_0.comp |  77 --------
 kompute/op_mul_mat_mat_q4_1.comp |  73 -------
 kompute/op_mul_mat_mat_q6_k.comp |  88 ---------
 kompute/op_mul_mat_mat_q8_0.comp |  66 -------
 7 files changed, 27 insertions(+), 677 deletions(-)
 delete mode 100644 kompute/op_mul_mat_mat_f16.comp
 delete mode 100644 kompute/op_mul_mat_mat_q4_0.comp
 delete mode 100644 kompute/op_mul_mat_mat_q4_1.comp
 delete mode 100644 kompute/op_mul_mat_mat_q6_k.comp
 delete mode 100644 kompute/op_mul_mat_mat_q8_0.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df6b53dce7be6..33a8bdd17ea7a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -479,12 +479,7 @@ if (LLAMA_KOMPUTE)
           kompute/op_norm.comp
           kompute/op_rmsnorm.comp
           kompute/op_diagmask.comp
-          kompute/op_mul_mat_mat_f16.comp
           kompute/op_mul_mat_mat_f32.comp
-          kompute/op_mul_mat_mat_q4_0.comp
-          kompute/op_mul_mat_mat_q4_1.comp
-          kompute/op_mul_mat_mat_q8_0.comp
-          kompute/op_mul_mat_mat_q6_k.comp
           kompute/op_mul_mat_f16.comp
           kompute/op_mul_mat_q8_0.comp
           kompute/op_mul_mat_q4_0.comp
@@ -515,12 +510,7 @@ if (LLAMA_KOMPUTE)
           shaderop_norm.h
           shaderop_rmsnorm.h
           shaderop_diagmask.h
-          shaderop_mul_mat_mat_f16.h
           shaderop_mul_mat_mat_f32.h
-          shaderop_mul_mat_mat_q4_0.h
-          shaderop_mul_mat_mat_q4_1.h
-          shaderop_mul_mat_mat_q8_0.h
-          shaderop_mul_mat_mat_q6_k.h
           shaderop_mul_mat_f16.h
           shaderop_mul_mat_q8_0.h
           shaderop_mul_mat_q4_0.h
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 010f49226e2ff..08042330fde9b 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -28,11 +28,6 @@
 #include "shaderop_mul_mat_q4_1.h"
 #include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_mul_mat_mat_f32.h"
-#include "shaderop_mul_mat_mat_f16.h"
-#include "shaderop_mul_mat_mat_q4_0.h"
-#include "shaderop_mul_mat_mat_q4_1.h"
-#include "shaderop_mul_mat_mat_q8_0.h"
-#include "shaderop_mul_mat_mat_q6_k.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
@@ -1013,219 +1008,6 @@ void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_mul_mat_mat_f16(kp::Sequence& seq,
-                          const std::shared_ptr<kp::Tensor>& inA,
-                          const std::shared_ptr<kp::Tensor>& inB,
-                          const std::shared_ptr<kp::Tensor>& out,
-                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                         int32_t ne00, int32_t ne01, int32_t ne02,
-                         uint32_t nb01, uint32_t nb02,
-                         int32_t ne11, int32_t ne12,
-                         uint32_t nb11, uint32_t nb12,
-                         uint32_t nb1, uint32_t nb2) {
-    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f16_comp_spv,
-        kp::shader_data::op_mul_mat_mat_f16_comp_spv_len);
-
-    struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne01, ne02, ne11, ne12;
-        uint32_t nb01, nb02;
-        uint32_t nb11, nb12;
-        uint32_t nb1, nb2;
-    } pushConsts {
-        safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne01, ne02, ne11, ne12,
-        nb01, nb02, nb11, nb12,
-        nb1, nb2
-    };
-
-    const uint32_t local_x = ggml_vk_current_device().subgroupSize;
-    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
-        {inA, inB, out}, spirv,
-        {unsigned(ne01),
-         unsigned(ne11),
-         unsigned(std::max(ne12, ne02))
-         },
-        {local_x},
-        {pushConsts});
-    } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
-        s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01),
-                              unsigned(ne11),
-                              unsigned(std::max(ne12, ne02)),
-                              });
-        s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
-    }
-    seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-
-void ggml_vk_mul_mat_mat_q8_0(
-                         kp::Sequence& seq,
-                         const std::shared_ptr<kp::Tensor>& inA,
-                         const std::shared_ptr<kp::Tensor>& inB,
-                         const std::shared_ptr<kp::Tensor>& out,
-                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                         int32_t ne00, int32_t ne01, int32_t ne02,
-                         uint32_t nb01, uint32_t nb02,
-                         int32_t ne11, int32_t ne12,
-                         uint32_t nb11, uint32_t nb12,
-                         uint32_t nb1, uint32_t nb2) {
-    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q8_0_comp_spv,
-        kp::shader_data::op_mul_mat_mat_q8_0_comp_spv_len);
-    struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne01, ne02, ne11, ne12;
-        uint32_t nb01, nb02;
-        uint32_t nb11, nb12;
-        uint32_t nb1, nb2;
-    } pushConsts {
-        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne01, ne02, ne11, ne12,
-        nb01, nb02, nb11, nb12,
-        nb1, nb2
-    };
-
-    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
-        {inA, inB, out}, spirv,
-        {unsigned(ne01),
-         unsigned(ne11),
-         unsigned(std::max(ne12, ne02))
-         },
-        {},
-        {pushConsts});
-    } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
-        s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01),
-                              unsigned(ne11),
-                              unsigned(std::max(ne12, ne02)),
-                              });
-        s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
-    }
-    seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-void ggml_vk_mul_mat_mat_q6_k(
-                         kp::Sequence& seq,
-                         const std::shared_ptr<kp::Tensor>& inA,
-                         const std::shared_ptr<kp::Tensor>& inB,
-                         const std::shared_ptr<kp::Tensor>& out,
-                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                         int32_t ne00, int32_t ne01, int32_t ne02,
-                         uint32_t nb01, uint32_t nb02,
-                         int32_t ne11, int32_t ne12,
-                         uint32_t nb11, uint32_t nb12,
-                         uint32_t nb1, uint32_t nb2) {
-    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q6_k_comp_spv,
-        kp::shader_data::op_mul_mat_mat_q6_k_comp_spv_len);
-    struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne01, ne02, ne11, ne12;
-        uint32_t nb01, nb02;
-        uint32_t nb11, nb12;
-        uint32_t nb1, nb2;
-    } pushConsts {
-        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne01, ne02, ne11, ne12,
-        nb01, nb02, nb11, nb12,
-        nb1, nb2
-    };
-
-    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(),
-        {inA, inB, out}, spirv,
-        {unsigned(ne01)/256,
-         unsigned(ne11),
-         unsigned(std::max(ne12, ne02))
-         },
-        {},
-        {pushConsts});
-    } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
-        s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01)/256,
-                              unsigned(ne11),
-                              unsigned(std::max(ne12, ne02)),
-                              });
-        s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
-    }
-    seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
-                         kp::Sequence& seq,
-                         const std::shared_ptr<kp::Tensor>& inA,
-                         const std::shared_ptr<kp::Tensor>& inB,
-                         const std::shared_ptr<kp::Tensor>& out,
-                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                         int32_t ne00, int32_t ne01, int32_t ne02,
-                         uint32_t nb01, uint32_t nb02,
-                         int32_t ne11, int32_t ne12,
-                         uint32_t nb11, uint32_t nb12,
-                         uint32_t nb1, uint32_t nb2) {
-    struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne01, ne02, ne11, ne12;
-        uint32_t nb01, nb02;
-        uint32_t nb11, nb12;
-        uint32_t nb1, nb2;
-    } pushConsts {
-        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne01, ne02, ne11, ne12,
-        nb01, nb02, nb11, nb12,
-        nb1, nb2
-    };
-
-    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
-        {inA, inB, out}, spirv,
-        {unsigned(ne01),
-         unsigned(ne11),
-         unsigned(std::max(ne12, ne02))},
-        {local_x, 1},
-        {pushConsts});
-    } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
-        s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01),
-                              unsigned(ne11),
-                              unsigned(std::max(ne12, ne02)),
-                              });
-        s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
-    }
-    seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-
-template <typename... Args>
-void ggml_vk_mul_mat_mat_q4_0(Args&&... args) {
-    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q4_0_comp_spv,
-        kp::shader_data::op_mul_mat_mat_q4_0_comp_spv_len);
-
-    ggml_vk_mul_mat_mat_q4_x(spirv, std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-void ggml_vk_mul_mat_mat_q4_1(Args&&... args) {
-    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q4_1_comp_spv,
-        kp::shader_data::op_mul_mat_mat_q4_1_comp_spv_len);
-
-    ggml_vk_mul_mat_mat_q4_x(spirv, std::forward<Args>(args)...);
-}
-
 void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
                           const std::shared_ptr<kp::Tensor>& inA,
                           const std::shared_ptr<kp::Tensor>& inB,
@@ -1635,54 +1417,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             goto not_implemented;
                         }
 
-                        if (!ggml_is_transposed(src0)
-                            && !ggml_is_transposed(src1)
-                            //&& ne00%32 == 0
-                            && ne11 > 1
-                            ) {
-                            switch (src0t) {
-                                case GGML_TYPE_F32:
-                                    ggml_vk_mul_mat_mat_f32(seq,
-                                        id_src0, id_src1, id_dst,
-                                        off_src0, off_src1, off_dst,
-                                        ne00, ne01, ne02,
-                                        nb01, nb02,
-                                        ne11, ne12,
-                                        nb11, nb12,
-                                        nb1, nb2);
-                                    break;
-                                case GGML_TYPE_F16:
-                                    ggml_vk_mul_mat_mat_f16(seq,
-                                        id_src0, id_src1, id_dst,
-                                        off_src0, off_src1, off_dst,
-                                        ne00, ne01, ne02,
-                                        nb01, nb02,
-                                        ne11, ne12,
-                                        nb11, nb12,
-                                        nb1, nb2);
-                                    break;
-                                case GGML_TYPE_Q4_0:
-                                    ggml_vk_mul_mat_mat_q4_0(seq,
-                                        id_src0, id_src1, id_dst,
-                                        off_src0, off_src1, off_dst,
-                                        ne00, ne01, ne02,
-                                        nb01, nb02,
-                                        ne11, ne12,
-                                        nb11, nb12,
-                                        nb1, nb2);
-                                    break;
-                                case GGML_TYPE_Q4_1:
-                                    ggml_vk_mul_mat_mat_q4_1(seq,
-                                        id_src0, id_src1, id_dst,
-                                        off_src0, off_src1, off_dst,
-                                        ne00, ne01, ne02,
-                                        nb01, nb02,
-                                        ne11, ne12,
-                                        nb11, nb12,
-                                        nb1, nb2);
-                                    break;
-                                case GGML_TYPE_Q8_0:
-                                    ggml_vk_mul_mat_mat_q8_0(seq,
+                        if (ggml_is_transposed(src0) ||
+                            ggml_is_transposed(src1)) {
+                            fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                            goto not_implemented;
+                        } 
+
+                        switch (src0t) {        
+                            case GGML_TYPE_F32:
+                                ggml_vk_mul_mat_mat_f32(seq,
                                         id_src0, id_src1, id_dst,
                                         off_src0, off_src1, off_dst,
                                         ne00, ne01, ne02,
@@ -1690,46 +1433,27 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                                         ne11, ne12,
                                         nb11, nb12,
                                         nb1, nb2);
+                            case GGML_TYPE_F16:
+                                ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
                                 break;
-                                case GGML_TYPE_Q6_K:
-                                    ggml_vk_mul_mat_mat_q6_k(seq,
-                                        id_src0, id_src1, id_dst,
-                                        off_src0, off_src1, off_dst,
-                                        ne00, ne01, ne02,
-                                        nb01, nb02,
-                                        ne11, ne12,
-                                        nb11, nb12,
-                                        nb1, nb2);
-                                    break;
-                                default: {
-                                    fprintf(stderr, "%s: %s: Unsupported quantization for M*M: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
-                                    goto not_implemented;
-                                }
-                            }
-                        } else {
-                            switch (src0t) {
-                                case GGML_TYPE_F16:
-                                case GGML_TYPE_F32:
-                                    ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
-                                    break;
-                                case GGML_TYPE_Q8_0:
-                                    ggml_vk_mul_mat_q8_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
-                                    break;
-                                case GGML_TYPE_Q4_0:
-                                    ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
-                                    break;
-                                case GGML_TYPE_Q4_1:
-                                    ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
-                                    break;
-                                case GGML_TYPE_Q6_K:
-                                    ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
-                                    break;
-                                default: {
-                                    fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
-                                    goto not_implemented;
-                                }
+                            case GGML_TYPE_Q8_0:
+                                ggml_vk_mul_mat_q8_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                                break;
+                            case GGML_TYPE_Q4_0:
+                                ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
+                            case GGML_TYPE_Q4_1:
+                                ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
+                            case GGML_TYPE_Q6_K:
+                                ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
+                            default: {
+                                fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                                goto not_implemented;
                             }
                         }
+                        
                     } break;
                 case GGML_OP_GET_ROWS:
                     {
diff --git a/kompute/op_mul_mat_mat_f16.comp b/kompute/op_mul_mat_mat_f16.comp
deleted file mode 100644
index 03872fed5e709..0000000000000
--- a/kompute/op_mul_mat_mat_f16.comp
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models
- * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
- * of this license should accompany this software. Except as expressly granted
- * in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#include "common.comp"
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-
-// device subgroup size
-layout (local_size_x_id = 0) in;
-
-layout(binding = 0) readonly buffer tensorInA { float16_t inA[]; };
-layout(binding = 1) readonly buffer tensorInB { float inB[]; };
-layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout(push_constant) uniform parameter {
-  uint inAOff;
-  uint inBOff;
-  uint outOff;
-  int ne00;
-  int ne01;
-  int ne02;
-  int ne11;
-  int ne12;
-  uint nb01;
-  uint nb02;
-  uint nb11;
-  uint nb12;
-  uint nb1;
-  uint nb2;
-}
-pcs;
-
-
-void main() {
-  uvec3 gid = gl_WorkGroupID;
-
-  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
-  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
-
-  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
-  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
-  float sum = 0.0f;
-  for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
-      sum += float(inA[x+i]) * float(inB[y+i]);
-  }
-
-  const float all_sum = subgroupAdd(sum);
-  if (subgroupElect()) {
-    out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
-  }
-}
\ No newline at end of file
diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp
deleted file mode 100644
index 80a1ff6270b50..0000000000000
--- a/kompute/op_mul_mat_mat_q4_0.comp
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models
- * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
- * of this license should accompany this software. Except as expressly granted
- * in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#include "common.comp"
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-
-layout (local_size_x_id = 0) in;
-layout (local_size_y_id = 1) in;
-layout (constant_id = 1) const uint nsg = 2;
-
-layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout(binding = 1) readonly buffer tensorInB { float inB[]; };
-layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout(push_constant) uniform parameter {
-  uint inAOff;
-  uint inBOff;
-  uint outOff;
-  int ne00;
-  int ne01;
-  int ne02;
-  int ne11;
-  int ne12;
-  uint nb01;
-  uint nb02;
-  uint nb11;
-  uint nb12;
-  uint nb1;
-  uint nb2;
-}
-pcs;
-
-const uint els_per_block = 32;
-const uint qs_offset = 2;
-const uint block_size = (els_per_block / 2) + qs_offset;
-
-
-void main() {
-  uvec3 gid = gl_WorkGroupID;
-  uvec3 lid = gl_LocalInvocationID;
-  gid.y = gid.y * nsg + lid.y;
-
-  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
-  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
-
-  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
-  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
-  float sum = 0.0f;
-  for (uint i = gl_SubgroupInvocationID * 2; i < pcs.ne00; i+=gl_SubgroupSize * 2) {
-      const uint block_number = i / els_per_block;
-      const uint block_offset = block_number * block_size;
-      const float d = u8BufToFloat16(inA, x + block_offset);
-      const uint j = (i % els_per_block) / 2;
-      const uint byte_position_in_block = j;
-      const int q0 = (inA[x+block_offset+qs_offset+byte_position_in_block] & 0x0F) - 8;
-      const int q1 = (inA[x+block_offset+qs_offset+byte_position_in_block] >>   4) - 8;
-      const float dq0 = d * q0;
-      const float dq1 = d * q1;
-      const uint block_base = block_number * els_per_block;
-      sum += (dq0 * float(inB[y+block_base+j])) + \
-             (dq1 * float(inB[y+block_base+j+(els_per_block/2)]));
-  }
-
-  const float all_sum = subgroupAdd(sum);
-  if (subgroupElect())
-    out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
-}
\ No newline at end of file
diff --git a/kompute/op_mul_mat_mat_q4_1.comp b/kompute/op_mul_mat_mat_q4_1.comp
deleted file mode 100644
index d7fbc96db4d58..0000000000000
--- a/kompute/op_mul_mat_mat_q4_1.comp
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models
- * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
- * of this license should accompany this software. Except as expressly granted
- * in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#include "common.comp"
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-
-layout(local_size_x = 32) in;
-
-layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout(binding = 1) readonly buffer tensorInB { float inB[]; };
-layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout(push_constant) uniform parameter {
-  uint inAOff;
-  uint inBOff;
-  uint outOff;
-  int ne00;
-  int ne01;
-  int ne02;
-  int ne11;
-  int ne12;
-  uint nb01;
-  uint nb02;
-  uint nb11;
-  uint nb12;
-  uint nb1;
-  uint nb2;
-}
-pcs;
-
-#define ELS_PER_BLOCK 32
-#define M_OFFSET 2
-#define QS_OFFSET 4
-#define BLOCK_SIZE ((ELS_PER_BLOCK / 2) + QS_OFFSET)
-
-void main() {
-  uvec3 gid = gl_GlobalInvocationID;
-
-  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
-  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
-
-
-  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
-  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
-  float sum = 0.0f;
-  for (uint i = 0; i < pcs.ne00; i+=ELS_PER_BLOCK) {
-    for (uint j = 0; j < ELS_PER_BLOCK / 2; j++) {
-      const uint block_number = i / ELS_PER_BLOCK;
-      const uint block_offset = block_number * BLOCK_SIZE;
-      const float d = u8BufToFloat16(inA, x + block_offset);
-      const float m = u8BufToFloat16(inA, x + block_offset + M_OFFSET);
-      const uint byte_position_in_block = j;
-      const int q0 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] & 0x0F);
-      const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >>   4);
-      const float dq0 = (d * q0) + m;
-      const float dq1 = (d * q1) + m;
-      sum += (dq0 * float(inB[y+i+j])) + \
-             (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)]));
-    }
-  }
-
-  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
-}
diff --git a/kompute/op_mul_mat_mat_q6_k.comp b/kompute/op_mul_mat_mat_q6_k.comp
deleted file mode 100644
index 8e3e44d7de732..0000000000000
--- a/kompute/op_mul_mat_mat_q6_k.comp
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models
- * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
- * of this license should accompany this software. Except as expressly granted
- * in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#include "common.comp"
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-
-layout(local_size_x = 256) in;
-
-layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout(binding = 1) readonly buffer tensorInB { float inB[]; };
-layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout(push_constant) uniform parameter {
-  uint inAOff;
-  uint inBOff;
-  uint outOff;
-  int ne00;
-  int ne01;
-  int ne02;
-  int ne11;
-  int ne12;
-  uint nb01;
-  uint nb02;
-  uint nb11;
-  uint nb12;
-  uint nb1;
-  uint nb2;
-}
-pcs;
-
-
-#define ELS_PER_BLOCK 256  //QK_K
-#define QH_OFFSET (ELS_PER_BLOCK / 2)
-#define QSCALES_OFFSET (QH_OFFSET + (ELS_PER_BLOCK / 4))
-#define SCALE_SCALE_OFFSET (QSCALES_OFFSET + (ELS_PER_BLOCK / 16))
-#define BLOCK_SIZE (SCALE_SCALE_OFFSET + 2)
-
-void main() {
-  uvec3 gid = gl_GlobalInvocationID;
-
-  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
-  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
-
-  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
-  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
-
-  float sum = 0.0f;
-  const uint n_blocks = pcs.ne00 / ELS_PER_BLOCK;
-  // this is pretty much all lifted right from dequantize_row_q6_K
-  uint outoff = 0;
-  for (uint i = 0; i < n_blocks; i++) {
-    const uint block_number = i;
-    const uint block_offset = block_number * BLOCK_SIZE;
-    const float scales_d = u8BufToFloat16(inA, x + block_offset + SCALE_SCALE_OFFSET);
-    uint qloff = block_offset;
-    uint qhoff = block_offset + QH_OFFSET;
-    uint scoff = block_offset + QSCALES_OFFSET;
-    for (int n = 0; n < 256; n += 128) {
-        for (int l = 0; l < 32; ++l) {
-            int is = l/16;
-            const int q1 = int((inA[x + qloff + l +  0] & 0xF) | (((inA[x + qhoff + l] >> 0) & 3) << 4)) - 32;
-            const int q2 = int((inA[x + qloff + l + 32] & 0xF) | (((inA[x + qhoff + l] >> 2) & 3) << 4)) - 32;
-            const int q3 = int((inA[x + qloff + l +  0]  >> 4) | (((inA[x + qhoff + l] >> 4) & 3) << 4)) - 32;
-            const int q4 = int((inA[x + qloff + l + 32]  >> 4) | (((inA[x + qhoff + l] >> 6) & 3) << 4)) - 32;
-            sum += inB[y + outoff + l +  0] * scales_d * int8_t(inA[x + scoff + is + 0]) * q1;
-            sum += inB[y + outoff + l + 32] * scales_d * int8_t(inA[x + scoff + is + 2]) * q2;
-            sum += inB[y + outoff + l + 64] * scales_d * int8_t(inA[x + scoff + is + 4]) * q3;
-            sum += inB[y + outoff + l + 96] * scales_d * int8_t(inA[x + scoff + is + 6]) * q4;
-        }
-        outoff += 128;
-        qloff += 64;
-        qhoff += 32;
-        scoff += 8;
-    }
-  }
-
-  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
-}
\ No newline at end of file
diff --git a/kompute/op_mul_mat_mat_q8_0.comp b/kompute/op_mul_mat_mat_q8_0.comp
deleted file mode 100644
index 715e533e215b3..0000000000000
--- a/kompute/op_mul_mat_mat_q8_0.comp
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models
- * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
- * of this license should accompany this software. Except as expressly granted
- * in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#include "common.comp"
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-
-// layout(local_size_x = 8) in;
-
-layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout(binding = 1) readonly buffer tensorInB { float inB[]; };
-layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout(push_constant) uniform parameter {
-  uint inAOff;
-  uint inBOff;
-  uint outOff;
-  int ne00;
-  int ne01;
-  int ne02;
-  int ne11;
-  int ne12;
-  uint nb01;
-  uint nb02;
-  uint nb11;
-  uint nb12;
-  uint nb1;
-  uint nb2;
-}
-pcs;
-
-#define ELS_PER_BLOCK 32
-#define QS_OFFSET 2 // d
-#define BLOCK_SIZE (ELS_PER_BLOCK + 2)
-
-void main() {
-  uvec3 gid = gl_GlobalInvocationID;
-
-  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
-  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
-
-
-  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
-  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
-  float sum = 0.0f;
-  for (uint i = 0; i < pcs.ne00; i++) {
-      const uint block_number = i / ELS_PER_BLOCK;
-      const uint block_offset = block_number * BLOCK_SIZE;
-      const float d = u8BufToFloat16(inA, x + block_offset);
-      const uint position_in_block = i % ELS_PER_BLOCK;
-      const int q0 = int8_t(inA[x+block_offset+QS_OFFSET+position_in_block]);
-      const float dq0 = d * q0;
-      sum += (dq0 * float(inB[y+i]));
-  }
-
-  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
-}
\ No newline at end of file

From 21841d31635b34a03d63d762af726f1dfae1ca4e Mon Sep 17 00:00:00 2001
From: cebtenzzre <cebtenzzre@gmail.com>
Date: Mon, 16 Oct 2023 16:51:41 -0400
Subject: [PATCH 046/140] kompute : enable kp_logger and make it static (#8)

---
 CMakeLists.txt                    | 1 +
 kompute/src/logger/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 33a8bdd17ea7a..d26aedaf3e47b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -463,6 +463,7 @@ if (LLAMA_KOMPUTE)
 
     if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         message(STATUS "Kompute found")
+        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
         add_subdirectory(kompute)
 
         # Compile our shaders
diff --git a/kompute/src/logger/CMakeLists.txt b/kompute/src/logger/CMakeLists.txt
index 1dcc1e6b5a9c9..1f8695acd2673 100644
--- a/kompute/src/logger/CMakeLists.txt
+++ b/kompute/src/logger/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.20)
 
 set(LOGGER_SOURCES Logger.cpp)
 
-add_library(kp_logger ${LOGGER_SOURCES})
+add_library(kp_logger STATIC ${LOGGER_SOURCES})
 
 # Define log levels in code
 add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0)

From cbc0d1af797304d9bf5c27cc0ce8c01064e9d78c Mon Sep 17 00:00:00 2001
From: cebtenzzre <cebtenzzre@gmail.com>
Date: Mon, 23 Oct 2023 11:46:26 -0400
Subject: [PATCH 047/140] kompute : make scripts executable

---
 kompute/scripts/convert_shaders.py | 1 +
 undump.py                          | 1 +
 2 files changed, 2 insertions(+)
 mode change 100644 => 100755 kompute/scripts/convert_shaders.py
 mode change 100644 => 100755 undump.py

diff --git a/kompute/scripts/convert_shaders.py b/kompute/scripts/convert_shaders.py
old mode 100644
new mode 100755
index 9375b6701461e..11a3ab974d6a6
--- a/kompute/scripts/convert_shaders.py
+++ b/kompute/scripts/convert_shaders.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
     Script to handle conversion of compute shaders to spirv and to headers
 """
diff --git a/undump.py b/undump.py
old mode 100644
new mode 100755
index db19ffe695dab..c3d8993be66c8
--- a/undump.py
+++ b/undump.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import struct
 import numpy as np
 from pathlib import Path

From 8400015337705461ecfae335683d265015a4a613 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Thu, 26 Oct 2023 13:00:53 -0400
Subject: [PATCH 048/140] Don't try an allocation on a heap that is smaller
 than the size we require.

---
 ggml-vulkan.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 08042330fde9b..265933832fd48 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -364,6 +364,12 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v
     bool memoryTypeIndexFound = false;
     vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties();
     for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
+        const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i];
+        const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex];
+        if (memoryHeap.size < size) {
+            continue;
+        }
+
         if (requirements.memoryTypeBits & (1 << i)) {
             if (((memoryProperties.memoryTypes[i]).propertyFlags &
                  flags) == flags) {

From 752f7ebd61510a24000704cec7332c842d935588 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Thu, 26 Oct 2023 13:01:40 -0400
Subject: [PATCH 049/140] Remove unused push constant that was giving
 validation errors.

---
 kompute/op_mul.comp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp
index 348eae7b363c5..31849b941b48a 100644
--- a/kompute/op_mul.comp
+++ b/kompute/op_mul.comp
@@ -20,7 +20,6 @@ layout(push_constant) uniform PushConstants {
     uint inAOff;
     uint inBOff;
     uint outOff;
-    uint row;
 } pcs;
 
 void main() {

From 8d9efbf97a0bcfd9fa60a2279a8a45866ce932c8 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Thu, 26 Oct 2023 11:48:36 -0400
Subject: [PATCH 050/140] Lower the workgroup count for some shaders by
 providing a loop that processes four floats at a time.

---
 ggml-vulkan.cpp      | 16 ++++++++--------
 kompute/op_add.comp  |  9 ++++++---
 kompute/op_gelu.comp |  9 ++++++---
 kompute/op_mul.comp  |  7 +++++--
 kompute/op_relu.comp |  7 +++++--
 kompute/op_silu.comp | 10 +++++++---
 6 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 265933832fd48..b70b7ac45dcb1 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1358,7 +1358,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             // src1 is a row
                             ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00);
                         } else {
-                            ggml_vk_add(seq, id_src0, id_src1, id_dst,  off_src0, off_src1, off_dst, ggml_nelements(dst));
+                            ggml_vk_add(seq, id_src0, id_src1, id_dst,  off_src0, off_src1, off_dst, ggml_nelements(dst)/4);
                         }
                     } break;
                 case GGML_OP_MUL:
@@ -1367,7 +1367,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             // src1 is a row
                             ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00);
                         } else {
-                            ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst));
+                            ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4);
                         }
                     } break;
                 case GGML_OP_SCALE:
@@ -1379,15 +1379,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     switch (ggml_get_unary_op(gf->nodes[i])) {
                         case GGML_UNARY_OP_SILU:
                             {
-                                ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                                ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4);
                             } break;
                         case GGML_UNARY_OP_RELU:
                             {
-                                ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                                ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4);
                             } break;
                         case GGML_UNARY_OP_GELU:
                             {
-                                ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                                ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4);
                             } break;
                         default:
                             {
@@ -1427,9 +1427,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             ggml_is_transposed(src1)) {
                             fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                             goto not_implemented;
-                        } 
+                        }
 
-                        switch (src0t) {        
+                        switch (src0t) {
                             case GGML_TYPE_F32:
                                 ggml_vk_mul_mat_mat_f32(seq,
                                         id_src0, id_src1, id_dst,
@@ -1459,7 +1459,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                                 goto not_implemented;
                             }
                         }
-                        
+
                     } break;
                 case GGML_OP_GET_ROWS:
                     {
diff --git a/kompute/op_add.comp b/kompute/op_add.comp
index f242864ddf933..314116aac49a9 100644
--- a/kompute/op_add.comp
+++ b/kompute/op_add.comp
@@ -23,7 +23,10 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint i = gl_WorkGroupID.x;
+    const uint baseIndex = gl_WorkGroupID.x * 4;
 
-    out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i) + pcs.inBOff];
-}
\ No newline at end of file
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[i + pcs.inBOff];
+    }
+}
diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp
index c9f8ce3cf2012..f74a14f7e8fb3 100644
--- a/kompute/op_gelu.comp
+++ b/kompute/op_gelu.comp
@@ -20,8 +20,11 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint i = gl_WorkGroupID.x;
-    const float x = in_[i + pcs.inOff];
+    const uint baseIndex = gl_WorkGroupID.x * 4;
 
-    out_[i + pcs.outOff] = 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        const float y = in_[i + pcs.inOff];
+        out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y)));
+    }
 }
diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp
index 31849b941b48a..662ea8177f7da 100644
--- a/kompute/op_mul.comp
+++ b/kompute/op_mul.comp
@@ -23,7 +23,10 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint i = gl_WorkGroupID.x;
+    const uint baseIndex = gl_WorkGroupID.x * 4;
 
-    out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i) + pcs.inBOff];
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i) + pcs.inBOff];
+    }
 }
\ No newline at end of file
diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp
index 41f46be961a87..c6ed044a38831 100644
--- a/kompute/op_relu.comp
+++ b/kompute/op_relu.comp
@@ -20,7 +20,10 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint i = gl_WorkGroupID.x;
+    const uint baseIndex = gl_WorkGroupID.x * 4;
 
-    out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
+    }
 }
diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp
index c5acac281902a..8c7bfe321b4fb 100644
--- a/kompute/op_silu.comp
+++ b/kompute/op_silu.comp
@@ -19,8 +19,12 @@ layout(push_constant) uniform PushConstants {
     uint outOff;
 } pcs;
 void main() {
-    const uint i = gl_WorkGroupID.x;
-    const float x = in_[i + pcs.inOff];
 
-    out_[i + pcs.outOff] = x / (1.0 + exp(-x));
+    const uint baseIndex = gl_WorkGroupID.x * 4;
+
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        const float y = in_[i + pcs.inOff];
+        out_[i + pcs.outOff] = y / (1.0 + exp(-y));
+    }
 }

From 74ddf0f17da1daf83de6aaf4ef22274068dcd72f Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Fri, 27 Oct 2023 12:05:24 -0400
Subject: [PATCH 051/140] Fix synchronization problem for AMD Radeon with
 amdvlk driver or windows drivers. Does not have any performance or fidelity
 effect on other gpu/driver combos I've tested.

FIXES: https://github.com/nomic-ai/gpt4all/issues/1507
---
 kompute/src/OpAlgoDispatch.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp
index cad334f0c5d21..dc39cdc3fd0b1 100644
--- a/kompute/src/OpAlgoDispatch.cpp
+++ b/kompute/src/OpAlgoDispatch.cpp
@@ -32,9 +32,9 @@ OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer)
          this->mAlgorithm->getTensors()) {
         tensor->recordPrimaryBufferMemoryBarrier(
           commandBuffer,
-          vk::AccessFlagBits::eTransferWrite,
+          vk::AccessFlagBits::eShaderWrite,
           vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eTransfer,
+          vk::PipelineStageFlagBits::eComputeShader,
           vk::PipelineStageFlagBits::eComputeShader);
     }
 

From 1c1701018861810d3db0c746df12a00915d4a6dc Mon Sep 17 00:00:00 2001
From: cebtenzzre <cebtenzzre@gmail.com>
Date: Mon, 23 Oct 2023 12:22:27 -0400
Subject: [PATCH 052/140] vulkan : fix missing break in matmul selection (#9)

---
 ggml-vulkan.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index b70b7ac45dcb1..4747850cfa77d 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1439,6 +1439,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                                         ne11, ne12,
                                         nb11, nb12,
                                         nb1, nb2);
+                                break;
                             case GGML_TYPE_F16:
                                 ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
                                 break;

From 89b71278ff2543658a366fc3259802f1183e8aab Mon Sep 17 00:00:00 2001
From: cebtenzzre <cebtenzzre@gmail.com>
Date: Fri, 27 Oct 2023 19:04:26 -0400
Subject: [PATCH 053/140] llama : decide to disable Vulkan before loading
 tensors (#7)

---
 llama.cpp | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 3afbebe2ab320..cb0a1227abdfe 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2407,7 +2407,7 @@ static bool llama_model_load(
         llama_model & model,
         int n_ctx,
         int n_batch,
-        int n_gpu_layers,
+        int * n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
         const bool mul_mat_q,
@@ -2438,8 +2438,23 @@ static bool llama_model_load(
             return true;
         }
 
+#ifdef GGML_USE_KOMPUTE
+        if (ggml_vk_has_device() && *n_gpu_layers > 0 && (
+            !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+            || !(
+                model.ftype == LLAMA_FTYPE_ALL_F32 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+            )
+        )) {
+            // disable Vulkan due to unsupported model architecture or quantization type
+            *n_gpu_layers = 0;
+        }
+#endif
+
         llm_load_tensors(
-                *ml, model, n_batch, n_gpu_layers,
+                *ml, model, n_batch, *n_gpu_layers,
                 main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
                 use_mlock, progress_callback, progress_callback_user_data);
     } catch (const std::exception & err) {
@@ -6354,7 +6369,7 @@ struct llama_model * llama_load_model_from_file(
         };
     }
 
-    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, &params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
                 params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
@@ -6502,12 +6517,7 @@ struct llama_context * llama_new_context_with_model(
 #undef LLAMA_METAL_CHECK_BUF
         }
 #elif defined(GGML_USE_KOMPUTE)
-    if (ggml_vk_has_device() && params.n_gpu_layers > 0
-        && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
-        && (model->ftype == LLAMA_FTYPE_ALL_F32
-            || model->ftype == LLAMA_FTYPE_MOSTLY_F16
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
+    if (ggml_vk_has_device() && params.n_gpu_layers > 0) {
         // this allocates all Vulkan resources and memory buffers
         ctx->ctx_kompute = ggml_vk_init();
 

From e006d377dd32cce14ecf2f272305b16b516db906 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Fri, 27 Oct 2023 18:32:29 -0400
Subject: [PATCH 054/140] Scale the workgroup count down to allow correct
 generation for falcon with AMD radeon cards with lower workgroup count limit

Partially fixes #1581
---
 ggml-vulkan.cpp           | 8 ++++----
 kompute/op_addrow.comp    | 9 ++++++---
 kompute/op_gelu.comp      | 4 ++--
 kompute/op_mulrow.comp    | 7 +++++--
 kompute/op_scale.comp     | 7 +++++--
 kompute/op_silu.comp      | 2 +-
 kompute/src/Algorithm.cpp | 4 ++++
 7 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 4747850cfa77d..239f913f59283 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1356,7 +1356,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     {
                         if (ggml_nelements(src1) == ne10) {
                             // src1 is a row
-                            ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00);
+                            ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
                         } else {
                             ggml_vk_add(seq, id_src0, id_src1, id_dst,  off_src0, off_src1, off_dst, ggml_nelements(dst)/4);
                         }
@@ -1365,7 +1365,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     {
                         if (ggml_nelements(src1) == ne10) {
                             // src1 is a row
-                            ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00);
+                            ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
                         } else {
                             ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4);
                         }
@@ -1373,7 +1373,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 case GGML_OP_SCALE:
                     {
                         const float scale = *(const float *) src1->data;
-                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
+                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/8, scale);
                     } break;
                 case GGML_OP_UNARY:
                     switch (ggml_get_unary_op(gf->nodes[i])) {
@@ -1387,7 +1387,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             } break;
                         case GGML_UNARY_OP_GELU:
                             {
-                                ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4);
+                                ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/8);
                             } break;
                         default:
                             {
diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp
index 926c929e4253a..bf674f8296ccf 100644
--- a/kompute/op_addrow.comp
+++ b/kompute/op_addrow.comp
@@ -24,7 +24,10 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint i = gl_WorkGroupID.x;
+    const uint baseIndex = gl_WorkGroupID.x * 4;
 
-    out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
-}
\ No newline at end of file
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
+    }
+}
diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp
index f74a14f7e8fb3..1412ee1abe1bf 100644
--- a/kompute/op_gelu.comp
+++ b/kompute/op_gelu.comp
@@ -20,9 +20,9 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint baseIndex = gl_WorkGroupID.x * 4;
+    const uint baseIndex = gl_WorkGroupID.x * 8;
 
-    for (uint x = 0; x < 4; x++) {
+    for (uint x = 0; x < 8; x++) {
         const uint i = baseIndex + x;
         const float y = in_[i + pcs.inOff];
         out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y)));
diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp
index 498dbdfcd6af5..955fe26bf0dc6 100644
--- a/kompute/op_mulrow.comp
+++ b/kompute/op_mulrow.comp
@@ -24,7 +24,10 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint i = gl_WorkGroupID.x;
+    const uint baseIndex = gl_WorkGroupID.x * 4;
 
-    out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i % pcs.row) + pcs.inBOff];
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i % pcs.row) + pcs.inBOff];
+    }
 }
\ No newline at end of file
diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp
index 8530aaf3e6999..2ec5244352179 100644
--- a/kompute/op_scale.comp
+++ b/kompute/op_scale.comp
@@ -22,7 +22,10 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint i = gl_WorkGroupID.x;
+    const uint baseIndex = gl_WorkGroupID.x * 8;
 
-    out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
+    for (uint x = 0; x < 8; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
+    }
 }
\ No newline at end of file
diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp
index 8c7bfe321b4fb..9233fd5a1fc30 100644
--- a/kompute/op_silu.comp
+++ b/kompute/op_silu.comp
@@ -18,8 +18,8 @@ layout(push_constant) uniform PushConstants {
     uint inOff;
     uint outOff;
 } pcs;
-void main() {
 
+void main() {
     const uint baseIndex = gl_WorkGroupID.x * 4;
 
     for (uint x = 0; x < 4; x++) {
diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp
index ea81fd97b1a6f..f8f1c7e363729 100644
--- a/kompute/src/Algorithm.cpp
+++ b/kompute/src/Algorithm.cpp
@@ -387,6 +387,10 @@ Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer)
 void
 Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize)
 {
+    if (workgroup[0] > 65535) {
+        fprintf(stderr, "workgroup size is %d\n", workgroup[0]);
+        fflush(stderr);
+    }
 
     KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size");
 

From a5eb001eab32554ea73f1027c323473699ea68aa Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Fri, 27 Oct 2023 18:32:51 -0400
Subject: [PATCH 055/140] Revert the prompt processing on gpu for now.

Fixes issues #1580 and #1581
---
 llama.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index cb0a1227abdfe..a196b428ff7d7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3486,7 +3486,7 @@ static struct ggml_cgraph * llm_build_falcon(
     ggml_build_forward_expand(gf, cur);
 
     ggml_free(ctx0);
- 
+
 #if defined(GGML_USE_KOMPUTE)
     if (lctx.ctx_kompute) {
         if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
@@ -3870,11 +3870,19 @@ static bool llama_eval_internal(
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #elif defined(GGML_USE_KOMPUTE)
-    if (lctx.ctx_kompute) {
+    if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, gf);
         ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
     } else {
+        if (lctx.ctx_kompute) {
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
+        }
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+        if (lctx.ctx_kompute) {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v);
+        }
     }
 #else
     ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);

From ffd0624be2d9e2c908c1fe9d21feb2a0b2f59ae2 Mon Sep 17 00:00:00 2001
From: Adam Treat <treat.adam@gmail.com>
Date: Mon, 30 Oct 2023 11:38:21 -0400
Subject: [PATCH 056/140] Remove this debug code.

---
 kompute/src/Algorithm.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp
index f8f1c7e363729..0378591bd576b 100644
--- a/kompute/src/Algorithm.cpp
+++ b/kompute/src/Algorithm.cpp
@@ -387,11 +387,6 @@ Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer)
 void
 Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize)
 {
-    if (workgroup[0] > 65535) {
-        fprintf(stderr, "workgroup size is %d\n", workgroup[0]);
-        fflush(stderr);
-    }
-
     KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size");
 
     // The dispatch size is set up based on either explicitly provided template

From f88b19888514a1f2d4f3f0b854cb59dda674c081 Mon Sep 17 00:00:00 2001
From: cebtenzzre <cebtenzzre@gmail.com>
Date: Wed, 1 Nov 2023 09:46:15 -0400
Subject: [PATCH 057/140] llama : fix Vulkan whitelist (#11)

---
 llama.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index a196b428ff7d7..5fc93bd2e1781 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6352,9 +6352,11 @@ int64_t llama_time_us(void) {
     return ggml_time_us();
 }
 
-struct llama_model * llama_load_model_from_file(
-                             const char * path_model,
-            struct llama_context_params   params) {
+static struct llama_model * llama_load_model_from_file_internal(
+    const char * path_model, struct llama_context_params * params_p
+) {
+    auto & params = *params_p;
+
     ggml_time_init();
 
     llama_model * model = new llama_model;
@@ -6389,6 +6391,10 @@ struct llama_model * llama_load_model_from_file(
     return model;
 }
 
+struct llama_model * llama_load_model_from_file(const char * path_model, struct llama_context_params params) {
+    return llama_load_model_from_file_internal(path_model, &params);
+}
+
 void llama_free_model(struct llama_model * model) {
     delete model;
 }
@@ -6559,7 +6565,7 @@ struct llama_context * llama_new_context_with_model(
 static struct llama_context * llama_init_from_file(
                              const char * path_model,
             struct llama_context_params   params) {
-    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    struct llama_model * model = llama_load_model_from_file_internal(path_model, &params);
     if (!model) {
         return nullptr;
     }

From a8cac53207ceeeb28a63bb0e141cb75fa6db4028 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 6 Nov 2023 17:24:14 -0500
Subject: [PATCH 058/140] kompute : fix issues with debug layers

---
 kompute/src/Manager.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
index 2a3ad2cc9ee45..2d2370f63ee4e 100644
--- a/kompute/src/Manager.cpp
+++ b/kompute/src/Manager.cpp
@@ -180,6 +180,16 @@ Manager::createInstance()
           applicationExtensions.data();
     }
 
+    try {
+        mDynamicLoader = std::make_shared<vk::DynamicLoader>();
+    } catch (const std::exception & err) {
+        return;
+    }
+
+    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
+      mDynamicLoader->getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
+
 #ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
     KP_LOG_DEBUG("Kompute Manager adding debug validation layers");
     // We'll identify the layers that are supported
@@ -234,16 +244,6 @@ Manager::createInstance()
     }
 #endif
 
-    try {
-        mDynamicLoader = std::make_shared<vk::DynamicLoader>();
-    } catch (const std::exception & err) {
-        return;
-    }
-
-    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
-      mDynamicLoader->getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
-    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
-
     this->mInstance = std::make_shared<vk::Instance>();
     vk::Result r = vk::createInstance(
       &computeInstanceCreateInfo, nullptr, this->mInstance.get());
@@ -270,7 +270,7 @@ Manager::createInstance()
           (PFN_vkDebugReportCallbackEXT)debugMessageCallback;
         debugCreateInfo.flags = debugFlags;
 
-        this->mDebugDispatcher.init(*this->mInstance, &vkGetInstanceProcAddr);
+        this->mDebugDispatcher.init(*this->mInstance, vkGetInstanceProcAddr);
         this->mDebugReportCallback =
           this->mInstance->createDebugReportCallbackEXT(
             debugCreateInfo, nullptr, this->mDebugDispatcher);

From c438c168969fa1c5f9dc362d9bca2fa42444766e Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 6 Nov 2023 21:08:48 -0500
Subject: [PATCH 059/140] fix build with external fmtlib (v10)

Co-authored-by: ToKiNoBug <tokinobug@163.com>
---
 kompute/src/Manager.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
index 2d2370f63ee4e..c5060b1ead35a 100644
--- a/kompute/src/Manager.cpp
+++ b/kompute/src/Manager.cpp
@@ -349,7 +349,7 @@ Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
 
     KP_LOG_INFO("Using physical device index {} found {}",
                 physicalDeviceIndex,
-                physicalDeviceProperties.deviceName);
+                physicalDeviceProperties.deviceName.data());
 
     if (familyQueueIndices.empty()) {
         // Find compute queue

From 71565eb0c3f2b26b17685ce184bb78a47d89cc15 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 23 Nov 2023 17:18:27 -0500
Subject: [PATCH 060/140] vulkan : replace ggml_diag_mask_inf with ggml_add
 (custom -inf mask)

---
 ggml-vulkan.cpp     | 59 +++++++++++++++++++++++++++++++++------------
 kompute/op_add.comp | 44 +++++++++++++++++++++++++++++----
 2 files changed, 82 insertions(+), 21 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 239f913f59283..01d70d1a6baa9 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -579,29 +579,48 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
     return a / b;
 }
 
-void ggml_vk_add(kp::Sequence& seq,
-                    const std::shared_ptr<kp::Tensor>& inA,
-                    const std::shared_ptr<kp::Tensor>& inB,
-                    const std::shared_ptr<kp::Tensor>& out,
-                    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                    uint32_t size) {
+void ggml_vk_add(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
+    int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
+    int32_t ne0,
+    int32_t nb0,  int32_t nb1,  int32_t nb2,  int32_t nb3
+) {
 
     const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
         kp::shader_data::op_add_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
+        int32_t ne00;
+        int32_t nb00, nb01, nb02, nb03;
+        int32_t ne10, ne11, ne12, ne13;
+        int32_t nb10, nb11, nb12, nb13;
+        int32_t ne0;
+        int32_t nb0, nb1, nb2, nb3;
     } const pushConsts {
-        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4)
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00,
+        nb00, nb01, nb02, nb03,
+        ne10, ne11, ne12, ne13,
+        nb10, nb11, nb12, nb13,
+        ne0,
+        nb0, nb1, nb2, nb3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({size});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1315,12 +1334,12 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
             const int32_t ne10 = src1 ? src1->ne[0] : 0;
             const int32_t ne11 = src1 ? src1->ne[1] : 0;
             const int32_t ne12 = src1 ? src1->ne[2] : 0;
-//            const int32_t ne13 = src1 ? src1->ne[3] : 0;
+            const int32_t ne13 = src1 ? src1->ne[3] : 0;
 
-//            const uint32_t nb10 = src1 ? src1->nb[0] : 0;
+            const uint32_t nb10 = src1 ? src1->nb[0] : 0;
             const uint32_t nb11 = src1 ? src1->nb[1] : 0;
             const uint32_t nb12 = src1 ? src1->nb[2] : 0;
-//            const uint32_t nb13 = src1 ? src1->nb[3] : 0;
+            const uint32_t nb13 = src1 ? src1->nb[3] : 0;
 
             const int32_t ne0 = dst ? dst->ne[0] : 0;
             const int32_t ne1 = dst ? dst->ne[1] : 0;
@@ -1354,11 +1373,19 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_ADD:
                     {
-                        if (ggml_nelements(src1) == ne10) {
+                        if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
                             // src1 is a row
                             ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
                         } else {
-                            ggml_vk_add(seq, id_src0, id_src1, id_dst,  off_src0, off_src1, off_dst, ggml_nelements(dst)/4);
+                            ggml_vk_add(
+                                seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                ne00, ne01, ne02, ne03,
+                                nb00, nb01, nb02, nb03,
+                                ne10, ne11, ne12, ne13,
+                                nb10, nb11, nb12, nb13,
+                                ne0,
+                                nb0, nb1, nb2, nb3
+                            );
                         }
                     } break;
                 case GGML_OP_MUL:
diff --git a/kompute/op_add.comp b/kompute/op_add.comp
index 314116aac49a9..df3fdc59cdc8e 100644
--- a/kompute/op_add.comp
+++ b/kompute/op_add.comp
@@ -10,7 +10,7 @@
 
 #include "common.comp"
 
-layout(local_size_x = 1) in;
+layout(local_size_x = 1024) in;
 
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
@@ -20,13 +20,47 @@ layout(push_constant) uniform PushConstants {
     uint inAOff;
     uint inBOff;
     uint outOff;
+    int ne00;
+    int nb00;
+    int nb01;
+    int nb02;
+    int nb03;
+    int ne10;
+    int ne11;
+    int ne12;
+    int ne13;
+    int nb10;
+    int nb11;
+    int nb12;
+    int nb13;
+    int ne0;
+    int nb0;
+    int nb1;
+    int nb2;
+    int nb3;
 } pcs;
 
+// general-purpose kernel for addition of two tensors
+// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
+// cons: not very efficient
 void main() {
-    const uint baseIndex = gl_WorkGroupID.x * 4;
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
 
-    for (uint x = 0; x < 4; x++) {
-        const uint i = baseIndex + x;
-        out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[i + pcs.inBOff];
+    const uint i13 = i03 % pcs.ne13;
+    const uint i12 = i02 % pcs.ne12;
+    const uint i11 = i01 % pcs.ne11;
+
+    uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + gl_SubgroupInvocationID.x*pcs.nb00) / 4);
+    uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 + gl_SubgroupInvocationID.x*pcs.nb10) / 4);
+    uint dst_off  = uint((i03*pcs.nb3  + i02*pcs.nb2  + i01*pcs.nb1  + gl_SubgroupInvocationID.x*pcs.nb0 ) / 4);
+
+    for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
+        out_[pcs.outOff + dst_off] = inA[pcs.inAOff + src0_off] + inB[pcs.inBOff + src1_off];
+
+        src0_off += gl_WorkGroupSize.x*pcs.ne00;
+        src1_off += gl_WorkGroupSize.x*pcs.ne10;
+        dst_off  += gl_WorkGroupSize.x*pcs.ne0;
     }
 }

From 84f7fc4553775c1d1e8401750ce3369ec1ed70ee Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 23 Nov 2023 17:18:42 -0500
Subject: [PATCH 061/140] vulkan : rope n_past is now KQ_pos, f16 rope kernel

---
 CMakeLists.txt                             |  6 +-
 ggml-vulkan.cpp                            | 84 ++++++++++++--------
 kompute/op_rope_f16.comp                   | 89 ++++++++++++++++++++++
 kompute/{op_rope.comp => op_rope_f32.comp} | 23 +++---
 llama.cpp                                  | 16 +++-
 5 files changed, 170 insertions(+), 48 deletions(-)
 create mode 100644 kompute/op_rope_f16.comp
 rename kompute/{op_rope.comp => op_rope_f32.comp} (78%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d26aedaf3e47b..aa453b6b2f2ed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -490,7 +490,8 @@ if (LLAMA_KOMPUTE)
           kompute/op_getrows_q4_0.comp
           kompute/op_getrows_q4_1.comp
           kompute/op_getrows_q6_k.comp
-          kompute/op_rope.comp
+          kompute/op_rope_f16.comp
+          kompute/op_rope_f32.comp
           kompute/op_cpy_f16_f16.comp
           kompute/op_cpy_f16_f32.comp
           kompute/op_cpy_f32_f16.comp
@@ -521,7 +522,8 @@ if (LLAMA_KOMPUTE)
           shaderop_getrows_q4_0.h
           shaderop_getrows_q4_1.h
           shaderop_getrows_q6_k.h
-          shaderop_rope.h
+          shaderop_rope_f16.h
+          shaderop_rope_f32.h
           shaderop_cpy_f16_f16.h
           shaderop_cpy_f16_f32.h
           shaderop_cpy_f32_f16.h
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 01d70d1a6baa9..3e3f6cc8099fb 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -32,7 +32,8 @@
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
 #include "shaderop_getrows_q6_k.h"
-#include "shaderop_rope.h"
+#include "shaderop_rope_f16.h"
+#include "shaderop_rope_f32.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
 #include "shaderop_cpy_f32_f16.h"
@@ -1175,51 +1176,66 @@ void ggml_vk_get_rows_q6_k(Args&&... args) {
     ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
 }
 
-void ggml_vk_rope(kp::Sequence& seq,
-                  const std::shared_ptr<kp::Tensor>& in,
-                  const std::shared_ptr<kp::Tensor>& out,
-                  uint32_t inOff, uint32_t outOff,
-                  uint32_t n_past, int32_t n_dims, int32_t mode,
-                  float freq_base, float freq_scale,
-                  int32_t ne01, int32_t ne02, int32_t ne03,
-                  uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
-                  int32_t ne0,
-                  uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) {
-    const static auto spirv = getSpirvShader(kp::shader_data::op_rope_comp_spv,
-        kp::shader_data::op_rope_comp_spv_len);
-
-    GGML_ASSERT(nb03%sizeof(float) == 0);
-    GGML_ASSERT(nb02%sizeof(float) == 0);
-    GGML_ASSERT(nb01%sizeof(float) == 0);
-    GGML_ASSERT(nb00%sizeof(float) == 0);
-    GGML_ASSERT(nb3%sizeof(float) == 0);
-    GGML_ASSERT(nb2%sizeof(float) == 0);
-    GGML_ASSERT(nb1%sizeof(float) == 0);
-    GGML_ASSERT(nb0%sizeof(float) == 0);
+void ggml_vk_rope(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    ggml_type src0t, int32_t n_dims, int32_t mode,
+    float freq_base, float freq_scale,
+    int32_t ne01, int32_t ne02, int32_t ne03,
+    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    int32_t ne0,
+    uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
+) {
+    GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
+
+    static const auto spirv_f16 = getSpirvShader(
+        kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
+    );
+    static const auto spirv_f32 = getSpirvShader(
+        kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
+    );
+
+    int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
+
+    GGML_ASSERT(nb03 % type_size == 0);
+    GGML_ASSERT(nb02 % type_size == 0);
+    GGML_ASSERT(nb01 % type_size == 0);
+    GGML_ASSERT(nb00 % type_size == 0);
+    GGML_ASSERT(nb3  % type_size == 0);
+    GGML_ASSERT(nb2  % type_size == 0);
+    GGML_ASSERT(nb1  % type_size == 0);
+    GGML_ASSERT(nb0  % type_size == 0);
 
     struct PushConstants {
-        uint32_t inOff, outOff;
-        uint32_t n_past;
+        uint32_t inAOff, inBOff, outOff;
         int32_t n_dims, mode;
         float freq_base, freq_scale;
         uint32_t nb00, nb01, nb02, nb03;
         int32_t ne0;
         uint32_t nb0, nb1, nb2, nb3;
     } pushConsts {
-        safe_divide(inOff, 4), safe_divide(outOff, 4),
-        n_past, n_dims, mode,
+        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
+        n_dims, mode,
         freq_base, freq_scale,
         nb00, nb01, nb02, nb03,
         ne0,
         nb0, nb1, nb2, nb3
     };
 
+    auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
-    else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
-        s_algo->setTensors({in, out});
+    if (!komputeManager()->hasAlgorithm(name)) {
+        s_algo = komputeManager()->algorithm<float, PushConstants>(
+            name, s_kompute_context->pool.get(), {inA, inB, out},
+            src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
+            {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
+        );
+    } else {
+        s_algo = komputeManager()->getAlgorithm(name);
+        s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
@@ -1506,14 +1522,16 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_ROPE:
                     {
-                        const int n_past = ((int32_t *) dst->op_params)[0];
+                        GGML_ASSERT(ne10 == ne02);
+                        GGML_ASSERT(src0t == dstt);
+                        // const int n_past = ((int32_t *) dst->op_params)[0];
                         const int n_dims = ((int32_t *) dst->op_params)[1];
                         const int mode   = ((int32_t *) dst->op_params)[2];
                         float freq_base;
                         float freq_scale;
                         memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
                         memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-                        ggml_vk_rope(seq, id_src0, id_dst, off_src0, off_dst, n_past, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3);
+                        ggml_vk_rope(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3);
                     } break;
                 case GGML_OP_DUP:
                 case GGML_OP_CPY:
diff --git a/kompute/op_rope_f16.comp b/kompute/op_rope_f16.comp
new file mode 100644
index 0000000000000..fd3943c8108c8
--- /dev/null
+++ b/kompute/op_rope_f16.comp
@@ -0,0 +1,89 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+// TODO: use a local size of 32 or more (Metal uses 1024)
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int n_dims;
+    int mode;
+    float freq_base;
+    float freq_scale;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    const bool is_neox = (pcs.mode & 2) != 0;
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    const int p = inB[pcs.inBOff + i2];
+
+    float theta = pcs.freq_scale * float(p);
+
+    if (!is_neox) {
+        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
+            const float cos_theta = cos(theta);
+            const float sin_theta = sin(theta);
+
+            theta *= theta_scale;
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+1]);
+
+            out_[dst_data]   = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
+        }
+    } else {
+        const float inv_ndims = -1.f/pcs.n_dims;
+        for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) {
+            for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
+                const float cos_theta = cos(theta);
+                const float sin_theta = sin(theta);
+
+                theta *= theta_scale;
+
+                const uint i0 = ib*pcs.n_dims + ic/2;
+
+                const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+                const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+                const float x0 = float(inA[src]);
+                const float x1 = float(inA[src+pcs.n_dims/2]);
+
+                out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
+                out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
+            }
+        }
+    }
+}
diff --git a/kompute/op_rope.comp b/kompute/op_rope_f32.comp
similarity index 78%
rename from kompute/op_rope.comp
rename to kompute/op_rope_f32.comp
index 8c28546369b26..6024c3e5e64d5 100644
--- a/kompute/op_rope.comp
+++ b/kompute/op_rope_f32.comp
@@ -12,13 +12,14 @@
 
 layout(local_size_x = 1) in;
 
-layout (binding = 0) readonly buffer tensorIn { float in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { float out_[]; };
+layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
 
 layout (push_constant) uniform parameter {
-    uint inOff;
+    uint inAOff;
+    uint inBOff;
     uint outOff;
-    uint n_past;
     int n_dims;
     int mode;
     float freq_base;
@@ -42,7 +43,7 @@ void main() {
     const bool is_neox = (pcs.mode & 2) != 0;
     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
 
-    const uint p = ((pcs.mode & 1) == 0 ? pcs.n_past + i2 : i2);
+    const int p = inB[pcs.inBOff + i2];
 
     float theta = pcs.freq_scale * float(p);
 
@@ -53,11 +54,11 @@ void main() {
 
             theta *= theta_scale;
 
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in
+            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
             const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
 
-            const float x0 = in_[src];
-            const float x1 = in_[src+1];
+            const float x0 = inA[src];
+            const float x1 = inA[src+1];
 
             out_[dst_data] = x0*cos_theta - x1*sin_theta;
             out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
@@ -73,11 +74,11 @@ void main() {
 
                 const uint i0 = ib*pcs.n_dims + ic/2;
 
-                const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in
+                const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
                 const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
 
-                const float x0 = in_[src];
-                const float x1 = in_[src+pcs.n_dims/2];
+                const float x0 = inA[src];
+                const float x1 = inA[src+pcs.n_dims/2];
 
                 out_[dst_data] = x0*cos_theta - x1*sin_theta;
                 out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
diff --git a/llama.cpp b/llama.cpp
index a56ffce9f6e35..8455424b4cd46 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2772,8 +2772,9 @@ static struct ggml_cgraph * llm_build_llama(
     }
 
     // shift the entire K-cache if needed
+    struct ggml_tensor * K_shift = nullptr;
     if (do_rope_shift) {
-        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
         offload_func_kq(K_shift);
         ggml_set_name(K_shift, "K_shift");
         ggml_allocr_alloc(lctx.alloc, K_shift);
@@ -3024,6 +3025,11 @@ static struct ggml_cgraph * llm_build_llama(
             ggml_vk_h2d_all(lctx.ctx_kompute);
         } else {
             ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_pos);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_mask);
+            if (K_shift) {
+                ggml_vk_h2d_tensor(lctx.ctx_kompute, K_shift);
+            }
         }
     }
 #endif
@@ -3589,8 +3595,9 @@ static struct ggml_cgraph * llm_build_falcon(
     }
 
     // shift the entire K-cache if needed
+    struct ggml_tensor * K_shift = nullptr;
     if (do_rope_shift) {
-        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
         offload_func_kq(K_shift);
         ggml_set_name(K_shift, "K_shift");
         ggml_allocr_alloc(lctx.alloc, K_shift);
@@ -3820,6 +3827,11 @@ static struct ggml_cgraph * llm_build_falcon(
             ggml_vk_h2d_all(lctx.ctx_kompute);
         } else {
             ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_pos);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_mask);
+            if (K_shift) {
+                ggml_vk_h2d_tensor(lctx.ctx_kompute, K_shift);
+            }
         }
     }
 #endif

From 39abedd1d75b83cc9ff6f5c951d2e4f63d840bdf Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 23 Nov 2023 17:18:48 -0500
Subject: [PATCH 062/140] vulkan : optimize workgroup sizes

---
 ggml-vulkan.cpp             |  4 ++--
 kompute/op_cpy_f16_f16.comp |  5 ++---
 kompute/op_cpy_f16_f32.comp |  5 ++---
 kompute/op_cpy_f32_f16.comp |  5 ++---
 kompute/op_cpy_f32_f32.comp |  5 ++---
 kompute/op_norm.comp        | 18 ++++++++----------
 kompute/op_rmsnorm.comp     | 12 +++++-------
 kompute/op_rope_f32.comp    |  1 +
 8 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 3e3f6cc8099fb..74d9fceb6c390 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -847,9 +847,9 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
+    if (!komputeManager()->hasAlgorithm(__func__)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
-    else {
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({(uint32_t)nrows});
diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp
index 5f425ae28798c..652db031368e6 100644
--- a/kompute/op_cpy_f16_f16.comp
+++ b/kompute/op_cpy_f16_f16.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp
index 4298bebdd729c..aa204248c1f49 100644
--- a/kompute/op_cpy_f16_f32.comp
+++ b/kompute/op_cpy_f16_f32.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp
index 2d763edfd3d43..4fdab483108b8 100644
--- a/kompute/op_cpy_f32_f16.comp
+++ b/kompute/op_cpy_f32_f16.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp
index 4e5b1d39303fd..2fc998492b7f8 100644
--- a/kompute/op_cpy_f32_f32.comp
+++ b/kompute/op_cpy_f32_f32.comp
@@ -2,13 +2,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -46,7 +45,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp
index 5aafeaac53aaf..1d685cf362378 100644
--- a/kompute/op_norm.comp
+++ b/kompute/op_norm.comp
@@ -10,9 +10,7 @@
 
 #include "common.comp"
 
-#define nth 256
-
-layout(local_size_x = nth) in;
+layout(local_size_x = 256) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
     float eps;
 } pcs;
 
-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];
 
 void main() {
     const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
     // MEAN
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += in_[x+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -57,21 +55,21 @@ void main() {
 
     // recenter
     const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] = in_[x+i00] - mean;
     }
 
     // VARIANCE
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -88,7 +86,7 @@ void main() {
     const float variance = sum[0];
 
     const float scale = 1.0f/sqrt(variance + pcs.eps);
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] *= scale;
     }
 }
diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp
index 8d6c0fa6a5e48..5ebaf22691376 100644
--- a/kompute/op_rmsnorm.comp
+++ b/kompute/op_rmsnorm.comp
@@ -10,9 +10,7 @@
 
 #include "common.comp"
 
-#define nth 512
-
-layout(local_size_x = nth) in;
+layout(local_size_x = 512) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
     float eps;
 } pcs;
 
-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];
 
 void main() {
     const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
 
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -57,7 +55,7 @@ void main() {
     const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
 
     const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] = in_[x+i00] * scale;
     }
 }
diff --git a/kompute/op_rope_f32.comp b/kompute/op_rope_f32.comp
index 6024c3e5e64d5..0cf83fec0031c 100644
--- a/kompute/op_rope_f32.comp
+++ b/kompute/op_rope_f32.comp
@@ -10,6 +10,7 @@
 
 #include "common.comp"
 
+// TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;
 
 layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };

From a934b2cb8a1cbe2aad1ca10a119df60bbcf8d5d1 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Tue, 14 Nov 2023 11:59:58 -0500
Subject: [PATCH 063/140] vulkan : assert various kernel requirements

---
 ggml-vulkan.cpp | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 74d9fceb6c390..d4d6d1b873463 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1416,27 +1416,34 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 case GGML_OP_SCALE:
                     {
                         const float scale = *(const float *) src1->data;
-                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/8, scale);
+                        int64_t n = ggml_nelements(dst);
+                        GGML_ASSERT(n % 8 == 0);
+                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, n/8, scale);
                     } break;
                 case GGML_OP_UNARY:
-                    switch (ggml_get_unary_op(gf->nodes[i])) {
-                        case GGML_UNARY_OP_SILU:
-                            {
-                                ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4);
-                            } break;
-                        case GGML_UNARY_OP_RELU:
-                            {
-                                ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4);
-                            } break;
-                        case GGML_UNARY_OP_GELU:
-                            {
-                                ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/8);
-                            } break;
-                        default:
-                            {
-                                fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                GGML_ASSERT(false);
-                            }
+                    {
+                        int64_t n = ggml_nelements(dst);
+                        GGML_ASSERT(n % 4 == 0);
+                        switch (ggml_get_unary_op(gf->nodes[i])) {
+                            case GGML_UNARY_OP_SILU:
+                                {
+                                    ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
+                                } break;
+                            case GGML_UNARY_OP_RELU:
+                                {
+                                    ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
+                                } break;
+                            case GGML_UNARY_OP_GELU:
+                                {
+                                    GGML_ASSERT(n % 8 == 0);
+                                    ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8);
+                                } break;
+                            default:
+                                {
+                                    fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                    GGML_ASSERT(false);
+                                }
+                        }
                     } break;
                 case GGML_OP_SOFT_MAX:
                     {
@@ -1455,6 +1462,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_RMS_NORM:
                     {
+                        GGML_ASSERT(ne00 % 4 == 0);
+
                         float eps;
                         memcpy(&eps, dst->op_params, sizeof(float));
                         ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);

From 6474fc879ac708daa22f7ac80337f9b4a323b387 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Tue, 14 Nov 2023 12:10:52 -0500
Subject: [PATCH 064/140] vulkan : handle ggml_scale for n%8 != 0

ref ggerganov/llama.cpp#3754
---
 CMakeLists.txt          |  2 ++
 ggml-vulkan.cpp         | 29 ++++++++++++++++++++---------
 kompute/op_scale.comp   | 10 +++-------
 kompute/op_scale_8.comp | 31 +++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 16 deletions(-)
 create mode 100644 kompute/op_scale_8.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39dd95eb086d6..76a03d95f9c8a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -476,6 +476,7 @@ if (LLAMA_KOMPUTE)
         # Compile our shaders
         compile_shader(SOURCES
           kompute/op_scale.comp
+          kompute/op_scale_8.comp
           kompute/op_add.comp
           kompute/op_addrow.comp
           kompute/op_mul.comp
@@ -508,6 +509,7 @@ if (LLAMA_KOMPUTE)
         # Create a custom target for our generated shaders
         add_custom_target(generated_shaders DEPENDS
           shaderop_scale.h
+          shaderop_scale_8.h
           shaderop_add.h
           shaderop_addrow.h
           shaderop_mul.h
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index d4d6d1b873463..8c048c77d4efb 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -11,6 +11,7 @@
 
 // These are generated at build time by cmake custom command
 #include "shaderop_scale.h"
+#include "shaderop_scale_8.h"
 #include "shaderop_add.h"
 #include "shaderop_addrow.h"
 #include "shaderop_mul.h"
@@ -724,8 +725,12 @@ void ggml_vk_scale(kp::Sequence& seq,
                    const std::shared_ptr<kp::Tensor>& out,
                    uint32_t inOff, uint32_t outOff,
                    uint32_t size, float scale) {
-    const static auto spirv = getSpirvShader(kp::shader_data::op_scale_comp_spv,
-        kp::shader_data::op_scale_comp_spv_len);
+    const static auto spirv_1 = getSpirvShader(
+        kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len
+    );
+    const static auto spirv_8 = getSpirvShader(
+        kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len
+    );
 
     struct PushConstants {
         uint32_t inOff, outOff;
@@ -735,11 +740,19 @@ void ggml_vk_scale(kp::Sequence& seq,
         scale
     };
 
+    const auto * spirv = &spirv_1;
+    std::string name(__func__);
+    if (size % 8 == 0) {
+        size /= 8;
+        name += "_8";
+        spirv = &spirv_8;
+    }
+
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
-    else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+    if (!komputeManager()->hasAlgorithm(name)) {
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -1416,9 +1429,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 case GGML_OP_SCALE:
                     {
                         const float scale = *(const float *) src1->data;
-                        int64_t n = ggml_nelements(dst);
-                        GGML_ASSERT(n % 8 == 0);
-                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, n/8, scale);
+                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
                     } break;
                 case GGML_OP_UNARY:
                     {
diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp
index 2ec5244352179..be68060912a1e 100644
--- a/kompute/op_scale.comp
+++ b/kompute/op_scale.comp
@@ -22,10 +22,6 @@ layout(push_constant) uniform PushConstants {
 } pcs;
 
 void main() {
-    const uint baseIndex = gl_WorkGroupID.x * 8;
-
-    for (uint x = 0; x < 8; x++) {
-        const uint i = baseIndex + x;
-        out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
-    }
-}
\ No newline at end of file
+    const uint i = gl_WorkGroupID.x;
+    out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
+}
diff --git a/kompute/op_scale_8.comp b/kompute/op_scale_8.comp
new file mode 100644
index 0000000000000..29fa9b35a55b5
--- /dev/null
+++ b/kompute/op_scale_8.comp
@@ -0,0 +1,31 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    float scale;
+} pcs;
+
+void main() {
+    const uint baseIndex = gl_WorkGroupID.x * 8;
+
+    for (uint x = 0; x < 8; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
+    }
+}

From 9c4dfd06e8172486678a37e66ff5b1a47c8b88f6 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 15 Nov 2023 15:51:55 -0500
Subject: [PATCH 065/140] mention skipped change

---
 kompute/op_softmax.comp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
index 30b6f0260e7d6..a8c2682dce8b9 100644
--- a/kompute/op_softmax.comp
+++ b/kompute/op_softmax.comp
@@ -6,6 +6,8 @@
  * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
  */
 
+// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
+
 #version 450
 
 #include "common.comp"

From 02c3309f6d3f7892803e8b75e1e6ad77d580a79b Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Tue, 14 Nov 2023 15:54:26 -0500
Subject: [PATCH 066/140] merge fixup
 (e16b9fa4baa8a09c6619b116159830e898050942)

---
 llama.cpp | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index ed6bd18e15f8b..ca170f596e2ae 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3506,6 +3506,10 @@ struct llm_build_context {
 
     llama_buffer & buf_compute;
 
+#if defined(GGML_USE_KOMPUTE)
+    ggml_kompute_context * ctx_kompute;
+#endif
+
     struct ggml_context * ctx0 = nullptr;
 
     // TODO: consider making the entire interface noexcept
@@ -3535,7 +3539,11 @@ struct llm_build_context {
         kv_head       (worst_case ? n_ctx - n_tokens : kv_self.head),
         do_rope_shift (worst_case || kv_self.has_shift),
         cb            (cb),
-        buf_compute   (lctx.buf_compute) {
+        buf_compute   (lctx.buf_compute)
+#if defined(GGML_USE_KOMPUTE)
+      , ctx_kompute   (lctx.ctx_kompute)
+#endif
+        {
             GGML_ASSERT(!!kv_self.ctx);
 
             // all initializations should be done in init()
@@ -3662,15 +3670,15 @@ struct llm_build_context {
         ggml_build_forward_expand(gf, cur);
 
 #if defined(GGML_USE_KOMPUTE)
-        if (lctx.ctx_kompute) {
-            if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
-                ggml_vk_h2d_all(lctx.ctx_kompute);
+        if (ctx_kompute) {
+            if (!ggml_vk_has_h2d_all(ctx_kompute)) {
+                ggml_vk_h2d_all(ctx_kompute);
             } else {
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, to_device_tensor);
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, inp_pos);
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_mask);
+                ggml_vk_h2d_tensor(ctx_kompute, to_device_tensor);
+                ggml_vk_h2d_tensor(ctx_kompute, inp_pos);
+                ggml_vk_h2d_tensor(ctx_kompute, KQ_mask);
                 if (K_shift) {
-                    ggml_vk_h2d_tensor(lctx.ctx_kompute, K_shift);
+                    ggml_vk_h2d_tensor(ctx_kompute, K_shift);
                 }
             }
         }
@@ -3907,15 +3915,15 @@ struct llm_build_context {
         ggml_build_forward_expand(gf, cur);
 
 #if defined(GGML_USE_KOMPUTE)
-        if (lctx.ctx_kompute) {
-            if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
-                ggml_vk_h2d_all(lctx.ctx_kompute);
+        if (ctx_kompute) {
+            if (!ggml_vk_has_h2d_all(ctx_kompute)) {
+                ggml_vk_h2d_all(ctx_kompute);
             } else {
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, to_device_tensor);
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, inp_pos);
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_mask);
+                ggml_vk_h2d_tensor(ctx_kompute, to_device_tensor);
+                ggml_vk_h2d_tensor(ctx_kompute, inp_pos);
+                ggml_vk_h2d_tensor(ctx_kompute, KQ_mask);
                 if (K_shift) {
-                    ggml_vk_h2d_tensor(lctx.ctx_kompute, K_shift);
+                    ggml_vk_h2d_tensor(ctx_kompute, K_shift);
                 }
             }
         }

From 208cd52f7d2ca3eb9708cfd457dde0592ed0e38b Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 15 Nov 2023 17:58:19 -0500
Subject: [PATCH 067/140] vulkan : implement YaRN RoPE scaling (#2268)

The NeoX cur_rot part is different because I'm pretty sure my original
implementation was wrong.
---
 ggml-vulkan.cpp          | 36 ++++++++++++-------
 kompute/common.comp      |  1 +
 kompute/op_rope_f16.comp | 40 +++++++--------------
 kompute/op_rope_f32.comp | 40 +++++++--------------
 kompute/rope_common.comp | 75 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 123 insertions(+), 69 deletions(-)
 create mode 100644 kompute/rope_common.comp

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 8c048c77d4efb..a4f9ade0ed54a 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1195,8 +1195,8 @@ void ggml_vk_rope(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    ggml_type src0t, int32_t n_dims, int32_t mode,
-    float freq_base, float freq_scale,
+    ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_orig_ctx,
+    float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
     int32_t ne01, int32_t ne02, int32_t ne03,
     uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne0,
@@ -1224,15 +1224,15 @@ void ggml_vk_rope(
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t n_dims, mode;
-        float freq_base, freq_scale;
+        int32_t n_dims, mode, n_orig_ctx;
+        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
         uint32_t nb00, nb01, nb02, nb03;
         int32_t ne0;
         uint32_t nb0, nb1, nb2, nb3;
     } pushConsts {
         safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
-        n_dims, mode,
-        freq_base, freq_scale,
+        n_dims, mode, n_orig_ctx,
+        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
         nb00, nb01, nb02, nb03,
         ne0,
         nb0, nb1, nb2, nb3
@@ -1545,13 +1545,23 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                         GGML_ASSERT(ne10 == ne02);
                         GGML_ASSERT(src0t == dstt);
                         // const int n_past = ((int32_t *) dst->op_params)[0];
-                        const int n_dims = ((int32_t *) dst->op_params)[1];
-                        const int mode   = ((int32_t *) dst->op_params)[2];
-                        float freq_base;
-                        float freq_scale;
-                        memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-                        memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-                        ggml_vk_rope(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3);
+                        const int n_dims     = ((int32_t *) dst->op_params)[1];
+                        const int mode       = ((int32_t *) dst->op_params)[2];
+                        // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
+                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+
+                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+                        memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+                        memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+                        memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+                        ggml_vk_rope(
+                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_orig_ctx,
+                            freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+                            ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
+                        );
                     } break;
                 case GGML_OP_DUP:
                 case GGML_OP_CPY:
diff --git a/kompute/common.comp b/kompute/common.comp
index 040b87375ecd2..fe0bc5d15b7d6 100644
--- a/kompute/common.comp
+++ b/kompute/common.comp
@@ -20,6 +20,7 @@
 
 #define GELU_COEF_A 0.044715
 #define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+#define TWOPI_F 6.283185307179586f
 
 #define QK_K 256
 
diff --git a/kompute/op_rope_f16.comp b/kompute/op_rope_f16.comp
index fd3943c8108c8..e4b5ccca363fb 100644
--- a/kompute/op_rope_f16.comp
+++ b/kompute/op_rope_f16.comp
@@ -8,50 +8,32 @@
 
 #version 450
 
-#include "common.comp"
-
-// TODO: use a local size of 32 or more (Metal uses 1024)
-layout(local_size_x = 1) in;
+#include "rope_common.comp"
 
 layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
 layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
 layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
 
-layout (push_constant) uniform parameter {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    int n_dims;
-    int mode;
-    float freq_base;
-    float freq_scale;
-    uint nb00;
-    uint nb01;
-    uint nb02;
-    uint nb03;
-    int ne0;
-    uint nb0;
-    uint nb1;
-    uint nb2;
-    uint nb3;
-} pcs;
-
 void main() {
     const uint i3 = gl_WorkGroupID.z;
     const uint i2 = gl_WorkGroupID.y;
     const uint i1 = gl_WorkGroupID.x;
 
     const bool is_neox = (pcs.mode & 2) != 0;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
 
     const int p = inB[pcs.inBOff + i2];
 
-    float theta = pcs.freq_scale * float(p);
+    float theta = float(p);
 
     if (!is_neox) {
         for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            const float cos_theta = cos(theta);
-            const float sin_theta = sin(theta);
+            float cos_theta, sin_theta;
+            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
 
             theta *= theta_scale;
 
@@ -68,8 +50,10 @@ void main() {
         const float inv_ndims = -1.f/pcs.n_dims;
         for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) {
             for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-                const float cos_theta = cos(theta);
-                const float sin_theta = sin(theta);
+                const uint cur_rot = ib * pcs.n_dims + ic;
+
+                float cos_theta, sin_theta;
+                rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
 
                 theta *= theta_scale;
 
diff --git a/kompute/op_rope_f32.comp b/kompute/op_rope_f32.comp
index 0cf83fec0031c..0a882879d9384 100644
--- a/kompute/op_rope_f32.comp
+++ b/kompute/op_rope_f32.comp
@@ -8,50 +8,32 @@
 
 #version 450
 
-#include "common.comp"
-
-// TODO: use a local size of 32 or more (Metal uses 1024)
-layout(local_size_x = 1) in;
+#include "rope_common.comp"
 
 layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
 layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
 
-layout (push_constant) uniform parameter {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    int n_dims;
-    int mode;
-    float freq_base;
-    float freq_scale;
-    uint nb00;
-    uint nb01;
-    uint nb02;
-    uint nb03;
-    int ne0;
-    uint nb0;
-    uint nb1;
-    uint nb2;
-    uint nb3;
-} pcs;
-
 void main() {
     const uint i3 = gl_WorkGroupID.z;
     const uint i2 = gl_WorkGroupID.y;
     const uint i1 = gl_WorkGroupID.x;
 
     const bool is_neox = (pcs.mode & 2) != 0;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
 
     const int p = inB[pcs.inBOff + i2];
 
-    float theta = pcs.freq_scale * float(p);
+    float theta = float(p);
 
     if (!is_neox) {
         for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            const float cos_theta = cos(theta);
-            const float sin_theta = sin(theta);
+            float cos_theta, sin_theta;
+            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
 
             theta *= theta_scale;
 
@@ -68,8 +50,10 @@ void main() {
         const float inv_ndims = -1.f/pcs.n_dims;
         for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) {
             for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-                const float cos_theta = cos(theta);
-                const float sin_theta = sin(theta);
+                const uint cur_rot = ib * pcs.n_dims + ic;
+
+                float cos_theta, sin_theta;
+                rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
 
                 theta *= theta_scale;
 
diff --git a/kompute/rope_common.comp b/kompute/rope_common.comp
new file mode 100644
index 0000000000000..45682dc28bdfb
--- /dev/null
+++ b/kompute/rope_common.comp
@@ -0,0 +1,75 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "common.comp"
+
+// TODO: use a local size of 32 or more (Metal uses 1024)
+layout(local_size_x = 1) in;
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int n_dims;
+    int mode;
+    int n_orig_ctx;
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+float rope_yarn_ramp(const float low, const float high, const float i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
+    out float cos_theta, out float sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+    }
+    cos_theta = cos(theta) * mscale;
+    sin_theta = sin(theta) * mscale;
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
+    return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base));
+}
+
+void rope_yarn_corr_dims(
+    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2]
+) {
+    // start and end correction dims
+    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
+    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
+}

From a4bb9c5ced174b306958fb79f11c3b5bfafcf5ea Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 23 Nov 2023 12:20:07 -0500
Subject: [PATCH 068/140] vulkan : sync with "migrate to dynamic graphs"

---
 ggml-vulkan.cpp | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index a4f9ade0ed54a..a3308191c52b6 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1350,6 +1350,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
             struct ggml_tensor * dst = gf->nodes[i];
             GGML_ASSERT(dst->data != nullptr);
 
+            switch (dst->op) {
+                case GGML_OP_NONE:
+                case GGML_OP_RESHAPE:
+                case GGML_OP_VIEW:
+                case GGML_OP_TRANSPOSE:
+                case GGML_OP_PERMUTE:
+                    continue; // noop -> next node
+            }
+
             const int32_t ne00 = src0 ? src0->ne[0] : 0;
             const int32_t ne01 = src0 ? src0->ne[1] : 0;
             const int32_t ne02 = src0 ? src0->ne[2] : 0;
@@ -1393,13 +1402,6 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
             const std::shared_ptr<kp::Tensor>& id_dst  = dst ? ggml_vk_get_tensor(ctx, dst, &off_dst)  : nullTensor;
 
             switch (dst->op) {
-                case GGML_OP_RESHAPE:
-                case GGML_OP_VIEW:
-                case GGML_OP_TRANSPOSE:
-                case GGML_OP_PERMUTE:
-                    {
-                        // noop
-                    } break;
                 case GGML_OP_ADD:
                     {
                         if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {

From 56430c3209bebbc6547cd13db32c83cc32b5f4ce Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 13 Dec 2023 16:54:06 -0500
Subject: [PATCH 069/140] relicense Vulkan backend as MIT

---
 LICENSE_SOM.txt                           | 30 -----------------------
 ggml-vulkan.cpp                           |  8 ------
 ggml-vulkan.h                             |  8 ------
 kompute/common.comp                       |  8 ------
 kompute/op_add.comp                       |  8 ------
 kompute/op_addrow.comp                    |  8 ------
 kompute/op_cpy_f16_f16.comp               |  8 ------
 kompute/op_cpy_f16_f32.comp               |  8 ------
 kompute/op_cpy_f32_f16.comp               |  8 ------
 kompute/op_diagmask.comp                  |  8 ------
 kompute/op_gelu.comp                      |  8 ------
 kompute/op_getrows.comp                   |  8 ------
 kompute/op_getrows_f16.comp               |  8 ------
 kompute/op_getrows_q4_0.comp              |  8 ------
 kompute/op_getrows_q4_1.comp              |  8 ------
 kompute/op_getrows_q6_k.comp              |  8 ------
 kompute/op_mul.comp                       |  8 ------
 kompute/op_mul_mat_f16.comp               |  8 ------
 kompute/op_mul_mat_mat_f32.comp           |  9 -------
 kompute/op_mul_mat_q4_0.comp              |  8 ------
 kompute/op_mul_mat_q4_1.comp              |  8 ------
 kompute/op_mul_mat_q6_k.comp              |  8 ------
 kompute/op_mul_mat_q8_0.comp              |  8 ------
 kompute/op_mul_mv_q_n.comp                |  8 ------
 kompute/op_mulrow.comp                    |  8 ------
 kompute/op_norm.comp                      |  8 ------
 kompute/op_relu.comp                      |  8 ------
 kompute/op_rmsnorm.comp                   |  8 ------
 kompute/op_rope_f16.comp                  |  8 ------
 kompute/op_rope_f32.comp                  |  8 ------
 kompute/op_scale.comp                     |  8 ------
 kompute/op_scale_8.comp                   |  8 ------
 kompute/op_silu.comp                      |  8 ------
 kompute/op_softmax.comp                   |  8 ------
 kompute/rope_common.comp                  |  8 ------
 kompute/src/Algorithm.cpp                 |  9 -------
 kompute/src/Core.cpp                      |  8 ------
 kompute/src/Manager.cpp                   |  8 ------
 kompute/src/OpAlgoDispatch.cpp            |  8 ------
 kompute/src/OpBufferSyncDevice.cpp        |  8 ------
 kompute/src/OpBufferSyncLocal.cpp         |  8 ------
 kompute/src/OpMemoryBarrier.cpp           |  8 ------
 kompute/src/OpTensorCopy.cpp              |  8 ------
 kompute/src/OpTensorFill.cpp              |  8 ------
 kompute/src/OpTensorSyncDevice.cpp        |  8 ------
 kompute/src/OpTensorSyncLocal.cpp         |  8 ------
 kompute/src/Sequence.cpp                  |  8 ------
 kompute/src/Tensor.cpp                    |  8 ------
 kompute/src/include/kompute/Algorithm.hpp |  9 -------
 kompute/src/include/kompute/Core.hpp      |  9 -------
 kompute/src/include/kompute/Manager.hpp   |  9 -------
 kompute/src/include/kompute/Sequence.hpp  |  9 -------
 kompute/src/include/kompute/Tensor.hpp    |  8 ------
 53 files changed, 452 deletions(-)
 delete mode 100644 LICENSE_SOM.txt

diff --git a/LICENSE_SOM.txt b/LICENSE_SOM.txt
deleted file mode 100644
index eb912c0fd9d30..0000000000000
--- a/LICENSE_SOM.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-Software for Open Models License (SOM)
-Version 1.0 dated August 30th, 2023
-
-This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
-
-This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
-
-1. Definitions
-The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
-A “Model” is the output of a machine learning algorithm, and excludes the Software.
-“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
-“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
-
-2. Grant of Rights. Subject to the conditions and limitations in section 3:
-(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
-
-(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software.  No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
-
-3. Conditions and Limitations
-(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms. 
-
-(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
-
-(C) No Trademark License. This license does not grant you rights to use the Licensor’s name, logo, or trademarks.
-
-(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
-
-(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
-
-(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index a3308191c52b6..1abf1e69970c2 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "ggml-vulkan.h"
 #include "ggml.h"
 
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index 7989cfc1fa7fb..ac8a4d4a0bc39 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #pragma once
 
 #include <cstddef>
diff --git a/kompute/common.comp b/kompute/common.comp
index fe0bc5d15b7d6..0df6db7d046fc 100644
--- a/kompute/common.comp
+++ b/kompute/common.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #extension GL_EXT_shader_16bit_storage: require
 #extension GL_EXT_shader_8bit_storage: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
diff --git a/kompute/op_add.comp b/kompute/op_add.comp
index df3fdc59cdc8e..c866734523e74 100644
--- a/kompute/op_add.comp
+++ b/kompute/op_add.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp
index bf674f8296ccf..2376a6b8f036f 100644
--- a/kompute/op_addrow.comp
+++ b/kompute/op_addrow.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp
index 652db031368e6..d57247d2dcc24 100644
--- a/kompute/op_cpy_f16_f16.comp
+++ b/kompute/op_cpy_f16_f16.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp
index aa204248c1f49..b568bcd7b2665 100644
--- a/kompute/op_cpy_f16_f32.comp
+++ b/kompute/op_cpy_f16_f32.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp
index 4fdab483108b8..99b22834308e5 100644
--- a/kompute/op_cpy_f32_f16.comp
+++ b/kompute/op_cpy_f32_f16.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_diagmask.comp b/kompute/op_diagmask.comp
index 8dc2cc60a7942..291c3fc1897ab 100644
--- a/kompute/op_diagmask.comp
+++ b/kompute/op_diagmask.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp
index 1412ee1abe1bf..5b547f414a10b 100644
--- a/kompute/op_gelu.comp
+++ b/kompute/op_gelu.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_getrows.comp b/kompute/op_getrows.comp
index a4d8bb9a0ad5d..1a5581b23a9db 100644
--- a/kompute/op_getrows.comp
+++ b/kompute/op_getrows.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 void main() {
     const uint i = gl_WorkGroupID.x;
     const int r = inB[i + pcs.inBOff];
diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp
index 3f2b167243f39..34acbcd700f72 100644
--- a/kompute/op_getrows_f16.comp
+++ b/kompute/op_getrows_f16.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp
index 0449b19877bdc..32b2e891e8fcd 100644
--- a/kompute/op_getrows_q4_0.comp
+++ b/kompute/op_getrows_q4_0.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp
index 64586cdc9c788..87f2fbe17bb3a 100644
--- a/kompute/op_getrows_q4_1.comp
+++ b/kompute/op_getrows_q4_1.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_getrows_q6_k.comp b/kompute/op_getrows_q6_k.comp
index 95817b4871a40..9ce3545d1ecf4 100644
--- a/kompute/op_getrows_q6_k.comp
+++ b/kompute/op_getrows_q6_k.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp
index 662ea8177f7da..d599460c3e961 100644
--- a/kompute/op_mul.comp
+++ b/kompute/op_mul.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp
index b56d14f770456..dd1e139794d53 100644
--- a/kompute/op_mul_mat_f16.comp
+++ b/kompute/op_mul_mat_f16.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute/op_mul_mat_mat_f32.comp
index a2dba05608fc7..6cc5558b2725d 100644
--- a/kompute/op_mul_mat_mat_f32.comp
+++ b/kompute/op_mul_mat_mat_f32.comp
@@ -1,12 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models
- * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
- * of this license should accompany this software. Except as expressly granted
- * in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp
index 165df3c376163..03788c92090b6 100644
--- a/kompute/op_mul_mat_q4_0.comp
+++ b/kompute/op_mul_mat_q4_0.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp
index 683b695caf95d..0ae8f8c7d5d67 100644
--- a/kompute/op_mul_mat_q4_1.comp
+++ b/kompute/op_mul_mat_q4_1.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp
index 6148053b279e2..c9baebdf4baac 100644
--- a/kompute/op_mul_mat_q6_k.comp
+++ b/kompute/op_mul_mat_q6_k.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_mul_mat_q8_0.comp b/kompute/op_mul_mat_q8_0.comp
index 2ba48127b7576..1c4ddbb083ed7 100644
--- a/kompute/op_mul_mat_q8_0.comp
+++ b/kompute/op_mul_mat_q8_0.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp
index a9b64fe167a29..8b6e6a2e2a6f2 100644
--- a/kompute/op_mul_mv_q_n.comp
+++ b/kompute/op_mul_mv_q_n.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 void main() {
     if (gl_SubgroupInvocationID > 31)
         return;
diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp
index 955fe26bf0dc6..ae71063208c2f 100644
--- a/kompute/op_mulrow.comp
+++ b/kompute/op_mulrow.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp
index 1d685cf362378..ad0c3c01b9dd0 100644
--- a/kompute/op_norm.comp
+++ b/kompute/op_norm.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp
index c6ed044a38831..52a601fe6da6a 100644
--- a/kompute/op_relu.comp
+++ b/kompute/op_relu.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp
index 5ebaf22691376..da658c1601e7c 100644
--- a/kompute/op_rmsnorm.comp
+++ b/kompute/op_rmsnorm.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_rope_f16.comp b/kompute/op_rope_f16.comp
index e4b5ccca363fb..3abe3ed33f701 100644
--- a/kompute/op_rope_f16.comp
+++ b/kompute/op_rope_f16.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "rope_common.comp"
diff --git a/kompute/op_rope_f32.comp b/kompute/op_rope_f32.comp
index 0a882879d9384..104ae0ba4836c 100644
--- a/kompute/op_rope_f32.comp
+++ b/kompute/op_rope_f32.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "rope_common.comp"
diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp
index be68060912a1e..bdae267382093 100644
--- a/kompute/op_scale.comp
+++ b/kompute/op_scale.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_scale_8.comp b/kompute/op_scale_8.comp
index 29fa9b35a55b5..ada69754b2c14 100644
--- a/kompute/op_scale_8.comp
+++ b/kompute/op_scale_8.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp
index 9233fd5a1fc30..0fb8e4b74056d 100644
--- a/kompute/op_silu.comp
+++ b/kompute/op_silu.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #version 450
 
 #include "common.comp"
diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
index a8c2682dce8b9..89de1b701851d 100644
--- a/kompute/op_softmax.comp
+++ b/kompute/op_softmax.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 // TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
 
 #version 450
diff --git a/kompute/rope_common.comp b/kompute/rope_common.comp
index 45682dc28bdfb..57ba6597a7eb2 100644
--- a/kompute/rope_common.comp
+++ b/kompute/rope_common.comp
@@ -1,11 +1,3 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "common.comp"
 
 // TODO: use a local size of 32 or more (Metal uses 1024)
diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp
index 0378591bd576b..c2d8554e1fc5e 100644
--- a/kompute/src/Algorithm.cpp
+++ b/kompute/src/Algorithm.cpp
@@ -1,13 +1,4 @@
 // SPDX-License-Identifier: Apache-2.0
-
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include <fstream>
 
 #include "kompute/Algorithm.hpp"
diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp
index 9b0483232cda8..020f441604022 100644
--- a/kompute/src/Core.cpp
+++ b/kompute/src/Core.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/Core.hpp"
 
 #ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
index c5060b1ead35a..0c588e19be7b9 100644
--- a/kompute/src/Manager.cpp
+++ b/kompute/src/Manager.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/Manager.hpp"
 #include "fmt/format.h"
 #include "kompute/logger/Logger.hpp"
diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp
index dc39cdc3fd0b1..edc0f6eb63448 100644
--- a/kompute/src/OpAlgoDispatch.cpp
+++ b/kompute/src/OpAlgoDispatch.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/operations/OpAlgoDispatch.hpp"
 
 namespace kp {
diff --git a/kompute/src/OpBufferSyncDevice.cpp b/kompute/src/OpBufferSyncDevice.cpp
index baaafda0fa386..1812d04b2428e 100644
--- a/kompute/src/OpBufferSyncDevice.cpp
+++ b/kompute/src/OpBufferSyncDevice.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/operations/OpBufferSyncDevice.hpp"
 
 namespace kp {
diff --git a/kompute/src/OpBufferSyncLocal.cpp b/kompute/src/OpBufferSyncLocal.cpp
index 63739a351e07c..a829819fa603a 100644
--- a/kompute/src/OpBufferSyncLocal.cpp
+++ b/kompute/src/OpBufferSyncLocal.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/operations/OpBufferSyncLocal.hpp"
 
 namespace kp {
diff --git a/kompute/src/OpMemoryBarrier.cpp b/kompute/src/OpMemoryBarrier.cpp
index 89d44d85e6599..1f075a3c434e5 100644
--- a/kompute/src/OpMemoryBarrier.cpp
+++ b/kompute/src/OpMemoryBarrier.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/operations/OpMemoryBarrier.hpp"
 
 namespace kp {
diff --git a/kompute/src/OpTensorCopy.cpp b/kompute/src/OpTensorCopy.cpp
index e732cc4137c00..1eaf428b85556 100644
--- a/kompute/src/OpTensorCopy.cpp
+++ b/kompute/src/OpTensorCopy.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/operations/OpTensorCopy.hpp"
 #include "kompute/Tensor.hpp"
 
diff --git a/kompute/src/OpTensorFill.cpp b/kompute/src/OpTensorFill.cpp
index da477dcc7f6ee..bda7d6040eb21 100644
--- a/kompute/src/OpTensorFill.cpp
+++ b/kompute/src/OpTensorFill.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/operations/OpTensorFill.hpp"
 #include "kompute/Tensor.hpp"
 
diff --git a/kompute/src/OpTensorSyncDevice.cpp b/kompute/src/OpTensorSyncDevice.cpp
index 4cc6abf71d08a..b563529ea7822 100644
--- a/kompute/src/OpTensorSyncDevice.cpp
+++ b/kompute/src/OpTensorSyncDevice.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/operations/OpTensorSyncDevice.hpp"
 
 namespace kp {
diff --git a/kompute/src/OpTensorSyncLocal.cpp b/kompute/src/OpTensorSyncLocal.cpp
index 1aa091b733c6b..7818db565aaa7 100644
--- a/kompute/src/OpTensorSyncLocal.cpp
+++ b/kompute/src/OpTensorSyncLocal.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/Tensor.hpp"
 
 #include "kompute/operations/OpTensorSyncLocal.hpp"
diff --git a/kompute/src/Sequence.cpp b/kompute/src/Sequence.cpp
index 3b5fb5fb59b4d..da3b379a3104c 100644
--- a/kompute/src/Sequence.cpp
+++ b/kompute/src/Sequence.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/Sequence.hpp"
 
 namespace kp {
diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp
index 65279206d017e..84dce08e02457 100644
--- a/kompute/src/Tensor.cpp
+++ b/kompute/src/Tensor.cpp
@@ -1,13 +1,5 @@
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #include "kompute/Tensor.hpp"
 
 namespace kp {
diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp
index ef11234eeb621..e5fef1f56d849 100644
--- a/kompute/src/include/kompute/Algorithm.hpp
+++ b/kompute/src/include/kompute/Algorithm.hpp
@@ -1,13 +1,4 @@
 // SPDX-License-Identifier: Apache-2.0
-
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #pragma once
 
 #include "kompute/Core.hpp"
diff --git a/kompute/src/include/kompute/Core.hpp b/kompute/src/include/kompute/Core.hpp
index 99222cbde9f8d..406e6b5d481d5 100644
--- a/kompute/src/include/kompute/Core.hpp
+++ b/kompute/src/include/kompute/Core.hpp
@@ -1,13 +1,4 @@
 // SPDX-License-Identifier: Apache-2.0
-
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #pragma once
 
 #include <vulkan/vulkan.hpp>
diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp
index e910b2b81838c..780c352ebc43a 100644
--- a/kompute/src/include/kompute/Manager.hpp
+++ b/kompute/src/include/kompute/Manager.hpp
@@ -1,13 +1,4 @@
 // SPDX-License-Identifier: Apache-2.0
-
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #pragma once
 
 #include <set>
diff --git a/kompute/src/include/kompute/Sequence.hpp b/kompute/src/include/kompute/Sequence.hpp
index e282242f1d991..3b29a6e2e66ae 100644
--- a/kompute/src/include/kompute/Sequence.hpp
+++ b/kompute/src/include/kompute/Sequence.hpp
@@ -1,13 +1,4 @@
 // SPDX-License-Identifier: Apache-2.0
-
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
 #pragma once
 
 #include "kompute/Core.hpp"
diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp
index 2ab88eb308178..20939093da7af 100644
--- a/kompute/src/include/kompute/Tensor.hpp
+++ b/kompute/src/include/kompute/Tensor.hpp
@@ -1,12 +1,4 @@
 // SPDX-License-Identifier: Apache-2.0
-
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
 #pragma once
 
 #include "kompute/Core.hpp"

From 3e09e127ebba12d175d180d65e6c1da165e8424f Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 13 Dec 2023 17:10:32 -0500
Subject: [PATCH 070/140] rename ggml-vulkan -> ggml-kompute

---
 CMakeLists.txt                      | 10 +++++-----
 examples/main/main.cpp              |  2 +-
 ggml-vulkan.cpp => ggml-kompute.cpp |  2 +-
 ggml-vulkan.h => ggml-kompute.h     |  0
 llama.cpp                           |  2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)
 rename ggml-vulkan.cpp => ggml-kompute.cpp (99%)
 rename ggml-vulkan.h => ggml-kompute.h (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 76f489691b434..0e9183625ee73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -510,15 +510,15 @@ if (LLAMA_KOMPUTE)
 
         # Create a custom command that depends on the generated_shaders
         add_custom_command(
-            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
-            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
+            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
             DEPENDS generated_shaders
-            COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
+            COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
         )
 
         # Add the stamp to the main sources to ensure dependency tracking
-        set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
-        set(GGML_HEADERS_KOMPUTE ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
+        set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+        set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
         add_compile_definitions(GGML_USE_KOMPUTE)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index afcb566c4703f..31cc074341499 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -32,7 +32,7 @@
 #endif
 
 #if defined(GGML_USE_KOMPUTE)
-#include "ggml-vulkan.h"
+#include "ggml-kompute.h"
 #endif
 
 static llama_context           ** g_ctx;
diff --git a/ggml-vulkan.cpp b/ggml-kompute.cpp
similarity index 99%
rename from ggml-vulkan.cpp
rename to ggml-kompute.cpp
index 1abf1e69970c2..df8bcca3dadd3 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-kompute.cpp
@@ -1,4 +1,4 @@
-#include "ggml-vulkan.h"
+#include "ggml-kompute.h"
 #include "ggml.h"
 
 // These are generated at build time by cmake custom command
diff --git a/ggml-vulkan.h b/ggml-kompute.h
similarity index 100%
rename from ggml-vulkan.h
rename to ggml-kompute.h
diff --git a/llama.cpp b/llama.cpp
index f7991b275c0ac..97a688f4b4f81 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12,7 +12,7 @@
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
 #elif defined(GGML_USE_KOMPUTE)
-#   include "ggml-vulkan.h"
+#   include "ggml-kompute.h"
 #endif
 
 #ifdef GGML_USE_METAL

From 27631dbb6eabfc24a6ec4406967145e46d345542 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 13 Dec 2023 17:22:19 -0500
Subject: [PATCH 071/140] separate shaders from kompute itself

---
 CMakeLists.txt                                | 69 ++++++++++---------
 {kompute => kompute-shaders}/common.comp      |  0
 {kompute => kompute-shaders}/op_add.comp      |  0
 {kompute => kompute-shaders}/op_addrow.comp   |  0
 .../op_cpy_f16_f16.comp                       |  0
 .../op_cpy_f16_f32.comp                       |  0
 .../op_cpy_f32_f16.comp                       |  0
 .../op_cpy_f32_f32.comp                       |  0
 {kompute => kompute-shaders}/op_diagmask.comp |  0
 {kompute => kompute-shaders}/op_gelu.comp     |  0
 {kompute => kompute-shaders}/op_getrows.comp  |  0
 .../op_getrows_f16.comp                       |  0
 .../op_getrows_q4_0.comp                      |  0
 .../op_getrows_q4_1.comp                      |  0
 .../op_getrows_q6_k.comp                      |  0
 {kompute => kompute-shaders}/op_mul.comp      |  0
 .../op_mul_mat_f16.comp                       |  0
 .../op_mul_mat_mat_f32.comp                   |  0
 .../op_mul_mat_q4_0.comp                      |  0
 .../op_mul_mat_q4_1.comp                      |  0
 .../op_mul_mat_q6_k.comp                      |  0
 .../op_mul_mat_q8_0.comp                      |  0
 .../op_mul_mv_q_n.comp                        |  0
 {kompute => kompute-shaders}/op_mulrow.comp   |  0
 {kompute => kompute-shaders}/op_norm.comp     |  0
 {kompute => kompute-shaders}/op_relu.comp     |  0
 {kompute => kompute-shaders}/op_rmsnorm.comp  |  0
 {kompute => kompute-shaders}/op_rope_f16.comp |  0
 {kompute => kompute-shaders}/op_rope_f32.comp |  0
 {kompute => kompute-shaders}/op_scale.comp    |  0
 {kompute => kompute-shaders}/op_scale_8.comp  |  0
 {kompute => kompute-shaders}/op_silu.comp     |  0
 {kompute => kompute-shaders}/op_softmax.comp  |  0
 {kompute => kompute-shaders}/rope_common.comp |  0
 34 files changed, 35 insertions(+), 34 deletions(-)
 rename {kompute => kompute-shaders}/common.comp (100%)
 rename {kompute => kompute-shaders}/op_add.comp (100%)
 rename {kompute => kompute-shaders}/op_addrow.comp (100%)
 rename {kompute => kompute-shaders}/op_cpy_f16_f16.comp (100%)
 rename {kompute => kompute-shaders}/op_cpy_f16_f32.comp (100%)
 rename {kompute => kompute-shaders}/op_cpy_f32_f16.comp (100%)
 rename {kompute => kompute-shaders}/op_cpy_f32_f32.comp (100%)
 rename {kompute => kompute-shaders}/op_diagmask.comp (100%)
 rename {kompute => kompute-shaders}/op_gelu.comp (100%)
 rename {kompute => kompute-shaders}/op_getrows.comp (100%)
 rename {kompute => kompute-shaders}/op_getrows_f16.comp (100%)
 rename {kompute => kompute-shaders}/op_getrows_q4_0.comp (100%)
 rename {kompute => kompute-shaders}/op_getrows_q4_1.comp (100%)
 rename {kompute => kompute-shaders}/op_getrows_q6_k.comp (100%)
 rename {kompute => kompute-shaders}/op_mul.comp (100%)
 rename {kompute => kompute-shaders}/op_mul_mat_f16.comp (100%)
 rename {kompute => kompute-shaders}/op_mul_mat_mat_f32.comp (100%)
 rename {kompute => kompute-shaders}/op_mul_mat_q4_0.comp (100%)
 rename {kompute => kompute-shaders}/op_mul_mat_q4_1.comp (100%)
 rename {kompute => kompute-shaders}/op_mul_mat_q6_k.comp (100%)
 rename {kompute => kompute-shaders}/op_mul_mat_q8_0.comp (100%)
 rename {kompute => kompute-shaders}/op_mul_mv_q_n.comp (100%)
 rename {kompute => kompute-shaders}/op_mulrow.comp (100%)
 rename {kompute => kompute-shaders}/op_norm.comp (100%)
 rename {kompute => kompute-shaders}/op_relu.comp (100%)
 rename {kompute => kompute-shaders}/op_rmsnorm.comp (100%)
 rename {kompute => kompute-shaders}/op_rope_f16.comp (100%)
 rename {kompute => kompute-shaders}/op_rope_f32.comp (100%)
 rename {kompute => kompute-shaders}/op_scale.comp (100%)
 rename {kompute => kompute-shaders}/op_scale_8.comp (100%)
 rename {kompute => kompute-shaders}/op_silu.comp (100%)
 rename {kompute => kompute-shaders}/op_softmax.comp (100%)
 rename {kompute => kompute-shaders}/rope_common.comp (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e9183625ee73..8260dd6cddae3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -403,15 +403,16 @@ if (LLAMA_KOMPUTE)
       set(multiValueArgs SOURCES)
       cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
       foreach(source ${compile_shader_SOURCES})
-        set(spv_file ${source}.spv)
+        get_filename_component(filename ${source} NAME)
+        set(spv_file ${filename}.spv)
         add_custom_command(
             OUTPUT ${spv_file}
             DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-              ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp
-              ${CMAKE_CURRENT_SOURCE_DIR}/kompute/op_getrows.comp
-              ${CMAKE_CURRENT_SOURCE_DIR}/kompute/op_mul_mv_q_n.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
               COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-            COMMENT "Compiling ${source} to ${source}.spv"
+            COMMENT "Compiling ${source} to ${spv_file}"
         )
 
         get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
@@ -444,35 +445,35 @@ if (LLAMA_KOMPUTE)
 
         # Compile our shaders
         compile_shader(SOURCES
-          kompute/op_scale.comp
-          kompute/op_scale_8.comp
-          kompute/op_add.comp
-          kompute/op_addrow.comp
-          kompute/op_mul.comp
-          kompute/op_mulrow.comp
-          kompute/op_silu.comp
-          kompute/op_relu.comp
-          kompute/op_gelu.comp
-          kompute/op_softmax.comp
-          kompute/op_norm.comp
-          kompute/op_rmsnorm.comp
-          kompute/op_diagmask.comp
-          kompute/op_mul_mat_mat_f32.comp
-          kompute/op_mul_mat_f16.comp
-          kompute/op_mul_mat_q8_0.comp
-          kompute/op_mul_mat_q4_0.comp
-          kompute/op_mul_mat_q4_1.comp
-          kompute/op_mul_mat_q6_k.comp
-          kompute/op_getrows_f16.comp
-          kompute/op_getrows_q4_0.comp
-          kompute/op_getrows_q4_1.comp
-          kompute/op_getrows_q6_k.comp
-          kompute/op_rope_f16.comp
-          kompute/op_rope_f32.comp
-          kompute/op_cpy_f16_f16.comp
-          kompute/op_cpy_f16_f32.comp
-          kompute/op_cpy_f32_f16.comp
-          kompute/op_cpy_f32_f32.comp
+          kompute-shaders/op_scale.comp
+          kompute-shaders/op_scale_8.comp
+          kompute-shaders/op_add.comp
+          kompute-shaders/op_addrow.comp
+          kompute-shaders/op_mul.comp
+          kompute-shaders/op_mulrow.comp
+          kompute-shaders/op_silu.comp
+          kompute-shaders/op_relu.comp
+          kompute-shaders/op_gelu.comp
+          kompute-shaders/op_softmax.comp
+          kompute-shaders/op_norm.comp
+          kompute-shaders/op_rmsnorm.comp
+          kompute-shaders/op_diagmask.comp
+          kompute-shaders/op_mul_mat_mat_f32.comp
+          kompute-shaders/op_mul_mat_f16.comp
+          kompute-shaders/op_mul_mat_q8_0.comp
+          kompute-shaders/op_mul_mat_q4_0.comp
+          kompute-shaders/op_mul_mat_q4_1.comp
+          kompute-shaders/op_mul_mat_q6_k.comp
+          kompute-shaders/op_getrows_f16.comp
+          kompute-shaders/op_getrows_q4_0.comp
+          kompute-shaders/op_getrows_q4_1.comp
+          kompute-shaders/op_getrows_q6_k.comp
+          kompute-shaders/op_rope_f16.comp
+          kompute-shaders/op_rope_f32.comp
+          kompute-shaders/op_cpy_f16_f16.comp
+          kompute-shaders/op_cpy_f16_f32.comp
+          kompute-shaders/op_cpy_f32_f16.comp
+          kompute-shaders/op_cpy_f32_f32.comp
         )
 
         # Create a custom target for our generated shaders
diff --git a/kompute/common.comp b/kompute-shaders/common.comp
similarity index 100%
rename from kompute/common.comp
rename to kompute-shaders/common.comp
diff --git a/kompute/op_add.comp b/kompute-shaders/op_add.comp
similarity index 100%
rename from kompute/op_add.comp
rename to kompute-shaders/op_add.comp
diff --git a/kompute/op_addrow.comp b/kompute-shaders/op_addrow.comp
similarity index 100%
rename from kompute/op_addrow.comp
rename to kompute-shaders/op_addrow.comp
diff --git a/kompute/op_cpy_f16_f16.comp b/kompute-shaders/op_cpy_f16_f16.comp
similarity index 100%
rename from kompute/op_cpy_f16_f16.comp
rename to kompute-shaders/op_cpy_f16_f16.comp
diff --git a/kompute/op_cpy_f16_f32.comp b/kompute-shaders/op_cpy_f16_f32.comp
similarity index 100%
rename from kompute/op_cpy_f16_f32.comp
rename to kompute-shaders/op_cpy_f16_f32.comp
diff --git a/kompute/op_cpy_f32_f16.comp b/kompute-shaders/op_cpy_f32_f16.comp
similarity index 100%
rename from kompute/op_cpy_f32_f16.comp
rename to kompute-shaders/op_cpy_f32_f16.comp
diff --git a/kompute/op_cpy_f32_f32.comp b/kompute-shaders/op_cpy_f32_f32.comp
similarity index 100%
rename from kompute/op_cpy_f32_f32.comp
rename to kompute-shaders/op_cpy_f32_f32.comp
diff --git a/kompute/op_diagmask.comp b/kompute-shaders/op_diagmask.comp
similarity index 100%
rename from kompute/op_diagmask.comp
rename to kompute-shaders/op_diagmask.comp
diff --git a/kompute/op_gelu.comp b/kompute-shaders/op_gelu.comp
similarity index 100%
rename from kompute/op_gelu.comp
rename to kompute-shaders/op_gelu.comp
diff --git a/kompute/op_getrows.comp b/kompute-shaders/op_getrows.comp
similarity index 100%
rename from kompute/op_getrows.comp
rename to kompute-shaders/op_getrows.comp
diff --git a/kompute/op_getrows_f16.comp b/kompute-shaders/op_getrows_f16.comp
similarity index 100%
rename from kompute/op_getrows_f16.comp
rename to kompute-shaders/op_getrows_f16.comp
diff --git a/kompute/op_getrows_q4_0.comp b/kompute-shaders/op_getrows_q4_0.comp
similarity index 100%
rename from kompute/op_getrows_q4_0.comp
rename to kompute-shaders/op_getrows_q4_0.comp
diff --git a/kompute/op_getrows_q4_1.comp b/kompute-shaders/op_getrows_q4_1.comp
similarity index 100%
rename from kompute/op_getrows_q4_1.comp
rename to kompute-shaders/op_getrows_q4_1.comp
diff --git a/kompute/op_getrows_q6_k.comp b/kompute-shaders/op_getrows_q6_k.comp
similarity index 100%
rename from kompute/op_getrows_q6_k.comp
rename to kompute-shaders/op_getrows_q6_k.comp
diff --git a/kompute/op_mul.comp b/kompute-shaders/op_mul.comp
similarity index 100%
rename from kompute/op_mul.comp
rename to kompute-shaders/op_mul.comp
diff --git a/kompute/op_mul_mat_f16.comp b/kompute-shaders/op_mul_mat_f16.comp
similarity index 100%
rename from kompute/op_mul_mat_f16.comp
rename to kompute-shaders/op_mul_mat_f16.comp
diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute-shaders/op_mul_mat_mat_f32.comp
similarity index 100%
rename from kompute/op_mul_mat_mat_f32.comp
rename to kompute-shaders/op_mul_mat_mat_f32.comp
diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute-shaders/op_mul_mat_q4_0.comp
similarity index 100%
rename from kompute/op_mul_mat_q4_0.comp
rename to kompute-shaders/op_mul_mat_q4_0.comp
diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute-shaders/op_mul_mat_q4_1.comp
similarity index 100%
rename from kompute/op_mul_mat_q4_1.comp
rename to kompute-shaders/op_mul_mat_q4_1.comp
diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute-shaders/op_mul_mat_q6_k.comp
similarity index 100%
rename from kompute/op_mul_mat_q6_k.comp
rename to kompute-shaders/op_mul_mat_q6_k.comp
diff --git a/kompute/op_mul_mat_q8_0.comp b/kompute-shaders/op_mul_mat_q8_0.comp
similarity index 100%
rename from kompute/op_mul_mat_q8_0.comp
rename to kompute-shaders/op_mul_mat_q8_0.comp
diff --git a/kompute/op_mul_mv_q_n.comp b/kompute-shaders/op_mul_mv_q_n.comp
similarity index 100%
rename from kompute/op_mul_mv_q_n.comp
rename to kompute-shaders/op_mul_mv_q_n.comp
diff --git a/kompute/op_mulrow.comp b/kompute-shaders/op_mulrow.comp
similarity index 100%
rename from kompute/op_mulrow.comp
rename to kompute-shaders/op_mulrow.comp
diff --git a/kompute/op_norm.comp b/kompute-shaders/op_norm.comp
similarity index 100%
rename from kompute/op_norm.comp
rename to kompute-shaders/op_norm.comp
diff --git a/kompute/op_relu.comp b/kompute-shaders/op_relu.comp
similarity index 100%
rename from kompute/op_relu.comp
rename to kompute-shaders/op_relu.comp
diff --git a/kompute/op_rmsnorm.comp b/kompute-shaders/op_rmsnorm.comp
similarity index 100%
rename from kompute/op_rmsnorm.comp
rename to kompute-shaders/op_rmsnorm.comp
diff --git a/kompute/op_rope_f16.comp b/kompute-shaders/op_rope_f16.comp
similarity index 100%
rename from kompute/op_rope_f16.comp
rename to kompute-shaders/op_rope_f16.comp
diff --git a/kompute/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp
similarity index 100%
rename from kompute/op_rope_f32.comp
rename to kompute-shaders/op_rope_f32.comp
diff --git a/kompute/op_scale.comp b/kompute-shaders/op_scale.comp
similarity index 100%
rename from kompute/op_scale.comp
rename to kompute-shaders/op_scale.comp
diff --git a/kompute/op_scale_8.comp b/kompute-shaders/op_scale_8.comp
similarity index 100%
rename from kompute/op_scale_8.comp
rename to kompute-shaders/op_scale_8.comp
diff --git a/kompute/op_silu.comp b/kompute-shaders/op_silu.comp
similarity index 100%
rename from kompute/op_silu.comp
rename to kompute-shaders/op_silu.comp
diff --git a/kompute/op_softmax.comp b/kompute-shaders/op_softmax.comp
similarity index 100%
rename from kompute/op_softmax.comp
rename to kompute-shaders/op_softmax.comp
diff --git a/kompute/rope_common.comp b/kompute-shaders/rope_common.comp
similarity index 100%
rename from kompute/rope_common.comp
rename to kompute-shaders/rope_common.comp

From b906e126ca1aace9aebf2b705a033a78998e4ef5 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 13 Dec 2023 17:30:38 -0500
Subject: [PATCH 072/140] kompute : fix compile warnings

---
 ggml-kompute.cpp | 70 +++++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index df8bcca3dadd3..f70231bedaef2 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -64,7 +64,7 @@ struct ggml_kompute_context {
 // we *have* to have the kompute manager no matter what for device discovery, but the kompute context
 // is only created when a device is set and vulkan is explicitly turned on.
 ggml_kompute_context *s_kompute_context = nullptr;
-kp::Manager *komputeManager() {
+static kp::Manager *komputeManager() {
     static kp::Manager *s_mgr = nullptr;
     if (s_mgr && !s_mgr->hasInstance()) {
         delete s_mgr;
@@ -551,7 +551,7 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
     komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
 }
 
-std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
+static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
     if (size % sizeof(uint32_t) != 0) {
         throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
     }
@@ -573,7 +573,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
     return a / b;
 }
 
-void ggml_vk_add(
+static void ggml_vk_add(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -621,7 +621,7 @@ void ggml_vk_add(
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_addrow(kp::Sequence& seq,
+static void ggml_vk_addrow(kp::Sequence& seq,
                  const std::shared_ptr<kp::Tensor>& inA,
                  const std::shared_ptr<kp::Tensor>& inB,
                  const std::shared_ptr<kp::Tensor>& out,
@@ -652,7 +652,7 @@ void ggml_vk_addrow(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_mul(kp::Sequence& seq,
+static void ggml_vk_mul(kp::Sequence& seq,
                     const std::shared_ptr<kp::Tensor>& inA,
                     const std::shared_ptr<kp::Tensor>& inB,
                     const std::shared_ptr<kp::Tensor>& out,
@@ -681,7 +681,7 @@ void ggml_vk_mul(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_mulrow(kp::Sequence& seq,
+static void ggml_vk_mulrow(kp::Sequence& seq,
                  const std::shared_ptr<kp::Tensor>& inA,
                  const std::shared_ptr<kp::Tensor>& inB,
                  const std::shared_ptr<kp::Tensor>& out,
@@ -712,7 +712,7 @@ void ggml_vk_mulrow(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_scale(kp::Sequence& seq,
+static void ggml_vk_scale(kp::Sequence& seq,
                    const std::shared_ptr<kp::Tensor>& in,
                    const std::shared_ptr<kp::Tensor>& out,
                    uint32_t inOff, uint32_t outOff,
@@ -753,7 +753,7 @@ void ggml_vk_scale(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_xxlu(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
+static void ggml_vk_xxlu(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
                   const std::shared_ptr<kp::Tensor>& in,
                   const std::shared_ptr<kp::Tensor>& out,
                   uint32_t inOff, uint32_t outOff,
@@ -778,7 +778,7 @@ void ggml_vk_xxlu(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
 }
 
 template <typename... Args>
-void ggml_vk_silu(Args&&... args) {
+static void ggml_vk_silu(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
         kp::shader_data::op_silu_comp_spv_len);
 
@@ -786,7 +786,7 @@ void ggml_vk_silu(Args&&... args) {
 }
 
 template <typename... Args>
-void ggml_vk_relu(Args&&... args) {
+static void ggml_vk_relu(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
         kp::shader_data::op_relu_comp_spv_len);
 
@@ -794,14 +794,14 @@ void ggml_vk_relu(Args&&... args) {
 }
 
 template <typename... Args>
-void ggml_vk_gelu(Args&&... args) {
+static void ggml_vk_gelu(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
         kp::shader_data::op_gelu_comp_spv_len);
 
     ggml_vk_xxlu(spirv, std::forward<Args>(args)...);
 }
 
-void ggml_vk_soft_max(kp::Sequence& seq,
+static void ggml_vk_soft_max(kp::Sequence& seq,
                       const std::shared_ptr<kp::Tensor>& in,
                       const std::shared_ptr<kp::Tensor>& out,
                       uint32_t inOff, uint32_t outOff,
@@ -833,7 +833,7 @@ void ggml_vk_soft_max(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
+static void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
                    const std::shared_ptr<kp::Tensor>& in,
                    const std::shared_ptr<kp::Tensor>& out,
                    uint32_t inOff, uint32_t outOff,
@@ -865,7 +865,7 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
 }
 
 template <typename... Args>
-void ggml_vk_norm(Args&&... args) {
+static void ggml_vk_norm(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
         kp::shader_data::op_norm_comp_spv_len);
 
@@ -873,14 +873,14 @@ void ggml_vk_norm(Args&&... args) {
 }
 
 template <typename... Args>
-void ggml_vk_rms_norm(Args&&... args) {
+static void ggml_vk_rms_norm(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
         kp::shader_data::op_rmsnorm_comp_spv_len);
 
     ggml_vk_norm_(spirv, std::forward<Args>(args)...);
 }
 
-void ggml_vk_diag_mask_inf(kp::Sequence& seq,
+static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
                            const std::shared_ptr<kp::Tensor>& in,
                            const std::shared_ptr<kp::Tensor>& out,
                            uint32_t inOff, uint32_t outOff,
@@ -912,7 +912,7 @@ void ggml_vk_diag_mask_inf(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_mul_mat_f16(kp::Sequence& seq,
+static void ggml_vk_mul_mat_f16(kp::Sequence& seq,
                          const std::shared_ptr<kp::Tensor>& inA,
                          const std::shared_ptr<kp::Tensor>& inB,
                          const std::shared_ptr<kp::Tensor>& out,
@@ -951,7 +951,7 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_mul_mat_q8_0(kp::Sequence& seq,
+static void ggml_vk_mul_mat_q8_0(kp::Sequence& seq,
                          const std::shared_ptr<kp::Tensor>& inA,
                          const std::shared_ptr<kp::Tensor>& inB,
                          const std::shared_ptr<kp::Tensor>& out,
@@ -989,7 +989,7 @@ void ggml_vk_mul_mat_q8_0(kp::Sequence& seq,
 }
 
 
-void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
+static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
                          const std::shared_ptr<kp::Tensor>& inA,
                          const std::shared_ptr<kp::Tensor>& inB,
                          const std::shared_ptr<kp::Tensor>& out,
@@ -1039,7 +1039,7 @@ void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
+static void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
                           const std::shared_ptr<kp::Tensor>& inA,
                           const std::shared_ptr<kp::Tensor>& inB,
                           const std::shared_ptr<kp::Tensor>& out,
@@ -1069,7 +1069,7 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
 }
 
 template <typename... Args>
-void ggml_vk_mul_mat_q4_0(Args&&... args) {
+static void ggml_vk_mul_mat_q4_0(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
         kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
 
@@ -1077,14 +1077,14 @@ void ggml_vk_mul_mat_q4_0(Args&&... args) {
 }
 
 template <typename... Args>
-void ggml_vk_mul_mat_q4_1(Args&&... args) {
+static void ggml_vk_mul_mat_q4_1(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
         kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
 
     ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
-void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
+static void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
                           const std::shared_ptr<kp::Tensor>& inA,
                           const std::shared_ptr<kp::Tensor>& inB,
                           const std::shared_ptr<kp::Tensor>& out,
@@ -1116,7 +1116,7 @@ void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
+static void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
                       unsigned element_size, unsigned qk,
                       kp::Sequence& seq,
                       const std::shared_ptr<kp::Tensor>& inA,
@@ -1151,7 +1151,7 @@ void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
 }
 
 template <typename... Args>
-void ggml_vk_get_rows_f16(Args&&... args) {
+static void ggml_vk_get_rows_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
         kp::shader_data::op_getrows_f16_comp_spv_len);
 
@@ -1159,7 +1159,7 @@ void ggml_vk_get_rows_f16(Args&&... args) {
 }
 
 template <typename... Args>
-void ggml_vk_get_rows_q4_0(Args&&... args) {
+static void ggml_vk_get_rows_q4_0(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
         kp::shader_data::op_getrows_q4_0_comp_spv_len);
 
@@ -1167,7 +1167,7 @@ void ggml_vk_get_rows_q4_0(Args&&... args) {
 }
 
 template <typename... Args>
-void ggml_vk_get_rows_q4_1(Args&&... args) {
+static void ggml_vk_get_rows_q4_1(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
         kp::shader_data::op_getrows_q4_1_comp_spv_len);
 
@@ -1175,13 +1175,13 @@ void ggml_vk_get_rows_q4_1(Args&&... args) {
 }
 
 template <typename... Args>
-void ggml_vk_get_rows_q6_k(Args&&... args) {
+static void ggml_vk_get_rows_q6_k(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
         kp::shader_data::op_getrows_q6_k_comp_spv_len);
     ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
 }
 
-void ggml_vk_rope(
+static void ggml_vk_rope(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -1249,7 +1249,7 @@ void ggml_vk_rope(
 }
 
 template<uint32_t in_element_size, uint32_t out_element_size>
-void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
+static void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
                  kp::Sequence& seq,
                  const std::shared_ptr<kp::Tensor>& in,
                  const std::shared_ptr<kp::Tensor>& out,
@@ -1289,28 +1289,28 @@ void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
 }
 
 template <typename... Args>
-void ggml_vk_cpy_f32_f16(Args&&... args) {
+static void ggml_vk_cpy_f32_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
         kp::shader_data::op_cpy_f32_f16_comp_spv_len);
     ggml_vk_cpy<4, 2>(spirv, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-void ggml_vk_cpy_f32_f32(Args&&... args) {
+static void ggml_vk_cpy_f32_f32(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
         kp::shader_data::op_cpy_f32_f32_comp_spv_len);
     ggml_vk_cpy<4, 4>(spirv, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-void ggml_vk_cpy_f16_f16(Args&&... args) {
+static void ggml_vk_cpy_f16_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
         kp::shader_data::op_cpy_f16_f16_comp_spv_len);
     ggml_vk_cpy<2, 2>(spirv, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-void ggml_vk_cpy_f16_f32(Args&&... args) {
+static void ggml_vk_cpy_f16_f32(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
         kp::shader_data::op_cpy_f16_f32_comp_spv_len);
     ggml_vk_cpy<2, 4>(spirv, std::forward<Args>(args)...);
@@ -1349,6 +1349,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 case GGML_OP_TRANSPOSE:
                 case GGML_OP_PERMUTE:
                     continue; // noop -> next node
+                default:
+                    break;
             }
 
             const int32_t ne00 = src0 ? src0->ne[0] : 0;

From 9af7f58b7beb2a66f8a199758a0cfe74989a45df Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 13 Dec 2023 17:54:35 -0500
Subject: [PATCH 073/140] move kompute to a submodule

---
 .gitmodules                                   |   3 +
 kompute                                       |   1 +
 kompute/.ccls                                 |  27 -
 kompute/.clang-format                         |   5 -
 kompute/.dockerignore                         |   4 -
 kompute/.github/workflows/cpp_examples.yml    |  58 --
 kompute/.github/workflows/cpp_tests.yml       | 104 ---
 kompute/.github/workflows/python_tests.yml    |  28 -
 kompute/CMakeLists.txt                        | 189 ----
 kompute/LICENSE                               | 203 -----
 kompute/Makefile                              | 210 -----
 kompute/README.md                             | 513 -----------
 kompute/cmake/bin2h.cmake                     | 106 ---
 kompute/cmake/bin_file_to_header.cmake        |  19 -
 kompute/cmake/check_vulkan_version.cmake      | 139 ---
 kompute/cmake/code_coverage.cmake             |  35 -
 kompute/cmake/deprecation_warnings.cmake      |  15 -
 kompute/cmake/komputeConfig.cmake.in          |   8 -
 kompute/cmake/vulkan_shader_compiler.cmake    |  43 -
 kompute/config/FindSphinx.cmake               |  16 -
 kompute/external/bin/xxd.c                    | 819 ------------------
 kompute/kompute-config.cmake                  |  28 -
 kompute/scripts/convert_shaders.py            | 149 ----
 kompute/scripts/requirements.txt              |  11 -
 kompute/setup.py                              |  93 --
 kompute/src/Algorithm.cpp                     | 418 ---------
 kompute/src/CMakeLists.txt                    |  86 --
 kompute/src/Core.cpp                          |  17 -
 kompute/src/Manager.cpp                       | 512 -----------
 kompute/src/OpAlgoDispatch.cpp                |  57 --
 kompute/src/OpBufferSyncDevice.cpp            |  43 -
 kompute/src/OpBufferSyncLocal.cpp             |  43 -
 kompute/src/OpMemoryBarrier.cpp               |  66 --
 kompute/src/OpTensorCopy.cpp                  |  82 --
 kompute/src/OpTensorFill.cpp                  |  47 -
 kompute/src/OpTensorSyncDevice.cpp            |  53 --
 kompute/src/OpTensorSyncLocal.cpp             |  68 --
 kompute/src/Sequence.cpp                      | 388 ---------
 kompute/src/Tensor.cpp                        | 450 ----------
 kompute/src/include/CMakeLists.txt            |  47 -
 kompute/src/include/kompute/Algorithm.hpp     | 330 -------
 kompute/src/include/kompute/Core.hpp          |  30 -
 kompute/src/include/kompute/Kompute.hpp       |  22 -
 kompute/src/include/kompute/Manager.hpp       | 284 ------
 kompute/src/include/kompute/Sequence.hpp      | 304 -------
 kompute/src/include/kompute/Tensor.hpp        | 302 -------
 kompute/src/include/kompute/logger/Logger.hpp | 197 -----
 .../kompute/operations/OpAlgoDispatch.hpp     |  86 --
 .../src/include/kompute/operations/OpBase.hpp |  62 --
 .../kompute/operations/OpBufferSyncDevice.hpp |  50 --
 .../kompute/operations/OpBufferSyncLocal.hpp  |  50 --
 .../kompute/operations/OpMemoryBarrier.hpp    |  81 --
 .../src/include/kompute/operations/OpMult.hpp |  58 --
 .../kompute/operations/OpTensorCopy.hpp       |  63 --
 .../kompute/operations/OpTensorFill.hpp       |  58 --
 .../kompute/operations/OpTensorSyncDevice.hpp |  66 --
 .../kompute/operations/OpTensorSyncLocal.hpp  |  66 --
 kompute/src/logger/CMakeLists.txt             |  69 --
 kompute/src/logger/Logger.cpp                 | 101 ---
 kompute/src/shaders/CMakeLists.txt            |   5 -
 kompute/src/shaders/glsl/CMakeLists.txt       |  26 -
 .../glsl/ShaderLogisticRegression.comp        |  52 --
 .../glsl/ShaderLogisticRegression.hpp.in      | 310 -------
 kompute/src/shaders/glsl/ShaderOpMult.comp    |  28 -
 kompute/src/shaders/glsl/ShaderOpMult.hpp.in  | 101 ---
 kompute/src/shaders/hlsl/computeheadless.comp |  29 -
 66 files changed, 4 insertions(+), 8029 deletions(-)
 create mode 160000 kompute
 delete mode 100644 kompute/.ccls
 delete mode 100644 kompute/.clang-format
 delete mode 100644 kompute/.dockerignore
 delete mode 100644 kompute/.github/workflows/cpp_examples.yml
 delete mode 100644 kompute/.github/workflows/cpp_tests.yml
 delete mode 100644 kompute/.github/workflows/python_tests.yml
 delete mode 100644 kompute/CMakeLists.txt
 delete mode 100644 kompute/LICENSE
 delete mode 100644 kompute/Makefile
 delete mode 100644 kompute/README.md
 delete mode 100644 kompute/cmake/bin2h.cmake
 delete mode 100644 kompute/cmake/bin_file_to_header.cmake
 delete mode 100644 kompute/cmake/check_vulkan_version.cmake
 delete mode 100644 kompute/cmake/code_coverage.cmake
 delete mode 100644 kompute/cmake/deprecation_warnings.cmake
 delete mode 100644 kompute/cmake/komputeConfig.cmake.in
 delete mode 100644 kompute/cmake/vulkan_shader_compiler.cmake
 delete mode 100644 kompute/config/FindSphinx.cmake
 delete mode 100644 kompute/external/bin/xxd.c
 delete mode 100644 kompute/kompute-config.cmake
 delete mode 100755 kompute/scripts/convert_shaders.py
 delete mode 100644 kompute/scripts/requirements.txt
 delete mode 100644 kompute/setup.py
 delete mode 100644 kompute/src/Algorithm.cpp
 delete mode 100644 kompute/src/CMakeLists.txt
 delete mode 100644 kompute/src/Core.cpp
 delete mode 100644 kompute/src/Manager.cpp
 delete mode 100644 kompute/src/OpAlgoDispatch.cpp
 delete mode 100644 kompute/src/OpBufferSyncDevice.cpp
 delete mode 100644 kompute/src/OpBufferSyncLocal.cpp
 delete mode 100644 kompute/src/OpMemoryBarrier.cpp
 delete mode 100644 kompute/src/OpTensorCopy.cpp
 delete mode 100644 kompute/src/OpTensorFill.cpp
 delete mode 100644 kompute/src/OpTensorSyncDevice.cpp
 delete mode 100644 kompute/src/OpTensorSyncLocal.cpp
 delete mode 100644 kompute/src/Sequence.cpp
 delete mode 100644 kompute/src/Tensor.cpp
 delete mode 100644 kompute/src/include/CMakeLists.txt
 delete mode 100644 kompute/src/include/kompute/Algorithm.hpp
 delete mode 100644 kompute/src/include/kompute/Core.hpp
 delete mode 100644 kompute/src/include/kompute/Kompute.hpp
 delete mode 100644 kompute/src/include/kompute/Manager.hpp
 delete mode 100644 kompute/src/include/kompute/Sequence.hpp
 delete mode 100644 kompute/src/include/kompute/Tensor.hpp
 delete mode 100644 kompute/src/include/kompute/logger/Logger.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpBase.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpMult.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpTensorCopy.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpTensorFill.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
 delete mode 100644 kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
 delete mode 100644 kompute/src/logger/CMakeLists.txt
 delete mode 100644 kompute/src/logger/Logger.cpp
 delete mode 100644 kompute/src/shaders/CMakeLists.txt
 delete mode 100644 kompute/src/shaders/glsl/CMakeLists.txt
 delete mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.comp
 delete mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
 delete mode 100644 kompute/src/shaders/glsl/ShaderOpMult.comp
 delete mode 100644 kompute/src/shaders/glsl/ShaderOpMult.hpp.in
 delete mode 100644 kompute/src/shaders/hlsl/computeheadless.comp

diff --git a/.gitmodules b/.gitmodules
index e69de29bb2d1d..b7e8b8ff2f64e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "kompute"]
+	path = kompute
+	url = https://github.com/nomic-ai/kompute.git
diff --git a/kompute b/kompute
new file mode 160000
index 0000000000000..2d0a8abc64e90
--- /dev/null
+++ b/kompute
@@ -0,0 +1 @@
+Subproject commit 2d0a8abc64e90a0956390aa3f1854cb6d48141db
diff --git a/kompute/.ccls b/kompute/.ccls
deleted file mode 100644
index 71d5d711e0797..0000000000000
--- a/kompute/.ccls
+++ /dev/null
@@ -1,27 +0,0 @@
-
-%clang
-
--fdeclspec
--fms-extensions
--Wall
--Wextra
--std=c++17
-
-%h -x
-%h c++-header
-
--DDEBUG=1
--DKOMPUTE_INCLUDE_FOR_SYNTAX
-
--I/usr/include/python3.6/
--I./python/pybind11/include/
-
--I./build/_deps/vulkan_header-src/include/
--I./build/_deps/spdlog-src/include/
--I./build/_deps/googletest-src/googletest/include/
--I./build/_deps/fmt-src/include/
-
--I./src/include/
--I./build/src/shaders/glsl/
--I./build/test/shaders/glsl/
--I./test/utils/
diff --git a/kompute/.clang-format b/kompute/.clang-format
deleted file mode 100644
index 5191313a38a18..0000000000000
--- a/kompute/.clang-format
+++ /dev/null
@@ -1,5 +0,0 @@
-﻿---
-BasedOnStyle: Mozilla
-IndentWidth: 4
-
-...
diff --git a/kompute/.dockerignore b/kompute/.dockerignore
deleted file mode 100644
index 9498d9195f7b2..0000000000000
--- a/kompute/.dockerignore
+++ /dev/null
@@ -1,4 +0,0 @@
-build/*
-examples/*
-docker-builders/
-swiftshader/
diff --git a/kompute/.github/workflows/cpp_examples.yml b/kompute/.github/workflows/cpp_examples.yml
deleted file mode 100644
index ad5306e9b29e9..0000000000000
--- a/kompute/.github/workflows/cpp_examples.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-name: C++ Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  array-multiplication-example:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/examples/array_multiplication/build
-        source-dir: ${{github.workspace}}/examples/array_multiplication
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
-        build-options: --parallel # Given we don't build too many resources we can leverage parallel
-    - name: Run tests
-      run: ./examples/array_multiplication/build/src/kompute_array_mult
-
-  logistc-regression-example:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/examples/logistic_regression/build
-        source-dir: ${{github.workspace}}/examples/logistic_regression
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
-        build-options: --parallel # Given we don't build too many resources we can leverage parallel
-    - name: Run tests
-      run: ./examples/logistic_regression/build/src/kompute_logistic_regression
diff --git a/kompute/.github/workflows/cpp_tests.yml b/kompute/.github/workflows/cpp_tests.yml
deleted file mode 100644
index 53a90a145d386..0000000000000
--- a/kompute/.github/workflows/cpp_tests.yml
+++ /dev/null
@@ -1,104 +0,0 @@
-name: C++ Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  cpp-tests-debug-with-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-
-  cpp-tests-release-with-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Release
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-
-  cpp-tests-debug-without-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-  
-  cpp-tests-release-without-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Release
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
diff --git a/kompute/.github/workflows/python_tests.yml b/kompute/.github/workflows/python_tests.yml
deleted file mode 100644
index 9f84d1e854178..0000000000000
--- a/kompute/.github/workflows/python_tests.yml
+++ /dev/null
@@ -1,28 +0,0 @@
-name: Python Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  python-tests:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: Install Python Requirements
-      run: pip3 install --user -r python/test/requirements-dev.txt
-    - name: Python Build
-      env:
-        KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2
-        KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON
-      run: pip3 install --user . -v
-    - name: Python run Tests
-      run: |
-        export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json
-        make test_python
diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt
deleted file mode 100644
index 1bd84d7ede7b8..0000000000000
--- a/kompute/CMakeLists.txt
+++ /dev/null
@@ -1,189 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-cmake_minimum_required(VERSION 3.20)
-project(kompute VERSION 0.8.1 LANGUAGES CXX)
-
-set(CMAKE_CXX_STANDARD 14)
-
-# Only change the folder behavior if kompute is not a subproject
-if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
-    set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-    set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake")
-    set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin)
-    set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
-endif()
-
-# Avoid the dll boilerplate code for windows
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")
-
-set(KOMPUTE_LIBRARIES kompute CACHE INTERNAL "")
-
-# ####################################################
-# Options
-# ####################################################
-macro(kompute_option OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
-    option(${OPTION_NAME} ${OPTION_TEXT} ${OPTION_DEFAULT})
-
-    if(DEFINED ENV{${OPTION_NAME}})
-        # Allow overriding the option through an environment variable
-        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
-    endif()
-
-    if(${OPTION_NAME})
-        add_definitions(-D${OPTION_NAME})
-    endif()
-
-    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
-endmacro()
-
-macro(kompute_log_level OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
-    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
-    set_property(CACHE ${OPTION_NAME} PROPERTY STRINGS "Trace" "Debug" "Info" "Warn" "Error" "Critical" "Default" "Off")
-
-    if(DEFINED ENV{${OPTION_NAME}})
-        # Allow setting the option through an environment variable
-        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
-    endif()
-
-    if(${OPTION_NAME})
-        add_definitions(-D${OPTION_NAME})
-    endif()
-
-    # Allow disabling logging completely and prevent linking against it:
-    if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off")
-        set(${OPTION_NAME}_DISABLED ON)
-        add_compile_definitions(${OPTION_NAME}_DISABLED=1)
-    endif()
-
-    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
-endmacro()
-
-macro(kompute_option_string OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
-    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
-
-    if(DEFINED ENV{${OPTION_NAME}})
-        # Allow setting the option through an environment variable
-        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
-    endif()
-
-    if(${OPTION_NAME})
-        add_definitions(-D${OPTION_NAME})
-    endif()
-
-    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
-endmacro()
-
-message(STATUS "General purpose GPU compute framework built on Vulkan")
-message(STATUS "=======================================================")
-
-# Build options
-kompute_log_level(KOMPUTE_OPT_LOG_LEVEL "Internally we use Spdlog or fmt for logging, depending on the value of 'KOMPUTE_OPT_USE_SPDLOG'. The log level used can be changed here. Possible values: 'Trace', 'Debug', 'Info', 'Warn', 'Error', 'Critical', 'Off', 'Default'. If set to 'Off' logging will be deactivated completely. If set to 'Default', the log level will be set to 'Info' for release builds and 'Debug' else." "Off")
-kompute_option(KOMPUTE_OPT_USE_SPDLOG "If enabled, logging via KP_LOG_<DEBUG, INFO, etc...> will happen through Spdlog instead of plan fmt." OFF)
-kompute_option(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS "Explicitly disable debug layers even on debug." ON)
-kompute_option(KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK "Whether to check if your driver supports the Vulkan Header version you are linking against. This might be useful in case you build shared on a different system than you run later." OFF)
-kompute_option(KOMPUTE_OPT_BUILD_SHADERS "Rebuilds all compute shaders during compilation and does not use the already precompiled versions. Requires glslangValidator to be installed on your system." OFF)
-
-# External components
-kompute_option(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG "Use the built-in version of Spdlog. Requires 'KOMPUTE_OPT_USE_SPDLOG' to be set to ON in order to have any effect." ON)
-kompute_option(KOMPUTE_OPT_SPDLOG_ASYNC_MODE "If spdlog is enabled this allows for selecting whether the default logger setup creates sync or async logger" OFF)
-kompute_option(KOMPUTE_OPT_USE_BUILT_IN_FMT "Use the built-in version of fmt." ON)
-kompute_option(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER "Use the built-in version of Vulkan Headers. This could be helpful in case your system Vulkan Headers are too new for your driver. If you set this to OFF, please make sure your system Vulkan Headers are supported by your driver." ON)
-kompute_option_string(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "The git tag used for the built-in Vulkan Headers when 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER' is enabled. A list of tags can be found here: https://github.com/KhronosGroup/Vulkan-Headers/tags" "v1.3.231")
-message(STATUS "=======================================================")
-
-# ####################################################
-# Deprecated Options
-# ####################################################
-include(cmake/deprecation_warnings.cmake)
-
-# ####################################################
-# Dependencies
-# ####################################################
-include(cmake/vulkan_shader_compiler.cmake)
-include(cmake/check_vulkan_version.cmake)
-include(FetchContent)
-
-# Vulkan Header
-if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
-    FetchContent_Declare(vulkan_header GIT_REPOSITORY https://github.com/KhronosGroup/Vulkan-Headers.git
-        GIT_TAG ${KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG}) # Source: https://github.com/KhronosGroup/Vulkan-Headers/tags
-    FetchContent_MakeAvailable(vulkan_header)
-
-    if(NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
-        # Ensure the driver supports this Vulkan version
-        check_vulkan_version(INCLUDE_DIR "${vulkan_header_SOURCE_DIR}/include")
-    endif()
-endif()
-
-find_package(Vulkan REQUIRED)
-
-if(Vulkan_FOUND AND NOT TARGET Vulkan::Headers)
-    add_library(Vulkan::Headers INTERFACE IMPORTED)
-    set_target_properties(Vulkan::Headers PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES "${Vulkan_INCLUDE_DIRS}")
-endif()
-
-if(NOT KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER AND NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
-    # Ensure the driver supports this Vulkan version
-    check_vulkan_version(INCLUDE_DIR ${Vulkan_INCLUDE_DIR})
-endif()
-
-# Spdlog
-if(KOMPUTE_OPT_USE_SPDLOG)
-    add_compile_definitions(KOMPUTE_OPT_USE_SPDLOG=1)
-
-    if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
-        if(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG)
-            set(SPDLOG_BUILD_SHARED ${BUILD_SHARED_LIBS})
-
-            FetchContent_Declare(spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git
-                GIT_TAG v1.10.0) # Source: https://github.com/gabime/spdlog/releases
-            FetchContent_MakeAvailable(spdlog)
-        else()
-            find_package(spdlog REQUIRED)
-        endif()
-    endif()
-endif()
-
-# fmt
-if(KOMPUTE_OPT_USE_BUILT_IN_FMT)
-    FetchContent_Declare(fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-        GIT_TAG 10.0.0) # Source: https://github.com/fmtlib/fmt/releases
-    FetchContent_MakeAvailable(fmt)
-else()
-    find_package(fmt REQUIRED)
-endif()
-
-add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
-
-# ####################################################
-# Preprocessor Macros
-# ####################################################
-if(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS)
-    add_compile_definitions(KOMPUTE_DISABLE_VK_DEBUG_LAYERS=1)
-endif()
-
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror -Wno-error=array-bounds")
-endif()
-
-# If glslang is cloned, then SPIRV/GlslangToSpv.h will be used instead of glslang/SPIRV/GlslangToSpv.h
-# As after installation, SPIRV/ header files will be found in glslang/SPIRV/ , more info in #193
-if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
-    add_definitions(-DUSE_EXTERNAL_GLSLANG)
-endif()
-
-# Allow scripts to call main kompute Makefile
-function(kompute_make KOMPUTE_MAKE_TARGET)
-    add_custom_target(${KOMPUTE_MAKE_TARGET}
-        COMMAND make -C ${PROJECT_SOURCE_DIR} ${KOMPUTE_MAKE_TARGET})
-endfunction()
-
-add_executable(xxd external/bin/xxd.c)
-
-add_subdirectory(src)
diff --git a/kompute/LICENSE b/kompute/LICENSE
deleted file mode 100644
index 821a2723e9a83..0000000000000
--- a/kompute/LICENSE
+++ /dev/null
@@ -1,203 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2021 The Institute for Ethical AI & Machine Learning
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
diff --git a/kompute/Makefile b/kompute/Makefile
deleted file mode 100644
index 62ad68b46ab11..0000000000000
--- a/kompute/Makefile
+++ /dev/null
@@ -1,210 +0,0 @@
-# This makefile is optimized to be run from WSL and to interact with the 
-# Windows host as there are limitations when building GPU programs. This
-# makefile contains the commands for interacting with the visual studio
-# build via command line for faster iterations, as the intention is to 
-# support other editors (optimised for vim). There are also commands that
-# support the builds for linux-native compilations and these are the commands
-# starting with mk_.
-
-VERSION := $(shell cat ./VERSION)
-
-VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsystems\\vcpkg.cmake"
-VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake"
-
-# These are the tests that don't work with swiftshader but can be run directly with vulkan
-FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps:TestPushConstants.TestConstantsDouble"
-
-ifeq ($(OS),Windows_NT)     # is Windows_NT on XP, 2000, 7, Vista, 10...
-	CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe"
-	SCMP_BIN="C:\\VulkanSDK\\1.2.141.2\\Bin32\\glslangValidator.exe"
-	MSBUILD_BIN ?= "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\MSBuild\\Current\\Bin\\MSBuild.exe"
-else
-	CLANG_FORMAT_BIN ?= "/home/alejandro/Programming/lib/clang+llvm-10.0.0-x86_64-linux-gnu-ubuntu-18.04/bin/clang-format"
-	CMAKE_BIN ?= "/c/Program Files/CMake/bin/cmake.exe"
-	MSBUILD_BIN ?= "/c/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
-	# Choosing the binary based on whether it's on WSL or linux-native
-	KERNEL := $(shell uname -r)
-	IS_WSL := $(shell (if [[ "$(KERNEL)" =~ Microsoft$  ]]; then echo '0'; fi))
-	ifeq ($(IS_WSL),0)
-		SCMP_BIN ?= "/c/VulkanSDK/1.2.141.2/Bin32/glslangValidator.exe"
-	else
-		SCMP_BIN ?= "/usr/bin/glslangValidator"
-	endif
-endif
-
-
-####### Main Target Rules #######
-
-push_docs_to_ghpages:
-	GIT_DEPLOY_DIR="build/docs/sphinx/" \
-		GIT_DEPLOY_BRANCH="gh-pages" \
-		GIT_DEPLOY_REPO="origin" \
-			./scripts/push_folder_to_branch.sh
-
-####### CMAKE quickstart commands #######
-
-clean_cmake:
-	rm -rf build/
-
-####### Visual studio build shortcut commands #######
-
-MK_BUILD_TYPE ?= "Release"
-MK_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
-MK_CMAKE_EXTRA_FLAGS ?= ""
-MK_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
-
-mk_cmake:
-	cmake \
-		-Bbuild \
-		-DCMAKE_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \
-		-DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \
-		-DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \
-		-DKOMPUTE_OPT_INSTALL=ON \
-		-DKOMPUTE_OPT_BUILD_TESTS=ON \
-		-DKOMPUTE_OPT_BUILD_DOCS=ON \
-		-DKOMPUTE_OPT_BUILD_SHADERS=ON \
-		-DKOMPUTE_OPT_CODE_COVERAGE=ON \
-		-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-		-DKOMPUTE_OPT_LOG_LEVEL=Debug \
-		$(MK_CMAKE_EXTRA_FLAGS) \
-		-G "Unix Makefiles"
-
-mk_build_all:
-	cmake --build build/. --parallel
-
-mk_build_docs:
-	cmake --build build/. --target gendocsall --parallel
-
-mk_build_kompute:
-	cmake --build build/. --target kompute --parallel
-
-mk_build_tests:
-	cmake --build build/. --target kompute_tests --parallel
-
-mk_run_docs: mk_build_docs
-	(cd build/docs/sphinx && python2.7 -m SimpleHTTPServer)
-
-# An alternative would be: ctest -vv --test-dir build/.
-# But this is not possible since we need to filter specific tests, not complete executables, which is not possible with ctest.
-# https://gitlab.kitware.com/cmake/cmake/-/issues/13168 
-mk_run_tests: mk_build_tests
-	./build/bin/kompute_tests --gtest_filter=$(FILTER_TESTS)
-
-mk_build_swiftshader_library:
-	git clone https://github.com/google/swiftshader || echo "Assuming already cloned"
-	# GCC 8 or above is required otherwise error on "filesystem" lib will appear
-	CC="/usr/bin/gcc-8" CXX="/usr/bin/g++-8" cmake swiftshader/. -Bswiftshader/build/
-	cmake --build swiftshader/build/. --parallel
-
-mk_run_tests_cpu: export VK_ICD_FILENAMES=$(PWD)/swiftshader/build/vk_swiftshader_icd.json
-mk_run_tests_cpu: mk_build_swiftshader_library mk_build_tests mk_run_tests_cpu_only
-
-
-####### Visual studio build shortcut commands #######
-
-VS_BUILD_TYPE ?= "Debug"
-# Run with multiprocessin / parallel build by default
-VS_CMAKE_EXTRA_FLAGS ?= ""
-VS_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
-VS_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
-
-vs_cmake:
-	$(CMAKE_BIN) \
-		-Bbuild \
-		$(VS_CMAKE_EXTRA_FLAGS) \
-		-DCMAKE_TOOLCHAIN_FILE=$(VCPKG_WIN_PATH) \
-		-DCMAKE_CXX_FLAGS=$(VS_KOMPUTE_EXTRA_CXX_FLAGS) \
-		-DCMAKE_INSTALL_PREFIX=$(VS_INSTALL_PATH) \
-		-DKOMPUTE_OPT_INSTALL=ON \
-		-DKOMPUTE_OPT_BUILD_TESTS=ON \
-		-DKOMPUTE_OPT_BUILD_SHADERS=ON \
-		-DKOMPUTE_OPT_CODE_COVERAGE=OFF \
-		-DKOMPUTE_OPT_BUILD_DOCS=OFF \
-		-G "Visual Studio 16 2019" \
-		-DCMAKE_BUILD_TYPE=$(VS_BUILD_TYPE)
-
-vs_build_all:
-	cmake --build build/. --parallel
-
-vs_build_docs:
-	cmake --build build/. --target gendocsall --parallel
-
-vs_install_kompute:
-	cmake --build build/. --target install --parallel
-
-vs_build_kompute:
-	cmake --build build/. --target kompute --parallel
-
-vs_build_tests:
-	cmake --build build/. --target kompute_tests --parallel
-
-vs_run_docs: vs_build_docs
-	(cd build/docs/sphinx && python2.7 -m SimpleHTTPServer)
-
-vs_run_tests: vs_build_tests
-	./build/test/$(VS_BUILD_TYPE)/bin/kompute_tests.exe --gtest_filter=$(FILTER_TESTS)
-
-
-#### PYTHONG ####
-
-test_python:
-	python3 -m pytest -s --log-cli-level=DEBUG -v python/test/
-
-####### Run CI Commands #######
-
-# This command uses act to replicate github action
-# https://github.com/nektos/act
-run_ci:
-	act
-
-####### General project commands #######
-
-generate_python_docstrings:
-	python -m pybind11_mkdoc \
-		-o python/src/docstrings.hpp \
-		kompute/Kompute.hpp \
-		-Iexternal/fmt/include/ \
-		-Iexternal/spdlog/include/ \
-		-Iexternal/glslang/ \
-		-I/usr/include/c++/7.5.0/
-
-install_python_reqs:
-	python3 -m pip install -r scripts/requirements.txt
-
-install_lcov:
-	sudo apt install lcov -y
-
-build_shaders:
-	python3 scripts/convert_shaders.py \
-		--shader-path shaders/glsl \
-		--shader-binary $(SCMP_BIN) \
-		--header-path src/include/kompute/shaders/ \
-		-v
-	python3 scripts/convert_shaders.py \
-		--shader-path test/shaders/glsl \
-		--shader-binary $(SCMP_BIN) \
-		--header-path test/compiled_shaders_include/kompute_test/shaders/ \
-		-v
-
-build_single_header:
-	quom \
-		--include_directory \
-		"src/include/" \
-		"single_include/AggregateHeaders.cpp" \
-		"single_include/kompute/Kompute.hpp"
-
-win_build_xxd:
-	cd external/bin/ && gcc.exe -o xxd.exe xxd.c -DCYGWIN
-
-format:
-	for val in "examples single_include src test" ; do \
-    	find $$val -depth -iname *.h -or -iname *.c -or -iname *.hpp -or -iname *.cpp | grep -v "shaders" | xargs $(CLANG_FORMAT_BIN) -style=file -i; \
-	done
-
-static_scan:
-	cppcheck --project=build/compile_commands.json -iexternal/
-
-build_changelog:
-	docker run --rm -it -v "$(PWD)":/usr/local/src/your-app -e CHANGELOG_GITHUB_TOKEN=${CHANGELOG_GITHUB_TOKEN} ferrarimarco/github-changelog-generator:1.15.2 -u KomputeProject -p kompute
-	chmod 664 CHANGELOG.md # (Read+Write, Read+Write, Read)
-	sed -i -e 's/\(HEAD\|Unreleased\)/v${VERSION}/g' CHANGELOG.md # Replacing unreleased version with latest tag
diff --git a/kompute/README.md b/kompute/README.md
deleted file mode 100644
index b169da254bcd8..0000000000000
--- a/kompute/README.md
+++ /dev/null
@@ -1,513 +0,0 @@
-
-![GitHub](https://img.shields.io/badge/Version-0.7.0-green.svg)
-![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
-![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
-![GitHub](https://img.shields.io/badge/Python-3.7—3.9-blue.svg)
-![GitHub](https://img.shields.io/badge/License-Apache-black.svg)
-[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/4834/badge)](https://bestpractices.coreinfrastructure.org/projects/4834)
-
-<table>
-<tr>
-
-<td width="20%">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute.jpg">
-</td>
-
-<td>
-
-<h1>Kompute</h1>
-<h3>The general purpose GPU compute framework for cross vendor graphics cards (AMD, Qualcomm, NVIDIA & friends)</h3>
-
-</td>
-
-</tr>
-</table>
-
-<h4>Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU acceleration usecases.</h4>
-
-💬 [Join the Discord & Community Calls](https://kompute.cc/overview/community.html) 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾
-
-<hr>
-
-##### Kompute is backed by the Linux Foundation as a <a href="https://lfaidata.foundation/blog/2021/08/26/kompute-joins-lf-ai-data-as-new-sandbox-project/">hosted project</a> by the LF AI & Data Foundation.
-
-<table>
-<tr>
-<td>
-<a href="https://www.linuxfoundation.org/projects/">
-<img src="https://upload.wikimedia.org/wikipedia/commons/b/b5/Linux_Foundation_logo.png">
-</a>
-</td>
-<td>
-<a href="https://lfaidata.foundation/projects/">
-<img src="https://raw.githubusercontent.com/lfai/artwork/main/lfaidata-assets/lfaidata/horizontal/color/lfaidata-horizontal-color.png">
-</a>
-</td>
-</tr>
-</table>
-
-
-## Principles & Features
-
-* [Flexible Python module](#your-first-kompute-python) with [C++ SDK](#your-first-kompute-c) for optimizations
-* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
-* [Mobile enabled](#mobile-enabled) with examples via Android NDK across several architectures
-* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
-* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
-* Robust codebase with [90% unit test code coverage](https://kompute.cc/codecov/)
-* Advanced use-cases on [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) and [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0).
-* Active community with [monthly calls, discord chat and more](https://kompute.cc/overview/community.html)
-
-![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/komputer-logos.gif)
-
-## Getting Started
-
-Below you can find a GPU multiplication example using the C++ and Python Kompute interfaces.
-
-You can [join the Discord](https://discord.gg/MaH5Jv5zwv) for questions / discussion, open a [github issue](https://github.com/KomputeProject/kompute/issues/new), or read [the documentation](https://kompute.cc/).
-
-### Your First Kompute (C++)
-
-The C++ interface provides low level access to the native components of Kompute, enabling for [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html).
-
-```c++
-
-void kompute(const std::string& shader) {
-
-    // 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
-    kp::Manager mgr; 
-
-    // 2. Create and initialise Kompute Tensors through manager
-
-    // Default tensor constructor simplifies creation of float values
-    auto tensorInA = mgr.tensor({ 2., 2., 2. });
-    auto tensorInB = mgr.tensor({ 1., 2., 3. });
-    // Explicit type constructor supports uint32, int32, double, float and bool
-    auto tensorOutA = mgr.tensorT<uint32_t>({ 0, 0, 0 });
-    auto tensorOutB = mgr.tensorT<uint32_t>({ 0, 0, 0 });
-
-    std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
-
-    // 3. Create algorithm based on shader (supports buffers & push/spec constants)
-    kp::Workgroup workgroup({3, 1, 1});
-    std::vector<float> specConsts({ 2 });
-    std::vector<float> pushConstsA({ 2.0 });
-    std::vector<float> pushConstsB({ 3.0 });
-
-    auto algorithm = mgr.algorithm(params,
-                                   // See documentation shader section for compileSource
-                                   compileSource(shader),
-                                   workgroup,
-                                   specConsts,
-                                   pushConstsA);
-
-    // 4. Run operation synchronously using sequence
-    mgr.sequence()
-        ->record<kp::OpTensorSyncDevice>(params)
-        ->record<kp::OpAlgoDispatch>(algorithm) // Binds default push consts
-        ->eval() // Evaluates the two recorded operations
-        ->record<kp::OpAlgoDispatch>(algorithm, pushConstsB) // Overrides push consts
-        ->eval(); // Evaluates only last recorded operation
-
-    // 5. Sync results from the GPU asynchronously
-    auto sq = mgr.sequence();
-    sq->evalAsync<kp::OpTensorSyncLocal>(params);
-
-    // ... Do other work asynchronously whilst GPU finishes
-
-    sq->evalAwait();
-
-    // Prints the first output which is: { 4, 8, 12 }
-    for (const float& elem : tensorOutA->vector()) std::cout << elem << "  ";
-    // Prints the second output which is: { 10, 10, 10 }
-    for (const float& elem : tensorOutB->vector()) std::cout << elem << "  ";
-
-} // Manages / releases all CPU and GPU memory resources
-
-int main() {
-
-    // Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
-    // files). This shader shows some of the main components including constants, buffers, etc
-    std::string shader = (R"(
-        #version 450
-
-        layout (local_size_x = 1) in;
-
-        // The input tensors bind index is relative to index in parameter passed
-        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
-        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
-        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
-        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
-
-        // Kompute supports push constants updated on dispatch
-        layout(push_constant) uniform PushConstants {
-            float val;
-        } push_const;
-
-        // Kompute also supports spec constants on initalization
-        layout(constant_id = 0) const float const_one = 0;
-
-        void main() {
-            uint index = gl_GlobalInvocationID.x;
-            out_a[index] += uint( in_a[index] * in_b[index] );
-            out_b[index] += uint( const_one * push_const.val );
-        }
-    )");
-
-    // Run the function declared above with our raw string shader
-    kompute(shader);
-}
-
-```
-
-### Your First Kompute (Python)
-
-The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows.
-
-```python
-
-from .utils import compile_source # using util function from python/test/utils
-
-def kompute(shader):
-    # 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
-    mgr = kp.Manager()
-
-    # 2. Create and initialise Kompute Tensors through manager
-
-    # Default tensor constructor simplifies creation of float values
-    tensor_in_a = mgr.tensor([2, 2, 2])
-    tensor_in_b = mgr.tensor([1, 2, 3])
-    # Explicit type constructor supports uint32, int32, double, float and bool
-    tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
-    tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
-
-    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
-
-    # 3. Create algorithm based on shader (supports buffers & push/spec constants)
-    workgroup = (3, 1, 1)
-    spec_consts = [2]
-    push_consts_a = [2]
-    push_consts_b = [3]
-
-    # See documentation shader section for compile_source
-    spirv = compile_source(shader)
-
-    algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a)
-
-    # 4. Run operation synchronously using sequence
-    (mgr.sequence()
-        .record(kp.OpTensorSyncDevice(params))
-        .record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided
-        .eval() # evaluates the two recorded ops
-        .record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts
-        .eval()) # evaluates only the last recorded op
-
-    # 5. Sync results from the GPU asynchronously
-    sq = mgr.sequence()
-    sq.eval_async(kp.OpTensorSyncLocal(params))
-
-    # ... Do other work asynchronously whilst GPU finishes
-
-    sq.eval_await()
-
-    # Prints the first output which is: { 4, 8, 12 }
-    print(tensor_out_a)
-    # Prints the first output which is: { 10, 10, 10 }
-    print(tensor_out_b)
-
-if __name__ == "__main__":
-
-    # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
-    # files). This shader shows some of the main components including constants, buffers, etc
-    shader = """
-        #version 450
-
-        layout (local_size_x = 1) in;
-
-        // The input tensors bind index is relative to index in parameter passed
-        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
-        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
-        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
-        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
-
-        // Kompute supports push constants updated on dispatch
-        layout(push_constant) uniform PushConstants {
-            float val;
-        } push_const;
-
-        // Kompute also supports spec constants on initalization
-        layout(constant_id = 0) const float const_one = 0;
-
-        void main() {
-            uint index = gl_GlobalInvocationID.x;
-            out_a[index] += uint( in_a[index] * in_b[index] );
-            out_b[index] += uint( const_one * push_const.val );
-        }
-    """
-
-    kompute(shader)
-
-```
-
-### Interactive Notebooks & Hands on Videos
-
-You are able to try out the interactive Colab Notebooks which allow you to use a free GPU. The available examples are the Python and C++ examples below:
-
-<table>
-<tr>
-
-<td width="50%">
-<h5>Try the interactive <a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?usp=sharing">C++ Colab</a> from <a href="https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a">Blog Post</a></h5>
-</td>
-
-<td>
-<h5>Try the interactive <a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">Python Colab</a> from <a href="https://towardsdatascience.com/beyond-cuda-gpu-accelerated-python-for-machine-learning-in-cross-vendor-graphics-cards-made-simple-6cc828a45cc3">Blog Post</a></h5>
-</td>
-
-</tr>
-<tr>
-
-<td width="50%">
-<a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?authuser=1#scrollTo=1BipBsO-fQRD">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-cpp.jpg">
-</a>
-</td>
-
-<td>
-<a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-python.jpg">
-</a>
-</td>
-
-</tr>
-</table>
-
-
-You can also check out the two following talks presented at the FOSDEM 2021 conference. 
-
-Both videos have timestamps which will allow you to skip to the most relevant section for you - the intro & motivations for both is almost the same so you can skip to the more specific content.
-
-<table>
-<tr>
-
-<td width="50%">
-<h5>Watch the video for <a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">C++ Enthusiasts</a> </h5>
-</td>
-
-<td>
-<h5>Watch the video for <a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">Python & Machine Learning</a> Enthusiasts</h5>
-</td>
-
-</tr>
-<tr>
-
-<td width="50%">
-<a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-cpp-video.png">
-</a>
-</td>
-
-<td>
-<a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-python-video.png">
-</a>
-</td>
-
-</tr>
-</table>
-
-
-## Architectural Overview
-
-The core architecture of Kompute includes the following:
-* [Kompute Manager](https://kompute.cc/overview/reference.html#manager) - Base orchestrator which creates and manages device and child components
-* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
-* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
-* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
-* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed in the GPU
-
-To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).
-
-<table>
-<th>
-Full Architecture
-</th>
-<th>
-Simplified Kompute Components
-</th>
-<tr>
-<td width=30%>
-
-
-<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-vulkan-architecture.jpg">
-
-<br>
-<br>
-(very tiny, check the <a href="https://ethicalml.github.io/vulkan-kompute/overview/reference.html">full reference diagram in docs for details</a>)
-<br>
-<br>
-
-<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/suspicious.jfif">
-
-</td>
-<td>
-<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-architecture.jpg">
-</td>
-</tr>
-</table>
-
-
-## Asynchronous and Parallel Operations
-
-Kompute provides flexibility to run operations in an asynrchonous way through vk::Fences. Furthermore, Kompute enables for explicit allocation of queues, which allow for parallel execution of operations across queue families.
-
-The image below provides an intuition on how Kompute Sequences can be allocated to different queues to enable parallel execution based on hardware. You can see the [hands on example](https://kompute.cc/overview/advanced-examples.html#parallel-operations), as well as the [detailed documentation page](https://kompute.cc/overview/async-parallel.html) describing how it would work using an NVIDIA 1650 as an example. 
-
-![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/queue-allocation.jpg)
-
-## Mobile Enabled
-
-Kompute has been optimized to work in mobile environments. The [build system](#build-overview) enables for dynamic loading of the Vulkan shared library for Android environments, together with a working [Android NDK wrapper](https://github.com/KomputeProject/kompute/tree/master/vk_ndk_wrapper_include) for the CPP headers.
-
-<table>
-<tr>
-
-<td width="70%">
-<p>
-For a full deep dive you can read the blog post "<a href="https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617">Supercharging your Mobile Apps with On-Device GPU Accelerated Machine Learning</a>". 
-
-You can also access the <a href="https://github.com/KomputeProject/kompute/tree/v0.4.0/examples/android/android-simple">end-to-end example code</a> in the repository, which can be run using android studio.
-
-</p>
-
-
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-editor.jpg">
-
-</td>
-
-
-<td width="30%">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-kompute.jpg">
-</td>
-
-</tr>
-</table>
-
-## More examples
-
-### Simple examples
-
-* [Simple multiplication example](https://kompute.cc/overview/advanced-examples.html#simple-shader-example)
-* [Record batch commands with a Kompute Sequence](https://kompute.cc/overview/advanced-examples.html#record-batch-commands)
-* [Run Asynchronous Operations](https://kompute.cc/overview/advanced-examples.html#asynchronous-operations)
-* [Run Parallel Operations Across Multiple GPU Queues](https://kompute.cc/overview/advanced-examples.html#parallel-operations)
-* [Create your custom Kompute Operations](https://kompute.cc/overview/advanced-examples.html#your-custom-kompute-operation)
-* [Implementing logistic regression from scratch](https://kompute.cc/overview/advanced-examples.html#logistic-regression-example)
-
-### End-to-end examples
-
-* [Machine Learning Logistic Regression Implementation](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a)
-* [Parallelizing GPU-intensive Workloads via Multi-Queue Operations](https://towardsdatascience.com/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc)
-* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
-* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
-
-## Python Package
-
-Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc.
-
-The only dependencies are Python 3.5+ and Cmake 3.4.1+. You can install Kompute from the [Python pypi package](https://pypi.org/project/kp/) using the following command.
-
-```
-pip install kp
-```
-
-You can also install from master branch using:
-
-```
-pip install git+git://github.com/KomputeProject/kompute.git@master
-```
-
-For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
-
-## C++ Build Overview
-
-The build system provided uses `cmake`, which allows for cross platform builds.
-
-The top level `Makefile` provides a set of optimized configurations for development as well as the docker image build, but you can start a build with the following command:
-
-```
-   cmake -Bbuild
-```
-
-You also are able to add Kompute in your repo with `add_subdirectory` - the [Android example CMakeLists.txt file](https://github.com/KomputeProject/kompute/blob/7c8c0eeba2cdc098349fcd999102bb2cca1bf711/examples/android/android-simple/app/src/main/cpp/CMakeLists.txt#L3) shows how this would be done.
-
-For a more advanced overview of the build configuration check out the [Build System Deep Dive](https://kompute.cc/overview/build-system.html) documentation.
-
-## Kompute Development
-
-We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Kompute and reporting issues is a great contribution!
-
-### Contributing
-
-#### Dev Dependencies
-
-* Testing
-    + GTest
-* Documentation
-    + Doxygen (with Dot)
-    + Sphynx
-
-#### Development
-
-* Follows Mozilla C++ Style Guide https://www-archive.mozilla.org/hacking/mozilla-style-guide.html
-    + Uses post-commit hook to run the linter, you can set it up so it runs the linter before commit
-    + All dependencies are defined in vcpkg.json 
-* Uses cmake as build system, and provides a top level makefile with recommended command
-* Uses xxd (or xxd.exe windows 64bit port) to convert shader spirv to header files
-* Uses doxygen and sphinx for documentation and autodocs
-* Uses vcpkg for finding the dependencies, it's the recommended set up to retrieve the libraries
-
-If you want to run with debug layers you can add them with the `KOMPUTE_ENV_DEBUG_LAYERS` parameter as:
-
-```
-export KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_LUNARG_api_dump"
-```
-
-##### Updating documentation
-
-To update the documentation you will need to:
-* Run the gendoxygen target in the build system
-* Run the gensphynx target in the build-system 
-* Push to github pages with `make push_docs_to_ghpages`
-
-##### Running tests
-
-Running the unit tests has been significantly simplified for contributors.
-
-The tests run on CPU, and can be triggered using the ACT command line interface (https://github.com/nektos/act) - once you install the command line (And start the Docker daemon) you just have to type:
-
-```
-$ act
-
-[Python Tests/python-tests] 🚀  Start image=axsauze/kompute-builder:0.2
-[C++ Tests/cpp-tests      ] 🚀  Start image=axsauze/kompute-builder:0.2
-[C++ Tests/cpp-tests      ]   🐳  docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
-[Python Tests/python-tests]   🐳  docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
-...
-```
-
-The repository contains unit tests for the C++ and Python code, and can be found under the `test/` and `python/test` folder.
-
-The tests are currently run through the CI using Github Actions. It uses the images found in `docker-builders/`.
-
-In order to minimise hardware requirements the tests can run without a GPU, directly in the CPU using [Swiftshader](https://github.com/google/swiftshader).
-
-For more information on how the CI and tests are setup, you can go to the [CI, Docker and Tests Section](https://kompute.cc/overview/ci-tests.html) in the documentation.
-
-## Motivations
-
-This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
-
-The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of the Vulkan SDK. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
-
-We are currently developing Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on the Vulkan SDK's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Kompute architecture.
diff --git a/kompute/cmake/bin2h.cmake b/kompute/cmake/bin2h.cmake
deleted file mode 100644
index 21ad56cb11cd5..0000000000000
--- a/kompute/cmake/bin2h.cmake
+++ /dev/null
@@ -1,106 +0,0 @@
-##################################################################################
-# Based on: https://github.com/sivachandran/cmake-bin2h
-#
-# Copyright 2020 Sivachandran Paramasivam
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-##################################################################################
-
-include(CMakeParseArguments)
-
-# Function to wrap a given string into multiple lines at the given column position.
-# Parameters:
-#   VARIABLE    - The name of the CMake variable holding the string.
-#   AT_COLUMN   - The column position at which string will be wrapped.
-function(WRAP_STRING)
-    set(oneValueArgs VARIABLE AT_COLUMN)
-    cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})
-
-    string(LENGTH ${${WRAP_STRING_VARIABLE}} stringLength)
-    math(EXPR offset "0")
-
-    while(stringLength GREATER 0)
-
-        if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
-            math(EXPR length "${WRAP_STRING_AT_COLUMN}")
-        else()
-            math(EXPR length "${stringLength}")
-        endif()
-
-        string(SUBSTRING ${${WRAP_STRING_VARIABLE}} ${offset} ${length} line)
-        set(lines "${lines}\n${line}")
-
-        math(EXPR stringLength "${stringLength} - ${length}")
-        math(EXPR offset "${offset} + ${length}")
-    endwhile()
-
-    set(${WRAP_STRING_VARIABLE} "${lines}" PARENT_SCOPE)
-endfunction()
-
-# Function to embed contents of a file as byte array in C/C++ header file(.h). The header file
-# will contain a byte array and integer variable holding the size of the array.
-# Parameters
-#   SOURCE_FILE      - The path of source file whose contents will be embedded in the header file.
-#   VARIABLE_NAME    - The name of the variable for the byte array. The string "_SIZE" will be append
-#                      to this name and will be used a variable name for size variable.
-#   HEADER_FILE      - The path of header file.
-#   APPEND           - If specified appends to the header file instead of overwriting it
-#   NULL_TERMINATE   - If specified a null byte(zero) will be append to the byte array. This will be
-#                      useful if the source file is a text file and we want to use the file contents
-#                      as string. But the size variable holds size of the byte array without this
-#                      null byte.
-#   HEADER_NAMESPACE - The namespace, where the array should be located in.
-#   IS_BIG_ENDIAN    - If set to true, will not revers the byte order for the uint32_t to match the
-#                      big endian system architecture
-# Usage:
-#   bin2h(SOURCE_FILE "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
-function(BIN2H)
-    set(options APPEND NULL_TERMINATE)
-    set(oneValueArgs SOURCE_FILE VARIABLE_NAME HEADER_FILE)
-    cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}" "" ${ARGN})
-
-    # reads source file contents as hex string
-    file(READ ${BIN2H_SOURCE_FILE} hexString HEX)
-    string(LENGTH ${hexString} hexStringLength)
-
-    # appends null byte if asked
-    if(BIN2H_NULL_TERMINATE)
-        set(hexString "${hexString}00")
-    endif()
-
-    # wraps the hex string into multiple lines at column 32(i.e. 16 bytes per line)
-    wrap_string(VARIABLE hexString AT_COLUMN 32)
-    math(EXPR arraySize "${hexStringLength} / 8")
-
-    # adds '0x' prefix and comma suffix before and after every byte respectively
-    if(IS_BIG_ENDIAN)
-        message(STATUS "Interpreting shader in big endian...")
-        string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\1\\2\\3\\4, " arrayValues ${hexString})
-    else()
-        message(STATUS "Interpreting shader in little endian...")
-        string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\4\\3\\2\\1, " arrayValues ${hexString})
-    endif()
-    # removes trailing comma
-    string(REGEX REPLACE ", $" "" arrayValues ${arrayValues})
-
-    # converts the variable name into proper C identifier
-    string(MAKE_C_IDENTIFIER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
-    string(TOUPPER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
-
-    # declares byte array and the length variables
-    set(namespaceStart "namespace ${HEADER_NAMESPACE} {")
-    set(namespaceEnd "} // namespace ${HEADER_NAMESPACE}")
-    set(arrayIncludes "#pragma once\n#include <array>\n#include <cstdint>")
-    set(arrayDefinition "const std::array<uint32_t, ${arraySize}> ${BIN2H_VARIABLE_NAME} = { ${arrayValues} };")
-
-    set(declarations "${arrayIncludes}\n\n${namespaceStart}\n${arrayDefinition}\n${namespaceEnd}\n\n")
-    if(BIN2H_APPEND)
-        file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
-    else()
-        file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
-    endif()
-endfunction()
\ No newline at end of file
diff --git a/kompute/cmake/bin_file_to_header.cmake b/kompute/cmake/bin_file_to_header.cmake
deleted file mode 100644
index b47b3613939e9..0000000000000
--- a/kompute/cmake/bin_file_to_header.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-if(${INPUT_SHADER_FILE} STREQUAL "")
-    message(FATAL_ERROR "No input file path provided via 'INPUT_SHADER_FILE'.")
-endif()
-
-if(${OUTPUT_HEADER_FILE} STREQUAL "")
-    message(FATAL_ERROR "No output file path provided via 'OUTPUT_HEADER_FILE'.")
-endif()
-
-if(${HEADER_NAMESPACE} STREQUAL "")
-    message(FATAL_ERROR "No header namespace provided via 'HEADER_NAMESPACE'.")
-endif()
-
-include(bin2h.cmake)
-
-get_filename_component(BINARY_FILE_CONTENT ${INPUT_SHADER_FILE} NAME)
-bin2h(SOURCE_FILE ${INPUT_SHADER_FILE} HEADER_FILE ${OUTPUT_HEADER_FILE} VARIABLE_NAME ${BINARY_FILE_CONTENT} HEADER_NAMESPACE ${HEADER_NAMESPACE})
-file(APPEND ${OUTPUT_HEADER_FILE} "\n")
\ No newline at end of file
diff --git a/kompute/cmake/check_vulkan_version.cmake b/kompute/cmake/check_vulkan_version.cmake
deleted file mode 100644
index 0372d32060d8b..0000000000000
--- a/kompute/cmake/check_vulkan_version.cmake
+++ /dev/null
@@ -1,139 +0,0 @@
-# Current issue: Only checks the result of GPU0
-function(check_vulkan_version)
-    cmake_parse_arguments(VULKAN_CHECK_VERSION "" "INCLUDE_DIR" "" ${ARGN})
-    message(STATUS "Ensuring the currently installed driver supports the Vulkan version requested by the Vulkan Header.")
-
-    # Get the current Vulkan Header version (e.g. 1.2.189).
-    # This snippet is based on: https://gitlab.kitware.com/cmake/cmake/-/blob/v3.23.1/Modules/FindVulkan.cmake#L140-156
-    if(VULKAN_CHECK_VERSION_INCLUDE_DIR)
-        set(VULKAN_CORE_H ${VULKAN_CHECK_VERSION_INCLUDE_DIR}/vulkan/vulkan_core.h)
-        if(EXISTS ${VULKAN_CORE_H})
-            file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE REGEX "^#define VK_HEADER_VERSION ")
-            string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION "${VULKAN_HEADER_VERSION_LINE}")
-            file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE2 REGEX "^#define VK_HEADER_VERSION_COMPLETE ")
-            if(NOT ${VULKAN_HEADER_VERSION_LINE2} STREQUAL "")
-                string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION2 "${VULKAN_HEADER_VERSION_LINE2}")
-                list(LENGTH VULKAN_HEADER_VERSION2 _len)
-                # Versions >= 1.2.175 have an additional numbers in front of e.g. '0, 1, 2' instead of '1, 2'
-                if(_len EQUAL 3)
-                    list(REMOVE_AT VULKAN_HEADER_VERSION2 0)
-                endif()
-                list(APPEND VULKAN_HEADER_VERSION2 ${VULKAN_HEADER_VERSION})
-                list(JOIN VULKAN_HEADER_VERSION2 "." VULKAN_HEADER_VERSION)
-            else()
-                file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_2 REGEX "^#define VK_API_VERSION_1_2.*")
-                if(NOT ${VULKAN_HEADER_API_VERSION_1_2} STREQUAL "")
-                    set(VULKAN_HEADER_VERSION "1.2.${VULKAN_HEADER_VERSION}")
-                else()
-                    file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_1 REGEX "^#define VK_API_VERSION_1_1.*")
-                    if(NOT ${VULKAN_HEADER_API_VERSION_1_1} STREQUAL "")
-                        set(VULKAN_HEADER_VERSION "1.1.${VULKAN_HEADER_VERSION}")
-                    else()
-                        message(FATAL_ERROR "'${VULKAN_CORE_H}' does not contain a supported Vulkan version. Probably because its < 1.2.0.")
-                    endif()
-                endif()
-            endif()
-        else()
-            message(FATAL_ERROR "'${VULKAN_CORE_H}' does not exist. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
-            return()
-        endif()
-    else()
-        message(FATAL_ERROR "Invalid Vulkan include directory given. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
-        return()
-    endif()
-    message(STATUS "Found Vulkan Header version: ${VULKAN_HEADER_VERSION}")
-
-    # Get Vulkan version supported by driver
-    find_program(VULKAN_INFO_PATH NAMES vulkaninfo)
-    if(VULKAN_INFO_PATH STREQUAL "VULKAN_INFO_PATH-NOTFOUND")
-        message(FATAL_ERROR "vulkaninfo not found. The Vulkan SDK might not be installed properly. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
-        return()
-    endif()
-
-    execute_process(COMMAND "vulkaninfo"
-                    OUTPUT_VARIABLE VULKAN_INFO_OUTPUT
-                    RESULT_VARIABLE VULKAN_INFO_RETURN)
-    if(NOT ${VULKAN_INFO_RETURN} EQUAL 0)
-        message(FATAL_ERROR "Running vulkaninfo failed with return code ${VULKAN_INFO_RETURN}. Make sure you have 'vulkan-tools' installed. Result:\n${VULKAN_INFO_OUTPUT}?")
-        return()
-    else()
-        message(STATUS "Running vulkaninfo was successful. Parsing the output...")
-    endif()
-
-    # Check if running vulkaninfo was successfully
-    string(FIND "${VULKAN_INFO_OUTPUT}" "Vulkan Instance Version" VULKAN_INFO_SUCCESSFUL)
-    if(VULKAN_INFO_SUCCESSFUL LESS 0)
-        message(FATAL_ERROR "Running vulkaninfo failed. Make sure you have 'vulkan-tools' installed and DISPLAY is configured. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON). Result:\n${VULKAN_INFO_OUTPUT}?")
-    endif()
-
-    string(REGEX MATCHALL "(GPU[0-9]+)" GPU_IDS "${VULKAN_INFO_OUTPUT}")
-    if(NOT GPU_IDS)
-        message(FATAL_ERROR "No GPU supporting Vulkan found in vulkaninfo. Does your GPU (driver) support Vulkan?")
-    endif()
-
-    string(REGEX MATCHALL "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*" GPU_API_VERSIONS ${VULKAN_INFO_OUTPUT})
-    if(NOT GPU_API_VERSIONS)
-        message(FATAL_ERROR "No valid Vulkan API version found in vulkaninfo. Does your GPU (driver) support Vulkan?")
-    endif()
-
-    # Check length
-    # message(FATAL_ERROR "GPUS: ${GPU_IDS}")
-    list(LENGTH GPU_IDS GPU_IDS_LENGTH)
-    list(LENGTH GPU_API_VERSIONS GPU_API_VERSIONS_LENGTH)
-    if(NOT ${GPU_IDS_LENGTH} EQUAL ${GPU_API_VERSIONS_LENGTH})
-        message(FATAL_ERROR "Found ${GPU_IDS_LENGTH} GPUs, but ${GPU_API_VERSIONS_LENGTH} API versions in vulkaninfo. We expected to find an equal amount of them.")
-    endif()
-
-    # Compare versions
-    set(VALID_GPU "")
-    set(VALID_VULKAN_VERSION "")
-    math(EXPR ITER_LEN "${GPU_IDS_LENGTH} - 1")
-    foreach(INDEX RANGE ${ITER_LEN})
-        list(GET GPU_IDS ${INDEX} GPU)
-        list(GET GPU_API_VERSIONS ${INDEX} API_VERSION)
-
-        # Extract API version
-        if(${API_VERSION} MATCHES "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*")
-            set(VULKAN_DRIVER_VERSION ${CMAKE_MATCH_1})
-        else()
-            message(FATAL_ERROR "API version match failed. This should not have happened...")
-        endif()
-
-        message(STATUS "${GPU} supports Vulkan API version '${VULKAN_DRIVER_VERSION}'.")
-
-        # Compare driver and header version
-        if(${VULKAN_DRIVER_VERSION} VERSION_LESS ${VULKAN_HEADER_VERSION})
-        # Version missmatch. Let us check if the minor version is the same.
-            if(${VULKAN_DRIVER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
-                set(VULKAN_DRIVER_MINOR_VERSION ${CMAKE_MATCH_1})
-            else()
-                message(FATAL_ERROR "Invalid Vulkan driver version '${VULKAN_DRIVER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
-            endif()
-            if(${VULKAN_HEADER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
-                set(VULKAN_HEADER_MINOR_VERSION ${CMAKE_MATCH_1})
-            else()
-                message(FATAL_ERROR "Invalid Vulkan Header version '${VULKAN_HEADER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
-            endif()
-
-            if(${VULKAN_DRIVER_MINOR_VERSION} EQUAL ${VULKAN_HEADER_MINOR_VERSION})
-                message(WARNING "Your GPU driver does not support Vulkan > ${VULKAN_DRIVER_VERSION}, but you try to use Vulkan Header ${VULKAN_HEADER_VERSION}. At least your driver supports the same minor version (${VULKAN_DRIVER_MINOR_VERSION}), so this should be fine but keep it in mind in case you encounter any strange behavior.")
-                set(VALID_GPU ${GPU})
-                set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
-                break()
-            else()
-                message(STATUS "${GPU} does not support Vulkan > ${VULKAN_DRIVER_VERSION}.")
-            endif()
-        else()
-            set(VALID_GPU ${GPU})
-            set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
-            break()
-        endif()
-    endforeach()
-
-    if("${VALID_GPU}" STREQUAL "")
-        message(FATAL_ERROR "None of your GPUs supports Vulkan Header ${VULKAN_HEADER_VERSION}. Please try updating your driver, or downgrade your Vulkan headers. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
-    else()
-        message("Valid GPU (${VALID_GPU}) for Vulkan header version ${VULKAN_HEADER_VERSION} found. ${VALID_GPU} supports up to Vulkan ${VALID_VULKAN_VERSION}.")
-    endif()
-
-endfunction()
diff --git a/kompute/cmake/code_coverage.cmake b/kompute/cmake/code_coverage.cmake
deleted file mode 100644
index 7fb6ce264b6ab..0000000000000
--- a/kompute/cmake/code_coverage.cmake
+++ /dev/null
@@ -1,35 +0,0 @@
-# Code coverage
-set(CMAKE_BUILD_TYPE COVERAGE CACHE INTERNAL "Coverage build enabled")
-message(STATUS "Enabling gcov support")
-
-if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    set(COVERAGE_FLAG "--coverage")
-endif()
-
-set(CMAKE_CXX_FLAGS_COVERAGE
-    "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
-    CACHE STRING "Flags used by the C++ compiler during coverage builds."
-    FORCE)
-set(CMAKE_C_FLAGS_COVERAGE
-    "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
-    CACHE STRING "Flags used by the C compiler during coverage builds."
-    FORCE)
-set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
-    ""
-    CACHE STRING "Flags used for linking binaries during coverage builds."
-    FORCE)
-set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
-    ""
-    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
-    FORCE)
-
-set(CODECOV_DIR ${CMAKE_CURRENT_BINARY_DIR}/codecov/)
-set(CODECOV_DIR_LCOV ${CODECOV_DIR}lcov/)
-set(CODECOV_FILENAME_LCOV_INFO lcov.info)
-set(CODECOV_FILENAME_LCOV_INFO_FULL lcov_full.info)
-set(CODECOV_DIR_HTML ${CODECOV_DIR}html/)
-
-mark_as_advanced(CMAKE_CXX_FLAGS_COVERAGE
-    CMAKE_C_FLAGS_COVERAGE
-    CMAKE_EXE_LINKER_FLAGS_COVERAGE
-    CMAKE_SHARED_LINKER_FLAGS_COVERAGE)
diff --git a/kompute/cmake/deprecation_warnings.cmake b/kompute/cmake/deprecation_warnings.cmake
deleted file mode 100644
index 1ed1f455507a8..0000000000000
--- a/kompute/cmake/deprecation_warnings.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
-    message(FATAL_ERROR "'KOMPUTE_OPT_REPO_SUBMODULE_BUILD' got replaced by 'KOMPUTE_OPT_USE_BUILT_IN_SPDLOG', 'KOMPUTE_OPT_USE_BUILT_IN_FMT', 'KOMPUTE_OPT_USE_BUILT_IN_GOOGLE_TEST', 'KOMPUTE_OPT_USE_BUILT_IN_PYBIND11' and 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER'. Please use them instead.")
-endif()
-
-if(KOMPUTE_OPT_BUILD_AS_SHARED_LIB)
-    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_AS_SHARED_LIB' is deprecated and should not be used. Instead use the default 'BUILD_SHARED_LIBS' CMake switch.")
-endif()
-
-if(KOMPUTE_OPT_BUILD_SINGLE_HEADER)
-    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_SINGLE_HEADER' is deprecated and should not be used. The single header will now always be build and can be included via '#include<kompute/kompute.h>'.")
-endif()
-
-if(KOMPUTE_OPT_ENABLE_SPDLOG)
-    message(FATAL_ERROR "'KOMPUTE_OPT_ENABLE_SPDLOG' is deprecated and should not be used. It got replaced by 'KOMPUTE_OPT_LOG_LEVEL'. This option can be set to a variety of log levels (e.g. 'Off', 'Trace', 'Debug', 'Default', ...).")
-endif()
\ No newline at end of file
diff --git a/kompute/cmake/komputeConfig.cmake.in b/kompute/cmake/komputeConfig.cmake.in
deleted file mode 100644
index 87e8a99e23e99..0000000000000
--- a/kompute/cmake/komputeConfig.cmake.in
+++ /dev/null
@@ -1,8 +0,0 @@
-include(CMakeFindDependencyMacro)
-@PACKAGE_INIT@
-
-find_dependency(VULKAN REQUIRED)
-
-include(${CMAKE_CURRENT_LIST_DIR}/komputeTargets.cmake)
-
-check_required_components(kompute)
\ No newline at end of file
diff --git a/kompute/cmake/vulkan_shader_compiler.cmake b/kompute/cmake/vulkan_shader_compiler.cmake
deleted file mode 100644
index acc27b57c2acc..0000000000000
--- a/kompute/cmake/vulkan_shader_compiler.cmake
+++ /dev/null
@@ -1,43 +0,0 @@
-function(vulkan_compile_shader)
-     find_program(GLS_LANG_VALIDATOR_PATH NAMES glslangValidator)
-     if(GLS_LANG_VALIDATOR_PATH STREQUAL "GLS_LANG_VALIDATOR_PATH-NOTFOUND")
-          message(FATAL_ERROR "glslangValidator not found.")
-          return()
-     endif()
-
-     cmake_parse_arguments(SHADER_COMPILE "" "INFILE;OUTFILE;NAMESPACE;RELATIVE_PATH" "" ${ARGN})
-     set(SHADER_COMPILE_INFILE_FULL "${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_COMPILE_INFILE}")
-     set(SHADER_COMPILE_SPV_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_INFILE}.spv")
-     set(SHADER_COMPILE_HEADER_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_OUTFILE}")
-
-     if(NOT SHADER_COMPILE_RELATIVE_PATH)
-          set(SHADER_COMPILE_RELATIVE_PATH "${PROJECT_SOURCE_DIR}/cmake")
-     endif()
-    
-     # .comp -> .spv
-     add_custom_command(OUTPUT "${SHADER_COMPILE_SPV_FILE_FULL}"
-                        COMMAND "${GLS_LANG_VALIDATOR_PATH}"
-                        ARGS "-V"
-                             "${SHADER_COMPILE_INFILE_FULL}"
-                             "-o"
-                             "${SHADER_COMPILE_SPV_FILE_FULL}"
-                        COMMENT "Compile vulkan compute shader from file '${SHADER_COMPILE_INFILE_FULL}' to '${SHADER_COMPILE_SPV_FILE_FULL}'."
-                        MAIN_DEPENDENCY "${SHADER_COMPILE_INFILE_FULL}")
-
-     # Check if big or little endian
-     include (TestBigEndian)
-     TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
-
-     # .spv -> .hpp
-     add_custom_command(OUTPUT "${SHADER_COMPILE_HEADER_FILE_FULL}"
-                        COMMAND ${CMAKE_COMMAND}
-                        ARGS "-DINPUT_SHADER_FILE=${SHADER_COMPILE_SPV_FILE_FULL}"
-                             "-DOUTPUT_HEADER_FILE=${SHADER_COMPILE_HEADER_FILE_FULL}"
-                             "-DHEADER_NAMESPACE=${SHADER_COMPILE_NAMESPACE}"
-                             "-DIS_BIG_ENDIAN=${IS_BIG_ENDIAN}"
-                             "-P"
-                             "${SHADER_COMPILE_RELATIVE_PATH}/bin_file_to_header.cmake"
-                        WORKING_DIRECTORY "${SHADER_COMPILE_RELATIVE_PATH}"
-                        COMMENT "Converting compiled shader '${SHADER_COMPILE_SPV_FILE_FULL}' to header file '${SHADER_COMPILE_HEADER_FILE_FULL}'."
-                        MAIN_DEPENDENCY "${SHADER_COMPILE_SPV_FILE_FULL}")
-endfunction()
diff --git a/kompute/config/FindSphinx.cmake b/kompute/config/FindSphinx.cmake
deleted file mode 100644
index c645ccc9ff366..0000000000000
--- a/kompute/config/FindSphinx.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# Look for an executable called sphinx-build
-find_program(SPHINX_EXECUTABLE
-    NAMES sphinx-build
-    DOC "Path to sphinx-build executable")
-
-if(SPHINX_EXECUTABLE STREQUAL "SPHINX_EXECUTABLE-NOTFOUND")
-    message(FATAL_ERROR "sphinx-build not found.")
-endif()
-
-include(FindPackageHandleStandardArgs)
-
-# Handle standard arguments to find_package like REQUIRED and QUIET
-find_package_handle_standard_args(
-    Sphinx
-    "Failed to find sphinx-build executable"
-    SPHINX_EXECUTABLE)
diff --git a/kompute/external/bin/xxd.c b/kompute/external/bin/xxd.c
deleted file mode 100644
index 60ed3f712a766..0000000000000
--- a/kompute/external/bin/xxd.c
+++ /dev/null
@@ -1,819 +0,0 @@
-/*
-As indicated at https://lists.debian.org/debian-legal/2015/01/msg00037.html,
-the author has permitted redistribution of xxd under the MIT license, as follows:
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * xxd: my hexdump facility. jw
- *
- *  2.10.90 changed to word output
- *  3.03.93 new indent style, dumb bug inserted and fixed.
- *	    -c option, mls
- * 26.04.94 better option parser, -ps, -l, -s added.
- *  1.07.94 -r badly needs - as input file.  Per default autoskip over
- *	       consecutive lines of zeroes, as unix od does.
- *	    -a shows them too.
- *	    -i dump as c-style #include "file.h"
- *  1.11.95 if "xxd -i" knows the filename, an 'unsigned char filename_bits[]'
- *	    array is written in correct c-syntax.
- *	    -s improved, now defaults to absolute seek, relative requires a '+'.
- *	    -r improved, now -r -s -0x... is supported.
- *	       change/suppress leading '\0' bytes.
- *	    -l n improved: stops exactly after n bytes.
- *	    -r improved, better handling of partial lines with trailing garbage.
- *	    -r improved, now -r -p works again!
- *	    -r improved, less flushing, much faster now! (that was silly)
- *  3.04.96 Per repeated request of a single person: autoskip defaults to off.
- * 15.05.96 -v added. They want to know the version.
- *	    -a fixed, to show last line inf file ends in all zeros.
- *	    -u added: Print upper case hex-letters, as preferred by unix bc.
- *	    -h added to usage message. Usage message extended.
- *	    Now using outfile if specified even in normal mode, aehem.
- *	    No longer mixing of ints and longs. May help doze people.
- *	    Added binify ioctl for same reason. (Enough Doze stress for 1996!)
- * 16.05.96 -p improved, removed occasional superfluous linefeed.
- * 20.05.96 -l 0 fixed. tried to read anyway.
- * 21.05.96 -i fixed. now honours -u, and prepends __ to numeric filenames.
- *	    compile -DWIN32 for NT or W95. George V. Reilly, * -v improved :-)
- *	    support --gnuish-longhorn-options
- * 25.05.96 MAC support added: CodeWarrior already uses ``outline'' in Types.h
- *	    which is included by MacHeaders (Axel Kielhorn). Renamed to
- *	    xxdline().
- *  7.06.96 -i printed 'int' instead of 'char'. *blush*
- *	    added Bram's OS2 ifdefs...
- * 18.07.96 gcc -Wall @ SunOS4 is now slient.
- *	    Added osver for MSDOS/DJGPP/WIN32.
- * 29.08.96 Added size_t to strncmp() for Amiga.
- * 24.03.97 Windows NT support (Phil Hanna). Clean exit for Amiga WB (Bram)
- * 02.04.97 Added -E option, to have EBCDIC translation instead of ASCII
- *	    (azc10@yahoo.com)
- * 22.05.97 added -g (group octets) option (jcook@namerica.kla.com).
- * 23.09.98 nasty -p -r misfeature fixed: slightly wrong output, when -c was
- *	    missing or wrong.
- * 26.09.98 Fixed: 'xxd -i infile outfile' did not truncate outfile.
- * 27.10.98 Fixed: -g option parser required blank.
- *	    option -b added: 01000101 binary output in normal format.
- * 16.05.00 Added VAXC changes by Stephen P. Wall
- * 16.05.00 Improved MMS file and merge for VMS by Zoltan Arpadffy
- *
- * (c) 1990-1998 by Juergen Weigert (jnweiger@informatik.uni-erlangen.de)
- *
- * Small changes made afterwards by Bram Moolenaar et al.
- *
- * Distribute freely and credit me,
- * make money and share with me,
- * lose money and don't ask me.
- *
- *
- */
-
-/* Visual Studio 2005 has 'deprecated' many of the standard CRT functions */
-#if _MSC_VER >= 1400
-# define _CRT_SECURE_NO_DEPRECATE
-# define _CRT_NONSTDC_NO_DEPRECATE
-#endif
-
-#include <stdio.h>
-#ifdef VAXC
-# include <file.h>
-#else
-# include <fcntl.h>
-#endif
-#ifdef __TSC__
-# define MSDOS
-#endif
-#if !defined(OS2) && defined(__EMX__)
-# define OS2
-#endif
-#if defined(MSDOS) || defined(WIN32) || defined(OS2) || defined(__BORLANDC__) || defined(CYGWIN)
-# include <io.h>	/* for setmode() */
-#else
-# ifdef UNIX
-#  include <unistd.h>
-# endif
-#endif
-#include <stdlib.h>
-#include <string.h>	/* for strncmp() */
-#include <ctype.h>	/* for isalnum() */
-#if __MWERKS__ && !defined(BEBOX)
-# include <unix.h>	/* for fdopen() on MAC */
-#endif
-
-#if defined(__BORLANDC__) && __BORLANDC__ <= 0x0410 && !defined(fileno)
-/* Missing define and prototype grabbed from the BC 4.0 <stdio.h> */
-# define fileno(f)       ((f)->fd)
-FILE   _FAR *_Cdecl _FARFUNC fdopen(int __handle, char _FAR *__type);
-#endif
-
-
-/*  This corrects the problem of missing prototypes for certain functions
- *  in some GNU installations (e.g. SunOS 4.1.x).
- *  Darren Hiebert <darren@hmi.com> (sparc-sun-sunos4.1.3_U1/2.7.2.2)
- */
-#if defined(__GNUC__) && defined(__STDC__)
-# ifndef __USE_FIXED_PROTOTYPES__
-#  define __USE_FIXED_PROTOTYPES__
-# endif
-#endif
-
-#ifndef __USE_FIXED_PROTOTYPES__
-/*
- * This is historic and works only if the compiler really has no prototypes:
- *
- * Include prototypes for Sun OS 4.x, when using an ANSI compiler.
- * FILE is defined on OS 4.x, not on 5.x (Solaris).
- * if __SVR4 is defined (some Solaris versions), don't include this.
- */
-#if defined(sun) && defined(FILE) && !defined(__SVR4) && defined(__STDC__)
-#  define __P(a) a
-/* excerpt from my sun_stdlib.h */
-extern int fprintf __P((FILE *, char *, ...));
-extern int fputs   __P((char *, FILE *));
-extern int _flsbuf __P((unsigned char, FILE *));
-extern int _filbuf __P((FILE *));
-extern int fflush  __P((FILE *));
-extern int fclose  __P((FILE *));
-extern int fseek   __P((FILE *, long, int));
-extern int rewind  __P((FILE *));
-
-extern void perror __P((char *));
-# endif
-#endif
-
-extern long int strtol();
-extern long int ftell();
-
-char version[] = "xxd V1.10 27oct98 by Juergen Weigert";
-#ifdef WIN32
-char osver[] = " (Win32)";
-#else
-# ifdef DJGPP
-char osver[] = " (dos 32 bit)";
-# else
-#  ifdef MSDOS
-char osver[] = " (dos 16 bit)";
-#  else
-char osver[] = "";
-#  endif
-# endif
-#endif
-
-#if !defined(CYGWIN) && (defined(CYGWIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__))
-# define CYGWIN
-#endif
-#if defined(MSDOS) || defined(WIN32) || defined(OS2)
-# define BIN_READ(yes)  ((yes) ? "rb" : "rt")
-# define BIN_WRITE(yes) ((yes) ? "wb" : "wt")
-# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
-# define BIN_ASSIGN(fp, yes) setmode(fileno(fp), (yes) ? O_BINARY : O_TEXT)
-# define PATH_SEP '\\'
-#elif defined(CYGWIN)
-# define BIN_READ(yes)  ((yes) ? "rb" : "rt")
-# define BIN_WRITE(yes) ((yes) ? "wb" : "w")
-# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
-# define BIN_ASSIGN(fp, yes) ((yes) ? (void) setmode(fileno(fp), O_BINARY) : (void) (fp))
-# define PATH_SEP '/'
-#else
-# ifdef VMS
-#  define BIN_READ(dummy)  "r"
-#  define BIN_WRITE(dummy) "w"
-#  define BIN_CREAT(dummy) O_CREAT
-#  define BIN_ASSIGN(fp, dummy) fp
-#  define PATH_SEP ']'
-#  define FILE_SEP '.'
-# else
-#  define BIN_READ(dummy)  "r"
-#  define BIN_WRITE(dummy) "w"
-#  define BIN_CREAT(dummy) O_CREAT
-#  define BIN_ASSIGN(fp, dummy) fp
-#  define PATH_SEP '/'
-# endif
-#endif
-
-/* open has only to arguments on the Mac */
-#if __MWERKS__
-# define OPEN(name, mode, umask) open(name, mode)
-#else
-# define OPEN(name, mode, umask) open(name, mode, umask)
-#endif
-
-#ifdef AMIGA
-# define STRNCMP(s1, s2, l) strncmp(s1, s2, (size_t)l)
-#else
-# define STRNCMP(s1, s2, l) strncmp(s1, s2, l)
-#endif
-
-#ifndef __P
-# if defined(__STDC__) || defined(MSDOS) || defined(WIN32) || defined(OS2) \
-        || defined(__BORLANDC__)
-#  define __P(a) a
-# else
-#  define __P(a) ()
-# endif
-#endif
-
-/* Let's collect some prototypes */
-/* CodeWarrior is really picky about missing prototypes */
-static void exit_with_usage __P((char *));
-static int huntype __P((FILE *, FILE *, FILE *, char *, int, int, long));
-static void xxdline __P((FILE *, char *, int));
-
-#define TRY_SEEK	/* attempt to use lseek, or skip forward by reading */
-#define COLS 256	/* change here, if you ever need more columns */
-#define LLEN (11 + (9*COLS-1)/1 + COLS + 2)
-
-char hexxa[] = "0123456789abcdef0123456789ABCDEF", *hexx = hexxa;
-
-/* the different hextypes known by this program: */
-#define HEX_NORMAL 0
-#define HEX_POSTSCRIPT 1
-#define HEX_CINCLUDE 2
-#define HEX_BITS 3		/* not hex a dump, but bits: 01111001 */
-
-static void
-exit_with_usage(pname)
-char *pname;
-{
-  fprintf(stderr, "Usage:\n       %s [options] [infile [outfile]]\n", pname);
-  fprintf(stderr, "    or\n       %s -r [-s [-]offset] [-c cols] [-ps] [infile [outfile]]\n", pname);
-  fprintf(stderr, "Options:\n");
-  fprintf(stderr, "    -a          toggle autoskip: A single '*' replaces nul-lines. Default off.\n");
-  fprintf(stderr, "    -b          binary digit dump (incompatible with -p,-i,-r). Default hex.\n");
-  fprintf(stderr, "    -c cols     format <cols> octets per line. Default 16 (-i: 12, -ps: 30).\n");
-  fprintf(stderr, "    -E          show characters in EBCDIC. Default ASCII.\n");
-  fprintf(stderr, "    -g          number of octets per group in normal output. Default 2.\n");
-  fprintf(stderr, "    -h          print this summary.\n");
-  fprintf(stderr, "    -i          output in C include file style.\n");
-  fprintf(stderr, "    -l len      stop after <len> octets.\n");
-  fprintf(stderr, "    -ps         output in postscript plain hexdump style.\n");
-  fprintf(stderr, "    -r          reverse operation: convert (or patch) hexdump into binary.\n");
-  fprintf(stderr, "    -r -s off   revert with <off> added to file positions found in hexdump.\n");
-  fprintf(stderr, "    -s %sseek  start at <seek> bytes abs. %sinfile offset.\n",
-#ifdef TRY_SEEK
-      "[+][-]", "(or +: rel.) ");
-#else
-      "", "");
-#endif
-  fprintf(stderr, "    -u          use upper case hex letters.\n");
-  fprintf(stderr, "    -v          show version: \"%s%s\".\n", version, osver);
-  exit(1);
-}
-
-/*
- * Max. cols binary characters are decoded from the input stream per line.
- * Two adjacent garbage characters after evaluated data delimit valid data.
- * Everything up to the next newline is discarded.
- *
- * The name is historic and came from 'undo type opt h'.
- */
-static int
-huntype(fpi, fpo, fperr, pname, cols, hextype, base_off)
-FILE *fpi, *fpo, *fperr;
-char *pname;
-int cols, hextype;
-long base_off;
-{
-  int c, ign_garb = 1, n1 = -1, n2 = 0, n3, p = cols;
-  long have_off = 0, want_off = 0;
-
-  rewind(fpi);
-
-  while ((c = getc(fpi)) != EOF)
-    {
-      if (c == '\r')	/* Doze style input file? */
-    continue;
-
-#if 0	/* this doesn't work when there is normal text after the hex codes in
-       the last line that looks like hex */
-      if (c == ' ' || c == '\n' || c == '\t')  /* allow multiple spaces */
-    continue;
-#endif
-
-      n3 = n2;
-      n2 = n1;
-
-      if (c >= '0' && c <= '9')
-    n1 = c - '0';
-      else if (c >= 'a' && c <= 'f')
-    n1 = c - 'a' + 10;
-      else if (c >= 'A' && c <= 'F')
-    n1 = c - 'A' + 10;
-      else
-    {
-      n1 = -1;
-      if (ign_garb)
-        continue;
-    }
-
-      ign_garb = 0;
-
-      if (p >= cols)
-    {
-      if (!hextype)
-        {
-          if (n1 < 0)
-        {
-          p = 0;
-          continue;
-        }
-          want_off = (want_off << 4) | n1;
-          continue;
-        }
-      else
-        p = 0;
-    }
-
-      if (base_off + want_off != have_off)
-    {
-      fflush(fpo);
-#ifdef TRY_SEEK
-      c = fseek(fpo, base_off + want_off - have_off, 1);
-      if (c >= 0)
-        have_off = base_off + want_off;
-#endif
-      if (base_off + want_off < have_off)
-        {
-          fprintf(fperr, "%s: sorry, cannot seek backwards.\n", pname);
-          return 5;
-        }
-      for (; have_off < base_off + want_off; have_off++)
-        putc(0, fpo);
-    }
-
-      if (n2 >= 0 && n1 >= 0)
-    {
-      putc((n2 << 4) | n1, fpo);
-      have_off++;
-      want_off++;
-      n1 = -1;
-      if ((++p >= cols) && !hextype)
-        {
-          /* skip rest of line as garbage */
-          want_off = 0;
-          while ((c = getc(fpi)) != '\n' && c != EOF)
-        ;
-          ign_garb = 1;
-        }
-    }
-      else if (n1 < 0 && n2 < 0 && n3 < 0)
-    {
-      /* already stumbled into garbage, skip line, wait and see */
-      if (!hextype)
-        want_off = 0;
-      while ((c = getc(fpi)) != '\n' && c != EOF)
-        ;
-      ign_garb = 1;
-    }
-    }
-  fflush(fpo);
-#ifdef TRY_SEEK
-  fseek(fpo, 0L, 2);
-#endif
-  fclose(fpo);
-  fclose(fpi);
-  return 0;
-}
-
-/*
- * Print line l. If nz is false, xxdline regards the line a line of
- * zeroes. If there are three or more consecutive lines of zeroes,
- * they are replaced by a single '*' character.
- *
- * If the output ends with more than two lines of zeroes, you
- * should call xxdline again with l being the last line and nz
- * negative. This ensures that the last line is shown even when
- * it is all zeroes.
- *
- * If nz is always positive, lines are never suppressed.
- */
-static void
-xxdline(fp, l, nz)
-FILE *fp;
-char *l;
-int nz;
-{
-  static char z[LLEN+1];
-  static int zero_seen = 0;
-
-  if (!nz && zero_seen == 1)
-    strcpy(z, l);
-
-  if (nz || !zero_seen++)
-    {
-      if (nz)
-    {
-      if (nz < 0)
-        zero_seen--;
-      if (zero_seen == 2)
-        fputs(z, fp);
-      if (zero_seen > 2)
-        fputs("*\n", fp);
-    }
-      if (nz >= 0 || zero_seen > 0)
-    fputs(l, fp);
-      if (nz)
-    zero_seen = 0;
-    }
-}
-
-/* This is an EBCDIC to ASCII conversion table */
-/* from a proposed BTL standard April 16, 1979 */
-static unsigned char etoa64[] =
-{
-    0040,0240,0241,0242,0243,0244,0245,0246,
-    0247,0250,0325,0056,0074,0050,0053,0174,
-    0046,0251,0252,0253,0254,0255,0256,0257,
-    0260,0261,0041,0044,0052,0051,0073,0176,
-    0055,0057,0262,0263,0264,0265,0266,0267,
-    0270,0271,0313,0054,0045,0137,0076,0077,
-    0272,0273,0274,0275,0276,0277,0300,0301,
-    0302,0140,0072,0043,0100,0047,0075,0042,
-    0303,0141,0142,0143,0144,0145,0146,0147,
-    0150,0151,0304,0305,0306,0307,0310,0311,
-    0312,0152,0153,0154,0155,0156,0157,0160,
-    0161,0162,0136,0314,0315,0316,0317,0320,
-    0321,0345,0163,0164,0165,0166,0167,0170,
-    0171,0172,0322,0323,0324,0133,0326,0327,
-    0330,0331,0332,0333,0334,0335,0336,0337,
-    0340,0341,0342,0343,0344,0135,0346,0347,
-    0173,0101,0102,0103,0104,0105,0106,0107,
-    0110,0111,0350,0351,0352,0353,0354,0355,
-    0175,0112,0113,0114,0115,0116,0117,0120,
-    0121,0122,0356,0357,0360,0361,0362,0363,
-    0134,0237,0123,0124,0125,0126,0127,0130,
-    0131,0132,0364,0365,0366,0367,0370,0371,
-    0060,0061,0062,0063,0064,0065,0066,0067,
-    0070,0071,0372,0373,0374,0375,0376,0377
-};
-
-const char* extract_filename(const char* path) {
-    const char* filename = strrchr(path, '/');
-    if (filename) {
-        return filename + 1;
-    }
-    return path;
-}
-
-int
-main(argc, argv)
-int argc;
-char *argv[];
-{
-  FILE *fp, *fpo;
-  int c, e, p = 0, relseek = 1, negseek = 0, revert = 0;
-  int cols = 0, nonzero = 0, autoskip = 0, hextype = HEX_NORMAL;
-  int ebcdic = 0;
-  int octspergrp = -1;	/* number of octets grouped in output */
-  int grplen;		/* total chars per octet group */
-  long length = -1, n = 0, seekoff = 0;
-  char l[LLEN+1];
-  char *pname, *pp;
-
-#ifdef AMIGA
-  /* This program doesn't work when started from the Workbench */
-  if (argc == 0)
-    exit(1);
-#endif
-
-  pname = argv[0];
-  for (pp = pname; *pp; )
-    if (*pp++ == PATH_SEP)
-      pname = pp;
-#ifdef FILE_SEP
-  for (pp = pname; *pp; pp++)
-    if (*pp == FILE_SEP)
-      {
-    *pp = '\0';
-    break;
-      }
-#endif
-
-  while (argc >= 2)
-    {
-      pp = argv[1] + (!STRNCMP(argv[1], "--", 2) && argv[1][2]);
-       if (!STRNCMP(pp, "-a", 2)) autoskip = 1 - autoskip;
-      else if (!STRNCMP(pp, "-b", 2)) hextype = HEX_BITS;
-      else if (!STRNCMP(pp, "-u", 2)) hexx = hexxa + 16;
-      else if (!STRNCMP(pp, "-p", 2)) hextype = HEX_POSTSCRIPT;
-      else if (!STRNCMP(pp, "-i", 2)) hextype = HEX_CINCLUDE;
-      else if (!STRNCMP(pp, "-r", 2)) revert++;
-      else if (!STRNCMP(pp, "-E", 2)) ebcdic++;
-      else if (!STRNCMP(pp, "-v", 2))
-    {
-      fprintf(stderr, "%s%s\n", version, osver);
-      exit(0);
-    }
-      else if (!STRNCMP(pp, "-c", 2))
-    {
-      if (pp[2] && STRNCMP("ols", pp + 2, 3))
-        cols = (int)strtol(pp + 2, NULL, 0);
-      else
-        {
-          if (!argv[2])
-        exit_with_usage(pname);
-          cols = (int)strtol(argv[2], NULL, 0);
-          argv++;
-          argc--;
-        }
-    }
-      else if (!STRNCMP(pp, "-g", 2))
-    {
-      if (pp[2] && STRNCMP("roupsize", pp + 2, 8))
-        octspergrp = (int)strtol(pp + 2, NULL, 0);
-      else
-        {
-          if (!argv[2])
-        exit_with_usage(pname);
-          octspergrp = (int)strtol(argv[2], NULL, 0);
-          argv++;
-          argc--;
-        }
-    }
-      else if (!STRNCMP(pp, "-s", 2))
-    {
-      relseek = 0;
-      negseek = 0;
-      if (pp[2] && STRNCMP("kip", pp+2, 3) && STRNCMP("eek", pp+2, 3))
-        {
-#ifdef TRY_SEEK
-          if (pp[2] == '+')
-        relseek++;
-          if (pp[2+relseek] == '-')
-        negseek++;
-#endif
-          seekoff = strtol(pp + 2+relseek+negseek, (char **)NULL, 0);
-        }
-      else
-        {
-          if (!argv[2])
-        exit_with_usage(pname);
-#ifdef TRY_SEEK
-          if (argv[2][0] == '+')
-        relseek++;
-          if (argv[2][relseek] == '-')
-        negseek++;
-#endif
-          seekoff = strtol(argv[2] + relseek+negseek, (char **)NULL, 0);
-          argv++;
-          argc--;
-        }
-    }
-      else if (!STRNCMP(pp, "-l", 2))
-    {
-      if (pp[2] && STRNCMP("en", pp + 2, 2))
-        length = strtol(pp + 2, (char **)NULL, 0);
-      else
-        {
-          if (!argv[2])
-        exit_with_usage(pname);
-          length = strtol(argv[2], (char **)NULL, 0);
-          argv++;
-          argc--;
-        }
-    }
-      else if (!strcmp(pp, "--"))	/* end of options */
-    {
-      argv++;
-      argc--;
-      break;
-    }
-      else if (pp[0] == '-' && pp[1])	/* unknown option */
-    exit_with_usage(pname);
-      else
-    break;				/* not an option */
-
-      argv++;				/* advance to next argument */
-      argc--;
-    }
-
-  if (!cols)
-    switch (hextype)
-      {
-      case HEX_POSTSCRIPT:	cols = 30; break;
-      case HEX_CINCLUDE:	cols = 12; break;
-      case HEX_BITS:		cols = 6; break;
-      case HEX_NORMAL:
-      default:			cols = 16; break;
-      }
-
-  if (octspergrp < 0)
-    switch (hextype)
-      {
-      case HEX_BITS:		octspergrp = 1; break;
-      case HEX_NORMAL:		octspergrp = 2; break;
-      case HEX_POSTSCRIPT:
-      case HEX_CINCLUDE:
-      default:			octspergrp = 0; break;
-      }
-
-  if (cols < 1 || ((hextype == HEX_NORMAL || hextype == HEX_BITS)
-                                && (cols > COLS)))
-    {
-      fprintf(stderr, "%s: invalid number of columns (max. %d).\n", pname, COLS);
-      exit(1);
-    }
-
-  if (octspergrp < 1)
-    octspergrp = cols;
-
-  if (argc > 3)
-    exit_with_usage(pname);
-
-  if (argc == 1 || (argv[1][0] == '-' && !argv[1][1]))
-    BIN_ASSIGN(fp = stdin, !revert);
-  else
-    {
-      if ((fp = fopen(argv[1], BIN_READ(!revert))) == NULL)
-    {
-      fprintf(stderr,"%s: ", pname);
-      perror(argv[1]);
-      return 2;
-    }
-    }
-
-  if (argc < 3 || (argv[2][0] == '-' && !argv[2][1]))
-    BIN_ASSIGN(fpo = stdout, revert);
-  else
-    {
-      int fd;
-      int mode = revert ? O_WRONLY : (O_TRUNC|O_WRONLY);
-
-      if (((fd = OPEN(argv[2], mode | BIN_CREAT(revert), 0666)) < 0) ||
-      (fpo = fdopen(fd, BIN_WRITE(revert))) == NULL)
-    {
-      fprintf(stderr, "%s: ", pname);
-      perror(argv[2]);
-      return 3;
-    }
-      rewind(fpo);
-    }
-
-  if (revert)
-    {
-      if (hextype && (hextype != HEX_POSTSCRIPT))
-    {
-      fprintf(stderr, "%s: sorry, cannot revert this type of hexdump\n", pname);
-      return -1;
-    }
-      return huntype(fp, fpo, stderr, pname, cols, hextype,
-        negseek ? -seekoff : seekoff);
-    }
-
-  if (seekoff || negseek || !relseek)
-    {
-#ifdef TRY_SEEK
-      if (relseek)
-    e = fseek(fp, negseek ? -seekoff : seekoff, 1);
-      else
-    e = fseek(fp, negseek ? -seekoff : seekoff, negseek ? 2 : 0);
-      if (e < 0 && negseek)
-    {
-      fprintf(stderr, "%s: sorry cannot seek.\n", pname);
-      return 4;
-    }
-      if (e >= 0)
-    seekoff = ftell(fp);
-      else
-#endif
-    {
-      long s = seekoff;
-
-      while (s--)
-        (void)getc(fp);
-    }
-    }
-
-  if (hextype == HEX_CINCLUDE)
-    {
-      const char* filename = extract_filename(argv[1]);
-
-      if (fp != stdin)
-    {
-      fprintf(fpo, "unsigned char %s", isdigit((int)filename[0]) ? "__" : "");
-      for (e = 0; (c = filename[e]) != 0; e++)
-        putc(isalnum(c) ? c : '_', fpo);
-      fputs("[] = {\n", fpo);
-    }
-
-      p = 0;
-      while ((length < 0 || p < length) && (c = getc(fp)) != EOF)
-    {
-      fprintf(fpo, (hexx == hexxa) ? "%s0x%02x" : "%s0X%02X",
-        (p % cols) ? ", " : ",\n  "+2*!p,  c);
-      p++;
-    }
-
-      if (p)
-    fputs("\n};\n"+3*(fp == stdin), fpo);
-
-      if (fp != stdin)
-    {
-      fprintf(fpo, "unsigned int %s", isdigit((int)filename[0]) ? "__" : "");
-      for (e = 0; (c = filename[e]) != 0; e++)
-        putc(isalnum(c) ? c : '_', fpo);
-      fprintf(fpo, "_len = %d;\n", p);
-    }
-
-      fclose(fp);
-      fclose(fpo);
-      return 0;
-    }
-
-  if (hextype == HEX_POSTSCRIPT)
-    {
-      p = cols;
-      while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
-    {
-      putchar(hexx[(e >> 4) & 0xf]);
-      putchar(hexx[(e     ) & 0xf]);
-      n++;
-      if (!--p)
-        {
-          putchar('\n');
-          p = cols;
-        }
-    }
-      if (p < cols)
-    putchar('\n');
-      fclose(fp);
-      fclose(fpo);
-      return 0;
-    }
-
-  /* hextype: HEX_NORMAL or HEX_BITS */
-
-  if (hextype == HEX_NORMAL)
-    grplen = octspergrp + octspergrp + 1;	/* chars per octet group */
-  else	/* hextype == HEX_BITS */
-    grplen = 8 * octspergrp + 1;
-
-  while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
-    {
-      if (p == 0)
-    {
-      sprintf(l, "%07lx: ", n + seekoff);
-      for (c = 9; c < LLEN; l[c++] = ' ');
-    }
-      if (hextype == HEX_NORMAL)
-    {
-      l[c = (9 + (grplen * p) / octspergrp)] = hexx[(e >> 4) & 0xf];
-      l[++c]			       = hexx[ e       & 0xf];
-    }
-      else /* hextype == HEX_BITS */
-    {
-      int i;
-
-      c = (9 + (grplen * p) / octspergrp) - 1;
-      for (i = 7; i >= 0; i--)
-        l[++c] = (e & (1 << i)) ? '1' : '0';
-    }
-      if (ebcdic)
-    e = (e < 64) ? '.' : etoa64[e-64];
-      /* When changing this update definition of LLEN above. */
-      l[11 + (grplen * cols - 1)/octspergrp + p] =
-#ifdef __MVS__
-      (e >= 64)
-#else
-      (e > 31 && e < 127)
-#endif
-      ? e : '.';
-      if (e)
-    nonzero++;
-      n++;
-      if (++p == cols)
-    {
-      l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
-      xxdline(fpo, l, autoskip ? nonzero : 1);
-      nonzero = 0;
-      p = 0;
-    }
-    }
-  if (p)
-    {
-      l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
-      xxdline(fpo, l, 1);
-    }
-  else if (autoskip)
-    xxdline(fpo, l, -1);	/* last chance to flush out suppressed lines */
-
-  fclose(fp);
-  fclose(fpo);
-  return 0;
-}
diff --git a/kompute/kompute-config.cmake b/kompute/kompute-config.cmake
deleted file mode 100644
index 10425252ce476..0000000000000
--- a/kompute/kompute-config.cmake
+++ /dev/null
@@ -1,28 +0,0 @@
-# General purpose GPU compute framework built on Vulkan to
-# support 1000s of cross vendor graphics cards
-# (AMD, Qualcomm, NVIDIA & friends). Blazing fast, mobile-enabled,
-# asynchronous and optimized for advanced GPU data processing use cases.
-# Backed by the Linux Foundation. 
-#
-# Finding this module will define the following variables:
-#  KOMPUTE_FOUND - True if the core library has been found
-#  KOMPUTE_LIBRARIES - Path to the core library archive
-#  KOMPUTE_INCLUDE_DIRS - Path to the include directories. Gives access
-#                     to kompute.h, as a single include which must be included in every
-#                     file that uses this interface. Else it also points to the
-#                     directory for individual includes.
-
-find_path(KOMPUTE_INCLUDE_DIR
-          NAMES kompute.h)
-
-find_library(KOMPUTE_LIBRARY
-             NAMES kompute
-             HINTS ${KOMPUTE_LIBRARY_ROOT})
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(KOMPUTE REQUIRED_VARS KOMPUTE_LIBRARY KOMPUTE_INCLUDE_DIR)
-
-if(KOMPUTE_FOUND)
-    set(KOMPUTE_LIBRARIES ${KOMPUTE_LIBRARY})
-    set(KOMPUTE_INCLUDE_DIRS ${KOMPUTE_INCLUDE_DIR})
-endif()
diff --git a/kompute/scripts/convert_shaders.py b/kompute/scripts/convert_shaders.py
deleted file mode 100755
index 11a3ab974d6a6..0000000000000
--- a/kompute/scripts/convert_shaders.py
+++ /dev/null
@@ -1,149 +0,0 @@
-#!/usr/bin/env python3
-"""
-    Script to handle conversion of compute shaders to spirv and to headers
-"""
-import os
-import sys
-import logging
-import click
-import subprocess
-
-logger = logging.getLogger(__name__)
-logger.addHandler(logging.StreamHandler())
-
-is_windows = sys.platform.startswith('win')
-
-CWD=os.path.dirname(os.path.abspath(__file__))
-XXD_LINUX_CMD="xxd"
-XXD_WINDOWS_CMD=os.path.abspath(os.path.join(CWD, "..\\external\\bin\\", "xxd.exe"))
-
-SHADER_GENERATED_NOTICE = """/*
-    THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT
-
-    ---
-
-    Copyright 2020 The Institute for Ethical AI & Machine Learning
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-*/
-"""
-
-@click.command()
-@click.option(
-    "--shader-path",
-    "-p",
-    envvar="KOMPUTE_SHADER_PATH",
-    required=True,
-    help="The path for the directory to build and convert shaders",
-)
-@click.option(
-    "--shader-binary",
-    "-s",
-    envvar="KOMPUTE_SHADER_BINARY",
-    required=True,
-    help="The path for the directory to build and convert shaders",
-)
-@click.option(
-    "--header-path",
-    "-c",
-    envvar="KOMPUTE_HEADER_PATH",
-    default="",
-    required=False,
-    help="The (optional) output file for the cpp header files",
-)
-@click.option(
-    "--verbose",
-    "-v",
-    envvar="KOMPUTE_HEADER_PATH",
-    default=False,
-    is_flag=True,
-    help="Enable versbosity if flag is provided",
-)
-def run_cli(
-    shader_path: str = None,
-    shader_binary: str = None,
-    header_path: bool = None,
-    verbose: bool = None,
-):
-    """
-    CLI function for shader generation
-    """
-
-    if verbose:
-        logger.setLevel(logging.DEBUG)
-    else:
-        logger.setLevel(logging.WARNING)
-
-    logger.debug(f"Starting script with variables: {locals()}")
-
-    if is_windows:
-        logger.debug(f"Running on windows, converting input paths")
-        shader_path = shader_path.replace("/", "\\")
-        header_path = header_path.replace("/", "\\")
-
-    shader_files = []
-    for root, directory, files in os.walk(shader_path):
-        for file in files:
-            if file.endswith(".comp"):
-                shader_files.append(os.path.join(root, file))
-
-    run_cmd = lambda *args: subprocess.check_output([*args]).decode()
-
-    logger.debug(f"Output spirv path: {shader_path}")
-    logger.debug(f"Converting files to spirv: {shader_files}")
-
-    spirv_files = []
-    for file in shader_files:
-        logger.debug(f"Converting to spirv: {file}")
-        spirv_file = f"{file}.spv"
-        run_cmd(shader_binary, "-V", file, "-o", spirv_file)
-        spirv_files.append(spirv_file)
-
-    # Create cpp files if header_path provided
-    if header_path:
-        logger.debug(f"Header path provided. Converting bin files to hpp.")
-        logger.debug(f"Output header path: {shader_path}")
-
-        # Check if xxd command options are available
-        if is_windows:
-            xxd_cmd = XXD_WINDOWS_CMD
-        else:
-            xxd_cmd = XXD_LINUX_CMD
-
-        for file in spirv_files:
-            print(xxd_cmd)
-            header_data = str(run_cmd(xxd_cmd, "-i", file))
-            # Ensuring the variable is a static const unsigned
-            header_data = header_data.replace("unsigned", "static const unsigned")
-            if is_windows:
-                raw_file_name = file.split("\\")[-1]
-            else:
-                raw_file_name = file.split("/")[-1]
-            file_name = f"shader{raw_file_name}"
-            header_file = file_name.replace(".comp.spv", ".hpp")
-            header_file_define = "SHADEROP_" + header_file.replace(".", "_").upper()
-            logger.debug(f"Converting to hpp: {file_name}")
-            with open(os.path.join(header_path, header_file), "w+", newline='\n') as fstream:
-                fstream.write(f"{SHADER_GENERATED_NOTICE}\n")
-                fstream.write(f"#ifndef {header_file_define}\n")
-                fstream.write(f"#define {header_file_define}\n\n")
-                fstream.write("namespace kp {\n")
-                fstream.write("namespace shader_data {\n")
-                fstream.write(f"{header_data}")
-                fstream.write("}\n")
-                fstream.write("}\n")
-                fstream.write(f"#endif // define {header_file_define}\n")
-
-
-if __name__ == "__main__":
-    run_cli()
diff --git a/kompute/scripts/requirements.txt b/kompute/scripts/requirements.txt
deleted file mode 100644
index 4da0425044e90..0000000000000
--- a/kompute/scripts/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-# CLI dependencies
-click==7.1.2
-
-# Dev dependencies
-black==19.10b0
-quom==1.2.0
-Sphinx==3.2.1
-sphinx_material==0.0.30
-breathe==4.20.0
-m2r2==0.2.5
-git+git://github.com/pybind/pybind11_mkdoc.git@master
diff --git a/kompute/setup.py b/kompute/setup.py
deleted file mode 100644
index 09faa8d1a7d32..0000000000000
--- a/kompute/setup.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import os
-import re
-import platform
-import sys
-import sysconfig
-import subprocess
-
-from setuptools import setup, Extension
-from setuptools.command.build_ext import build_ext
-from distutils.version import LooseVersion
-
-curr_dir = os.path.abspath(os.path.dirname(__file__))
-with open(os.path.join(curr_dir, 'README.md'), encoding='utf-8') as f:
-    long_description = f.read()
-
-class CMakeExtension(Extension):
-    def __init__(self, name, sourcedir=''):
-        Extension.__init__(self, name, sources=[])
-        self.sourcedir = os.path.abspath(sourcedir)
-
-
-class CMakeBuild(build_ext):
-    def run(self):
-        try:
-            out = subprocess.check_output(['cmake', '--version'])
-        except OSError:
-            raise RuntimeError("CMake must be installed to build the following extensions: " +
-                               ", ".join(e.name for e in self.extensions))
-
-        cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
-        if cmake_version < '3.15':
-            raise RuntimeError("CMake >= 3.15 is required")
-
-        for ext in self.extensions:
-            self.build_extension(ext)
-
-    def build_extension(self, ext):
-        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
-        # required for auto-detection of auxiliary "native" libs
-        if not extdir.endswith(os.path.sep):
-            extdir += os.path.sep
-
-        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
-                      '-DKOMPUTE_OPT_BUILD_PYTHON=ON',
-                      '-DKOMPUTE_OPT_LOG_LEVEL=Off',
-                      '-DKOMPUTE_OPT_USE_SPDLOG=Off',
-                      '-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
-                      '-DPYTHON_EXECUTABLE=' + sys.executable,
-                      '-DPYTHON_INCLUDE_DIR=' + sysconfig.get_path('include'),
-                      '-DPYTHON_LIBRARY=' + sysconfig.get_path('stdlib'),
-        ]
-
-        cfg = 'Debug' if self.debug else 'Release'
-        build_args = ['--config', cfg]
-
-        env = os.environ.copy()
-        oldCxxFlags = env.get('CXXFLAGS', '')
-        env['CXXFLAGS'] = f'{oldCxxFlags} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"'
-
-        if platform.system() == "Windows":
-            cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}']
-            if sys.maxsize > 2**32:
-                cmake_args += ['-A', 'x64']
-            build_args += ['--', '/m']
-        else:
-            env['CXXFLAGS'] += ' -fPIC'
-            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
-            build_args += ['--', '-j']
-            # Optional environment variable to limit the number of parallel jobs for GitHub actions to reduce RAM usage
-            if 'KOMPUTE_PYTHON_NUM_PARALLEL_THREADS' in env:
-                build_args += env['KOMPUTE_PYTHON_NUM_PARALLEL_THREADS']
-
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-
-        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
-        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
-
-setup(
-    name='kp',
-    version='0.8.1',
-    author='Alejandro Saucedo',
-    description='Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
-    long_description=long_description,
-    long_description_content_type='text/markdown',
-    ext_modules=[CMakeExtension('kp')],
-    install_requires=[
-        "numpy<2.0.0"
-    ],
-    cmdclass=dict(build_ext=CMakeBuild),
-    zip_safe=False,
-    include_package_data=True,
-)
diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp
deleted file mode 100644
index c2d8554e1fc5e..0000000000000
--- a/kompute/src/Algorithm.cpp
+++ /dev/null
@@ -1,418 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#include <fstream>
-
-#include "kompute/Algorithm.hpp"
-
-namespace kp {
-
-Algorithm::~Algorithm()
-{
-    KP_LOG_DEBUG("Kompute Algorithm Destructor started");
-
-    this->destroy();
-}
-
-bool
-Algorithm::isInit()
-{
-    return this->mPipeline && this->mPipelineCache && this->mPipelineLayout &&
-           this->mDescriptorPool && this->mDescriptorSet &&
-           this->mDescriptorSetLayout && this->mShaderModule;
-}
-
-void
-Algorithm::destroy()
-{
-    // We don't have to free memory on destroy as it's freed by the
-    // commandBuffer destructor if (this->mPushConstantsData) {
-    //     free(this->mPushConstantsData);
-    // }
-    // if (this->mSpecializationConstantsData) {
-    //     free(this->mSpecializationConstantsData);
-    // }
-
-    if (!this->mDevice) {
-        KP_LOG_WARN("Kompute Algorithm destroy function reached with null "
-                    "Device pointer");
-        return;
-    }
-
-    if (this->mFreePipeline && this->mPipeline) {
-        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline");
-        if (!this->mPipeline) {
-            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
-                        "pipeline but it is null");
-        }
-        this->mDevice->destroy(
-          *this->mPipeline,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mPipeline = nullptr;
-    }
-
-    if (this->mFreePipelineLayout && this->mPipelineLayout) {
-        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout");
-        if (!this->mPipelineLayout) {
-            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
-                        "pipeline layout but it is null");
-        }
-        this->mDevice->destroy(
-          *this->mPipelineLayout,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mPipelineLayout = nullptr;
-    }
-
-    if (this->mFreeShaderModule && this->mShaderModule) {
-        KP_LOG_DEBUG("Kompute Algorithm Destroying shader module");
-        if (!this->mShaderModule) {
-            KP_LOG_WARN("Kompute Algorithm Error requested to destroy shader "
-                        "module but it is null");
-        }
-        this->mDevice->destroy(
-          *this->mShaderModule,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mShaderModule = nullptr;
-    }
-
-    freeParameters();
-}
-
-void
-Algorithm::freeParameters()
-{
-    if (this->mFreeDescriptorSetLayout && this->mDescriptorSetLayout) {
-        KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Set Layout");
-        if (!this->mDescriptorSetLayout) {
-            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
-                        "descriptor set layout but it is null");
-        }
-        this->mDevice->destroy(
-          *this->mDescriptorSetLayout,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mDescriptorSetLayout = nullptr;
-    }
-}
-
-void
-Algorithm::createParameters()
-{
-    KP_LOG_DEBUG("Kompute Algorithm createParameters started");
-    if (!*this->mDescriptorPool) {
-        KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
-        return;
-    }
-
-    std::vector<vk::DescriptorSetLayoutBinding> descriptorSetBindings;
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        descriptorSetBindings.push_back(
-          vk::DescriptorSetLayoutBinding(i, // Binding index
-                                         vk::DescriptorType::eStorageBuffer,
-                                         1, // Descriptor count
-                                         vk::ShaderStageFlagBits::eCompute));
-    }
-
-    // This is the component that is fed into the pipeline
-    vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo(
-      vk::DescriptorSetLayoutCreateFlags(),
-      static_cast<uint32_t>(descriptorSetBindings.size()),
-      descriptorSetBindings.data());
-
-    KP_LOG_DEBUG("Kompute Algorithm creating descriptor set layout");
-    this->mDescriptorSetLayout = std::make_shared<vk::DescriptorSetLayout>();
-    vk::Result result = this->mDevice->createDescriptorSetLayout(
-      &descriptorSetLayoutInfo, nullptr, this->mDescriptorSetLayout.get());
-
-   if (result != vk::Result::eSuccess) {
-        KP_LOG_ERROR("Failed to create descriptor set layout. Error code: {}", vk::to_string(result));
-    } else {
-        this->mFreeDescriptorSetLayout = true;
-        KP_LOG_DEBUG("Successfully allocated descriptor set layout.");
-    }
-
-    vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
-      *this->mDescriptorPool,
-      1, // Descriptor set layout count
-      this->mDescriptorSetLayout.get());
-
-    KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
-    this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
-    result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
-                                          this->mDescriptorSet.get());
-
-    if (result != vk::Result::eSuccess) {
-        KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
-    } else {
-        this->mFreeDescriptorSet = true;
-        KP_LOG_DEBUG("Successfully allocated descriptor sets.");
-    }
-
-    this->mFreeDescriptorSet = true;
-
-    KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
-
-        vk::DescriptorBufferInfo descriptorBufferInfo =
-          this->mTensors[i]->constructDescriptorBufferInfo();
-
-        computeWriteDescriptorSets.push_back(
-          vk::WriteDescriptorSet(*this->mDescriptorSet,
-                                 i, // Destination binding
-                                 0, // Destination array element
-                                 1, // Descriptor count
-                                 vk::DescriptorType::eStorageBuffer,
-                                 nullptr, // Descriptor image info
-                                 &descriptorBufferInfo));
-
-        this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
-                                            nullptr);
-    }
-
-    KP_LOG_DEBUG("Kompute Algorithm successfully run init");
-}
-
-void
-Algorithm::updateParameters()
-{
-    KP_LOG_DEBUG("Kompute Algorithm updateParameters started");
-    if (!*this->mDescriptorPool) {
-        KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
-        return;
-    }
-
-    vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
-      *this->mDescriptorPool,
-      1, // Descriptor set layout count
-      this->mDescriptorSetLayout.get());
-
-    KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
-    this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
-    vk::Result result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
-                                          this->mDescriptorSet.get());
-
-    if (result != vk::Result::eSuccess) {
-        KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
-    } else {
-        this->mFreeDescriptorSet = true;
-        KP_LOG_DEBUG("Successfully allocated descriptor sets.");
-    }
-
-    this->mFreeDescriptorSet = true;
-
-    KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
-
-        vk::DescriptorBufferInfo descriptorBufferInfo =
-          this->mTensors[i]->constructDescriptorBufferInfo();
-
-        computeWriteDescriptorSets.push_back(
-          vk::WriteDescriptorSet(*this->mDescriptorSet,
-                                 i, // Destination binding
-                                 0, // Destination array element
-                                 1, // Descriptor count
-                                 vk::DescriptorType::eStorageBuffer,
-                                 nullptr, // Descriptor image info
-                                 &descriptorBufferInfo));
-
-        this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
-                                            nullptr);
-    }
-
-    KP_LOG_DEBUG("Kompute Algorithm successfully run init");
-}
-
-void
-Algorithm::createShaderModule()
-{
-    KP_LOG_DEBUG("Kompute Algorithm createShaderModule started");
-
-    vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(),
-                                                sizeof(uint32_t) *
-                                                  this->mSpirv.size(),
-                                                this->mSpirv.data());
-
-    KP_LOG_DEBUG("Kompute Algorithm Creating shader module. ShaderFileSize: {}",
-                 this->mSpirv.size());
-    this->mFreeShaderModule = true;
-    this->mShaderModule = std::make_shared<vk::ShaderModule>();
-    this->mDevice->createShaderModule(
-      &shaderModuleInfo, nullptr, this->mShaderModule.get());
-    this->mFreeShaderModule = true;
-
-    KP_LOG_DEBUG("Kompute Algorithm create shader module success");
-}
-
-void
-Algorithm::createPipeline()
-{
-    KP_LOG_DEBUG("Kompute Algorithm calling create Pipeline");
-
-    vk::PipelineLayoutCreateInfo pipelineLayoutInfo(
-      vk::PipelineLayoutCreateFlags(),
-      1, // Set layout count
-      this->mDescriptorSetLayout.get());
-
-    vk::PushConstantRange pushConstantRange;
-    if (this->mPushConstantsSize) {
-        pushConstantRange.setStageFlags(vk::ShaderStageFlagBits::eCompute);
-        pushConstantRange.setOffset(0);
-        pushConstantRange.setSize(this->mPushConstantsDataTypeMemorySize *
-                                  this->mPushConstantsSize);
-
-        pipelineLayoutInfo.setPushConstantRangeCount(1);
-        pipelineLayoutInfo.setPPushConstantRanges(&pushConstantRange);
-    }
-
-    this->mPipelineLayout = std::make_shared<vk::PipelineLayout>();
-    this->mDevice->createPipelineLayout(
-      &pipelineLayoutInfo, nullptr, this->mPipelineLayout.get());
-    this->mFreePipelineLayout = true;
-
-    std::vector<vk::SpecializationMapEntry> specializationEntries;
-
-    for (uint32_t i = 0; i < this->mSpecializationConstantsSize; i++) {
-        vk::SpecializationMapEntry specializationEntry(
-          static_cast<uint32_t>(i),
-          static_cast<uint32_t>(
-            this->mSpecializationConstantsDataTypeMemorySize * i),
-          this->mSpecializationConstantsDataTypeMemorySize);
-
-        specializationEntries.push_back(specializationEntry);
-    }
-
-    // This passes ownership of the memory so we remove ownership from
-    // specialization container by using "transferDataOwnership"
-    vk::SpecializationInfo specializationInfo(
-      static_cast<uint32_t>(specializationEntries.size()),
-      specializationEntries.data(),
-      this->mSpecializationConstantsDataTypeMemorySize *
-        this->mSpecializationConstantsSize,
-      this->mSpecializationConstantsData);
-
-    vk::PipelineShaderStageCreateInfo shaderStage(
-      vk::PipelineShaderStageCreateFlags(),
-      vk::ShaderStageFlagBits::eCompute,
-      *this->mShaderModule,
-      "main",
-      &specializationInfo);
-
-    vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(),
-                                               shaderStage,
-                                               *this->mPipelineLayout,
-                                               vk::Pipeline(),
-                                               0);
-
-#ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
-    vk::ResultValue<vk::Pipeline> pipelineResult =
-      this->mDevice->createComputePipeline(*mPipelineCache, pipelineInfo);
-
-    if (pipelineResult.result != vk::Result::eSuccess) {
-        throw std::runtime_error("Failed to create pipeline result: " +
-                                 vk::to_string(pipelineResult.result));
-    }
-
-    vk::Pipeline& pipeline = pipelineResult.value;
-    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
-    this->mFreePipeline = true;
-#else
-    vk::Pipeline pipeline =
-      this->mDevice->createComputePipeline(*mPipelineCache, pipelineInfo)
-        .value;
-    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
-    this->mFreePipeline = true;
-#endif
-
-    // TODO: Update to consistent
-    // this->mPipeline = std::make_shared<vk::Pipeline>();
-    // this->mDevice->createComputePipelines(
-    //         *this->mPipelineCache, 1, &pipelineInfo, nullptr,
-    //         this->mPipeline.get());
-
-    KP_LOG_DEBUG("Kompute Algorithm Create Pipeline Success");
-}
-
-void
-Algorithm::recordBindCore(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute Algorithm binding pipeline");
-
-    commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute,
-                               *this->mPipeline);
-
-    KP_LOG_DEBUG("Kompute Algorithm binding descriptor sets");
-
-    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                     *this->mPipelineLayout,
-                                     0, // First set
-                                     *this->mDescriptorSet,
-                                     nullptr // Dispatcher
-    );
-}
-
-void
-Algorithm::recordBindPush(const vk::CommandBuffer& commandBuffer)
-{
-    if (this->mPushConstantsSize) {
-        KP_LOG_DEBUG("Kompute Algorithm binding push constants memory size: {}",
-                     this->mPushConstantsSize *
-                       this->mPushConstantsDataTypeMemorySize);
-
-        commandBuffer.pushConstants(*this->mPipelineLayout,
-                                    vk::ShaderStageFlagBits::eCompute,
-                                    0,
-                                    this->mPushConstantsSize *
-                                      this->mPushConstantsDataTypeMemorySize,
-                                    this->mPushConstantsData);
-    }
-}
-
-void
-Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute Algorithm recording dispatch");
-
-    commandBuffer.dispatch(
-      this->mWorkgroup[0], this->mWorkgroup[1], this->mWorkgroup[2]);
-}
-
-void
-Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize)
-{
-    KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size");
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (workgroup[0] > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mWorkgroup = { workgroup[0],
-                             workgroup[1] > 0 ? workgroup[1] : 1,
-                             workgroup[2] > 0 ? workgroup[2] : 1 };
-    } else {
-        this->mWorkgroup = { minSize, 1, 1 };
-    }
-
-    KP_LOG_INFO("Kompute OpAlgoCreate set dispatch size X: {}, Y: {}, Z: {}",
-                this->mWorkgroup[0],
-                this->mWorkgroup[1],
-                this->mWorkgroup[2]);
-}
-
-const Workgroup&
-Algorithm::getWorkgroup()
-{
-    return this->mWorkgroup;
-}
-
-const std::vector<std::shared_ptr<Tensor>>&
-Algorithm::getTensors()
-{
-    return this->mTensors;
-}
-
-void Algorithm::setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors)
-{
-    this->mTensors = tensors;
-}
-
-}
diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
deleted file mode 100644
index 42b7d07f5e5b9..0000000000000
--- a/kompute/src/CMakeLists.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-cmake_minimum_required(VERSION 3.20)
-
-if(KOMPUTE_OPT_ANDROID_BUILD)
-    find_library(android android)
-endif()
-
-cmake_minimum_required(VERSION 3.20)
-
-add_library(kompute STATIC Algorithm.cpp
-    Manager.cpp
-    OpAlgoDispatch.cpp
-    OpMemoryBarrier.cpp
-    OpTensorCopy.cpp
-    OpTensorFill.cpp
-    OpTensorSyncDevice.cpp
-    OpTensorSyncLocal.cpp
-    OpBufferSyncDevice.cpp
-    OpBufferSyncLocal.cpp
-    Sequence.cpp
-    Tensor.cpp
-    Core.cpp)
-
-add_library(kompute::kompute ALIAS kompute)
-
-# Set version for shared libraries.
-set_target_properties(kompute
-    PROPERTIES
-    VERSION ${${PROJECT_NAME}_VERSION}
-    SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR}
-    POSITION_INDEPENDENT_CODE TRUE)
-
-# Import GNU common install directory variables
-include(GNUInstallDirs)
-
-install(TARGETS kompute
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
-
-# Include CMake helpers for package config files
-# Follow this installation guideline: https://cmake.org/cmake/help/latest/manual/cmake-packages.7.html
-include(CMakePackageConfigHelpers)
-
-configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in
-    "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake"
-    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
-
-#install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake
-#    ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
-
-# ####################################################
-# Linking
-# ####################################################
-if(KOMPUTE_OPT_ANDROID_BUILD)
-    target_link_libraries(kompute PUBLIC vulkanAndroid
-        android
-        kp_logger
-        kp_shader
-        fmt::fmt-header-only)
-else()
-    target_link_libraries(kompute PUBLIC
-        kp_logger
-        kp_shader
-        fmt::fmt-header-only)
-endif()
-
-if(KOMPUTE_OPT_BUILD_PYTHON)
-    include_directories(${PYTHON_INCLUDE_DIRS})
-
-    target_link_libraries(kompute PRIVATE pybind11::headers ${PYTHON_LIBRARIES})
-endif()
-
-if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
-    target_link_libraries(kompute PUBLIC Vulkan-Headers)
-else()
-    target_link_libraries(kompute PUBLIC Vulkan::Headers)
-endif()
-
-# ####################################################
-# Misc
-# ####################################################
-add_subdirectory(logger)
-add_subdirectory(shaders)
-add_subdirectory(include)
diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp
deleted file mode 100644
index 020f441604022..0000000000000
--- a/kompute/src/Core.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/Core.hpp"
-
-#ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
-#define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
-/**
- * Ensures support for dynamic loading of Vulkan functions on Android.
- * Acts as a default store for loaded functions.
- * More information:
- * https://github.com/KhronosGroup/Vulkan-Hpp#vulkan_hpp_default_dispatcher
- **/
-VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
-#endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
-
-namespace kp {
-} // namespace kp
diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
deleted file mode 100644
index 0c588e19be7b9..0000000000000
--- a/kompute/src/Manager.cpp
+++ /dev/null
@@ -1,512 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/Manager.hpp"
-#include "fmt/format.h"
-#include "kompute/logger/Logger.hpp"
-#include <fmt/core.h>
-#include <iterator>
-#include <set>
-#include <sstream>
-#include <string>
-
-namespace kp {
-
-#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
-static VKAPI_ATTR VkBool32 VKAPI_CALL
-debugMessageCallback(VkDebugReportFlagsEXT /*flags*/,
-                     VkDebugReportObjectTypeEXT /*objectType*/,
-                     uint64_t /*object*/,
-                     size_t /*location*/,
-                     int32_t /*messageCode*/,
-#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
-                     const char* pLayerPrefix,
-                     const char* pMessage,
-#else
-                     const char* /*pLayerPrefix*/,
-                     const char* /*pMessage*/,
-#endif
-                     void* /*pUserData*/)
-{
-    KP_LOG_DEBUG("[VALIDATION]: {} - {}", pLayerPrefix, pMessage);
-    return VK_FALSE;
-}
-#endif
-
-Manager::Manager()
-{
-    this->mManageResources = true;
-
-// Make sure the logger is setup
-#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
-    logger::setupLogger();
-#endif
-    this->createInstance();
-}
-
-void Manager::initializeDevice(uint32_t physicalDeviceIndex,
-                               const std::vector<uint32_t>& familyQueueIndices,
-                               const std::vector<std::string>& desiredExtensions)
-{
-    this->createDevice(
-      familyQueueIndices, physicalDeviceIndex, desiredExtensions);
-}
-
-Manager::~Manager()
-{
-    KP_LOG_DEBUG("Kompute Manager Destructor started");
-    this->destroy();
-}
-
-void
-Manager::destroy()
-{
-
-    KP_LOG_DEBUG("Kompute Manager destroy() started");
-
-    if (this->mDevice == nullptr) {
-        KP_LOG_ERROR(
-          "Kompute Manager destructor reached with null Device pointer");
-        return;
-    }
-
-    if (this->mManageResources && this->mManagedSequences.size()) {
-        KP_LOG_DEBUG("Kompute Manager explicitly running destructor for "
-                     "managed sequences");
-        for (const std::weak_ptr<Sequence>& weakSq : this->mManagedSequences) {
-            if (std::shared_ptr<Sequence> sq = weakSq.lock()) {
-                sq->destroy();
-            }
-        }
-        this->mManagedSequences.clear();
-    }
-
-    if (this->mManageResources && !this->mManagedAlgorithmsMap.empty()) {
-        KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms");
-        for (const auto& kv : this->mManagedAlgorithmsMap) {
-            if (std::shared_ptr<Algorithm> algorithm = kv.second) {
-                algorithm->destroy();
-            }
-        }
-        this->mManagedAlgorithmsMap.clear();
-    }
-
-    if (this->mManageResources && this->mManagedTensors.size()) {
-        KP_LOG_DEBUG("Kompute Manager explicitly freeing tensors");
-        for (const std::weak_ptr<Tensor>& weakTensor : this->mManagedTensors) {
-            if (std::shared_ptr<Tensor> tensor = weakTensor.lock()) {
-                tensor->destroy();
-            }
-        }
-        this->mManagedTensors.clear();
-    }
-
-    if (this->mPipelineCache) {
-        KP_LOG_DEBUG("Kompute Manager Destroying pipeline cache");
-        if (!this->mPipelineCache) {
-            KP_LOG_WARN("Kompute Manager Error requested to destroy "
-                        "pipeline cache but it is null");
-        }
-        this->mDevice->destroy(
-          *this->mPipelineCache,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mPipelineCache = nullptr;
-    }
-
-    if (this->mFreeDevice) {
-        KP_LOG_INFO("Destroying device");
-        this->mDevice->destroy(
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mDevice = nullptr;
-        KP_LOG_DEBUG("Kompute Manager Destroyed Device");
-    }
-
-    if (this->mInstance == nullptr) {
-        KP_LOG_ERROR(
-          "Kompute Manager destructor reached with null Instance pointer");
-        return;
-    }
-
-#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
-    if (this->mDebugReportCallback) {
-        this->mInstance->destroyDebugReportCallbackEXT(
-          this->mDebugReportCallback, nullptr, this->mDebugDispatcher);
-        KP_LOG_DEBUG("Kompute Manager Destroyed Debug Report Callback");
-    }
-#endif
-
-    if (this->mFreeInstance) {
-        this->mInstance->destroy(
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mInstance = nullptr;
-        KP_LOG_DEBUG("Kompute Manager Destroyed Instance");
-    }
-}
-
-void
-Manager::createInstance()
-{
-
-    KP_LOG_DEBUG("Kompute Manager creating instance");
-
-    this->mFreeInstance = true;
-
-    vk::ApplicationInfo applicationInfo;
-    applicationInfo.pApplicationName = "Kompute";
-    applicationInfo.pEngineName = "Kompute";
-    applicationInfo.apiVersion = KOMPUTE_VK_API_VERSION;
-    applicationInfo.engineVersion = KOMPUTE_VK_API_VERSION;
-    applicationInfo.applicationVersion = KOMPUTE_VK_API_VERSION;
-
-    std::vector<const char*> applicationExtensions;
-
-#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
-    applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME);
-#endif
-
-    vk::InstanceCreateInfo computeInstanceCreateInfo;
-    computeInstanceCreateInfo.pApplicationInfo = &applicationInfo;
-    if (!applicationExtensions.empty()) {
-        computeInstanceCreateInfo.enabledExtensionCount =
-          (uint32_t)applicationExtensions.size();
-        computeInstanceCreateInfo.ppEnabledExtensionNames =
-          applicationExtensions.data();
-    }
-
-    try {
-        mDynamicLoader = std::make_shared<vk::DynamicLoader>();
-    } catch (const std::exception & err) {
-        return;
-    }
-
-    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
-      mDynamicLoader->getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
-    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
-
-#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
-    KP_LOG_DEBUG("Kompute Manager adding debug validation layers");
-    // We'll identify the layers that are supported
-    std::vector<const char*> validLayerNames;
-    std::vector<const char*> desiredLayerNames = {
-        "VK_LAYER_LUNARG_assistant_layer",
-        "VK_LAYER_LUNARG_standard_validation",
-        "VK_LAYER_KHRONOS_validation",
-    };
-    std::vector<std::string> envLayerNames;
-    const char* envLayerNamesVal = std::getenv("KOMPUTE_ENV_DEBUG_LAYERS");
-    if (envLayerNamesVal != nullptr && *envLayerNamesVal != '\0') {
-        KP_LOG_DEBUG("Kompute Manager adding environment layers: {}",
-                     envLayerNamesVal);
-        std::istringstream iss(envLayerNamesVal);
-        std::istream_iterator<std::string> beg(iss);
-        std::istream_iterator<std::string> end;
-        envLayerNames = std::vector<std::string>(beg, end);
-        for (const std::string& layerName : envLayerNames) {
-            desiredLayerNames.push_back(layerName.c_str());
-        }
-        KP_LOG_DEBUG("Desired layers: {}", fmt::join(desiredLayerNames, ", "));
-    }
-
-    // Identify the valid layer names based on the desiredLayerNames
-    {
-        std::set<std::string> uniqueLayerNames;
-        std::vector<vk::LayerProperties> availableLayerProperties =
-          vk::enumerateInstanceLayerProperties();
-        for (vk::LayerProperties layerProperties : availableLayerProperties) {
-            std::string layerName(layerProperties.layerName.data());
-            uniqueLayerNames.insert(layerName);
-        }
-        KP_LOG_DEBUG("Available layers: {}", fmt::join(uniqueLayerNames, ", "));
-        for (const char* desiredLayerName : desiredLayerNames) {
-            if (uniqueLayerNames.count(desiredLayerName) != 0) {
-                validLayerNames.push_back(desiredLayerName);
-            }
-        }
-    }
-
-    if (!validLayerNames.empty()) {
-        KP_LOG_DEBUG(
-          "Kompute Manager Initializing instance with valid layers: {}",
-          fmt::join(validLayerNames, ", "));
-        computeInstanceCreateInfo.enabledLayerCount =
-          static_cast<uint32_t>(validLayerNames.size());
-        computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data();
-    } else {
-        KP_LOG_WARN("Kompute Manager no valid layer names found from desired "
-                    "layer names");
-    }
-#endif
-
-    this->mInstance = std::make_shared<vk::Instance>();
-    vk::Result r = vk::createInstance(
-      &computeInstanceCreateInfo, nullptr, this->mInstance.get());
-    if (r != vk::Result::eSuccess) {
-        KP_LOG_ERROR(
-          "Kompute Manager Error allocating vulkan instance", vk::to_string(r));
-        this->mInstance = nullptr;
-        this->mFreeInstance = false;
-        return;
-    }
-
-    VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance);
-
-    KP_LOG_DEBUG("Kompute Manager Instance Created");
-
-#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
-    KP_LOG_DEBUG("Kompute Manager adding debug callbacks");
-    if (validLayerNames.size() > 0) {
-        vk::DebugReportFlagsEXT debugFlags =
-          vk::DebugReportFlagBitsEXT::eError |
-          vk::DebugReportFlagBitsEXT::eWarning;
-        vk::DebugReportCallbackCreateInfoEXT debugCreateInfo = {};
-        debugCreateInfo.pfnCallback =
-          (PFN_vkDebugReportCallbackEXT)debugMessageCallback;
-        debugCreateInfo.flags = debugFlags;
-
-        this->mDebugDispatcher.init(*this->mInstance, vkGetInstanceProcAddr);
-        this->mDebugReportCallback =
-          this->mInstance->createDebugReportCallbackEXT(
-            debugCreateInfo, nullptr, this->mDebugDispatcher);
-    }
-#endif
-}
-
-void
-Manager::clear()
-{
-    if (this->mManageResources) {
-        this->mManagedTensors.erase(
-          std::remove_if(begin(this->mManagedTensors),
-                         end(this->mManagedTensors),
-                         [](std::weak_ptr<Tensor> t) { return t.expired(); }),
-          end(this->mManagedTensors));
-        for (auto it = this->mManagedAlgorithmsMap.begin();
-             it != this->mManagedAlgorithmsMap.end();) {
-            if (it->second) {
-                it = this->mManagedAlgorithmsMap.erase(it);
-            } else {
-                ++it;
-            }
-        }
-        this->mManagedSequences.erase(
-          std::remove_if(begin(this->mManagedSequences),
-                         end(this->mManagedSequences),
-                         [](std::weak_ptr<Sequence> t) { return t.expired(); }),
-          end(this->mManagedSequences));
-    }
-}
-
-void
-Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
-                      uint32_t physicalDeviceIndex,
-                      const std::vector<std::string>& desiredExtensions)
-{
-
-    KP_LOG_DEBUG("Kompute Manager creating Device");
-
-    if (this->mInstance == nullptr) {
-        throw std::runtime_error("Kompute Manager instance is null");
-    }
-
-    this->mFreeDevice = true;
-
-    // Getting an integer that says how many vuklan devices we have
-    std::vector<vk::PhysicalDevice> physicalDevices =
-      this->mInstance->enumeratePhysicalDevices();
-    uint32_t deviceCount = physicalDevices.size();
-
-    // This means there are no devices at all
-    if (deviceCount == 0) {
-        throw std::runtime_error("Failed to find GPUs with Vulkan support! "
-                                 "Maybe you haven't installed vulkan drivers?");
-    }
-
-    // This means that we're exceeding our device limit, for
-    // example if we have 2 devices, just physicalDeviceIndex
-    // 0 and 1 are acceptable. Hence, physicalDeviceIndex should
-    // always be less than deviceCount, else we raise an error
-    if (!(deviceCount > physicalDeviceIndex)) {
-        throw std::runtime_error("There is no such physical index or device, "
-                                 "please use your existing device");
-    }
-
-    vk::PhysicalDevice physicalDevice = physicalDevices[physicalDeviceIndex];
-
-    this->mPhysicalDevice =
-      std::make_shared<vk::PhysicalDevice>(physicalDevice);
-
-#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
-    vk::PhysicalDeviceProperties physicalDeviceProperties =
-      physicalDevice.getProperties();
-#endif
-
-    KP_LOG_INFO("Using physical device index {} found {}",
-                physicalDeviceIndex,
-                physicalDeviceProperties.deviceName.data());
-
-    if (familyQueueIndices.empty()) {
-        // Find compute queue
-        std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
-          physicalDevice.getQueueFamilyProperties();
-
-        uint32_t computeQueueFamilyIndex = 0;
-        bool computeQueueSupported = false;
-        for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
-            vk::QueueFamilyProperties queueFamilyProperties =
-              allQueueFamilyProperties[i];
-
-            if (queueFamilyProperties.queueFlags &
-                vk::QueueFlagBits::eCompute) {
-                computeQueueFamilyIndex = i;
-                computeQueueSupported = true;
-                break;
-            }
-        }
-
-        if (!computeQueueSupported) {
-            throw std::runtime_error("Compute queue is not supported");
-        }
-
-        this->mComputeQueueFamilyIndices.push_back(computeQueueFamilyIndex);
-    } else {
-        this->mComputeQueueFamilyIndices = familyQueueIndices;
-    }
-
-    std::unordered_map<uint32_t, uint32_t> familyQueueCounts;
-    std::unordered_map<uint32_t, std::vector<float>> familyQueuePriorities;
-    for (const auto& value : this->mComputeQueueFamilyIndices) {
-        familyQueueCounts[value]++;
-        familyQueuePriorities[value].push_back(1.0f);
-    }
-
-    std::unordered_map<uint32_t, uint32_t> familyQueueIndexCount;
-    std::vector<vk::DeviceQueueCreateInfo> deviceQueueCreateInfos;
-    for (const auto& familyQueueInfo : familyQueueCounts) {
-        // Setting the device count to 0
-        familyQueueIndexCount[familyQueueInfo.first] = 0;
-
-        // Creating the respective device queue
-        vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
-          vk::DeviceQueueCreateFlags(),
-          familyQueueInfo.first,
-          familyQueueInfo.second,
-          familyQueuePriorities[familyQueueInfo.first].data());
-        deviceQueueCreateInfos.push_back(deviceQueueCreateInfo);
-    }
-
-    KP_LOG_DEBUG("Kompute Manager desired extension layers {}",
-                 fmt::join(desiredExtensions, ", "));
-
-    std::vector<vk::ExtensionProperties> deviceExtensions =
-      this->mPhysicalDevice->enumerateDeviceExtensionProperties();
-
-    std::set<std::string> uniqueExtensionNames;
-    for (const vk::ExtensionProperties& ext : deviceExtensions) {
-        uniqueExtensionNames.insert(ext.extensionName);
-    }
-    KP_LOG_DEBUG("Kompute Manager available extensions {}",
-                 fmt::join(uniqueExtensionNames, ", "));
-    std::vector<const char*> validExtensions;
-    for (const std::string& ext : desiredExtensions) {
-        if (uniqueExtensionNames.count(ext) != 0) {
-            validExtensions.push_back(ext.c_str());
-        }
-    }
-    if (desiredExtensions.size() != validExtensions.size()) {
-        KP_LOG_ERROR("Kompute Manager not all extensions were added: {}",
-                     fmt::join(validExtensions, ", "));
-    }
-
-    vk::PhysicalDeviceFeatures features;
-    features.shaderInt16 = true;
-
-    vk::PhysicalDeviceVulkan11Features features11;
-    features11.uniformAndStorageBuffer16BitAccess = true;
-    features11.storageBuffer16BitAccess = true;
-    features11.pNext = nullptr;
-
-    vk::PhysicalDeviceVulkan12Features features12;
-    features12.storageBuffer8BitAccess = true;
-    features12.uniformAndStorageBuffer8BitAccess = true;
-    features12.shaderFloat16 = true;
-    features12.shaderInt8 = true;
-    features12.pNext = &features11;
-
-    vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(),
-                                          deviceQueueCreateInfos.size(),
-                                          deviceQueueCreateInfos.data(),
-                                          {},
-                                          {},
-                                          validExtensions.size(),
-                                          validExtensions.data(),
-                                          &features);
-
-    deviceCreateInfo.pNext = &features12;
-
-    this->mDevice = std::make_shared<vk::Device>();
-    vk::Result r = physicalDevice.createDevice(
-      &deviceCreateInfo, nullptr, this->mDevice.get());
-    if (r != vk::Result::eSuccess) {
-        KP_LOG_ERROR("Kompute Manager could not create device");
-    }
-
-    KP_LOG_DEBUG("Kompute Manager device created");
-
-    for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndices) {
-        std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();
-
-        this->mDevice->getQueue(familyQueueIndex,
-                                familyQueueIndexCount[familyQueueIndex],
-                                currQueue.get());
-
-        familyQueueIndexCount[familyQueueIndex]++;
-
-        this->mComputeQueues.push_back(currQueue);
-    }
-
-    KP_LOG_DEBUG("Kompute Manager compute queue obtained");
-
-    mPipelineCache = std::make_shared<vk::PipelineCache>();
-    vk::PipelineCacheCreateInfo pipelineCacheInfo =
-        vk::PipelineCacheCreateInfo();
-    this->mDevice->createPipelineCache(
-        &pipelineCacheInfo, nullptr, mPipelineCache.get());
-}
-
-std::shared_ptr<Sequence>
-Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps)
-{
-    KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex);
-
-    std::shared_ptr<Sequence> sq{ new kp::Sequence(
-      this->mPhysicalDevice,
-      this->mDevice,
-      this->mComputeQueues[queueIndex],
-      this->mComputeQueueFamilyIndices[queueIndex],
-      totalTimestamps) };
-
-    if (this->mManageResources) {
-        this->mManagedSequences.push_back(sq);
-    }
-
-    return sq;
-}
-
-vk::PhysicalDeviceProperties
-Manager::getDeviceProperties() const
-{
-    return this->mPhysicalDevice->getProperties();
-}
-
-std::vector<vk::PhysicalDevice>
-Manager::listDevices() const
-{
-    return this->mInstance->enumeratePhysicalDevices();
-}
-
-std::shared_ptr<vk::Instance>
-Manager::getVkInstance() const
-{
-    return this->mInstance;
-}
-
-}
diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp
deleted file mode 100644
index edc0f6eb63448..0000000000000
--- a/kompute/src/OpAlgoDispatch.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/operations/OpAlgoDispatch.hpp"
-
-namespace kp {
-
-OpAlgoDispatch::~OpAlgoDispatch()
-{
-    KP_LOG_DEBUG("Kompute OpAlgoDispatch destructor started");
-
-    if (this->mPushConstantsData) {
-        KP_LOG_DEBUG("Kompute freeing push constants data");
-        free(this->mPushConstantsData);
-    }
-}
-
-void
-OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute OpAlgoDispatch record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (const std::shared_ptr<Tensor>& tensor :
-         this->mAlgorithm->getTensors()) {
-        tensor->recordPrimaryBufferMemoryBarrier(
-          commandBuffer,
-          vk::AccessFlagBits::eShaderWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eComputeShader,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    if (this->mPushConstantsSize) {
-        this->mAlgorithm->setPushConstants(
-          this->mPushConstantsData,
-          this->mPushConstantsSize,
-          this->mPushConstantsDataTypeMemorySize);
-    }
-
-    this->mAlgorithm->recordBindCore(commandBuffer);
-    this->mAlgorithm->recordBindPush(commandBuffer);
-    this->mAlgorithm->recordDispatch(commandBuffer);
-}
-
-void
-OpAlgoDispatch::preEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpAlgoDispatch preEval called");
-}
-
-void
-OpAlgoDispatch::postEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpAlgoDispatch postSubmit called");
-}
-
-}
diff --git a/kompute/src/OpBufferSyncDevice.cpp b/kompute/src/OpBufferSyncDevice.cpp
deleted file mode 100644
index 1812d04b2428e..0000000000000
--- a/kompute/src/OpBufferSyncDevice.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/operations/OpBufferSyncDevice.hpp"
-
-namespace kp {
-
-OpBufferSyncDevice::OpBufferSyncDevice(
-        vk::Buffer *primaryBuffer,
-        vk::Buffer *stagingBuffer,
-        vk::DeviceSize size)
-  : mPrimaryBuffer(primaryBuffer)
-  , mStagingBuffer(stagingBuffer)
-  , mSize(size)
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncDevice constructor with params");
-}
-
-OpBufferSyncDevice::~OpBufferSyncDevice()
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncDevice destructor started");
-}
-
-void
-OpBufferSyncDevice::record(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncDevice record called");
-    vk::BufferCopy copyRegion(0, 0, mSize);
-    commandBuffer.copyBuffer(*mStagingBuffer, *mPrimaryBuffer, copyRegion);
-}
-
-void
-OpBufferSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncDevice preEval called");
-}
-
-void
-OpBufferSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncDevice postEval called");
-}
-
-}
diff --git a/kompute/src/OpBufferSyncLocal.cpp b/kompute/src/OpBufferSyncLocal.cpp
deleted file mode 100644
index a829819fa603a..0000000000000
--- a/kompute/src/OpBufferSyncLocal.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/operations/OpBufferSyncLocal.hpp"
-
-namespace kp {
-
-OpBufferSyncLocal::OpBufferSyncLocal(
-        vk::Buffer *primaryBuffer,
-        vk::Buffer *stagingBuffer,
-        vk::DeviceSize size)
-  : mPrimaryBuffer(primaryBuffer)
-  , mStagingBuffer(stagingBuffer)
-  , mSize(size)
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncLocal constructor with params");
-}
-
-OpBufferSyncLocal::~OpBufferSyncLocal()
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncLocal destructor started");
-}
-
-void
-OpBufferSyncLocal::record(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncLocal record called");
-    vk::BufferCopy copyRegion(0, 0, mSize);
-    commandBuffer.copyBuffer(*mPrimaryBuffer, *mStagingBuffer, copyRegion);
-}
-
-void
-OpBufferSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncLocal preEval called");
-}
-
-void
-OpBufferSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpBufferSyncLocal postEval called");
-}
-
-}
diff --git a/kompute/src/OpMemoryBarrier.cpp b/kompute/src/OpMemoryBarrier.cpp
deleted file mode 100644
index 1f075a3c434e5..0000000000000
--- a/kompute/src/OpMemoryBarrier.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/operations/OpMemoryBarrier.hpp"
-
-namespace kp {
-
-OpMemoryBarrier::OpMemoryBarrier(
-  const std::vector<std::shared_ptr<Tensor>>& tensors,
-  const vk::AccessFlagBits& srcAccessMask,
-  const vk::AccessFlagBits& dstAccessMask,
-  const vk::PipelineStageFlagBits& srcStageMask,
-  const vk::PipelineStageFlagBits& dstStageMask,
-  bool barrierOnPrimary)
-  : mSrcAccessMask(srcAccessMask)
-  , mDstAccessMask(dstAccessMask)
-  , mSrcStageMask(srcStageMask)
-  , mDstStageMask(dstStageMask)
-  , mBarrierOnPrimary(barrierOnPrimary)
-  , mTensors(tensors)
-{
-    KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor");
-}
-
-OpMemoryBarrier::~OpMemoryBarrier()
-{
-    KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started");
-}
-
-void
-OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute OpMemoryBarrier record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    if (this->mBarrierOnPrimary) {
-        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
-            tensor->recordPrimaryBufferMemoryBarrier(commandBuffer,
-                                                     this->mSrcAccessMask,
-                                                     this->mDstAccessMask,
-                                                     this->mSrcStageMask,
-                                                     this->mDstStageMask);
-        }
-    } else {
-        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
-            tensor->recordStagingBufferMemoryBarrier(commandBuffer,
-                                                     this->mSrcAccessMask,
-                                                     this->mDstAccessMask,
-                                                     this->mSrcStageMask,
-                                                     this->mDstStageMask);
-        }
-    }
-}
-
-void
-OpMemoryBarrier::preEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called");
-}
-
-void
-OpMemoryBarrier::postEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpMemoryBarrier postSubmit called");
-}
-
-}
diff --git a/kompute/src/OpTensorCopy.cpp b/kompute/src/OpTensorCopy.cpp
deleted file mode 100644
index 1eaf428b85556..0000000000000
--- a/kompute/src/OpTensorCopy.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/operations/OpTensorCopy.hpp"
-#include "kompute/Tensor.hpp"
-
-namespace kp {
-
-OpTensorCopy::OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors)
-{
-    KP_LOG_DEBUG("Kompute OpTensorCopy constructor with params");
-
-    this->mTensors = tensors;
-
-    if (this->mTensors.size() < 2) {
-        throw std::runtime_error(
-          "Kompute OpTensorCopy called with less than 2 tensor");
-    }
-
-    kp::Tensor::TensorDataTypes dataType = this->mTensors[0]->dataType();
-    uint32_t size = this->mTensors[0]->size();
-    for (const std::shared_ptr<Tensor>& tensor : tensors) {
-        if (tensor->dataType() != dataType) {
-            throw std::runtime_error(fmt::format(
-              "Attempting to copy tensors of different types from {} to {}",
-              Tensor::toString(dataType),
-              Tensor::toString(tensor->dataType())));
-        }
-        if (tensor->size() != size) {
-            throw std::runtime_error(fmt::format(
-              "Attempting to copy tensors of different sizes from {} to {}",
-              size,
-              tensor->size()));
-        }
-    }
-}
-
-OpTensorCopy::~OpTensorCopy()
-{
-    KP_LOG_DEBUG("Kompute OpTensorCopy destructor started");
-}
-
-void
-OpTensorCopy::record(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute OpTensorCopy record called");
-
-    // We iterate from the second tensor onwards and record a copy to all
-    for (size_t i = 1; i < this->mTensors.size(); i++) {
-        this->mTensors[i]->recordCopyFrom(commandBuffer, this->mTensors[0]);
-    }
-}
-
-void
-OpTensorCopy::preEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpTensorCopy preEval called");
-}
-
-void
-OpTensorCopy::postEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpTensorCopy postEval called");
-
-    // Do not copy on CPU side if source is storage tensor
-    if (this->mTensors[0]->tensorType() == kp::Tensor::TensorTypes::eStorage)
-    {
-        KP_LOG_DEBUG("Kompute OpTensorCopy not copying tensor source given it's of eStorage type");
-        return;
-    }
-    void* data = this->mTensors[0]->rawData();
-
-    // Copy the data from the first tensor into all the tensors
-    for (size_t i = 1; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == kp::Tensor::TensorTypes::eStorage) {
-            KP_LOG_DEBUG("Kompute OpTensorCopy not copying to tensor dest given it's of eStorage type");
-            continue;
-        }
-        this->mTensors[i]->setRawData(data);
-    }
-}
-
-}
diff --git a/kompute/src/OpTensorFill.cpp b/kompute/src/OpTensorFill.cpp
deleted file mode 100644
index bda7d6040eb21..0000000000000
--- a/kompute/src/OpTensorFill.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/operations/OpTensorFill.hpp"
-#include "kompute/Tensor.hpp"
-
-namespace kp {
-
-OpTensorFill::OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors)
-{
-    KP_LOG_DEBUG("Kompute OpTensorFill constructor with params");
-
-    if (tensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpTensorFill called with less than 1 tensor");
-    }
-
-    this->mTensors = tensors;
-}
-
-OpTensorFill::~OpTensorFill()
-{
-    KP_LOG_DEBUG("Kompute OpTensorFill destructor started");
-}
-
-void
-OpTensorFill::record(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute OpTensorFill record called");
-
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        this->mTensors[i]->recordFill(commandBuffer, 0);
-    }
-}
-
-void
-OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpTensorFill preEval called");
-}
-
-void
-OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpTensorFill postEval called");
-}
-
-}
diff --git a/kompute/src/OpTensorSyncDevice.cpp b/kompute/src/OpTensorSyncDevice.cpp
deleted file mode 100644
index b563529ea7822..0000000000000
--- a/kompute/src/OpTensorSyncDevice.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/operations/OpTensorSyncDevice.hpp"
-
-namespace kp {
-
-OpTensorSyncDevice::OpTensorSyncDevice(
-  const std::vector<std::shared_ptr<Tensor>>& tensors)
-  : mPrimaryBuffer(nullptr)
-  , mStagingBuffer(nullptr)
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncDevice constructor with params");
-
-    if (tensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpTensorSyncDevice called with less than 1 tensor");
-    }
-
-    this->mTensors = tensors;
-}
-
-OpTensorSyncDevice::~OpTensorSyncDevice()
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncDevice destructor started");
-
-    this->mTensors.clear();
-}
-
-void
-OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncDevice record called");
-
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer);
-        }
-    }
-}
-
-void
-OpTensorSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncDevice preEval called");
-}
-
-void
-OpTensorSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncDevice postEval called");
-}
-
-}
diff --git a/kompute/src/OpTensorSyncLocal.cpp b/kompute/src/OpTensorSyncLocal.cpp
deleted file mode 100644
index 7818db565aaa7..0000000000000
--- a/kompute/src/OpTensorSyncLocal.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/Tensor.hpp"
-
-#include "kompute/operations/OpTensorSyncLocal.hpp"
-
-namespace kp {
-
-OpTensorSyncLocal::OpTensorSyncLocal(
-  const std::vector<std::shared_ptr<Tensor>>& tensors)
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncLocal constructor with params");
-
-    if (tensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpTensorSyncLocal called with less than 1 tensor");
-    }
-
-    this->mTensors = tensors;
-}
-
-OpTensorSyncLocal::~OpTensorSyncLocal()
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncLocal destructor started");
-}
-
-void
-OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncLocal record called");
-
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-
-            this->mTensors[i]->recordPrimaryBufferMemoryBarrier(
-              commandBuffer,
-              vk::AccessFlagBits::eShaderWrite,
-              vk::AccessFlagBits::eTransferRead,
-              vk::PipelineStageFlagBits::eComputeShader,
-              vk::PipelineStageFlagBits::eTransfer);
-
-            this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer);
-
-            this->mTensors[i]->recordPrimaryBufferMemoryBarrier(
-              commandBuffer,
-              vk::AccessFlagBits::eTransferWrite,
-              vk::AccessFlagBits::eHostRead,
-              vk::PipelineStageFlagBits::eTransfer,
-              vk::PipelineStageFlagBits::eHost);
-        }
-    }
-}
-
-void
-OpTensorSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncLocal preEval called");
-}
-
-void
-OpTensorSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
-{
-    KP_LOG_DEBUG("Kompute OpTensorSyncLocal postEval called");
-
-    KP_LOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
-}
-
-}
diff --git a/kompute/src/Sequence.cpp b/kompute/src/Sequence.cpp
deleted file mode 100644
index da3b379a3104c..0000000000000
--- a/kompute/src/Sequence.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/Sequence.hpp"
-
-namespace kp {
-
-Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                   std::shared_ptr<vk::Device> device,
-                   std::shared_ptr<vk::Queue> computeQueue,
-                   uint32_t queueIndex,
-                   uint32_t totalTimestamps)
-{
-    KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue");
-
-    this->mPhysicalDevice = physicalDevice;
-    this->mDevice = device;
-    this->mComputeQueue = computeQueue;
-    this->mQueueIndex = queueIndex;
-
-    this->createCommandPool();
-    this->createCommandBuffer();
-    if (totalTimestamps > 0)
-        this->createTimestampQueryPool(totalTimestamps +
-                                       1); //+1 for the first one
-}
-
-Sequence::~Sequence()
-{
-    KP_LOG_DEBUG("Kompute Sequence Destructor started");
-
-    if (this->mDevice) {
-        this->destroy();
-    }
-}
-
-void
-Sequence::begin()
-{
-    KP_LOG_DEBUG("Kompute sequence called BEGIN");
-
-    if (this->isRecording()) {
-        KP_LOG_DEBUG("Kompute Sequence begin called when already recording");
-        return;
-    }
-
-    if (this->isRunning()) {
-        throw std::runtime_error(
-          "Kompute Sequence begin called when sequence still running");
-    }
-
-    KP_LOG_INFO("Kompute Sequence command now started recording");
-    this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
-    this->mRecording = true;
-
-    // latch the first timestamp before any commands are submitted
-    if (this->timestampQueryPool)
-        this->mCommandBuffer->writeTimestamp(
-          vk::PipelineStageFlagBits::eAllCommands,
-          *this->timestampQueryPool,
-          0);
-}
-
-void
-Sequence::end()
-{
-    KP_LOG_DEBUG("Kompute Sequence calling END");
-
-    if (this->isRunning()) {
-        throw std::runtime_error(
-          "Kompute Sequence begin called when sequence still running");
-    }
-
-    if (!this->isRecording()) {
-        KP_LOG_WARN("Kompute Sequence end called when not recording");
-        return;
-    } else {
-        KP_LOG_INFO("Kompute Sequence command recording END");
-        this->mCommandBuffer->end();
-        this->mRecording = false;
-    }
-}
-
-void
-Sequence::clear()
-{
-    KP_LOG_DEBUG("Kompute Sequence calling clear");
-    if (this->isRecording()) {
-        this->end();
-    }
-}
-
-std::shared_ptr<Sequence>
-Sequence::eval()
-{
-    KP_LOG_DEBUG("Kompute sequence EVAL BEGIN");
-
-    return this->evalAsync()->evalAwait();
-}
-
-std::shared_ptr<Sequence>
-Sequence::eval(std::shared_ptr<OpBase> op)
-{
-    this->clear();
-    return this->record(op)->eval();
-}
-
-std::shared_ptr<Sequence>
-Sequence::evalAsync()
-{
-    if (this->isRecording()) {
-        this->end();
-    }
-
-    if (this->mIsRunning) {
-        throw std::runtime_error(
-          "Kompute Sequence evalAsync called when an eval async was "
-          "called without successful wait");
-    }
-
-    this->mIsRunning = true;
-
-    for (size_t i = 0; i < this->mOperations.size(); i++) {
-        this->mOperations[i]->preEval(*this->mCommandBuffer);
-    }
-
-    vk::SubmitInfo submitInfo(
-      0, nullptr, nullptr, 1, this->mCommandBuffer.get());
-
-    this->mFence = this->mDevice->createFence(vk::FenceCreateInfo());
-
-    KP_LOG_DEBUG(
-      "Kompute sequence submitting command buffer into compute queue");
-
-    this->mComputeQueue->submit(1, &submitInfo, this->mFence);
-
-    return shared_from_this();
-}
-
-std::shared_ptr<Sequence>
-Sequence::evalAsync(std::shared_ptr<OpBase> op)
-{
-    this->clear();
-    this->record(op);
-    this->evalAsync();
-    return shared_from_this();
-}
-
-std::shared_ptr<Sequence>
-Sequence::evalAwait(uint64_t waitFor)
-{
-    if (!this->mIsRunning) {
-        KP_LOG_WARN("Kompute Sequence evalAwait called without existing eval");
-        return shared_from_this();
-    }
-
-    vk::Result result =
-      this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
-    this->mDevice->destroy(
-      this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-
-    this->mIsRunning = false;
-
-    if (result == vk::Result::eTimeout) {
-        KP_LOG_WARN("Kompute Sequence evalAwait reached timeout of {}",
-                    waitFor);
-        return shared_from_this();
-    }
-
-    for (size_t i = 0; i < this->mOperations.size(); i++) {
-        this->mOperations[i]->postEval(*this->mCommandBuffer);
-    }
-
-    return shared_from_this();
-}
-
-bool
-Sequence::isRunning() const
-{
-    return this->mIsRunning;
-}
-
-bool
-Sequence::isRecording() const
-{
-    return this->mRecording;
-}
-
-bool
-Sequence::isInit() const
-{
-    return this->mDevice && this->mCommandPool && this->mCommandBuffer &&
-           this->mComputeQueue;
-}
-
-void
-Sequence::rerecord()
-{
-    this->end();
-    std::vector<std::shared_ptr<OpBase>> ops = this->mOperations;
-    this->mOperations.clear();
-    for (const std::shared_ptr<kp::OpBase>& op : ops) {
-        this->record(op);
-    }
-}
-
-void
-Sequence::destroy()
-{
-    KP_LOG_DEBUG("Kompute Sequence destroy called");
-
-    if (!this->mDevice) {
-        KP_LOG_WARN("Kompute Sequence destroy called "
-                    "with null Device pointer");
-        return;
-    }
-
-    if (this->mFreeCommandBuffer) {
-        KP_LOG_INFO("Freeing CommandBuffer");
-        if (!this->mCommandBuffer) {
-            KP_LOG_WARN("Kompute Sequence destroy called with null "
-                        "CommandPool pointer");
-            return;
-        }
-        this->mDevice->freeCommandBuffers(
-          *this->mCommandPool, 1, this->mCommandBuffer.get());
-
-        this->mCommandBuffer = nullptr;
-        this->mFreeCommandBuffer = false;
-
-        KP_LOG_DEBUG("Kompute Sequence Freed CommandBuffer");
-    }
-
-    if (this->mFreeCommandPool) {
-        KP_LOG_INFO("Destroying CommandPool");
-        if (this->mCommandPool == nullptr) {
-            KP_LOG_WARN("Kompute Sequence destroy called with null "
-                        "CommandPool pointer");
-            return;
-        }
-        this->mDevice->destroy(
-          *this->mCommandPool,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-
-        this->mCommandPool = nullptr;
-        this->mFreeCommandPool = false;
-
-        KP_LOG_DEBUG("Kompute Sequence Destroyed CommandPool");
-    }
-
-    if (this->mOperations.size()) {
-        KP_LOG_INFO("Kompute Sequence clearing operations buffer");
-        this->mOperations.clear();
-    }
-
-    if (this->timestampQueryPool) {
-        KP_LOG_INFO("Destroying QueryPool");
-        this->mDevice->destroy(
-          *this->timestampQueryPool,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-
-        this->timestampQueryPool = nullptr;
-        KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool");
-    }
-
-    if (this->mDevice) {
-        this->mDevice = nullptr;
-    }
-    if (this->mPhysicalDevice) {
-        this->mPhysicalDevice = nullptr;
-    }
-    if (this->mComputeQueue) {
-        this->mComputeQueue = nullptr;
-    }
-}
-
-std::shared_ptr<Sequence>
-Sequence::record(std::shared_ptr<OpBase> op)
-{
-    KP_LOG_DEBUG("Kompute Sequence record function started");
-
-    this->begin();
-
-    KP_LOG_DEBUG(
-      "Kompute Sequence running record on OpBase derived class instance");
-
-    op->record(*this->mCommandBuffer);
-
-    this->mOperations.push_back(op);
-
-    if (this->timestampQueryPool)
-        this->mCommandBuffer->writeTimestamp(
-          vk::PipelineStageFlagBits::eAllCommands,
-          *this->timestampQueryPool,
-          this->mOperations.size());
-
-    return shared_from_this();
-}
-
-void
-Sequence::createCommandPool()
-{
-    KP_LOG_DEBUG("Kompute Sequence creating command pool");
-
-    if (!this->mDevice) {
-        throw std::runtime_error("Kompute Sequence device is null");
-    }
-
-    this->mFreeCommandPool = true;
-
-    vk::CommandPoolCreateInfo commandPoolInfo(vk::CommandPoolCreateFlags(),
-                                              this->mQueueIndex);
-    this->mCommandPool = std::make_shared<vk::CommandPool>();
-    this->mDevice->createCommandPool(
-      &commandPoolInfo, nullptr, this->mCommandPool.get());
-    KP_LOG_DEBUG("Kompute Sequence Command Pool Created");
-}
-
-void
-Sequence::createCommandBuffer()
-{
-    KP_LOG_DEBUG("Kompute Sequence creating command buffer");
-    if (!this->mDevice) {
-        throw std::runtime_error("Kompute Sequence device is null");
-    }
-    if (!this->mCommandPool) {
-        throw std::runtime_error("Kompute Sequence command pool is null");
-    }
-
-    this->mFreeCommandBuffer = true;
-
-    vk::CommandBufferAllocateInfo commandBufferAllocateInfo(
-      *this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1);
-
-    this->mCommandBuffer = std::make_shared<vk::CommandBuffer>();
-    this->mDevice->allocateCommandBuffers(&commandBufferAllocateInfo,
-                                          this->mCommandBuffer.get());
-    KP_LOG_DEBUG("Kompute Sequence Command Buffer Created");
-}
-
-void
-Sequence::createTimestampQueryPool(uint32_t totalTimestamps)
-{
-    KP_LOG_DEBUG("Kompute Sequence creating query pool");
-    if (!this->isInit()) {
-        throw std::runtime_error(
-          "createTimestampQueryPool() called on uninitialized Sequence");
-    }
-    if (!this->mPhysicalDevice) {
-        throw std::runtime_error("Kompute Sequence physical device is null");
-    }
-
-    vk::PhysicalDeviceProperties physicalDeviceProperties =
-      this->mPhysicalDevice->getProperties();
-
-    if (physicalDeviceProperties.limits.timestampComputeAndGraphics) {
-        vk::QueryPoolCreateInfo queryPoolInfo;
-        queryPoolInfo.setQueryCount(totalTimestamps);
-        queryPoolInfo.setQueryType(vk::QueryType::eTimestamp);
-        this->timestampQueryPool = std::make_shared<vk::QueryPool>(
-          this->mDevice->createQueryPool(queryPoolInfo));
-
-        KP_LOG_DEBUG("Query pool for timestamps created");
-    } else {
-        throw std::runtime_error("Device does not support timestamps");
-    }
-}
-
-std::vector<std::uint64_t>
-Sequence::getTimestamps()
-{
-    if (!this->timestampQueryPool)
-        throw std::runtime_error("Timestamp latching not enabled");
-
-    const auto n = this->mOperations.size() + 1;
-    std::vector<std::uint64_t> timestamps(n, 0);
-    this->mDevice->getQueryPoolResults(
-      *this->timestampQueryPool,
-      0,
-      n,
-      timestamps.size() * sizeof(std::uint64_t),
-      timestamps.data(),
-      sizeof(uint64_t),
-      vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
-
-    return timestamps;
-}
-
-}
diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp
deleted file mode 100644
index 84dce08e02457..0000000000000
--- a/kompute/src/Tensor.cpp
+++ /dev/null
@@ -1,450 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#include "kompute/Tensor.hpp"
-
-namespace kp {
-
-std::string
-Tensor::toString(Tensor::TensorDataTypes dt)
-{
-    switch (dt) {
-        case TensorDataTypes::eBool:
-            return "eBool";
-        case TensorDataTypes::eInt:
-            return "eInt";
-        case TensorDataTypes::eUnsignedInt:
-            return "eUnsignedInt";
-        case TensorDataTypes::eFloat:
-            return "eFloat";
-        case TensorDataTypes::eDouble:
-            return "eDouble";
-        default:
-            return "unknown";
-    }
-}
-
-std::string
-Tensor::toString(Tensor::TensorTypes dt)
-{
-    switch (dt) {
-        case TensorTypes::eDevice:
-            return "eDevice";
-        case TensorTypes::eHost:
-            return "eHost";
-        case TensorTypes::eStorage:
-            return "eStorage";
-        default:
-            return "unknown";
-    }
-}
-
-Tensor::Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-               std::shared_ptr<vk::Device> device,
-               void* data,
-               uint32_t elementTotalCount,
-               uint32_t elementMemorySize,
-               const TensorDataTypes& dataType,
-               vk::DeviceMemory *primaryMemory,
-               vk::Buffer *primaryBuffer,
-               vk::DeviceMemory *stagingMemory,
-               vk::Buffer *stagingBuffer,
-               vk::DeviceSize offset,
-               const TensorTypes& tensorType)
-{
-    KP_LOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
-                 elementTotalCount,
-                 Tensor::toString(tensorType));
-
-    this->mPhysicalDevice = physicalDevice;
-    this->mDevice = device;
-    this->mDataType = dataType;
-    this->mTensorType = tensorType;
-
-    this->rebuild(data, elementTotalCount, elementMemorySize, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
-}
-
-Tensor::~Tensor()
-{
-    KP_LOG_DEBUG("Kompute Tensor destructor started. Type: {}",
-                 Tensor::toString(this->tensorType()));
-
-    if (this->mDevice) {
-        this->destroy();
-    }
-
-    KP_LOG_DEBUG("Kompute Tensor destructor success");
-}
-
-void
-Tensor::rebuild(void* /*data*/,
-                uint32_t elementTotalCount,
-                uint64_t memorySize,
-                vk::DeviceMemory *primaryMemory,
-                vk::Buffer *primaryBuffer,
-                vk::DeviceMemory *stagingMemory,
-                vk::Buffer *stagingBuffer,
-                vk::DeviceSize offset)
-{
-    KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", elementTotalCount);
-
-    this->mSize = elementTotalCount;
-    this->mMemorySize = memorySize;
-    this->mOffset = offset;
-
-    if (this->mPrimaryBuffer || this->mPrimaryMemory) {
-        KP_LOG_DEBUG(
-          "Kompute Tensor destroying existing resources before rebuild");
-        this->destroy();
-    }
-
-    this->setGPUResources(primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
-}
-
-Tensor::TensorTypes
-Tensor::tensorType()
-{
-    return this->mTensorType;
-}
-
-bool
-Tensor::isInit()
-{
-    return this->mDevice && this->mPrimaryBuffer && this->mPrimaryMemory &&
-           this->mRawData;
-}
-
-uint32_t
-Tensor::size()
-{
-    return this->mSize;
-}
-
-uint64_t
-Tensor::memorySize()
-{
-    return this->mMemorySize;
-}
-
-kp::Tensor::TensorDataTypes
-Tensor::dataType()
-{
-    return this->mDataType;
-}
-
-void*
-Tensor::rawData()
-{
-    return this->mRawData;
-}
-
-void
-Tensor::setRawData(const void* data)
-{
-    memcpy(this->mRawData, data, this->memorySize());
-}
-
-void
-Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer,
-                       std::shared_ptr<Tensor> copyFromTensor)
-{
-
-    vk::DeviceSize bufferSize(this->memorySize());
-    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);
-
-    KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize);
-
-    this->recordCopyBuffer(commandBuffer,
-                           copyFromTensor->mPrimaryBuffer,
-                           this->mPrimaryBuffer,
-                           bufferSize,
-                           copyRegion);
-}
-
-void
-Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer)
-{
-    if (!this->mStagingBuffer)
-        return;
-
-    vk::DeviceSize bufferSize(this->memorySize());
-    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);
-
-    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
-
-    this->recordCopyBuffer(commandBuffer,
-                           this->mStagingBuffer,
-                           this->mPrimaryBuffer,
-                           bufferSize,
-                           copyRegion);
-}
-
-void
-Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer)
-{
-    if (!this->mStagingBuffer)
-        return;
-
-    vk::DeviceSize bufferSize(this->memorySize());
-    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);
-
-    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
-
-    this->recordCopyBuffer(commandBuffer,
-                           this->mPrimaryBuffer,
-                           this->mStagingBuffer,
-                           bufferSize,
-                           copyRegion);
-}
-
-void
-Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
-                         vk::Buffer *bufferFrom,
-                         vk::Buffer *bufferTo,
-                         vk::DeviceSize /*bufferSize*/,
-                         vk::BufferCopy copyRegion)
-{
-
-    commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
-}
-
-void
-Tensor::recordFill(const vk::CommandBuffer &commandBuffer,
-                   uint32_t fill)
-{
-    commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill);
-}
-
-void
-Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
-                                         vk::AccessFlagBits srcAccessMask,
-                                         vk::AccessFlagBits dstAccessMask,
-                                         vk::PipelineStageFlagBits srcStageMask,
-                                         vk::PipelineStageFlagBits dstStageMask)
-{
-    KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier");
-
-    this->recordBufferMemoryBarrier(commandBuffer,
-                                    *this->mPrimaryBuffer,
-                                    srcAccessMask,
-                                    dstAccessMask,
-                                    srcStageMask,
-                                    dstStageMask);
-}
-
-void
-Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
-                                         vk::AccessFlagBits srcAccessMask,
-                                         vk::AccessFlagBits dstAccessMask,
-                                         vk::PipelineStageFlagBits srcStageMask,
-                                         vk::PipelineStageFlagBits dstStageMask)
-{
-    if (!this->mStagingBuffer)
-        return;
-
-    KP_LOG_DEBUG("Kompute Tensor recording STAGING buffer memory barrier");
-
-    this->recordBufferMemoryBarrier(commandBuffer,
-                                    *this->mStagingBuffer,
-                                    srcAccessMask,
-                                    dstAccessMask,
-                                    srcStageMask,
-                                    dstStageMask);
-}
-
-void
-Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
-                                  const vk::Buffer& buffer,
-                                  vk::AccessFlagBits srcAccessMask,
-                                  vk::AccessFlagBits dstAccessMask,
-                                  vk::PipelineStageFlagBits srcStageMask,
-                                  vk::PipelineStageFlagBits dstStageMask)
-{
-    KP_LOG_DEBUG("Kompute Tensor recording buffer memory barrier");
-
-    vk::DeviceSize bufferSize = this->memorySize();
-
-    vk::BufferMemoryBarrier bufferMemoryBarrier;
-    bufferMemoryBarrier.buffer = buffer;
-    bufferMemoryBarrier.size = bufferSize;
-    bufferMemoryBarrier.srcAccessMask = srcAccessMask;
-    bufferMemoryBarrier.dstAccessMask = dstAccessMask;
-    bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-
-    commandBuffer.pipelineBarrier(srcStageMask,
-                                  dstStageMask,
-                                  vk::DependencyFlags(),
-                                  nullptr,
-                                  bufferMemoryBarrier,
-                                  nullptr);
-}
-
-vk::DescriptorBufferInfo
-Tensor::constructDescriptorBufferInfo()
-{
-    KP_LOG_DEBUG("Kompute Tensor construct descriptor buffer info size {}",
-                 this->memorySize());
-    vk::DeviceSize bufferSize = this->memorySize();
-    return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
-                                    mOffset, // offset
-                                    bufferSize);
-}
-
-vk::BufferUsageFlags
-Tensor::getPrimaryBufferUsageFlags()
-{
-    switch (this->mTensorType) {
-        case TensorTypes::eDevice:
-            return vk::BufferUsageFlagBits::eStorageBuffer |
-                   vk::BufferUsageFlagBits::eTransferSrc |
-                   vk::BufferUsageFlagBits::eTransferDst;
-            break;
-        case TensorTypes::eHost:
-            return vk::BufferUsageFlagBits::eStorageBuffer |
-                   vk::BufferUsageFlagBits::eTransferSrc |
-                   vk::BufferUsageFlagBits::eTransferDst;
-            break;
-        case TensorTypes::eStorage:
-            return vk::BufferUsageFlagBits::eStorageBuffer;
-            break;
-        default:
-            throw std::runtime_error("Kompute Tensor invalid tensor type");
-    }
-}
-
-vk::MemoryPropertyFlags
-Tensor::getPrimaryMemoryPropertyFlags()
-{
-    switch (this->mTensorType) {
-        case TensorTypes::eDevice:
-            return vk::MemoryPropertyFlagBits::eDeviceLocal;
-            break;
-        case TensorTypes::eHost:
-            return vk::MemoryPropertyFlagBits::eHostVisible |
-                   vk::MemoryPropertyFlagBits::eHostCoherent;
-            break;
-        case TensorTypes::eStorage:
-            return vk::MemoryPropertyFlagBits::eDeviceLocal;
-            break;
-        default:
-            throw std::runtime_error("Kompute Tensor invalid tensor type");
-    }
-}
-
-vk::BufferUsageFlags
-Tensor::getStagingBufferUsageFlags()
-{
-    switch (this->mTensorType) {
-        case TensorTypes::eDevice:
-            return vk::BufferUsageFlagBits::eTransferSrc |
-                   vk::BufferUsageFlagBits::eTransferDst;
-            break;
-        default:
-            throw std::runtime_error("Kompute Tensor invalid tensor type");
-    }
-}
-
-vk::MemoryPropertyFlags
-Tensor::getStagingMemoryPropertyFlags()
-{
-    switch (this->mTensorType) {
-        case TensorTypes::eDevice:
-            return vk::MemoryPropertyFlagBits::eHostVisible |
-                   vk::MemoryPropertyFlagBits::eHostCoherent;
-            break;
-        default:
-            throw std::runtime_error("Kompute Tensor invalid tensor type");
-    }
-}
-
-void
-Tensor::setGPUResources(vk::DeviceMemory *primaryMemory,
-                        vk::Buffer *primaryBuffer,
-                        vk::DeviceMemory *stagingMemory,
-                        vk::Buffer *stagingBuffer,
-                        vk::DeviceSize /*offset*/)
-{
-    KP_LOG_DEBUG("Kompute Tensor creating buffer");
-
-    if (!this->mPhysicalDevice) {
-        throw std::runtime_error("Kompute Tensor phyisical device is null");
-    }
-    if (!this->mDevice) {
-        throw std::runtime_error("Kompute Tensor device is null");
-    }
-
-    KP_LOG_DEBUG("Kompute Tensor creating primary buffer and memory");
-
-    this->mPrimaryBuffer = primaryBuffer;
-    this->mPrimaryMemory = primaryMemory;
-
-    if (this->mTensorType == TensorTypes::eDevice) {
-        KP_LOG_DEBUG("Kompute Tensor creating staging buffer and memory");
-
-        this->mStagingBuffer = stagingBuffer;
-        this->mStagingMemory = stagingMemory;
-    }
-
-    KP_LOG_DEBUG("Kompute Tensor buffer & memory creation successful");
-}
-
-void
-Tensor::destroy()
-{
-    KP_LOG_DEBUG("Kompute Tensor started destroy()");
-
-    // Setting raw data to null regardless whether device is available to
-    // invalidate Tensor
-    this->mRawData = nullptr;
-    this->mSize = 0;
-    this->mMemorySize = 0;
-
-    if (!this->mDevice) {
-        KP_LOG_WARN(
-          "Kompute Tensor destructor reached with null Device pointer");
-        return;
-    }
-
-    if (this->mDevice) {
-        this->mDevice = nullptr;
-    }
-
-    KP_LOG_DEBUG("Kompute Tensor successful destroy()");
-}
-
-template<>
-Tensor::TensorDataTypes
-TensorT<bool>::dataType()
-{
-    return Tensor::TensorDataTypes::eBool;
-}
-
-template<>
-Tensor::TensorDataTypes
-TensorT<int32_t>::dataType()
-{
-    return Tensor::TensorDataTypes::eInt;
-}
-
-template<>
-Tensor::TensorDataTypes
-TensorT<uint32_t>::dataType()
-{
-    return Tensor::TensorDataTypes::eUnsignedInt;
-}
-
-template<>
-Tensor::TensorDataTypes
-TensorT<float>::dataType()
-{
-    return Tensor::TensorDataTypes::eFloat;
-}
-
-template<>
-Tensor::TensorDataTypes
-TensorT<double>::dataType()
-{
-    return Tensor::TensorDataTypes::eDouble;
-}
-
-}
diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt
deleted file mode 100644
index 53e9d8ae616be..0000000000000
--- a/kompute/src/include/CMakeLists.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-# ####################################################
-# Kompute
-# ####################################################
-target_include_directories(kompute PUBLIC $<INSTALL_INTERFACE:include>
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
-
-target_sources(kompute PRIVATE
-
-    # Header files (useful in IDEs)
-    kompute/Algorithm.hpp
-    kompute/Core.hpp
-    kompute/Kompute.hpp
-    kompute/Manager.hpp
-    kompute/Sequence.hpp
-    kompute/Tensor.hpp
-
-    kompute/operations/OpAlgoDispatch.hpp
-    kompute/operations/OpBase.hpp
-    kompute/operations/OpMemoryBarrier.hpp
-    kompute/operations/OpMult.hpp
-    kompute/operations/OpTensorCopy.hpp
-    kompute/operations/OpTensorFill.hpp
-    kompute/operations/OpTensorSyncDevice.hpp
-    kompute/operations/OpTensorSyncLocal.hpp
-    kompute/operations/OpBufferSyncDevice.hpp
-    kompute/operations/OpBufferSyncLocal.hpp
-
-    kompute/logger/Logger.hpp
-)
-
-#install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-
-# ####################################################
-# Logger
-# ####################################################
-target_include_directories(kp_logger PUBLIC $<INSTALL_INTERFACE:include>
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
-
-target_sources(kp_logger PRIVATE
-
-    # Header files (useful in IDEs)
-    kompute/logger/Logger.hpp
-)
-
-#install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
\ No newline at end of file
diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp
deleted file mode 100644
index e5fef1f56d849..0000000000000
--- a/kompute/src/include/kompute/Algorithm.hpp
+++ /dev/null
@@ -1,330 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Core.hpp"
-
-#include "fmt/format.h"
-#include "kompute/Tensor.hpp"
-#include "logger/Logger.hpp"
-
-namespace kp {
-
-/**
-    Abstraction for compute shaders that are run on top of tensors grouped via
-   ParameterGroups (which group descriptorsets)
-*/
-class Algorithm
-{
-  public:
-    /**
-     *  Main constructor for algorithm with configuration parameters to create
-     *  the underlying resources.
-     *
-     *  @param device The Vulkan device to use for creating resources
-     *  @param tensors (optional) The tensors to use to create the descriptor
-     * resources
-     *  @param spirv (optional) The spirv code to use to create the algorithm
-     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch
-     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
-     *  @param specializationConstants (optional) The templatable param is to be
-     * used to initialize the specialization constants which cannot be changed
-     * once set.
-     *  @param pushConstants (optional) This templatable param is to be used
-     * when initializing the pipeline, which set the size of the push constants
-     * - these can be modified but all new values must have the same data type
-     * and length as otherwise it will result in errors.
-     */
-    template<typename S = float, typename P = float>
-    Algorithm(std::shared_ptr<vk::Device> device,
-              vk::PipelineCache *pipelineCache,
-              vk::DescriptorPool *pool,
-              const std::vector<std::shared_ptr<Tensor>>& tensors = {},
-              const std::vector<uint32_t>& spirv = {},
-              const Workgroup& workgroup = {},
-              const std::vector<S>& specializationConstants = {},
-              const std::vector<P>& pushConstants = {})
-    {
-        KP_LOG_DEBUG("Kompute Algorithm Constructor with device");
-
-        this->mDevice = device;
-        this->mPipelineCache = pipelineCache;
-        this->mDescriptorPool = pool;
-
-        if (tensors.size() && spirv.size()) {
-            KP_LOG_INFO(
-              "Kompute Algorithm initialising with tensor size: {} and "
-              "spirv size: {}",
-              tensors.size(),
-              spirv.size());
-            this->rebuild(tensors,
-                          spirv,
-                          workgroup,
-                          specializationConstants,
-                          pushConstants);
-        } else {
-            KP_LOG_INFO(
-              "Kompute Algorithm constructor with empty tensors and or "
-              "spirv so not rebuilding vulkan components");
-        }
-    }
-
-    /**
-     *  Rebuild function to reconstruct algorithm with configuration parameters
-     * to create the underlying resources.
-     *
-     *  @param tensors The tensors to use to create the descriptor resources
-     *  @param spirv The spirv code to use to create the algorithm
-     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch
-     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
-     *  @param specializationConstants (optional) The std::vector<float> to use
-     * to initialize the specialization constants which cannot be changed once
-     * set.
-     *  @param pushConstants (optional) The std::vector<float> to use when
-     * initializing the pipeline, which set the size of the push constants -
-     * these can be modified but all new values must have the same vector size
-     * as this initial value.
-     */
-    template<typename S = float, typename P = float>
-    void rebuild(const std::vector<std::shared_ptr<Tensor>>& tensors,
-                 const std::vector<uint32_t>& spirv,
-                 const Workgroup& workgroup = {},
-                 const std::vector<S>& specializationConstants = {},
-                 const std::vector<P>& pushConstants = {})
-    {
-        KP_LOG_DEBUG("Kompute Algorithm rebuild started");
-
-        this->mTensors = tensors;
-        this->mSpirv = spirv;
-
-        if (specializationConstants.size()) {
-            if (this->mSpecializationConstantsData) {
-                free(this->mSpecializationConstantsData);
-            }
-            uint32_t memorySize =
-              sizeof(decltype(specializationConstants.back()));
-            uint32_t size = specializationConstants.size();
-            uint32_t totalSize = size * memorySize;
-            this->mSpecializationConstantsData = malloc(totalSize);
-            memcpy(this->mSpecializationConstantsData,
-                   specializationConstants.data(),
-                   totalSize);
-            this->mSpecializationConstantsDataTypeMemorySize = memorySize;
-            this->mSpecializationConstantsSize = size;
-        }
-
-        if (pushConstants.size()) {
-            if (this->mPushConstantsData) {
-                free(this->mPushConstantsData);
-            }
-            uint32_t memorySize = sizeof(decltype(pushConstants.back()));
-            uint32_t size = pushConstants.size();
-            uint32_t totalSize = size * memorySize;
-            this->mPushConstantsData = malloc(totalSize);
-            memcpy(this->mPushConstantsData, pushConstants.data(), totalSize);
-            this->mPushConstantsDataTypeMemorySize = memorySize;
-            this->mPushConstantsSize = size;
-        }
-
-        this->setWorkgroup(
-          workgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);
-
-        // Descriptor pool is created first so if available then destroy all
-        // before rebuild
-        if (this->isInit()) {
-            this->destroy();
-        }
-
-        this->createParameters();
-        this->createShaderModule();
-        this->createPipeline();
-    }
-
-    /**
-     * Destructor for Algorithm which is responsible for freeing and desroying
-     * respective pipelines and owned parameter groups.
-     */
-    ~Algorithm();
-
-    /**
-     * Records the dispatch function with the provided template parameters or
-     * alternatively using the size of the tensor by default.
-     *
-     * @param commandBuffer Command buffer to record the algorithm resources to
-     */
-    void recordDispatch(const vk::CommandBuffer& commandBuffer);
-
-    /**
-     * Records command that binds the "core" algorithm components which consist
-     * of binding the pipeline and binding the descriptorsets.
-     *
-     * @param commandBuffer Command buffer to record the algorithm resources to
-     */
-    void recordBindCore(const vk::CommandBuffer& commandBuffer);
-
-    /**
-     * Records command that binds the push constants to the command buffer
-     * provided
-     * - it is required that the pushConstants provided are of the same size as
-     * the ones provided during initialization.
-     *
-     * @param commandBuffer Command buffer to record the algorithm resources to
-     */
-    void recordBindPush(const vk::CommandBuffer& commandBuffer);
-
-    /**
-     * function that checks all the gpu resource components to verify if these
-     * have been created and returns true if all are valid.
-     *
-     * @returns returns true if the algorithm is currently initialized.
-     */
-    bool isInit();
-
-    /**
-     * Sets the work group to use in the recordDispatch
-     *
-     * @param workgroup The kp::Workgroup value to use to update the algorithm.
-     * It must have a value greater than 1 on the x value (index 1) otherwise it
-     * will be initialized on the size of the first tensor (ie.
-     * this->mTensor[0]->size())
-     */
-    void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1);
-    /**
-     * Sets the push constants to the new value provided to use in the next
-     * bindPush()
-     *
-     * @param pushConstants The templatable vector is to be used to set the push
-     * constants to use in the next bindPush(...) calls. The constants provided
-     * must be of the same size as the ones created during initialization.
-     */
-    template<typename T>
-    void setPushConstants(const std::vector<T>& pushConstants)
-    {
-        uint32_t memorySize = sizeof(decltype(pushConstants.back()));
-        uint32_t size = pushConstants.size();
-        this->setPushConstants(pushConstants.data(), size, memorySize);
-    }
-
-    void updateDescriptors(vk::DescriptorPool *pool)
-    {
-        this->mDescriptorPool = pool;
-        this->setWorkgroup(
-          this->mWorkgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);
-
-        this->updateParameters(); // TODO: See if we can reduce this
-    }
-
-    /**
-     * Sets the push constants to the new value provided to use in the next
-     * bindPush() with the raw memory block location and memory size to be used.
-     *
-     * @param data The raw data point to copy the data from, without modifying
-     * the pointer.
-     * @param size The number of data elements provided in the data
-     * @param memorySize The memory size of each of the data elements in bytes.
-     */
-    void setPushConstants(const void* data, uint32_t size, uint32_t memorySize)
-    {
-
-        uint32_t totalSize = memorySize * size;
-        uint32_t previousTotalSize =
-          this->mPushConstantsDataTypeMemorySize * this->mPushConstantsSize;
-
-        if (totalSize != previousTotalSize) {
-            throw std::runtime_error(fmt::format(
-              "Kompute Algorithm push "
-              "constant total memory size provided is {} but expected {} bytes",
-              totalSize,
-              previousTotalSize));
-        }
-        if (this->mPushConstantsData) {
-            free(this->mPushConstantsData);
-        }
-
-        this->mPushConstantsData = malloc(totalSize);
-        memcpy(this->mPushConstantsData, data, totalSize);
-        this->mPushConstantsDataTypeMemorySize = memorySize;
-        this->mPushConstantsSize = size;
-    }
-
-    /**
-     * Gets the current workgroup from the algorithm.
-     *
-     * @param The kp::Constant to use to set the push constants to use in the
-     * next bindPush(...) calls. The constants provided must be of the same size
-     * as the ones created during initialization.
-     */
-    const Workgroup& getWorkgroup();
-    /**
-     * Gets the specialization constants of the current algorithm.
-     *
-     * @returns The std::vector<float> currently set for specialization
-     * constants
-     */
-    template<typename T>
-    const std::vector<T> getSpecializationConstants()
-    {
-        return { (T*)this->mSpecializationConstantsData,
-                 ((T*)this->mSpecializationConstantsData) +
-                   this->mSpecializationConstantsSize };
-    }
-    /**
-     * Gets the specialization constants of the current algorithm.
-     *
-     * @returns The std::vector<float> currently set for push constants
-     */
-    template<typename T>
-    const std::vector<T> getPushConstants()
-    {
-        return { (T*)this->mPushConstantsData,
-                 ((T*)this->mPushConstantsData) + this->mPushConstantsSize };
-    }
-    /**
-     * Gets the current tensors that are used in the algorithm.
-     *
-     * @returns The list of tensors used in the algorithm.
-     */
-    const std::vector<std::shared_ptr<Tensor>>& getTensors();
-    void setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors);
-
-    void destroy();
-
-  private:
-    // -------------- NEVER OWNED RESOURCES
-    std::shared_ptr<vk::Device> mDevice;
-    std::vector<std::shared_ptr<Tensor>> mTensors;
-
-    // -------------- OPTIONALLY OWNED RESOURCES
-    std::shared_ptr<vk::DescriptorSetLayout> mDescriptorSetLayout;
-    bool mFreeDescriptorSetLayout = false;
-    vk::DescriptorPool *mDescriptorPool = nullptr;
-    std::shared_ptr<vk::DescriptorSet> mDescriptorSet;
-    bool mFreeDescriptorSet = false;
-    std::shared_ptr<vk::ShaderModule> mShaderModule;
-    bool mFreeShaderModule = false;
-    std::shared_ptr<vk::PipelineLayout> mPipelineLayout;
-    bool mFreePipelineLayout = false;
-    vk::PipelineCache *mPipelineCache = nullptr;
-    std::shared_ptr<vk::Pipeline> mPipeline;
-    bool mFreePipeline = false;
-
-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<uint32_t> mSpirv;
-    void* mSpecializationConstantsData = nullptr;
-    uint32_t mSpecializationConstantsDataTypeMemorySize = 0;
-    uint32_t mSpecializationConstantsSize = 0;
-    void* mPushConstantsData = nullptr;
-    uint32_t mPushConstantsDataTypeMemorySize = 0;
-    uint32_t mPushConstantsSize = 0;
-    Workgroup mWorkgroup;
-
-    // Create util functions
-    void createShaderModule();
-    void createPipeline();
-
-    // Parameters
-    void freeParameters();
-    void createParameters();
-    void updateParameters();
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/Core.hpp b/kompute/src/include/kompute/Core.hpp
deleted file mode 100644
index 406e6b5d481d5..0000000000000
--- a/kompute/src/include/kompute/Core.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include <vulkan/vulkan.hpp>
-
-// Typedefs to simplify interaction with core types
-namespace kp {
-typedef std::array<uint32_t, 3> Workgroup;
-typedef std::vector<float> Constants;
-}
-
-// Must be after vulkan is included
-#ifndef KOMPUTE_VK_API_VERSION
-#ifndef KOMPUTE_VK_API_MAJOR_VERSION
-#define KOMPUTE_VK_API_MAJOR_VERSION 1
-#endif // KOMPUTE_VK_API_MAJOR_VERSION
-#ifndef KOMPUTE_VK_API_MINOR_VERSION
-#define KOMPUTE_VK_API_MINOR_VERSION 2
-#endif // KOMPUTE_VK_API_MINOR_VERSION
-#define KOMPUTE_VK_API_VERSION                                                 \
-    VK_MAKE_VERSION(                                                           \
-      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
-#endif // KOMPUTE_VK_API_VERSION
-
-#if defined(KOMPUTE_BUILD_PYTHON)
-#include <pybind11/pybind11.h>
-namespace py = pybind11;
-// from python/src/main.cpp
-extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
-#endif
diff --git a/kompute/src/include/kompute/Kompute.hpp b/kompute/src/include/kompute/Kompute.hpp
deleted file mode 100644
index 70e0dd433c44f..0000000000000
--- a/kompute/src/include/kompute/Kompute.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "Algorithm.hpp"
-#include "Core.hpp"
-#include "Manager.hpp"
-#include "Sequence.hpp"
-#include "Tensor.hpp"
-
-#include "operations/OpAlgoDispatch.hpp"
-#include "operations/OpBase.hpp"
-#include "operations/OpMemoryBarrier.hpp"
-#include "operations/OpMult.hpp"
-#include "operations/OpTensorCopy.hpp"
-#include "operations/OpTensorSyncDevice.hpp"
-#include "operations/OpTensorSyncLocal.hpp"
-#include "operations/OpBufferSyncDevice.hpp"
-#include "operations/OpBufferSyncLocal.hpp"
-#include "operations/OpTensorFill.hpp"
-
-// Will be build by CMake and placed inside the build directory
-#include "ShaderLogisticRegression.hpp"
-#include "ShaderOpMult.hpp"
diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp
deleted file mode 100644
index 780c352ebc43a..0000000000000
--- a/kompute/src/include/kompute/Manager.hpp
+++ /dev/null
@@ -1,284 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include <set>
-#include <unordered_map>
-
-#include "kompute/Core.hpp"
-
-#include "kompute/Sequence.hpp"
-#include "logger/Logger.hpp"
-
-#define KP_DEFAULT_SESSION "DEFAULT"
-
-namespace kp {
-
-/**
-    Base orchestrator which creates and manages device and child components
-*/
-class Manager
-{
-  public:
-    /**
-        Base constructor.
-    */
-    Manager();
-
-    /**
-     * Manager destructor which would ensure all owned resources are destroyed
-     * unless explicitly stated that resources should not be destroyed or freed.
-     */
-    ~Manager();
-
-    bool hasInstance() const {
-        return this->mInstance.get();
-    }
-
-    bool hasDevice() const {
-        return this->mDevice.get();
-    }
-
-    bool hasVulkan() const {
-        return this->mDynamicLoader.get();
-    }
-
-    /**
-     * Initialize a device.
-     *
-     * @param physicalDeviceIndex The index of the physical device to use
-     * @param familyQueueIndices (Optional) List of queue indices to add for
-     * explicit allocation
-     * @param desiredExtensions The desired extensions to load from
-     * physicalDevice
-     */
-    void initializeDevice(uint32_t physicalDeviceIndex,
-            const std::vector<uint32_t>& familyQueueIndices = {},
-            const std::vector<std::string>& desiredExtensions = {});
-
-    /**
-     * Create a managed sequence that will be destroyed by this manager
-     * if it hasn't been destroyed by its reference count going to zero.
-     *
-     * @param queueIndex The queue to use from the available queues
-     * @param nrOfTimestamps The maximum number of timestamps to allocate.
-     * If zero (default), disables latching of timestamps.
-     * @returns Shared pointer with initialised sequence
-     */
-    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0,
-                                       uint32_t totalTimestamps = 0);
-
-    /**
-     * Create a managed tensor that will be destroyed by this manager
-     * if it hasn't been destroyed by its reference count going to zero.
-     *
-     * @param data The data to initialize the tensor with
-     * @param tensorType The type of tensor to initialize
-     * @returns Shared pointer with initialised tensor
-     */
-    template<typename T>
-    std::shared_ptr<TensorT<T>> tensorT(
-      const std::vector<T>& data,
-       vk::DeviceMemory *primaryMemory,
-       vk::Buffer *primaryBuffer,
-       vk::DeviceMemory *stagingMemory,
-       vk::Buffer *stagingBuffer,
-      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
-    {
-        KP_LOG_DEBUG("Kompute Manager tensor creation triggered");
-
-        std::shared_ptr<TensorT<T>> tensor{ new kp::TensorT<T>(
-          this->mPhysicalDevice, this->mDevice, data, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, tensorType) };
-
-        if (this->mManageResources) {
-            this->mManagedTensors.push_back(tensor);
-        }
-
-        return tensor;
-    }
-
-    std::shared_ptr<Tensor> tensor(
-      void* data,
-      uint32_t elementTotalCount,
-      uint64_t memorySize,
-      const Tensor::TensorDataTypes& dataType,
-      vk::DeviceMemory *primaryMemory,
-      vk::Buffer *primaryBuffer,
-      vk::DeviceMemory *stagingMemory,
-      vk::Buffer *stagingBuffer,
-      vk::DeviceSize offset,
-      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
-    {
-        std::shared_ptr<Tensor> tensor{ new kp::Tensor(this->mPhysicalDevice,
-                                                       this->mDevice,
-                                                       data,
-                                                       elementTotalCount,
-                                                       memorySize,
-                                                       dataType,
-                                                       primaryMemory,
-                                                       primaryBuffer,
-                                                       stagingMemory,
-                                                       stagingBuffer,
-                                                       offset,
-                                                       tensorType) };
-
-        if (this->mManageResources) {
-            this->mManagedTensors.push_back(tensor);
-        }
-
-        return tensor;
-    }
-
-    /**
-     * Default non-template function that can be used to create algorithm
-     * objects which provides default types to the push and spec constants as
-     * floats.
-     *
-     * @param tensors (optional) The tensors to initialise the algorithm with
-     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
-     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
-     * defaults to (tensor[0].size(), 1, 1)
-     * @param specializationConstants (optional) float vector to use for
-     * specialization constants, and defaults to an empty constant
-     * @param pushConstants (optional) float vector to use for push constants,
-     * and defaults to an empty constant
-     * @returns Shared pointer with initialised algorithm
-     */
-    std::shared_ptr<Algorithm> algorithm(
-      const std::string &name,
-      vk::DescriptorPool *pool,
-      const std::vector<std::shared_ptr<Tensor>>& tensors = {},
-      const std::vector<uint32_t>& spirv = {},
-      const Workgroup& workgroup = {},
-      const std::vector<float>& specializationConstants = {},
-      const std::vector<float>& pushConstants = {})
-    {
-        return this->algorithm<>(
-          name, pool, tensors, spirv, workgroup, specializationConstants, pushConstants);
-    }
-
-    /**
-     * Create a managed algorithm that will be destroyed by this manager
-     * if it hasn't been destroyed by its reference count going to zero.
-     *
-     * @param tensors (optional) The tensors to initialise the algorithm with
-     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
-     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
-     * defaults to (tensor[0].size(), 1, 1)
-     * @param specializationConstants (optional) templatable vector parameter to
-     * use for specialization constants, and defaults to an empty constant
-     * @param pushConstants (optional) templatable vector parameter to use for
-     * push constants, and defaults to an empty constant
-     * @returns Shared pointer with initialised algorithm
-     */
-    template<typename S = float, typename P = float>
-    std::shared_ptr<Algorithm> algorithm(
-      const std::string &name,
-      vk::DescriptorPool *pool,
-      const std::vector<std::shared_ptr<Tensor>>& tensors,
-      const std::vector<uint32_t>& spirv,
-      const Workgroup& workgroup,
-      const std::vector<S>& specializationConstants,
-      const std::vector<P>& pushConstants)
-    {
-
-        KP_LOG_DEBUG("Kompute Manager algorithm creation triggered");
-
-        std::shared_ptr<Algorithm> algorithm{ new kp::Algorithm(
-          this->mDevice,
-          mPipelineCache.get(),
-          pool,
-          tensors,
-          spirv,
-          workgroup,
-          specializationConstants,
-          pushConstants) };
-
-        if (this->mManageResources) {
-            this->mManagedAlgorithmsMap.insert({name, algorithm});
-        }
-
-        return algorithm;
-    }
-
-    bool hasAlgorithm(const std::string &name) const {
-        return mManagedAlgorithmsMap.find(name) != mManagedAlgorithmsMap.end();
-    }
-
-    std::shared_ptr<Algorithm> getAlgorithm(const std::string &name) const {
-        auto it = mManagedAlgorithmsMap.find(name);
-        if (it != mManagedAlgorithmsMap.end()) {
-            return it->second;
-        }
-        return nullptr;
-    }
-
-    /**
-     * Destroy the GPU resources and all managed resources by manager.
-     **/
-    void destroy();
-    /**
-     * Run a pseudo-garbage collection to release all the managed resources
-     * that have been already freed due to these reaching to zero ref count.
-     **/
-    void clear();
-
-    /**
-     * Information about the current device.
-     *
-     * @return vk::PhysicalDeviceProperties containing information about the
-     *device
-     **/
-    vk::PhysicalDeviceProperties getDeviceProperties() const;
-
-    /**
-     * List the devices available in the current vulkan instance.
-     *
-     * @return vector of physical devices containing their respective properties
-     **/
-    std::vector<vk::PhysicalDevice> listDevices() const;
-
-    /**
-     * The current Vulkan instance.
-     *
-     * @return a shared pointer to the current Vulkan instance held by this
-     *object
-     **/
-    std::shared_ptr<vk::Instance> getVkInstance() const;
-
-    std::shared_ptr<vk::Device> device() const { return mDevice; }
-    std::shared_ptr<vk::PhysicalDevice> physicalDevice() const { return mPhysicalDevice; }
-    std::shared_ptr<vk::PipelineCache> pipelineCache() const { return mPipelineCache; }
-
-  private:
-    // -------------- OPTIONALLY OWNED RESOURCES
-    std::shared_ptr<vk::Instance> mInstance = nullptr;
-    bool mFreeInstance = false;
-    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
-    std::shared_ptr<vk::Device> mDevice = nullptr;
-    std::shared_ptr<vk::DynamicLoader> mDynamicLoader = nullptr;
-    bool mFreeDevice = false;
-
-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::weak_ptr<Tensor>> mManagedTensors;
-    std::vector<std::weak_ptr<Sequence>> mManagedSequences;
-    std::unordered_map<std::string, std::shared_ptr<Algorithm>> mManagedAlgorithmsMap;
-
-    std::vector<uint32_t> mComputeQueueFamilyIndices;
-    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
-    std::shared_ptr<vk::PipelineCache> mPipelineCache;
-
-    bool mManageResources = false;
-
-#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
-    vk::DebugReportCallbackEXT mDebugReportCallback;
-    vk::DispatchLoaderDynamic mDebugDispatcher;
-#endif
-
-    // Create functions
-    void createInstance();
-    void createDevice(const std::vector<uint32_t>& familyQueueIndices = {},
-                      uint32_t physicalDeviceIndex = 0,
-                      const std::vector<std::string>& desiredExtensions = {});
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/Sequence.hpp b/kompute/src/include/kompute/Sequence.hpp
deleted file mode 100644
index 3b29a6e2e66ae..0000000000000
--- a/kompute/src/include/kompute/Sequence.hpp
+++ /dev/null
@@ -1,304 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Core.hpp"
-
-#include "kompute/operations/OpAlgoDispatch.hpp"
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-/**
- *  Container of operations that can be sent to GPU as batch
- */
-class Sequence : public std::enable_shared_from_this<Sequence>
-{
-  public:
-    /**
-     * Main constructor for sequence which requires core vulkan components to
-     * generate all dependent resources.
-     *
-     * @param physicalDevice Vulkan physical device
-     * @param device Vulkan logical device
-     * @param computeQueue Vulkan compute queue
-     * @param queueIndex Vulkan compute queue index in device
-     * @param totalTimestamps Maximum number of timestamps to allocate
-     */
-    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-             std::shared_ptr<vk::Device> device,
-             std::shared_ptr<vk::Queue> computeQueue,
-             uint32_t queueIndex,
-             uint32_t totalTimestamps = 0);
-    /**
-     * Destructor for sequence which is responsible for cleaning all subsequent
-     * owned operations.
-     */
-    ~Sequence();
-
-    /**
-     * Record function for operation to be added to the GPU queue in batch. This
-     * template requires classes to be derived from the OpBase class. This
-     * function also requires the Sequence to be recording, otherwise it will
-     * not be able to add the operation.
-     *
-     * @param op Object derived from kp::BaseOp that will be recoreded by the
-     * sequence which will be used when the operation is evaluated.
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
-
-    /**
-     * Record function for operation to be added to the GPU queue in batch. This
-     * template requires classes to be derived from the OpBase class. This
-     * function also requires the Sequence to be recording, otherwise it will
-     * not be able to add the operation.
-     *
-     * @param tensors Vector of tensors to use for the operation
-     * @param TArgs Template parameters that are used to initialise operation
-     * which allows for extensible configurations on initialisation.
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    template<typename T, typename... TArgs>
-    std::shared_ptr<Sequence> record(
-      std::vector<std::shared_ptr<Tensor>> tensors,
-      TArgs&&... params)
-    {
-        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
-        return this->record(op);
-    }
-    /**
-     * Record function for operation to be added to the GPU queue in batch. This
-     * template requires classes to be derived from the OpBase class. This
-     * function also requires the Sequence to be recording, otherwise it will
-     * not be able to add the operation.
-     *
-     * @param algorithm Algorithm to use for the record often used for OpAlgo
-     * operations
-     * @param TArgs Template parameters that are used to initialise operation
-     * which allows for extensible configurations on initialisation.
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    template<typename T, typename... TArgs>
-    std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm,
-                                     TArgs&&... params)
-    {
-        std::shared_ptr<T> op{ new T(algorithm,
-                                     std::forward<TArgs>(params)...) };
-        return this->record(op);
-    }
-
-    /**
-     * Eval sends all the recorded and stored operations in the vector of
-     * operations into the gpu as a submit job synchronously (with a barrier).
-     *
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    std::shared_ptr<Sequence> eval();
-
-    /**
-     * Resets all the recorded and stored operations, records the operation
-     * provided and submits into the gpu as a submit job synchronously (with a
-     * barrier).
-     *
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);
-
-    /**
-     * Eval sends all the recorded and stored operations in the vector of
-     * operations into the gpu as a submit job with a barrier.
-     *
-     * @param tensors Vector of tensors to use for the operation
-     * @param TArgs Template parameters that are used to initialise operation
-     * which allows for extensible configurations on initialisation.
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    template<typename T, typename... TArgs>
-    std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors,
-                                   TArgs&&... params)
-    {
-        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
-        return this->eval(op);
-    }
-
-    template<typename T, typename... TArgs>
-    std::shared_ptr<Sequence> eval(vk::Buffer *primaryBuffer,
-                                   vk::Buffer *stagingBuffer,
-                                   vk::DeviceSize size,
-                                   TArgs&&... params)
-    {
-        std::shared_ptr<T> op{ new T(primaryBuffer, stagingBuffer, size, std::forward<TArgs>(params)...) };
-        return this->eval(op);
-    }
-
-    /**
-     * Eval sends all the recorded and stored operations in the vector of
-     * operations into the gpu as a submit job with a barrier.
-     *
-     * @param algorithm Algorithm to use for the record often used for OpAlgo
-     * operations
-     * @param TArgs Template parameters that are used to initialise operation
-     * which allows for extensible configurations on initialisation.
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    template<typename T, typename... TArgs>
-    std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm,
-                                   TArgs&&... params)
-    {
-        std::shared_ptr<T> op{ new T(algorithm,
-                                     std::forward<TArgs>(params)...) };
-        return this->eval(op);
-    }
-
-    /**
-     * Eval Async sends all the recorded and stored operations in the vector of
-     * operations into the gpu as a submit job without a barrier. EvalAwait()
-     * must ALWAYS be called after to ensure the sequence is terminated
-     * correctly.
-     *
-     * @return Boolean stating whether execution was successful.
-     */
-    std::shared_ptr<Sequence> evalAsync();
-    /**
-     * Clears currnet operations to record provided one in the vector of
-     * operations into the gpu as a submit job without a barrier. EvalAwait()
-     * must ALWAYS be called after to ensure the sequence is terminated
-     * correctly.
-     *
-     * @return Boolean stating whether execution was successful.
-     */
-    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);
-    /**
-     * Eval sends all the recorded and stored operations in the vector of
-     * operations into the gpu as a submit job with a barrier.
-     *
-     * @param tensors Vector of tensors to use for the operation
-     * @param TArgs Template parameters that are used to initialise operation
-     * which allows for extensible configurations on initialisation.
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    template<typename T, typename... TArgs>
-    std::shared_ptr<Sequence> evalAsync(
-      std::vector<std::shared_ptr<Tensor>> tensors,
-      TArgs&&... params)
-    {
-        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
-        return this->evalAsync(op);
-    }
-    /**
-     * Eval sends all the recorded and stored operations in the vector of
-     * operations into the gpu as a submit job with a barrier.
-     *
-     * @param algorithm Algorithm to use for the record often used for OpAlgo
-     * operations
-     * @param TArgs Template parameters that are used to initialise operation
-     * which allows for extensible configurations on initialisation.
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    template<typename T, typename... TArgs>
-    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm,
-                                        TArgs&&... params)
-    {
-        std::shared_ptr<T> op{ new T(algorithm,
-                                     std::forward<TArgs>(params)...) };
-        return this->evalAsync(op);
-    }
-
-    /**
-     * Eval Await waits for the fence to finish processing and then once it
-     * finishes, it runs the postEval of all operations.
-     *
-     * @param waitFor Number of milliseconds to wait before timing out.
-     * @return shared_ptr<Sequence> of the Sequence class itself
-     */
-    std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
-
-    /**
-     * Clear function clears all operations currently recorded and starts
-     * recording again.
-     */
-    void clear();
-
-    /**
-     * Return the timestamps that were latched at the beginning and
-     * after each operation during the last eval() call.
-     */
-    std::vector<std::uint64_t> getTimestamps();
-
-    /**
-     * Begins recording commands for commands to be submitted into the command
-     * buffer.
-     */
-    void begin();
-
-    /**
-     * Ends the recording and stops recording commands when the record command
-     * is sent.
-     */
-    void end();
-
-    /**
-     * Returns true if the sequence is currently in recording activated.
-     *
-     * @return Boolean stating if recording ongoing.
-     */
-    bool isRecording() const;
-
-    /**
-     * Returns true if the sequence has been initialised, and it's based on the
-     * GPU resources being referenced.
-     *
-     * @return Boolean stating if is initialized
-     */
-    bool isInit() const;
-
-    /**
-     * Clears command buffer and triggers re-record of all the current
-     * operations saved, which is useful if the underlying kp::Tensors or
-     * kp::Algorithms are modified and need to be re-recorded.
-     */
-    void rerecord();
-
-    /**
-     * Returns true if the sequence is currently running - mostly used for async
-     * workloads.
-     *
-     * @return Boolean stating if currently running.
-     */
-    bool isRunning() const;
-
-    /**
-     * Destroys and frees the GPU resources which include the buffer and memory
-     * and sets the sequence as init=False.
-     */
-    void destroy();
-
-  private:
-    // -------------- NEVER OWNED RESOURCES
-    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
-    std::shared_ptr<vk::Device> mDevice = nullptr;
-    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
-    uint32_t mQueueIndex = -1;
-
-    // -------------- OPTIONALLY OWNED RESOURCES
-    std::shared_ptr<vk::CommandPool> mCommandPool = nullptr;
-    bool mFreeCommandPool = false;
-    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
-    bool mFreeCommandBuffer = false;
-
-    // -------------- ALWAYS OWNED RESOURCES
-    vk::Fence mFence;
-    std::vector<std::shared_ptr<OpBase>> mOperations{};
-    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;
-
-    // State
-    bool mRecording = false;
-    bool mIsRunning = false;
-
-    // Create functions
-    void createCommandPool();
-    void createCommandBuffer();
-    void createTimestampQueryPool(uint32_t totalTimestamps);
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp
deleted file mode 100644
index 20939093da7af..0000000000000
--- a/kompute/src/include/kompute/Tensor.hpp
+++ /dev/null
@@ -1,302 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Core.hpp"
-#include "logger/Logger.hpp"
-#include <memory>
-#include <string>
-
-namespace kp {
-
-/**
- * Structured data used in GPU operations.
- *
- * Tensors are the base building block in Kompute to perform operations across
- * GPUs. Each tensor would have a respective Vulkan memory and buffer, which
- * would be used to store their respective data. The tensors can be used for GPU
- * data storage or transfer.
- */
-class Tensor
-{
-  public:
-    /**
-     * Type for tensors created: Device allows memory to be transferred from
-     * staging buffers. Staging are host memory visible. Storage are device
-     * visible but are not set up to transfer or receive data (only for shader
-     * storage).
-     */
-    enum class TensorTypes
-    {
-        eDevice = 0,  ///< Type is device memory, source and destination
-        eHost = 1,    ///< Type is host memory, source and destination
-        eStorage = 2, ///< Type is Device memory (only)
-    };
-    enum class TensorDataTypes
-    {
-        eBool = 0,
-        eInt = 1,
-        eUnsignedInt = 2,
-        eFloat = 3,
-        eDouble = 4,
-    };
-
-    static std::string toString(TensorDataTypes dt);
-    static std::string toString(TensorTypes dt);
-
-    /**
-     *  Constructor with data provided which would be used to create the
-     * respective vulkan buffer and memory.
-     *
-     *  @param physicalDevice The physical device to use to fetch properties
-     *  @param device The device to use to create the buffer and memory from
-     *  @param data Non-zero-sized vector of data that will be used by the
-     * tensor
-     *  @param tensorTypes Type for the tensor which is of type TensorTypes
-     */
-    Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-           std::shared_ptr<vk::Device> device,
-           void* data,
-           uint32_t elementTotalCount,
-           uint32_t memorySize,
-           const TensorDataTypes& dataType,
-           vk::DeviceMemory *primaryMemory,
-           vk::Buffer *primaryBuffer,
-           vk::DeviceMemory *stagingMemory,
-           vk::Buffer *stagingBuffer,
-           vk::DeviceSize offset,
-           const TensorTypes& tensorType = TensorTypes::eDevice);
-
-    /**
-     * Destructor which is in charge of freeing vulkan resources unless they
-     * have been provided externally.
-     */
-    virtual ~Tensor();
-
-    /**
-     * Function to trigger reinitialisation of the tensor buffer and memory with
-     * new data as well as new potential device type.
-     *
-     * @param data Vector of data to use to initialise vector from
-     * @param tensorType The type to use for the tensor
-     */
-    void rebuild(void* data,
-                 uint32_t elementTotalCount,
-                 uint64_t memorySize,
-                 vk::DeviceMemory *primaryMemory,
-                 vk::Buffer *primaryBuffer,
-                 vk::DeviceMemory *stagingMemory,
-                 vk::Buffer *stagingBuffer,
-                 vk::DeviceSize offset);
-
-    /**
-     * Destroys and frees the GPU resources which include the buffer and memory.
-     */
-    void destroy();
-
-    /**
-     * Check whether tensor is initialized based on the created gpu resources.
-     *
-     * @returns Boolean stating whether tensor is initialized
-     */
-    bool isInit();
-
-    /**
-     * Retrieve the tensor type of the Tensor
-     *
-     * @return Tensor type of tensor
-     */
-    TensorTypes tensorType();
-
-    /**
-     * Records a copy from the memory of the tensor provided to the current
-     * thensor. This is intended to pass memory into a processing, to perform
-     * a staging buffer transfer, or to gather output (between others).
-     *
-     * @param commandBuffer Vulkan Command Buffer to record the commands into
-     * @param copyFromTensor Tensor to copy the data from
-     */
-    void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
-                        std::shared_ptr<Tensor> copyFromTensor);
-
-    void recordFill(const vk::CommandBuffer &commandBuffer,
-                    uint32_t fill);
-
-    /**
-     * Records a copy from the internal staging memory to the device memory
-     * using an optional barrier to wait for the operation. This function would
-     * only be relevant for kp::Tensors of type eDevice.
-     *
-     * @param commandBuffer Vulkan Command Buffer to record the commands into
-     */
-    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer);
-
-    /**
-     * Records a copy from the internal device memory to the staging memory
-     * using an optional barrier to wait for the operation. This function would
-     * only be relevant for kp::Tensors of type eDevice.
-     *
-     * @param commandBuffer Vulkan Command Buffer to record the commands into
-     */
-    void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer);
-
-    /**
-     * Records the buffer memory barrier into the primary buffer and command
-     * buffer which ensures that relevant data transfers are carried out
-     * correctly.
-     *
-     * @param commandBuffer Vulkan Command Buffer to record the commands into
-     * @param srcAccessMask Access flags for source access mask
-     * @param dstAccessMask Access flags for destination access mask
-     * @param scrStageMask Pipeline stage flags for source stage mask
-     * @param dstStageMask Pipeline stage flags for destination stage mask
-     */
-    void recordPrimaryBufferMemoryBarrier(
-      const vk::CommandBuffer& commandBuffer,
-      vk::AccessFlagBits srcAccessMask,
-      vk::AccessFlagBits dstAccessMask,
-      vk::PipelineStageFlagBits srcStageMask,
-      vk::PipelineStageFlagBits dstStageMask);
-    /**
-     * Records the buffer memory barrier into the staging buffer and command
-     * buffer which ensures that relevant data transfers are carried out
-     * correctly.
-     *
-     * @param commandBuffer Vulkan Command Buffer to record the commands into
-     * @param srcAccessMask Access flags for source access mask
-     * @param dstAccessMask Access flags for destination access mask
-     * @param scrStageMask Pipeline stage flags for source stage mask
-     * @param dstStageMask Pipeline stage flags for destination stage mask
-     */
-    void recordStagingBufferMemoryBarrier(
-      const vk::CommandBuffer& commandBuffer,
-      vk::AccessFlagBits srcAccessMask,
-      vk::AccessFlagBits dstAccessMask,
-      vk::PipelineStageFlagBits srcStageMask,
-      vk::PipelineStageFlagBits dstStageMask);
-
-    /**
-     * Constructs a vulkan descriptor buffer info which can be used to specify
-     * and reference the underlying buffer component of the tensor without
-     * exposing it.
-     *
-     * @return Descriptor buffer info with own buffer
-     */
-    vk::DescriptorBufferInfo constructDescriptorBufferInfo();
-
-    /**
-     * Returns the size/magnitude of the Tensor, which will be the total number
-     * of elements across all dimensions
-     *
-     * @return Unsigned integer representing the total number of elements
-     */
-    uint32_t size();
-
-    /**
-     * Returns the total memory size of the data contained by the Tensor object
-     *
-     * @return Unsigned integer representing the memory of the tensor in bytes.
-     */
-    uint64_t memorySize();
-
-    /**
-     * Retrieve the data type of the tensor (host, device, storage)
-     *
-     * @return Data type of tensor of type kp::Tensor::TensorDataTypes
-     */
-    TensorDataTypes dataType();
-
-    /**
-     * Retrieve the raw data via the pointer to the memory that contains the raw
-     * memory of this current tensor. This tensor gets changed to a nullptr when
-     * the Tensor is removed.
-     *
-     * @return Pointer to raw memory containing raw bytes data of Tensor.
-     */
-    void* rawData();
-
-    /**
-     * Sets / resets the data of the tensor which is directly done on the GPU
-     * host visible memory available by the tensor.
-     */
-    void setRawData(const void* data);
-
-    /**
-     * Template to return the pointer data converted by specific type, which
-     * would be any of the supported types including float, double, int32,
-     * uint32 and bool.
-     *
-     * @return Pointer to raw memory containing raw bytes data of Tensor.
-     */
-    template<typename T>
-    T* data()
-    {
-        return (T*)this->mRawData;
-    }
-
-    /**
-     * Template to get the data of the current tensor as a vector of specific
-     * type, which would be any of the supported types including float, double,
-     * int32, uint32 and bool.
-     *
-     * @return Vector of type provided by template.
-     */
-    template<typename T>
-    std::vector<T> vector()
-    {
-        return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
-    }
-
-  protected:
-    // -------------- ALWAYS OWNED RESOURCES
-    TensorTypes mTensorType;
-    TensorDataTypes mDataType;
-    uint32_t mSize = 0;
-    uint64_t mMemorySize = 0;
-    vk::DeviceSize mOffset = 0;
-    void* mRawData = nullptr;
-
-  private:
-    // -------------- NEVER OWNED RESOURCES
-    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
-    std::shared_ptr<vk::Device> mDevice;
-    vk::Buffer *mPrimaryBuffer = nullptr;
-    vk::Buffer *mStagingBuffer = nullptr;
-    vk::DeviceMemory *mPrimaryMemory = nullptr;
-    vk::DeviceMemory *mStagingMemory = nullptr;
-
-    void setGPUResources(vk::DeviceMemory *primaryMemory,
-                         vk::Buffer *primaryBuffer,
-                         vk::DeviceMemory *stagingMemory,
-                         vk::Buffer *stagingBuffer,
-                         vk::DeviceSize offset);
-    void recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
-                          vk::Buffer *bufferFrom,
-                          vk::Buffer *bufferTo,
-                          vk::DeviceSize bufferSize,
-                          vk::BufferCopy copyRegion);
-
-    void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
-                                   const vk::Buffer& buffer,
-                                   vk::AccessFlagBits srcAccessMask,
-                                   vk::AccessFlagBits dstAccessMask,
-                                   vk::PipelineStageFlagBits srcStageMask,
-                                   vk::PipelineStageFlagBits dstStageMask);
-
-    // Private util functions
-    vk::BufferUsageFlags getPrimaryBufferUsageFlags();
-    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
-    vk::BufferUsageFlags getStagingBufferUsageFlags();
-    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
-};
-
-template<typename T>
-class TensorT : public Tensor
-{
-
-  public:
-    ~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); }
-
-    TensorDataTypes dataType();
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/logger/Logger.hpp b/kompute/src/include/kompute/logger/Logger.hpp
deleted file mode 100644
index f97e95cf06c4c..0000000000000
--- a/kompute/src/include/kompute/logger/Logger.hpp
+++ /dev/null
@@ -1,197 +0,0 @@
-#pragma once
-
-#define KOMPUTE_LOG_LEVEL_TRACE 0
-#define KOMPUTE_LOG_LEVEL_DEBUG 1
-#define KOMPUTE_LOG_LEVEL_INFO 2
-#define KOMPUTE_LOG_LEVEL_WARN 3
-#define KOMPUTE_LOG_LEVEL_ERROR 4
-#define KOMPUTE_LOG_LEVEL_CRITICAL 5
-#define KOMPUTE_LOG_LEVEL_OFF 6
-
-// Logging is disabled entirely.
-#if KOMPUTE_OPT_LOG_LEVEL_DISABLED
-#define KP_LOG_TRACE(...)
-#define KP_LOG_DEBUG(...)
-#define KP_LOG_INFO(...)
-#define KP_LOG_WARN(...)
-#define KP_LOG_ERROR(...)
-#else
-
-#if !KOMPUTE_OPT_USE_SPDLOG
-#if VK_USE_PLATFORM_ANDROID_KHR
-#include <android/log.h>
-#include <fmt/core.h>
-static const char* KOMPUTE_LOG_TAG = "KomputeLog";
-#else
-#if KOMPUTE_BUILD_PYTHON
-#include <pybind11/pybind11.h>
-namespace py = pybind11;
-// from python/src/main.cpp
-extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
-#else
-#include <fmt/core.h>
-#endif // KOMPUTE_BUILD_PYTHON
-#endif // VK_USE_PLATFORM_ANDROID_KHR
-#else
-#include <spdlog/spdlog.h>
-#endif // !KOMPUTE_OPT_USE_SPDLOG
-#include <set>
-#include <string>
-#include <vector>
-namespace logger {
-// Setup the logger, note the loglevel can not be set below the CMake log level
-// (To change this use -DKOMPUTE_OPT_LOG_LEVEL=...)
-void
-setupLogger();
-
-// Logging is enabled, but we do not use Spdlog. So we use fmt in case nothing
-// else is defined, overriding logging.
-#if !KOMPUTE_OPT_USE_SPDLOG
-
-#ifndef KP_LOG_TRACE
-#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_TRACE
-#if VK_USE_PLATFORM_ANDROID_KHR
-#define KP_LOG_TRACE(...)                                                      \
-    ((void)__android_log_write(                                                \
-      ANDROID_LOG_VERBOSE, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
-#else
-#if KOMPUTE_BUILD_PYTHON
-#define KP_LOG_DEBUG(...) kp_trace(fmt::format(__VA_ARGS__))
-#else
-#define KP_LOG_TRACE(...)                                                      \
-    fmt::print("[{} {}] [trace] [{}:{}] {}\n",                                 \
-               __DATE__,                                                       \
-               __TIME__,                                                       \
-               __FILE__,                                                       \
-               __LINE__,                                                       \
-               fmt::format(__VA_ARGS__))
-#endif // KOMPUTE_BUILD_PYTHON
-#endif // VK_USE_PLATFORM_ANDROID_KHR
-#else
-#define KP_LOG_TRACE(...)
-#endif
-#endif // !KP_LOG_TRACE
-
-#ifndef KP_LOG_DEBUG
-#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
-#if VK_USE_PLATFORM_ANDROID_KHR
-#define KP_LOG_DEBUG(...)                                                      \
-    ((void)__android_log_write(                                                \
-      ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
-#else
-#if KOMPUTE_BUILD_PYTHON
-#define KP_LOG_DEBUG(...) kp_debug(fmt::format(__VA_ARGS__))
-#else
-#ifdef __FILE_NAME__ // gcc 12 provides only file name without path
-#define KP_LOG_DEBUG(...)                                                      \
-    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
-               __DATE__,                                                       \
-               __TIME__,                                                       \
-               __FILE_NAME__,                                                       \
-               __LINE__,                                                       \
-               fmt::format(__VA_ARGS__))
-#else
-#define KP_LOG_DEBUG(...)                                                      \
-    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
-               __DATE__,                                                       \
-               __TIME__,                                                       \
-               __FILE__,                                                       \
-               __LINE__,                                                       \
-               fmt::format(__VA_ARGS__))
-#endif // __FILE__NAME__
-#endif // KOMPUTE_BUILD_PYTHON
-#endif // VK_USE_PLATFORM_ANDROID_KHR
-#else
-#define KP_LOG_DEBUG(...)
-#endif
-#endif // !KP_LOG_DEBUG
-
-#ifndef KP_LOG_INFO
-#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
-#if VK_USE_PLATFORM_ANDROID_KHR
-#define KP_LOG_INFO(...)                                                       \
-    ((void)__android_log_write(                                                \
-      ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
-#else
-#if KOMPUTE_BUILD_PYTHON
-#define KP_LOG_DEBUG(...) kp_info(fmt::format(__VA_ARGS__))
-#else
-#define KP_LOG_INFO(...)                                                       \
-    fmt::print("[{} {}] [info] [{}:{}] {}\n",                                  \
-               __DATE__,                                                       \
-               __TIME__,                                                       \
-               __FILE__,                                                       \
-               __LINE__,                                                       \
-               fmt::format(__VA_ARGS__))
-#endif // KOMPUTE_BUILD_PYTHON
-#endif // VK_USE_PLATFORM_ANDROID_KHR
-#else
-#define KP_LOG_INFO(...)
-#endif
-#endif // !KP_LOG_INFO
-
-#ifndef KP_LOG_WARN
-#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_WARN
-#if VK_USE_PLATFORM_ANDROID_KHR
-#define KP_LOG_WARN(...)                                                       \
-    ((void)__android_log_write(                                                \
-      ANDROID_LOG_WARN, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
-#else
-#if KOMPUTE_BUILD_PYTHON
-#define KP_LOG_DEBUG(...) kp_warning(fmt::format(__VA_ARGS__))
-#else
-#define KP_LOG_WARN(...)                                                       \
-    fmt::print("[{} {}] [warn] [{}:{}] {}\n",                                  \
-               __DATE__,                                                       \
-               __TIME__,                                                       \
-               __FILE__,                                                       \
-               __LINE__,                                                       \
-               fmt::format(__VA_ARGS__))
-#endif // KOMPUTE_BUILD_PYTHON
-#endif // VK_USE_PLATFORM_ANDROID_KHR
-#else
-#define KP_LOG_WARN(...)
-#endif
-#endif // !KP_LOG_WARN
-
-#ifndef KP_LOG_ERROR
-#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_ERROR
-#if VK_USE_PLATFORM_ANDROID_KHR
-#define KP_LOG_ERROR(...)                                                      \
-    ((void)__android_log_write(                                                \
-      ANDROID_LOG_ERROR, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
-#else
-#if KOMPUTE_BUILD_PYTHON
-#define KP_LOG_DEBUG(...) kp_error(fmt::format(__VA_ARGS__))
-#else
-#define KP_LOG_ERROR(...)                                                      \
-    fmt::print("[{} {}] [error] [{}:{}] {}\n",                                 \
-               __DATE__,                                                       \
-               __TIME__,                                                       \
-               __FILE__,                                                       \
-               __LINE__,                                                       \
-               fmt::format(__VA_ARGS__))
-#endif // KOMPUTE_BUILD_PYTHON
-#endif // VK_USE_PLATFORM_ANDROID_KHR
-#else
-#define KP_LOG_ERROR(...)
-#endif
-#endif // !KP_LOG_ERROR
-#else
-
-#define KP_LOG_TRACE(...) SPDLOG_TRACE(__VA_ARGS__)
-#define KP_LOG_DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__)
-#define KP_LOG_INFO(...) SPDLOG_INFO(__VA_ARGS__)
-#define KP_LOG_WARN(...) SPDLOG_WARN(__VA_ARGS__)
-#define KP_LOG_ERROR(...) SPDLOG_ERROR(__VA_ARGS__)
-
-void
-setLogLevel(spdlog::level::level_enum level);
-
-spdlog::level::level_enum
-getLogLevel();
-
-#endif // !KOMPUTE_OPT_USE_SPDLOG
-} // namespace logger
-
-#endif // KOMPUTE_OPT_LOG_LEVEL_DISABLED
diff --git a/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp b/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
deleted file mode 100644
index e91598f0562c2..0000000000000
--- a/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Algorithm.hpp"
-#include "kompute/Core.hpp"
-#include "kompute/Tensor.hpp"
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-/**
- * Operation that provides a general abstraction that simplifies the use of
- * algorithm and parameter components which can be used with shaders.
- * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs.
- */
-class OpAlgoDispatch : public OpBase
-{
-  public:
-    /**
-     * Constructor that stores the algorithm to use as well as the relevant
-     * push constants to override when recording.
-     *
-     * @param algorithm The algorithm object to use for dispatch
-     * @param pushConstants The push constants to use for override
-     */
-    template<typename T = float>
-    OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm,
-                   const std::vector<T>& pushConstants = {})
-    {
-        KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor");
-
-        this->mAlgorithm = algorithm;
-
-        if (pushConstants.size()) {
-            uint32_t memorySize = sizeof(decltype(pushConstants.back()));
-            uint32_t size = pushConstants.size();
-            uint32_t totalSize = size * memorySize;
-            this->mPushConstantsData = malloc(totalSize);
-            memcpy(this->mPushConstantsData, pushConstants.data(), totalSize);
-            this->mPushConstantsDataTypeMemorySize = memorySize;
-            this->mPushConstantsSize = size;
-        }
-    }
-
-    /**
-     * Default destructor, which is in charge of destroying the algorithm
-     * components but does not destroy the underlying tensors
-     */
-    virtual ~OpAlgoDispatch() override;
-
-    /**
-     * This records the commands that are to be sent to the GPU. This includes
-     * the barriers that ensure the memory has been copied before going in and
-     * out of the shader, as well as the dispatch operation that sends the
-     * shader processing to the gpu. This function also records the GPU memory
-     * copy of the output data for the staging buffer so it can be read by the
-     * host.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void record(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any preEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any postEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
-
-  private:
-    // -------------- ALWAYS OWNED RESOURCES
-    std::shared_ptr<Algorithm> mAlgorithm;
-    void* mPushConstantsData = nullptr;
-    uint32_t mPushConstantsDataTypeMemorySize = 0;
-    uint32_t mPushConstantsSize = 0;
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBase.hpp b/kompute/src/include/kompute/operations/OpBase.hpp
deleted file mode 100644
index 737670846350d..0000000000000
--- a/kompute/src/include/kompute/operations/OpBase.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Algorithm.hpp"
-#include "kompute/Core.hpp"
-#include "kompute/Tensor.hpp"
-
-namespace kp {
-
-/**
- *  Base Operation which provides the high level interface that Kompute
- *  operations implement in order to perform a set of actions in the GPU.
- *
- *  Operations can perform actions on tensors, and optionally can also own an
- *  Algorithm with respective parameters. kp::Operations with kp::Algorithms
- *  would inherit from kp::OpBaseAlgo.
- */
-class OpBase
-{
-  public:
-    /**
-     * Default destructor for OpBase class. This OpBase destructor class should
-     * always be called to destroy and free owned resources unless it is
-     * intended to destroy the resources in the parent class.
-     */
-    virtual ~OpBase() { KP_LOG_DEBUG("Kompute OpBase destructor started"); }
-
-    /**
-     * The record function is intended to only send a record command or run
-     * commands that are expected to record operations that are to be submitted
-     * as a batch into the GPU.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void record(const vk::CommandBuffer& commandBuffer) = 0;
-
-    /**
-     * Pre eval is called before the Sequence has called eval and submitted the
-     * commands to the GPU for processing, and can be used to perform any
-     * per-eval setup steps required as the computation iteration begins. It's
-     * worth noting that there are situations where eval can be called multiple
-     * times, so the resources that are created should be idempotent in case
-     * it's called multiple times in a row.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0;
-
-    /**
-     * Post eval is called after the Sequence has called eval and submitted the
-     * commands to the GPU for processing, and can be used to perform any
-     * tear-down steps required as the computation iteration finishes. It's
-     * worth noting that there are situations where eval can be called multiple
-     * times, so the resources that are destroyed should not require a re-init
-     * unless explicitly provided by the user.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0;
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp b/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
deleted file mode 100644
index 50d8e97072412..0000000000000
--- a/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-class OpBufferSyncDevice : public OpBase
-{
-  public:
-    OpBufferSyncDevice(
-        vk::Buffer *primaryBuffer,
-        vk::Buffer *stagingBuffer,
-        vk::DeviceSize size);
-
-    /**
-     * Default destructor. This class does not manage memory so it won't be
-     * expecting the parent to perform a release.
-     */
-    ~OpBufferSyncDevice() override;
-
-    /**
-     * For device buffers, it records the copy command for the buffer to copy
-     * the data from its staging to device memory.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    void record(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any preEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any postEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
-
-  private:
-    vk::Buffer *mPrimaryBuffer;
-    vk::Buffer *mStagingBuffer;
-    vk::DeviceSize mSize;
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp b/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
deleted file mode 100644
index 7db9971991c59..0000000000000
--- a/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-class OpBufferSyncLocal : public OpBase
-{
-  public:
-    OpBufferSyncLocal(
-        vk::Buffer *primaryBuffer,
-        vk::Buffer *stagingBuffer,
-        vk::DeviceSize size);
-
-    /**
-     * Default destructor. This class does not manage memory so it won't be
-     * expecting the parent to perform a release.
-     */
-    ~OpBufferSyncLocal() override;
-
-    /**
-     * For device buffers, it records the copy command for the buffer to copy
-     * the data from its staging to device memory.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    void record(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any preEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any postEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
-
-  private:
-    vk::Buffer *mPrimaryBuffer;
-    vk::Buffer *mStagingBuffer;
-    vk::DeviceSize mSize;
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp b/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
deleted file mode 100644
index 4a232232397cf..0000000000000
--- a/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Algorithm.hpp"
-#include "kompute/Core.hpp"
-#include "kompute/Tensor.hpp"
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-/**
- * Operation that provides a general abstraction that simplifies the use of
- * algorithm and parameter components which can be used with shaders.
- * It exposes the pipeline barrier functionality specifically for memory
- * barriers that can be configured through the respective source and destination
- * masks
- */
-class OpMemoryBarrier : public OpBase
-{
-  public:
-    /**
-     * Constructor that stores tensors as well as memory barrier parameters to
-     * be used to create a pipeline barrier on the respective primary or staging
-     * tensor.
-     *
-     * @param tensors The tensors to apply the memory barriers on
-     * @param srcAccessMask The kp::AccessFlagBits for the source access mask
-     * @param dstAccessMask The kp::AccessFlagBits for the destination access
-     * mask
-     * @param srcStageMask The kp::PipelineStageFlagBits for the source stage
-     * mask
-     * @param dstStageMask The kp::PipelineStageFlagBits for the destination
-     * stage mask
-     * @param barrierOnPrimary Boolean to select primary or secondary buffers on
-     * tensors
-     */
-    OpMemoryBarrier(const std::vector<std::shared_ptr<Tensor>>& tensors,
-                    const vk::AccessFlagBits& srcAccessMask,
-                    const vk::AccessFlagBits& dstAccessMask,
-                    const vk::PipelineStageFlagBits& srcStageMask,
-                    const vk::PipelineStageFlagBits& dstStageMask,
-                    bool barrierOnPrimary = true);
-
-    /**
-     * Default destructor, which is in charge of destroying the reference to the
-     * tensors and all the relevant access / stage masks created
-     */
-    virtual ~OpMemoryBarrier() override;
-
-    /**
-     * This records the memory barrier with the access and stage masks provided
-     * across all relevant tensors.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void record(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any preEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any postEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
-
-  private:
-    const vk::AccessFlagBits mSrcAccessMask;
-    const vk::AccessFlagBits mDstAccessMask;
-    const vk::PipelineStageFlagBits mSrcStageMask;
-    const vk::PipelineStageFlagBits mDstStageMask;
-    const bool mBarrierOnPrimary;
-    const std::vector<std::shared_ptr<Tensor>> mTensors;
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpMult.hpp b/kompute/src/include/kompute/operations/OpMult.hpp
deleted file mode 100644
index f75ccc4fbb763..0000000000000
--- a/kompute/src/include/kompute/operations/OpMult.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include <fstream>
-
-#include "kompute/Core.hpp"
-
-#include "ShaderOpMult.hpp"
-
-#include "kompute/Algorithm.hpp"
-#include "kompute/Tensor.hpp"
-
-#include "kompute/operations/OpAlgoDispatch.hpp"
-
-namespace kp {
-
-/**
- * Operation that performs multiplication on two tensors and outpus on third
- * tensor.
- */
-class OpMult : public OpAlgoDispatch
-{
-  public:
-    /**
-     * Default constructor with parameters that provides the bare minimum
-     * requirements for the operations to be able to create and manage their
-     * sub-components.
-     *
-     * @param tensors Tensors that are to be used in this operation
-     * @param algorithm An algorithm that will be overridden with the OpMult
-     * shader data and the tensors provided which are expected to be 3
-     */
-    OpMult(std::vector<std::shared_ptr<Tensor>> tensors,
-           std::shared_ptr<Algorithm> algorithm)
-      : OpAlgoDispatch(algorithm)
-    {
-        KP_LOG_DEBUG("Kompute OpMult constructor with params");
-
-        if (tensors.size() != 3) {
-            throw std::runtime_error(
-              "Kompute OpMult expected 3 tensors but got " +
-              std::to_string(tensors.size()));
-        }
-
-        const std::vector<uint32_t> spirv = std::vector<uint32_t>(
-          SHADEROPMULT_COMP_SPV.begin(), SHADEROPMULT_COMP_SPV.end());
-
-        algorithm->rebuild<>(tensors, spirv);
-    }
-
-    /**
-     * Default destructor, which is in charge of destroying the algorithm
-     * components but does not destroy the underlying tensors
-     */
-    ~OpMult() override { KP_LOG_DEBUG("Kompute OpMult destructor started"); }
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorCopy.hpp b/kompute/src/include/kompute/operations/OpTensorCopy.hpp
deleted file mode 100644
index 968c1065a3388..0000000000000
--- a/kompute/src/include/kompute/operations/OpTensorCopy.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Core.hpp"
-
-#include "kompute/Tensor.hpp"
-
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-/**
- * Operation that copies the data from the first tensor to the rest of the
- * tensors provided, using a record command for all the vectors. This operation
- * does not own/manage the memory of the tensors passed to it. The operation
- * must only receive tensors of type
- */
-class OpTensorCopy : public OpBase
-{
-  public:
-    /**
-     * Default constructor with parameters that provides the core vulkan
-     * resources and the tensors that will be used in the operation.
-     *
-     * @param tensors Tensors that will be used to create in operation.
-     */
-    OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors);
-
-    /**
-     * Default destructor. This class does not manage memory so it won't be
-     * expecting the parent to perform a release.
-     */
-    ~OpTensorCopy() override;
-
-    /**
-     * Records the copy commands from the first tensor into all the other
-     * tensors provided. Also optionally records a barrier.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    void record(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any preEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Copies the local vectors for all the tensors to sync the data with the
-     * gpu.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
-
-  private:
-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mTensors;
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorFill.hpp b/kompute/src/include/kompute/operations/OpTensorFill.hpp
deleted file mode 100644
index 9a6bf131e88f0..0000000000000
--- a/kompute/src/include/kompute/operations/OpTensorFill.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Core.hpp"
-
-#include "kompute/Tensor.hpp"
-
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-/**
- * Operation that fills the tensor
- */
-class OpTensorFill : public OpBase
-{
-  public:
-    /**
-     * Default constructor with parameters that provides the core vulkan
-     * resources and the tensors that will be used in the operation.
-     *
-     * @param tensors Tensors that will be used to create in operation.
-     */
-    OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors);
-
-    /**
-     * Default destructor. This class does not manage memory so it won't be
-     * expecting the parent to perform a release.
-     */
-    ~OpTensorFill() override;
-
-    /**
-     * Records the fill command for tensor.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    void record(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any preEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any postEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
-
-  private:
-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mTensors;
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp b/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
deleted file mode 100644
index 9b39e490f774e..0000000000000
--- a/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Core.hpp"
-#include "kompute/Tensor.hpp"
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-/**
- * Operation that syncs tensor's device by mapping local data into the device
- * memory. For TensorTypes::eDevice it will use a record operation for the
- * memory to be syncd into GPU memory which means that the operation will be
- * done in sync with GPU commands. For TensorTypes::eHost it will only map the
- * data into host memory which will happen during preEval before the recorded
- * commands are dispatched.
- */
-class OpTensorSyncDevice : public OpBase
-{
-  public:
-    /**
-     * Default constructor with parameters that provides the core vulkan
-     * resources and the tensors that will be used in the operation. The tensos
-     * provided cannot be of type TensorTypes::eStorage.
-     *
-     * @param tensors Tensors that will be used to create in operation.
-     */
-    OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);
-
-    /**
-     * Default destructor. This class does not manage memory so it won't be
-     * expecting the parent to perform a release.
-     */
-    ~OpTensorSyncDevice() override;
-
-    /**
-     * For device tensors, it records the copy command for the tensor to copy
-     * the data from its staging to device memory.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    void record(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any preEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any postEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
-
-  private:
-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mTensors;
-    vk::Buffer *mPrimaryBuffer;
-    vk::Buffer *mStagingBuffer;
-    vk::DeviceSize mSize;
-};
-
-} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp b/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
deleted file mode 100644
index 4216003e530c5..0000000000000
--- a/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-#pragma once
-
-#include "kompute/Core.hpp"
-
-#include "kompute/Tensor.hpp"
-
-#include "kompute/operations/OpBase.hpp"
-
-namespace kp {
-
-/**
- * Operation that syncs tensor's local memory by mapping device data into the
- * local CPU memory. For TensorTypes::eDevice it will use a record operation
- * for the memory to be syncd into GPU memory which means that the operation
- * will be done in sync with GPU commands. For TensorTypes::eHost it will
- * only map the data into host memory which will happen during preEval before
- * the recorded commands are dispatched.
- */
-class OpTensorSyncLocal : public OpBase
-{
-  public:
-    /**
-     * Default constructor with parameters that provides the core vulkan
-     * resources and the tensors that will be used in the operation. The tensors
-     * provided cannot be of type TensorTypes::eStorage.
-     *
-     * @param tensors Tensors that will be used to create in operation.
-     */
-    OpTensorSyncLocal(const std::vector<std::shared_ptr<Tensor>>& tensors);
-
-    /**
-     * Default destructor. This class does not manage memory so it won't be
-     * expecting the parent to perform a release.
-     */
-    ~OpTensorSyncLocal() override;
-
-    /**
-     * For device tensors, it records the copy command for the tensor to copy
-     * the data from its device to staging memory.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    void record(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * Does not perform any preEval commands.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
-
-    /**
-     * For host tensors it performs the map command from the host memory into
-     * local memory.
-     *
-     * @param commandBuffer The command buffer to record the command into.
-     */
-    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
-
-  private:
-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mTensors;
-};
-
-} // End namespace kp
diff --git a/kompute/src/logger/CMakeLists.txt b/kompute/src/logger/CMakeLists.txt
deleted file mode 100644
index 1f8695acd2673..0000000000000
--- a/kompute/src/logger/CMakeLists.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-set(LOGGER_SOURCES Logger.cpp)
-
-add_library(kp_logger STATIC ${LOGGER_SOURCES})
-
-# Define log levels in code
-add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0)
-add_compile_definitions(KOMPUTE_LOG_LEVEL_DEBUG=1)
-add_compile_definitions(KOMPUTE_LOG_LEVEL_INFO=2)
-add_compile_definitions(KOMPUTE_LOG_LEVEL_WARN=3)
-add_compile_definitions(KOMPUTE_LOG_LEVEL_ERROR=4)
-add_compile_definitions(KOMPUTE_LOG_LEVEL_CRITICAL=5)
-add_compile_definitions(KOMPUTE_LOG_LEVEL_OFF=6)
-
-if(KOMPUTE_OPT_BUILD_PYTHON AND KOMPUTE_OPT_USE_SPDLOG)
-    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_PYTHON' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
-endif()
-
-if(KOMPUTE_OPT_ANDROID_BUILD AND KOMPUTE_OPT_USE_SPDLOG)
-    message(FATAL_ERROR "'KOMPUTE_OPT_ANDROID_BUILD' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
-endif()
-
-if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Trace")
-    set(KOMPUTE_OPT_LOG_LEVEL TRACE)
-    message(STATUS "Using log level Trace")
-elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Debug")
-    set(KOMPUTE_OPT_LOG_LEVEL DEBUG)
-    message(STATUS "Using log level Debug")
-elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Info")
-    set(KOMPUTE_OPT_LOG_LEVEL INFO)
-    message(STATUS "Using log level Info")
-elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Warn")
-    set(KOMPUTE_OPT_LOG_LEVEL WARN)
-    message(STATUS "Using log level Warn")
-elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Error")
-    set(KOMPUTE_OPT_LOG_LEVEL ERROR)
-    message(STATUS "Using log level Error")
-elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Critical")
-    set(KOMPUTE_OPT_LOG_LEVEL CRITICAL)
-    message(STATUS "Using log level Critical")
-elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off")
-    set(KOMPUTE_OPT_LOG_LEVEL OFF)
-    message(STATUS "Using log level Off")
-elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Default")
-    set(KOMPUTE_OPT_LOG_LEVEL $<IF:$<CONFIG:Debug>,DEBUG,INFO>)
-    message(STATUS "Setting KOMPUTE_OPT_LOG_LEVEL to according to the build type")
-else()
-    message(FATAL_ERROR "Log level '${KOMPUTE_OPT_LOG_LEVEL}' unknown, use -DKOMPUTE_OPT_LOG_LEVEL={Trace, Debug, Info, Warn, Error, Critical, Off, Default} to set it to a correct value.")
-endif()
-
-# Always make sure we define the Kompute log level independent of the Spdlog log level
-target_compile_definitions(kp_logger INTERFACE KOMPUTE_OPT_ACTIVE_LOG_LEVEL=KOMPUTE_LOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
-
-# Link depending on how the logger should be setup
-if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
-    if(KOMPUTE_OPT_USE_SPDLOG)
-        target_link_libraries(kp_logger PUBLIC spdlog::spdlog)
-        target_compile_definitions(spdlog INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
-        target_compile_definitions(kp_logger INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
-        message(STATUS "setting SPDLOG_ACTIVE_LEVEL to SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}")
-
-        if(KOMPUTE_OPT_SPDLOG_ASYNC_MODE)
-            target_compile_definitions(kp_logger INTERFACE KOMPUTE_SPDLOG_ASYNC_LOGGING=1)
-        endif()
-    else()
-        target_link_libraries(kp_logger PUBLIC fmt::fmt)
-    endif()
-endif()
diff --git a/kompute/src/logger/Logger.cpp b/kompute/src/logger/Logger.cpp
deleted file mode 100644
index 69df2b609610c..0000000000000
--- a/kompute/src/logger/Logger.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-#include "kompute/logger/Logger.hpp"
-
-#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
-#if !KOMPUTE_OPT_USE_SPDLOG
-#else
-#include <cassert>
-#include <iostream>
-#include <memory>
-#include <mutex>
-#include <spdlog/async.h>
-#include <spdlog/common.h>
-#include <spdlog/logger.h>
-#include <spdlog/sinks/stdout_color_sinks.h>
-#include <spdlog/spdlog.h>
-#include <string>
-#endif // !KOMPUTE_OPT_USE_SPDLOG
-
-namespace logger {
-#if !KOMPUTE_OPT_USE_SPDLOG
-
-void
-setupLogger()
-{
-}
-
-#else
-constexpr int THREAD_QUEUE_LENGTH = 8192;
-
-void
-setupLogger()
-{
-    // Ensure we setup the logger only once
-    static bool setup = false;
-    static std::mutex setupMutex{};
-    setupMutex.lock();
-    if (setup) {
-        setupMutex.unlock();
-        return;
-    }
-    setup = true;
-    setupMutex.unlock();
-
-    spdlog::init_thread_pool(THREAD_QUEUE_LENGTH, 1);
-    spdlog::sink_ptr console_sink =
-      std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
-#if SPDLOG_ACTIVE_LEVEL < SPDLOG_LEVEL_INFO
-    console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=21s] %v");
-#else
-    console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=15s] %v");
-#endif
-    std::vector<spdlog::sink_ptr> sinks{ console_sink };
-    // TODO: Add flag in compile flags
-    std::shared_ptr<spdlog::logger> logger =
-#if KOMPUTE_SPDLOG_ASYNC_LOGGING
-          std::make_shared<spdlog::async_logger>(
-            "",
-            sinks.begin(),
-            sinks.end(),
-            spdlog::thread_pool(),
-            spdlog::async_overflow_policy::block);
-#else
-          std::make_shared<spdlog::logger>(
-            "",
-            sinks.begin(),
-            sinks.end());
-#endif
-
-    logger->set_level(getLogLevel());
-
-    spdlog::set_default_logger(logger);
-}
-
-spdlog::level::level_enum
-getLogLevel()
-{
-#if SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_TRACE
-    return spdlog::level::trace;
-#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_DEBUG
-    return spdlog::level::debug;
-#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_INFO
-    return spdlog::level::info;
-#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_WARN
-    return spdlog::level::warn;
-#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_ERROR
-    return spdlog::level::error;
-#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_CRITICAL
-    return spdlog::level::critical;
-#else
-    return spdlog::level::off;
-#endif
-}
-
-void
-setLogLevel(const spdlog::level::level_enum level)
-{
-    spdlog::default_logger()->set_level(level);
-}
-#endif // !KOMPUTE_OPT_USE_SPDLOG
-} // namespace logger
-
-#endif
diff --git a/kompute/src/shaders/CMakeLists.txt b/kompute/src/shaders/CMakeLists.txt
deleted file mode 100644
index 901bf3e8a8af2..0000000000000
--- a/kompute/src/shaders/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# ######################
-cmake_minimum_required(VERSION 3.20)
-
-add_subdirectory(glsl)
\ No newline at end of file
diff --git a/kompute/src/shaders/glsl/CMakeLists.txt b/kompute/src/shaders/glsl/CMakeLists.txt
deleted file mode 100644
index 3101a2b17b751..0000000000000
--- a/kompute/src/shaders/glsl/CMakeLists.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# ######################
-cmake_minimum_required(VERSION 3.20)
-
-# Check if build shaders from source is enabled
-if(KOMPUTE_OPT_BUILD_SHADERS)
-    vulkan_compile_shader(INFILE ShaderOpMult.comp
-        OUTFILE ShaderOpMult.hpp
-        NAMESPACE "kp")
-
-    vulkan_compile_shader(INFILE ShaderLogisticRegression.comp
-        OUTFILE ShaderLogisticRegression.hpp
-        NAMESPACE "kp")
-else() # Else we will use our precompiled versions
-    add_custom_command(OUTPUT $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderOpMult.hpp.in $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp)
-    add_custom_command(OUTPUT $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderLogisticRegression.hpp.in $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp)
-endif()
-
-add_library(kp_shader INTERFACE "${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp"
-    "${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp")
-
-target_include_directories(kp_shader INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
-
-# Make sure we install shaders:
-install(FILES $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-install(FILES $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.comp b/kompute/src/shaders/glsl/ShaderLogisticRegression.comp
deleted file mode 100644
index 5a1c5d9486754..0000000000000
--- a/kompute/src/shaders/glsl/ShaderLogisticRegression.comp
+++ /dev/null
@@ -1,52 +0,0 @@
-#version 450
-
-layout (constant_id = 0) const float m = 0;
-
-layout (local_size_x = 1) in;
-
-layout(set = 0, binding = 0) buffer bxi { float xi[]; };
-layout(set = 0, binding = 1) buffer bxj { float xj[]; };
-layout(set = 0, binding = 2) buffer by { float y[]; };
-layout(set = 0, binding = 3) buffer bwin { float win[]; };
-layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
-layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
-layout(set = 0, binding = 6) buffer bbin { float bin[]; };
-layout(set = 0, binding = 7) buffer bbout { float bout[]; };
-layout(set = 0, binding = 8) buffer blout { float lout[]; };
-
-float sigmoid(float z) {
-    return 1.0 / (1.0 + exp(-z));
-}
-
-float inference(vec2 x, vec2 w, float b) {
-    // Compute the linear mapping function
-    float z = dot(w, x) + b;
-    // Calculate the y-hat with sigmoid
-    float yHat = sigmoid(z);
-    return yHat;
-}
-
-float calculateLoss(float yHat, float y) {
-    return -(y * log(yHat)  +  (1.0 - y) * log(1.0 - yHat));
-}
-
-void main() {
-    uint idx = gl_GlobalInvocationID.x;
-
-    vec2 wCurr = vec2(win[0], win[1]);
-    float bCurr = bin[0];
-
-    vec2 xCurr = vec2(xi[idx], xj[idx]);
-    float yCurr = y[idx];
-
-    float yHat = inference(xCurr, wCurr, bCurr);
-
-    float dZ = yHat - yCurr;
-    vec2 dW = (1. / m) * xCurr * dZ;
-    float dB = (1. / m) * dZ;
-    wouti[idx] = dW.x;
-    woutj[idx] = dW.y;
-    bout[idx] = dB;
-
-    lout[idx] = calculateLoss(yHat, yCurr);
-}
diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in b/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
deleted file mode 100644
index bfe7792c6c8d9..0000000000000
--- a/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
+++ /dev/null
@@ -1,310 +0,0 @@
-#pragma once
-#include <array>
-#include <cstdint>
-
-namespace kp {
-const std::array<uint32_t, 1204> SHADERLOGISTICREGRESSION_COMP_SPV = { 
-0x07230203, 0x00010000, 0x0008000a, 0x000000ae, 
-0x00000000, 0x00020011, 0x00000001, 0x0006000b, 
-0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 
-0x00000000, 0x0003000e, 0x00000000, 0x00000001, 
-0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, 
-0x00000000, 0x00000041, 0x00060010, 0x00000004, 
-0x00000011, 0x00000001, 0x00000001, 0x00000001, 
-0x00030003, 0x00000002, 0x000001c2, 0x00040005, 
-0x00000004, 0x6e69616d, 0x00000000, 0x00050005, 
-0x0000000a, 0x6d676973, 0x2864696f, 0x003b3166, 
-0x00030005, 0x00000009, 0x0000007a, 0x00080005, 
-0x00000012, 0x65666e69, 0x636e6572, 0x66762865, 
-0x66763b32, 0x31663b32, 0x0000003b, 0x00030005, 
-0x0000000f, 0x00000078, 0x00030005, 0x00000010, 
-0x00000077, 0x00030005, 0x00000011, 0x00000062, 
-0x00080005, 0x00000017, 0x636c6163, 0x74616c75, 
-0x736f4c65, 0x31662873, 0x3b31663b, 0x00000000, 
-0x00040005, 0x00000015, 0x74614879, 0x00000000, 
-0x00030005, 0x00000016, 0x00000079, 0x00030005, 
-0x00000021, 0x0000007a, 0x00040005, 0x00000027, 
-0x74614879, 0x00000000, 0x00040005, 0x00000028, 
-0x61726170, 0x0000006d, 0x00030005, 0x0000003e, 
-0x00786469, 0x00080005, 0x00000041, 0x475f6c67, 
-0x61626f6c, 0x766e496c, 0x7461636f, 0x496e6f69, 
-0x00000044, 0x00040005, 0x00000046, 0x72754377, 
-0x00000072, 0x00040005, 0x00000048, 0x6e697762, 
-0x00000000, 0x00040006, 0x00000048, 0x00000000, 
-0x006e6977, 0x00030005, 0x0000004a, 0x00000000, 
-0x00040005, 0x00000054, 0x72754362, 0x00000072, 
-0x00040005, 0x00000056, 0x6e696262, 0x00000000, 
-0x00040006, 0x00000056, 0x00000000, 0x006e6962, 
-0x00030005, 0x00000058, 0x00000000, 0x00040005, 
-0x0000005b, 0x72754378, 0x00000072, 0x00030005, 
-0x0000005d, 0x00697862, 0x00040006, 0x0000005d, 
-0x00000000, 0x00006978, 0x00030005, 0x0000005f, 
-0x00000000, 0x00030005, 0x00000064, 0x006a7862, 
-0x00040006, 0x00000064, 0x00000000, 0x00006a78, 
-0x00030005, 0x00000066, 0x00000000, 0x00040005, 
-0x0000006b, 0x72754379, 0x00000072, 0x00030005, 
-0x0000006d, 0x00007962, 0x00040006, 0x0000006d, 
-0x00000000, 0x00000079, 0x00030005, 0x0000006f, 
-0x00000000, 0x00040005, 0x00000073, 0x74614879, 
-0x00000000, 0x00040005, 0x00000074, 0x61726170, 
-0x0000006d, 0x00040005, 0x00000076, 0x61726170, 
-0x0000006d, 0x00040005, 0x00000078, 0x61726170, 
-0x0000006d, 0x00030005, 0x0000007b, 0x00005a64, 
-0x00030005, 0x0000007f, 0x00005764, 0x00030005, 
-0x00000080, 0x0000006d, 0x00030005, 0x00000086, 
-0x00004264, 0x00040005, 0x0000008b, 0x756f7762, 
-0x00006974, 0x00050006, 0x0000008b, 0x00000000, 
-0x74756f77, 0x00000069, 0x00030005, 0x0000008d, 
-0x00000000, 0x00040005, 0x00000093, 0x756f7762, 
-0x00006a74, 0x00050006, 0x00000093, 0x00000000, 
-0x74756f77, 0x0000006a, 0x00030005, 0x00000095, 
-0x00000000, 0x00040005, 0x0000009c, 0x756f6262, 
-0x00000074, 0x00050006, 0x0000009c, 0x00000000, 
-0x74756f62, 0x00000000, 0x00030005, 0x0000009e, 
-0x00000000, 0x00040005, 0x000000a3, 0x756f6c62, 
-0x00000074, 0x00050006, 0x000000a3, 0x00000000, 
-0x74756f6c, 0x00000000, 0x00030005, 0x000000a5, 
-0x00000000, 0x00040005, 0x000000a7, 0x61726170, 
-0x0000006d, 0x00040005, 0x000000a9, 0x61726170, 
-0x0000006d, 0x00040047, 0x00000041, 0x0000000b, 
-0x0000001c, 0x00040047, 0x00000047, 0x00000006, 
-0x00000004, 0x00050048, 0x00000048, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x00000048, 
-0x00000003, 0x00040047, 0x0000004a, 0x00000022, 
-0x00000000, 0x00040047, 0x0000004a, 0x00000021, 
-0x00000003, 0x00040047, 0x00000055, 0x00000006, 
-0x00000004, 0x00050048, 0x00000056, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x00000056, 
-0x00000003, 0x00040047, 0x00000058, 0x00000022, 
-0x00000000, 0x00040047, 0x00000058, 0x00000021, 
-0x00000006, 0x00040047, 0x0000005c, 0x00000006, 
-0x00000004, 0x00050048, 0x0000005d, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x0000005d, 
-0x00000003, 0x00040047, 0x0000005f, 0x00000022, 
-0x00000000, 0x00040047, 0x0000005f, 0x00000021, 
-0x00000000, 0x00040047, 0x00000063, 0x00000006, 
-0x00000004, 0x00050048, 0x00000064, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x00000064, 
-0x00000003, 0x00040047, 0x00000066, 0x00000022, 
-0x00000000, 0x00040047, 0x00000066, 0x00000021, 
-0x00000001, 0x00040047, 0x0000006c, 0x00000006, 
-0x00000004, 0x00050048, 0x0000006d, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x0000006d, 
-0x00000003, 0x00040047, 0x0000006f, 0x00000022, 
-0x00000000, 0x00040047, 0x0000006f, 0x00000021, 
-0x00000002, 0x00040047, 0x00000080, 0x00000001, 
-0x00000000, 0x00040047, 0x0000008a, 0x00000006, 
-0x00000004, 0x00050048, 0x0000008b, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x0000008b, 
-0x00000003, 0x00040047, 0x0000008d, 0x00000022, 
-0x00000000, 0x00040047, 0x0000008d, 0x00000021, 
-0x00000004, 0x00040047, 0x00000092, 0x00000006, 
-0x00000004, 0x00050048, 0x00000093, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x00000093, 
-0x00000003, 0x00040047, 0x00000095, 0x00000022, 
-0x00000000, 0x00040047, 0x00000095, 0x00000021, 
-0x00000005, 0x00040047, 0x0000009b, 0x00000006, 
-0x00000004, 0x00050048, 0x0000009c, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x0000009c, 
-0x00000003, 0x00040047, 0x0000009e, 0x00000022, 
-0x00000000, 0x00040047, 0x0000009e, 0x00000021, 
-0x00000007, 0x00040047, 0x000000a2, 0x00000006, 
-0x00000004, 0x00050048, 0x000000a3, 0x00000000, 
-0x00000023, 0x00000000, 0x00030047, 0x000000a3, 
-0x00000003, 0x00040047, 0x000000a5, 0x00000022, 
-0x00000000, 0x00040047, 0x000000a5, 0x00000021, 
-0x00000008, 0x00040047, 0x000000ad, 0x0000000b, 
-0x00000019, 0x00020013, 0x00000002, 0x00030021, 
-0x00000003, 0x00000002, 0x00030016, 0x00000006, 
-0x00000020, 0x00040020, 0x00000007, 0x00000007, 
-0x00000006, 0x00040021, 0x00000008, 0x00000006, 
-0x00000007, 0x00040017, 0x0000000c, 0x00000006, 
-0x00000002, 0x00040020, 0x0000000d, 0x00000007, 
-0x0000000c, 0x00060021, 0x0000000e, 0x00000006, 
-0x0000000d, 0x0000000d, 0x00000007, 0x00050021, 
-0x00000014, 0x00000006, 0x00000007, 0x00000007, 
-0x0004002b, 0x00000006, 0x00000019, 0x3f800000, 
-0x00040015, 0x0000003c, 0x00000020, 0x00000000, 
-0x00040020, 0x0000003d, 0x00000007, 0x0000003c, 
-0x00040017, 0x0000003f, 0x0000003c, 0x00000003, 
-0x00040020, 0x00000040, 0x00000001, 0x0000003f, 
-0x0004003b, 0x00000040, 0x00000041, 0x00000001, 
-0x0004002b, 0x0000003c, 0x00000042, 0x00000000, 
-0x00040020, 0x00000043, 0x00000001, 0x0000003c, 
-0x0003001d, 0x00000047, 0x00000006, 0x0003001e, 
-0x00000048, 0x00000047, 0x00040020, 0x00000049, 
-0x00000002, 0x00000048, 0x0004003b, 0x00000049, 
-0x0000004a, 0x00000002, 0x00040015, 0x0000004b, 
-0x00000020, 0x00000001, 0x0004002b, 0x0000004b, 
-0x0000004c, 0x00000000, 0x00040020, 0x0000004d, 
-0x00000002, 0x00000006, 0x0004002b, 0x0000004b, 
-0x00000050, 0x00000001, 0x0003001d, 0x00000055, 
-0x00000006, 0x0003001e, 0x00000056, 0x00000055, 
-0x00040020, 0x00000057, 0x00000002, 0x00000056, 
-0x0004003b, 0x00000057, 0x00000058, 0x00000002, 
-0x0003001d, 0x0000005c, 0x00000006, 0x0003001e, 
-0x0000005d, 0x0000005c, 0x00040020, 0x0000005e, 
-0x00000002, 0x0000005d, 0x0004003b, 0x0000005e, 
-0x0000005f, 0x00000002, 0x0003001d, 0x00000063, 
-0x00000006, 0x0003001e, 0x00000064, 0x00000063, 
-0x00040020, 0x00000065, 0x00000002, 0x00000064, 
-0x0004003b, 0x00000065, 0x00000066, 0x00000002, 
-0x0003001d, 0x0000006c, 0x00000006, 0x0003001e, 
-0x0000006d, 0x0000006c, 0x00040020, 0x0000006e, 
-0x00000002, 0x0000006d, 0x0004003b, 0x0000006e, 
-0x0000006f, 0x00000002, 0x00040032, 0x00000006, 
-0x00000080, 0x00000000, 0x0003001d, 0x0000008a, 
-0x00000006, 0x0003001e, 0x0000008b, 0x0000008a, 
-0x00040020, 0x0000008c, 0x00000002, 0x0000008b, 
-0x0004003b, 0x0000008c, 0x0000008d, 0x00000002, 
-0x0003001d, 0x00000092, 0x00000006, 0x0003001e, 
-0x00000093, 0x00000092, 0x00040020, 0x00000094, 
-0x00000002, 0x00000093, 0x0004003b, 0x00000094, 
-0x00000095, 0x00000002, 0x0004002b, 0x0000003c, 
-0x00000097, 0x00000001, 0x0003001d, 0x0000009b, 
-0x00000006, 0x0003001e, 0x0000009c, 0x0000009b, 
-0x00040020, 0x0000009d, 0x00000002, 0x0000009c, 
-0x0004003b, 0x0000009d, 0x0000009e, 0x00000002, 
-0x0003001d, 0x000000a2, 0x00000006, 0x0003001e, 
-0x000000a3, 0x000000a2, 0x00040020, 0x000000a4, 
-0x00000002, 0x000000a3, 0x0004003b, 0x000000a4, 
-0x000000a5, 0x00000002, 0x0006002c, 0x0000003f, 
-0x000000ad, 0x00000097, 0x00000097, 0x00000097, 
-0x00050036, 0x00000002, 0x00000004, 0x00000000, 
-0x00000003, 0x000200f8, 0x00000005, 0x0004003b, 
-0x0000003d, 0x0000003e, 0x00000007, 0x0004003b, 
-0x0000000d, 0x00000046, 0x00000007, 0x0004003b, 
-0x00000007, 0x00000054, 0x00000007, 0x0004003b, 
-0x0000000d, 0x0000005b, 0x00000007, 0x0004003b, 
-0x00000007, 0x0000006b, 0x00000007, 0x0004003b, 
-0x00000007, 0x00000073, 0x00000007, 0x0004003b, 
-0x0000000d, 0x00000074, 0x00000007, 0x0004003b, 
-0x0000000d, 0x00000076, 0x00000007, 0x0004003b, 
-0x00000007, 0x00000078, 0x00000007, 0x0004003b, 
-0x00000007, 0x0000007b, 0x00000007, 0x0004003b, 
-0x0000000d, 0x0000007f, 0x00000007, 0x0004003b, 
-0x00000007, 0x00000086, 0x00000007, 0x0004003b, 
-0x00000007, 0x000000a7, 0x00000007, 0x0004003b, 
-0x00000007, 0x000000a9, 0x00000007, 0x00050041, 
-0x00000043, 0x00000044, 0x00000041, 0x00000042, 
-0x0004003d, 0x0000003c, 0x00000045, 0x00000044, 
-0x0003003e, 0x0000003e, 0x00000045, 0x00060041, 
-0x0000004d, 0x0000004e, 0x0000004a, 0x0000004c, 
-0x0000004c, 0x0004003d, 0x00000006, 0x0000004f, 
-0x0000004e, 0x00060041, 0x0000004d, 0x00000051, 
-0x0000004a, 0x0000004c, 0x00000050, 0x0004003d, 
-0x00000006, 0x00000052, 0x00000051, 0x00050050, 
-0x0000000c, 0x00000053, 0x0000004f, 0x00000052, 
-0x0003003e, 0x00000046, 0x00000053, 0x00060041, 
-0x0000004d, 0x00000059, 0x00000058, 0x0000004c, 
-0x0000004c, 0x0004003d, 0x00000006, 0x0000005a, 
-0x00000059, 0x0003003e, 0x00000054, 0x0000005a, 
-0x0004003d, 0x0000003c, 0x00000060, 0x0000003e, 
-0x00060041, 0x0000004d, 0x00000061, 0x0000005f, 
-0x0000004c, 0x00000060, 0x0004003d, 0x00000006, 
-0x00000062, 0x00000061, 0x0004003d, 0x0000003c, 
-0x00000067, 0x0000003e, 0x00060041, 0x0000004d, 
-0x00000068, 0x00000066, 0x0000004c, 0x00000067, 
-0x0004003d, 0x00000006, 0x00000069, 0x00000068, 
-0x00050050, 0x0000000c, 0x0000006a, 0x00000062, 
-0x00000069, 0x0003003e, 0x0000005b, 0x0000006a, 
-0x0004003d, 0x0000003c, 0x00000070, 0x0000003e, 
-0x00060041, 0x0000004d, 0x00000071, 0x0000006f, 
-0x0000004c, 0x00000070, 0x0004003d, 0x00000006, 
-0x00000072, 0x00000071, 0x0003003e, 0x0000006b, 
-0x00000072, 0x0004003d, 0x0000000c, 0x00000075, 
-0x0000005b, 0x0003003e, 0x00000074, 0x00000075, 
-0x0004003d, 0x0000000c, 0x00000077, 0x00000046, 
-0x0003003e, 0x00000076, 0x00000077, 0x0004003d, 
-0x00000006, 0x00000079, 0x00000054, 0x0003003e, 
-0x00000078, 0x00000079, 0x00070039, 0x00000006, 
-0x0000007a, 0x00000012, 0x00000074, 0x00000076, 
-0x00000078, 0x0003003e, 0x00000073, 0x0000007a, 
-0x0004003d, 0x00000006, 0x0000007c, 0x00000073, 
-0x0004003d, 0x00000006, 0x0000007d, 0x0000006b, 
-0x00050083, 0x00000006, 0x0000007e, 0x0000007c, 
-0x0000007d, 0x0003003e, 0x0000007b, 0x0000007e, 
-0x00050088, 0x00000006, 0x00000081, 0x00000019, 
-0x00000080, 0x0004003d, 0x0000000c, 0x00000082, 
-0x0000005b, 0x0005008e, 0x0000000c, 0x00000083, 
-0x00000082, 0x00000081, 0x0004003d, 0x00000006, 
-0x00000084, 0x0000007b, 0x0005008e, 0x0000000c, 
-0x00000085, 0x00000083, 0x00000084, 0x0003003e, 
-0x0000007f, 0x00000085, 0x00050088, 0x00000006, 
-0x00000087, 0x00000019, 0x00000080, 0x0004003d, 
-0x00000006, 0x00000088, 0x0000007b, 0x00050085, 
-0x00000006, 0x00000089, 0x00000087, 0x00000088, 
-0x0003003e, 0x00000086, 0x00000089, 0x0004003d, 
-0x0000003c, 0x0000008e, 0x0000003e, 0x00050041, 
-0x00000007, 0x0000008f, 0x0000007f, 0x00000042, 
-0x0004003d, 0x00000006, 0x00000090, 0x0000008f, 
-0x00060041, 0x0000004d, 0x00000091, 0x0000008d, 
-0x0000004c, 0x0000008e, 0x0003003e, 0x00000091, 
-0x00000090, 0x0004003d, 0x0000003c, 0x00000096, 
-0x0000003e, 0x00050041, 0x00000007, 0x00000098, 
-0x0000007f, 0x00000097, 0x0004003d, 0x00000006, 
-0x00000099, 0x00000098, 0x00060041, 0x0000004d, 
-0x0000009a, 0x00000095, 0x0000004c, 0x00000096, 
-0x0003003e, 0x0000009a, 0x00000099, 0x0004003d, 
-0x0000003c, 0x0000009f, 0x0000003e, 0x0004003d, 
-0x00000006, 0x000000a0, 0x00000086, 0x00060041, 
-0x0000004d, 0x000000a1, 0x0000009e, 0x0000004c, 
-0x0000009f, 0x0003003e, 0x000000a1, 0x000000a0, 
-0x0004003d, 0x0000003c, 0x000000a6, 0x0000003e, 
-0x0004003d, 0x00000006, 0x000000a8, 0x00000073, 
-0x0003003e, 0x000000a7, 0x000000a8, 0x0004003d, 
-0x00000006, 0x000000aa, 0x0000006b, 0x0003003e, 
-0x000000a9, 0x000000aa, 0x00060039, 0x00000006, 
-0x000000ab, 0x00000017, 0x000000a7, 0x000000a9, 
-0x00060041, 0x0000004d, 0x000000ac, 0x000000a5, 
-0x0000004c, 0x000000a6, 0x0003003e, 0x000000ac, 
-0x000000ab, 0x000100fd, 0x00010038, 0x00050036, 
-0x00000006, 0x0000000a, 0x00000000, 0x00000008, 
-0x00030037, 0x00000007, 0x00000009, 0x000200f8, 
-0x0000000b, 0x0004003d, 0x00000006, 0x0000001a, 
-0x00000009, 0x0004007f, 0x00000006, 0x0000001b, 
-0x0000001a, 0x0006000c, 0x00000006, 0x0000001c, 
-0x00000001, 0x0000001b, 0x0000001b, 0x00050081, 
-0x00000006, 0x0000001d, 0x00000019, 0x0000001c, 
-0x00050088, 0x00000006, 0x0000001e, 0x00000019, 
-0x0000001d, 0x000200fe, 0x0000001e, 0x00010038, 
-0x00050036, 0x00000006, 0x00000012, 0x00000000, 
-0x0000000e, 0x00030037, 0x0000000d, 0x0000000f, 
-0x00030037, 0x0000000d, 0x00000010, 0x00030037, 
-0x00000007, 0x00000011, 0x000200f8, 0x00000013, 
-0x0004003b, 0x00000007, 0x00000021, 0x00000007, 
-0x0004003b, 0x00000007, 0x00000027, 0x00000007, 
-0x0004003b, 0x00000007, 0x00000028, 0x00000007, 
-0x0004003d, 0x0000000c, 0x00000022, 0x00000010, 
-0x0004003d, 0x0000000c, 0x00000023, 0x0000000f, 
-0x00050094, 0x00000006, 0x00000024, 0x00000022, 
-0x00000023, 0x0004003d, 0x00000006, 0x00000025, 
-0x00000011, 0x00050081, 0x00000006, 0x00000026, 
-0x00000024, 0x00000025, 0x0003003e, 0x00000021, 
-0x00000026, 0x0004003d, 0x00000006, 0x00000029, 
-0x00000021, 0x0003003e, 0x00000028, 0x00000029, 
-0x00050039, 0x00000006, 0x0000002a, 0x0000000a, 
-0x00000028, 0x0003003e, 0x00000027, 0x0000002a, 
-0x0004003d, 0x00000006, 0x0000002b, 0x00000027, 
-0x000200fe, 0x0000002b, 0x00010038, 0x00050036, 
-0x00000006, 0x00000017, 0x00000000, 0x00000014, 
-0x00030037, 0x00000007, 0x00000015, 0x00030037, 
-0x00000007, 0x00000016, 0x000200f8, 0x00000018, 
-0x0004003d, 0x00000006, 0x0000002e, 0x00000016, 
-0x0004003d, 0x00000006, 0x0000002f, 0x00000015, 
-0x0006000c, 0x00000006, 0x00000030, 0x00000001, 
-0x0000001c, 0x0000002f, 0x00050085, 0x00000006, 
-0x00000031, 0x0000002e, 0x00000030, 0x0004003d, 
-0x00000006, 0x00000032, 0x00000016, 0x00050083, 
-0x00000006, 0x00000033, 0x00000019, 0x00000032, 
-0x0004003d, 0x00000006, 0x00000034, 0x00000015, 
-0x00050083, 0x00000006, 0x00000035, 0x00000019, 
-0x00000034, 0x0006000c, 0x00000006, 0x00000036, 
-0x00000001, 0x0000001c, 0x00000035, 0x00050085, 
-0x00000006, 0x00000037, 0x00000033, 0x00000036, 
-0x00050081, 0x00000006, 0x00000038, 0x00000031, 
-0x00000037, 0x0004007f, 0x00000006, 0x00000039, 
-0x00000038, 0x000200fe, 0x00000039, 0x00010038 };
-} // namespace kp
-
-
diff --git a/kompute/src/shaders/glsl/ShaderOpMult.comp b/kompute/src/shaders/glsl/ShaderOpMult.comp
deleted file mode 100644
index d5486503760c1..0000000000000
--- a/kompute/src/shaders/glsl/ShaderOpMult.comp
+++ /dev/null
@@ -1,28 +0,0 @@
-#version 450
-
-layout(set = 0, binding = 0) buffer tensorLhs {
-   float valuesLhs[ ];
-};
-
-layout(set = 0, binding = 1) buffer tensorRhs {
-   float valuesRhs[ ];
-};
-
-layout(set = 0, binding = 2) buffer tensorOutput {
-   float valuesOutput[ ];
-};
-
-layout (constant_id = 0) const uint LEN_LHS = 0;
-layout (constant_id = 1) const uint LEN_RHS = 0;
-layout (constant_id = 2) const uint LEN_OUT = 0;
-
-layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
-
-void main() 
-{
-	uint index = gl_GlobalInvocationID.x;
-
-    valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
-}
-
-
diff --git a/kompute/src/shaders/glsl/ShaderOpMult.hpp.in b/kompute/src/shaders/glsl/ShaderOpMult.hpp.in
deleted file mode 100644
index 5af29c66d1214..0000000000000
--- a/kompute/src/shaders/glsl/ShaderOpMult.hpp.in
+++ /dev/null
@@ -1,101 +0,0 @@
-#pragma once
-#include <array>
-#include <cstdint>
-
-namespace kp {
-const std::array<uint32_t, 366> SHADEROPMULT_COMP_SPV = { 
-0x07230203, 0x00010000, 0x0008000a, 0x0000002e, 
-0x00000000, 0x00020011, 0x00000001, 0x0006000b, 
-0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 
-0x00000000, 0x0003000e, 0x00000000, 0x00000001, 
-0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, 
-0x00000000, 0x0000000b, 0x00060010, 0x00000004, 
-0x00000011, 0x00000001, 0x00000001, 0x00000001, 
-0x00030003, 0x00000002, 0x000001c2, 0x00040005, 
-0x00000004, 0x6e69616d, 0x00000000, 0x00040005, 
-0x00000008, 0x65646e69, 0x00000078, 0x00080005, 
-0x0000000b, 0x475f6c67, 0x61626f6c, 0x766e496c, 
-0x7461636f, 0x496e6f69, 0x00000044, 0x00060005, 
-0x00000012, 0x736e6574, 0x754f726f, 0x74757074, 
-0x00000000, 0x00070006, 0x00000012, 0x00000000, 
-0x756c6176, 0x754f7365, 0x74757074, 0x00000000, 
-0x00030005, 0x00000014, 0x00000000, 0x00050005, 
-0x00000019, 0x736e6574, 0x684c726f, 0x00000073, 
-0x00060006, 0x00000019, 0x00000000, 0x756c6176, 
-0x684c7365, 0x00000073, 0x00030005, 0x0000001b, 
-0x00000000, 0x00050005, 0x00000021, 0x736e6574, 
-0x6852726f, 0x00000073, 0x00060006, 0x00000021, 
-0x00000000, 0x756c6176, 0x68527365, 0x00000073, 
-0x00030005, 0x00000023, 0x00000000, 0x00040005, 
-0x00000029, 0x5f4e454c, 0x0053484c, 0x00040005, 
-0x0000002a, 0x5f4e454c, 0x00534852, 0x00040005, 
-0x0000002b, 0x5f4e454c, 0x0054554f, 0x00040047, 
-0x0000000b, 0x0000000b, 0x0000001c, 0x00040047, 
-0x00000011, 0x00000006, 0x00000004, 0x00050048, 
-0x00000012, 0x00000000, 0x00000023, 0x00000000, 
-0x00030047, 0x00000012, 0x00000003, 0x00040047, 
-0x00000014, 0x00000022, 0x00000000, 0x00040047, 
-0x00000014, 0x00000021, 0x00000002, 0x00040047, 
-0x00000018, 0x00000006, 0x00000004, 0x00050048, 
-0x00000019, 0x00000000, 0x00000023, 0x00000000, 
-0x00030047, 0x00000019, 0x00000003, 0x00040047, 
-0x0000001b, 0x00000022, 0x00000000, 0x00040047, 
-0x0000001b, 0x00000021, 0x00000000, 0x00040047, 
-0x00000020, 0x00000006, 0x00000004, 0x00050048, 
-0x00000021, 0x00000000, 0x00000023, 0x00000000, 
-0x00030047, 0x00000021, 0x00000003, 0x00040047, 
-0x00000023, 0x00000022, 0x00000000, 0x00040047, 
-0x00000023, 0x00000021, 0x00000001, 0x00040047, 
-0x00000029, 0x00000001, 0x00000000, 0x00040047, 
-0x0000002a, 0x00000001, 0x00000001, 0x00040047, 
-0x0000002b, 0x00000001, 0x00000002, 0x00040047, 
-0x0000002d, 0x0000000b, 0x00000019, 0x00020013, 
-0x00000002, 0x00030021, 0x00000003, 0x00000002, 
-0x00040015, 0x00000006, 0x00000020, 0x00000000, 
-0x00040020, 0x00000007, 0x00000007, 0x00000006, 
-0x00040017, 0x00000009, 0x00000006, 0x00000003, 
-0x00040020, 0x0000000a, 0x00000001, 0x00000009, 
-0x0004003b, 0x0000000a, 0x0000000b, 0x00000001, 
-0x0004002b, 0x00000006, 0x0000000c, 0x00000000, 
-0x00040020, 0x0000000d, 0x00000001, 0x00000006, 
-0x00030016, 0x00000010, 0x00000020, 0x0003001d, 
-0x00000011, 0x00000010, 0x0003001e, 0x00000012, 
-0x00000011, 0x00040020, 0x00000013, 0x00000002, 
-0x00000012, 0x0004003b, 0x00000013, 0x00000014, 
-0x00000002, 0x00040015, 0x00000015, 0x00000020, 
-0x00000001, 0x0004002b, 0x00000015, 0x00000016, 
-0x00000000, 0x0003001d, 0x00000018, 0x00000010, 
-0x0003001e, 0x00000019, 0x00000018, 0x00040020, 
-0x0000001a, 0x00000002, 0x00000019, 0x0004003b, 
-0x0000001a, 0x0000001b, 0x00000002, 0x00040020, 
-0x0000001d, 0x00000002, 0x00000010, 0x0003001d, 
-0x00000020, 0x00000010, 0x0003001e, 0x00000021, 
-0x00000020, 0x00040020, 0x00000022, 0x00000002, 
-0x00000021, 0x0004003b, 0x00000022, 0x00000023, 
-0x00000002, 0x00040032, 0x00000006, 0x00000029, 
-0x00000000, 0x00040032, 0x00000006, 0x0000002a, 
-0x00000000, 0x00040032, 0x00000006, 0x0000002b, 
-0x00000000, 0x0004002b, 0x00000006, 0x0000002c, 
-0x00000001, 0x0006002c, 0x00000009, 0x0000002d, 
-0x0000002c, 0x0000002c, 0x0000002c, 0x00050036, 
-0x00000002, 0x00000004, 0x00000000, 0x00000003, 
-0x000200f8, 0x00000005, 0x0004003b, 0x00000007, 
-0x00000008, 0x00000007, 0x00050041, 0x0000000d, 
-0x0000000e, 0x0000000b, 0x0000000c, 0x0004003d, 
-0x00000006, 0x0000000f, 0x0000000e, 0x0003003e, 
-0x00000008, 0x0000000f, 0x0004003d, 0x00000006, 
-0x00000017, 0x00000008, 0x0004003d, 0x00000006, 
-0x0000001c, 0x00000008, 0x00060041, 0x0000001d, 
-0x0000001e, 0x0000001b, 0x00000016, 0x0000001c, 
-0x0004003d, 0x00000010, 0x0000001f, 0x0000001e, 
-0x0004003d, 0x00000006, 0x00000024, 0x00000008, 
-0x00060041, 0x0000001d, 0x00000025, 0x00000023, 
-0x00000016, 0x00000024, 0x0004003d, 0x00000010, 
-0x00000026, 0x00000025, 0x00050085, 0x00000010, 
-0x00000027, 0x0000001f, 0x00000026, 0x00060041, 
-0x0000001d, 0x00000028, 0x00000014, 0x00000016, 
-0x00000017, 0x0003003e, 0x00000028, 0x00000027, 
-0x000100fd, 0x00010038 };
-} // namespace kp
-
-
diff --git a/kompute/src/shaders/hlsl/computeheadless.comp b/kompute/src/shaders/hlsl/computeheadless.comp
deleted file mode 100644
index ee3cd024f0466..0000000000000
--- a/kompute/src/shaders/hlsl/computeheadless.comp
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2020 Google LLC
-
-RWStructuredBuffer<uint> values : register(u0);
-[[vk::constant_id(0)]] const uint BUFFER_ELEMENTS = 32;
-
-uint fibonacci(uint n) {
-	if(n <= 1){
-		return n;
-	}
-	uint curr = 1;
-	uint prev = 1;
-	for(uint i = 2; i < n; ++i) {
-		uint temp = curr;
-		curr += prev;
-		prev = temp;
-	}
-	return curr;
-}
-
-[numthreads(1, 1, 1)]
-void main(uint3 GlobalInvocationID : SV_DispatchThreadID)
-{
-	uint index = GlobalInvocationID.x;
-	if (index >= BUFFER_ELEMENTS)
-		return;
-	values[index] = fibonacci(values[index]);
-}
-
-

From f7cb0a65ef7a6e5cf1ad318ca8514abef74771bd Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 13 Dec 2023 17:55:41 -0500
Subject: [PATCH 074/140] remove script with unclear purpose

---
 undump.py | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100755 undump.py

diff --git a/undump.py b/undump.py
deleted file mode 100755
index c3d8993be66c8..0000000000000
--- a/undump.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python3
-import struct
-import numpy as np
-from pathlib import Path
-
-def undump(fn):
-    with open(fn, 'rb') as df:
-        dims = struct.unpack('=QQQQ', df.read(8*4))
-        (dsz,) = struct.unpack('=Q', df.read(8))
-        ## assume f32
-        data = df.read(dsz)
-        data = [i for (i,) in struct.iter_unpack('=f', data)]
-        return np.array(data).reshape(dims).squeeze()
-
-if __name__ == '__main__':
-    for dfn in sorted(Path('.').glob('*.dump')):
-        darr = undump(dfn)
-        print(f'{dfn}: {darr.shape}\n{darr}')
-

From c8fd4ba8465db1fcf2af020f9b8ac0937e2721a2 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 14 Dec 2023 13:18:14 -0500
Subject: [PATCH 075/140] ggml : restore 'static' specifiers

---
 ggml.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/ggml.c b/ggml.c
index 2eaba0a828cfe..f743df1f3709a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -7104,7 +7104,7 @@ static void ggml_compute_forward_add_q_f32(
     }
 }
 
-void ggml_compute_forward_add(
+static void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -7682,7 +7682,7 @@ static void ggml_compute_forward_mul_f32(
     }
 }
 
-void ggml_compute_forward_mul(
+static void ggml_compute_forward_mul(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -8653,7 +8653,7 @@ static void ggml_compute_forward_elu(
 
 // ggml_compute_forward_relu
 
-void ggml_compute_forward_relu_f32(
+static void ggml_compute_forward_relu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -8677,7 +8677,7 @@ void ggml_compute_forward_relu_f32(
     }
 }
 
-void ggml_compute_forward_relu(
+static void ggml_compute_forward_relu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -8695,7 +8695,7 @@ void ggml_compute_forward_relu(
 
 // ggml_compute_forward_gelu
 
-void ggml_compute_forward_gelu_f32(
+static void ggml_compute_forward_gelu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -8736,7 +8736,7 @@ void ggml_compute_forward_gelu_f32(
     }
 }
 
-void ggml_compute_forward_gelu(
+static void ggml_compute_forward_gelu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -8813,7 +8813,7 @@ static void ggml_compute_forward_gelu_quick(
 
 // ggml_compute_forward_silu
 
-void ggml_compute_forward_silu_f32(
+static void ggml_compute_forward_silu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -8854,7 +8854,7 @@ void ggml_compute_forward_silu_f32(
     }
 }
 
-void ggml_compute_forward_silu(
+static void ggml_compute_forward_silu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -9029,7 +9029,7 @@ static void ggml_compute_forward_norm_f32(
     }
 }
 
-void ggml_compute_forward_norm(
+static void ggml_compute_forward_norm(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -9095,7 +9095,7 @@ static void ggml_compute_forward_rms_norm_f32(
     }
 }
 
-void ggml_compute_forward_rms_norm(
+static void ggml_compute_forward_rms_norm(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -9999,7 +9999,7 @@ static void ggml_compute_forward_scale_f32(
     }
 }
 
-void ggml_compute_forward_scale(
+static void ggml_compute_forward_scale(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -10120,7 +10120,7 @@ static void ggml_compute_forward_set(
 
 // ggml_compute_forward_cpy
 
-void ggml_compute_forward_cpy(
+static void ggml_compute_forward_cpy(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10264,7 +10264,7 @@ static void ggml_compute_forward_get_rows_f32(
     }
 }
 
-void ggml_compute_forward_get_rows(
+static void ggml_compute_forward_get_rows(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -10536,7 +10536,7 @@ static void ggml_compute_forward_diag_mask_f32(
     }
 }
 
-void ggml_compute_forward_diag_mask_inf(
+static void ggml_compute_forward_diag_mask_inf(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10570,7 +10570,7 @@ static void ggml_compute_forward_diag_mask_zero(
 
 // ggml_compute_forward_soft_max
 
-void ggml_compute_forward_soft_max_f32(
+static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -11359,7 +11359,7 @@ static void ggml_compute_forward_rope_f16(
     }
 }
 
-void ggml_compute_forward_rope(
+static void ggml_compute_forward_rope(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,

From f58f581ca8e4eef9bff1d98965e4a049a7a14cb5 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 15 Dec 2023 13:38:54 -0500
Subject: [PATCH 076/140] refactor llama.cpp modifications

---
 llama.cpp | 96 +++++++++++++++++++++----------------------------------
 1 file changed, 37 insertions(+), 59 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 82e1abbbd8b01..f7c6f26d271f0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -731,7 +731,7 @@ static std::string llama_format_win_err(DWORD err) {
 struct llama_buffer {
     void * data = NULL;
     size_t size = 0;
-#if defined(GGML_USE_KOMPUTE)
+#ifdef GGML_USE_KOMPUTE
     ggml_vk_memory memory;
 #endif
 
@@ -742,7 +742,7 @@ struct llama_buffer {
     void resize(size_t n) {
         llama_host_free(data);
 
-#if defined(GGML_USE_KOMPUTE)
+#ifdef GGML_USE_KOMPUTE
         if (ggml_vk_has_device()) {
             this->memory = ggml_vk_allocate(n);
             this->data = (uint8_t*)memory.data;
@@ -764,7 +764,7 @@ struct llama_buffer {
 
     ~llama_buffer() {
         if (data) {
-#if defined(GGML_USE_KOMPUTE)
+#ifdef GGML_USE_KOMPUTE
             if (ggml_vk_has_device()) {
                 ggml_vk_free_memory(memory);
                 data = NULL;
@@ -1517,7 +1517,6 @@ struct llama_context {
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
-
 };
 
 //
@@ -2113,7 +2112,7 @@ struct llama_model_loader {
             use_mmap = false;
         }
 
-#if defined(GGML_USE_KOMPUTE)
+#ifdef GGML_USE_KOMPUTE
         use_mmap = false;
 #endif
         this->use_mmap = use_mmap;
@@ -3790,8 +3789,7 @@ static struct ggml_tensor * llm_build_inp_embd(
         const llama_hparams & hparams,
           const llama_batch & batch,
          struct ggml_tensor * tok_embd,
-         const llm_build_cb & cb,
-        struct ggml_tensor ** to_device_tensor = nullptr) {
+         const llm_build_cb & cb) {
     const int64_t n_embd = hparams.n_embd;
 
     struct ggml_tensor * inpL;
@@ -3799,9 +3797,6 @@ static struct ggml_tensor * llm_build_inp_embd(
     if (batch.token) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
         cb(inp_tokens, "inp_tokens", -1);
-        if (to_device_tensor) {
-            *to_device_tensor = inp_tokens;
-        }
 
         inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
     } else {
@@ -3810,9 +3805,6 @@ static struct ggml_tensor * llm_build_inp_embd(
 #endif
 
         inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
-        if (to_device_tensor) {
-            *to_device_tensor = inpL;
-        }
     }
 
     return inpL;
@@ -3820,7 +3812,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
 // Persimmon: n_rot = n_embd_head/2
 // Other:     n_rot = n_embd_head
-static struct ggml_tensor * llm_build_k_shift(
+static void llm_build_k_shift(
       struct ggml_context * ctx,
       const llama_hparams & hparams,
       const llama_cparams & cparams,
@@ -3869,8 +3861,6 @@ static struct ggml_tensor * llm_build_k_shift(
         cb(tmp, "K_shifted", il);
         ggml_build_forward_expand(graph, tmp);
     }
-
-    return K_shift;
 }
 
 static void llm_build_kv_store(
@@ -4148,7 +4138,7 @@ struct llm_build_context {
 
     llama_buffer & buf_compute;
 
-#if defined(GGML_USE_KOMPUTE)
+#ifdef GGML_USE_KOMPUTE
     ggml_kompute_context * ctx_kompute;
 #endif
 
@@ -4187,7 +4177,7 @@ struct llm_build_context {
         do_rope_shift (worst_case || kv_self.has_shift),
         cb            (cb),
         buf_compute   (lctx.buf_compute)
-#if defined(GGML_USE_KOMPUTE)
+#ifdef GGML_USE_KOMPUTE
       , ctx_kompute   (lctx.ctx_kompute)
 #endif
         {
@@ -4220,9 +4210,8 @@ struct llm_build_context {
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
-        struct ggml_tensor * to_device_tensor = nullptr;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb, &to_device_tensor);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
@@ -4238,9 +4227,8 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
-        struct ggml_tensor * K_shift = nullptr;
         if (do_rope_shift) {
-            K_shift = llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -4336,21 +4324,6 @@ struct llm_build_context {
 
         ggml_build_forward_expand(gf, cur);
 
-#if defined(GGML_USE_KOMPUTE)
-        if (ctx_kompute) {
-            if (!ggml_vk_has_h2d_all(ctx_kompute)) {
-                ggml_vk_h2d_all(ctx_kompute);
-            } else {
-                ggml_vk_h2d_tensor(ctx_kompute, to_device_tensor);
-                ggml_vk_h2d_tensor(ctx_kompute, inp_pos);
-                ggml_vk_h2d_tensor(ctx_kompute, KQ_mask);
-                if (K_shift) {
-                    ggml_vk_h2d_tensor(ctx_kompute, K_shift);
-                }
-            }
-        }
-#endif
-
         return gf;
     }
 
@@ -4479,9 +4452,8 @@ struct llm_build_context {
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
-        struct ggml_tensor * to_device_tensor = nullptr;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb, &to_device_tensor);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
@@ -4497,9 +4469,8 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
-        struct ggml_tensor * K_shift = nullptr;
         if (do_rope_shift) {
-            K_shift = llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -4595,21 +4566,6 @@ struct llm_build_context {
 
         ggml_build_forward_expand(gf, cur);
 
-#if defined(GGML_USE_KOMPUTE)
-        if (ctx_kompute) {
-            if (!ggml_vk_has_h2d_all(ctx_kompute)) {
-                ggml_vk_h2d_all(ctx_kompute);
-            } else {
-                ggml_vk_h2d_tensor(ctx_kompute, to_device_tensor);
-                ggml_vk_h2d_tensor(ctx_kompute, inp_pos);
-                ggml_vk_h2d_tensor(ctx_kompute, KQ_mask);
-                if (K_shift) {
-                    ggml_vk_h2d_tensor(ctx_kompute, K_shift);
-                }
-            }
-        }
-#endif
-
         return gf;
     }
 
@@ -5627,6 +5583,10 @@ static struct ggml_cgraph * llama_build_graph(
     const bool do_offload = true; // TODO: set to false after finishing refactoring
 #endif
 
+#ifdef GGML_USE_KOMPUTE
+    const bool needs_h2d_all = lctx.ctx_kompute && !ggml_vk_has_h2d_all(lctx.ctx_kompute);
+#endif
+
     int n_non_view = 0; // number of non-view tensors that have been processed by the callback
 
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
@@ -5747,6 +5707,21 @@ static struct ggml_cgraph * llama_build_graph(
             n_non_view++;
         }
 
+#ifdef GGML_USE_KOMPUTE
+        if (lctx.ctx_kompute && !needs_h2d_all) {
+            const char * offload_tensors[] = {"inp_tokens", "inp_pos", "KQ_mask", "K_shift"};
+            for (auto off : offload_tensors) {
+                if (strcmp(name, off) == 0) {
+                    ggml_vk_h2d_tensor(lctx.ctx_kompute, cur);
+                    break;
+                }
+            }
+            if (strcmp(name, "inp_embd") == 0 && !batch.token) {
+                ggml_vk_h2d_tensor(lctx.ctx_kompute, cur);
+            }
+        }
+#endif
+
         //
         // offload layers
         //
@@ -5915,6 +5890,12 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+#ifdef GGML_USE_KOMPUTE
+        if (needs_h2d_all) {
+            ggml_vk_h2d_all(lctx.ctx_kompute);
+        }
+#endif
+
     llm.free();
 
     if (worst_case) {
@@ -6175,7 +6156,6 @@ static int llama_decode_internal(
         }
     }
 
-#if 0
     // extract embeddings
     if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
@@ -6183,7 +6163,6 @@ static int llama_decode_internal(
         embedding_out.resize(n_embd);
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
     }
-#endif
 
     // measure the performance only for the single-token evals
     if (n_tokens == 1) {
@@ -8622,7 +8601,6 @@ static int llama_apply_lora_from_file_internal(
 ) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
-
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);

From 2d2c76acc42215e2ca11cf2d0a9f788324df66df Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 29 Nov 2023 18:17:57 -0500
Subject: [PATCH 077/140] vulkan : fix free of stack addr in llama_buffer

---
 llama.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index f7c6f26d271f0..ad431a27e9c81 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -765,8 +765,10 @@ struct llama_buffer {
     ~llama_buffer() {
         if (data) {
 #ifdef GGML_USE_KOMPUTE
-            if (ggml_vk_has_device()) {
-                ggml_vk_free_memory(memory);
+            if (memory.data) {
+                if (ggml_vk_has_device()) {
+                    ggml_vk_free_memory(memory);
+                }
                 data = NULL;
                 return;
             }

From 807270621016865bb5fb136295e962b47d4bf06d Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 15 Dec 2023 16:23:24 -0500
Subject: [PATCH 078/140] kompute : always destroy Manager via the destructor

---
 ggml-kompute.cpp | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index f70231bedaef2..cc0adaf2f5ed4 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -63,17 +63,29 @@ struct ggml_kompute_context {
 // and consolidate the init functions and simplify object lifetime management. As it currently stands,
 // we *have* to have the kompute manager no matter what for device discovery, but the kompute context
 // is only created when a device is set and vulkan is explicitly turned on.
-ggml_kompute_context *s_kompute_context = nullptr;
-static kp::Manager *komputeManager() {
-    static kp::Manager *s_mgr = nullptr;
-    if (s_mgr && !s_mgr->hasInstance()) {
+static ggml_kompute_context *s_kompute_context = nullptr;
+
+class kompute_manager {
+    kp::Manager *s_mgr = nullptr;
+
+public:
+    kp::Manager *operator()() {
+        if (s_mgr && !s_mgr->hasInstance()) {
+            destroy();
+        }
+        if (!s_mgr) {
+            s_mgr = new kp::Manager;
+        }
+        return s_mgr;
+    }
+
+    void destroy() {
         delete s_mgr;
         s_mgr = nullptr;
     }
-    if (!s_mgr)
-        s_mgr = new kp::Manager;
-    return s_mgr;
-}
+};
+
+static kompute_manager komputeManager;
 
 #ifdef __linux__
 __attribute__((constructor))
@@ -257,7 +269,7 @@ bool ggml_vk_init_device(int device) {
 bool ggml_vk_free_device() {
     if (!ggml_vk_has_device())
         return false;
-    komputeManager()->destroy();
+    komputeManager.destroy();
     // FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact
     // the llama_free(ctx) destroys this memory and we just set the singleton to nullptr here which
     // is very brittle

From 44b1a97a15dd642c3938de94e5eeea5aabc4fc87 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 11 Dec 2023 13:04:43 -0500
Subject: [PATCH 079/140] kompute : fix -Wunused-private-field warnings from
 clang

Fixes nomic-ai/gpt4all#1722

(cherry picked from commit 3cd95323d995af7df4b42f6461f3d919a9267dad)
---
 kompute | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kompute b/kompute
index 2d0a8abc64e90..4565194ed7c32 160000
--- a/kompute
+++ b/kompute
@@ -1 +1 @@
-Subproject commit 2d0a8abc64e90a0956390aa3f1854cb6d48141db
+Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306

From 904c563dbc4620ac8f1f085a26441ecca68437a4 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 10 Jan 2024 12:12:59 -0500
Subject: [PATCH 080/140] sync xxd commands with GPT4All llama.cpp.cmake

---
 CMakeLists.txt | 42 +++++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f906de40c54c..78c19da2baef6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -440,19 +440,35 @@ if (LLAMA_KOMPUTE)
         string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
         set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
         message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
-        add_custom_command(
-          OUTPUT ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-          COMMAND xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-          DEPENDS ${spv_file}
-          COMMENT "Converting to hpp: ${FILE_NAME}"
-        )
+        if(CMAKE_GENERATOR MATCHES "Visual Studio")
+            add_custom_command(
+              OUTPUT ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              DEPENDS ${spv_file} xxd
+              COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
+            )
+        else()
+            add_custom_command(
+              OUTPUT ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              DEPENDS ${spv_file} xxd
+              COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
+            )
+        endif()
       endforeach()
     endfunction()
 

From 298d6eec09e1da30807a928dc6b113c5d4669176 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Tue, 9 Jan 2024 16:24:10 -0500
Subject: [PATCH 081/140] kompute : initial attempt at ggml-backend v2 support

---
 ggml-kompute.cpp | 179 +++++++++++++++++++++++++++++++++++++++++++++--
 ggml-kompute.h   |  16 +++++
 llama.cpp        | 148 ++++-----------------------------------
 3 files changed, 201 insertions(+), 142 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index cc0adaf2f5ed4..be6b7e9d4da05 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1,5 +1,7 @@
-#include "ggml-kompute.h"
 #include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-kompute.h"
 
 // These are generated at build time by cmake custom command
 #include "shaderop_scale.h"
@@ -488,16 +490,28 @@ void ggml_vk_free_memory(ggml_vk_memory &memory)
 }
 
 static
-decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
+ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
+    // compatibility with ggml-backend
+    if (t->buffer && t->buffer->buft == ggml_backend_kompute_buffer_type()) {
+        ggml_vk_memory * buf_ctx = (ggml_vk_memory *) t->buffer->context;
+
+        const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
+
+        GGML_ASSERT(ioffs >= 0 && ioffs + ggml_nbytes(t) <= (int64_t)t->buffer->size);
+
+        offset = (uint64_t)ioffs;
+        return buf_ctx;
+     }
+
     for (auto it = ctx->buffers.begin(); ; it++) {
         if (it == ctx->buffers.end()) {
             fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data);
-            return it;
+            return nullptr;
         }
         if (it->data <= t->data &&
                 reinterpret_cast<intptr_t>(it->data) + it->size >= (reinterpret_cast<intptr_t>(t->data) + ggml_nbytes(t))) {
             offset = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(it->data);
-            return it;
+            return &*it;
         }
     }
 }
@@ -505,8 +519,8 @@ decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggm
 static
 const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) {
     uint64_t originalOffset = 0;
-    auto res = ggml_vk_find_tensor(ctx, t, originalOffset);
-    if (res == ctx->buffers.end()) {
+    auto * res = ggml_vk_find_tensor(ctx, t, originalOffset);
+    if (!res) {
         static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
         return nullTensor;
     }
@@ -1629,3 +1643,156 @@ kp::TensorT<uint8_t>::dataType()
 {
     return TensorDataTypes::eUnsignedInt;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
+    GGML_UNUSED(buffer);
+    return "Kompute";
+}
+
+static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    auto * memory = (ggml_vk_memory *)buffer->context;
+    if (ggml_vk_has_device()) {
+        ggml_vk_free_memory(*memory);
+    }
+    delete memory;
+}
+
+static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return ((ggml_vk_memory *)buffer->context)->data;
+}
+
+static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+    ggml_vk_h2d_buffer(*(ggml_vk_memory *)buffer->context);
+}
+
+static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_vk_d2h_buffer(*(ggml_vk_memory *)buffer->context);
+    memcpy(data, (const char *)tensor->data + offset, size);
+}
+
+static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    auto * memory = (ggml_vk_memory *)buffer->context;
+    memset(memory->data, value, buffer->size);
+    ggml_vk_h2d_buffer(*memory);
+}
+
+static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
+    /* .get_name        = */ ggml_backend_kompute_buffer_get_name,
+    /* .free_buffer     = */ ggml_backend_kompute_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_kompute_buffer_get_base,
+    /* .init_tensor     = */ NULL,
+    /* .set_tensor      = */ ggml_backend_kompute_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_kompute_buffer_get_tensor,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_kompute_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// default buffer type
+
+static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return "Kompute";
+}
+
+static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
+    return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
+}
+
+static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return 32;
+}
+
+static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    GGML_UNUSED(buft);
+    return ggml_backend_is_kompute(backend);
+}
+
+ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = {
+        /* .iface = */ {
+            /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
+            /* .is_host          = */ NULL,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_buffer_type_kompute;
+}
+
+// backend
+
+static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+    return "Kompute";
+}
+
+static void ggml_backend_kompute_free(ggml_backend_t backend) {
+    struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context;
+    ggml_vk_free_device();
+    ggml_vk_free(ctx);
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+    return ggml_backend_kompute_buffer_type();
+}
+
+static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    auto * ctx = (ggml_kompute_context *)backend->context;
+    ggml_vk_graph_compute(ctx, cgraph);
+    return true;
+}
+
+static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    GGML_UNUSED(backend);
+    GGML_UNUSED(op);
+    return true; // TODO: implement
+}
+
+static struct ggml_backend_i kompute_backend_i = {
+    /* .get_name                = */ ggml_backend_kompute_name,
+    /* .free                    = */ ggml_backend_kompute_free,
+    /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
+    /* .supports_op             = */ ggml_backend_kompute_supports_op,
+};
+
+ggml_backend_t ggml_backend_kompute_init() {
+    if (!ggml_vk_has_device()) {
+        fprintf(stderr, "%s: error: device was not initialized\n", __func__);
+        return nullptr;
+    }
+
+    struct ggml_kompute_context * ctx = ggml_vk_init();
+
+    ggml_backend_t kompute_backend = new ggml_backend {
+        /* .interface = */ kompute_backend_i,
+        /* .context   = */ ctx,
+    };
+
+    return kompute_backend;
+}
+
+bool ggml_backend_is_kompute(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_kompute_name;
+}
diff --git a/ggml-kompute.h b/ggml-kompute.h
index ac8a4d4a0bc39..f895dc545d9d8 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "ggml-backend.h"
+
 #include <cstddef>
 #include <vector>
 #include <string>
@@ -55,3 +57,17 @@ void ggml_vk_d2h_all(struct ggml_kompute_context * ctx);
 void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
+
+//
+// backend API
+// user-code should use only these functions
+//
+
+// forward declaration
+typedef struct ggml_backend * ggml_backend_t;
+
+GGML_API ggml_backend_t ggml_backend_kompute_init(void);
+
+GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
diff --git a/llama.cpp b/llama.cpp
index d4668a221729f..98ffe1ec82c4f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -760,63 +760,6 @@ static std::string llama_format_win_err(DWORD err) {
 }
 #endif
 
-// TODO(jared): remove this
-struct llama_buffer {
-    void * data = NULL;
-    size_t size = 0;
-#ifdef GGML_USE_KOMPUTE
-    ggml_vk_memory memory;
-#endif
-
-    // fallback to malloc / free
-    // useful in cases where CUDA can try to allocate PINNED memory
-    bool fallback = false;
-
-    void resize(size_t n) {
-        llama_host_free(data);
-
-#ifdef GGML_USE_KOMPUTE
-        if (ggml_vk_has_device()) {
-            this->memory = ggml_vk_allocate(n);
-            this->data = (uint8_t*)memory.data;
-            this->size = n;
-            return;
-        }
-#endif
-        data = llama_host_malloc(n);
-        if (!data) {
-            fallback = true;
-            data = malloc(n);
-        } else {
-            fallback = false;
-        }
-
-        GGML_ASSERT(data);
-        size = n;
-    }
-
-    ~llama_buffer() {
-        if (data) {
-#ifdef GGML_USE_KOMPUTE
-            if (memory.data) {
-                if (ggml_vk_has_device()) {
-                    ggml_vk_free_memory(memory);
-                }
-                data = NULL;
-                return;
-            }
-#endif
-            if (fallback) { // NOLINT
-                free(data);
-            } else {
-                llama_host_free(data);
-            }
-        }
-
-        data = NULL;
-    }
-};
-
 template <typename T>
 struct no_init {
     T value;
@@ -1288,6 +1231,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type();
 #endif
 
     if (buft == nullptr) {
@@ -1721,11 +1666,6 @@ struct llama_context {
     // allocator for the input tensors
     ggml_tallocr * alloc = nullptr;
 
-// TODO(jared): remove this
-#if defined(GGML_USE_KOMPUTE)
-    ggml_kompute_context * ctx_kompute = NULL;
-#endif
-
     // temporary buffer for copying data to/from the backend
     std::vector<no_init<uint8_t>> buf_copy;
 
@@ -4363,10 +4303,6 @@ struct llm_build_context {
 
     std::vector<uint8_t> & buf_compute_meta;
 
-#ifdef GGML_USE_KOMPUTE
-    ggml_kompute_context * ctx_kompute;
-#endif
-
     struct ggml_context * ctx0 = nullptr;
 
     // TODO: consider making the entire interface noexcept
@@ -4406,10 +4342,6 @@ struct llm_build_context {
         do_rope_shift    (worst_case || kv_self.has_shift),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta)
-// TODO(jared): remove this
-#ifdef GGML_USE_KOMPUTE
-      , ctx_kompute   (lctx.ctx_kompute)
-#endif
         {
             // all initializations should be done in init()
         }
@@ -6031,11 +5963,6 @@ static struct ggml_cgraph * llama_build_graph(
     bool alloc_inp_KQ_mask  = false;
     bool alloc_inp_K_shift  = false;
 
-    // TODO(jared): do we still need this?
-#ifdef GGML_USE_KOMPUTE
-    const bool needs_h2d_all = lctx.ctx_kompute && !ggml_vk_has_h2d_all(lctx.ctx_kompute);
-#endif
-
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
@@ -6152,22 +6079,6 @@ static struct ggml_cgraph * llama_build_graph(
 
             alloc_inp_K_shift = true;
         }
-
-        // TODO(jared): this shouldn't be needed anymore
-#ifdef GGML_USE_KOMPUTE
-        if (lctx.ctx_kompute && !needs_h2d_all) {
-            const char * offload_tensors[] = {"inp_tokens", "inp_pos", "KQ_mask", "K_shift"};
-            for (auto off : offload_tensors) {
-                if (strcmp(name, off) == 0) {
-                    ggml_vk_h2d_tensor(lctx.ctx_kompute, cur);
-                    break;
-                }
-            }
-            if (strcmp(name, "inp_embd") == 0 && !batch.token) {
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, cur);
-            }
-        }
-#endif
     };
 
     struct ggml_cgraph * result = NULL;
@@ -6233,12 +6144,6 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
-#ifdef GGML_USE_KOMPUTE
-        if (needs_h2d_all) {
-            ggml_vk_h2d_all(lctx.ctx_kompute);
-        }
-#endif
-
     llm.free();
 
     return result;
@@ -6379,25 +6284,6 @@ static int llama_decode_internal(
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
     }
-#elif defined(GGML_USE_KOMPUTE)
-    if (lctx.ctx_kompute && n_tokens == 1) {
-        ggml_vk_graph_compute(lctx.ctx_kompute, gf);
-        ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
-    } else {
-        if (lctx.ctx_kompute) {
-            for (int il = 0; il < hparams.n_layer; ++il) {
-                ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k_l[il]);
-                ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v_l[il]);
-            }
-        }
-        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
-        if (lctx.ctx_kompute) {
-            for (int il = 0; il < hparams.n_layer; ++il) {
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k_l[il]);
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v_l[il]);
-            }
-        }
-    }
 #endif
 
     if (lctx.backend_cpu != nullptr) {
@@ -9464,6 +9350,16 @@ struct llama_context * llama_new_context_with_model(
                 }
             }
         }
+#elif defined(GGML_USE_KOMPUTE)
+        if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
+            auto * backend = ggml_backend_kompute_init();
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
@@ -9547,23 +9443,6 @@ struct llama_context * llama_new_context_with_model(
                         ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
             }
         }
-
-        // TODO(jared): remove this
-#if defined(GGML_USE_KOMPUTE)
-        if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
-            // this allocates all Vulkan resources and memory buffers
-            ctx->ctx_kompute = ggml_vk_init();
-
-            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
-
-            printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
-
-            ggml_vk_add_buffer(ctx->ctx_kompute, "data", ctx->model.buf.memory);
-            ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.memory);
-            ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->kv_self.buf.memory);
-            ggml_vk_add_buffer(ctx->ctx_kompute, "alloc", ctx->buf_alloc.memory);
-        }
-#endif
     }
 
 #ifdef GGML_USE_MPI
@@ -9584,9 +9463,6 @@ struct llama_context * llama_new_context_with_model(
 }
 
 void llama_free(struct llama_context * ctx) {
-#ifdef GGML_USE_KOMPUTE
-    ggml_vk_free(ctx->ctx_kompute);
-#endif
     delete ctx;
 #ifdef GGML_USE_KOMPUTE
     ggml_vk_free_device();

From 5f660dada8d028e7b96f6dc1b80b7055126a3a1e Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 10 Jan 2024 13:44:34 -0500
Subject: [PATCH 082/140] fix assertion failure

---
 ggml-kompute.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index be6b7e9d4da05..6a75432bbbac3 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -491,9 +491,11 @@ void ggml_vk_free_memory(ggml_vk_memory &memory)
 
 static
 ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
     // compatibility with ggml-backend
-    if (t->buffer && t->buffer->buft == ggml_backend_kompute_buffer_type()) {
-        ggml_vk_memory * buf_ctx = (ggml_vk_memory *) t->buffer->context;
+    if (buffer && buffer->buft == ggml_backend_kompute_buffer_type()) {
+        ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context;
 
         const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
 

From 070919dbf7a24bedd9636be0704f701eaf41a9eb Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 10 Jan 2024 16:14:03 -0500
Subject: [PATCH 083/140] attempt to get test-backend-ops working

---
 ggml-backend.c   |  6 +++
 ggml-kompute.cpp | 99 +++++++++++++++++++++++++++++++++++++++++++++---
 ggml-kompute.h   |  8 ++++
 3 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 4c2d8b0b26f18..2c43aa9bb4934 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -342,6 +342,12 @@ static void ggml_backend_registry_init(void) {
     extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
+
+#ifdef GGML_USE_KOMPUTE
+    extern ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data);
+    extern ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
+    ggml_backend_register("Kompute", ggml_backend_reg_kompute_init, ggml_backend_kompute_buffer_type(), NULL);
+#endif
 }
 
 void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 6a75432bbbac3..c4128f0704fc8 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -499,7 +499,7 @@ ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct g
 
         const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
 
-        GGML_ASSERT(ioffs >= 0 && ioffs + ggml_nbytes(t) <= (int64_t)t->buffer->size);
+        GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)t->buffer->size);
 
         offset = (uint64_t)ioffs;
         return buf_ctx;
@@ -1344,6 +1344,82 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
     ggml_vk_cpy<2, 4>(spirv, std::forward<Args>(args)...);
 }
 
+static bool ggml_kompute_supports_op(const struct ggml_tensor * op) {
+    switch (op->type) {
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            break;
+        default:
+            return false;
+    }
+
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                    return true;
+                default:
+                    ;
+            }
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_CONCAT:
+        case GGML_OP_ADD:
+        case GGML_OP_ACC:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_NORM:
+        case GGML_OP_ALIBI:
+        case GGML_OP_ROPE:
+        case GGML_OP_IM2COL:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_LEAKY_RELU:
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            return true;
+        case GGML_OP_DUP:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    break;
+                default:
+                    return false;
+            }
+            switch (op->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    break;
+                default:
+                    return false;
+            }
+            return true;
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_GET_ROWS:
+            return op->ne[3] == 1;
+        default:
+            ;
+    }
+    return false;
+}
+
 void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
     const int n_seq = 8;
 
@@ -1362,7 +1438,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
         auto& seq = *sequences[seq_idx];
 
         const int node_start = (seq_idx + 0) * n_nodes_per_seq;
-        const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq;
+        const int node_end   = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes);
 
         for (int i = node_start; i < node_end; ++i) {
             struct ggml_tensor * src0 = gf->nodes[i]->src[0];
@@ -1381,6 +1457,11 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     break;
             }
 
+            if (!ggml_kompute_supports_op(dst)) {
+                 fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
+                 GGML_ASSERT(!"unsupported op");
+             }
+
             const int32_t ne00 = src0 ? src0->ne[0] : 0;
             const int32_t ne01 = src0 ? src0->ne[1] : 0;
             const int32_t ne02 = src0 ? src0->ne[2] : 0;
@@ -1717,7 +1798,7 @@ static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffe
     return ggml_backend_is_kompute(backend);
 }
 
-ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void) {
+ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = {
         /* .iface = */ {
             /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
@@ -1760,8 +1841,7 @@ static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct gg
 
 static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     GGML_UNUSED(backend);
-    GGML_UNUSED(op);
-    return true; // TODO: implement
+    return ggml_kompute_supports_op(op);
 }
 
 static struct ggml_backend_i kompute_backend_i = {
@@ -1798,3 +1878,12 @@ ggml_backend_t ggml_backend_kompute_init() {
 bool ggml_backend_is_kompute(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_kompute_name;
 }
+
+extern "C" ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data);
+
+ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
+    GGML_UNUSED(params);
+    GGML_UNUSED(user_data);
+    ggml_vk_init_device(0, "gpu");
+    return ggml_backend_kompute_init();
+}
diff --git a/ggml-kompute.h b/ggml-kompute.h
index f895dc545d9d8..e8d2d396b780d 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -63,6 +63,10 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
 // user-code should use only these functions
 //
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;
 
@@ -71,3 +75,7 @@ GGML_API ggml_backend_t ggml_backend_kompute_init(void);
 GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
 
 GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
+
+#ifdef __cplusplus
+}
+#endif

From cad72e1252a2180a4f1ad93169c4f059faeac808 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 17 Jan 2024 10:09:27 -0500
Subject: [PATCH 084/140] add sanity check and fix kompute teardown order

---
 ggml-kompute.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index c4128f0704fc8..c66eb9ab39a72 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -301,6 +301,7 @@ ggml_vk_device ggml_vk_current_device() {
 }
 
 ggml_kompute_context *ggml_vk_init() {
+    GGML_ASSERT(s_kompute_context == nullptr);
     s_kompute_context = new ggml_kompute_context;
     return s_kompute_context;
 }
@@ -1823,8 +1824,8 @@ static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
 
 static void ggml_backend_kompute_free(ggml_backend_t backend) {
     struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context;
-    ggml_vk_free_device();
     ggml_vk_free(ctx);
+    ggml_vk_free_device();
     delete backend;
 }
 

From 76474a7c0d1835ecfa00a4e0333b7fdb1698eb83 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 17 Jan 2024 13:47:03 -0500
Subject: [PATCH 085/140] kompute : ignore exceptions in
 ggml_vk_available_devices (#12)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
---
 ggml-kompute.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index c66eb9ab39a72..e5d1c2c7949b2 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -147,9 +147,15 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
     if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
         return results;
 
-    std::vector<vk::PhysicalDevice> physicalDevices = komputeManager()->listDevices();
-    uint32_t deviceCount = physicalDevices.size();
+    std::vector<vk::PhysicalDevice> physicalDevices;
+    try {
+        physicalDevices = komputeManager()->listDevices();
+    } catch (vk::SystemError & err) {
+        std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n";
+        return results;
+    }
 
+    uint32_t deviceCount = physicalDevices.size();
     if (deviceCount == 0)
         return results;
 

From d6bd4716937c4b6092cec0e5a09c72b442e7c602 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 18 Jan 2024 18:49:39 +0200
Subject: [PATCH 086/140] kompute : fix rope_f32 and scale ops (#5008)

---
 ggml-kompute.cpp                 |  3 ++-
 kompute-shaders/op_rope_f32.comp | 38 +++++++++++++++++++-------------
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index e5d1c2c7949b2..63e9b38784705 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1540,7 +1540,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_SCALE:
                     {
-                        const float scale = *(const float *) src1->data;
+                        float scale; memcpy(&scale, dst->op_params, sizeof(float));
+
                         ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
                     } break;
                 case GGML_OP_UNARY:
diff --git a/kompute-shaders/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp
index 104ae0ba4836c..2adf5eb4e3200 100644
--- a/kompute-shaders/op_rope_f32.comp
+++ b/kompute-shaders/op_rope_f32.comp
@@ -35,31 +35,39 @@ void main() {
             const float x0 = inA[src];
             const float x1 = inA[src+1];
 
-            out_[dst_data] = x0*cos_theta - x1*sin_theta;
+            out_[dst_data]   = x0*cos_theta - x1*sin_theta;
             out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
         }
     } else {
         const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) {
-            for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-                const uint cur_rot = ib * pcs.n_dims + ic;
+        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
+            const uint cur_rot = ic;
 
-                float cos_theta, sin_theta;
-                rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+            float cos_theta, sin_theta;
+            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
 
-                theta *= theta_scale;
+            theta *= theta_scale;
 
-                const uint i0 = ib*pcs.n_dims + ic/2;
+            const uint i0 = ic/2;
 
-                const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-                const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
+            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = inA[src];
+            const float x1 = inA[src+pcs.n_dims/2];
 
-                const float x0 = inA[src];
-                const float x1 = inA[src+pcs.n_dims/2];
+            out_[dst_data] = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        }
+
+        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
+            const uint i0 = ic;
+
+            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
 
-                out_[dst_data] = x0*cos_theta - x1*sin_theta;
-                out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
-            }
+            out_[dst_data + 0] = inA[src + 0];
+            out_[dst_data + 1] = inA[src + 1];
         }
     }
 }

From 9431026a849873fc6eab8d74156e541315b84a94 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 18 Jan 2024 11:48:12 -0500
Subject: [PATCH 087/140] clean up old backend code

---
 ggml-kompute.cpp | 128 +++++++++++++++++------------------------------
 ggml-kompute.h   |  12 -----
 2 files changed, 45 insertions(+), 95 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 63e9b38784705..b31dff6ad7546 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -56,8 +56,6 @@
 
 typedef ggml_fp16_t half;
 struct ggml_kompute_context {
-    bool hasH2DAll = false;
-    std::vector<ggml_vk_memory> buffers;
     std::shared_ptr<vk::DescriptorPool> pool;
 };
 
@@ -312,10 +310,6 @@ ggml_kompute_context *ggml_vk_init() {
     return s_kompute_context;
 }
 
-bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) {
-    return ctx->hasH2DAll;
-}
-
 void ggml_vk_free(struct ggml_kompute_context * ctx) {
     assert(ctx == s_kompute_context);
     s_kompute_context = nullptr;
@@ -414,9 +408,8 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v
     return vkDeviceMemory;
 }
 
-size_t ggml_vk_aligned_offset(size_t offset) {
-
-    static size_t minStorageBufferOffsetAlignment = 0;
+static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) {
+    size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer);
     if (minStorageBufferOffsetAlignment == 0) {
         vk::PhysicalDeviceProperties deviceProperties;
         deviceProperties = komputeManager()->physicalDevice()->getProperties();
@@ -433,17 +426,7 @@ size_t ggml_vk_aligned_offset(size_t offset) {
     return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment;
 }
 
-static void ggml_vk_h2d_buffer(const ggml_vk_memory &memory) {
-    if (memory.stagingBuffer)
-        komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
-}
-
-static void ggml_vk_d2h_buffer(const ggml_vk_memory &memory) {
-    if (memory.stagingBuffer)
-        komputeManager()->sequence()->eval<kp::OpBufferSyncLocal>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
-}
-
-ggml_vk_memory ggml_vk_allocate(size_t size) {
+static ggml_vk_memory ggml_vk_allocate(size_t size) {
     ggml_vk_memory memory;
     bool isHostVisible = false;
     {
@@ -497,38 +480,26 @@ void ggml_vk_free_memory(ggml_vk_memory &memory)
 }
 
 static
-ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
+ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
     ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
 
     // compatibility with ggml-backend
-    if (buffer && buffer->buft == ggml_backend_kompute_buffer_type()) {
-        ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context;
+    GGML_ASSERT(buffer && buffer->buft == ggml_backend_kompute_buffer_type());
 
-        const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
+    ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context;
 
-        GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)t->buffer->size);
+    const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
 
-        offset = (uint64_t)ioffs;
-        return buf_ctx;
-     }
+    GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)t->buffer->size);
 
-    for (auto it = ctx->buffers.begin(); ; it++) {
-        if (it == ctx->buffers.end()) {
-            fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data);
-            return nullptr;
-        }
-        if (it->data <= t->data &&
-                reinterpret_cast<intptr_t>(it->data) + it->size >= (reinterpret_cast<intptr_t>(t->data) + ggml_nbytes(t))) {
-            offset = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(it->data);
-            return &*it;
-        }
-    }
+    offset = (uint64_t)ioffs;
+    return buf_ctx;
 }
 
 static
-const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) {
+const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
     uint64_t originalOffset = 0;
-    auto * res = ggml_vk_find_tensor(ctx, t, originalOffset);
+    auto * res = ggml_vk_find_tensor(t, originalOffset);
     if (!res) {
         static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
         return nullTensor;
@@ -538,7 +509,7 @@ const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context
     const size_t nelements = ggml_nelements(t);
     size_t nbytes = ggml_nbytes(t);
 
-    size_t vulkanOffset = ggml_vk_aligned_offset(originalOffset);
+    size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset);
     if (alignedOffset) {
         *alignedOffset = originalOffset - vulkanOffset;
         nbytes += *alignedOffset;
@@ -553,39 +524,6 @@ const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context
         vulkanOffset);
 }
 
-void ggml_vk_add_buffer(
-        struct ggml_kompute_context * ctx,
-        const char * /*name*/,
-        const ggml_vk_memory &memory) {
-    ctx->buffers.emplace_back(memory);
-}
-
-void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
-    const auto res = ggml_vk_get_tensor(ctx, t, nullptr);
-    GGML_ASSERT(res);
-    komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res});
-}
-
-void ggml_vk_h2d_all(struct ggml_kompute_context * ctx) {
-    for (auto& it : ctx->buffers) {
-        ggml_vk_h2d_buffer(it);
-    }
-    ctx->hasH2DAll = true;
-}
-
-void ggml_vk_d2h_all(struct ggml_kompute_context * ctx) {
-    for (auto& it : ctx->buffers) {
-        ggml_vk_d2h_buffer(it);
-    }
-}
-
-void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
-    const auto res = ggml_vk_get_tensor(ctx, t, nullptr);
-
-    GGML_ASSERT(res);
-    komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
-}
-
 static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
     if (size % sizeof(uint32_t) != 0) {
         throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
@@ -1506,10 +1444,10 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
             const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
             uint32_t off_src0 = 0;
             uint32_t off_src1 = 0;
-            uint32_t off_dst = 0;
-            const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0, &off_src0) : nullTensor;
-            const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1, &off_src1) : nullTensor;
-            const std::shared_ptr<kp::Tensor>& id_dst  = dst ? ggml_vk_get_tensor(ctx, dst, &off_dst)  : nullTensor;
+            uint32_t off_dst  = 0;
+            const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(dst,  &off_dst)  : nullTensor;
 
             switch (dst->op) {
                 case GGML_OP_ADD:
@@ -1757,19 +1695,33 @@ static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer)
 }
 
 static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_UNUSED(buffer);
+
+    const auto res = ggml_vk_get_tensor(tensor);
+    GGML_ASSERT(res);
+
     memcpy((char *)tensor->data + offset, data, size);
-    ggml_vk_h2d_buffer(*(ggml_vk_memory *)buffer->context);
+
+    komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res});
 }
 
 static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_vk_d2h_buffer(*(ggml_vk_memory *)buffer->context);
+    GGML_UNUSED(buffer);
+
+    const auto res = ggml_vk_get_tensor(tensor);
+    GGML_ASSERT(res);
+
+    komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
+
     memcpy(data, (const char *)tensor->data + offset, size);
 }
 
 static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     auto * memory = (ggml_vk_memory *)buffer->context;
     memset(memory->data, value, buffer->size);
-    ggml_vk_h2d_buffer(*memory);
+
+    if (memory->stagingBuffer)
+        komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory->primaryBuffer, memory->stagingBuffer, memory->size);
 }
 
 static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
@@ -1798,7 +1750,17 @@ static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_
 
 static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     GGML_UNUSED(buft);
-    return 32;
+
+    static size_t minStorageBufferOffsetAlignment = 0;
+    if (minStorageBufferOffsetAlignment == 0) {
+        GGML_ASSERT(ggml_vk_has_device());
+        vk::PhysicalDeviceProperties deviceProperties;
+        deviceProperties = komputeManager()->physicalDevice()->getProperties();
+        vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits;
+        minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment;
+    }
+
+    return minStorageBufferOffsetAlignment;
 }
 
 static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
diff --git a/ggml-kompute.h b/ggml-kompute.h
index e8d2d396b780d..288c835c55ab5 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -41,21 +41,9 @@ bool ggml_vk_has_device();
 bool ggml_vk_using_vulkan();
 ggml_vk_device ggml_vk_current_device();
 struct ggml_kompute_context * ggml_vk_init(void);
-bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx);
 void ggml_vk_free(struct ggml_kompute_context * ctx);
-size_t ggml_vk_aligned_offset(size_t offset);
-ggml_vk_memory ggml_vk_allocate(size_t size);
 void ggml_vk_free_memory(ggml_vk_memory &memory);
 
-void ggml_vk_add_buffer(
-    struct ggml_kompute_context * ctx,
-    const char * name,
-    const ggml_vk_memory &memory);
-
-void ggml_vk_h2d_all(struct ggml_kompute_context * ctx);
-void ggml_vk_d2h_all(struct ggml_kompute_context * ctx);
-void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
-void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
 
 //

From e9d5223da3522a339c93de38e26e71d9c386e3cf Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 18 Jan 2024 11:48:27 -0500
Subject: [PATCH 088/140] actually fix this assertion

---
 ggml-kompute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index b31dff6ad7546..bb437929324a4 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -490,7 +490,7 @@ ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & of
 
     const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
 
-    GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)t->buffer->size);
+    GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)buffer->size);
 
     offset = (uint64_t)ioffs;
     return buf_ctx;

From 729e1a4cc1ae1b6bb8c038b503382688d1be39fe Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 18 Jan 2024 11:56:00 -0500
Subject: [PATCH 089/140] sync op_rope_f16 with recent op_rope_f32 changes

---
 kompute-shaders/op_rope_f16.comp | 36 +++++++++++++++++++-------------
 kompute-shaders/op_rope_f32.comp |  8 +++----
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/kompute-shaders/op_rope_f16.comp b/kompute-shaders/op_rope_f16.comp
index 3abe3ed33f701..b446225849d5f 100644
--- a/kompute-shaders/op_rope_f16.comp
+++ b/kompute-shaders/op_rope_f16.comp
@@ -40,26 +40,34 @@ void main() {
         }
     } else {
         const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) {
-            for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-                const uint cur_rot = ib * pcs.n_dims + ic;
+        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
+            const uint cur_rot = ic;
 
-                float cos_theta, sin_theta;
-                rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+            float cos_theta, sin_theta;
+            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            theta *= theta_scale;
 
-                theta *= theta_scale;
+            const uint i0 = ic/2;
 
-                const uint i0 = ib*pcs.n_dims + ic/2;
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
 
-                const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-                const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+pcs.n_dims/2]);
 
-                const float x0 = float(inA[src]);
-                const float x1 = float(inA[src+pcs.n_dims/2]);
+            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
+        }
+
+        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
+            const uint i0 = ic;
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
 
-                out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
-                out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
-            }
+            out_[dst_data + 0] = inA[src + 0];
+            out_[dst_data + 1] = inA[src + 1];
         }
     }
 }
diff --git a/kompute-shaders/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp
index 2adf5eb4e3200..2c0235d75b6b6 100644
--- a/kompute-shaders/op_rope_f32.comp
+++ b/kompute-shaders/op_rope_f32.comp
@@ -29,8 +29,8 @@ void main() {
 
             theta *= theta_scale;
 
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
 
             const float x0 = inA[src];
             const float x1 = inA[src+1];
@@ -50,8 +50,8 @@ void main() {
 
             const uint i0 = ic/2;
 
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
 
             const float x0 = inA[src];
             const float x1 = inA[src+pcs.n_dims/2];

From 07530731baef16728085caab32516034d9d453f6 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 18 Jan 2024 16:11:00 -0500
Subject: [PATCH 090/140] never try to evaluate an empty command buffer

This fixes the immediate crashes with test-backend-ops - when
evaluatating individual no-ops like OP_VIEW, it tries to submit an empty
command buffer, which crashes RADV and hangs AMDVLK.
---
 ggml-kompute.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index bb437929324a4..2ba7fbde44a57 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1385,6 +1385,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
         const int node_start = (seq_idx + 0) * n_nodes_per_seq;
         const int node_end   = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes);
 
+        bool any_commands_recorded = false;
+
         for (int i = node_start; i < node_end; ++i) {
             struct ggml_tensor * src0 = gf->nodes[i]->src[0];
             struct ggml_tensor * src1 = gf->nodes[i]->src[1];
@@ -1402,6 +1404,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     break;
             }
 
+            any_commands_recorded = true;
+
             if (!ggml_kompute_supports_op(dst)) {
                  fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
                  GGML_ASSERT(!"unsupported op");
@@ -1647,7 +1651,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
         }
 
         // Evaluate sequence
-        seq.evalAsync();
+        if (any_commands_recorded) {
+            seq.evalAsync();
+        }
     }
 
     // Wait for all sequences to finish

From 2f6a279e299964bf25eb3307df1556e17c85f588 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 18 Jan 2024 15:32:55 -0500
Subject: [PATCH 091/140] fix supported ops for kompute backend

---
 ggml-kompute.cpp           | 41 ++++++++++++++++++++++++--------------
 tests/test-backend-ops.cpp |  5 ++++-
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 2ba7fbde44a57..2625c29b544a0 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1316,27 +1316,13 @@ static bool ggml_kompute_supports_op(const struct ggml_tensor * op) {
         case GGML_OP_VIEW:
         case GGML_OP_TRANSPOSE:
         case GGML_OP_PERMUTE:
-        case GGML_OP_CONCAT:
         case GGML_OP_ADD:
-        case GGML_OP_ACC:
         case GGML_OP_MUL:
-        case GGML_OP_DIV:
         case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_SUM_ROWS:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_GROUP_NORM:
         case GGML_OP_NORM:
-        case GGML_OP_ALIBI:
         case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_LEAKY_RELU:
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
             return true;
         case GGML_OP_DUP:
         case GGML_OP_CPY:
@@ -1357,8 +1343,33 @@ static bool ggml_kompute_supports_op(const struct ggml_tensor * op) {
             }
             return true;
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_GET_ROWS:
             return op->ne[3] == 1;
+        case GGML_OP_GET_ROWS:
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F16:
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q6_K:
+                    return op->ne[3] == 1;
+                default:
+                    ;
+            }
+            return false;
+        case GGML_OP_MUL_MAT:
+            if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1]))
+                return false;
+
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q6_K:
+                    return true;
+                default:
+                    ;
+            }
         default:
             ;
     }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index d9b8b106a6033..a0063bbb9cf5b 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -360,7 +360,10 @@ struct test_case {
         // check if backends support op
         bool supported = true;
         for (ggml_backend_t backend : {backend1, backend2}) {
-            if (!ggml_backend_supports_op(backend, out)) {
+            if (
+                !ggml_backend_supports_op(backend, out)
+                || (op_desc(out) == "MOE" && !strcmp(ggml_backend_name(backend), "Kompute"))
+            ) {
                 printf("not supported [%s] ", ggml_backend_name(backend));
                 supported = false;
             }

From 33e8d6abe17a4123175dd6af179d73c6ec876823 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 19 Jan 2024 00:22:13 +0200
Subject: [PATCH 092/140] kompute : fix ggml_add kernel (#5027)

---
 ggml-kompute.cpp            |  2 +-
 kompute-shaders/op_add.comp | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 2625c29b544a0..c650f84a5d4e3 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1467,7 +1467,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
             switch (dst->op) {
                 case GGML_OP_ADD:
                     {
-                        if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
+                        if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
                             // src1 is a row
                             ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
                         } else {
diff --git a/kompute-shaders/op_add.comp b/kompute-shaders/op_add.comp
index c866734523e74..b7b76a79dbdbe 100644
--- a/kompute-shaders/op_add.comp
+++ b/kompute-shaders/op_add.comp
@@ -30,6 +30,7 @@ layout(push_constant) uniform PushConstants {
     int nb1;
     int nb2;
     int nb3;
+  //int offs; // TODO: needed for GGML_OP_ACC, see metal code
 } pcs;
 
 // general-purpose kernel for addition of two tensors
@@ -44,15 +45,14 @@ void main() {
     const uint i12 = i02 % pcs.ne12;
     const uint i11 = i01 % pcs.ne11;
 
-    uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + gl_SubgroupInvocationID.x*pcs.nb00) / 4);
-    uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 + gl_SubgroupInvocationID.x*pcs.nb10) / 4);
-    uint dst_off  = uint((i03*pcs.nb3  + i02*pcs.nb2  + i01*pcs.nb1  + gl_SubgroupInvocationID.x*pcs.nb0 ) / 4);
+    int offs = 0; // TMP (see above)
 
-    for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
-        out_[pcs.outOff + dst_off] = inA[pcs.inAOff + src0_off] + inB[pcs.inBOff + src1_off];
+    uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
+    uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11       ) / 4);
+    uint dst_off  = uint((i03*pcs.nb3  + i02*pcs.nb2  + i01*pcs.nb1  + offs) / 4);
 
-        src0_off += gl_WorkGroupSize.x*pcs.ne00;
-        src1_off += gl_WorkGroupSize.x*pcs.ne10;
-        dst_off  += gl_WorkGroupSize.x*pcs.ne0;
+    for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
+        const uint i10 = i0 % pcs.ne10;
+        out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
     }
 }

From cb9ceff966ba673976f6e0fbbcc47ebe59f99748 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 19 Jan 2024 15:05:01 -0500
Subject: [PATCH 093/140] minor cleanup

---
 ggml-kompute.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index c650f84a5d4e3..58c76347ea843 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1289,7 +1289,7 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
     ggml_vk_cpy<2, 4>(spirv, std::forward<Args>(args)...);
 }
 
-static bool ggml_kompute_supports_op(const struct ggml_tensor * op) {
+static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
     switch (op->type) {
         case GGML_TYPE_F16:
         case GGML_TYPE_F32:
@@ -1417,10 +1417,10 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
 
             any_commands_recorded = true;
 
-            if (!ggml_kompute_supports_op(dst)) {
+            if (!ggml_vk_supports_op(dst)) {
                  fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
                  GGML_ASSERT(!"unsupported op");
-             }
+            }
 
             const int32_t ne00 = src0 ? src0->ne[0] : 0;
             const int32_t ne01 = src0 ? src0->ne[1] : 0;
@@ -1828,7 +1828,7 @@ static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct gg
 
 static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     GGML_UNUSED(backend);
-    return ggml_kompute_supports_op(op);
+    return ggml_vk_supports_op(op);
 }
 
 static struct ggml_backend_i kompute_backend_i = {

From 0899adf86ed9765cc4cc349fb0a980e1bf77dd63 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 22 Jan 2024 14:16:10 -0500
Subject: [PATCH 094/140] kompute : fix get_rows dispatch -> 4 less failures

---
 ggml-kompute.cpp | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 58c76347ea843..86bd0d78bda69 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1089,15 +1089,18 @@ static void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
-                      unsigned element_size, unsigned qk,
-                      kp::Sequence& seq,
-                      const std::shared_ptr<kp::Tensor>& inA,
-                      const std::shared_ptr<kp::Tensor>& inB,
-                      const std::shared_ptr<kp::Tensor>& out,
-                      uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                      int32_t ne00, int32_t nb01, int32_t nb1,
-                      uint32_t size) {
+static void ggml_vk_get_rows(
+    const std::vector<uint32_t>& spirv,
+    const char * suffix,
+    unsigned element_size, unsigned qk,
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t nb01, int32_t nb1,
+    uint32_t size
+) {
     GGML_ASSERT(nb01%element_size == 0);
     GGML_ASSERT(nb1%sizeof(float) == 0);
     if (qk) GGML_ASSERT(ne00%qk == 0);
@@ -1110,11 +1113,12 @@ static void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
         ne00, nb01, nb1
     };
 
+    auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
+    if (!komputeManager()->hasAlgorithm(name)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
-    else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+    } else {
+        s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -1128,7 +1132,7 @@ static void ggml_vk_get_rows_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
         kp::shader_data::op_getrows_f16_comp_spv_len);
 
-    ggml_vk_get_rows(spirv, sizeof(half), 0, std::forward<Args>(args)...);
+    ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
@@ -1136,7 +1140,7 @@ static void ggml_vk_get_rows_q4_0(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
         kp::shader_data::op_getrows_q4_0_comp_spv_len);
 
-    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
+    ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
@@ -1144,14 +1148,14 @@ static void ggml_vk_get_rows_q4_1(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
         kp::shader_data::op_getrows_q4_1_comp_spv_len);
 
-    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
+    ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
 static void ggml_vk_get_rows_q6_k(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
         kp::shader_data::op_getrows_q6_k_comp_spv_len);
-    ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
+    ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
 }
 
 static void ggml_vk_rope(

From 08e23fd78ca2afdbd0388f66e808851324634428 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 22 Jan 2024 16:08:16 -0500
Subject: [PATCH 095/140] kompute : fix op_mul kernel -> 13 less test failures

---
 ggml-kompute.cpp            | 53 +++++++++++++++++++++++++++----------
 kompute-shaders/op_mul.comp | 40 +++++++++++++++++++++++-----
 2 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 86bd0d78bda69..76280501fc3c0 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -559,7 +559,6 @@ static void ggml_vk_add(
     int32_t ne0,
     int32_t nb0,  int32_t nb1,  int32_t nb2,  int32_t nb3
 ) {
-
     const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
         kp::shader_data::op_add_comp_spv_len);
 
@@ -625,29 +624,47 @@ static void ggml_vk_addrow(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_mul(kp::Sequence& seq,
-                    const std::shared_ptr<kp::Tensor>& inA,
-                    const std::shared_ptr<kp::Tensor>& inB,
-                    const std::shared_ptr<kp::Tensor>& out,
-                    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                    uint32_t size) {
-
+static void ggml_vk_mul(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
+    int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
+    int32_t ne0,
+    int32_t nb0,  int32_t nb1,  int32_t nb2,  int32_t nb3
+) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv,
         kp::shader_data::op_mul_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
+        int32_t ne00;
+        int32_t nb00, nb01, nb02, nb03;
+        int32_t ne10, ne11, ne12, ne13;
+        int32_t nb10, nb11, nb12, nb13;
+        int32_t ne0;
+        int32_t nb0, nb1, nb2, nb3;
     } const pushConsts {
-        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4)
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00,
+        nb00, nb01, nb02, nb03,
+        ne10, ne11, ne12, ne13,
+        nb10, nb11, nb12, nb13,
+        ne0,
+        nb0, nb1, nb2, nb3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({size});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1492,7 +1509,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             // src1 is a row
                             ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
                         } else {
-                            ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4);
+                            ggml_vk_mul(
+                                seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                ne00, ne01, ne02, ne03,
+                                nb00, nb01, nb02, nb03,
+                                ne10, ne11, ne12, ne13,
+                                nb10, nb11, nb12, nb13,
+                                ne0,
+                                nb0, nb1, nb2, nb3
+                            );
                         }
                     } break;
                 case GGML_OP_SCALE:
diff --git a/kompute-shaders/op_mul.comp b/kompute-shaders/op_mul.comp
index d599460c3e961..c92647c4db1c8 100644
--- a/kompute-shaders/op_mul.comp
+++ b/kompute-shaders/op_mul.comp
@@ -2,7 +2,7 @@
 
 #include "common.comp"
 
-layout(local_size_x = 1) in;
+layout(local_size_x = 1024) in;
 
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
@@ -12,13 +12,41 @@ layout(push_constant) uniform PushConstants {
     uint inAOff;
     uint inBOff;
     uint outOff;
+    int ne00;
+    int nb00;
+    int nb01;
+    int nb02;
+    int nb03;
+    int ne10;
+    int ne11;
+    int ne12;
+    int ne13;
+    int nb10;
+    int nb11;
+    int nb12;
+    int nb13;
+    int ne0;
+    int nb0;
+    int nb1;
+    int nb2;
+    int nb3;
 } pcs;
 
 void main() {
-    const uint baseIndex = gl_WorkGroupID.x * 4;
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
 
-    for (uint x = 0; x < 4; x++) {
-        const uint i = baseIndex + x;
-        out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i) + pcs.inBOff];
+    const uint i13 = i03 % pcs.ne13;
+    const uint i12 = i02 % pcs.ne12;
+    const uint i11 = i01 % pcs.ne11;
+
+    uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
+    uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
+    uint dst_off  = uint((i03*pcs.nb3  + i02*pcs.nb2  + i01*pcs.nb1)  / 4);
+
+    for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
+        const uint i10 = i0 % pcs.ne10;
+        out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
     }
-}
\ No newline at end of file
+}

From 2755ae3d10f7a64f963085ba99d26ccfcc37bc1b Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 22 Jan 2024 17:04:10 -0500
Subject: [PATCH 096/140] kompute : fix more dispatch ambiguity -> 12 less
 failures

---
 ggml-kompute.cpp | 115 ++++++++++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 52 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 76280501fc3c0..163b0a29a89c6 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -743,22 +743,25 @@ static void ggml_vk_scale(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_xxlu(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
-                  const std::shared_ptr<kp::Tensor>& in,
-                  const std::shared_ptr<kp::Tensor>& out,
-                  uint32_t inOff, uint32_t outOff,
-                  uint32_t size) {
+static void ggml_vk_xxlu(
+    const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& in,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inOff, uint32_t outOff,
+    uint32_t size
+) {
     struct PushConstants {
         uint32_t inOff, outOff;
     } const pushConsts {
         safe_divide(inOff, 4), safe_divide(outOff, 4),
     };
 
+    auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
+    if (!komputeManager()->hasAlgorithm(name)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
-    else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+    } else {
+        s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -772,7 +775,7 @@ static void ggml_vk_silu(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
         kp::shader_data::op_silu_comp_spv_len);
 
-    ggml_vk_xxlu(spirv, std::forward<Args>(args)...);
+    ggml_vk_xxlu(spirv, "silu", std::forward<Args>(args)...);
 }
 
 template <typename... Args>
@@ -780,7 +783,7 @@ static void ggml_vk_relu(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
         kp::shader_data::op_relu_comp_spv_len);
 
-    ggml_vk_xxlu(spirv, std::forward<Args>(args)...);
+    ggml_vk_xxlu(spirv, "relu", std::forward<Args>(args)...);
 }
 
 template <typename... Args>
@@ -788,7 +791,7 @@ static void ggml_vk_gelu(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
         kp::shader_data::op_gelu_comp_spv_len);
 
-    ggml_vk_xxlu(spirv, std::forward<Args>(args)...);
+    ggml_vk_xxlu(spirv, "gelu", std::forward<Args>(args)...);
 }
 
 static void ggml_vk_soft_max(kp::Sequence& seq,
@@ -823,12 +826,14 @@ static void ggml_vk_soft_max(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
-                   const std::shared_ptr<kp::Tensor>& in,
-                   const std::shared_ptr<kp::Tensor>& out,
-                   uint32_t inOff, uint32_t outOff,
-                   int32_t ne00, int32_t nb01,
-                   int32_t nrows, float epsilon) {
+static void ggml_vk_norm_(
+    const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& in,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inOff, uint32_t outOff,
+    int32_t ne00, int32_t nb01,
+    int32_t nrows, float epsilon
+) {
     GGML_ASSERT(nb01%sizeof(float) == 0);
     GGML_ASSERT(ne00%sizeof(float) == 0);
 
@@ -841,11 +846,12 @@ static void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
         (uint32_t)ne00, (uint32_t)nb01, epsilon
     };
 
+    auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
+    if (!komputeManager()->hasAlgorithm(name)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({(uint32_t)nrows});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -859,7 +865,7 @@ static void ggml_vk_norm(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
         kp::shader_data::op_norm_comp_spv_len);
 
-    ggml_vk_norm_(spirv, std::forward<Args>(args)...);
+    ggml_vk_norm_(spirv, "norm", std::forward<Args>(args)...);
 }
 
 template <typename... Args>
@@ -867,7 +873,7 @@ static void ggml_vk_rms_norm(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
         kp::shader_data::op_rmsnorm_comp_spv_len);
 
-    ggml_vk_norm_(spirv, std::forward<Args>(args)...);
+    ggml_vk_norm_(spirv, "rms", std::forward<Args>(args)...);
 }
 
 static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
@@ -1029,13 +1035,15 @@ static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
-                          const std::shared_ptr<kp::Tensor>& inA,
-                          const std::shared_ptr<kp::Tensor>& inB,
-                          const std::shared_ptr<kp::Tensor>& out,
-                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
-                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
+static void ggml_vk_mul_mat_q4_x(
+    const std::vector<uint32_t>& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
+) {
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne10, ne0, ne1, ne01, gqa;
@@ -1044,12 +1052,13 @@ static void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t bl
         ne00, ne10, ne0, ne1, ne01, ne12/ne02
     };
 
+    auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
+    if (!komputeManager()->hasAlgorithm(name)) {
         const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
         s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -1063,7 +1072,7 @@ static void ggml_vk_mul_mat_q4_0(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
         kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
 
-    ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+    ggml_vk_mul_mat_q4_x(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
@@ -1071,7 +1080,7 @@ static void ggml_vk_mul_mat_q4_1(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
         kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
 
-    ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+    ggml_vk_mul_mat_q4_x(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
 static void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
@@ -1242,16 +1251,18 @@ static void ggml_vk_rope(
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-template<uint32_t in_element_size, uint32_t out_element_size>
-static void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
-                 kp::Sequence& seq,
-                 const std::shared_ptr<kp::Tensor>& in,
-                 const std::shared_ptr<kp::Tensor>& out,
-                 uint32_t inOff, uint32_t outOff,
-                 int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
-                 uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
-                 int32_t ne0, int32_t ne1, int32_t ne2,
-                 uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) {
+static void ggml_vk_cpy(
+    const std::vector<uint32_t>& spirv,
+    uint32_t in_element_size, uint32_t out_element_size,
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& in,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inOff, uint32_t outOff,
+    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
+    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    int32_t ne0, int32_t ne1, int32_t ne2,
+    uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
+) {
     struct PushConstants {
         uint32_t inOff, outOff;
         int32_t ne00, ne01, ne02;
@@ -1266,14 +1277,14 @@ static void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
         nb0, nb1, nb2, nb3
     };
 
-    static std::string unique_name = std::string(__func__) +
-                                     "_i_" + std::to_string(in_element_size) +
-                                     "_o_" + std::to_string(out_element_size);
+    std::string name = std::string(__func__)
+                       + "_i_" + std::to_string(in_element_size)
+                       + "_o_" + std::to_string(out_element_size);
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(unique_name))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(unique_name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    if (!komputeManager()->hasAlgorithm(name))
+        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     else {
-        s_algo = komputeManager()->getAlgorithm(unique_name);
+        s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
@@ -1286,28 +1297,28 @@ template <typename... Args>
 static void ggml_vk_cpy_f32_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
         kp::shader_data::op_cpy_f32_f16_comp_spv_len);
-    ggml_vk_cpy<4, 2>(spirv, std::forward<Args>(args)...);
+    ggml_vk_cpy(spirv, 4, 2, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
 static void ggml_vk_cpy_f32_f32(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
         kp::shader_data::op_cpy_f32_f32_comp_spv_len);
-    ggml_vk_cpy<4, 4>(spirv, std::forward<Args>(args)...);
+    ggml_vk_cpy(spirv, 4, 4, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
 static void ggml_vk_cpy_f16_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
         kp::shader_data::op_cpy_f16_f16_comp_spv_len);
-    ggml_vk_cpy<2, 2>(spirv, std::forward<Args>(args)...);
+    ggml_vk_cpy(spirv, 2, 2, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
 static void ggml_vk_cpy_f16_f32(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
         kp::shader_data::op_cpy_f16_f32_comp_spv_len);
-    ggml_vk_cpy<2, 4>(spirv, std::forward<Args>(args)...);
+    ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
 }
 
 static bool ggml_vk_supports_op(const struct ggml_tensor * op) {

From 0787b80db8b78398061b7256e9cbf045cbcccb72 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 22 Jan 2024 17:42:05 -0500
Subject: [PATCH 097/140] kompute : remove broken mulrow kernel -> 1 less test
 failure

---
 CMakeLists.txt                 |  2 --
 ggml-kompute.cpp               | 55 ++++++----------------------------
 kompute-shaders/op_mulrow.comp | 25 ----------------
 3 files changed, 9 insertions(+), 73 deletions(-)
 delete mode 100644 kompute-shaders/op_mulrow.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 01f01bfee2f2c..c3533b9692607 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -520,7 +520,6 @@ if (LLAMA_KOMPUTE)
           kompute-shaders/op_add.comp
           kompute-shaders/op_addrow.comp
           kompute-shaders/op_mul.comp
-          kompute-shaders/op_mulrow.comp
           kompute-shaders/op_silu.comp
           kompute-shaders/op_relu.comp
           kompute-shaders/op_gelu.comp
@@ -553,7 +552,6 @@ if (LLAMA_KOMPUTE)
           shaderop_add.h
           shaderop_addrow.h
           shaderop_mul.h
-          shaderop_mulrow.h
           shaderop_silu.h
           shaderop_relu.h
           shaderop_gelu.h
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 163b0a29a89c6..45a579b3bfdee 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -9,7 +9,6 @@
 #include "shaderop_add.h"
 #include "shaderop_addrow.h"
 #include "shaderop_mul.h"
-#include "shaderop_mulrow.h"
 #include "shaderop_silu.h"
 #include "shaderop_relu.h"
 #include "shaderop_gelu.h"
@@ -671,37 +670,6 @@ static void ggml_vk_mul(
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_mulrow(kp::Sequence& seq,
-                 const std::shared_ptr<kp::Tensor>& inA,
-                 const std::shared_ptr<kp::Tensor>& inB,
-                 const std::shared_ptr<kp::Tensor>& out,
-                 uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                 uint32_t size, uint32_t row = 0) {
-
-    const static auto spirv = getSpirvShader(kp::shader_data::op_mulrow_comp_spv,
-        kp::shader_data::op_mulrow_comp_spv_len);
-
-    struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
-        uint32_t row;
-    } const pushConsts {
-        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        row
-    };
-
-    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
-    else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
-        s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({size});
-        s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
-    }
-    seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
 static void ggml_vk_scale(kp::Sequence& seq,
                    const std::shared_ptr<kp::Tensor>& in,
                    const std::shared_ptr<kp::Tensor>& out,
@@ -1516,20 +1484,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_MUL:
                     {
-                        if (ggml_nelements(src1) == ne10) {
-                            // src1 is a row
-                            ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
-                        } else {
-                            ggml_vk_mul(
-                                seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                ne00, ne01, ne02, ne03,
-                                nb00, nb01, nb02, nb03,
-                                ne10, ne11, ne12, ne13,
-                                nb10, nb11, nb12, nb13,
-                                ne0,
-                                nb0, nb1, nb2, nb3
-                            );
-                        }
+                        ggml_vk_mul(
+                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                            ne00, ne01, ne02, ne03,
+                            nb00, nb01, nb02, nb03,
+                            ne10, ne11, ne12, ne13,
+                            nb10, nb11, nb12, nb13,
+                            ne0,
+                            nb0, nb1, nb2, nb3
+                        );
                     } break;
                 case GGML_OP_SCALE:
                     {
diff --git a/kompute-shaders/op_mulrow.comp b/kompute-shaders/op_mulrow.comp
deleted file mode 100644
index ae71063208c2f..0000000000000
--- a/kompute-shaders/op_mulrow.comp
+++ /dev/null
@@ -1,25 +0,0 @@
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    uint row;
-} pcs;
-
-void main() {
-    const uint baseIndex = gl_WorkGroupID.x * 4;
-
-    for (uint x = 0; x < 4; x++) {
-        const uint i = baseIndex + x;
-        out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i % pcs.row) + pcs.inBOff];
-    }
-}
\ No newline at end of file

From 1a14099c43a7a40fcadd29d8a65462d162745f0c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 11:56:43 -0500
Subject: [PATCH 098/140] fix q4_0/q4_1 mmv, 65 -> 49 failures

---
 CMakeLists.txt                         |  1 +
 ggml-kompute.cpp                       | 88 +++++++++++++++++---------
 kompute-shaders/op_mul_mat_q4_0.comp   | 20 +-----
 kompute-shaders/op_mul_mat_q4_1.comp   | 20 +-----
 kompute-shaders/op_mul_mv_q_n.comp     |  9 ++-
 kompute-shaders/op_mul_mv_q_n_pre.comp | 22 +++++++
 6 files changed, 92 insertions(+), 68 deletions(-)
 create mode 100644 kompute-shaders/op_mul_mv_q_n_pre.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3533b9692607..0b9b4f023d176 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -464,6 +464,7 @@ if (LLAMA_KOMPUTE)
             DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
               ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
               ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
               ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
               COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
             COMMENT "Compiling ${source} to ${spv_file}"
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 45a579b3bfdee..8a9e415e1ed6a 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1003,32 +1003,40 @@ static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_mul_mat_q4_x(
+static void ggml_vk_mul_mat_impl(
     const std::vector<uint32_t>& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
-    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t r2, uint32_t r3
 ) {
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+        int32_t ne00, ne01, ne02;
+        int32_t ne10, ne12;
+        int32_t ne0, ne1;
+        uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+        ne00, ne01, ne02,
+        ne10, ne12,
+        ne0, ne1,
+        r2, r3
     };
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
         const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
+        s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1040,7 +1048,7 @@ static void ggml_vk_mul_mat_q4_0(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
         kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
 
-    ggml_vk_mul_mat_q4_x(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+    ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
@@ -1048,16 +1056,18 @@ static void ggml_vk_mul_mat_q4_1(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
         kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
 
-    ggml_vk_mul_mat_q4_x(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+    ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
-static void ggml_vk_mul_mat_q6_k(kp::Sequence& seq,
-                          const std::shared_ptr<kp::Tensor>& inA,
-                          const std::shared_ptr<kp::Tensor>& inB,
-                          const std::shared_ptr<kp::Tensor>& out,
-                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
-                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
+static void ggml_vk_mul_mat_q6_k(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
+) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
         kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
 
@@ -1550,6 +1560,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_MUL_MAT:
                     {
+                        GGML_ASSERT(ne00 == ne10);
+
+                        // TODO: assert that dim2 and dim3 are contiguous
+                        GGML_ASSERT(ne12 % ne02 == 0);
+                        GGML_ASSERT(ne13 % ne03 == 0);
+
+                        const uint32_t r2 = ne12/ne02;
+                        const uint32_t r3 = ne13/ne03;
+
                         if (src1t != GGML_TYPE_F32) {
                             fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                             goto not_implemented;
@@ -1563,29 +1582,40 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
 
                         switch (src0t) {
                             case GGML_TYPE_F32:
-                                ggml_vk_mul_mat_mat_f32(seq,
-                                        id_src0, id_src1, id_dst,
-                                        off_src0, off_src1, off_dst,
-                                        ne00, ne01, ne02,
-                                        nb01, nb02,
-                                        ne11, ne12,
-                                        nb11, nb12,
-                                        nb1, nb2);
+                                ggml_vk_mul_mat_mat_f32(
+                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2
+                                );
                                 break;
                             case GGML_TYPE_F16:
-                                ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                                ggml_vk_mul_mat_f16(
+                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1
+                                );
                                 break;
                             case GGML_TYPE_Q8_0:
-                                ggml_vk_mul_mat_q8_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                                ggml_vk_mul_mat_q8_0(
+                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1
+                                );
                                 break;
                             case GGML_TYPE_Q4_0:
-                                ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                ggml_vk_mul_mat_q4_0(
+                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                );
                                 break;
                             case GGML_TYPE_Q4_1:
-                                ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                ggml_vk_mul_mat_q4_1(
+                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                );
                                 break;
                             case GGML_TYPE_Q6_K:
-                                ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                ggml_vk_mul_mat_q6_k(
+                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02
+                                );
                                 break;
                             default: {
                                 fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
diff --git a/kompute-shaders/op_mul_mat_q4_0.comp b/kompute-shaders/op_mul_mat_q4_0.comp
index 03788c92090b6..b0cea8bbe67b9 100644
--- a/kompute-shaders/op_mul_mat_q4_0.comp
+++ b/kompute-shaders/op_mul_mat_q4_0.comp
@@ -6,25 +6,7 @@
 #define SIZE_OF_BLOCK sizeof_block_q4_0
 #define N_ROWS 4
 
-layout(local_size_x_id = 0) in;
-layout(local_size_y = 1) in;
-layout(local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    int ne00;
-    int ne10;
-    int ne0;
-    int ne1;
-    int ne01;
-    int gqa;
-} pcs;
+#include "op_mul_mv_q_n_pre.comp"
 
 // The q4_0 version of this function
 float block_q_n_dot_y(uint block_index, uint yb, uint il) {
diff --git a/kompute-shaders/op_mul_mat_q4_1.comp b/kompute-shaders/op_mul_mat_q4_1.comp
index 0ae8f8c7d5d67..8582c61a3beb9 100644
--- a/kompute-shaders/op_mul_mat_q4_1.comp
+++ b/kompute-shaders/op_mul_mat_q4_1.comp
@@ -6,25 +6,7 @@
 #define SIZE_OF_BLOCK sizeof_block_q4_1
 #define N_ROWS 4
 
-layout(local_size_x_id = 0) in;
-layout(local_size_y = 1) in;
-layout(local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    int ne00;
-    int ne10;
-    int ne0;
-    int ne1;
-    int ne01;
-    int gqa;
-} pcs;
+#include "op_mul_mv_q_n_pre.comp"
 
 // The q4_1 version of this function
 float block_q_n_dot_y(uint block_index, uint yb, uint il) {
diff --git a/kompute-shaders/op_mul_mv_q_n.comp b/kompute-shaders/op_mul_mv_q_n.comp
index 8b6e6a2e2a6f2..440b5ab2c81f8 100644
--- a/kompute-shaders/op_mul_mv_q_n.comp
+++ b/kompute-shaders/op_mul_mv_q_n.comp
@@ -1,13 +1,20 @@
 void main() {
+    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
     if (gl_SubgroupInvocationID > 31)
         return;
 
     const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
+
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
     const uint im = gl_WorkGroupID.z;
+
     const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
-    const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0);
+
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
+
+    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
 
     const uint x = offset0; // Based from inA without base offset
     const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
diff --git a/kompute-shaders/op_mul_mv_q_n_pre.comp b/kompute-shaders/op_mul_mv_q_n_pre.comp
new file mode 100644
index 0000000000000..7912b09ac69c4
--- /dev/null
+++ b/kompute-shaders/op_mul_mv_q_n_pre.comp
@@ -0,0 +1,22 @@
+layout(local_size_x_id = 0) in;
+layout(local_size_y = 1) in;
+layout(local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int  ne00;
+    int  ne01;
+    int  ne02;
+    int  ne10;
+    int  ne12;
+    int  ne0;
+    int  ne1;
+    uint r2;
+    uint r3;
+} pcs;

From 2b0f642fec920e4801c43a885d44cb412dbd0387 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 12:47:41 -0500
Subject: [PATCH 099/140] fix f16 mmv, 49 -> 41 failures

---
 ggml-kompute.cpp                    | 48 ++++++++++++++++++-----------
 kompute-shaders/op_mul_mat_f16.comp | 47 +++++++++++++++++++---------
 2 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 8a9e415e1ed6a..b8aae17fda862 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -876,39 +876,50 @@ static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_mul_mat_f16(kp::Sequence& seq,
-                         const std::shared_ptr<kp::Tensor>& inA,
-                         const std::shared_ptr<kp::Tensor>& inB,
-                         const std::shared_ptr<kp::Tensor>& out,
-                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                         int32_t ne00, int32_t ne01, int32_t ne02,
-                         uint32_t nb01, uint32_t nb02,
-                         int32_t ne11, int32_t ne12,
-                         uint32_t nb11, uint32_t nb12,
-                         int32_t ne0, int32_t ne1) {
+static void ggml_vk_mul_mat_f16(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    uint32_t nb00, uint32_t nb01, uint32_t nb02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    uint32_t nb10, uint32_t nb11, uint32_t nb12,
+    int32_t ne0, int32_t ne1,
+    uint32_t r2, uint32_t r3
+) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv,
         kp::shader_data::op_mul_mat_f16_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00;
-        uint32_t nb01, nb02;
-        uint32_t nb11, nb12;
-        int32_t ne02, ne12;
+        int32_t ne00, ne01, ne02;
+        uint32_t nb00, nb01, nb02;
+        int32_t ne10, ne11, ne12;
+        uint32_t nb10, nb11, nb12;
         int32_t ne0, ne1;
+        uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, nb01, nb02, nb11, nb12, ne02, ne12, ne0, ne1,
+        ne00, ne01, ne02,
+        nb00, nb01, nb02,
+        ne10, ne11, ne12,
+        nb10, nb11, nb12,
+        ne0, ne1,
+        r2, r3
     };
 
+    const unsigned ny = unsigned((ne11 + 4 - 1)/4);
+
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
         const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))}, {local_x}, {pushConsts});
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))});
+        s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1590,7 +1601,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             case GGML_TYPE_F16:
                                 ggml_vk_mul_mat_f16(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1
+                                    ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12,
+                                    ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q8_0:
diff --git a/kompute-shaders/op_mul_mat_f16.comp b/kompute-shaders/op_mul_mat_f16.comp
index dd1e139794d53..8f0a9031f7a37 100644
--- a/kompute-shaders/op_mul_mat_f16.comp
+++ b/kompute-shaders/op_mul_mat_f16.comp
@@ -15,34 +15,53 @@ layout (push_constant) uniform parameter {
     uint inBOff;
     uint outOff;
     int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
     uint nb01;
     uint nb02;
+    int ne10;
+    int ne11;
+    int ne12;
+    uint nb10;
     uint nb11;
     uint nb12;
-    uint ne02;
-    uint ne12;
     int ne0;
     int ne1;
+    uint r2;
+    uint r3;
 } pcs;
 
+#define N_F16_F32 4
+
 void main() {
     const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
+    const uint rb = gl_WorkGroupID.y*N_F16_F32;
     const uint im = gl_WorkGroupID.z;
 
-    uint bc_ab = pcs.ne12 > pcs.ne02 ? im / (pcs.ne12 / pcs.ne02) : im;
-    uint bc_ba = pcs.ne02 > pcs.ne12 ? im / (pcs.ne02 / pcs.ne12) : im;
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
 
-    const uint x = (r0*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
-    const uint y = (r1*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
 
-    float sumf = 0.0f;
-    for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
-        sumf += float(inA[x+i]) * float(inB[y+i]);
-    }
+    const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
+
+    for (uint row = 0; row < N_F16_F32; ++row) {
+        uint r1 = rb + row;
+        if (r1 >= pcs.ne11) {
+            break;
+        }
+
+        const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
+
+        float sumf = 0;
+        for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
+            sumf += float(inA[x+i]) * float(inB[y+i]);
+        }
 
-    const float all_sum = subgroupAdd(sumf);
-    if (subgroupElect()) {
-        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
+        const float all_sum = subgroupAdd(sumf);
+        if (subgroupElect()) {
+            out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
+        }
     }
 }

From 2852902eda37e8490a07fcd0ab0d803e59260a52 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 14:55:41 -0500
Subject: [PATCH 100/140] test-backend-ops : add llama test

---
 tests/test-backend-ops.cpp | 212 +++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index a0063bbb9cf5b..b776e493a34a3 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1464,6 +1464,216 @@ struct test_moe : public test_case {
     }
 };
 
+
+// llama
+struct test_llama : public test_case {
+    const int n_tokens;
+    static constexpr float f_norm_rms_eps = 1e-5;
+    static constexpr int64_t n_embd_k_gqa = 3200;
+    static constexpr int64_t n_embd_v_gqa = 3200;
+    static constexpr int64_t n_ctx = 512;
+    static constexpr int64_t n_layer = 1;
+    static constexpr int64_t n_head = 32;
+    static constexpr int64_t n_head_kv = 32;
+    static constexpr int64_t n_embd_head = 100;
+    static constexpr int64_t n_embd = 3200;
+    static constexpr int64_t n_orig_ctx = n_ctx;
+    static constexpr int64_t n_ff = 8640;
+    static constexpr int64_t n_kv = 32;
+    static constexpr int64_t kv_head = 1;
+    static constexpr float freq_base = 10000.0f;
+    static constexpr float freq_scale = 1.0f;
+    static constexpr float ext_factor = 0.0f;
+    static constexpr float attn_factor = 1.0f;
+    static constexpr float beta_fast = 32.0f;
+    static constexpr float beta_slow = 1.0f;
+
+    std::string op_desc(ggml_tensor * t) override {
+        return "LLAMA";
+
+        GGML_UNUSED(t);
+    }
+
+    std::string vars() override {
+        return VARS_TO_STR1(n_tokens);
+    }
+
+    test_llama(int n_tokens = 1)
+        : n_tokens(n_tokens) {
+    }
+
+    struct ggml_tensor * llm_build_norm(
+            struct ggml_context * ctx,
+             struct ggml_tensor * cur,
+             struct ggml_tensor * mw) {
+        cur = ggml_rms_norm(ctx, cur, f_norm_rms_eps);
+        cur = ggml_mul(ctx, cur, mw);
+        return cur;
+    }
+
+    void llm_build_kv_store(
+            struct ggml_context * ctx,
+             struct ggml_tensor * k_l,
+             struct ggml_tensor * v_l,
+             struct ggml_tensor * k_cur,
+             struct ggml_tensor * v_cur) {
+        // compute the transposed [n_tokens, n_embd] V matrix
+        struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
+
+        struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, n_tokens*n_embd_k_gqa,
+                (ggml_row_size(k_l->type, n_embd_k_gqa))*kv_head);
+
+        struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, n_tokens, n_embd_v_gqa,
+                (  n_ctx)*ggml_element_size(v_l),
+                (kv_head)*ggml_element_size(v_l));
+
+        // important: storing RoPE-ed version of K in the KV cache!
+        ggml_cpy(ctx, k_cur,   k_cache_view);
+        ggml_cpy(ctx, v_cur_t, v_cache_view);
+    }
+
+    // if max_alibi_bias > 0 then apply ALiBi
+    struct ggml_tensor * llm_build_kqv(
+            struct ggml_context * ctx,
+             struct ggml_tensor * k_l,
+             struct ggml_tensor * v_l,
+             struct ggml_tensor * q_cur,
+             struct ggml_tensor * kq_mask,
+                        float     kq_scale) {
+        struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
+
+        struct ggml_tensor * k =
+            ggml_view_3d(ctx, k_l,
+                    n_embd_head, n_kv, n_head_kv,
+                    ggml_row_size(k_l->type, n_embd_k_gqa),
+                    ggml_row_size(k_l->type, n_embd_head),
+                    0);
+
+        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
+
+        // split cached v into n_head heads
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, v_l,
+                    n_kv, n_embd_head, n_head_kv,
+                    ggml_element_size(v_l)*n_ctx,
+                    ggml_element_size(v_l)*n_ctx*n_embd_head,
+                    0);
+
+        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+
+        struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+
+        struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head*n_head, n_tokens);
+
+        struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 3200);
+        cur = ggml_mul_mat(ctx, wo, cur);
+
+        return cur;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const int64_t n_rot = n_embd_head;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_kv, n_tokens, 1);
+
+        ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
+        ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3200);
+            cur = llm_build_norm(ctx, inpL, attn_norm);
+
+            // self-attention
+            {
+                ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 3200);
+                ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 3200);
+                ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 3200);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
+
+                Qcur = ggml_rope_custom(
+                    ctx, ggml_reshape_3d(ctx, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
+                    n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_custom(
+                    ctx, ggml_reshape_3d(ctx, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
+
+                cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(n_embd_head)));
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
+
+            // feed-forward network
+            ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3200);
+            cur = llm_build_norm(ctx, ffn_inp, ffn_norm);
+
+            ggml_tensor * ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 8640);
+            ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 8640);
+            ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 8640, 3200);
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
+            cur = ggml_mul_mat(ctx, ffn_gate, cur);
+            cur = ggml_silu(ctx, cur);
+            cur = ggml_mul(ctx, cur, tmp);
+            cur = ggml_mul_mat(ctx, ffn_down, cur);
+
+            cur = ggml_add(ctx, cur, ffn_inp);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3200);
+        cur = llm_build_norm(ctx, cur, output_norm);
+
+        // lm_head
+        ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 32000);
+        cur = ggml_mul_mat(ctx, output, cur);
+
+        return cur;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I32) {
+                // pos
+                std::vector<int> data(n_tokens);
+                for (int i = 0; i < n_tokens; i++) {
+                    data[i] = rand() % n_ctx;
+                }
+                ggml_backend_tensor_set(t, data.data(), 0, n_tokens * sizeof(int));
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+};
+
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     std::vector<std::unique_ptr<test_case>> test_cases;
     std::default_random_engine rng(0);
@@ -1651,6 +1861,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
 #endif
 
+    test_cases.emplace_back(new test_llama());
+
     // run tests
     if (mode == MODE_TEST) {
         ggml_backend_t backend_cpu = ggml_backend_cpu_init();

From 145096607109477c6da14e5505afaa97d2b08ecb Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 16:12:42 -0500
Subject: [PATCH 101/140] test-backend-ops : test scale parameter of
 ggml_soft_max_ext

---
 tests/test-backend-ops.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index b776e493a34a3..a529dfdb5158b 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1056,18 +1056,20 @@ struct test_diag_mask_inf : public test_case {
 struct test_soft_max : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
+    const float scale;
 
     std::string vars() override {
-        return VARS_TO_STR2(type, ne);
+        return VARS_TO_STR3(type, ne, scale);
     }
 
     test_soft_max(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
-        : type(type), ne(ne) {}
+            std::array<int64_t, 4> ne = {10, 10, 10, 10},
+            float scale = 1.0f)
+        : type(type), ne(ne), scale(scale) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_soft_max(ctx, a);
+        ggml_tensor * out = ggml_soft_max_ext(ctx, a, nullptr, scale);
         return out;
     }
 };
@@ -1825,6 +1827,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         exponent <<= 1;
     }
 
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 16, 1, 1}, 0.1f));
+
     for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
         test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B
         test_cases.emplace_back(new test_rope(type, {128,  40, 10, 1}, 128, 0, 512)); // llama 13B

From 308f279622becca8f943f7da4684d53e5ec4f143 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 16:16:58 -0500
Subject: [PATCH 102/140] kompute : support scale parameter of softmax

---
 ggml-kompute.cpp                | 20 ++++++++++++--------
 kompute-shaders/op_softmax.comp |  5 +++--
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index b8aae17fda862..aafd29850058d 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -762,21 +762,24 @@ static void ggml_vk_gelu(Args&&... args) {
     ggml_vk_xxlu(spirv, "gelu", std::forward<Args>(args)...);
 }
 
-static void ggml_vk_soft_max(kp::Sequence& seq,
-                      const std::shared_ptr<kp::Tensor>& in,
-                      const std::shared_ptr<kp::Tensor>& out,
-                      uint32_t inOff, uint32_t outOff,
-                      int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03) {
-
+static void ggml_vk_soft_max(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& in,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inOff, uint32_t outOff,
+    int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
+    float scale
+) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
         kp::shader_data::op_softmax_comp_spv_len);
 
     struct PushConstants {
         uint32_t inOff, outOff;
         int32_t ne00, ne01, ne02;
+        float scale;
     } pushConsts {
         safe_divide(inOff, 4), safe_divide(outOff, 4),
-        ne00, ne01, ne02
+        ne00, ne01, ne02, scale
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
@@ -1548,7 +1551,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_SOFT_MAX:
                     {
-                        ggml_vk_soft_max(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03);
+                        const float scale = ((float *) dst->op_params)[0];
+                        ggml_vk_soft_max(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, scale);
                     } break;
                 case GGML_OP_DIAG_MASK_INF:
                     {
diff --git a/kompute-shaders/op_softmax.comp b/kompute-shaders/op_softmax.comp
index 89de1b701851d..fea371788d661 100644
--- a/kompute-shaders/op_softmax.comp
+++ b/kompute-shaders/op_softmax.comp
@@ -15,6 +15,7 @@ layout(push_constant) uniform PushConstants {
     int ne00;
     int ne01;
     int ne02;
+    float scale;
 } pcs;
 
 void main() {
@@ -32,14 +33,14 @@ void main() {
     // parallel max
     float localMax = uintBitsToFloat(0xFF800000);
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        localMax = max(localMax, in_[psrc0 + i00]);
+        localMax = max(localMax, in_[psrc0 + i00]*pcs.scale);
     }
     float max_ = subgroupMax(localMax);
 
     // parallel sum
     float localSum = 0.0f;
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
+        const float exp_psrc0 = exp(in_[psrc0 + i00]*pcs.scale - max_);
         localSum += exp_psrc0;
         out_[pdst + i00] = exp_psrc0;
     }

From 8bd38fe32d8ad6dc525910016756fef39c4f97ee Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 16:28:41 -0500
Subject: [PATCH 103/140] test-backend-ops : test mask parameter of
 ggml_soft_max_ext

---
 tests/test-backend-ops.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index a529dfdb5158b..a6486f34e6ab0 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1057,19 +1057,23 @@ struct test_soft_max : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const float scale;
+    const bool mask;
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, scale);
+        return VARS_TO_STR4(type, ne, scale, mask);
     }
 
     test_soft_max(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            float scale = 1.0f)
-        : type(type), ne(ne), scale(scale) {}
+            float scale = 1.0f,
+            bool mask = false)
+        : type(type), ne(ne), scale(scale), mask(mask) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_soft_max_ext(ctx, a, nullptr, scale);
+        ggml_tensor * b = nullptr;
+        if (mask) { b = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]); }
+        ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale);
         return out;
     }
 };
@@ -1827,7 +1831,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         exponent <<= 1;
     }
 
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 16, 1, 1}, 0.1f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));
 
     for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
         test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B

From df687b10abdd9205e61bbef4ca4775ab4910b8ff Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 16:51:27 -0500
Subject: [PATCH 104/140] kompute : support mask parameter of softmax

---
 ggml-kompute.cpp                | 22 ++++++++++++++--------
 kompute-shaders/op_softmax.comp | 16 ++++++++++------
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index aafd29850058d..b3ca984b445e3 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -764,9 +764,10 @@ static void ggml_vk_gelu(Args&&... args) {
 
 static void ggml_vk_soft_max(
     kp::Sequence& seq,
-    const std::shared_ptr<kp::Tensor>& in,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
-    uint32_t inOff, uint32_t outOff,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
     float scale
 ) {
@@ -774,22 +775,27 @@ static void ggml_vk_soft_max(
         kp::shader_data::op_softmax_comp_spv_len);
 
     struct PushConstants {
-        uint32_t inOff, outOff;
+        uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
         float scale;
+        int32_t mask;
     } pushConsts {
-        safe_divide(inOff, 4), safe_divide(outOff, 4),
-        ne00, ne01, ne02, scale
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02,
+        scale,
+        bool(inB)
     };
 
+    auto & inB_ = inB ? inB : inA;
+
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
         // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device
         const uint32_t local_x = 32;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
-        s_algo->setTensors({in, out});
+        s_algo->setTensors({inA, inB_, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
@@ -1552,7 +1558,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 case GGML_OP_SOFT_MAX:
                     {
                         const float scale = ((float *) dst->op_params)[0];
-                        ggml_vk_soft_max(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, scale);
+                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                     } break;
                 case GGML_OP_DIAG_MASK_INF:
                     {
diff --git a/kompute-shaders/op_softmax.comp b/kompute-shaders/op_softmax.comp
index fea371788d661..7bc9176cabaae 100644
--- a/kompute-shaders/op_softmax.comp
+++ b/kompute-shaders/op_softmax.comp
@@ -6,16 +6,19 @@
 
 layout(local_size_x_id = 0) in;
 
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
 
 layout(push_constant) uniform PushConstants {
-    uint inOff;
+    uint inAOff;
+    uint inBOff;
     uint outOff;
     int ne00;
     int ne01;
     int ne02;
     float scale;
+    int mask;
 } pcs;
 
 void main() {
@@ -27,20 +30,21 @@ void main() {
     const uint i01 = gl_WorkGroupID.x;
 
     const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
-    const uint psrc0 = extra_off + pcs.inOff; // Based from in_
+    const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
+    const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
     const uint pdst = extra_off + pcs.outOff; // Based from out_
 
     // parallel max
     float localMax = uintBitsToFloat(0xFF800000);
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        localMax = max(localMax, in_[psrc0 + i00]*pcs.scale);
+        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
     }
     float max_ = subgroupMax(localMax);
 
     // parallel sum
     float localSum = 0.0f;
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        const float exp_psrc0 = exp(in_[psrc0 + i00]*pcs.scale - max_);
+        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
         localSum += exp_psrc0;
         out_[pdst + i00] = exp_psrc0;
     }

From ebb5f7e968d1ddcb50639d57d8c516052347d1af Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 16:55:27 -0500
Subject: [PATCH 105/140] test-backend-ops : test llama with different batch
 sizes

---
 tests/test-backend-ops.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index a6486f34e6ab0..a7cfa2c04519f 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1870,7 +1870,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
 #endif
 
-    test_cases.emplace_back(new test_llama());
+    test_cases.emplace_back(new test_llama(1));
+    test_cases.emplace_back(new test_llama(2));
 
     // run tests
     if (mode == MODE_TEST) {

From ec68a9657f1d1b93a790123b9db4e4c8726a55ac Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 24 Jan 2024 17:31:34 -0500
Subject: [PATCH 106/140] test-backend-ops : increase max_nmse_err so Llama
 passes

---
 tests/test-backend-ops.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index a7cfa2c04519f..22c08e4ffc330 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1504,6 +1504,10 @@ struct test_llama : public test_case {
         return VARS_TO_STR1(n_tokens);
     }
 
+    double max_nmse_err() override {
+        return 2e-3;
+    }
+
     test_llama(int n_tokens = 1)
         : n_tokens(n_tokens) {
     }

From 987335ea0a384c5242de549f6a9cd899f4137562 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 11:09:18 -0500
Subject: [PATCH 107/140] kompute : fix algorithm names

---
 ggml-kompute.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index b3ca984b445e3..f45902e3691f9 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -700,7 +700,7 @@ static void ggml_vk_scale(kp::Sequence& seq,
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({in, out});
@@ -727,7 +727,7 @@ static void ggml_vk_xxlu(
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({in, out});
@@ -826,7 +826,7 @@ static void ggml_vk_norm_(
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({in, out});
@@ -1052,7 +1052,7 @@ static void ggml_vk_mul_mat_impl(
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
         const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({inA, inB, out});
@@ -1140,7 +1140,7 @@ static void ggml_vk_get_rows(
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
         s_algo->setTensors({inA, inB, out});

From f5ac635473301c0b9b538cbdc024d753b223be01 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 11:27:11 -0500
Subject: [PATCH 108/140] kompute : fix q8_0 mmv, 41 -> 28 failures

---
 ggml-kompute.cpp                     | 48 +++-----------
 kompute-shaders/common.comp          |  8 +++
 kompute-shaders/op_mul_mat_q8_0.comp | 99 ++++++++++++++++------------
 3 files changed, 75 insertions(+), 80 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index f45902e3691f9..030cb7a23df17 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -935,44 +935,6 @@ static void ggml_vk_mul_mat_f16(
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_mul_mat_q8_0(kp::Sequence& seq,
-                         const std::shared_ptr<kp::Tensor>& inA,
-                         const std::shared_ptr<kp::Tensor>& inB,
-                         const std::shared_ptr<kp::Tensor>& out,
-                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                         int32_t ne00, int32_t ne01,
-                         uint32_t nb01, uint32_t nb02,
-                         int32_t ne11, int32_t ne12,
-                         uint32_t nb11, uint32_t nb12,
-                         int32_t ne0, int32_t ne1) {
-    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
-        kp::shader_data::op_mul_mat_q8_0_comp_spv_len);
-    struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
-        int32_t ne00;
-        uint32_t nb01, nb02;
-        uint32_t nb11, nb12;
-        int32_t ne0, ne1;
-    } pushConsts {
-        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, nb01, nb02, nb11, nb12, ne0, ne1,
-    };
-
-    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
-    } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
-        s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
-        s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
-    }
-    seq.record<kp::OpAlgoDispatch>(s_algo);
-}
-
-
 static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
                          const std::shared_ptr<kp::Tensor>& inA,
                          const std::shared_ptr<kp::Tensor>& inB,
@@ -1079,6 +1041,14 @@ static void ggml_vk_mul_mat_q4_1(Args&&... args) {
     ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
+template <typename... Args>
+static void ggml_vk_mul_mat_q8_0(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
+        kp::shader_data::op_mul_mat_q8_0_comp_spv_len);
+
+    ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+}
+
 static void ggml_vk_mul_mat_q6_k(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
@@ -1618,7 +1588,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                             case GGML_TYPE_Q8_0:
                                 ggml_vk_mul_mat_q8_0(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_0:
diff --git a/kompute-shaders/common.comp b/kompute-shaders/common.comp
index 0df6db7d046fc..0918657d5695c 100644
--- a/kompute-shaders/common.comp
+++ b/kompute-shaders/common.comp
@@ -95,3 +95,11 @@ mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
     }
     return reg;
 }
+
+
+#define QK8_0 32
+// struct block_q8_0 {
+//     float16_t d;         // delta
+//     int8_t    qs[QK8_0]; // quants
+// };
+#define sizeof_block_q8_0 34
diff --git a/kompute-shaders/op_mul_mat_q8_0.comp b/kompute-shaders/op_mul_mat_q8_0.comp
index 1c4ddbb083ed7..34d015e90b84c 100644
--- a/kompute-shaders/op_mul_mat_q8_0.comp
+++ b/kompute-shaders/op_mul_mat_q8_0.comp
@@ -2,55 +2,72 @@
 
 #include "common.comp"
 
-#define BLOCKS_IN_QUANT QK8_0
-#define SIZE_OF_BLOCK sizeof_block_q8_0
-#define N_ROWS 4
-
-layout(local_size_x_id = 0) in;
-layout(local_size_y = 1) in;
-layout(local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    int ne00;
-    int ne10;
-    int ne0;
-    int ne1;
-    int ne01;
-    int gqa;
-} pcs;
-
-#define ELS_PER_BLOCK 32
+#include "op_mul_mv_q_n_pre.comp"
+
 #define SIZE_OF_D 2
-#define BLOCK_SIZE (ELS_PER_BLOCK + SIZE_OF_D)
+
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
+#define NB_Q8_0 8
 
 void main() {
+    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
+    if (gl_SubgroupInvocationID > 31)
+        return;
+
+    const int nr  = N_DST;
+    const int nsg = N_SIMDGROUP;
+    const int nw  = N_SIMDWIDTH;
+
+    const int nb = pcs.ne00/QK8_0;
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
     const uint im = gl_WorkGroupID.z;
 
-    const uint x = r0 * (pcs.ne00/ELS_PER_BLOCK) * BLOCK_SIZE + pcs.inAOff; // Based from inA
-    const uint y = r1 * pcs.ne10 + pcs.inBOff; // based from inB
-
-    float sumf = 0.0f;
-    for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
-        const uint block_number = i / ELS_PER_BLOCK;
-        const uint block_offset = block_number * BLOCK_SIZE;
-        const float d = u8BufToFloat16(inA, x + block_offset);
-        const uint position_in_block = i % ELS_PER_BLOCK;
-        const int q = int8_t(inA[x+block_offset+SIZE_OF_D+position_in_block]);
-        const float dq = d * q;
-        sumf += dq * float(inB[y+i]);
+    const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
+
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
+
+    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+
+    const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
+    const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
+
+    float yl[NB_Q8_0];
+    float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
+
+    const uint ix = gl_SubgroupInvocationID.x/4;
+    const uint il = gl_SubgroupInvocationID.x%4;
+
+    uint yb = y + ix * QK8_0 + NB_Q8_0*il;
+
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (uint ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
+            yl[i] = inB[yb + i];
+        }
+
+        for (int row = 0; row < nr; row++) {
+            const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
+            float sumq = 0.f;
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
+                const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
+                sumq += qs_iq * yl[iq];
+            }
+            const float16_t d = u8BufToFloat16(inA, x + block_offset);
+            sumf[row] += sumq*d;
+        }
+
+        yb += NB_Q8_0 * nw;
     }
 
-    const float all_sum = subgroupAdd(sumf);
-    if (subgroupElect()) {
-        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
+    for (int row = 0; row < nr; ++row) {
+        const float tot = subgroupAdd(sumf[row]);
+        if (subgroupElect() && first_row + row < pcs.ne01) {
+            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot;
+        }
     }
 }

From 1849b854731664aab54ea344d3ba39225473f93b Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 13:55:49 -0500
Subject: [PATCH 109/140] test-backend-ops : add Falcon test

---
 tests/test-backend-ops.cpp | 345 ++++++++++++++++++++++++++++---------
 1 file changed, 261 insertions(+), 84 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 22c08e4ffc330..08008a9ce5bfb 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1471,53 +1471,64 @@ struct test_moe : public test_case {
 };
 
 
-// llama
-struct test_llama : public test_case {
-    const int n_tokens;
-    static constexpr float f_norm_rms_eps = 1e-5;
-    static constexpr int64_t n_embd_k_gqa = 3200;
-    static constexpr int64_t n_embd_v_gqa = 3200;
-    static constexpr int64_t n_ctx = 512;
-    static constexpr int64_t n_layer = 1;
-    static constexpr int64_t n_head = 32;
-    static constexpr int64_t n_head_kv = 32;
-    static constexpr int64_t n_embd_head = 100;
-    static constexpr int64_t n_embd = 3200;
-    static constexpr int64_t n_orig_ctx = n_ctx;
-    static constexpr int64_t n_ff = 8640;
-    static constexpr int64_t n_kv = 32;
-    static constexpr int64_t kv_head = 1;
-    static constexpr float freq_base = 10000.0f;
-    static constexpr float freq_scale = 1.0f;
-    static constexpr float ext_factor = 0.0f;
-    static constexpr float attn_factor = 1.0f;
-    static constexpr float beta_fast = 32.0f;
-    static constexpr float beta_slow = 1.0f;
+enum llm_norm_type {
+    LLM_NORM,
+    LLM_NORM_RMS,
+};
 
-    std::string op_desc(ggml_tensor * t) override {
-        return "LLAMA";
+struct llama_hparams {
+    uint32_t n_vocab;
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    static constexpr uint32_t n_layer = 1;
+    uint32_t n_rot;
+    uint32_t n_embd_head; // dimension of values (d_v)
+    uint32_t n_ff;
 
-        GGML_UNUSED(t);
-    }
+    float f_norm_eps;
+    float f_norm_rms_eps;
 
-    std::string vars() override {
-        return VARS_TO_STR1(n_tokens);
-    }
+    // cparams
+    static constexpr uint32_t n_ctx = 512; // user-specified context size
+    static constexpr uint32_t n_orig_ctx = n_ctx;
 
-    double max_nmse_err() override {
-        return 2e-3;
+    // batch
+    int32_t n_tokens;
+
+    // llm_build_context
+    static constexpr int32_t n_kv    = 32; // size of KV cache to consider (n_kv <= n_ctx
+    static constexpr int32_t kv_head = 1;  // index of where we store new KV data in the cache
+
+    uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
+        return n_embd_head * n_head_kv;
     }
+};
 
-    test_llama(int n_tokens = 1)
-        : n_tokens(n_tokens) {
+// LLM base class
+struct test_llm : public test_case {
+    llama_hparams hp;
+
+protected:
+    test_llm(llama_hparams hp)
+        : hp(std::move(hp)) {
     }
 
+public:
     struct ggml_tensor * llm_build_norm(
             struct ggml_context * ctx,
              struct ggml_tensor * cur,
-             struct ggml_tensor * mw) {
-        cur = ggml_rms_norm(ctx, cur, f_norm_rms_eps);
+             struct ggml_tensor * mw,
+             struct ggml_tensor * mb,
+                  llm_norm_type   type) {
+        switch (type) {
+            case LLM_NORM:     cur = ggml_norm    (ctx, cur, hp.f_norm_eps); break;
+            case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
+        }
         cur = ggml_mul(ctx, cur, mw);
+        if (mb) {
+            cur = ggml_add(ctx, cur, mb);
+        }
         return cur;
     }
 
@@ -1528,14 +1539,14 @@ struct test_llama : public test_case {
              struct ggml_tensor * k_cur,
              struct ggml_tensor * v_cur) {
         // compute the transposed [n_tokens, n_embd] V matrix
-        struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
+        struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
 
-        struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, n_tokens*n_embd_k_gqa,
-                (ggml_row_size(k_l->type, n_embd_k_gqa))*kv_head);
+        struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
+                (ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
 
-        struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, n_tokens, n_embd_v_gqa,
-                (  n_ctx)*ggml_element_size(v_l),
-                (kv_head)*ggml_element_size(v_l));
+        struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
+                (  hp.n_ctx)*ggml_element_size(v_l),
+                (hp.kv_head)*ggml_element_size(v_l));
 
         // important: storing RoPE-ed version of K in the KV cache!
         ggml_cpy(ctx, k_cur,   k_cache_view);
@@ -1554,9 +1565,9 @@ struct test_llama : public test_case {
 
         struct ggml_tensor * k =
             ggml_view_3d(ctx, k_l,
-                    n_embd_head, n_kv, n_head_kv,
-                    ggml_row_size(k_l->type, n_embd_k_gqa),
-                    ggml_row_size(k_l->type, n_embd_head),
+                    hp.n_embd_head, hp.n_kv, hp.n_head_kv,
+                    ggml_row_size(k_l->type, hp.n_embd_gqa()),
+                    ggml_row_size(k_l->type, hp.n_embd_head),
                     0);
 
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@@ -1566,52 +1577,105 @@ struct test_llama : public test_case {
         // split cached v into n_head heads
         struct ggml_tensor * v =
             ggml_view_3d(ctx, v_l,
-                    n_kv, n_embd_head, n_head_kv,
-                    ggml_element_size(v_l)*n_ctx,
-                    ggml_element_size(v_l)*n_ctx*n_embd_head,
+                    hp.n_kv, hp.n_embd_head, hp.n_head_kv,
+                    ggml_element_size(v_l)*hp.n_ctx,
+                    ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
                     0);
 
         struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
 
         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
 
-        struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head*n_head, n_tokens);
+        struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
 
-        struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 3200);
+        struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
         cur = ggml_mul_mat(ctx, wo, cur);
 
         return cur;
     }
 
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        const int64_t n_rot = n_embd_head;
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I32) {
+                // pos
+                std::vector<int> data(hp.n_tokens);
+                for (int i = 0; i < hp.n_tokens; i++) {
+                    data[i] = rand() % hp.n_ctx;
+                }
+                ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+};
+
+
+// Llama
+struct test_llama : public test_llm {
+    static constexpr float freq_base = 10000.0f;
+    static constexpr float freq_scale = 1.0f;
+    static constexpr float ext_factor = 0.0f;
+    static constexpr float attn_factor = 1.0f;
+    static constexpr float beta_fast = 32.0f;
+    static constexpr float beta_slow = 1.0f;
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return "LLAMA";
+    }
+
+    std::string vars() override {
+        auto n_tokens = hp.n_tokens;
+        return VARS_TO_STR1(n_tokens);
+    }
 
+    double max_nmse_err() override {
+        return 2e-3;
+    }
+
+    test_llama(int n_tokens = 1)
+        : test_llm({
+            /*n_vocab        =*/ 32000,
+            /*n_embd         =*/ 3200,
+            /*n_head         =*/ 32,
+            /*n_head_kv      =*/ 32,
+            /*n_rot          =*/ 100,
+            /*n_embd_head    =*/ 100,
+            /*n_ff           =*/ 8640,
+            /*f_norm_eps     =*/ 0.f,
+            /*f_norm_rms_eps =*/ 1e-5f,
+            /*n_tokens       =*/ n_tokens,
+        }) {
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
 
         ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
         ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
 
-        for (int il = 0; il < n_layer; ++il) {
+        for (uint32_t il = 0; il < hp.n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
             // norm
-            ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3200);
-            cur = llm_build_norm(ctx, inpL, attn_norm);
+            ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
+            cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
 
             // self-attention
             {
-                ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 3200);
-                ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 3200);
-                ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 3200);
+                ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
+                ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
+                ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
@@ -1619,31 +1683,31 @@ struct test_llama : public test_case {
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
 
                 Qcur = ggml_rope_custom(
-                    ctx, ggml_reshape_3d(ctx, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head,    hp.n_tokens), inp_pos,
+                    hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
                 Kcur = ggml_rope_custom(
-                    ctx, ggml_reshape_3d(ctx, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
+                    hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
                 llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
 
-                cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(n_embd_head)));
+                cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
 
             // feed-forward network
-            ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3200);
-            cur = llm_build_norm(ctx, ffn_inp, ffn_norm);
+            ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
+            cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
 
-            ggml_tensor * ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 8640);
-            ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 8640);
-            ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 8640, 3200);
+            ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
+            ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff,   hp.n_embd);
+            ggml_tensor * ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
             struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
             cur = ggml_mul_mat(ctx, ffn_gate, cur);
             cur = ggml_silu(ctx, cur);
@@ -1658,29 +1722,138 @@ struct test_llama : public test_case {
 
         cur = inpL;
 
-        ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3200);
-        cur = llm_build_norm(ctx, cur, output_norm);
+        ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
+        cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
 
         // lm_head
-        ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 3200, 32000);
+        ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
         cur = ggml_mul_mat(ctx, output, cur);
 
         return cur;
     }
+};
 
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                // pos
-                std::vector<int> data(n_tokens);
-                for (int i = 0; i < n_tokens; i++) {
-                    data[i] = rand() % n_ctx;
-                }
-                ggml_backend_tensor_set(t, data.data(), 0, n_tokens * sizeof(int));
-            } else {
-                init_tensor_uniform(t);
+// Falcon
+struct test_falcon : public test_llm {
+    static constexpr float freq_base = 10000.0f;
+    static constexpr float freq_scale = 1.0f;
+    static constexpr float ext_factor = 0.0f;
+    static constexpr float attn_factor = 1.0f;
+    static constexpr float beta_fast = 32.0f;
+    static constexpr float beta_slow = 1.0f;
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return "FALCON";
+    }
+
+    std::string vars() override {
+        auto n_tokens = hp.n_tokens;
+        return VARS_TO_STR1(n_tokens);
+    }
+
+    double max_nmse_err() override {
+        return 2e-3;
+    }
+
+    test_falcon(int n_tokens = 1)
+        : test_llm({
+            /*n_vocab        =*/ 65024,
+            /*n_embd         =*/ 4544,
+            /*n_head         =*/ 71,
+            /*n_head_kv      =*/ 1,
+            /*n_rot          =*/ 64,
+            /*n_embd_head    =*/ 64,
+            /*n_ff           =*/ 18176,
+            /*f_norm_eps     =*/ 1e-5f,
+            /*f_norm_rms_eps =*/ 0.f,
+            /*n_tokens       =*/ n_tokens,
+        }) {
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
+
+        ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
+        ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
+
+        for (uint32_t il = 0; il < hp.n_layer; ++il) {
+            // norm
+            ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
+            ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
+            ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
+
+            // self-attention
+            {
+                cur = attn_norm;
+
+                ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
+
+                cur = ggml_mul_mat(ctx, wqkv, cur);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd,     hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
+
+                Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head,    hp.n_tokens);
+                Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
+
+                // using mode = 2 for neox mode
+                Qcur = ggml_rope_custom(
+                    ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_custom(
+                    ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
+
+                cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
             }
+
+            struct ggml_tensor * ffn_inp = cur;
+
+            // feed forward
+            {
+                ggml_tensor * ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
+                ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
+                cur = attn_norm;
+                cur = ggml_mul_mat(ctx, ffn_up, cur);
+                cur = ggml_gelu(ctx, cur);
+                cur = ggml_mul_mat(ctx, ffn_down, cur);
+            }
+
+            cur = ggml_add(ctx, cur, ffn_inp);
+
+            cur = ggml_add(ctx, cur, inpL);
+
+            // input for next layer
+            inpL = cur;
         }
+
+        cur = inpL;
+
+        ggml_tensor * output_norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
+        ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
+        cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
+
+        // lm_head
+        ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
+        cur = ggml_mul_mat(ctx, output, cur);
+
+        return cur;
     }
 };
 
@@ -1821,6 +1994,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10,  1}, 5));
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
 
+#if 0
     std::uniform_int_distribution<> dist_ne1(1, 50);
     int exponent = 1;
     while (exponent < (1 << 17)) {
@@ -1834,6 +2008,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 
         exponent <<= 1;
     }
+#endif
 
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));
@@ -1876,6 +2051,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 
     test_cases.emplace_back(new test_llama(1));
     test_cases.emplace_back(new test_llama(2));
+    test_cases.emplace_back(new test_falcon(1));
+    test_cases.emplace_back(new test_falcon(2));
 
     // run tests
     if (mode == MODE_TEST) {

From 6fc99a6e668d0eaacc3e519ae6d999974adbeb42 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 15:01:21 -0500
Subject: [PATCH 110/140] test-backend-ops : test larger GELU range

---
 tests/test-backend-ops.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 08008a9ce5bfb..dc6e0e682b495 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -616,6 +616,13 @@ struct test_unary : public test_case {
         ggml_tensor * out = ggml_unary(ctx, in, op);
         return out;
     }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            // test extended range of values to check for NaNs in GELU
+            init_tensor_uniform(t, -150.f, 150.f);
+        }
+    }
 };
 
 // GGML_OP_GET_ROWS

From 38d1f0c7a0520c53ed9225d3aad8d15d87102d05 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 14:35:40 -0500
Subject: [PATCH 111/140] kompute : fix op_gelu -> Falcon is working on AMDVLK

---
 kompute-shaders/op_gelu.comp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kompute-shaders/op_gelu.comp b/kompute-shaders/op_gelu.comp
index 5b547f414a10b..9d8c53710afbf 100644
--- a/kompute-shaders/op_gelu.comp
+++ b/kompute-shaders/op_gelu.comp
@@ -17,6 +17,6 @@ void main() {
     for (uint x = 0; x < 8; x++) {
         const uint i = baseIndex + x;
         const float y = in_[i + pcs.inOff];
-        out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y)));
+        out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
     }
 }

From 11b305082b322cf3b384329a6b388321ae5716aa Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 15:05:55 -0500
Subject: [PATCH 112/140] test-backend-ops : restore softmax tests

---
 tests/test-backend-ops.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index dc6e0e682b495..acc450c9961e6 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2001,7 +2001,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10,  1}, 5));
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
 
-#if 0
     std::uniform_int_distribution<> dist_ne1(1, 50);
     int exponent = 1;
     while (exponent < (1 << 17)) {
@@ -2015,7 +2014,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 
         exponent <<= 1;
     }
-#endif
 
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));

From de9fba0d393840015663c601706edf08f8e19d5d Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 15:22:11 -0500
Subject: [PATCH 113/140] kompute : fix basic f16 get_rows, 28 -> 26 failures

---
 kompute-shaders/op_getrows_f16.comp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kompute-shaders/op_getrows_f16.comp b/kompute-shaders/op_getrows_f16.comp
index 34acbcd700f72..48c9361081138 100644
--- a/kompute-shaders/op_getrows_f16.comp
+++ b/kompute-shaders/op_getrows_f16.comp
@@ -27,5 +27,5 @@ void main() {
     const uint i = gl_WorkGroupID.x;
     const int r = inB[i + pcs.inBOff];
 
-    dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1 + pcs.outOff, pcs.ne00);
+    dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
 }

From 445a3734b743e9230a09e3198d8466da8843e4a7 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 15:38:39 -0500
Subject: [PATCH 114/140] kompute : fix basic Q6_K get_rows, 26 -> 24 failures

---
 kompute-shaders/common.comp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/kompute-shaders/common.comp b/kompute-shaders/common.comp
index 0918657d5695c..62d62b025ec6d 100644
--- a/kompute-shaders/common.comp
+++ b/kompute-shaders/common.comp
@@ -73,9 +73,6 @@ struct block_q6_k {
 };
 mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
     const float16_t d_all = xb.d;
-    uint8_t ql[QK_K/2];
-    uint8_t qh[QK_K/4];
-    int8_t  scales[QK_K/16];
 
     const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
     const uint qhIndex = 32*(il/8) + 16*(il&1);
@@ -89,8 +86,8 @@ mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
     const float16_t dl = float16_t(d_all * sc * coef);
     mat4 reg;
     for (int i = 0; i < 16; ++i) {
-        const float16_t q = (il&1) != 0 ? ((ql[qlIndex + i] & kmask2) | ((qh[qhIndex + i] & kmask1) << 2))
-                                        : ((ql[qlIndex + i] & kmask2) | ((qh[qhIndex + i] & kmask1) << 4));
+        const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
+                                        : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
         reg[i/4][i%4] = dl * q - ml;
     }
     return reg;

From 3fbf0529ef6d76b68561036c6669e071d806ee9f Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 15:47:43 -0500
Subject: [PATCH 115/140] kompute : mark last few failing ops as unsupported

---
 ggml-kompute.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 030cb7a23df17..b4781d3da89bd 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1350,7 +1350,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
                 case GGML_TYPE_Q6_K:
-                    return op->ne[3] == 1;
+                    return op->ne[2] == 1 && op->ne[3] == 1;
                 default:
                     ;
             }
@@ -1361,11 +1361,12 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
 
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
+                case GGML_TYPE_Q6_K:
+                    return op->ne[3] == 1;
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q6_K:
                     return true;
                 default:
                     ;

From 3915194232b4ad2a4359edefdd8ddccd6eb8d267 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 15:56:42 -0500
Subject: [PATCH 116/140] test-backend-ops : make Falcon test faster with a
 smaller model

---
 tests/test-backend-ops.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index acc450c9961e6..621e3c45563e4 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1765,13 +1765,13 @@ struct test_falcon : public test_llm {
 
     test_falcon(int n_tokens = 1)
         : test_llm({
-            /*n_vocab        =*/ 65024,
-            /*n_embd         =*/ 4544,
-            /*n_head         =*/ 71,
+            /*n_vocab        =*/ 32000,
+            /*n_embd         =*/ 3200,
+            /*n_head         =*/ 50,
             /*n_head_kv      =*/ 1,
             /*n_rot          =*/ 64,
             /*n_embd_head    =*/ 64,
-            /*n_ff           =*/ 18176,
+            /*n_ff           =*/ 8640,
             /*f_norm_eps     =*/ 1e-5f,
             /*f_norm_rms_eps =*/ 0.f,
             /*n_tokens       =*/ n_tokens,

From bc287047fb9127f310cd2e14c733f2a9f6b585b6 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 10:13:09 -0500
Subject: [PATCH 117/140] kompute : remove unused immintrin.h #include

---
 ggml-kompute.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index b4781d3da89bd..8fd956e9d15ad 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -45,7 +45,6 @@
 #include <mutex>
 #include <atomic>
 #include <cstring>
-#include <immintrin.h>
 #include <kompute/Kompute.hpp>
 
 #define QK4_0 32

From 91654ff042a1c42546811be3cdb71af5f2b0a1fd Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 17:03:06 -0500
Subject: [PATCH 118/140] kompute : fix a -Wstrict-aliasing warning

---
 ggml-kompute.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 8fd956e9d15ad..31c6f0d90db9c 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1527,7 +1527,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                     } break;
                 case GGML_OP_SOFT_MAX:
                     {
-                        const float scale = ((float *) dst->op_params)[0];
+                        float scale;
+                        memcpy(&scale, dst->op_params, sizeof(float));
                         ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                     } break;
                 case GGML_OP_DIAG_MASK_INF:

From 61a5cf88dc15278ae718c82ebe1e5d4fb2f6ea89 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 12:58:50 -0500
Subject: [PATCH 119/140] kompute : remove unnecessary use_mmap=false

---
 llama.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 98ffe1ec82c4f..11ac9ceb59ac5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2312,9 +2312,6 @@ struct llama_model_loader {
             use_mmap = false;
         }
 
-#ifdef GGML_USE_KOMPUTE
-        use_mmap = false;
-#endif
         this->use_mmap = use_mmap;
     }
 

From e6ce5f21a16c499ae3fb0281f995ad78b4e64132 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 13:10:49 -0500
Subject: [PATCH 120/140] llama : revert unintended whitespace change

---
 llama.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 11ac9ceb59ac5..042b44e14d095 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4338,8 +4338,7 @@ struct llm_build_context {
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
         do_rope_shift    (worst_case || kv_self.has_shift),
         cb               (cb),
-        buf_compute_meta (lctx.buf_compute_meta)
-        {
+        buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
         }
 

From 2512799cfe7999bec825bd0694d178a9ed926151 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 13:55:10 -0500
Subject: [PATCH 121/140] test-backend-ops : comment out Llama and Falcon tests

---
 tests/test-backend-ops.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 2a6490cfffffa..cfc65d53efc2b 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2062,10 +2062,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
 #endif
 
+    // these tests are disabled to save execution time, but they can be handy for debugging
+#if 0
     test_cases.emplace_back(new test_llama(1));
     test_cases.emplace_back(new test_llama(2));
     test_cases.emplace_back(new test_falcon(1));
     test_cases.emplace_back(new test_falcon(2));
+#endif
 
     // run tests
     if (mode == MODE_TEST) {

From 8ca33dec7d0d859f508697d110566fd680e56885 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 26 Jan 2024 20:01:36 +0100
Subject: [PATCH 122/140] test-backend-ops : check all the ops in the test for
 support in the backends

---
 tests/test-backend-ops.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index cfc65d53efc2b..f3b8e9143b35f 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -364,15 +364,15 @@ struct test_case {
         printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
         fflush(stdout);
 
-        // check if backends support op
+        // check if the backends support the ops
         bool supported = true;
         for (ggml_backend_t backend : {backend1, backend2}) {
-            if (
-                !ggml_backend_supports_op(backend, out)
-                || (op_desc(out) == "MOE" && !strcmp(ggml_backend_name(backend), "Kompute"))
-            ) {
-                printf("not supported [%s] ", ggml_backend_name(backend));
-                supported = false;
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+                if (!ggml_backend_supports_op(backend, t)) {
+                    printf("not supported [%s] ", ggml_backend_name(backend));
+                    supported = false;
+                    break;
+                }
             }
         }
         if (!supported) {

From 6af02b19d128da1d73085851a6f1f6d6d92a9014 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 14:42:11 -0500
Subject: [PATCH 123/140] kompute : init device automatically and remove an
 unnecessary free

---
 examples/main/main.cpp | 4 ----
 ggml-kompute.cpp       | 9 ++++++++-
 llama.cpp              | 3 ---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 4367cf20c6717..ef80b5012d80d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -186,10 +186,6 @@ int main(int argc, char ** argv) {
     g_model = &model;
     g_ctx = &ctx;
 
-#if defined(GGML_USE_KOMPUTE)
-    ggml_vk_init_device(0, "gpu");
-#endif
-
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 31c6f0d90db9c..955e7f0776e28 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1837,6 +1837,8 @@ static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
 static void ggml_backend_kompute_free(ggml_backend_t backend) {
     struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context;
     ggml_vk_free(ctx);
+    // TODO(cebtenzzre): This should only be done if the device was initialized by us, but
+    //                   that would require a change to GPT4All.
     ggml_vk_free_device();
     delete backend;
 }
@@ -1873,6 +1875,12 @@ static struct ggml_backend_i kompute_backend_i = {
 };
 
 ggml_backend_t ggml_backend_kompute_init() {
+#if defined(GGML_USE_KOMPUTE)
+    if (!ggml_vk_has_device()) {
+        ggml_vk_init_device(0, "gpu");
+    }
+#endif
+
     if (!ggml_vk_has_device()) {
         fprintf(stderr, "%s: error: device was not initialized\n", __func__);
         return nullptr;
@@ -1897,6 +1905,5 @@ extern "C" ggml_backend_t ggml_backend_reg_kompute_init(const char * params, voi
 ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
     GGML_UNUSED(params);
     GGML_UNUSED(user_data);
-    ggml_vk_init_device(0, "gpu");
     return ggml_backend_kompute_init();
 }
diff --git a/llama.cpp b/llama.cpp
index 95ec257d51446..0da73628b168c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10162,9 +10162,6 @@ struct llama_context * llama_new_context_with_model(
 
 void llama_free(struct llama_context * ctx) {
     delete ctx;
-#ifdef GGML_USE_KOMPUTE
-    ggml_vk_free_device();
-#endif
 }
 
 const llama_model * llama_get_model(const struct llama_context * ctx) {

From 2ff2d1613181fbdbd11771302a72892c8cc1cdf9 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 14:57:58 -0500
Subject: [PATCH 124/140] ggml-kompute.h : remove anything that doesn't need to
 be public

The remaining functions are either used by llama.cpp or GPT4All.
---
 ggml-kompute.cpp | 17 +++++++++++++----
 ggml-kompute.h   | 21 ---------------------
 2 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 955e7f0776e28..e0ae6cf70f3c3 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -85,6 +85,15 @@ class kompute_manager {
 
 static kompute_manager komputeManager;
 
+struct ggml_vk_memory {
+    void *data = nullptr;
+    size_t size = 0;
+    vk::DeviceMemory *primaryMemory = nullptr;
+    vk::Buffer *primaryBuffer = nullptr;
+    vk::DeviceMemory *stagingMemory = nullptr;
+    vk::Buffer *stagingBuffer = nullptr;
+};
+
 #ifdef __linux__
 __attribute__((constructor))
 static void enable_sam() {
@@ -302,13 +311,13 @@ ggml_vk_device ggml_vk_current_device() {
     return devices.front();
 }
 
-ggml_kompute_context *ggml_vk_init() {
+static ggml_kompute_context * ggml_vk_init() {
     GGML_ASSERT(s_kompute_context == nullptr);
     s_kompute_context = new ggml_kompute_context;
     return s_kompute_context;
 }
 
-void ggml_vk_free(struct ggml_kompute_context * ctx) {
+static void ggml_vk_free(struct ggml_kompute_context * ctx) {
     assert(ctx == s_kompute_context);
     s_kompute_context = nullptr;
     if (ctx != nullptr) {
@@ -457,7 +466,7 @@ static ggml_vk_memory ggml_vk_allocate(size_t size) {
     return memory;
 }
 
-void ggml_vk_free_memory(ggml_vk_memory &memory)
+static void ggml_vk_free_memory(ggml_vk_memory &memory)
 {
     komputeManager()->device()->destroy(
       *memory.primaryBuffer,
@@ -1376,7 +1385,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
     return false;
 }
 
-void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
+static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
     const int n_seq = 8;
 
     // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting
diff --git a/ggml-kompute.h b/ggml-kompute.h
index 288c835c55ab5..63048213fd167 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -6,22 +6,6 @@
 #include <vector>
 #include <string>
 
-struct ggml_kompute_context;
-
-namespace vk {
-    class DeviceMemory;
-    class Buffer;
-};
-
-struct ggml_vk_memory {
-    void *data = nullptr;
-    size_t size = 0;
-    vk::DeviceMemory *primaryMemory = nullptr;
-    vk::Buffer *primaryBuffer = nullptr;
-    vk::DeviceMemory *stagingMemory = nullptr;
-    vk::Buffer *stagingBuffer = nullptr;
-};
-
 struct ggml_vk_device {
     int index = 0;
     int type = 0;           // same as VkPhysicalDeviceType
@@ -40,11 +24,6 @@ bool ggml_vk_has_vulkan();
 bool ggml_vk_has_device();
 bool ggml_vk_using_vulkan();
 ggml_vk_device ggml_vk_current_device();
-struct ggml_kompute_context * ggml_vk_init(void);
-void ggml_vk_free(struct ggml_kompute_context * ctx);
-void ggml_vk_free_memory(ggml_vk_memory &memory);
-
-void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
 
 //
 // backend API

From cdab4043b33152d5380e1ee8fb2a73c8e029c3a4 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 15:08:31 -0500
Subject: [PATCH 125/140] kompute : fix #includes

---
 ggml-kompute.cpp | 23 ++++++++++++++---------
 ggml-kompute.h   |  3 ++-
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index e0ae6cf70f3c3..c9abd6c688ba6 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -33,19 +33,24 @@
 #include "shaderop_cpy_f32_f16.h"
 #include "shaderop_cpy_f32_f32.h"
 
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstring>
 #include <iostream>
-#include <vector>
-#include <string>
 #include <memory>
-#include <vector>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
 #include <utility>
-#include <fstream>
-#include <exception>
-#include <thread>
-#include <mutex>
-#include <atomic>
-#include <cstring>
+#include <vector>
+
 #include <kompute/Kompute.hpp>
+#include <vulkan/vulkan.hpp>
+
+#ifdef __linux__
+#include <cstdlib> // for setenv
+#endif
 
 #define QK4_0 32
 #define QR4_0 2
diff --git a/ggml-kompute.h b/ggml-kompute.h
index 63048213fd167..d4a86d31df22d 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -1,10 +1,11 @@
 #pragma once
 
+#include "ggml.h"
 #include "ggml-backend.h"
 
 #include <cstddef>
-#include <vector>
 #include <string>
+#include <vector>
 
 struct ggml_vk_device {
     int index = 0;

From 454baebacc1c54b74c75acd68d242acc88df610c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 15:44:13 -0500
Subject: [PATCH 126/140] op_mul_mat_mat_f32.comp : fix missing final newline

---
 kompute-shaders/op_mul_mat_mat_f32.comp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kompute-shaders/op_mul_mat_mat_f32.comp b/kompute-shaders/op_mul_mat_mat_f32.comp
index 6cc5558b2725d..d1ca4ad6c2528 100644
--- a/kompute-shaders/op_mul_mat_mat_f32.comp
+++ b/kompute-shaders/op_mul_mat_mat_f32.comp
@@ -48,4 +48,4 @@ void main() {
   if (subgroupElect()) {
     out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
   }
-}
\ No newline at end of file
+}

From 297fde5f58c0cdc9515d4d781e00d0f4fdaeefe2 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 15:48:35 -0500
Subject: [PATCH 127/140] editorconfig-checker : exclude .gitmodules

---
 .ecrc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ecrc b/.ecrc
index b682057dd6891..a3351f4e6442d 100644
--- a/.ecrc
+++ b/.ecrc
@@ -1,4 +1,5 @@
 {
+  "Exclude": ["^\\.gitmodules$"],
   "Disable": {
     "IndentSize": true
   }

From 91324851a3e0b470e157bd84f12c91cc759aac8f Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 16:36:31 -0500
Subject: [PATCH 128/140] ci : initial attempt at testing Kompute backend

---
 .github/workflows/build.yml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d22a041a66b73..30abc7307479a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -296,6 +296,7 @@ jobs:
       OPENCL_VERSION: 2023.04.17
       CLBLAST_VERSION: 1.6.0
       SDE_VERSION: 9.33.0-2024-01-07
+      VULKAN_VERSION: 1.3.261.1
 
     strategy:
       matrix:
@@ -312,6 +313,8 @@ jobs:
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+          - build: 'kompute'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
 
     steps:
       - name: Clone
@@ -320,6 +323,12 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: Clone Kompute submodule
+        id: clone_kompute
+        if: ${{ matrix.build == 'kompute' }}
+        run: |
+          git submodule update --init kompute
+
       - name: Download OpenCL SDK
         id: get_opencl
         if: ${{ matrix.build == 'clblast' }}
@@ -354,6 +363,15 @@ jobs:
           $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
           & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
 
+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.build == 'kompute' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          $env:RUNNER_TEMP/VulkanSDK-Installer.exe --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
       - name: Build
         id: cmake_build
         run: |

From 57cecad175bb4fe1b4d8f52d34d7974bf958013c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 16:37:33 -0500
Subject: [PATCH 129/140] main : remove ggml-kompute.h #include

---
 examples/main/main.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ef80b5012d80d..58b7f807a9cca 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -31,10 +31,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-kompute.h"
-#endif
-
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
 static gpt_params               * g_params;

From 4b0c96a9e2617a798ac15968a830b43aa7b3a4f4 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 17:16:25 -0500
Subject: [PATCH 130/140] kompute : adapt ggml-kompute API to be compatible
 with C

---
 ggml-kompute.cpp | 56 +++++++++++++++++++++++++++++-------------------
 ggml-kompute.h   | 43 +++++++++++++++++--------------------
 tests/test-c.c   |  4 ++++
 3 files changed, 58 insertions(+), 45 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index c9abd6c688ba6..df01d6196cbcc 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -36,6 +36,8 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cstdint>
+#include <cstdio>
 #include <cstring>
 #include <iostream>
 #include <memory>
@@ -139,7 +141,7 @@ static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physicalDevic
     return true;
 }
 
-static std::string ggml_vk_getVendorName(uint32_t vendorID) {
+static const char * ggml_vk_getVendorName(uint32_t vendorID) {
     switch (vendorID) {
         case 0x10DE:
             return "nvidia";
@@ -152,7 +154,7 @@ static std::string ggml_vk_getVendorName(uint32_t vendorID) {
     }
 }
 
-std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
+static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t memoryRequired) {
     std::vector<ggml_vk_device> results;
     if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
         return results;
@@ -206,13 +208,16 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
         d.index = i;
         d.type = properties.deviceType;
         d.heapSize = heapSize;
-        d.name = properties.deviceName;
+        d.vendor = strdup(ggml_vk_getVendorName(properties.vendorID));
         d.subgroupSize = subgroupProperties.subgroupSize;
-        size_t n_idx = ++count_by_name[d.name];
+
+        std::string name(properties.deviceName);
+        size_t n_idx = ++count_by_name[name];
         if (n_idx > 1) {
-            d.name += " (" + std::to_string(n_idx) + ")";
+            name += " (" + std::to_string(n_idx) + ")";
         }
-        d.vendor = ggml_vk_getVendorName(properties.vendorID);
+        d.name = strdup(name.c_str());
+
         results.push_back(d);
     }
 
@@ -232,6 +237,20 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
     return results;
 }
 
+// public API returns a C-style array
+ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count) {
+    auto devices = ggml_vk_available_devices_internal(memoryRequired);
+    *count = devices.size();
+    if (devices.empty()) {
+        return nullptr;
+    }
+
+    size_t nbytes = sizeof (ggml_vk_device) * (devices.size());
+    auto * arr = static_cast<ggml_vk_device *>(malloc(nbytes));
+    memcpy(&arr, devices.data(), nbytes);
+    return arr;
+}
+
 static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
     devices.erase(
         std::remove_if(devices.begin(), devices.end(),
@@ -252,32 +271,25 @@ static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std
     );
 }
 
-bool ggml_vk_init_device(size_t memoryRequired, const std::string &device) {
+static bool ggml_vk_init_device(size_t memoryRequired, const std::string & device) {
     if (device.empty())
         return false;
 
-    std::vector<ggml_vk_device> devices = ggml_vk_available_devices(memoryRequired);
-    if (device == "gpu") {
-        if (devices.size() != 0)
-            return ggml_vk_init_device(devices.front());
-    } else if (device == "amd" || device == "nvidia" || device == "intel") {
+    auto devices = ggml_vk_available_devices_internal(memoryRequired);
+    if (device == "amd" || device == "nvidia" || device == "intel") {
         ggml_vk_filterByVendor(devices, device);
-        if (devices.size() != 0)
-            return ggml_vk_init_device(devices.front());
-    } else {
+    } else if (device != "gpu") {
         ggml_vk_filterByName(devices, device);
-        if (devices.size() != 0)
-            return ggml_vk_init_device(devices.front());
     }
 
-    return ggml_vk_has_device();
+    return !devices.empty() && ggml_vk_init_device_idx(devices[0].index);
 }
 
-bool ggml_vk_init_device(const ggml_vk_device &device) {
-    return ggml_vk_init_device(device.index);
+bool ggml_vk_init_device(size_t memoryRequired, const char * device) {
+    return ggml_vk_init_device(memoryRequired, std::string(device));
 }
 
-bool ggml_vk_init_device(int device) {
+bool ggml_vk_init_device_idx(int device) {
     komputeManager()->initializeDevice(device, {},
                          {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
                           "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
@@ -311,7 +323,7 @@ ggml_vk_device ggml_vk_current_device() {
     if (!komputeManager()->hasDevice())
         return ggml_vk_device();
 
-    std::vector<ggml_vk_device> devices = ggml_vk_available_devices(0);
+    auto devices = ggml_vk_available_devices_internal(0);
     ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName);
     return devices.front();
 }
diff --git a/ggml-kompute.h b/ggml-kompute.h
index d4a86d31df22d..d4aeb77314633 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -3,38 +3,35 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 
-#include <cstddef>
-#include <string>
-#include <vector>
+#include <stdbool.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 struct ggml_vk_device {
-    int index = 0;
-    int type = 0;           // same as VkPhysicalDeviceType
-    size_t heapSize = 0;
-    std::string name;
-    std::string vendor;
-    int subgroupSize = 0;
+    int index;
+    int type; // same as VkPhysicalDeviceType
+    size_t heapSize;
+    const char * name;
+    const char * vendor;
+    int subgroupSize;
 };
 
-std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
-bool ggml_vk_init_device(size_t memoryRequired, const std::string &device);
-bool ggml_vk_init_device(const ggml_vk_device &device);
-bool ggml_vk_init_device(int device);
-bool ggml_vk_free_device();
-bool ggml_vk_has_vulkan();
-bool ggml_vk_has_device();
-bool ggml_vk_using_vulkan();
-ggml_vk_device ggml_vk_current_device();
+struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
+bool ggml_vk_init_device(size_t memoryRequired, const char * device);
+bool ggml_vk_init_device_idx(int device);
+bool ggml_vk_free_device(void);
+bool ggml_vk_has_vulkan(void);
+bool ggml_vk_has_device(void);
+bool ggml_vk_using_vulkan(void);
+struct ggml_vk_device ggml_vk_current_device(void);
 
 //
 // backend API
-// user-code should use only these functions
 //
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;
 
diff --git a/tests/test-c.c b/tests/test-c.c
index a05071080a1df..95ba73df39a3c 100644
--- a/tests/test-c.c
+++ b/tests/test-c.c
@@ -1,3 +1,7 @@
 #include "llama.h"
 
+#ifdef GGML_USE_KOMPUTE
+#include "ggml-kompute.h"
+#endif
+
 int main(void) {}

From e6edd44d5e3275c1e8acae159044ff13bed35b24 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 26 Jan 2024 19:36:49 -0500
Subject: [PATCH 131/140] ci : attempt to fix Vulkan installer path

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 30abc7307479a..1ae4e5a95de80 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -368,7 +368,7 @@ jobs:
         if: ${{ matrix.build == 'kompute' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
-          $env:RUNNER_TEMP/VulkanSDK-Installer.exe --accept-licenses --default-answer --confirm-command install
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
 

From 050d45029757a09dc20382d13adfa0b64f47031b Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Sat, 27 Jan 2024 11:06:50 -0500
Subject: [PATCH 132/140] ci : do not run tests for Kompute (no GPU)

---
 .github/workflows/build.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1ae4e5a95de80..3b019b7604596 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -409,7 +409,8 @@ jobs:
 
       - name: Test
         id: cmake_test
-        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
+        # not all machines have native AVX-512
+        if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900

From 530462550d611df9defd83f9c73d031a077e58c6 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Sat, 27 Jan 2024 11:55:32 -0500
Subject: [PATCH 133/140] kompute : use llama_backend_init/llama_backend_free
 to manage device

---
 ggml-kompute.cpp |  9 ---------
 llama.cpp        | 11 +++++++++++
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index df01d6196cbcc..f6bba6838ef79 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1863,9 +1863,6 @@ static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
 static void ggml_backend_kompute_free(ggml_backend_t backend) {
     struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context;
     ggml_vk_free(ctx);
-    // TODO(cebtenzzre): This should only be done if the device was initialized by us, but
-    //                   that would require a change to GPT4All.
-    ggml_vk_free_device();
     delete backend;
 }
 
@@ -1901,12 +1898,6 @@ static struct ggml_backend_i kompute_backend_i = {
 };
 
 ggml_backend_t ggml_backend_kompute_init() {
-#if defined(GGML_USE_KOMPUTE)
-    if (!ggml_vk_has_device()) {
-        ggml_vk_init_device(0, "gpu");
-    }
-#endif
-
     if (!ggml_vk_has_device()) {
         fprintf(stderr, "%s: error: device was not initialized\n", __func__);
         return nullptr;
diff --git a/llama.cpp b/llama.cpp
index 0da73628b168c..b97d4d96051c6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9860,6 +9860,13 @@ void llama_backend_init(bool numa) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_init();
 #endif
+
+#ifdef GGML_USE_KOMPUTE
+    if (!ggml_vk_has_device()) {
+        ggml_vk_init_device(0, "gpu");
+    }
+#endif
+
 }
 
 void llama_backend_free(void) {
@@ -9867,6 +9874,10 @@ void llama_backend_free(void) {
     ggml_mpi_backend_free();
 #endif
     ggml_quantize_free();
+
+#ifdef GGML_USE_KOMPUTE
+    ggml_vk_free_device();
+#endif
 }
 
 int64_t llama_time_us(void) {

From be7c0559d32387c26446c0e8fe844a073bf8f202 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 29 Jan 2024 12:07:35 -0500
Subject: [PATCH 134/140] kompute : better device management

---
 ggml-backend.c   |   5 +-
 ggml-kompute.cpp | 234 +++++++++++++++++++++++++++--------------------
 ggml-kompute.h   |  11 +--
 llama.cpp        |  20 ++--
 4 files changed, 149 insertions(+), 121 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index ed4260634740d..532d5bd28f953 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -346,9 +346,8 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 #endif
 
 #ifdef GGML_USE_KOMPUTE
-    extern ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data);
-    extern ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
-    ggml_backend_register("Kompute", ggml_backend_reg_kompute_init, ggml_backend_kompute_buffer_type(), NULL);
+    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+    ggml_backend_kompute_reg_devices();
 #endif
 }
 
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index f6bba6838ef79..270b91010ed89 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -60,8 +60,18 @@
 #define QK_NL 16
 
 typedef ggml_fp16_t half;
+
+static std::string ggml_kompute_format_name(int device) {
+    return "Kompute" + std::to_string(device);
+}
+
 struct ggml_kompute_context {
+    int device;
+    std::string name;
     std::shared_ptr<vk::DescriptorPool> pool;
+
+    ggml_kompute_context(int device)
+        : device(device), name(ggml_kompute_format_name(device)) {}
 };
 
 // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
@@ -210,6 +220,7 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
         d.heapSize = heapSize;
         d.vendor = strdup(ggml_vk_getVendorName(properties.vendorID));
         d.subgroupSize = subgroupProperties.subgroupSize;
+        d.bufferAlignment = properties.limits.minStorageBufferOffsetAlignment;
 
         std::string name(properties.deviceName);
         size_t n_idx = ++count_by_name[name];
@@ -271,40 +282,26 @@ static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std
     );
 }
 
-static bool ggml_vk_init_device(size_t memoryRequired, const std::string & device) {
-    if (device.empty())
+static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
+    if (name.empty())
         return false;
 
     auto devices = ggml_vk_available_devices_internal(memoryRequired);
-    if (device == "amd" || device == "nvidia" || device == "intel") {
-        ggml_vk_filterByVendor(devices, device);
-    } else if (device != "gpu") {
-        ggml_vk_filterByName(devices, device);
+    if (name == "amd" || name == "nvidia" || name == "intel") {
+        ggml_vk_filterByVendor(devices, name);
+    } else if (name != "gpu") {
+        ggml_vk_filterByName(devices, name);
     }
 
-    return !devices.empty() && ggml_vk_init_device_idx(devices[0].index);
-}
+    if (devices.empty())
+        return false;
 
-bool ggml_vk_init_device(size_t memoryRequired, const char * device) {
-    return ggml_vk_init_device(memoryRequired, std::string(device));
+    *device = devices.front();
+    return true;
 }
 
-bool ggml_vk_init_device_idx(int device) {
-    komputeManager()->initializeDevice(device, {},
-                         {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
-                          "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
-    return ggml_vk_has_device();
-}
-
-bool ggml_vk_free_device() {
-    if (!ggml_vk_has_device())
-        return false;
-    komputeManager.destroy();
-    // FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact
-    // the llama_free(ctx) destroys this memory and we just set the singleton to nullptr here which
-    // is very brittle
-    s_kompute_context = nullptr;
-    return true;
+bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
+    return ggml_vk_get_device(device, memoryRequired, std::string(name));
 }
 
 bool ggml_vk_has_vulkan() {
@@ -315,10 +312,6 @@ bool ggml_vk_has_device() {
     return komputeManager()->hasDevice();
 }
 
-bool ggml_vk_using_vulkan() {
-    return s_kompute_context != nullptr;
-}
-
 ggml_vk_device ggml_vk_current_device() {
     if (!komputeManager()->hasDevice())
         return ggml_vk_device();
@@ -328,20 +321,6 @@ ggml_vk_device ggml_vk_current_device() {
     return devices.front();
 }
 
-static ggml_kompute_context * ggml_vk_init() {
-    GGML_ASSERT(s_kompute_context == nullptr);
-    s_kompute_context = new ggml_kompute_context;
-    return s_kompute_context;
-}
-
-static void ggml_vk_free(struct ggml_kompute_context * ctx) {
-    assert(ctx == s_kompute_context);
-    s_kompute_context = nullptr;
-    if (ctx != nullptr) {
-        delete ctx;
-    }
-}
-
 static
 void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
     std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
@@ -503,20 +482,22 @@ static void ggml_vk_free_memory(ggml_vk_memory &memory)
     }
 }
 
+static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);
+
 static
 ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
     ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
 
     // compatibility with ggml-backend
-    GGML_ASSERT(buffer && buffer->buft == ggml_backend_kompute_buffer_type());
+    GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);
 
-    ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context;
+    ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);
 
-    const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
+    const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);
 
-    GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)buffer->size);
+    GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));
 
-    offset = (uint64_t)ioffs;
+    offset = uint64_t(ioffs);
     return buf_ctx;
 }
 
@@ -1746,9 +1727,47 @@ kp::TensorT<uint8_t>::dataType()
 
 // backend interface
 
+struct ggml_backend_kompute_buffer_type_context {
+    int         device;
+    int         device_ref = 0;
+    uint64_t    buffer_alignment;
+    std::string name;
+
+    ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment)
+        : device(device), buffer_alignment(buffer_alignment), name(ggml_kompute_format_name(device)) {}
+};
+
+static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+
+    if (!ctx->device_ref) {
+        komputeManager()->initializeDevice(
+            ctx->device, {}, {
+                "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
+                "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
+            }
+        );
+    }
+
+    assert(ggml_vk_has_device());
+    ctx->device_ref++;
+}
+
+static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+
+    assert(ctx->device_ref > 0);
+
+    ctx->device_ref--;
+
+    if (!ctx->device_ref) {
+        komputeManager.destroy();
+    }
+}
+
 static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
-    GGML_UNUSED(buffer);
-    return "Kompute";
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
+    return ctx->name.c_str();
 }
 
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -1808,28 +1827,19 @@ static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
 // default buffer type
 
 static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-    return "Kompute";
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+    return ctx->name.c_str();
 }
 
 static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_kompute_device_ref(buft);
     auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
     return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
 }
 
 static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    static size_t minStorageBufferOffsetAlignment = 0;
-    if (minStorageBufferOffsetAlignment == 0) {
-        GGML_ASSERT(ggml_vk_has_device());
-        vk::PhysicalDeviceProperties deviceProperties;
-        deviceProperties = komputeManager()->physicalDevice()->getProperties();
-        vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits;
-        minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment;
-    }
-
-    return minStorageBufferOffsetAlignment;
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+    return ctx->buffer_alignment;
 }
 
 static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
@@ -1837,42 +1847,62 @@ static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffe
     return ggml_backend_is_kompute(backend);
 }
 
-ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
-            /* .is_host          = */ NULL,
-        },
-        /* .context = */ NULL,
-    };
+static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
+    /* .is_host          = */ NULL,
+};
+
+ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
+    static std::vector<ggml_backend_buffer_type> bufts = []() {
+        std::vector<ggml_backend_buffer_type> vec;
+        auto devices = ggml_vk_available_devices_internal(0);
+        vec.reserve(devices.size());
+
+        for (const auto & dev : devices) {
+            vec.push_back({
+                /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
+                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment)
+            });
+        }
+        return vec;
+    }();
 
-    return &ggml_backend_buffer_type_kompute;
+    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) { 
+        return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
+    });
+    return it < bufts.end() ? &*it : nullptr;
 }
 
 // backend
 
 static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-    return "Kompute";
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+    return ctx->name.c_str();
 }
 
 static void ggml_backend_kompute_free(ggml_backend_t backend) {
-    struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context;
-    ggml_vk_free(ctx);
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+
+    assert(ctx == s_kompute_context);
+    s_kompute_context = nullptr;
+    if (ctx != nullptr) {
+        delete ctx;
+    }
+
     delete backend;
 }
 
 static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-    return ggml_backend_kompute_buffer_type();
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+    return ggml_backend_kompute_buffer_type(ctx->device);
 }
 
 static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    auto * ctx = (ggml_kompute_context *)backend->context;
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
     return true;
 }
@@ -1897,17 +1927,13 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
 };
 
-ggml_backend_t ggml_backend_kompute_init() {
-    if (!ggml_vk_has_device()) {
-        fprintf(stderr, "%s: error: device was not initialized\n", __func__);
-        return nullptr;
-    }
-
-    struct ggml_kompute_context * ctx = ggml_vk_init();
+ggml_backend_t ggml_backend_kompute_init(int device) {
+    GGML_ASSERT(s_kompute_context == nullptr);
+    s_kompute_context = new ggml_kompute_context(device);
 
     ggml_backend_t kompute_backend = new ggml_backend {
         /* .interface = */ kompute_backend_i,
-        /* .context   = */ ctx,
+        /* .context   = */ s_kompute_context,
     };
 
     return kompute_backend;
@@ -1917,10 +1943,22 @@ bool ggml_backend_is_kompute(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_kompute_name;
 }
 
-extern "C" ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data);
-
-ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
+static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
     GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
-    return ggml_backend_kompute_init();
+    return ggml_backend_kompute_init(intptr_t(user_data));
+}
+
+extern "C" int ggml_backend_kompute_reg_devices();
+
+int ggml_backend_kompute_reg_devices() {
+    auto devices = ggml_vk_available_devices_internal(0);
+    for (const auto & device : devices) {
+        ggml_backend_register(
+            ggml_kompute_format_name(device.index).c_str(),
+            ggml_backend_reg_kompute_init,
+            ggml_backend_kompute_buffer_type(device.index),
+            reinterpret_cast<void *>(intptr_t(device.index))
+        );
+    }
+    return devices.size();
 }
diff --git a/ggml-kompute.h b/ggml-kompute.h
index d4aeb77314633..c56e42f8e0190 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -5,6 +5,7 @@
 
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -17,15 +18,13 @@ struct ggml_vk_device {
     const char * name;
     const char * vendor;
     int subgroupSize;
+    uint64_t bufferAlignment;
 };
 
 struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
-bool ggml_vk_init_device(size_t memoryRequired, const char * device);
-bool ggml_vk_init_device_idx(int device);
-bool ggml_vk_free_device(void);
+bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
 bool ggml_vk_has_vulkan(void);
 bool ggml_vk_has_device(void);
-bool ggml_vk_using_vulkan(void);
 struct ggml_vk_device ggml_vk_current_device(void);
 
 //
@@ -35,11 +34,11 @@ struct ggml_vk_device ggml_vk_current_device(void);
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;
 
-GGML_API ggml_backend_t ggml_backend_kompute_init(void);
+GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
 
 GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
 
 #ifdef __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index b97d4d96051c6..9605b50077f20 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1280,7 +1280,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type();
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
 #endif
 
     if (buft == nullptr) {
@@ -9860,13 +9863,6 @@ void llama_backend_init(bool numa) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_init();
 #endif
-
-#ifdef GGML_USE_KOMPUTE
-    if (!ggml_vk_has_device()) {
-        ggml_vk_init_device(0, "gpu");
-    }
-#endif
-
 }
 
 void llama_backend_free(void) {
@@ -9874,10 +9870,6 @@ void llama_backend_free(void) {
     ggml_mpi_backend_free();
 #endif
     ggml_quantize_free();
-
-#ifdef GGML_USE_KOMPUTE
-    ggml_vk_free_device();
-#endif
 }
 
 int64_t llama_time_us(void) {
@@ -10034,8 +10026,8 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 #elif defined(GGML_USE_KOMPUTE)
-        if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init();
+        if (model->n_gpu_layers > 0) {
+            auto * backend = ggml_backend_kompute_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);

From dc08e512cc6c569c9371a83b40fe8d66cac47bd3 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 29 Jan 2024 12:41:02 -0500
Subject: [PATCH 135/140] kompute : fix merge issues

---
 ggml-kompute.cpp | 85 +++++++++++++++++++++++++++++++-----------------
 ggml-kompute.h   |  1 +
 2 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 270b91010ed89..fa7baf7e8b598 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -118,9 +118,9 @@ static void enable_sam() {
 }
 #endif
 
-static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physicalDevice) {
+static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physical_device) {
     vk::PhysicalDeviceFeatures availableFeatures;
-    physicalDevice.getFeatures(&availableFeatures);
+    physical_device.getFeatures(&availableFeatures);
 
     if (!availableFeatures.shaderInt16)
         return false;
@@ -134,7 +134,7 @@ static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physicalDevic
     vk::PhysicalDeviceFeatures2 features2;
     features2.pNext = &availableFeatures11;
 
-    physicalDevice.getFeatures2(&features2);
+    physical_device.getFeatures2(&features2);
 
     if (!availableFeatures11.uniformAndStorageBuffer16BitAccess ||
         !availableFeatures11.storageBuffer16BitAccess) {
@@ -169,29 +169,31 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
     if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
         return results;
 
-    std::vector<vk::PhysicalDevice> physicalDevices;
+    std::vector<vk::PhysicalDevice> physical_devices;
     try {
-        physicalDevices = komputeManager()->listDevices();
+        physical_devices = komputeManager()->listDevices();
     } catch (vk::SystemError & err) {
         std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n";
         return results;
     }
 
-    uint32_t deviceCount = physicalDevices.size();
+    uint32_t deviceCount = physical_devices.size();
     if (deviceCount == 0)
         return results;
 
     std::unordered_map<std::string, size_t> count_by_name;
 
     for (uint32_t i = 0; i < deviceCount; i++) {
-        VkPhysicalDeviceProperties properties = physicalDevices.at(i).getProperties();
-        VkPhysicalDeviceMemoryProperties memoryProperties = physicalDevices.at(i).getMemoryProperties();
-        const uint32_t major = VK_VERSION_MAJOR(properties.apiVersion);
-        const uint32_t minor = VK_VERSION_MINOR(properties.apiVersion);
+        const auto & physical_device = physical_devices[i];
+
+        VkPhysicalDeviceProperties dev_props = physical_device.getProperties();
+        VkPhysicalDeviceMemoryProperties memoryProperties = physical_device.getMemoryProperties();
+        const uint32_t major = VK_VERSION_MAJOR(dev_props.apiVersion);
+        const uint32_t minor = VK_VERSION_MINOR(dev_props.apiVersion);
         if (major < 1 || minor < 2)
             continue;
 
-        if (!ggml_vk_checkPhysicalDeviceFeatures(physicalDevices.at(i)))
+        if (!ggml_vk_checkPhysicalDeviceFeatures(physical_device))
             continue;
 
         size_t heapSize = 0;
@@ -206,23 +208,45 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
         if (heapSize < memoryRequired)
             continue;
 
-        vk::PhysicalDeviceSubgroupProperties subgroupProperties;
-        vk::PhysicalDeviceProperties2 deviceProperties2;
-        deviceProperties2.pNext = &subgroupProperties;
-        physicalDevices.at(i).getProperties2(&deviceProperties2);
+        auto ext_props = physical_device.enumerateDeviceExtensionProperties();
+        bool has_maintenance4 = false;
+
+        // Check if maintenance4 is supported
+        for (const auto & properties : ext_props) {
+            if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
+                has_maintenance4 = true;
+            }
+        }
+
+        vk::PhysicalDeviceSubgroupProperties subgroup_props;
+        vk::PhysicalDeviceProperties2 dev_props2;
+        vk::PhysicalDeviceMaintenance3Properties dev_props3;
+        vk::PhysicalDeviceMaintenance4Properties dev_props4;
+        dev_props2.pNext = &dev_props3;
+        dev_props3.pNext = &subgroup_props;
+        if (has_maintenance4) {
+            subgroup_props.pNext = &dev_props4;
+        }
+        physical_device.getProperties2(&dev_props2);
 
-        if (subgroupProperties.subgroupSize < 32)
+        if (subgroup_props.subgroupSize < 32)
             continue;
 
         ggml_vk_device d;
         d.index = i;
-        d.type = properties.deviceType;
+        d.type = dev_props.deviceType;
         d.heapSize = heapSize;
-        d.vendor = strdup(ggml_vk_getVendorName(properties.vendorID));
-        d.subgroupSize = subgroupProperties.subgroupSize;
-        d.bufferAlignment = properties.limits.minStorageBufferOffsetAlignment;
+        d.vendor = strdup(ggml_vk_getVendorName(dev_props.vendorID));
+        d.subgroupSize = subgroup_props.subgroupSize;
+        d.bufferAlignment = dev_props.limits.minStorageBufferOffsetAlignment;
+
+        if (has_maintenance4) {
+            d.maxAlloc = std::min(dev_props3.maxMemoryAllocationSize, dev_props4.maxBufferSize);
+        } else {
+            d.maxAlloc = dev_props3.maxMemoryAllocationSize;
+        }
 
-        std::string name(properties.deviceName);
+        std::string name(dev_props.deviceName);
         size_t n_idx = ++count_by_name[name];
         if (n_idx > 1) {
             name += " (" + std::to_string(n_idx) + ")";
@@ -413,12 +437,6 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v
 
 static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) {
     size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer);
-    if (minStorageBufferOffsetAlignment == 0) {
-        vk::PhysicalDeviceProperties deviceProperties;
-        deviceProperties = komputeManager()->physicalDevice()->getProperties();
-        vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits;
-        minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment;
-    }
 
     // If offset is already aligned, return it directly
     if (offset % minStorageBufferOffsetAlignment == 0) {
@@ -1731,10 +1749,11 @@ struct ggml_backend_kompute_buffer_type_context {
     int         device;
     int         device_ref = 0;
     uint64_t    buffer_alignment;
+    uint64_t    max_alloc;
     std::string name;
 
-    ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment)
-        : device(device), buffer_alignment(buffer_alignment), name(ggml_kompute_format_name(device)) {}
+    ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc)
+        : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {}
 };
 
 static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
@@ -1842,6 +1861,11 @@ static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer
     return ctx->buffer_alignment;
 }
 
+static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+    return ctx->max_alloc;
+}
+
 static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     GGML_UNUSED(buft);
     return ggml_backend_is_kompute(backend);
@@ -1851,6 +1875,7 @@ static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
     /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
     /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -1865,7 +1890,7 @@ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
         for (const auto & dev : devices) {
             vec.push_back({
                 /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
-                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment)
+                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
             });
         }
         return vec;
diff --git a/ggml-kompute.h b/ggml-kompute.h
index c56e42f8e0190..171465456a5b1 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -19,6 +19,7 @@ struct ggml_vk_device {
     const char * vendor;
     int subgroupSize;
     uint64_t bufferAlignment;
+    uint64_t maxAlloc;
 };
 
 struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);

From 7e11fe088077501136d9861954407ef2a32aea54 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 29 Jan 2024 12:52:54 -0500
Subject: [PATCH 136/140] kompute : remove llama_load_model_from_file_internal

---
 llama.cpp | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index cd0122f8ecaff..9631506c6fd4e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4146,6 +4146,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
             )
         )) {
             // disable Vulkan due to unsupported model architecture or quantization type
+            // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
             params.n_gpu_layers = 0;
         }
 #endif
@@ -10118,11 +10119,9 @@ int64_t llama_time_us(void) {
     return ggml_time_us();
 }
 
-static struct llama_model * llama_load_model_from_file_internal(
-    const char * path_model, struct llama_model_params * params_p
-) {
-    auto & params = *params_p;
-
+struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+              struct llama_model_params   params) {
     ggml_time_init();
 
     llama_model * model = new llama_model;
@@ -10159,10 +10158,6 @@ static struct llama_model * llama_load_model_from_file_internal(
     return model;
 }
 
-struct llama_model * llama_load_model_from_file(const char * path_model, struct llama_model_params params) {
-    return llama_load_model_from_file_internal(path_model, &params);
-}
-
 void llama_free_model(struct llama_model * model) {
     delete model;
 }

From b932cd742853eeb6a8788a80188f19f0acebaa1c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 30 Nov 2023 16:50:20 -0500
Subject: [PATCH 137/140] vulkan : correctly fix use-after-free in
 ggml_vk_current_device

The previous attempt actually broke GPU inference with the 'main'
example, which was previously working.

deviceName is a vk::ArrayWrapper1D. Be careful when we convert it to a
std::string, so we don't get null bytes at the end.
---
 ggml-kompute.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index fa7baf7e8b598..1704bc9da0f50 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -340,8 +340,9 @@ ggml_vk_device ggml_vk_current_device() {
     if (!komputeManager()->hasDevice())
         return ggml_vk_device();
 
-    auto devices = ggml_vk_available_devices_internal(0);
-    ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName);
+    auto devices = ggml_vk_available_devices(0);
+    ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
+    GGML_ASSERT(!devices.empty());
     return devices.front();
 }
 

From 48db724bc7ed582e23f0657ecb784502a16432bc Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 29 Jan 2024 14:15:18 -0500
Subject: [PATCH 138/140] minor fixup

---
 ggml-kompute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 1704bc9da0f50..bfc9d96f6c976 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -340,7 +340,7 @@ ggml_vk_device ggml_vk_current_device() {
     if (!komputeManager()->hasDevice())
         return ggml_vk_device();
 
-    auto devices = ggml_vk_available_devices(0);
+    auto devices = ggml_vk_available_devices_internal(0);
     ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
     GGML_ASSERT(!devices.empty());
     return devices.front();

From 1f98dff7a9960fb66b7dbdbce8ff96bd84a7c636 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 29 Jan 2024 14:16:56 -0500
Subject: [PATCH 139/140] fix trailing whitespace

---
 ggml-kompute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index bfc9d96f6c976..0d99563778d87 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1897,7 +1897,7 @@ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
         return vec;
     }();
 
-    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) { 
+    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
         return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
     });
     return it < bufts.end() ? &*it : nullptr;

From 299821140afdac8f0e4bf4a5ad5d893d2a5a598f Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 29 Jan 2024 15:20:45 -0500
Subject: [PATCH 140/140] fix incorrect memcpy

---
 ggml-kompute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 0d99563778d87..51c5af8ec00a2 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -282,7 +282,7 @@ ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count
 
     size_t nbytes = sizeof (ggml_vk_device) * (devices.size());
     auto * arr = static_cast<ggml_vk_device *>(malloc(nbytes));
-    memcpy(&arr, devices.data(), nbytes);
+    memcpy(arr, devices.data(), nbytes);
     return arr;
 }