From 8ce1001841aa977e68e6485e218389c2b5d3a5cd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 3 Aug 2022 15:04:02 -0700 Subject: [PATCH 001/166] Import Vulkan runtime changes from personal branch --- src/CMakeLists.txt | 16 + src/CodeGen_Vulkan_Dev.cpp | 1325 +++++++ src/CodeGen_Vulkan_Dev.h | 23 + src/runtime/CMakeLists.txt | 9 +- src/runtime/HalideRuntimeVulkan.h | 101 + src/runtime/mini_vulkan.h | 6038 +++++++++++++++++++++++++++++ src/runtime/vulkan.cpp | 847 ++++ src/runtime/vulkan_context.h | 329 ++ src/runtime/vulkan_extensions.h | 296 ++ src/runtime/vulkan_functions.h | 41 + src/runtime/vulkan_interface.h | 67 + src/runtime/vulkan_internal.h | 260 ++ src/runtime/vulkan_memory.h | 722 ++++ src/runtime/vulkan_resources.h | 761 ++++ 14 files changed, 10834 insertions(+), 1 deletion(-) create mode 100644 src/CodeGen_Vulkan_Dev.cpp create mode 100644 src/CodeGen_Vulkan_Dev.h create mode 100644 src/runtime/HalideRuntimeVulkan.h create mode 100644 src/runtime/mini_vulkan.h create mode 100644 src/runtime/vulkan.cpp create mode 100644 src/runtime/vulkan_context.h create mode 100644 src/runtime/vulkan_extensions.h create mode 100644 src/runtime/vulkan_functions.h create mode 100644 src/runtime/vulkan_interface.h create mode 100644 src/runtime/vulkan_internal.h create mode 100644 src/runtime/vulkan_memory.h create mode 100644 src/runtime/vulkan_resources.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c31e37c32a20..1281038f48f2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,3 +1,4 @@ + ## # Lists of source files. Keep ALL lists sorted in alphabetical order. ## @@ -39,6 +40,7 @@ set(HEADER_FILES CodeGen_PTX_Dev.h CodeGen_PyTorch.h CodeGen_Targets.h + CodeGen_Vulkan.h CompilerLogger.h ConciseCasts.h CPlusPlusMangle.h @@ -206,6 +208,7 @@ set(SOURCE_FILES CodeGen_PTX_Dev.cpp CodeGen_PyTorch.cpp CodeGen_RISCV.cpp + CodeGen_Vulkan_Dev.cpp CodeGen_WebAssembly.cpp CodeGen_X86.cpp CompilerLogger.cpp @@ -399,6 +402,13 @@ set_target_properties(Halide PROPERTIES VERSION ${Halide_VERSION} SOVERSION ${Halide_SOVERSION_OVERRIDE}) +target_compile_definitions(Halide PUBLIC + HALIDE_VERSION=${Halide_VERSION} + HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} + HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} + HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH}) + + target_include_directories(Halide INTERFACE "$") add_dependencies(Halide HalideIncludes) @@ -526,6 +536,12 @@ if (TARGET_OPENGLCOMPUTE) target_compile_definitions(Halide PRIVATE WITH_OPENGLCOMPUTE) endif () +cmake_dependent_option(TARGET_VULKAN "Include Vulkan target" ON TARGET_SPIRV OFF) +if (TARGET_VULKAN) + message(STATUS "Enabling Vulkan target") + target_compile_definitions(Halide PRIVATE WITH_VULKAN) +endif() + if (TARGET_SPIRV) target_compile_definitions(Halide PRIVATE WITH_SPIRV) target_include_directories(Halide SYSTEM PRIVATE "${SPIRV_INCLUDE_DIR}") diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp new file mode 100644 index 000000000000..2b568ae7aa93 --- /dev/null +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -0,0 +1,1325 @@ +#include +#include + +#include "CodeGen_Vulkan_Dev.h" +#include "CodeGen_Internal.h" +#include "CodeGen_GPU_Dev.h" +#include "Deinterleave.h" +#include "Debug.h" +#include "IROperator.h" +#include "IRPrinter.h" +#include "Scope.h" +#include "Target.h" + +#include "spirv/spirv.h" + +// Temporary: +#include + +namespace Halide { +namespace Internal { + +class CodeGen_LLVM; + +namespace { // anonymous + +template +ValueT lower_int_uint_div(CodeGenT *cg, Expr a, Expr b); + +template 
+ValueT lower_int_uint_mod(CodeGenT *cg, Expr a, Expr b); + +class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { +public: + CodeGen_Vulkan_Dev(Target target); + + /** Compile a GPU kernel into the module. This may be called many times + * with different kernels, which will all be accumulated into a single + * source module shared by a given Halide pipeline. */ + void add_kernel(Stmt stmt, + const std::string &name, + const std::vector &args) override; + + /** (Re)initialize the GPU kernel module. This is separate from compile, + * since a GPU device module will often have many kernels compiled into it + * for a single pipeline. */ + void init_module() override; + + std::vector compile_to_src() override; + + std::string get_current_kernel_name() override; + + void dump() override; + + std::string print_gpu_name(const std::string &name) override; + + std::string api_unique_name() override { return "vulkan"; } + +protected: + + class SPIRVEmitter : public IRVisitor { + + public: + SPIRVEmitter() { } + + using IRVisitor::visit; + + void visit(const Variable *) override; + void visit(const IntImm *) override; + void visit(const UIntImm *) override; + void visit(const StringImm *) override; + void visit(const FloatImm *) override; + void visit(const Cast *) override; + void visit(const Add *) override; + void visit(const Sub *) override; + void visit(const Mul *) override; + void visit(const Div *) override; + void visit(const Mod *) override; + void visit(const Max *) override; + void visit(const Min *) override; + void visit(const EQ *) override; + void visit(const NE *) override; + void visit(const LT *) override; + void visit(const LE *) override; + void visit(const GT *) override; + void visit(const GE *) override; + void visit(const And *) override; + void visit(const Or *) override; + void visit(const Not *) override; + void visit(const Call *) override; + void visit(const Select *) override; + void visit(const Load *) override; + void visit(const Store *) override; + void visit(const Let *) override; + void visit(const LetStmt *) override; + void visit(const AssertStmt *) override; + void visit(const ProducerConsumer *) override; + void visit(const For *) override; + void visit(const Ramp *) override; + void visit(const Broadcast *) override; + void visit(const Provide *) override; + void visit(const Allocate *) override; + void visit(const Free *) override; + void visit(const Realize *) override; + void visit(const IfThenElse *) override; + void visit(const Evaluate *) override; + void visit(const Shuffle *) override; + void visit(const Prefetch *) override; + void visit(const Fork *) override; + void visit(const Acquire *) override; + + void visit_binop(Type t, Expr a, Expr b, uint32_t opcode); + + // ID of last generated Expr. + uint32_t id; + // IDs are allocated in numerical order of use. + uint32_t next_id{0}; + + // The void type does not map to a Halide type, but must be unique + uint32_t void_id; + + // SPIR-V instructions in a module must be in a specific + // order. This order doesn't correspond to the order in which they + // are created. Hence we generate into a set of blocks, each of + // which is added to at its end. In compile_to_src, these are + // concatenated to form a complete SPIR-V module. We also + // represent the temporaries as vectors of uint32_t rather than + // char for ease of adding words to them. 
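+    // The blocks below follow the SPIR-V logical layout: the header
+    // (capabilities, extensions, memory model), entry points, execution
+    // modes, annotations/decorations, types and constants, and finally the
+    // kernel function bodies. Each instruction is packed into 32-bit words,
+    // with the first word holding the word count in its upper 16 bits and
+    // the opcode in its lower 16 bits, e.g. a 32-bit float type is encoded
+    // as { (3u << 16) | SpvOpTypeFloat, result_id, 32 } (see add_instruction
+    // below).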
+ std::vector spir_v_header; + std::vector spir_v_entrypoints; + std::vector spir_v_execution_modes; + std::vector spir_v_annotations; + std::vector spir_v_types; + std::vector spir_v_kernels; + // The next one is cleared in between kernels, and tracks the allocations + std::vector spir_v_kernel_allocations; + + // Id of entry point for kernel currently being compiled. + uint32_t current_function_id; + + // Top-level function for adding kernels + void add_kernel(Stmt s, const std::string &name, const std::vector &args); + + // Function for allocating variables in function scope, with optional initializer. + // These will appear at the beginning of the function, as required by SPIR-V + void add_allocation(uint32_t result_type_id, uint32_t result_id, uint32_t storage_class, uint32_t initializer=0); + + std::map type_map; + std::map, uint32_t> pointer_type_map; + std::map pair_type_map; + std::map constant_map; + + void add_instruction(std::vector ®ion, uint32_t opcode, + std::initializer_list words); + void add_instruction(uint32_t opcode, std::initializer_list words); + void add_instruction(std::vector ®ion, uint32_t opcode, + std::vector words); + void add_instruction(uint32_t opcode, std::vector words); + uint32_t map_type(const Type &type); + uint32_t map_pointer_type(const Type &type, const uint32_t storage_class); + uint32_t map_type_to_pair(const Type &t); + uint32_t emit_constant(const Type &t, const void *data); + void scalarize(Expr e); + + // The scope contains both the symbol and its storage class + Scope> symbol_table; + + // The workgroup size. Must be the same for all kernels. + uint32_t workgroup_size[3]; + + struct PhiNodeInputs { + uint32_t ids[4]; + }; + // Returns Phi node inputs. + template + PhiNodeInputs emit_if_then_else(Expr condition, StmtOrExpr then_case, StmtOrExpr else_case); + } emitter; + + std::string current_kernel_name; +}; + +// -- + +void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(std::vector ®ion, uint32_t opcode, + std::initializer_list words) { + region.push_back(((1 + words.size()) << 16) | opcode); + region.insert(region.end(), words.begin(), words.end()); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(uint32_t opcode, std::initializer_list words) { + spir_v_kernels.push_back(((1 + words.size()) << 16) | opcode); + spir_v_kernels.insert(spir_v_kernels.end(), words.begin(), words.end()); +} +void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(std::vector ®ion, uint32_t opcode, + std::vector words) { + region.push_back(((1 + words.size()) << 16) | opcode); + region.insert(region.end(), words.begin(), words.end()); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(uint32_t opcode, std::vector words) { + spir_v_kernels.push_back(((1 + words.size()) << 16) | opcode); + spir_v_kernels.insert(spir_v_kernels.end(), words.begin(), words.end()); +} + +uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::emit_constant(const Type &t, const void *data) { + // TODO: this needs to emit OpConstantComposite for constants with lane > 1 + std::string key(t.bytes() + 4, ' '); + key[0] = t.code(); + key[1] = t.bits(); + key[2] = t.lanes() & 0xff; + key[3] = (t.lanes() >> 8) & 0xff; + const char *data_char = (const char *)data; + for (int i = 0; i < t.bytes(); i++) { + key[i + 4] = data_char[i]; + } + + debug(3) << "emit_constant for type " << t << "\n"; + auto item = constant_map.find(key); + if (item == constant_map.end()) { + uint32_t type_id = map_type(t); + uint32_t extra_words = (t.bytes() + 3) / 4; + uint32_t constant_id = next_id++; + 
spir_v_types.push_back(((3 + extra_words) << 16) | SpvOpConstant); + spir_v_types.push_back(type_id); + spir_v_types.push_back(constant_id); + + const uint8_t *data_temp = (const uint8_t *)data; + size_t bytes_copied = 0; + for (uint32_t i = 0; i < extra_words; i++) { + uint32_t word; + size_t to_copy = std::min(t.bytes() - bytes_copied, (size_t)4); + memcpy(&word, data_temp, to_copy); + bytes_copied += to_copy; + spir_v_types.push_back(word); + data_temp++; + } + return constant_id; + } else { + return item->second; + } +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::scalarize(Expr e) { + internal_assert(e.type().is_vector()) << "CodeGen_Vulkan_Dev::SPIRVEmitter::scalarize must be called with an expression of vector type.\n"; + uint32_t type_id = map_type(e.type()); + + uint32_t result_id = next_id++; + add_instruction(SpvOpConstantNull, { type_id, result_id } ); + + for (int i = 0; i < e.type().lanes(); i++) { + extract_lane(e, i).accept(this); + uint32_t composite_vec = next_id++; + add_instruction(SpvOpVectorInsertDynamic, { type_id, composite_vec, (uint32_t)i, result_id, id }); + result_id = composite_vec; + } + id = result_id; +} + +uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_type(const Type &t) { + auto key_typecode = t.code(); + + Type t_key(key_typecode, t.bits(), t.lanes()); + + auto item = type_map.find(t_key); + if (item == type_map.end()) { + // TODO, handle arrays, pointers, halide_buffer_t + uint32_t type_id = 0; + if (t.lanes() != 1) { + uint32_t base_id = map_type(t.with_lanes(1)); + type_id = next_id++; + add_instruction(spir_v_types, SpvOpTypeVector, { type_id, base_id, (uint32_t)t.lanes() }); + } else { + if (t.is_float()) { + type_id = next_id++; + add_instruction(spir_v_types, SpvOpTypeFloat, { type_id, (uint32_t)t.bits() }); + } else if (t.is_bool()) { + type_id = next_id++; + add_instruction(spir_v_types, SpvOpTypeBool, { type_id }); + } else if (t.is_int_or_uint()) { + type_id = next_id++; + uint32_t signedness = t.is_uint() ? 
0 : 1; + add_instruction(spir_v_types, SpvOpTypeInt, { type_id, (uint32_t)t.bits(), signedness }); + } else { + internal_error << "Unsupported type in Vulkan backend " << t << "\n"; + } + } + type_map[t_key] = type_id; + return type_id; + } else { + return item->second; + } +} + +uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_type_to_pair(const Type &t) { + uint32_t &ref = pair_type_map[t]; + + if (ref == 0) { + uint32_t base_type = map_type(t); + + uint32_t type_id = next_id++; + + add_instruction(spir_v_types, SpvOpTypeStruct, { type_id, base_type, base_type }); + ref = type_id; + } + return ref; +} + +uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_pointer_type(const Type &type, const uint32_t storage_class) { + auto key = std::make_pair(type, storage_class); + uint32_t &ref = pointer_type_map[key]; + if (ref == 0) { + uint32_t base_type_id = map_type(type); + ref = next_id++; + add_instruction(spir_v_types, SpvOpTypePointer, { ref, storage_class, base_type_id }); + pointer_type_map[key] = ref; + } + + return ref; +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Variable *var) { + id = symbol_table.get(var->name).first; +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const IntImm *imm) { + id = emit_constant(imm->type, &imm->value); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const UIntImm *imm) { + id = emit_constant(imm->type, &imm->value); +} + +namespace { +void encode_string(std::vector& section, const uint32_t words, + const size_t str_size, const char* str) { + size_t bytes_copied = 0; + for (uint32_t i = 0; i < words; i++) { + uint32_t word; + size_t to_copy = std::min(str_size + 1 - bytes_copied, (size_t)4); + memcpy(&word, str, to_copy); + bytes_copied += to_copy; + section.push_back(word); + str += 4; + } + +} +} +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const StringImm *imm) { + uint32_t extra_words = (imm->value.size() + 1 + 3) / 4; + id = next_id++; + spir_v_kernels.push_back(((2 + extra_words) << 16) | SpvOpString); + spir_v_kernels.push_back(id); + + const char *data_temp = (const char *)imm->value.c_str(); + const size_t data_size = imm->value.size(); + encode_string(spir_v_kernels, extra_words, data_size, data_temp); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const FloatImm *imm) { + user_assert(imm->type.bits() == 32) << "Vulkan backend currently only supports 32-bit floats\n"; + float float_val = (float)(imm->value); + id = emit_constant(imm->type, &float_val); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Cast *op) { + uint32_t opcode = 0; + if (op->value.type().is_float()) { + if (op->type.is_float()) { + opcode = SpvOpFConvert; + } else if (op->type.is_uint()) { + opcode = SpvOpConvertFToU; + } else if (op->type.is_int()) { + opcode = SpvOpConvertFToS; + } else { + internal_error << "Vulkan cast unhandled case " << op->value.type() << " to " << op->type << "\n"; + } + } else if (op->value.type().is_uint()) { + if (op->type.is_float()) { + opcode = SpvOpConvertUToF; + } else if (op->type.is_uint()) { + opcode = SpvOpUConvert; + } else if (op->type.is_int()) { + opcode = SpvOpSatConvertUToS; + } else { + internal_error << "Vulkan cast unhandled case " << op->value.type() << " to " << op->type << "\n"; + } + } else if (op->value.type().is_int()) { + if (op->type.is_float()) { + opcode = SpvOpConvertSToF; + } else if (op->type.is_uint()) { + opcode = SpvOpSatConvertSToU; + } else if (op->type.is_int()) { + opcode = SpvOpSConvert; + } else { + internal_error << "Vulkan cast unhandled case " << op->value.type() << " to " << 
op->type << "\n"; + } + } else { + internal_error << "Vulkan cast unhandled case " << op->value.type() << " to " << op->type << "\n"; + } + + uint32_t type_id = map_type(op->type); + op->value.accept(this); + uint32_t src_id = id; + id = next_id++; + add_instruction(opcode, { type_id, id, src_id }); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Add *op) { + visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFAdd : SpvOpIAdd); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Sub *op) { + visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFSub : SpvOpISub); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Mul *op) { + visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFMul : SpvOpIMul); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Div *op) { + user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n"; + + if (op->type.is_float()) { + visit_binop(op->type, op->a, op->b, SpvOpFDiv); + } else { + Expr e = lower_int_uint_div(op->a, op->b); + e.accept(this); + } +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Mod *op) { + if (op->type.is_float()) { + // Takes sign of result from op->b + visit_binop(op->type, op->a, op->b, SpvOpFMod); + } else { + Expr e = lower_int_uint_mod(op->a, op->b); + e.accept(this); + } +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Max *op) { + std::string a_name = unique_name('a'); + std::string b_name = unique_name('b'); + Expr a = Variable::make(op->a.type(), a_name); + Expr b = Variable::make(op->b.type(), b_name); + Expr temp = Let::make(a_name, op->a, + Let::make(b_name, op->b, select(a > b, a, b))); + temp.accept(this); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Min *op) { + std::string a_name = unique_name('a'); + std::string b_name = unique_name('b'); + Expr a = Variable::make(op->a.type(), a_name); + Expr b = Variable::make(op->b.type(), b_name); + Expr temp = Let::make(a_name, op->a, + Let::make(b_name, op->b, select(a < b, a, b))); + temp.accept(this); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const EQ *op) { + visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFOrdEqual : SpvOpIEqual); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const NE *op) { + visit_binop(op->type, op->a, op->b, op->type.is_float() ? 
SpvOpFOrdNotEqual : SpvOpINotEqual); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LT *op) { + uint32_t opcode = 0; + if (op->a.type().is_float()) { + opcode = SpvOpFOrdLessThan; + } else if (op->a.type().is_int()) { + opcode = SpvOpSLessThan; + } else if (op->a.type().is_uint()) { + opcode = SpvOpULessThan; + } else { + internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LT *op): unhandled type: " << op->a.type() << "\n"; + } + visit_binop(op->type, op->a, op->b, opcode); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LE *op) { + uint32_t opcode = 0; + if (op->a.type().is_float()) { + opcode = SpvOpFOrdLessThanEqual; + } else if (op->a.type().is_int()) { + opcode = SpvOpSLessThanEqual; + } else if (op->a.type().is_uint()) { + opcode = SpvOpULessThanEqual; + } else { + internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LE *op): unhandled type: " << op->a.type() << "\n"; + } + visit_binop(op->type, op->a, op->b, opcode); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const GT *op) { + uint32_t opcode = 0; + if (op->a.type().is_float()) { + opcode = SpvOpFOrdGreaterThan; + } else if (op->a.type().is_int()) { + opcode = SpvOpSGreaterThan; + } else if (op->a.type().is_uint()) { + opcode = SpvOpUGreaterThan; + } else { + internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const GT *op): unhandled type: " << op->a.type() << "\n"; + } + visit_binop(op->type, op->a, op->b, opcode); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const GE *op) { + uint32_t opcode = 0; + if (op->a.type().is_float()) { + opcode = SpvOpFOrdGreaterThanEqual; + } else if (op->a.type().is_int()) { + opcode = SpvOpSGreaterThanEqual; + } else if (op->a.type().is_uint()) { + opcode = SpvOpUGreaterThanEqual; + } else { + internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const GE *op): unhandled type: " << op->a.type() << "\n"; + } + visit_binop(op->type, op->a, op->b, opcode); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const And *op) { + visit_binop(op->type, op->a, op->b, SpvOpLogicalAnd); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Or *op) { + visit_binop(op->type, op->a, op->b, SpvOpLogicalOr); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Not *op) { + uint32_t type_id = map_type(op->type); + op->a.accept(this); + uint32_t a_id = id; + id = next_id++; + add_instruction(SpvOpLogicalNot, { type_id, id, a_id }); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { + if (op->is_intrinsic(Call::gpu_thread_barrier)) { + // TODO: Check the scopes here and figure out if this is the + // right memory barrier. Might be able to use + // SpvMemorySemanticsMaskNone instead. 
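+        // OpControlBarrier takes an execution scope, a memory scope, and a
+        // memory-semantics operand; in SPIR-V these are <id>s of integer
+        // constants (e.g. a constant holding SpvScopeWorkgroup). The words
+        // below currently reuse current_function_id for both scopes, which
+        // is presumably what the TODO above refers to.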
+ add_instruction(SpvOpControlBarrier, { current_function_id, current_function_id, + SpvMemorySemanticsAcquireReleaseMask }); + } else if (op->is_intrinsic(Call::bitwise_and)) { + internal_assert(op->args.size() == 2); + visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseAnd); + } else if (op->is_intrinsic(Call::bitwise_xor)) { + internal_assert(op->args.size() == 2); + visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseXor); + } else if (op->is_intrinsic(Call::bitwise_or)) { + internal_assert(op->args.size() == 2); + visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseOr); + } else if (op->is_intrinsic(Call::bitwise_not)) { + internal_assert(op->args.size() == 1); + uint32_t type_id = map_type(op->type); + op->args[0]->accept(this); + uint32_t arg_id = id; + id = next_id++; + add_instruction(SpvOpNot, { type_id, id, arg_id }); + } else if (op->is_intrinsic(Call::reinterpret)) { + } else if (op->is_intrinsic(Call::if_then_else)) { + if (op->type.is_vector()) { + scalarize(op); + } else { + internal_assert(op->args.size() == 3); + auto phi_inputs = emit_if_then_else(op->args[0], op->args[1], op->args[2]); + // Generate Phi node if used as an expression. + + uint32_t type_id = map_type(op->type); + id = next_id++; + spir_v_kernels.push_back((7 << 16) | SpvOpPhi); + spir_v_kernels.push_back(type_id); + spir_v_kernels.push_back(id); + spir_v_kernels.insert(spir_v_kernels.end(), phi_inputs.ids, phi_inputs.ids + 4); + } + } else if (op->is_intrinsic(Call::IntrinsicOp::div_round_to_zero)) { + internal_assert(op->args.size() == 2); + uint32_t opcode = 0; + if (op->type.is_int()) { + opcode = SpvOpSDiv; + } else if (op->type.is_uint()) { + opcode = SpvOpUDiv; + } else { + internal_error << "div_round_to_zero of non-integer type.\n"; + } + visit_binop(op->type, op->args[0], op->args[1], opcode); + } else if (op->is_intrinsic(Call::IntrinsicOp::mod_round_to_zero)) { + internal_assert(op->args.size() == 2); + uint32_t opcode = 0; + if (op->type.is_int()) { + opcode = SpvOpSMod; + } else if (op->type.is_uint()) { + opcode = SpvOpUMod; + } else { + internal_error << "mod_round_to_zero of non-integer type.\n"; + } + visit_binop(op->type, op->args[0], op->args[1], opcode); + } else if (op->is_intrinsic(Call::IntrinsicOp::mul_shift_right)) { + internal_assert(op->args.size() == 3); + uint32_t type_id = map_type(op->type); + + op->args[0].accept(this); + uint32_t a_id = id; + op->args[1].accept(this); + uint32_t b_id = id; + + uint32_t pair_type_id = map_type_to_pair(op->type); + + // Double width multiply + uint32_t product_pair = next_id++; + spir_v_kernels.push_back((5 << 16) | (op->type.is_uint() ? SpvOpUMulExtended : SpvOpSMulExtended)); + spir_v_kernels.push_back(pair_type_id); + spir_v_kernels.push_back(a_id); + spir_v_kernels.push_back(b_id); + + uint32_t high_item_id = next_id++; + spir_v_kernels.push_back((5 << 16) | SpvOpCompositeExtract); + spir_v_kernels.push_back(type_id); + spir_v_kernels.push_back(high_item_id); + spir_v_kernels.push_back(product_pair); + spir_v_kernels.push_back(1); + + const UIntImm *shift = op->args[2].as(); + internal_assert(shift != nullptr) << "Third argument to mul_shift_right intrinsic must be an unsigned integer immediate.\n"; + + uint32_t result_id; + if (shift->value != 0) { + // TODO: This code depends on compilation happening on a little-endian host. + uint32_t shr_id = emit_constant(shift->type, &shift->value); + result_id = next_id++; + spir_v_kernels.push_back((5 << 16) | (op->type.is_uint() ? 
SpvOpShiftRightLogical : SpvOpShiftRightArithmetic)); + spir_v_kernels.push_back(type_id); + spir_v_kernels.push_back(result_id); + spir_v_kernels.push_back(high_item_id); + spir_v_kernels.push_back(shr_id); + } else { + result_id = high_item_id; + } + + id = result_id; + } else if (op->is_intrinsic(Call::IntrinsicOp::sorted_avg)) { + internal_assert(op->args.size() == 2); + // b > a, so the following works without widening: + // a + (b - a)/2 + Expr e = op->args[0] + (op->args[1] - op->args[0]) / 2; + e.accept(this); + } +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Select *op) { + uint32_t type_id = map_type(op->type); + op->condition.accept(this); + uint32_t cond_id = id; + op->true_value.accept(this); + uint32_t true_id = id; + op->false_value.accept(this); + uint32_t false_id = id; + id = next_id++; + add_instruction(SpvOpSelect, { type_id, id, cond_id, true_id, false_id }); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Load *op) { + debug(2) << "Vulkan codegen: Load: " << (Expr)op << "\n"; + user_assert(is_const_one(op->predicate)) << "Predicated loads not supported by the Vulkan backend\n"; + + // TODO: implement vector loads + // TODO: correct casting to the appropriate memory space + + internal_assert(!(op->index.type().is_vector())); + internal_assert(op->param.defined() && op->param.is_buffer()); + + // Construct the pointer to read from + auto id_and_storage_class = symbol_table.get(op->name); + uint32_t base_id = id_and_storage_class.first; + uint32_t storage_class = id_and_storage_class.second; + + op->index.accept(this); + uint32_t index_id = id; + uint32_t ptr_type_id = map_pointer_type(op->type, storage_class); + uint32_t access_chain_id = next_id++; + auto zero = 0; + add_instruction(SpvOpInBoundsAccessChain, {ptr_type_id, access_chain_id, base_id, + emit_constant(UInt(32), &zero), index_id}); + + id = next_id++; + uint32_t result_type_id = map_type(op->type); + add_instruction(SpvOpLoad, {result_type_id, id, access_chain_id}); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Store *op) { + debug(2) << "Vulkan codegen: Store: " << (Stmt)op << "\n"; + + user_assert(is_const_one(op->predicate)) << "Predicated stores not supported by the Vulkan backend\n"; + + // TODO: implement vector writes + // TODO: correct casting to the appropriate memory space + + internal_assert(!(op->index.type().is_vector())); + internal_assert(op->param.defined() && op->param.is_buffer()); + + op->value.accept(this); + uint32_t value_id = id; + + // Construct the pointer to write to + auto id_and_storage_class = symbol_table.get(op->name); + uint32_t base_id = id_and_storage_class.first; + uint32_t storage_class = id_and_storage_class.second; + + op->index.accept(this); + uint32_t index_id = id; + uint32_t ptr_type_id = map_pointer_type(op->value.type(), storage_class); + uint32_t access_chain_id = next_id++; + auto zero = 0; + add_instruction(SpvOpInBoundsAccessChain, {ptr_type_id, access_chain_id, base_id, + emit_constant(UInt(32), &zero), index_id}); + + add_instruction(SpvOpStore, {access_chain_id, value_id}); + +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Let *let) { + let->value.accept(this); + ScopedBinding> binding(symbol_table, let->name, {id, SpvStorageClassFunction}); + let->body.accept(this); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LetStmt *let) { + let->value.accept(this); + ScopedBinding> binding(symbol_table, let->name, {id, SpvStorageClassFunction}); + let->body.accept(this); + // TODO: Figure out undef here? 
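+    // A LetStmt produces no SSA value, so poison `id` with an obviously
+    // invalid sentinel to make any accidental downstream use easy to spot.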
+ id = 0xffffffff; +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const AssertStmt *) { + // TODO: Fill this in. +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const ProducerConsumer *) { + // I believe these nodes are solely for annotation purposes. +#if 0 + string name; + if (op->is_producer) { + name = std::string("produce ") + op->name; + } else { + name = std::string("consume ") + op->name; + } + BasicBlock *produce = BasicBlock::Create(*context, name, function); + builder->CreateBr(produce); + builder->SetInsertPoint(produce); + codegen(op->body); +#endif +} + +namespace { +std::pair simt_intrinsic(const std::string &name) { + if (ends_with(name, ".__thread_id_x")) { + return {"LocalInvocationId", 0}; + } else if (ends_with(name, ".__thread_id_y")) { + return {"LocalInvocationId", 1}; + } else if (ends_with(name, ".__thread_id_z")) { + return {"LocalInvocationId", 2}; + } else if (ends_with(name, ".__block_id_x")) { + return {"WorkgroupId", 0}; + } else if (ends_with(name, ".__block_id_y")) { + return {"WorkgroupId", 1}; + } else if (ends_with(name, ".__block_id_z")) { + return {"WorkgroupId", 2}; + } else if (ends_with(name, "id_w")) { + user_error << "Vulkan only supports <=3 dimensions for gpu blocks"; + } + internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; + return {"", -1}; +} +int thread_loop_workgroup_index(const std::string &name) { + std::string ids[] = {".__thread_id_x", + ".__thread_id_y", + ".__thread_id_z"}; + for (size_t i = 0; i < sizeof(ids) / sizeof(std::string); i++) { + if (ends_with(name, ids[i])) { return i; } + } + return -1; +} +} // anonymous namespace + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { + + if (is_gpu_var(op->name)) { + internal_assert((op->for_type == ForType::GPUBlock) || + (op->for_type == ForType::GPUThread)) + << "kernel loops must be either gpu block or gpu thread\n"; + // This should always be true at this point in codegen + internal_assert(is_const_zero(op->min)); + + // Save & validate the workgroup size + int idx = thread_loop_workgroup_index(op->name); + if (idx >= 0) { + const IntImm *wsize = op->extent.as(); + user_assert(wsize != nullptr) << "Vulkan requires statically-known workgroup size.\n"; + uint32_t new_wsize = wsize->value; + user_assert(workgroup_size[idx] == 0 || workgroup_size[idx] == new_wsize) << + "Vulkan requires all kernels have the same workgroup size, but two different ones " + "were encountered " << workgroup_size[idx] << " and " << new_wsize << + " in dimension " << idx << "\n"; + workgroup_size[idx] = new_wsize; + } + + auto intrinsic = simt_intrinsic(op->name); + + + // Intrinsics are inserted when adding the kernel + internal_assert(symbol_table.contains(intrinsic.first)); + + uint32_t intrinsic_id = symbol_table.get(intrinsic.first).first; + uint32_t gpu_var_id = next_id++; + uint32_t unsigned_gpu_var_id = next_id++; + add_instruction(SpvOpCompositeExtract, {map_type(UInt(32)), unsigned_gpu_var_id, intrinsic_id, intrinsic.second}); + // cast to int, which is what's expected by Halide's for loops + add_instruction(SpvOpBitcast, {map_type(Int(32)), gpu_var_id, unsigned_gpu_var_id}); + + { + ScopedBinding> binding(symbol_table, op->name, {gpu_var_id, SpvStorageClassUniform}); + op->body.accept(this); + } + + } else { + + internal_assert(op->for_type == ForType::Serial) << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit unhandled For type: " << op->for_type << "\n"; + + // TODO: Loop vars are alway int32_t right? 
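+    // Serial loops are emitted as a structured SPIR-V loop with the counter
+    // held in a Function-storage variable, roughly:
+    //   header:   OpLoopMerge %merge %continue ; OpBranch %top
+    //   top:      i = OpLoad ; OpBranchConditional (i <= max) %body %merge
+    //   body:     ... ; OpBranch %continue
+    //   continue: OpStore (i + 1) ; OpBranch %header
+    //   merge: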
+ uint32_t index_type_id = map_type(Int(32)); + uint32_t index_var_type_id = map_pointer_type(Int(32), SpvStorageClassFunction); + + op->min.accept(this); + uint32_t min_id = id; + op->extent.accept(this); + uint32_t extent_id = id; + + // Compute max. + uint32_t max_id = next_id++; + add_instruction(SpvOpIAdd, { index_type_id, max_id, min_id, extent_id }); + + // Declare loop var + // TODO: Can we use the phi node for this? + uint32_t loop_var_id = next_id++; + add_allocation(index_var_type_id, loop_var_id, SpvStorageClassFunction, min_id); + + uint32_t header_label_id = next_id++; + uint32_t loop_top_label_id = next_id++; + uint32_t body_label_id = next_id++; + uint32_t continue_label_id = next_id++; + uint32_t merge_label_id = next_id++; + add_instruction(SpvOpLabel, { header_label_id }); + add_instruction(SpvOpLoopMerge, { merge_label_id, continue_label_id, SpvLoopControlMaskNone }); + add_instruction(SpvOpBranch, { loop_top_label_id }); + add_instruction(SpvOpLabel, { loop_top_label_id }); + + // loop test. + uint32_t cur_index_id = next_id++; + add_instruction(SpvOpLoad, { index_type_id, cur_index_id, loop_var_id }); + + uint32_t loop_test_id = next_id++; + add_instruction(SpvOpSLessThanEqual, { loop_test_id, cur_index_id, max_id }); + add_instruction(SpvOpBranchConditional, { loop_test_id, body_label_id, merge_label_id }); + + add_instruction(SpvOpLabel, { body_label_id }); + + { + ScopedBinding> binding(symbol_table, op->name, {cur_index_id, SpvStorageClassFunction}); + + op->body.accept(this); + } + + add_instruction(SpvOpBranch, { continue_label_id }); + add_instruction(SpvOpLabel, { continue_label_id }); + + // Loop var update? + uint32_t next_index_id = next_id++; + int32_t one = 1; + uint32_t constant_one_id = emit_constant(Int(32), &one); + add_instruction(SpvOpIAdd, { index_type_id, next_index_id, cur_index_id, constant_one_id}); + add_instruction(SpvOpStore, { index_type_id, next_index_id, loop_var_id }); + add_instruction(SpvOpBranch, { header_label_id }); + add_instruction(SpvOpLabel, { merge_label_id }); + } +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Ramp *op) { + // TODO: Is there a way to do this that doesn't require duplicating lane values? + uint32_t base_type_id = map_type(op->base.type()); + uint32_t type_id = map_type(op->type); + op->base.accept(this); + uint32_t base_id = id; + op->stride.accept(this); + uint32_t stride_id = id; + uint32_t add_opcode = op->base.type().is_float() ? SpvOpFAdd : SpvOpIAdd; + // Generate adds to make the elements of the ramp. + uint32_t prev_id = base_id; + uint32_t first_id = next_id; + for (int i = 1; i < op->lanes; i++) { + uint32_t this_id = next_id++; + add_instruction(add_opcode, { base_type_id, this_id, prev_id, stride_id }); + prev_id = this_id; + } + + id = next_id++; + spir_v_kernels.push_back(((op->lanes + 3) << 16) | SpvOpCompositeConstruct); + spir_v_kernels.push_back(type_id); + spir_v_kernels.push_back(id); + spir_v_kernels.push_back(base_id); + for (int i = 1; i < op->lanes; i++) { + spir_v_kernels.push_back(first_id++); + } +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Broadcast *op) { + // TODO: Is there a way to do this that doesn't require duplicating lane values? 
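+    // A broadcast is a single OpCompositeConstruct whose operand list
+    // repeats the scalar value once per lane.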
+ uint32_t type_id = map_type(op->type); + op->value.accept(this); + uint32_t value_id = id; + id = next_id++; + spir_v_kernels.push_back(((op->lanes + 3) << 16) | SpvOpCompositeConstruct); + spir_v_kernels.push_back(type_id); + spir_v_kernels.push_back(id); + spir_v_kernels.insert(spir_v_kernels.end(), op->lanes, value_id); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Provide *) { + internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Provide *): Provide encountered during codegen\n"; +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Allocate *) { +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Free *) { +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Realize *) { + internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Realize *): Realize encountered during codegen\n"; +} + +template +CodeGen_Vulkan_Dev::SPIRVEmitter::PhiNodeInputs +CodeGen_Vulkan_Dev::SPIRVEmitter::emit_if_then_else(Expr condition, + StmtOrExpr then_case, StmtOrExpr else_case) { + condition.accept(this); + uint32_t cond_id = id; + uint32_t then_label_id = next_id++; + uint32_t else_label_id = next_id++; + uint32_t merge_label_id = next_id++; + + add_instruction(SpvOpSelectionMerge, {merge_label_id, SpvSelectionControlMaskNone}); + add_instruction(SpvOpBranchConditional, { cond_id, then_label_id, else_label_id }); + add_instruction(SpvOpLabel, { then_label_id }); + + then_case.accept(this); + uint32_t then_id = id; + + add_instruction(SpvOpBranch, { merge_label_id }); + add_instruction(SpvOpLabel, { else_label_id }); + + else_case.accept(this); + uint32_t else_id = id; + + // Every basic block must end with a branch instruction + add_instruction(SpvOpBranch, {merge_label_id}); + + add_instruction(SpvOpLabel, { merge_label_id }); + + return {{ then_id, then_label_id, else_id, else_label_id }}; +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const IfThenElse *op) { + emit_if_then_else(op->condition, op->then_case, op->else_case); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Evaluate *op) { + op->value.accept(this); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Shuffle *op) { + internal_assert(op->vectors.size() == 2) << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Shuffle *op): SPIR-V codegen currently only supports shuffles of vector pairs.\n"; + uint32_t type_id = map_type(op->type); + op->vectors[0].accept(this); + uint32_t vector0_id = id; + op->vectors[1].accept(this); + uint32_t vector1_id = id; + + id = next_id++; + spir_v_kernels.push_back(((5 + op->indices.size()) << 16) | SpvOpPhi); + spir_v_kernels.push_back(type_id); + spir_v_kernels.push_back(id); + spir_v_kernels.push_back(vector0_id); + spir_v_kernels.push_back(vector1_id); + spir_v_kernels.insert(spir_v_kernels.end(), op->indices.begin(), op->indices.end()); +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Prefetch *) { + internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Prefetch *): Prefetch encountered during codegen\n"; +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Fork *) { + internal_error << "void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Fork *) not supported yet."; +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Acquire *) { + internal_error << "void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Acquire *) not supported yet."; +} + +// TODO: fast math decorations. 
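+// Shared helper for the arithmetic, logical, and comparison visitors above:
+// evaluate both operands, then emit a single instruction of the requested
+// opcode producing a value of the mapped result type.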
+void CodeGen_Vulkan_Dev::SPIRVEmitter::visit_binop(Type t, Expr a, Expr b, uint32_t opcode) { + uint32_t type_id = map_type(t); + a.accept(this); + uint32_t a_id = id; + b.accept(this); + uint32_t b_id = id; + id = next_id++; + add_instruction(opcode, { type_id, id, a_id, b_id }); +} + + +void CodeGen_Vulkan_Dev::SPIRVEmitter::add_allocation(uint32_t result_type_id, + uint32_t result_id, + uint32_t storage_class, + uint32_t initializer) { + if (initializer) { + add_instruction(spir_v_kernel_allocations, SpvOpVariable, {result_type_id, + result_id, + storage_class, + initializer}); + } else { + add_instruction(spir_v_kernel_allocations, SpvOpVariable, {result_type_id, + result_id, + storage_class}); + } +} + +void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(Stmt s, + const std::string &name, + const std::vector &args) { + debug(2) << "Adding Vulkan kernel " << name << "\n"; + + // Add function definition + // TODO: can we use one of the function control annotations? + + // We'll discover the workgroup size as we traverse the kernel + workgroup_size[0] = 0; + workgroup_size[1] = 0; + workgroup_size[2] = 0; + + // Declare the function type. TODO: should this be unique? + uint32_t function_type_id = next_id++; + + add_instruction(spir_v_types, SpvOpTypeFunction, {function_type_id, void_id}); + + // Add definition and parameters + current_function_id = next_id++; + add_instruction(SpvOpFunction, {void_id, current_function_id, SpvFunctionControlMaskNone, function_type_id}); + + // Insert the starting label + add_instruction(SpvOpLabel, {next_id++}); + + // TODO: what about variables that need the SIMT intrinsics for their initializer? + // Save the location where we'll insert OpVariable instructions + size_t index = spir_v_kernels.size(); + + std::vector entry_point_interface; + entry_point_interface.push_back(SpvExecutionModelGLCompute); + entry_point_interface.push_back(current_function_id); + // Add the string name of the function + encode_string(entry_point_interface, (name.size() + 1 + 3)/4, name.size(), name.c_str()); + + + + // TODO: only add the SIMT intrinsics used + auto intrinsics = {"WorkgroupId", "LocalInvocationId"}; + for (auto intrinsic: intrinsics) { + uint32_t intrinsic_id = next_id++; + uint32_t intrinsic_loaded_id = next_id++; + // The builtins are pointers to vec3 + uint32_t intrinsic_type_id = map_pointer_type(Type(Type::UInt, 32, 3), SpvStorageClassInput); + + add_instruction(spir_v_types, SpvOpVariable, {intrinsic_type_id, intrinsic_id, SpvStorageClassInput}); + add_instruction(SpvOpLoad, {map_type(Type(Type::UInt, 32, 3)), intrinsic_loaded_id, intrinsic_id}); + symbol_table.push(intrinsic, {intrinsic_loaded_id, SpvStorageClassInput}); + + // Annotate that this is the specific builtin + auto built_in_kind = starts_with(intrinsic, "Workgroup") ? 
SpvBuiltInWorkgroupId : SpvBuiltInLocalInvocationId; + add_instruction(spir_v_annotations, SpvOpDecorate, {intrinsic_id, SpvDecorationBuiltIn, built_in_kind}); + + // Add the builtin to the interface + entry_point_interface.push_back(intrinsic_id); + } + + + // Add the entry point and exection mode + add_instruction(spir_v_entrypoints, + SpvOpEntryPoint, entry_point_interface); + + // GLSL-style: each input buffer is a runtime array in a buffer struct + // All other params get passed in as a single uniform block + // First, need to count scalar parameters to construct the uniform struct + std::vector scalar_types; + uint32_t offset = 0; + uint32_t param_pack_type_id = next_id++; + uint32_t param_pack_ptr_type_id = next_id++; + uint32_t param_pack_id = next_id++; + scalar_types.push_back(param_pack_type_id); + for (size_t i = 0; i < args.size(); i++) { + if (!args[i].is_buffer) { + // record the type for later constructing the params struct type + scalar_types.push_back(map_type(args[i].type)); + + // Add a decoration describing the offset + add_instruction(spir_v_annotations, SpvOpMemberDecorate, {param_pack_type_id, + (uint32_t)(scalar_types.size()-2), + SpvDecorationOffset, + offset}); + offset += args[i].type.bytes(); + } + } + + // Add a Block decoration for the parameter pack itself + add_instruction(spir_v_annotations, SpvOpDecorate, {param_pack_type_id, SpvDecorationBlock}); + // We always pass in the parameter pack as the first binding + add_instruction(spir_v_annotations, SpvOpDecorate, {param_pack_id, SpvDecorationDescriptorSet, 0}); + add_instruction(spir_v_annotations, SpvOpDecorate, {param_pack_id, SpvDecorationBinding, 0}); + + // Add a struct type for the parameter pack and a pointer to it + add_instruction(spir_v_types, SpvOpTypeStruct, scalar_types); + add_instruction(spir_v_types, SpvOpTypePointer, {param_pack_ptr_type_id, SpvStorageClassUniform, param_pack_type_id}); + // Add a variable for the parameter pack + add_instruction(spir_v_types, SpvOpVariable, {param_pack_ptr_type_id, param_pack_id, SpvStorageClassUniform}); + + + uint32_t binding_counter = 1; + uint32_t scalar_index = 0; + for (size_t i = 0; i < args.size(); i++) { + uint32_t param_id = next_id++; + if (args[i].is_buffer) { + uint32_t element_type = map_type(args[i].type); + uint32_t runtime_arr_type = next_id++; + uint32_t struct_type = next_id++; + uint32_t ptr_struct_type = next_id++; + add_instruction(spir_v_types, SpvOpTypeRuntimeArray, {runtime_arr_type, element_type}); + add_instruction(spir_v_types, SpvOpTypeStruct, {struct_type, runtime_arr_type}); + add_instruction(spir_v_types, SpvOpTypePointer, {ptr_struct_type, + SpvStorageClassUniform, + struct_type}); + // Annotate the struct to indicate it's passed in a GLSL-style buffer block + add_instruction(spir_v_annotations, SpvOpDecorate, {struct_type, SpvDecorationBufferBlock}); + // Annotate the array with its stride + add_instruction(spir_v_annotations, SpvOpDecorate, {runtime_arr_type, + SpvDecorationArrayStride, + (uint32_t)(args[i].type.bytes())}); + // Annotate the offset for the array + add_instruction(spir_v_annotations, SpvOpMemberDecorate, {struct_type, 0, SpvDecorationOffset, (uint32_t)0}); + + // Set DescriptorSet and Binding + add_instruction(spir_v_annotations, SpvOpDecorate, {param_id, SpvDecorationDescriptorSet, 0}); + add_instruction(spir_v_annotations, SpvOpDecorate, {param_id, SpvDecorationBinding, binding_counter++}); + + add_instruction(spir_v_types, SpvOpVariable, {ptr_struct_type, param_id, SpvStorageClassUniform}); + } else { + 
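+            // Scalar (non-buffer) arguments live in the uniform parameter
+            // pack declared above; each one is fetched with an access chain
+            // to its struct member at scalar_index and then loaded.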
uint32_t access_chain_id = next_id++; + add_instruction(SpvOpInBoundsAccessChain, {map_pointer_type(args[i].type, SpvStorageClassUniform), + access_chain_id, + param_pack_id, + emit_constant(UInt(32), &scalar_index)}); + scalar_index++; + add_instruction(SpvOpLoad, {map_type(args[i].type), param_id, access_chain_id}); + } + symbol_table.push(args[i].name, {param_id, SpvStorageClassUniform}); + } + + + s.accept(this); + + // Insert return and function end delimiter + add_instruction(SpvOpReturn, {}); + add_instruction(SpvOpFunctionEnd, {}); + + // Insert the allocations in the right place + auto it = spir_v_kernels.begin() + index; + spir_v_kernels.insert(it, spir_v_kernel_allocations.begin(), spir_v_kernel_allocations.end()); + spir_v_kernel_allocations.clear(); + + workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); + workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); + workgroup_size[2] = std::max(workgroup_size[2], (uint32_t)1); + // Add workgroup size to execution mode + add_instruction(spir_v_execution_modes, SpvOpExecutionMode, + {current_function_id, SpvExecutionModeLocalSize, + workgroup_size[0], workgroup_size[1], workgroup_size[2]}); + + // Pop scope + for (auto arg: args) { + symbol_table.pop(arg.name); + } + + // Reset to an invalid value for safety. + current_function_id = 0; +} + +CodeGen_Vulkan_Dev::CodeGen_Vulkan_Dev(Target t) { +} + +namespace { +void add_extension(const std::string &extension_name, std::vector §ion) { + uint32_t extra_words = (extension_name.size() + 1 + 3) / 4; + section.push_back(((1 + extra_words) << 16) | SpvOpExtension); + + const char *data_temp = (const char *)extension_name.c_str(); + const size_t data_size = extension_name.size(); + encode_string(section, extra_words, data_size, data_temp); +} +} +void CodeGen_Vulkan_Dev::init_module() { + debug(2) << "Vulkan device codegen init_module\n"; + + // Header. + emitter.spir_v_header.push_back(SpvMagicNumber); + emitter.spir_v_header.push_back(SpvVersion); + emitter.spir_v_header.push_back(SpvSourceLanguageUnknown); + emitter.spir_v_header.push_back(0); // Bound placeholder + emitter.spir_v_header.push_back(0); // Reserved for schema. + + // the unique void type + emitter.next_id++; // 0 is not a valid id + emitter.void_id = emitter.next_id++; + emitter.add_instruction(emitter.spir_v_types, SpvOpTypeVoid, {emitter.void_id}); + + // Capabilities + // TODO: only add those required by the generated code + emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityShader}); + //emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityInt8}); + //emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityUniformAndStorageBuffer8BitAccess}); + + // Extensions + // TODO: only add those required by the generated code + add_extension(std::string("SPV_KHR_8bit_storage"), emitter.spir_v_header); + + // Memory model + // TODO: 32-bit or 64-bit? + // TODO: Which memory model? + emitter.add_instruction(emitter.spir_v_header, SpvOpMemoryModel, + {SpvAddressingModelLogical, SpvMemoryModelGLSL450}); + + + // OpCapability instructions + // Enumerate type maps and add subwidth integer types if used + // OpExtensions instructions + // OpExtImport instructions + // One OpMemoryModelInstruction + // OpEntryPoint instructions -- tricky as we don't know them until the kernels are added. May need to insert as we go. + // OpExecutionMode or OpExecutionModeId -- are these also added at add_kernel time? + // debug -- empty? 
+ // annotation + // I believe alignment info for load/store/etc. is done with annotations. + // Also need various annotations for SIMT intrinsics, struct layouts, etc + // OpType instructions. Contained in spir_v_types member. + // Function declarations. Are there any? + // Function bodies -- one per add_kernel +} + +void CodeGen_Vulkan_Dev::add_kernel(Stmt stmt, + const std::string &name, + const std::vector &args) { + current_kernel_name = name; + emitter.add_kernel(stmt, name, args); + //dump(); +} + +std::vector CodeGen_Vulkan_Dev::compile_to_src() { + //#ifdef WITH_VULKAN + + emitter.spir_v_header[3] = emitter.next_id; + + std::vector final_module; + size_t total_size = (emitter.spir_v_header.size() + emitter.spir_v_entrypoints.size() + emitter.spir_v_execution_modes.size() + emitter.spir_v_annotations.size() + emitter.spir_v_types.size() + emitter.spir_v_kernels.size()) * sizeof(uint32_t); + final_module.reserve(total_size); + final_module.insert(final_module.end(), (const char *)emitter.spir_v_header.data(), (const char *)(emitter.spir_v_header.data() + emitter.spir_v_header.size())); + final_module.insert(final_module.end(), (const char *)emitter.spir_v_entrypoints.data(), (const char *)(emitter.spir_v_entrypoints.data() + emitter.spir_v_entrypoints.size())); + final_module.insert(final_module.end(), (const char *)emitter.spir_v_execution_modes.data(), (const char *)(emitter.spir_v_execution_modes.data() + emitter.spir_v_execution_modes.size())); + final_module.insert(final_module.end(), (const char *)emitter.spir_v_annotations.data(), (const char *)(emitter.spir_v_annotations.data() + emitter.spir_v_annotations.size())); + final_module.insert(final_module.end(), (const char *)emitter.spir_v_types.data(), (const char *)(emitter.spir_v_types.data() + emitter.spir_v_types.size())); + final_module.insert(final_module.end(), (const char *)emitter.spir_v_kernels.data(), (const char *)(emitter.spir_v_kernels.data() + emitter.spir_v_kernels.size())); + assert(final_module.size() == total_size); + std::ofstream f("/home/skamil/out.spv", std::ios::out | std::ios::binary); + f.write((char*)(final_module.data()), final_module.size()); + f.close(); + + return final_module; + + //#endif +} + +std::string CodeGen_Vulkan_Dev::get_current_kernel_name() { + return current_kernel_name; +} + +std::string CodeGen_Vulkan_Dev::print_gpu_name(const std::string &name) { + return name; +} + +void CodeGen_Vulkan_Dev::dump() { + // TODO: Figure out what goes here. 
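+    // One option for a human-readable dump would be to run the emitted
+    // binary through spirv-dis (and spirv-val) from SPIRV-Tools.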
+ // For now: dump to file so source can be consumed by validator + auto module = compile_to_src(); + std::ofstream f("out.spv", std::ios::out | std::ios::binary); + f.write((char*)(module.data()), module.size()); + f.close(); +} + +} // namespace + +std::unique_ptr new_CodeGen_Vulkan_Dev(const Target &target) { + return std::make_unique(target); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/CodeGen_Vulkan_Dev.h b/src/CodeGen_Vulkan_Dev.h new file mode 100644 index 000000000000..f67d6d9859f2 --- /dev/null +++ b/src/CodeGen_Vulkan_Dev.h @@ -0,0 +1,23 @@ +#ifndef HALIDE_CODEGEN_VULKAN_DEV_H +#define HALIDE_CODEGEN_VULKAN_DEV_H + +/** \file + * Defines the code-generator for producing Vulkan SPIR-V kernel code + */ + +#include + +namespace Halide { + +struct Target; + +namespace Internal { + +struct CodeGen_GPU_Dev; + +std::unique_ptr new_CodeGen_Vulkan_Dev(const Target &target); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt index 1d1832be504f..5704556d72e3 100644 --- a/src/runtime/CMakeLists.txt +++ b/src/runtime/CMakeLists.txt @@ -80,6 +80,7 @@ set(RUNTIME_CPP to_string trace_helper tracing + vulkan wasm_cpu_features windows_clock windows_cuda @@ -134,6 +135,7 @@ set(RUNTIME_HEADER_FILES HalideRuntimeOpenCL.h HalideRuntimeOpenGLCompute.h HalideRuntimeQurt.h + HalideRuntimeVulkan.h ) # Need to create an object library for this because CMake @@ -225,7 +227,12 @@ foreach (i IN LISTS RUNTIME_CPP) set(SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/${i}.cpp") - set(RUNTIME_DEFINES -DCOMPILING_HALIDE_RUNTIME -DBITS_${j}) + set(RUNTIME_DEFINES -DCOMPILING_HALIDE_RUNTIME -DBITS_${j} + -DHALIDE_VERSION=${Halide_VERSION} + -DHALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} + -DHALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} + -DHALIDE_VERSION_PATCH=${Halide_VERSION_PATCH}) + set(RUNTIME_DEFINES_debug -g -DDEBUG_RUNTIME ${RUNTIME_DEFINES}) foreach (SUFFIX IN ITEMS "" "_debug") diff --git a/src/runtime/HalideRuntimeVulkan.h b/src/runtime/HalideRuntimeVulkan.h new file mode 100644 index 000000000000..ac649194f1f0 --- /dev/null +++ b/src/runtime/HalideRuntimeVulkan.h @@ -0,0 +1,101 @@ +#ifndef HALIDE_HALIDERUNTIMEVULKAN_H +#define HALIDE_HALIDERUNTIMEVULKAN_H + +// Don't include HalideRuntime.h if the contents of it were already pasted into a generated header above this one +#ifndef HALIDE_HALIDERUNTIME_H + +#include "HalideRuntime.h" + +#endif +/** \file + * Routines specific to the Halide Vulkan runtime. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define HALIDE_RUNTIME_VULKAN + +extern const struct halide_device_interface_t *halide_vulkan_device_interface(); + +/** These are forward declared here to allow clients to override the + * Halide Vulkan runtime. Do not call them. */ +// @{ +extern int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, + const char *src, int size); + +extern int halide_vulkan_run(void *user_context, + void *state_ptr, + const char *entry_name, + int blocksX, int blocksY, int blocksZ, + int threadsX, int threadsY, int threadsZ, + int shared_mem_bytes, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); + +extern void halide_vulkan_finalize_kernels(void *user_context, void *state_ptr); + +// @} + +// The default implementation of halide_acquire_vulkan_context uses +// the global pointers above, and serializes access with a spin lock. 
+// Overriding implementations of acquire/release must implement the +// following behavior: + +// - halide_acquire_vulkan_context should always store a valid +// instance/device/queue in the corresponding out parameters, +// or return an error code. +// - A call to halide_acquire_vulkan_context is followed by a matching +// call to halide_release_vulkan_context. halide_acquire_vulkan_context +// should block while a previous call (if any) has not yet been +// released via halide_release_vulkan_context. +// TODO: describe memory type index +// TODO: describe queue family index +struct halide_vulkan_memory_allocator; +extern int halide_vulkan_acquire_context(void *user_context, + struct halide_vulkan_memory_allocator **allocator, + struct VkInstance_T **instance, + struct VkDevice_T **device, + struct VkPhysicalDevice_T **physical_device, + uint64_t *command_pool, + struct VkQueue_T **queue, + uint32_t *queue_family_index, + bool create = true); + +extern int halide_vulkan_release_context(void *user_context, + struct VkInstance_T *instance, + struct VkDevice_T *device, + struct VkQueue_T *queue); + +// -- + +// Override the default allocation callbacks (default uses Vulkan runtime implementation) +extern void halide_vulkan_set_allocation_callbacks(const struct VkAllocationCallbacks *callbacks); + +// Access the current allocation callbacks +// -- may return nullptr ... which indicates the default Vulkan runtime implementation is being used) +extern const struct VkAllocationCallbacks *halide_vulkan_get_allocation_callbacks(void *user_context); + +// Access methods to assign/retrieve required layer names for the context +extern void halide_vulkan_set_layer_names(const char *n); +extern const char *halide_vulkan_get_layer_names(void *user_context); + +// Access methods to assign/retrieve required externsion names for the context +extern void halide_vulkan_set_extension_names(const char *n); +extern const char *halide_vulkan_get_extension_names(void *user_context); + +// Access methods to assign/retrieve required device type names for the context (either "cpu", "gpu" (any), "discrete-gpu" (only), "virtual-gpu" (sw)) +extern void halide_vulkan_set_device_type(const char *n); +extern const char *halide_vulkan_get_device_type(void *user_context); + +// Access methods to assign/retrieve specific build options to the Vulkan runtime compiler +extern void halide_vulkan_set_build_options(const char *n); +extern const char *halide_vulkan_get_build_options(void *user_context); + +#ifdef __cplusplus +} // End extern "C" +#endif + +#endif // HALIDE_HALIDERUNTIMEVULKAN_H diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h new file mode 100644 index 000000000000..f0f10539a162 --- /dev/null +++ b/src/runtime/mini_vulkan.h @@ -0,0 +1,6038 @@ +#ifndef HALIDE_MINI_VULKAN_H +#define HALIDE_MINI_VULKAN_H + +/* +** Copyright (c) 2014-2017 The Khronos Group Inc. +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + +// Pickup integer types +#include "runtime_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* +*************************************************************************************************** +* Platform-specific directives and type declarations +*************************************************************************************************** +*/ + +/* Platform-specific calling convention macros. + * + * Platforms should define these so that Vulkan clients call Vulkan commands + * with the same calling conventions that the Vulkan implementation expects. + * + * VKAPI_ATTR - Placed before the return type in function declarations. + * Useful for C++11 and GCC/Clang-style function attribute syntax. + * VKAPI_CALL - Placed after the return type in function declarations. + * Useful for MSVC-style calling convention syntax. + * VKAPI_PTR - Placed between the '(' and '*' in function pointer types. + * + * Function declaration: VKAPI_ATTR void VKAPI_CALL vkCommand(void); + * Function pointer type: typedef void (VKAPI_PTR *PFN_vkCommand)(void); + */ +#if defined(_WIN32) +// On Windows, Vulkan commands use the stdcall convention +#define VKAPI_ATTR +#define VKAPI_CALL __stdcall +#define VKAPI_PTR VKAPI_CALL +#elif defined(__ANDROID__) && defined(__ARM_ARCH) && __ARM_ARCH < 7 +#error "Vulkan isn't supported for the 'armeabi' NDK ABI" +#elif defined(__ANDROID__) && defined(__ARM_ARCH) && __ARM_ARCH >= 7 && defined(__ARM_32BIT_STATE) +// On Android 32-bit ARM targets, Vulkan functions use the "hardfloat" +// calling convention, i.e. float parameters are passed in registers. This +// is true even if the rest of the application passes floats on the stack, +// as it does by default when compiling for the armeabi-v7a NDK ABI. +#define VKAPI_ATTR __attribute__((pcs("aapcs-vfp"))) +#define VKAPI_CALL +#define VKAPI_PTR VKAPI_ATTR +#else +// On other platforms, use the default calling convention +#define VKAPI_ATTR +#define VKAPI_CALL +#define VKAPI_PTR +#endif + +typedef uint32_t VkFlags; +typedef uint32_t VkBool32; +typedef uint64_t VkDeviceSize; +typedef uint32_t VkSampleMask; + +#define VK_DEFINE_HANDLE(object) typedef struct object##_T *(object); +#define VK_DEFINE_NON_DISPATCHABLE_HANDLE(object) typedef uint64_t object; + +VK_DEFINE_HANDLE(VkInstance) +VK_DEFINE_HANDLE(VkPhysicalDevice) +VK_DEFINE_HANDLE(VkDevice) +VK_DEFINE_HANDLE(VkQueue) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSemaphore) +VK_DEFINE_HANDLE(VkCommandBuffer) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkFence) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDeviceMemory) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkBuffer) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkImage) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkEvent) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkQueryPool) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkBufferView) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkImageView) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkShaderModule) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkPipelineCache) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkPipelineLayout) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkRenderPass) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkPipeline) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDescriptorSetLayout) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSampler) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDescriptorPool) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDescriptorSet) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkFramebuffer) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkCommandPool) + +#define VK_LOD_CLAMP_NONE 1000.0f +#define VK_REMAINING_MIP_LEVELS (~0U) +#define VK_REMAINING_ARRAY_LAYERS (~0U) +#define 
VK_WHOLE_SIZE (~0ULL) +#define VK_ATTACHMENT_UNUSED (~0U) +#define VK_TRUE 1 +#define VK_FALSE 0 +#define VK_QUEUE_FAMILY_IGNORED (~0U) +#define VK_SUBPASS_EXTERNAL (~0U) +#define VK_MAX_PHYSICAL_DEVICE_NAME_SIZE 256 +#define VK_UUID_SIZE 16 +#define VK_MAX_MEMORY_TYPES 32 +#define VK_MAX_MEMORY_HEAPS 16 +#define VK_MAX_EXTENSION_NAME_SIZE 256 +#define VK_MAX_DESCRIPTION_SIZE 256 + +typedef enum VkPipelineCacheHeaderVersion { + VK_PIPELINE_CACHE_HEADER_VERSION_ONE = 1, + VK_PIPELINE_CACHE_HEADER_VERSION_BEGIN_RANGE = VK_PIPELINE_CACHE_HEADER_VERSION_ONE, + VK_PIPELINE_CACHE_HEADER_VERSION_END_RANGE = VK_PIPELINE_CACHE_HEADER_VERSION_ONE, + VK_PIPELINE_CACHE_HEADER_VERSION_RANGE_SIZE = (VK_PIPELINE_CACHE_HEADER_VERSION_ONE - VK_PIPELINE_CACHE_HEADER_VERSION_ONE + 1), + VK_PIPELINE_CACHE_HEADER_VERSION_MAX_ENUM = 0x7FFFFFFF +} VkPipelineCacheHeaderVersion; + +typedef enum VkResult { + VK_SUCCESS = 0, + VK_NOT_READY = 1, + VK_TIMEOUT = 2, + VK_EVENT_SET = 3, + VK_EVENT_RESET = 4, + VK_INCOMPLETE = 5, + VK_ERROR_OUT_OF_HOST_MEMORY = -1, + VK_ERROR_OUT_OF_DEVICE_MEMORY = -2, + VK_ERROR_INITIALIZATION_FAILED = -3, + VK_ERROR_DEVICE_LOST = -4, + VK_ERROR_MEMORY_MAP_FAILED = -5, + VK_ERROR_LAYER_NOT_PRESENT = -6, + VK_ERROR_EXTENSION_NOT_PRESENT = -7, + VK_ERROR_FEATURE_NOT_PRESENT = -8, + VK_ERROR_INCOMPATIBLE_DRIVER = -9, + VK_ERROR_TOO_MANY_OBJECTS = -10, + VK_ERROR_FORMAT_NOT_SUPPORTED = -11, + VK_ERROR_FRAGMENTED_POOL = -12, + VK_ERROR_SURFACE_LOST_KHR = -1000000000, + VK_ERROR_NATIVE_WINDOW_IN_USE_KHR = -1000000001, + VK_SUBOPTIMAL_KHR = 1000001003, + VK_ERROR_OUT_OF_DATE_KHR = -1000001004, + VK_ERROR_INCOMPATIBLE_DISPLAY_KHR = -1000003001, + VK_ERROR_VALIDATION_FAILED_EXT = -1000011001, + VK_ERROR_INVALID_SHADER_NV = -1000012000, + VK_ERROR_OUT_OF_POOL_MEMORY_KHR = -1000069000, + VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = -1000072003, + VK_RESULT_BEGIN_RANGE = VK_ERROR_FRAGMENTED_POOL, + VK_RESULT_END_RANGE = VK_INCOMPLETE, + VK_RESULT_RANGE_SIZE = (VK_INCOMPLETE - VK_ERROR_FRAGMENTED_POOL + 1), + VK_RESULT_MAX_ENUM = 0x7FFFFFFF +} VkResult; + +typedef enum VkStructureType { + VK_STRUCTURE_TYPE_APPLICATION_INFO = 0, + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO = 1, + VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO = 2, + VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO = 3, + VK_STRUCTURE_TYPE_SUBMIT_INFO = 4, + VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO = 5, + VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE = 6, + VK_STRUCTURE_TYPE_BIND_SPARSE_INFO = 7, + VK_STRUCTURE_TYPE_FENCE_CREATE_INFO = 8, + VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO = 9, + VK_STRUCTURE_TYPE_EVENT_CREATE_INFO = 10, + VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO = 11, + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO = 12, + VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO = 13, + VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO = 14, + VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO = 15, + VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO = 16, + VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO = 17, + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO = 18, + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO = 19, + VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO = 20, + VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO = 21, + VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO = 22, + VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO = 23, + VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO = 24, + VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO = 25, + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO = 26, + 
VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO = 27, + VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO = 28, + VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO = 29, + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO = 30, + VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO = 31, + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO = 32, + VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO = 33, + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO = 34, + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET = 35, + VK_STRUCTURE_TYPE_COPY_DESCRIPTOR_SET = 36, + VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO = 37, + VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO = 38, + VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO = 39, + VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO = 40, + VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO = 41, + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO = 42, + VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO = 43, + VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER = 44, + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER = 45, + VK_STRUCTURE_TYPE_MEMORY_BARRIER = 46, + VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO = 47, + VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO = 48, + VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR = 1000001000, + VK_STRUCTURE_TYPE_PRESENT_INFO_KHR = 1000001001, + VK_STRUCTURE_TYPE_DISPLAY_MODE_CREATE_INFO_KHR = 1000002000, + VK_STRUCTURE_TYPE_DISPLAY_SURFACE_CREATE_INFO_KHR = 1000002001, + VK_STRUCTURE_TYPE_DISPLAY_PRESENT_INFO_KHR = 1000003000, + VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR = 1000004000, + VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR = 1000005000, + VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR = 1000006000, + VK_STRUCTURE_TYPE_MIR_SURFACE_CREATE_INFO_KHR = 1000007000, + VK_STRUCTURE_TYPE_ANDROID_SURFACE_CREATE_INFO_KHR = 1000008000, + VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR = 1000009000, + VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT = 1000011000, + VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_RASTERIZATION_ORDER_AMD = 1000018000, + VK_STRUCTURE_TYPE_DEBUG_MARKER_OBJECT_NAME_INFO_EXT = 1000022000, + VK_STRUCTURE_TYPE_DEBUG_MARKER_OBJECT_TAG_INFO_EXT = 1000022001, + VK_STRUCTURE_TYPE_DEBUG_MARKER_MARKER_INFO_EXT = 1000022002, + VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_IMAGE_CREATE_INFO_NV = 1000026000, + VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_BUFFER_CREATE_INFO_NV = 1000026001, + VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_MEMORY_ALLOCATE_INFO_NV = 1000026002, + VK_STRUCTURE_TYPE_TEXTURE_LOD_GATHER_FORMAT_PROPERTIES_AMD = 1000041000, + VK_STRUCTURE_TYPE_RENDER_PASS_MULTIVIEW_CREATE_INFO_KHX = 1000053000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES_KHX = 1000053001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES_KHX = 1000053002, + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_NV = 1000056000, + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_NV = 1000056001, + VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_NV = 1000057000, + VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_NV = 1000057001, + VK_STRUCTURE_TYPE_WIN32_KEYED_MUTEX_ACQUIRE_RELEASE_INFO_NV = 1000058000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR = 1000059000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR = 1000059001, + VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2_KHR = 1000059002, + VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR = 1000059003, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR = 1000059004, + VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2_KHR = 1000059005, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR = 1000059006, + 
VK_STRUCTURE_TYPE_SPARSE_IMAGE_FORMAT_PROPERTIES_2_KHR = 1000059007, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SPARSE_IMAGE_FORMAT_INFO_2_KHR = 1000059008, + VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHX = 1000060000, + VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO_KHX = 1000060001, + VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO_KHX = 1000060002, + VK_STRUCTURE_TYPE_DEVICE_GROUP_RENDER_PASS_BEGIN_INFO_KHX = 1000060003, + VK_STRUCTURE_TYPE_DEVICE_GROUP_COMMAND_BUFFER_BEGIN_INFO_KHX = 1000060004, + VK_STRUCTURE_TYPE_DEVICE_GROUP_SUBMIT_INFO_KHX = 1000060005, + VK_STRUCTURE_TYPE_DEVICE_GROUP_BIND_SPARSE_INFO_KHX = 1000060006, + VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHX = 1000060010, + VK_STRUCTURE_TYPE_DEVICE_GROUP_PRESENT_CAPABILITIES_KHX = 1000060007, + VK_STRUCTURE_TYPE_IMAGE_SWAPCHAIN_CREATE_INFO_KHX = 1000060008, + VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHX = 1000060009, + VK_STRUCTURE_TYPE_DEVICE_GROUP_PRESENT_INFO_KHX = 1000060011, + VK_STRUCTURE_TYPE_DEVICE_GROUP_SWAPCHAIN_CREATE_INFO_KHX = 1000060012, + VK_STRUCTURE_TYPE_VALIDATION_FLAGS_EXT = 1000061000, + VK_STRUCTURE_TYPE_VI_SURFACE_CREATE_INFO_NN = 1000062000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GROUP_PROPERTIES_KHX = 1000070000, + VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO_KHX = 1000070001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR = 1000071000, + VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR = 1000071001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR = 1000071002, + VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR = 1000071003, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR = 1000071004, + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR = 1000072000, + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR = 1000072001, + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR = 1000072002, + VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR = 1000073000, + VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR = 1000073001, + VK_STRUCTURE_TYPE_MEMORY_WIN32_HANDLE_PROPERTIES_KHR = 1000073002, + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR = 1000073003, + VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR = 1000074000, + VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR = 1000074001, + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR = 1000074002, + VK_STRUCTURE_TYPE_WIN32_KEYED_MUTEX_ACQUIRE_RELEASE_INFO_KHR = 1000075000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR = 1000076000, + VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR = 1000076001, + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR = 1000077000, + VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR = 1000078000, + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR = 1000078001, + VK_STRUCTURE_TYPE_D3D12_FENCE_SUBMIT_INFO_KHR = 1000078002, + VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR = 1000078003, + VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR = 1000079000, + VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR = 1000079001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR = 1000080000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR = 1000083000, + VK_STRUCTURE_TYPE_PRESENT_REGIONS_KHR = 1000084000, + VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR = 1000085000, + VK_STRUCTURE_TYPE_OBJECT_TABLE_CREATE_INFO_NVX = 1000086000, + VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NVX = 1000086001, + VK_STRUCTURE_TYPE_CMD_PROCESS_COMMANDS_INFO_NVX = 1000086002, + VK_STRUCTURE_TYPE_CMD_RESERVE_SPACE_FOR_COMMANDS_INFO_NVX = 
1000086003, + VK_STRUCTURE_TYPE_DEVICE_GENERATED_COMMANDS_LIMITS_NVX = 1000086004, + VK_STRUCTURE_TYPE_DEVICE_GENERATED_COMMANDS_FEATURES_NVX = 1000086005, + VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_W_SCALING_STATE_CREATE_INFO_NV = 1000087000, + VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_EXT = 1000090000, + VK_STRUCTURE_TYPE_DISPLAY_POWER_INFO_EXT = 1000091000, + VK_STRUCTURE_TYPE_DEVICE_EVENT_INFO_EXT = 1000091001, + VK_STRUCTURE_TYPE_DISPLAY_EVENT_INFO_EXT = 1000091002, + VK_STRUCTURE_TYPE_SWAPCHAIN_COUNTER_CREATE_INFO_EXT = 1000091003, + VK_STRUCTURE_TYPE_PRESENT_TIMES_INFO_GOOGLE = 1000092000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PER_VIEW_ATTRIBUTES_PROPERTIES_NVX = 1000097000, + VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SWIZZLE_STATE_CREATE_INFO_NV = 1000098000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DISCARD_RECTANGLE_PROPERTIES_EXT = 1000099000, + VK_STRUCTURE_TYPE_PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT = 1000099001, + VK_STRUCTURE_TYPE_HDR_METADATA_EXT = 1000105000, + VK_STRUCTURE_TYPE_SHARED_PRESENT_SURFACE_CAPABILITIES_KHR = 1000111000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO_KHR = 1000112000, + VK_STRUCTURE_TYPE_EXTERNAL_FENCE_PROPERTIES_KHR = 1000112001, + VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO_KHR = 1000113000, + VK_STRUCTURE_TYPE_IMPORT_FENCE_WIN32_HANDLE_INFO_KHR = 1000114000, + VK_STRUCTURE_TYPE_EXPORT_FENCE_WIN32_HANDLE_INFO_KHR = 1000114001, + VK_STRUCTURE_TYPE_FENCE_GET_WIN32_HANDLE_INFO_KHR = 1000114002, + VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR = 1000115000, + VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR = 1000115001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SURFACE_INFO_2_KHR = 1000119000, + VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR = 1000119001, + VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR = 1000119002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES_KHR = 1000120000, + VK_STRUCTURE_TYPE_IOS_SURFACE_CREATE_INFO_MVK = 1000122000, + VK_STRUCTURE_TYPE_MACOS_SURFACE_CREATE_INFO_MVK = 1000123000, + VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR = 1000127000, + VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR = 1000127001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT = 1000130000, + VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT = 1000130001, + VK_STRUCTURE_TYPE_SAMPLE_LOCATIONS_INFO_EXT = 1000143000, + VK_STRUCTURE_TYPE_RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT = 1000143001, + VK_STRUCTURE_TYPE_PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT = 1000143002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT = 1000143003, + VK_STRUCTURE_TYPE_MULTISAMPLE_PROPERTIES_EXT = 1000143004, + VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR = 1000146000, + VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR = 1000146001, + VK_STRUCTURE_TYPE_IMAGE_SPARSE_MEMORY_REQUIREMENTS_INFO_2_KHR = 1000146002, + VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR = 1000146003, + VK_STRUCTURE_TYPE_SPARSE_IMAGE_MEMORY_REQUIREMENTS_2_KHR = 1000146004, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BLEND_OPERATION_ADVANCED_FEATURES_EXT = 1000148000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BLEND_OPERATION_ADVANCED_PROPERTIES_EXT = 1000148001, + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_ADVANCED_STATE_CREATE_INFO_EXT = 1000148002, + VK_STRUCTURE_TYPE_PIPELINE_COVERAGE_TO_COLOR_STATE_CREATE_INFO_NV = 1000149000, + VK_STRUCTURE_TYPE_PIPELINE_COVERAGE_MODULATION_STATE_CREATE_INFO_NV = 1000152000, + VK_STRUCTURE_TYPE_VALIDATION_CACHE_CREATE_INFO_EXT = 1000160000, + VK_STRUCTURE_TYPE_SHADER_MODULE_VALIDATION_CACHE_CREATE_INFO_EXT 
= 1000160001, + VK_STRUCTURE_TYPE_BEGIN_RANGE = VK_STRUCTURE_TYPE_APPLICATION_INFO, + VK_STRUCTURE_TYPE_END_RANGE = VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO, + VK_STRUCTURE_TYPE_RANGE_SIZE = (VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO - VK_STRUCTURE_TYPE_APPLICATION_INFO + 1), + VK_STRUCTURE_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkStructureType; + +typedef enum VkSystemAllocationScope { + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND = 0, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT = 1, + VK_SYSTEM_ALLOCATION_SCOPE_CACHE = 2, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE = 3, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE = 4, + VK_SYSTEM_ALLOCATION_SCOPE_BEGIN_RANGE = VK_SYSTEM_ALLOCATION_SCOPE_COMMAND, + VK_SYSTEM_ALLOCATION_SCOPE_END_RANGE = VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE, + VK_SYSTEM_ALLOCATION_SCOPE_RANGE_SIZE = (VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND + 1), + VK_SYSTEM_ALLOCATION_SCOPE_MAX_ENUM = 0x7FFFFFFF +} VkSystemAllocationScope; + +typedef enum VkInternalAllocationType { + VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE = 0, + VK_INTERNAL_ALLOCATION_TYPE_BEGIN_RANGE = VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE, + VK_INTERNAL_ALLOCATION_TYPE_END_RANGE = VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE, + VK_INTERNAL_ALLOCATION_TYPE_RANGE_SIZE = (VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE - VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE + 1), + VK_INTERNAL_ALLOCATION_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkInternalAllocationType; + +typedef enum VkFormat { + VK_FORMAT_UNDEFINED = 0, + VK_FORMAT_R4G4_UNORM_PACK8 = 1, + VK_FORMAT_R4G4B4A4_UNORM_PACK16 = 2, + VK_FORMAT_B4G4R4A4_UNORM_PACK16 = 3, + VK_FORMAT_R5G6B5_UNORM_PACK16 = 4, + VK_FORMAT_B5G6R5_UNORM_PACK16 = 5, + VK_FORMAT_R5G5B5A1_UNORM_PACK16 = 6, + VK_FORMAT_B5G5R5A1_UNORM_PACK16 = 7, + VK_FORMAT_A1R5G5B5_UNORM_PACK16 = 8, + VK_FORMAT_R8_UNORM = 9, + VK_FORMAT_R8_SNORM = 10, + VK_FORMAT_R8_USCALED = 11, + VK_FORMAT_R8_SSCALED = 12, + VK_FORMAT_R8_UINT = 13, + VK_FORMAT_R8_SINT = 14, + VK_FORMAT_R8_SRGB = 15, + VK_FORMAT_R8G8_UNORM = 16, + VK_FORMAT_R8G8_SNORM = 17, + VK_FORMAT_R8G8_USCALED = 18, + VK_FORMAT_R8G8_SSCALED = 19, + VK_FORMAT_R8G8_UINT = 20, + VK_FORMAT_R8G8_SINT = 21, + VK_FORMAT_R8G8_SRGB = 22, + VK_FORMAT_R8G8B8_UNORM = 23, + VK_FORMAT_R8G8B8_SNORM = 24, + VK_FORMAT_R8G8B8_USCALED = 25, + VK_FORMAT_R8G8B8_SSCALED = 26, + VK_FORMAT_R8G8B8_UINT = 27, + VK_FORMAT_R8G8B8_SINT = 28, + VK_FORMAT_R8G8B8_SRGB = 29, + VK_FORMAT_B8G8R8_UNORM = 30, + VK_FORMAT_B8G8R8_SNORM = 31, + VK_FORMAT_B8G8R8_USCALED = 32, + VK_FORMAT_B8G8R8_SSCALED = 33, + VK_FORMAT_B8G8R8_UINT = 34, + VK_FORMAT_B8G8R8_SINT = 35, + VK_FORMAT_B8G8R8_SRGB = 36, + VK_FORMAT_R8G8B8A8_UNORM = 37, + VK_FORMAT_R8G8B8A8_SNORM = 38, + VK_FORMAT_R8G8B8A8_USCALED = 39, + VK_FORMAT_R8G8B8A8_SSCALED = 40, + VK_FORMAT_R8G8B8A8_UINT = 41, + VK_FORMAT_R8G8B8A8_SINT = 42, + VK_FORMAT_R8G8B8A8_SRGB = 43, + VK_FORMAT_B8G8R8A8_UNORM = 44, + VK_FORMAT_B8G8R8A8_SNORM = 45, + VK_FORMAT_B8G8R8A8_USCALED = 46, + VK_FORMAT_B8G8R8A8_SSCALED = 47, + VK_FORMAT_B8G8R8A8_UINT = 48, + VK_FORMAT_B8G8R8A8_SINT = 49, + VK_FORMAT_B8G8R8A8_SRGB = 50, + VK_FORMAT_A8B8G8R8_UNORM_PACK32 = 51, + VK_FORMAT_A8B8G8R8_SNORM_PACK32 = 52, + VK_FORMAT_A8B8G8R8_USCALED_PACK32 = 53, + VK_FORMAT_A8B8G8R8_SSCALED_PACK32 = 54, + VK_FORMAT_A8B8G8R8_UINT_PACK32 = 55, + VK_FORMAT_A8B8G8R8_SINT_PACK32 = 56, + VK_FORMAT_A8B8G8R8_SRGB_PACK32 = 57, + VK_FORMAT_A2R10G10B10_UNORM_PACK32 = 58, + VK_FORMAT_A2R10G10B10_SNORM_PACK32 = 59, + VK_FORMAT_A2R10G10B10_USCALED_PACK32 = 60, + VK_FORMAT_A2R10G10B10_SSCALED_PACK32 = 61, + 
VK_FORMAT_A2R10G10B10_UINT_PACK32 = 62, + VK_FORMAT_A2R10G10B10_SINT_PACK32 = 63, + VK_FORMAT_A2B10G10R10_UNORM_PACK32 = 64, + VK_FORMAT_A2B10G10R10_SNORM_PACK32 = 65, + VK_FORMAT_A2B10G10R10_USCALED_PACK32 = 66, + VK_FORMAT_A2B10G10R10_SSCALED_PACK32 = 67, + VK_FORMAT_A2B10G10R10_UINT_PACK32 = 68, + VK_FORMAT_A2B10G10R10_SINT_PACK32 = 69, + VK_FORMAT_R16_UNORM = 70, + VK_FORMAT_R16_SNORM = 71, + VK_FORMAT_R16_USCALED = 72, + VK_FORMAT_R16_SSCALED = 73, + VK_FORMAT_R16_UINT = 74, + VK_FORMAT_R16_SINT = 75, + VK_FORMAT_R16_SFLOAT = 76, + VK_FORMAT_R16G16_UNORM = 77, + VK_FORMAT_R16G16_SNORM = 78, + VK_FORMAT_R16G16_USCALED = 79, + VK_FORMAT_R16G16_SSCALED = 80, + VK_FORMAT_R16G16_UINT = 81, + VK_FORMAT_R16G16_SINT = 82, + VK_FORMAT_R16G16_SFLOAT = 83, + VK_FORMAT_R16G16B16_UNORM = 84, + VK_FORMAT_R16G16B16_SNORM = 85, + VK_FORMAT_R16G16B16_USCALED = 86, + VK_FORMAT_R16G16B16_SSCALED = 87, + VK_FORMAT_R16G16B16_UINT = 88, + VK_FORMAT_R16G16B16_SINT = 89, + VK_FORMAT_R16G16B16_SFLOAT = 90, + VK_FORMAT_R16G16B16A16_UNORM = 91, + VK_FORMAT_R16G16B16A16_SNORM = 92, + VK_FORMAT_R16G16B16A16_USCALED = 93, + VK_FORMAT_R16G16B16A16_SSCALED = 94, + VK_FORMAT_R16G16B16A16_UINT = 95, + VK_FORMAT_R16G16B16A16_SINT = 96, + VK_FORMAT_R16G16B16A16_SFLOAT = 97, + VK_FORMAT_R32_UINT = 98, + VK_FORMAT_R32_SINT = 99, + VK_FORMAT_R32_SFLOAT = 100, + VK_FORMAT_R32G32_UINT = 101, + VK_FORMAT_R32G32_SINT = 102, + VK_FORMAT_R32G32_SFLOAT = 103, + VK_FORMAT_R32G32B32_UINT = 104, + VK_FORMAT_R32G32B32_SINT = 105, + VK_FORMAT_R32G32B32_SFLOAT = 106, + VK_FORMAT_R32G32B32A32_UINT = 107, + VK_FORMAT_R32G32B32A32_SINT = 108, + VK_FORMAT_R32G32B32A32_SFLOAT = 109, + VK_FORMAT_R64_UINT = 110, + VK_FORMAT_R64_SINT = 111, + VK_FORMAT_R64_SFLOAT = 112, + VK_FORMAT_R64G64_UINT = 113, + VK_FORMAT_R64G64_SINT = 114, + VK_FORMAT_R64G64_SFLOAT = 115, + VK_FORMAT_R64G64B64_UINT = 116, + VK_FORMAT_R64G64B64_SINT = 117, + VK_FORMAT_R64G64B64_SFLOAT = 118, + VK_FORMAT_R64G64B64A64_UINT = 119, + VK_FORMAT_R64G64B64A64_SINT = 120, + VK_FORMAT_R64G64B64A64_SFLOAT = 121, + VK_FORMAT_B10G11R11_UFLOAT_PACK32 = 122, + VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 = 123, + VK_FORMAT_D16_UNORM = 124, + VK_FORMAT_X8_D24_UNORM_PACK32 = 125, + VK_FORMAT_D32_SFLOAT = 126, + VK_FORMAT_S8_UINT = 127, + VK_FORMAT_D16_UNORM_S8_UINT = 128, + VK_FORMAT_D24_UNORM_S8_UINT = 129, + VK_FORMAT_D32_SFLOAT_S8_UINT = 130, + VK_FORMAT_BC1_RGB_UNORM_BLOCK = 131, + VK_FORMAT_BC1_RGB_SRGB_BLOCK = 132, + VK_FORMAT_BC1_RGBA_UNORM_BLOCK = 133, + VK_FORMAT_BC1_RGBA_SRGB_BLOCK = 134, + VK_FORMAT_BC2_UNORM_BLOCK = 135, + VK_FORMAT_BC2_SRGB_BLOCK = 136, + VK_FORMAT_BC3_UNORM_BLOCK = 137, + VK_FORMAT_BC3_SRGB_BLOCK = 138, + VK_FORMAT_BC4_UNORM_BLOCK = 139, + VK_FORMAT_BC4_SNORM_BLOCK = 140, + VK_FORMAT_BC5_UNORM_BLOCK = 141, + VK_FORMAT_BC5_SNORM_BLOCK = 142, + VK_FORMAT_BC6H_UFLOAT_BLOCK = 143, + VK_FORMAT_BC6H_SFLOAT_BLOCK = 144, + VK_FORMAT_BC7_UNORM_BLOCK = 145, + VK_FORMAT_BC7_SRGB_BLOCK = 146, + VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK = 147, + VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK = 148, + VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK = 149, + VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK = 150, + VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK = 151, + VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK = 152, + VK_FORMAT_EAC_R11_UNORM_BLOCK = 153, + VK_FORMAT_EAC_R11_SNORM_BLOCK = 154, + VK_FORMAT_EAC_R11G11_UNORM_BLOCK = 155, + VK_FORMAT_EAC_R11G11_SNORM_BLOCK = 156, + VK_FORMAT_ASTC_4x4_UNORM_BLOCK = 157, + VK_FORMAT_ASTC_4x4_SRGB_BLOCK = 158, + VK_FORMAT_ASTC_5x4_UNORM_BLOCK = 159, + VK_FORMAT_ASTC_5x4_SRGB_BLOCK = 160, + 
VK_FORMAT_ASTC_5x5_UNORM_BLOCK = 161, + VK_FORMAT_ASTC_5x5_SRGB_BLOCK = 162, + VK_FORMAT_ASTC_6x5_UNORM_BLOCK = 163, + VK_FORMAT_ASTC_6x5_SRGB_BLOCK = 164, + VK_FORMAT_ASTC_6x6_UNORM_BLOCK = 165, + VK_FORMAT_ASTC_6x6_SRGB_BLOCK = 166, + VK_FORMAT_ASTC_8x5_UNORM_BLOCK = 167, + VK_FORMAT_ASTC_8x5_SRGB_BLOCK = 168, + VK_FORMAT_ASTC_8x6_UNORM_BLOCK = 169, + VK_FORMAT_ASTC_8x6_SRGB_BLOCK = 170, + VK_FORMAT_ASTC_8x8_UNORM_BLOCK = 171, + VK_FORMAT_ASTC_8x8_SRGB_BLOCK = 172, + VK_FORMAT_ASTC_10x5_UNORM_BLOCK = 173, + VK_FORMAT_ASTC_10x5_SRGB_BLOCK = 174, + VK_FORMAT_ASTC_10x6_UNORM_BLOCK = 175, + VK_FORMAT_ASTC_10x6_SRGB_BLOCK = 176, + VK_FORMAT_ASTC_10x8_UNORM_BLOCK = 177, + VK_FORMAT_ASTC_10x8_SRGB_BLOCK = 178, + VK_FORMAT_ASTC_10x10_UNORM_BLOCK = 179, + VK_FORMAT_ASTC_10x10_SRGB_BLOCK = 180, + VK_FORMAT_ASTC_12x10_UNORM_BLOCK = 181, + VK_FORMAT_ASTC_12x10_SRGB_BLOCK = 182, + VK_FORMAT_ASTC_12x12_UNORM_BLOCK = 183, + VK_FORMAT_ASTC_12x12_SRGB_BLOCK = 184, + VK_FORMAT_PVRTC1_2BPP_UNORM_BLOCK_IMG = 1000054000, + VK_FORMAT_PVRTC1_4BPP_UNORM_BLOCK_IMG = 1000054001, + VK_FORMAT_PVRTC2_2BPP_UNORM_BLOCK_IMG = 1000054002, + VK_FORMAT_PVRTC2_4BPP_UNORM_BLOCK_IMG = 1000054003, + VK_FORMAT_PVRTC1_2BPP_SRGB_BLOCK_IMG = 1000054004, + VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG = 1000054005, + VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG = 1000054006, + VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG = 1000054007, + VK_FORMAT_BEGIN_RANGE = VK_FORMAT_UNDEFINED, + VK_FORMAT_END_RANGE = VK_FORMAT_ASTC_12x12_SRGB_BLOCK, + VK_FORMAT_RANGE_SIZE = (VK_FORMAT_ASTC_12x12_SRGB_BLOCK - VK_FORMAT_UNDEFINED + 1), + VK_FORMAT_MAX_ENUM = 0x7FFFFFFF +} VkFormat; + +typedef enum VkImageType { + VK_IMAGE_TYPE_1D = 0, + VK_IMAGE_TYPE_2D = 1, + VK_IMAGE_TYPE_3D = 2, + VK_IMAGE_TYPE_BEGIN_RANGE = VK_IMAGE_TYPE_1D, + VK_IMAGE_TYPE_END_RANGE = VK_IMAGE_TYPE_3D, + VK_IMAGE_TYPE_RANGE_SIZE = (VK_IMAGE_TYPE_3D - VK_IMAGE_TYPE_1D + 1), + VK_IMAGE_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkImageType; + +typedef enum VkImageTiling { + VK_IMAGE_TILING_OPTIMAL = 0, + VK_IMAGE_TILING_LINEAR = 1, + VK_IMAGE_TILING_BEGIN_RANGE = VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_TILING_END_RANGE = VK_IMAGE_TILING_LINEAR, + VK_IMAGE_TILING_RANGE_SIZE = (VK_IMAGE_TILING_LINEAR - VK_IMAGE_TILING_OPTIMAL + 1), + VK_IMAGE_TILING_MAX_ENUM = 0x7FFFFFFF +} VkImageTiling; + +typedef enum VkPhysicalDeviceType { + VK_PHYSICAL_DEVICE_TYPE_OTHER = 0, + VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU = 1, + VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU = 2, + VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU = 3, + VK_PHYSICAL_DEVICE_TYPE_CPU = 4, + VK_PHYSICAL_DEVICE_TYPE_BEGIN_RANGE = VK_PHYSICAL_DEVICE_TYPE_OTHER, + VK_PHYSICAL_DEVICE_TYPE_END_RANGE = VK_PHYSICAL_DEVICE_TYPE_CPU, + VK_PHYSICAL_DEVICE_TYPE_RANGE_SIZE = (VK_PHYSICAL_DEVICE_TYPE_CPU - VK_PHYSICAL_DEVICE_TYPE_OTHER + 1), + VK_PHYSICAL_DEVICE_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkPhysicalDeviceType; + +typedef enum VkQueryType { + VK_QUERY_TYPE_OCCLUSION = 0, + VK_QUERY_TYPE_PIPELINE_STATISTICS = 1, + VK_QUERY_TYPE_TIMESTAMP = 2, + VK_QUERY_TYPE_BEGIN_RANGE = VK_QUERY_TYPE_OCCLUSION, + VK_QUERY_TYPE_END_RANGE = VK_QUERY_TYPE_TIMESTAMP, + VK_QUERY_TYPE_RANGE_SIZE = (VK_QUERY_TYPE_TIMESTAMP - VK_QUERY_TYPE_OCCLUSION + 1), + VK_QUERY_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkQueryType; + +typedef enum VkSharingMode { + VK_SHARING_MODE_EXCLUSIVE = 0, + VK_SHARING_MODE_CONCURRENT = 1, + VK_SHARING_MODE_BEGIN_RANGE = VK_SHARING_MODE_EXCLUSIVE, + VK_SHARING_MODE_END_RANGE = VK_SHARING_MODE_CONCURRENT, + VK_SHARING_MODE_RANGE_SIZE = (VK_SHARING_MODE_CONCURRENT - VK_SHARING_MODE_EXCLUSIVE + 1), + 
VK_SHARING_MODE_MAX_ENUM = 0x7FFFFFFF +} VkSharingMode; + +typedef enum VkImageLayout { + VK_IMAGE_LAYOUT_UNDEFINED = 0, + VK_IMAGE_LAYOUT_GENERAL = 1, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL = 2, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL = 3, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL = 4, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL = 5, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL = 6, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL = 7, + VK_IMAGE_LAYOUT_PREINITIALIZED = 8, + VK_IMAGE_LAYOUT_PRESENT_SRC_KHR = 1000001002, + VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR = 1000111000, + VK_IMAGE_LAYOUT_BEGIN_RANGE = VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_LAYOUT_END_RANGE = VK_IMAGE_LAYOUT_PREINITIALIZED, + VK_IMAGE_LAYOUT_RANGE_SIZE = (VK_IMAGE_LAYOUT_PREINITIALIZED - VK_IMAGE_LAYOUT_UNDEFINED + 1), + VK_IMAGE_LAYOUT_MAX_ENUM = 0x7FFFFFFF +} VkImageLayout; + +typedef enum VkImageViewType { + VK_IMAGE_VIEW_TYPE_1D = 0, + VK_IMAGE_VIEW_TYPE_2D = 1, + VK_IMAGE_VIEW_TYPE_3D = 2, + VK_IMAGE_VIEW_TYPE_CUBE = 3, + VK_IMAGE_VIEW_TYPE_1D_ARRAY = 4, + VK_IMAGE_VIEW_TYPE_2D_ARRAY = 5, + VK_IMAGE_VIEW_TYPE_CUBE_ARRAY = 6, + VK_IMAGE_VIEW_TYPE_BEGIN_RANGE = VK_IMAGE_VIEW_TYPE_1D, + VK_IMAGE_VIEW_TYPE_END_RANGE = VK_IMAGE_VIEW_TYPE_CUBE_ARRAY, + VK_IMAGE_VIEW_TYPE_RANGE_SIZE = (VK_IMAGE_VIEW_TYPE_CUBE_ARRAY - VK_IMAGE_VIEW_TYPE_1D + 1), + VK_IMAGE_VIEW_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkImageViewType; + +typedef enum VkComponentSwizzle { + VK_COMPONENT_SWIZZLE_IDENTITY = 0, + VK_COMPONENT_SWIZZLE_ZERO = 1, + VK_COMPONENT_SWIZZLE_ONE = 2, + VK_COMPONENT_SWIZZLE_R = 3, + VK_COMPONENT_SWIZZLE_G = 4, + VK_COMPONENT_SWIZZLE_B = 5, + VK_COMPONENT_SWIZZLE_A = 6, + VK_COMPONENT_SWIZZLE_BEGIN_RANGE = VK_COMPONENT_SWIZZLE_IDENTITY, + VK_COMPONENT_SWIZZLE_END_RANGE = VK_COMPONENT_SWIZZLE_A, + VK_COMPONENT_SWIZZLE_RANGE_SIZE = (VK_COMPONENT_SWIZZLE_A - VK_COMPONENT_SWIZZLE_IDENTITY + 1), + VK_COMPONENT_SWIZZLE_MAX_ENUM = 0x7FFFFFFF +} VkComponentSwizzle; + +typedef enum VkVertexInputRate { + VK_VERTEX_INPUT_RATE_VERTEX = 0, + VK_VERTEX_INPUT_RATE_INSTANCE = 1, + VK_VERTEX_INPUT_RATE_BEGIN_RANGE = VK_VERTEX_INPUT_RATE_VERTEX, + VK_VERTEX_INPUT_RATE_END_RANGE = VK_VERTEX_INPUT_RATE_INSTANCE, + VK_VERTEX_INPUT_RATE_RANGE_SIZE = (VK_VERTEX_INPUT_RATE_INSTANCE - VK_VERTEX_INPUT_RATE_VERTEX + 1), + VK_VERTEX_INPUT_RATE_MAX_ENUM = 0x7FFFFFFF +} VkVertexInputRate; + +typedef enum VkPrimitiveTopology { + VK_PRIMITIVE_TOPOLOGY_POINT_LIST = 0, + VK_PRIMITIVE_TOPOLOGY_LINE_LIST = 1, + VK_PRIMITIVE_TOPOLOGY_LINE_STRIP = 2, + VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST = 3, + VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP = 4, + VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN = 5, + VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY = 6, + VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY = 7, + VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY = 8, + VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY = 9, + VK_PRIMITIVE_TOPOLOGY_PATCH_LIST = 10, + VK_PRIMITIVE_TOPOLOGY_BEGIN_RANGE = VK_PRIMITIVE_TOPOLOGY_POINT_LIST, + VK_PRIMITIVE_TOPOLOGY_END_RANGE = VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, + VK_PRIMITIVE_TOPOLOGY_RANGE_SIZE = (VK_PRIMITIVE_TOPOLOGY_PATCH_LIST - VK_PRIMITIVE_TOPOLOGY_POINT_LIST + 1), + VK_PRIMITIVE_TOPOLOGY_MAX_ENUM = 0x7FFFFFFF +} VkPrimitiveTopology; + +typedef enum VkPolygonMode { + VK_POLYGON_MODE_FILL = 0, + VK_POLYGON_MODE_LINE = 1, + VK_POLYGON_MODE_POINT = 2, + VK_POLYGON_MODE_FILL_RECTANGLE_NV = 1000153000, + VK_POLYGON_MODE_BEGIN_RANGE = VK_POLYGON_MODE_FILL, + VK_POLYGON_MODE_END_RANGE = VK_POLYGON_MODE_POINT, + VK_POLYGON_MODE_RANGE_SIZE = 
(VK_POLYGON_MODE_POINT - VK_POLYGON_MODE_FILL + 1), + VK_POLYGON_MODE_MAX_ENUM = 0x7FFFFFFF +} VkPolygonMode; + +typedef enum VkFrontFace { + VK_FRONT_FACE_COUNTER_CLOCKWISE = 0, + VK_FRONT_FACE_CLOCKWISE = 1, + VK_FRONT_FACE_BEGIN_RANGE = VK_FRONT_FACE_COUNTER_CLOCKWISE, + VK_FRONT_FACE_END_RANGE = VK_FRONT_FACE_CLOCKWISE, + VK_FRONT_FACE_RANGE_SIZE = (VK_FRONT_FACE_CLOCKWISE - VK_FRONT_FACE_COUNTER_CLOCKWISE + 1), + VK_FRONT_FACE_MAX_ENUM = 0x7FFFFFFF +} VkFrontFace; + +typedef enum VkCompareOp { + VK_COMPARE_OP_NEVER = 0, + VK_COMPARE_OP_LESS = 1, + VK_COMPARE_OP_EQUAL = 2, + VK_COMPARE_OP_LESS_OR_EQUAL = 3, + VK_COMPARE_OP_GREATER = 4, + VK_COMPARE_OP_NOT_EQUAL = 5, + VK_COMPARE_OP_GREATER_OR_EQUAL = 6, + VK_COMPARE_OP_ALWAYS = 7, + VK_COMPARE_OP_BEGIN_RANGE = VK_COMPARE_OP_NEVER, + VK_COMPARE_OP_END_RANGE = VK_COMPARE_OP_ALWAYS, + VK_COMPARE_OP_RANGE_SIZE = (VK_COMPARE_OP_ALWAYS - VK_COMPARE_OP_NEVER + 1), + VK_COMPARE_OP_MAX_ENUM = 0x7FFFFFFF +} VkCompareOp; + +typedef enum VkStencilOp { + VK_STENCIL_OP_KEEP = 0, + VK_STENCIL_OP_ZERO = 1, + VK_STENCIL_OP_REPLACE = 2, + VK_STENCIL_OP_INCREMENT_AND_CLAMP = 3, + VK_STENCIL_OP_DECREMENT_AND_CLAMP = 4, + VK_STENCIL_OP_INVERT = 5, + VK_STENCIL_OP_INCREMENT_AND_WRAP = 6, + VK_STENCIL_OP_DECREMENT_AND_WRAP = 7, + VK_STENCIL_OP_BEGIN_RANGE = VK_STENCIL_OP_KEEP, + VK_STENCIL_OP_END_RANGE = VK_STENCIL_OP_DECREMENT_AND_WRAP, + VK_STENCIL_OP_RANGE_SIZE = (VK_STENCIL_OP_DECREMENT_AND_WRAP - VK_STENCIL_OP_KEEP + 1), + VK_STENCIL_OP_MAX_ENUM = 0x7FFFFFFF +} VkStencilOp; + +typedef enum VkLogicOp { + VK_LOGIC_OP_CLEAR = 0, + VK_LOGIC_OP_AND = 1, + VK_LOGIC_OP_AND_REVERSE = 2, + VK_LOGIC_OP_COPY = 3, + VK_LOGIC_OP_AND_INVERTED = 4, + VK_LOGIC_OP_NO_OP = 5, + VK_LOGIC_OP_XOR = 6, + VK_LOGIC_OP_OR = 7, + VK_LOGIC_OP_NOR = 8, + VK_LOGIC_OP_EQUIVALENT = 9, + VK_LOGIC_OP_INVERT = 10, + VK_LOGIC_OP_OR_REVERSE = 11, + VK_LOGIC_OP_COPY_INVERTED = 12, + VK_LOGIC_OP_OR_INVERTED = 13, + VK_LOGIC_OP_NAND = 14, + VK_LOGIC_OP_SET = 15, + VK_LOGIC_OP_BEGIN_RANGE = VK_LOGIC_OP_CLEAR, + VK_LOGIC_OP_END_RANGE = VK_LOGIC_OP_SET, + VK_LOGIC_OP_RANGE_SIZE = (VK_LOGIC_OP_SET - VK_LOGIC_OP_CLEAR + 1), + VK_LOGIC_OP_MAX_ENUM = 0x7FFFFFFF +} VkLogicOp; + +typedef enum VkBlendFactor { + VK_BLEND_FACTOR_ZERO = 0, + VK_BLEND_FACTOR_ONE = 1, + VK_BLEND_FACTOR_SRC_COLOR = 2, + VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR = 3, + VK_BLEND_FACTOR_DST_COLOR = 4, + VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR = 5, + VK_BLEND_FACTOR_SRC_ALPHA = 6, + VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA = 7, + VK_BLEND_FACTOR_DST_ALPHA = 8, + VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA = 9, + VK_BLEND_FACTOR_CONSTANT_COLOR = 10, + VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR = 11, + VK_BLEND_FACTOR_CONSTANT_ALPHA = 12, + VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA = 13, + VK_BLEND_FACTOR_SRC_ALPHA_SATURATE = 14, + VK_BLEND_FACTOR_SRC1_COLOR = 15, + VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR = 16, + VK_BLEND_FACTOR_SRC1_ALPHA = 17, + VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA = 18, + VK_BLEND_FACTOR_BEGIN_RANGE = VK_BLEND_FACTOR_ZERO, + VK_BLEND_FACTOR_END_RANGE = VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA, + VK_BLEND_FACTOR_RANGE_SIZE = (VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA - VK_BLEND_FACTOR_ZERO + 1), + VK_BLEND_FACTOR_MAX_ENUM = 0x7FFFFFFF +} VkBlendFactor; + +typedef enum VkBlendOp { + VK_BLEND_OP_ADD = 0, + VK_BLEND_OP_SUBTRACT = 1, + VK_BLEND_OP_REVERSE_SUBTRACT = 2, + VK_BLEND_OP_MIN = 3, + VK_BLEND_OP_MAX = 4, + VK_BLEND_OP_ZERO_EXT = 1000148000, + VK_BLEND_OP_SRC_EXT = 1000148001, + VK_BLEND_OP_DST_EXT = 1000148002, + VK_BLEND_OP_SRC_OVER_EXT = 
1000148003, + VK_BLEND_OP_DST_OVER_EXT = 1000148004, + VK_BLEND_OP_SRC_IN_EXT = 1000148005, + VK_BLEND_OP_DST_IN_EXT = 1000148006, + VK_BLEND_OP_SRC_OUT_EXT = 1000148007, + VK_BLEND_OP_DST_OUT_EXT = 1000148008, + VK_BLEND_OP_SRC_ATOP_EXT = 1000148009, + VK_BLEND_OP_DST_ATOP_EXT = 1000148010, + VK_BLEND_OP_XOR_EXT = 1000148011, + VK_BLEND_OP_MULTIPLY_EXT = 1000148012, + VK_BLEND_OP_SCREEN_EXT = 1000148013, + VK_BLEND_OP_OVERLAY_EXT = 1000148014, + VK_BLEND_OP_DARKEN_EXT = 1000148015, + VK_BLEND_OP_LIGHTEN_EXT = 1000148016, + VK_BLEND_OP_COLORDODGE_EXT = 1000148017, + VK_BLEND_OP_COLORBURN_EXT = 1000148018, + VK_BLEND_OP_HARDLIGHT_EXT = 1000148019, + VK_BLEND_OP_SOFTLIGHT_EXT = 1000148020, + VK_BLEND_OP_DIFFERENCE_EXT = 1000148021, + VK_BLEND_OP_EXCLUSION_EXT = 1000148022, + VK_BLEND_OP_INVERT_EXT = 1000148023, + VK_BLEND_OP_INVERT_RGB_EXT = 1000148024, + VK_BLEND_OP_LINEARDODGE_EXT = 1000148025, + VK_BLEND_OP_LINEARBURN_EXT = 1000148026, + VK_BLEND_OP_VIVIDLIGHT_EXT = 1000148027, + VK_BLEND_OP_LINEARLIGHT_EXT = 1000148028, + VK_BLEND_OP_PINLIGHT_EXT = 1000148029, + VK_BLEND_OP_HARDMIX_EXT = 1000148030, + VK_BLEND_OP_HSL_HUE_EXT = 1000148031, + VK_BLEND_OP_HSL_SATURATION_EXT = 1000148032, + VK_BLEND_OP_HSL_COLOR_EXT = 1000148033, + VK_BLEND_OP_HSL_LUMINOSITY_EXT = 1000148034, + VK_BLEND_OP_PLUS_EXT = 1000148035, + VK_BLEND_OP_PLUS_CLAMPED_EXT = 1000148036, + VK_BLEND_OP_PLUS_CLAMPED_ALPHA_EXT = 1000148037, + VK_BLEND_OP_PLUS_DARKER_EXT = 1000148038, + VK_BLEND_OP_MINUS_EXT = 1000148039, + VK_BLEND_OP_MINUS_CLAMPED_EXT = 1000148040, + VK_BLEND_OP_CONTRAST_EXT = 1000148041, + VK_BLEND_OP_INVERT_OVG_EXT = 1000148042, + VK_BLEND_OP_RED_EXT = 1000148043, + VK_BLEND_OP_GREEN_EXT = 1000148044, + VK_BLEND_OP_BLUE_EXT = 1000148045, + VK_BLEND_OP_BEGIN_RANGE = VK_BLEND_OP_ADD, + VK_BLEND_OP_END_RANGE = VK_BLEND_OP_MAX, + VK_BLEND_OP_RANGE_SIZE = (VK_BLEND_OP_MAX - VK_BLEND_OP_ADD + 1), + VK_BLEND_OP_MAX_ENUM = 0x7FFFFFFF +} VkBlendOp; + +typedef enum VkDynamicState { + VK_DYNAMIC_STATE_VIEWPORT = 0, + VK_DYNAMIC_STATE_SCISSOR = 1, + VK_DYNAMIC_STATE_LINE_WIDTH = 2, + VK_DYNAMIC_STATE_DEPTH_BIAS = 3, + VK_DYNAMIC_STATE_BLEND_CONSTANTS = 4, + VK_DYNAMIC_STATE_DEPTH_BOUNDS = 5, + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK = 6, + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK = 7, + VK_DYNAMIC_STATE_STENCIL_REFERENCE = 8, + VK_DYNAMIC_STATE_VIEWPORT_W_SCALING_NV = 1000087000, + VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT = 1000099000, + VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT = 1000143000, + VK_DYNAMIC_STATE_BEGIN_RANGE = VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_END_RANGE = VK_DYNAMIC_STATE_STENCIL_REFERENCE, + VK_DYNAMIC_STATE_RANGE_SIZE = (VK_DYNAMIC_STATE_STENCIL_REFERENCE - VK_DYNAMIC_STATE_VIEWPORT + 1), + VK_DYNAMIC_STATE_MAX_ENUM = 0x7FFFFFFF +} VkDynamicState; + +typedef enum VkFilter { + VK_FILTER_NEAREST = 0, + VK_FILTER_LINEAR = 1, + VK_FILTER_CUBIC_IMG = 1000015000, + VK_FILTER_BEGIN_RANGE = VK_FILTER_NEAREST, + VK_FILTER_END_RANGE = VK_FILTER_LINEAR, + VK_FILTER_RANGE_SIZE = (VK_FILTER_LINEAR - VK_FILTER_NEAREST + 1), + VK_FILTER_MAX_ENUM = 0x7FFFFFFF +} VkFilter; + +typedef enum VkSamplerMipmapMode { + VK_SAMPLER_MIPMAP_MODE_NEAREST = 0, + VK_SAMPLER_MIPMAP_MODE_LINEAR = 1, + VK_SAMPLER_MIPMAP_MODE_BEGIN_RANGE = VK_SAMPLER_MIPMAP_MODE_NEAREST, + VK_SAMPLER_MIPMAP_MODE_END_RANGE = VK_SAMPLER_MIPMAP_MODE_LINEAR, + VK_SAMPLER_MIPMAP_MODE_RANGE_SIZE = (VK_SAMPLER_MIPMAP_MODE_LINEAR - VK_SAMPLER_MIPMAP_MODE_NEAREST + 1), + VK_SAMPLER_MIPMAP_MODE_MAX_ENUM = 0x7FFFFFFF +} VkSamplerMipmapMode; + +typedef enum 
VkSamplerAddressMode { + VK_SAMPLER_ADDRESS_MODE_REPEAT = 0, + VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT = 1, + VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE = 2, + VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER = 3, + VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE = 4, + VK_SAMPLER_ADDRESS_MODE_BEGIN_RANGE = VK_SAMPLER_ADDRESS_MODE_REPEAT, + VK_SAMPLER_ADDRESS_MODE_END_RANGE = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + VK_SAMPLER_ADDRESS_MODE_RANGE_SIZE = (VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER - VK_SAMPLER_ADDRESS_MODE_REPEAT + 1), + VK_SAMPLER_ADDRESS_MODE_MAX_ENUM = 0x7FFFFFFF +} VkSamplerAddressMode; + +typedef enum VkBorderColor { + VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK = 0, + VK_BORDER_COLOR_INT_TRANSPARENT_BLACK = 1, + VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK = 2, + VK_BORDER_COLOR_INT_OPAQUE_BLACK = 3, + VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE = 4, + VK_BORDER_COLOR_INT_OPAQUE_WHITE = 5, + VK_BORDER_COLOR_BEGIN_RANGE = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, + VK_BORDER_COLOR_END_RANGE = VK_BORDER_COLOR_INT_OPAQUE_WHITE, + VK_BORDER_COLOR_RANGE_SIZE = (VK_BORDER_COLOR_INT_OPAQUE_WHITE - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK + 1), + VK_BORDER_COLOR_MAX_ENUM = 0x7FFFFFFF +} VkBorderColor; + +typedef enum VkDescriptorType { + VK_DESCRIPTOR_TYPE_SAMPLER = 0, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER = 1, + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE = 2, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE = 3, + VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER = 4, + VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER = 5, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER = 6, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER = 7, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC = 8, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC = 9, + VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT = 10, + VK_DESCRIPTOR_TYPE_BEGIN_RANGE = VK_DESCRIPTOR_TYPE_SAMPLER, + VK_DESCRIPTOR_TYPE_END_RANGE = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, + VK_DESCRIPTOR_TYPE_RANGE_SIZE = (VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT - VK_DESCRIPTOR_TYPE_SAMPLER + 1), + VK_DESCRIPTOR_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkDescriptorType; + +typedef enum VkAttachmentLoadOp { + VK_ATTACHMENT_LOAD_OP_LOAD = 0, + VK_ATTACHMENT_LOAD_OP_CLEAR = 1, + VK_ATTACHMENT_LOAD_OP_DONT_CARE = 2, + VK_ATTACHMENT_LOAD_OP_BEGIN_RANGE = VK_ATTACHMENT_LOAD_OP_LOAD, + VK_ATTACHMENT_LOAD_OP_END_RANGE = VK_ATTACHMENT_LOAD_OP_DONT_CARE, + VK_ATTACHMENT_LOAD_OP_RANGE_SIZE = (VK_ATTACHMENT_LOAD_OP_DONT_CARE - VK_ATTACHMENT_LOAD_OP_LOAD + 1), + VK_ATTACHMENT_LOAD_OP_MAX_ENUM = 0x7FFFFFFF +} VkAttachmentLoadOp; + +typedef enum VkAttachmentStoreOp { + VK_ATTACHMENT_STORE_OP_STORE = 0, + VK_ATTACHMENT_STORE_OP_DONT_CARE = 1, + VK_ATTACHMENT_STORE_OP_BEGIN_RANGE = VK_ATTACHMENT_STORE_OP_STORE, + VK_ATTACHMENT_STORE_OP_END_RANGE = VK_ATTACHMENT_STORE_OP_DONT_CARE, + VK_ATTACHMENT_STORE_OP_RANGE_SIZE = (VK_ATTACHMENT_STORE_OP_DONT_CARE - VK_ATTACHMENT_STORE_OP_STORE + 1), + VK_ATTACHMENT_STORE_OP_MAX_ENUM = 0x7FFFFFFF +} VkAttachmentStoreOp; + +typedef enum VkPipelineBindPoint { + VK_PIPELINE_BIND_POINT_GRAPHICS = 0, + VK_PIPELINE_BIND_POINT_COMPUTE = 1, + VK_PIPELINE_BIND_POINT_BEGIN_RANGE = VK_PIPELINE_BIND_POINT_GRAPHICS, + VK_PIPELINE_BIND_POINT_END_RANGE = VK_PIPELINE_BIND_POINT_COMPUTE, + VK_PIPELINE_BIND_POINT_RANGE_SIZE = (VK_PIPELINE_BIND_POINT_COMPUTE - VK_PIPELINE_BIND_POINT_GRAPHICS + 1), + VK_PIPELINE_BIND_POINT_MAX_ENUM = 0x7FFFFFFF +} VkPipelineBindPoint; + +typedef enum VkCommandBufferLevel { + VK_COMMAND_BUFFER_LEVEL_PRIMARY = 0, + VK_COMMAND_BUFFER_LEVEL_SECONDARY = 1, + VK_COMMAND_BUFFER_LEVEL_BEGIN_RANGE = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + 
VK_COMMAND_BUFFER_LEVEL_END_RANGE = VK_COMMAND_BUFFER_LEVEL_SECONDARY, + VK_COMMAND_BUFFER_LEVEL_RANGE_SIZE = (VK_COMMAND_BUFFER_LEVEL_SECONDARY - VK_COMMAND_BUFFER_LEVEL_PRIMARY + 1), + VK_COMMAND_BUFFER_LEVEL_MAX_ENUM = 0x7FFFFFFF +} VkCommandBufferLevel; + +typedef enum VkIndexType { + VK_INDEX_TYPE_UINT16 = 0, + VK_INDEX_TYPE_UINT32 = 1, + VK_INDEX_TYPE_BEGIN_RANGE = VK_INDEX_TYPE_UINT16, + VK_INDEX_TYPE_END_RANGE = VK_INDEX_TYPE_UINT32, + VK_INDEX_TYPE_RANGE_SIZE = (VK_INDEX_TYPE_UINT32 - VK_INDEX_TYPE_UINT16 + 1), + VK_INDEX_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkIndexType; + +typedef enum VkSubpassContents { + VK_SUBPASS_CONTENTS_INLINE = 0, + VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS = 1, + VK_SUBPASS_CONTENTS_BEGIN_RANGE = VK_SUBPASS_CONTENTS_INLINE, + VK_SUBPASS_CONTENTS_END_RANGE = VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS, + VK_SUBPASS_CONTENTS_RANGE_SIZE = (VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS - VK_SUBPASS_CONTENTS_INLINE + 1), + VK_SUBPASS_CONTENTS_MAX_ENUM = 0x7FFFFFFF +} VkSubpassContents; + +typedef enum VkObjectType { + VK_OBJECT_TYPE_UNKNOWN = 0, + VK_OBJECT_TYPE_INSTANCE = 1, + VK_OBJECT_TYPE_PHYSICAL_DEVICE = 2, + VK_OBJECT_TYPE_DEVICE = 3, + VK_OBJECT_TYPE_QUEUE = 4, + VK_OBJECT_TYPE_SEMAPHORE = 5, + VK_OBJECT_TYPE_COMMAND_BUFFER = 6, + VK_OBJECT_TYPE_FENCE = 7, + VK_OBJECT_TYPE_DEVICE_MEMORY = 8, + VK_OBJECT_TYPE_BUFFER = 9, + VK_OBJECT_TYPE_IMAGE = 10, + VK_OBJECT_TYPE_EVENT = 11, + VK_OBJECT_TYPE_QUERY_POOL = 12, + VK_OBJECT_TYPE_BUFFER_VIEW = 13, + VK_OBJECT_TYPE_IMAGE_VIEW = 14, + VK_OBJECT_TYPE_SHADER_MODULE = 15, + VK_OBJECT_TYPE_PIPELINE_CACHE = 16, + VK_OBJECT_TYPE_PIPELINE_LAYOUT = 17, + VK_OBJECT_TYPE_RENDER_PASS = 18, + VK_OBJECT_TYPE_PIPELINE = 19, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT = 20, + VK_OBJECT_TYPE_SAMPLER = 21, + VK_OBJECT_TYPE_DESCRIPTOR_POOL = 22, + VK_OBJECT_TYPE_DESCRIPTOR_SET = 23, + VK_OBJECT_TYPE_FRAMEBUFFER = 24, + VK_OBJECT_TYPE_COMMAND_POOL = 25, + VK_OBJECT_TYPE_SURFACE_KHR = 1000000000, + VK_OBJECT_TYPE_SWAPCHAIN_KHR = 1000001000, + VK_OBJECT_TYPE_DISPLAY_KHR = 1000002000, + VK_OBJECT_TYPE_DISPLAY_MODE_KHR = 1000002001, + VK_OBJECT_TYPE_DEBUG_REPORT_CALLBACK_EXT = 1000011000, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR = 1000085000, + VK_OBJECT_TYPE_OBJECT_TABLE_NVX = 1000086000, + VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NVX = 1000086001, + VK_OBJECT_TYPE_VALIDATION_CACHE_EXT = 1000160000, + VK_OBJECT_TYPE_BEGIN_RANGE = VK_OBJECT_TYPE_UNKNOWN, + VK_OBJECT_TYPE_END_RANGE = VK_OBJECT_TYPE_COMMAND_POOL, + VK_OBJECT_TYPE_RANGE_SIZE = (VK_OBJECT_TYPE_COMMAND_POOL - VK_OBJECT_TYPE_UNKNOWN + 1), + VK_OBJECT_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkObjectType; + +typedef VkFlags VkInstanceCreateFlags; + +typedef enum VkFormatFeatureFlagBits { + VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT = 0x00000001, + VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT = 0x00000002, + VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT = 0x00000004, + VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT = 0x00000008, + VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT = 0x00000010, + VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT = 0x00000020, + VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT = 0x00000040, + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT = 0x00000080, + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT = 0x00000100, + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT = 0x00000200, + VK_FORMAT_FEATURE_BLIT_SRC_BIT = 0x00000400, + VK_FORMAT_FEATURE_BLIT_DST_BIT = 0x00000800, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT = 0x00001000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_IMG = 0x00002000, 
+ VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR = 0x00004000, + VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR = 0x00008000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT = 0x00010000, + VK_FORMAT_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkFormatFeatureFlagBits; +typedef VkFlags VkFormatFeatureFlags; + +typedef enum VkImageUsageFlagBits { + VK_IMAGE_USAGE_TRANSFER_SRC_BIT = 0x00000001, + VK_IMAGE_USAGE_TRANSFER_DST_BIT = 0x00000002, + VK_IMAGE_USAGE_SAMPLED_BIT = 0x00000004, + VK_IMAGE_USAGE_STORAGE_BIT = 0x00000008, + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT = 0x00000010, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT = 0x00000020, + VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT = 0x00000040, + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT = 0x00000080, + VK_IMAGE_USAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkImageUsageFlagBits; +typedef VkFlags VkImageUsageFlags; + +typedef enum VkImageCreateFlagBits { + VK_IMAGE_CREATE_SPARSE_BINDING_BIT = 0x00000001, + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT = 0x00000002, + VK_IMAGE_CREATE_SPARSE_ALIASED_BIT = 0x00000004, + VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT = 0x00000008, + VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT = 0x00000010, + VK_IMAGE_CREATE_BIND_SFR_BIT_KHX = 0x00000040, + VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT_KHR = 0x00000020, + VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT = 0x00001000, + VK_IMAGE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkImageCreateFlagBits; +typedef VkFlags VkImageCreateFlags; + +typedef enum VkSampleCountFlagBits { + VK_SAMPLE_COUNT_1_BIT = 0x00000001, + VK_SAMPLE_COUNT_2_BIT = 0x00000002, + VK_SAMPLE_COUNT_4_BIT = 0x00000004, + VK_SAMPLE_COUNT_8_BIT = 0x00000008, + VK_SAMPLE_COUNT_16_BIT = 0x00000010, + VK_SAMPLE_COUNT_32_BIT = 0x00000020, + VK_SAMPLE_COUNT_64_BIT = 0x00000040, + VK_SAMPLE_COUNT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSampleCountFlagBits; +typedef VkFlags VkSampleCountFlags; + +typedef enum VkQueueFlagBits { + VK_QUEUE_GRAPHICS_BIT = 0x00000001, + VK_QUEUE_COMPUTE_BIT = 0x00000002, + VK_QUEUE_TRANSFER_BIT = 0x00000004, + VK_QUEUE_SPARSE_BINDING_BIT = 0x00000008, + VK_QUEUE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkQueueFlagBits; +typedef VkFlags VkQueueFlags; + +typedef enum VkMemoryPropertyFlagBits { + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT = 0x00000001, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT = 0x00000002, + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT = 0x00000004, + VK_MEMORY_PROPERTY_HOST_CACHED_BIT = 0x00000008, + VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT = 0x00000010, + VK_MEMORY_PROPERTY_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkMemoryPropertyFlagBits; +typedef VkFlags VkMemoryPropertyFlags; + +typedef enum VkMemoryHeapFlagBits { + VK_MEMORY_HEAP_DEVICE_LOCAL_BIT = 0x00000001, + VK_MEMORY_HEAP_MULTI_INSTANCE_BIT_KHX = 0x00000002, + VK_MEMORY_HEAP_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkMemoryHeapFlagBits; +typedef VkFlags VkMemoryHeapFlags; +typedef VkFlags VkDeviceCreateFlags; +typedef VkFlags VkDeviceQueueCreateFlags; + +typedef enum VkPipelineStageFlagBits { + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT = 0x00000001, + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT = 0x00000002, + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT = 0x00000004, + VK_PIPELINE_STAGE_VERTEX_SHADER_BIT = 0x00000008, + VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT = 0x00000010, + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT = 0x00000020, + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT = 0x00000040, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT = 0x00000080, + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT = 0x00000100, + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT = 0x00000200, + 
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT = 0x00000400, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT = 0x00000800, + VK_PIPELINE_STAGE_TRANSFER_BIT = 0x00001000, + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT = 0x00002000, + VK_PIPELINE_STAGE_HOST_BIT = 0x00004000, + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT = 0x00008000, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT = 0x00010000, + VK_PIPELINE_STAGE_COMMAND_PROCESS_BIT_NVX = 0x00020000, + VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkPipelineStageFlagBits; +typedef VkFlags VkPipelineStageFlags; +typedef VkFlags VkMemoryMapFlags; + +typedef enum VkImageAspectFlagBits { + VK_IMAGE_ASPECT_COLOR_BIT = 0x00000001, + VK_IMAGE_ASPECT_DEPTH_BIT = 0x00000002, + VK_IMAGE_ASPECT_STENCIL_BIT = 0x00000004, + VK_IMAGE_ASPECT_METADATA_BIT = 0x00000008, + VK_IMAGE_ASPECT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkImageAspectFlagBits; +typedef VkFlags VkImageAspectFlags; + +typedef enum VkSparseImageFormatFlagBits { + VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT = 0x00000001, + VK_SPARSE_IMAGE_FORMAT_ALIGNED_MIP_SIZE_BIT = 0x00000002, + VK_SPARSE_IMAGE_FORMAT_NONSTANDARD_BLOCK_SIZE_BIT = 0x00000004, + VK_SPARSE_IMAGE_FORMAT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSparseImageFormatFlagBits; +typedef VkFlags VkSparseImageFormatFlags; + +typedef enum VkSparseMemoryBindFlagBits { + VK_SPARSE_MEMORY_BIND_METADATA_BIT = 0x00000001, + VK_SPARSE_MEMORY_BIND_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSparseMemoryBindFlagBits; +typedef VkFlags VkSparseMemoryBindFlags; + +typedef enum VkFenceCreateFlagBits { + VK_FENCE_CREATE_SIGNALED_BIT = 0x00000001, + VK_FENCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkFenceCreateFlagBits; +typedef VkFlags VkFenceCreateFlags; +typedef VkFlags VkSemaphoreCreateFlags; +typedef VkFlags VkEventCreateFlags; +typedef VkFlags VkQueryPoolCreateFlags; + +typedef enum VkQueryPipelineStatisticFlagBits { + VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT = 0x00000001, + VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT = 0x00000002, + VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT = 0x00000004, + VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT = 0x00000008, + VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT = 0x00000010, + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT = 0x00000020, + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT = 0x00000040, + VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT = 0x00000080, + VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT = 0x00000100, + VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT = 0x00000200, + VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT = 0x00000400, + VK_QUERY_PIPELINE_STATISTIC_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkQueryPipelineStatisticFlagBits; +typedef VkFlags VkQueryPipelineStatisticFlags; + +typedef enum VkQueryResultFlagBits { + VK_QUERY_RESULT_64_BIT = 0x00000001, + VK_QUERY_RESULT_WAIT_BIT = 0x00000002, + VK_QUERY_RESULT_WITH_AVAILABILITY_BIT = 0x00000004, + VK_QUERY_RESULT_PARTIAL_BIT = 0x00000008, + VK_QUERY_RESULT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkQueryResultFlagBits; +typedef VkFlags VkQueryResultFlags; + +typedef enum VkBufferCreateFlagBits { + VK_BUFFER_CREATE_SPARSE_BINDING_BIT = 0x00000001, + VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT = 0x00000002, + VK_BUFFER_CREATE_SPARSE_ALIASED_BIT = 0x00000004, + VK_BUFFER_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkBufferCreateFlagBits; +typedef VkFlags VkBufferCreateFlags; + +typedef enum VkBufferUsageFlagBits { + VK_BUFFER_USAGE_TRANSFER_SRC_BIT = 
0x00000001, + VK_BUFFER_USAGE_TRANSFER_DST_BIT = 0x00000002, + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT = 0x00000004, + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT = 0x00000008, + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT = 0x00000010, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT = 0x00000020, + VK_BUFFER_USAGE_INDEX_BUFFER_BIT = 0x00000040, + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT = 0x00000080, + VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT = 0x00000100, + VK_BUFFER_USAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkBufferUsageFlagBits; +typedef VkFlags VkBufferUsageFlags; +typedef VkFlags VkBufferViewCreateFlags; +typedef VkFlags VkImageViewCreateFlags; +typedef VkFlags VkShaderModuleCreateFlags; +typedef VkFlags VkPipelineCacheCreateFlags; + +typedef enum VkPipelineCreateFlagBits { + VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT = 0x00000001, + VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT = 0x00000002, + VK_PIPELINE_CREATE_DERIVATIVE_BIT = 0x00000004, + VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHX = 0x00000008, + VK_PIPELINE_CREATE_DISPATCH_BASE_KHX = 0x00000010, + VK_PIPELINE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkPipelineCreateFlagBits; +typedef VkFlags VkPipelineCreateFlags; +typedef VkFlags VkPipelineShaderStageCreateFlags; + +typedef enum VkShaderStageFlagBits { + VK_SHADER_STAGE_VERTEX_BIT = 0x00000001, + VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT = 0x00000002, + VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT = 0x00000004, + VK_SHADER_STAGE_GEOMETRY_BIT = 0x00000008, + VK_SHADER_STAGE_FRAGMENT_BIT = 0x00000010, + VK_SHADER_STAGE_COMPUTE_BIT = 0x00000020, + VK_SHADER_STAGE_ALL_GRAPHICS = 0x0000001F, + VK_SHADER_STAGE_ALL = 0x7FFFFFFF, + VK_SHADER_STAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkShaderStageFlagBits; +typedef VkFlags VkPipelineVertexInputStateCreateFlags; +typedef VkFlags VkPipelineInputAssemblyStateCreateFlags; +typedef VkFlags VkPipelineTessellationStateCreateFlags; +typedef VkFlags VkPipelineViewportStateCreateFlags; +typedef VkFlags VkPipelineRasterizationStateCreateFlags; + +typedef enum VkCullModeFlagBits { + VK_CULL_MODE_NONE = 0, + VK_CULL_MODE_FRONT_BIT = 0x00000001, + VK_CULL_MODE_BACK_BIT = 0x00000002, + VK_CULL_MODE_FRONT_AND_BACK = 0x00000003, + VK_CULL_MODE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkCullModeFlagBits; +typedef VkFlags VkCullModeFlags; +typedef VkFlags VkPipelineMultisampleStateCreateFlags; +typedef VkFlags VkPipelineDepthStencilStateCreateFlags; +typedef VkFlags VkPipelineColorBlendStateCreateFlags; + +typedef enum VkColorComponentFlagBits { + VK_COLOR_COMPONENT_R_BIT = 0x00000001, + VK_COLOR_COMPONENT_G_BIT = 0x00000002, + VK_COLOR_COMPONENT_B_BIT = 0x00000004, + VK_COLOR_COMPONENT_A_BIT = 0x00000008, + VK_COLOR_COMPONENT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkColorComponentFlagBits; +typedef VkFlags VkColorComponentFlags; +typedef VkFlags VkPipelineDynamicStateCreateFlags; +typedef VkFlags VkPipelineLayoutCreateFlags; +typedef VkFlags VkShaderStageFlags; +typedef VkFlags VkSamplerCreateFlags; + +typedef enum VkDescriptorSetLayoutCreateFlagBits { + VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR = 0x00000001, + VK_DESCRIPTOR_SET_LAYOUT_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkDescriptorSetLayoutCreateFlagBits; +typedef VkFlags VkDescriptorSetLayoutCreateFlags; + +typedef enum VkDescriptorPoolCreateFlagBits { + VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT = 0x00000001, + VK_DESCRIPTOR_POOL_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkDescriptorPoolCreateFlagBits; +typedef VkFlags VkDescriptorPoolCreateFlags; +typedef VkFlags VkDescriptorPoolResetFlags; +typedef VkFlags 
VkFramebufferCreateFlags; +typedef VkFlags VkRenderPassCreateFlags; + +typedef enum VkAttachmentDescriptionFlagBits { + VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT = 0x00000001, + VK_ATTACHMENT_DESCRIPTION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkAttachmentDescriptionFlagBits; +typedef VkFlags VkAttachmentDescriptionFlags; + +typedef enum VkSubpassDescriptionFlagBits { + VK_SUBPASS_DESCRIPTION_PER_VIEW_ATTRIBUTES_BIT_NVX = 0x00000001, + VK_SUBPASS_DESCRIPTION_PER_VIEW_POSITION_X_ONLY_BIT_NVX = 0x00000002, + VK_SUBPASS_DESCRIPTION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSubpassDescriptionFlagBits; +typedef VkFlags VkSubpassDescriptionFlags; + +typedef enum VkAccessFlagBits { + VK_ACCESS_INDIRECT_COMMAND_READ_BIT = 0x00000001, + VK_ACCESS_INDEX_READ_BIT = 0x00000002, + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT = 0x00000004, + VK_ACCESS_UNIFORM_READ_BIT = 0x00000008, + VK_ACCESS_INPUT_ATTACHMENT_READ_BIT = 0x00000010, + VK_ACCESS_SHADER_READ_BIT = 0x00000020, + VK_ACCESS_SHADER_WRITE_BIT = 0x00000040, + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT = 0x00000080, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT = 0x00000100, + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT = 0x00000200, + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT = 0x00000400, + VK_ACCESS_TRANSFER_READ_BIT = 0x00000800, + VK_ACCESS_TRANSFER_WRITE_BIT = 0x00001000, + VK_ACCESS_HOST_READ_BIT = 0x00002000, + VK_ACCESS_HOST_WRITE_BIT = 0x00004000, + VK_ACCESS_MEMORY_READ_BIT = 0x00008000, + VK_ACCESS_MEMORY_WRITE_BIT = 0x00010000, + VK_ACCESS_COMMAND_PROCESS_READ_BIT_NVX = 0x00020000, + VK_ACCESS_COMMAND_PROCESS_WRITE_BIT_NVX = 0x00040000, + VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT = 0x00080000, + VK_ACCESS_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkAccessFlagBits; +typedef VkFlags VkAccessFlags; + +typedef enum VkDependencyFlagBits { + VK_DEPENDENCY_BY_REGION_BIT = 0x00000001, + VK_DEPENDENCY_VIEW_LOCAL_BIT_KHX = 0x00000002, + VK_DEPENDENCY_DEVICE_GROUP_BIT_KHX = 0x00000004, + VK_DEPENDENCY_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkDependencyFlagBits; +typedef VkFlags VkDependencyFlags; + +typedef enum VkCommandPoolCreateFlagBits { + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT = 0x00000001, + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT = 0x00000002, + VK_COMMAND_POOL_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkCommandPoolCreateFlagBits; +typedef VkFlags VkCommandPoolCreateFlags; + +typedef enum VkCommandPoolResetFlagBits { + VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT = 0x00000001, + VK_COMMAND_POOL_RESET_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkCommandPoolResetFlagBits; +typedef VkFlags VkCommandPoolResetFlags; + +typedef enum VkCommandBufferUsageFlagBits { + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT = 0x00000001, + VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT = 0x00000002, + VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT = 0x00000004, + VK_COMMAND_BUFFER_USAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkCommandBufferUsageFlagBits; +typedef VkFlags VkCommandBufferUsageFlags; + +typedef enum VkQueryControlFlagBits { + VK_QUERY_CONTROL_PRECISE_BIT = 0x00000001, + VK_QUERY_CONTROL_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkQueryControlFlagBits; +typedef VkFlags VkQueryControlFlags; + +typedef enum VkCommandBufferResetFlagBits { + VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT = 0x00000001, + VK_COMMAND_BUFFER_RESET_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkCommandBufferResetFlagBits; +typedef VkFlags VkCommandBufferResetFlags; + +typedef enum VkStencilFaceFlagBits { + VK_STENCIL_FACE_FRONT_BIT = 0x00000001, + VK_STENCIL_FACE_BACK_BIT = 0x00000002, + VK_STENCIL_FRONT_AND_BACK = 0x00000003, 
+ VK_STENCIL_FACE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkStencilFaceFlagBits; +typedef VkFlags VkStencilFaceFlags; + +typedef struct VkApplicationInfo { + VkStructureType sType; + const void *pNext; + const char *pApplicationName; + uint32_t applicationVersion; + const char *pEngineName; + uint32_t engineVersion; + uint32_t apiVersion; +} VkApplicationInfo; + +typedef struct VkInstanceCreateInfo { + VkStructureType sType; + const void *pNext; + VkInstanceCreateFlags flags; + const VkApplicationInfo *pApplicationInfo; + uint32_t enabledLayerCount; + const char *const *ppEnabledLayerNames; + uint32_t enabledExtensionCount; + const char *const *ppEnabledExtensionNames; +} VkInstanceCreateInfo; + +typedef void *(VKAPI_PTR *PFN_vkAllocationFunction)( + void *pUserData, + size_t size, + size_t alignment, + VkSystemAllocationScope allocationScope); + +typedef void *(VKAPI_PTR *PFN_vkReallocationFunction)( + void *pUserData, + void *pOriginal, + size_t size, + size_t alignment, + VkSystemAllocationScope allocationScope); + +typedef void(VKAPI_PTR *PFN_vkFreeFunction)( + void *pUserData, + void *pMemory); + +typedef void(VKAPI_PTR *PFN_vkInternalAllocationNotification)( + void *pUserData, + size_t size, + VkInternalAllocationType allocationType, + VkSystemAllocationScope allocationScope); + +typedef void(VKAPI_PTR *PFN_vkInternalFreeNotification)( + void *pUserData, + size_t size, + VkInternalAllocationType allocationType, + VkSystemAllocationScope allocationScope); + +typedef struct VkAllocationCallbacks { + void *pUserData; + PFN_vkAllocationFunction pfnAllocation; + PFN_vkReallocationFunction pfnReallocation; + PFN_vkFreeFunction pfnFree; + PFN_vkInternalAllocationNotification pfnInternalAllocation; + PFN_vkInternalFreeNotification pfnInternalFree; +} VkAllocationCallbacks; + +typedef struct VkPhysicalDeviceFeatures { + VkBool32 robustBufferAccess; + VkBool32 fullDrawIndexUint32; + VkBool32 imageCubeArray; + VkBool32 independentBlend; + VkBool32 geometryShader; + VkBool32 tessellationShader; + VkBool32 sampleRateShading; + VkBool32 dualSrcBlend; + VkBool32 logicOp; + VkBool32 multiDrawIndirect; + VkBool32 drawIndirectFirstInstance; + VkBool32 depthClamp; + VkBool32 depthBiasClamp; + VkBool32 fillModeNonSolid; + VkBool32 depthBounds; + VkBool32 wideLines; + VkBool32 largePoints; + VkBool32 alphaToOne; + VkBool32 multiViewport; + VkBool32 samplerAnisotropy; + VkBool32 textureCompressionETC2; + VkBool32 textureCompressionASTC_LDR; + VkBool32 textureCompressionBC; + VkBool32 occlusionQueryPrecise; + VkBool32 pipelineStatisticsQuery; + VkBool32 vertexPipelineStoresAndAtomics; + VkBool32 fragmentStoresAndAtomics; + VkBool32 shaderTessellationAndGeometryPointSize; + VkBool32 shaderImageGatherExtended; + VkBool32 shaderStorageImageExtendedFormats; + VkBool32 shaderStorageImageMultisample; + VkBool32 shaderStorageImageReadWithoutFormat; + VkBool32 shaderStorageImageWriteWithoutFormat; + VkBool32 shaderUniformBufferArrayDynamicIndexing; + VkBool32 shaderSampledImageArrayDynamicIndexing; + VkBool32 shaderStorageBufferArrayDynamicIndexing; + VkBool32 shaderStorageImageArrayDynamicIndexing; + VkBool32 shaderClipDistance; + VkBool32 shaderCullDistance; + VkBool32 shaderFloat64; + VkBool32 shaderInt64; + VkBool32 shaderInt16; + VkBool32 shaderResourceResidency; + VkBool32 shaderResourceMinLod; + VkBool32 sparseBinding; + VkBool32 sparseResidencyBuffer; + VkBool32 sparseResidencyImage2D; + VkBool32 sparseResidencyImage3D; + VkBool32 sparseResidency2Samples; + VkBool32 sparseResidency4Samples; + VkBool32 
sparseResidency8Samples; + VkBool32 sparseResidency16Samples; + VkBool32 sparseResidencyAliased; + VkBool32 variableMultisampleRate; + VkBool32 inheritedQueries; +} VkPhysicalDeviceFeatures; + +typedef struct VkFormatProperties { + VkFormatFeatureFlags linearTilingFeatures; + VkFormatFeatureFlags optimalTilingFeatures; + VkFormatFeatureFlags bufferFeatures; +} VkFormatProperties; + +typedef struct VkExtent3D { + uint32_t width; + uint32_t height; + uint32_t depth; +} VkExtent3D; + +typedef struct VkImageFormatProperties { + VkExtent3D maxExtent; + uint32_t maxMipLevels; + uint32_t maxArrayLayers; + VkSampleCountFlags sampleCounts; + VkDeviceSize maxResourceSize; +} VkImageFormatProperties; + +typedef struct VkPhysicalDeviceLimits { + uint32_t maxImageDimension1D; + uint32_t maxImageDimension2D; + uint32_t maxImageDimension3D; + uint32_t maxImageDimensionCube; + uint32_t maxImageArrayLayers; + uint32_t maxTexelBufferElements; + uint32_t maxUniformBufferRange; + uint32_t maxStorageBufferRange; + uint32_t maxPushConstantsSize; + uint32_t maxMemoryAllocationCount; + uint32_t maxSamplerAllocationCount; + VkDeviceSize bufferImageGranularity; + VkDeviceSize sparseAddressSpaceSize; + uint32_t maxBoundDescriptorSets; + uint32_t maxPerStageDescriptorSamplers; + uint32_t maxPerStageDescriptorUniformBuffers; + uint32_t maxPerStageDescriptorStorageBuffers; + uint32_t maxPerStageDescriptorSampledImages; + uint32_t maxPerStageDescriptorStorageImages; + uint32_t maxPerStageDescriptorInputAttachments; + uint32_t maxPerStageResources; + uint32_t maxDescriptorSetSamplers; + uint32_t maxDescriptorSetUniformBuffers; + uint32_t maxDescriptorSetUniformBuffersDynamic; + uint32_t maxDescriptorSetStorageBuffers; + uint32_t maxDescriptorSetStorageBuffersDynamic; + uint32_t maxDescriptorSetSampledImages; + uint32_t maxDescriptorSetStorageImages; + uint32_t maxDescriptorSetInputAttachments; + uint32_t maxVertexInputAttributes; + uint32_t maxVertexInputBindings; + uint32_t maxVertexInputAttributeOffset; + uint32_t maxVertexInputBindingStride; + uint32_t maxVertexOutputComponents; + uint32_t maxTessellationGenerationLevel; + uint32_t maxTessellationPatchSize; + uint32_t maxTessellationControlPerVertexInputComponents; + uint32_t maxTessellationControlPerVertexOutputComponents; + uint32_t maxTessellationControlPerPatchOutputComponents; + uint32_t maxTessellationControlTotalOutputComponents; + uint32_t maxTessellationEvaluationInputComponents; + uint32_t maxTessellationEvaluationOutputComponents; + uint32_t maxGeometryShaderInvocations; + uint32_t maxGeometryInputComponents; + uint32_t maxGeometryOutputComponents; + uint32_t maxGeometryOutputVertices; + uint32_t maxGeometryTotalOutputComponents; + uint32_t maxFragmentInputComponents; + uint32_t maxFragmentOutputAttachments; + uint32_t maxFragmentDualSrcAttachments; + uint32_t maxFragmentCombinedOutputResources; + uint32_t maxComputeSharedMemorySize; + uint32_t maxComputeWorkGroupCount[3]; + uint32_t maxComputeWorkGroupInvocations; + uint32_t maxComputeWorkGroupSize[3]; + uint32_t subPixelPrecisionBits; + uint32_t subTexelPrecisionBits; + uint32_t mipmapPrecisionBits; + uint32_t maxDrawIndexedIndexValue; + uint32_t maxDrawIndirectCount; + float maxSamplerLodBias; + float maxSamplerAnisotropy; + uint32_t maxViewports; + uint32_t maxViewportDimensions[2]; + float viewportBoundsRange[2]; + uint32_t viewportSubPixelBits; + size_t minMemoryMapAlignment; + VkDeviceSize minTexelBufferOffsetAlignment; + VkDeviceSize minUniformBufferOffsetAlignment; + VkDeviceSize 
minStorageBufferOffsetAlignment; + int32_t minTexelOffset; + uint32_t maxTexelOffset; + int32_t minTexelGatherOffset; + uint32_t maxTexelGatherOffset; + float minInterpolationOffset; + float maxInterpolationOffset; + uint32_t subPixelInterpolationOffsetBits; + uint32_t maxFramebufferWidth; + uint32_t maxFramebufferHeight; + uint32_t maxFramebufferLayers; + VkSampleCountFlags framebufferColorSampleCounts; + VkSampleCountFlags framebufferDepthSampleCounts; + VkSampleCountFlags framebufferStencilSampleCounts; + VkSampleCountFlags framebufferNoAttachmentsSampleCounts; + uint32_t maxColorAttachments; + VkSampleCountFlags sampledImageColorSampleCounts; + VkSampleCountFlags sampledImageIntegerSampleCounts; + VkSampleCountFlags sampledImageDepthSampleCounts; + VkSampleCountFlags sampledImageStencilSampleCounts; + VkSampleCountFlags storageImageSampleCounts; + uint32_t maxSampleMaskWords; + VkBool32 timestampComputeAndGraphics; + float timestampPeriod; + uint32_t maxClipDistances; + uint32_t maxCullDistances; + uint32_t maxCombinedClipAndCullDistances; + uint32_t discreteQueuePriorities; + float pointSizeRange[2]; + float lineWidthRange[2]; + float pointSizeGranularity; + float lineWidthGranularity; + VkBool32 strictLines; + VkBool32 standardSampleLocations; + VkDeviceSize optimalBufferCopyOffsetAlignment; + VkDeviceSize optimalBufferCopyRowPitchAlignment; + VkDeviceSize nonCoherentAtomSize; +} VkPhysicalDeviceLimits; + +typedef struct VkPhysicalDeviceSparseProperties { + VkBool32 residencyStandard2DBlockShape; + VkBool32 residencyStandard2DMultisampleBlockShape; + VkBool32 residencyStandard3DBlockShape; + VkBool32 residencyAlignedMipSize; + VkBool32 residencyNonResidentStrict; +} VkPhysicalDeviceSparseProperties; + +typedef struct VkPhysicalDeviceProperties { + uint32_t apiVersion; + uint32_t driverVersion; + uint32_t vendorID; + uint32_t deviceID; + VkPhysicalDeviceType deviceType; + char deviceName[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE]; + uint8_t pipelineCacheUUID[VK_UUID_SIZE]; + VkPhysicalDeviceLimits limits; + VkPhysicalDeviceSparseProperties sparseProperties; +} VkPhysicalDeviceProperties; + +typedef struct VkQueueFamilyProperties { + VkQueueFlags queueFlags; + uint32_t queueCount; + uint32_t timestampValidBits; + VkExtent3D minImageTransferGranularity; +} VkQueueFamilyProperties; + +typedef struct VkMemoryType { + VkMemoryPropertyFlags propertyFlags; + uint32_t heapIndex; +} VkMemoryType; + +typedef struct VkMemoryHeap { + VkDeviceSize size; + VkMemoryHeapFlags flags; +} VkMemoryHeap; + +typedef struct VkPhysicalDeviceMemoryProperties { + uint32_t memoryTypeCount; + VkMemoryType memoryTypes[VK_MAX_MEMORY_TYPES]; + uint32_t memoryHeapCount; + VkMemoryHeap memoryHeaps[VK_MAX_MEMORY_HEAPS]; +} VkPhysicalDeviceMemoryProperties; + +typedef void(VKAPI_PTR *PFN_vkVoidFunction)(); +typedef struct VkDeviceQueueCreateInfo { + VkStructureType sType; + const void *pNext; + VkDeviceQueueCreateFlags flags; + uint32_t queueFamilyIndex; + uint32_t queueCount; + const float *pQueuePriorities; +} VkDeviceQueueCreateInfo; + +typedef struct VkDeviceCreateInfo { + VkStructureType sType; + const void *pNext; + VkDeviceCreateFlags flags; + uint32_t queueCreateInfoCount; + const VkDeviceQueueCreateInfo *pQueueCreateInfos; + uint32_t enabledLayerCount; + const char *const *ppEnabledLayerNames; + uint32_t enabledExtensionCount; + const char *const *ppEnabledExtensionNames; + const VkPhysicalDeviceFeatures *pEnabledFeatures; +} VkDeviceCreateInfo; + +typedef struct VkExtensionProperties { + char 
extensionName[VK_MAX_EXTENSION_NAME_SIZE]; + uint32_t specVersion; +} VkExtensionProperties; + +typedef struct VkLayerProperties { + char layerName[VK_MAX_EXTENSION_NAME_SIZE]; + uint32_t specVersion; + uint32_t implementationVersion; + char description[VK_MAX_DESCRIPTION_SIZE]; +} VkLayerProperties; + +typedef struct VkSubmitInfo { + VkStructureType sType; + const void *pNext; + uint32_t waitSemaphoreCount; + const VkSemaphore *pWaitSemaphores; + const VkPipelineStageFlags *pWaitDstStageMask; + uint32_t commandBufferCount; + const VkCommandBuffer *pCommandBuffers; + uint32_t signalSemaphoreCount; + const VkSemaphore *pSignalSemaphores; +} VkSubmitInfo; + +typedef struct VkMemoryAllocateInfo { + VkStructureType sType; + const void *pNext; + VkDeviceSize allocationSize; + uint32_t memoryTypeIndex; +} VkMemoryAllocateInfo; + +typedef struct VkMappedMemoryRange { + VkStructureType sType; + const void *pNext; + VkDeviceMemory memory; + VkDeviceSize offset; + VkDeviceSize size; +} VkMappedMemoryRange; + +typedef struct VkMemoryRequirements { + VkDeviceSize size; + VkDeviceSize alignment; + uint32_t memoryTypeBits; +} VkMemoryRequirements; + +typedef struct VkSparseImageFormatProperties { + VkImageAspectFlags aspectMask; + VkExtent3D imageGranularity; + VkSparseImageFormatFlags flags; +} VkSparseImageFormatProperties; + +typedef struct VkSparseImageMemoryRequirements { + VkSparseImageFormatProperties formatProperties; + uint32_t imageMipTailFirstLod; + VkDeviceSize imageMipTailSize; + VkDeviceSize imageMipTailOffset; + VkDeviceSize imageMipTailStride; +} VkSparseImageMemoryRequirements; + +typedef struct VkSparseMemoryBind { + VkDeviceSize resourceOffset; + VkDeviceSize size; + VkDeviceMemory memory; + VkDeviceSize memoryOffset; + VkSparseMemoryBindFlags flags; +} VkSparseMemoryBind; + +typedef struct VkSparseBufferMemoryBindInfo { + VkBuffer buffer; + uint32_t bindCount; + const VkSparseMemoryBind *pBinds; +} VkSparseBufferMemoryBindInfo; + +typedef struct VkSparseImageOpaqueMemoryBindInfo { + VkImage image; + uint32_t bindCount; + const VkSparseMemoryBind *pBinds; +} VkSparseImageOpaqueMemoryBindInfo; + +typedef struct VkImageSubresource { + VkImageAspectFlags aspectMask; + uint32_t mipLevel; + uint32_t arrayLayer; +} VkImageSubresource; + +typedef struct VkOffset3D { + int32_t x; + int32_t y; + int32_t z; +} VkOffset3D; + +typedef struct VkSparseImageMemoryBind { + VkImageSubresource subresource; + VkOffset3D offset; + VkExtent3D extent; + VkDeviceMemory memory; + VkDeviceSize memoryOffset; + VkSparseMemoryBindFlags flags; +} VkSparseImageMemoryBind; + +typedef struct VkSparseImageMemoryBindInfo { + VkImage image; + uint32_t bindCount; + const VkSparseImageMemoryBind *pBinds; +} VkSparseImageMemoryBindInfo; + +typedef struct VkBindSparseInfo { + VkStructureType sType; + const void *pNext; + uint32_t waitSemaphoreCount; + const VkSemaphore *pWaitSemaphores; + uint32_t bufferBindCount; + const VkSparseBufferMemoryBindInfo *pBufferBinds; + uint32_t imageOpaqueBindCount; + const VkSparseImageOpaqueMemoryBindInfo *pImageOpaqueBinds; + uint32_t imageBindCount; + const VkSparseImageMemoryBindInfo *pImageBinds; + uint32_t signalSemaphoreCount; + const VkSemaphore *pSignalSemaphores; +} VkBindSparseInfo; + +typedef struct VkFenceCreateInfo { + VkStructureType sType; + const void *pNext; + VkFenceCreateFlags flags; +} VkFenceCreateInfo; + +typedef struct VkSemaphoreCreateInfo { + VkStructureType sType; + const void *pNext; + VkSemaphoreCreateFlags flags; +} VkSemaphoreCreateInfo; + +typedef struct 
VkEventCreateInfo { + VkStructureType sType; + const void *pNext; + VkEventCreateFlags flags; +} VkEventCreateInfo; + +typedef struct VkQueryPoolCreateInfo { + VkStructureType sType; + const void *pNext; + VkQueryPoolCreateFlags flags; + VkQueryType queryType; + uint32_t queryCount; + VkQueryPipelineStatisticFlags pipelineStatistics; +} VkQueryPoolCreateInfo; + +typedef struct VkBufferCreateInfo { + VkStructureType sType; + const void *pNext; + VkBufferCreateFlags flags; + VkDeviceSize size; + VkBufferUsageFlags usage; + VkSharingMode sharingMode; + uint32_t queueFamilyIndexCount; + const uint32_t *pQueueFamilyIndices; +} VkBufferCreateInfo; + +typedef struct VkBufferViewCreateInfo { + VkStructureType sType; + const void *pNext; + VkBufferViewCreateFlags flags; + VkBuffer buffer; + VkFormat format; + VkDeviceSize offset; + VkDeviceSize range; +} VkBufferViewCreateInfo; + +typedef struct VkImageCreateInfo { + VkStructureType sType; + const void *pNext; + VkImageCreateFlags flags; + VkImageType imageType; + VkFormat format; + VkExtent3D extent; + uint32_t mipLevels; + uint32_t arrayLayers; + VkSampleCountFlagBits samples; + VkImageTiling tiling; + VkImageUsageFlags usage; + VkSharingMode sharingMode; + uint32_t queueFamilyIndexCount; + const uint32_t *pQueueFamilyIndices; + VkImageLayout initialLayout; +} VkImageCreateInfo; + +typedef struct VkSubresourceLayout { + VkDeviceSize offset; + VkDeviceSize size; + VkDeviceSize rowPitch; + VkDeviceSize arrayPitch; + VkDeviceSize depthPitch; +} VkSubresourceLayout; + +typedef struct VkComponentMapping { + VkComponentSwizzle r; + VkComponentSwizzle g; + VkComponentSwizzle b; + VkComponentSwizzle a; +} VkComponentMapping; + +typedef struct VkImageSubresourceRange { + VkImageAspectFlags aspectMask; + uint32_t baseMipLevel; + uint32_t levelCount; + uint32_t baseArrayLayer; + uint32_t layerCount; +} VkImageSubresourceRange; + +typedef struct VkImageViewCreateInfo { + VkStructureType sType; + const void *pNext; + VkImageViewCreateFlags flags; + VkImage image; + VkImageViewType viewType; + VkFormat format; + VkComponentMapping components; + VkImageSubresourceRange subresourceRange; +} VkImageViewCreateInfo; + +typedef struct VkShaderModuleCreateInfo { + VkStructureType sType; + const void *pNext; + VkShaderModuleCreateFlags flags; + size_t codeSize; + const uint32_t *pCode; +} VkShaderModuleCreateInfo; + +typedef struct VkPipelineCacheCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineCacheCreateFlags flags; + size_t initialDataSize; + const void *pInitialData; +} VkPipelineCacheCreateInfo; + +typedef struct VkSpecializationMapEntry { + uint32_t constantID; + uint32_t offset; + size_t size; +} VkSpecializationMapEntry; + +typedef struct VkSpecializationInfo { + uint32_t mapEntryCount; + const VkSpecializationMapEntry *pMapEntries; + size_t dataSize; + const void *pData; +} VkSpecializationInfo; + +typedef struct VkPipelineShaderStageCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineShaderStageCreateFlags flags; + VkShaderStageFlagBits stage; + VkShaderModule module; + const char *pName; + const VkSpecializationInfo *pSpecializationInfo; +} VkPipelineShaderStageCreateInfo; + +typedef struct VkVertexInputBindingDescription { + uint32_t binding; + uint32_t stride; + VkVertexInputRate inputRate; +} VkVertexInputBindingDescription; + +typedef struct VkVertexInputAttributeDescription { + uint32_t location; + uint32_t binding; + VkFormat format; + uint32_t offset; +} VkVertexInputAttributeDescription; + +typedef struct 
VkPipelineVertexInputStateCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineVertexInputStateCreateFlags flags; + uint32_t vertexBindingDescriptionCount; + const VkVertexInputBindingDescription *pVertexBindingDescriptions; + uint32_t vertexAttributeDescriptionCount; + const VkVertexInputAttributeDescription *pVertexAttributeDescriptions; +} VkPipelineVertexInputStateCreateInfo; + +typedef struct VkPipelineInputAssemblyStateCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineInputAssemblyStateCreateFlags flags; + VkPrimitiveTopology topology; + VkBool32 primitiveRestartEnable; +} VkPipelineInputAssemblyStateCreateInfo; + +typedef struct VkPipelineTessellationStateCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineTessellationStateCreateFlags flags; + uint32_t patchControlPoints; +} VkPipelineTessellationStateCreateInfo; + +typedef struct VkViewport { + float x; + float y; + float width; + float height; + float minDepth; + float maxDepth; +} VkViewport; + +typedef struct VkOffset2D { + int32_t x; + int32_t y; +} VkOffset2D; + +typedef struct VkExtent2D { + uint32_t width; + uint32_t height; +} VkExtent2D; + +typedef struct VkRect2D { + VkOffset2D offset; + VkExtent2D extent; +} VkRect2D; + +typedef struct VkPipelineViewportStateCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineViewportStateCreateFlags flags; + uint32_t viewportCount; + const VkViewport *pViewports; + uint32_t scissorCount; + const VkRect2D *pScissors; +} VkPipelineViewportStateCreateInfo; + +typedef struct VkPipelineRasterizationStateCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineRasterizationStateCreateFlags flags; + VkBool32 depthClampEnable; + VkBool32 rasterizerDiscardEnable; + VkPolygonMode polygonMode; + VkCullModeFlags cullMode; + VkFrontFace frontFace; + VkBool32 depthBiasEnable; + float depthBiasConstantFactor; + float depthBiasClamp; + float depthBiasSlopeFactor; + float lineWidth; +} VkPipelineRasterizationStateCreateInfo; + +typedef struct VkPipelineMultisampleStateCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineMultisampleStateCreateFlags flags; + VkSampleCountFlagBits rasterizationSamples; + VkBool32 sampleShadingEnable; + float minSampleShading; + const VkSampleMask *pSampleMask; + VkBool32 alphaToCoverageEnable; + VkBool32 alphaToOneEnable; +} VkPipelineMultisampleStateCreateInfo; + +typedef struct VkStencilOpState { + VkStencilOp failOp; + VkStencilOp passOp; + VkStencilOp depthFailOp; + VkCompareOp compareOp; + uint32_t compareMask; + uint32_t writeMask; + uint32_t reference; +} VkStencilOpState; + +typedef struct VkPipelineDepthStencilStateCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineDepthStencilStateCreateFlags flags; + VkBool32 depthTestEnable; + VkBool32 depthWriteEnable; + VkCompareOp depthCompareOp; + VkBool32 depthBoundsTestEnable; + VkBool32 stencilTestEnable; + VkStencilOpState front; + VkStencilOpState back; + float minDepthBounds; + float maxDepthBounds; +} VkPipelineDepthStencilStateCreateInfo; + +typedef struct VkPipelineColorBlendAttachmentState { + VkBool32 blendEnable; + VkBlendFactor srcColorBlendFactor; + VkBlendFactor dstColorBlendFactor; + VkBlendOp colorBlendOp; + VkBlendFactor srcAlphaBlendFactor; + VkBlendFactor dstAlphaBlendFactor; + VkBlendOp alphaBlendOp; + VkColorComponentFlags colorWriteMask; +} VkPipelineColorBlendAttachmentState; + +typedef struct VkPipelineColorBlendStateCreateInfo { + VkStructureType sType; + const void 
*pNext; + VkPipelineColorBlendStateCreateFlags flags; + VkBool32 logicOpEnable; + VkLogicOp logicOp; + uint32_t attachmentCount; + const VkPipelineColorBlendAttachmentState *pAttachments; + float blendConstants[4]; +} VkPipelineColorBlendStateCreateInfo; + +typedef struct VkPipelineDynamicStateCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineDynamicStateCreateFlags flags; + uint32_t dynamicStateCount; + const VkDynamicState *pDynamicStates; +} VkPipelineDynamicStateCreateInfo; + +typedef struct VkGraphicsPipelineCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineCreateFlags flags; + uint32_t stageCount; + const VkPipelineShaderStageCreateInfo *pStages; + const VkPipelineVertexInputStateCreateInfo *pVertexInputState; + const VkPipelineInputAssemblyStateCreateInfo *pInputAssemblyState; + const VkPipelineTessellationStateCreateInfo *pTessellationState; + const VkPipelineViewportStateCreateInfo *pViewportState; + const VkPipelineRasterizationStateCreateInfo *pRasterizationState; + const VkPipelineMultisampleStateCreateInfo *pMultisampleState; + const VkPipelineDepthStencilStateCreateInfo *pDepthStencilState; + const VkPipelineColorBlendStateCreateInfo *pColorBlendState; + const VkPipelineDynamicStateCreateInfo *pDynamicState; + VkPipelineLayout layout; + VkRenderPass renderPass; + uint32_t subpass; + VkPipeline basePipelineHandle; + int32_t basePipelineIndex; +} VkGraphicsPipelineCreateInfo; + +typedef struct VkComputePipelineCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineCreateFlags flags; + VkPipelineShaderStageCreateInfo stage; + VkPipelineLayout layout; + VkPipeline basePipelineHandle; + int32_t basePipelineIndex; +} VkComputePipelineCreateInfo; + +typedef struct VkPushConstantRange { + VkShaderStageFlags stageFlags; + uint32_t offset; + uint32_t size; +} VkPushConstantRange; + +typedef struct VkPipelineLayoutCreateInfo { + VkStructureType sType; + const void *pNext; + VkPipelineLayoutCreateFlags flags; + uint32_t setLayoutCount; + const VkDescriptorSetLayout *pSetLayouts; + uint32_t pushConstantRangeCount; + const VkPushConstantRange *pPushConstantRanges; +} VkPipelineLayoutCreateInfo; + +typedef struct VkSamplerCreateInfo { + VkStructureType sType; + const void *pNext; + VkSamplerCreateFlags flags; + VkFilter magFilter; + VkFilter minFilter; + VkSamplerMipmapMode mipmapMode; + VkSamplerAddressMode addressModeU; + VkSamplerAddressMode addressModeV; + VkSamplerAddressMode addressModeW; + float mipLodBias; + VkBool32 anisotropyEnable; + float maxAnisotropy; + VkBool32 compareEnable; + VkCompareOp compareOp; + float minLod; + float maxLod; + VkBorderColor borderColor; + VkBool32 unnormalizedCoordinates; +} VkSamplerCreateInfo; + +typedef struct VkDescriptorSetLayoutBinding { + uint32_t binding; + VkDescriptorType descriptorType; + uint32_t descriptorCount; + VkShaderStageFlags stageFlags; + const VkSampler *pImmutableSamplers; +} VkDescriptorSetLayoutBinding; + +typedef struct VkDescriptorSetLayoutCreateInfo { + VkStructureType sType; + const void *pNext; + VkDescriptorSetLayoutCreateFlags flags; + uint32_t bindingCount; + const VkDescriptorSetLayoutBinding *pBindings; +} VkDescriptorSetLayoutCreateInfo; + +typedef struct VkDescriptorPoolSize { + VkDescriptorType type; + uint32_t descriptorCount; +} VkDescriptorPoolSize; + +typedef struct VkDescriptorPoolCreateInfo { + VkStructureType sType; + const void *pNext; + VkDescriptorPoolCreateFlags flags; + uint32_t maxSets; + uint32_t poolSizeCount; + const VkDescriptorPoolSize 
*pPoolSizes; +} VkDescriptorPoolCreateInfo; + +typedef struct VkDescriptorSetAllocateInfo { + VkStructureType sType; + const void *pNext; + VkDescriptorPool descriptorPool; + uint32_t descriptorSetCount; + const VkDescriptorSetLayout *pSetLayouts; +} VkDescriptorSetAllocateInfo; + +typedef struct VkDescriptorImageInfo { + VkSampler sampler; + VkImageView imageView; + VkImageLayout imageLayout; +} VkDescriptorImageInfo; + +typedef struct VkDescriptorBufferInfo { + VkBuffer buffer; + VkDeviceSize offset; + VkDeviceSize range; +} VkDescriptorBufferInfo; + +typedef struct VkWriteDescriptorSet { + VkStructureType sType; + const void *pNext; + VkDescriptorSet dstSet; + uint32_t dstBinding; + uint32_t dstArrayElement; + uint32_t descriptorCount; + VkDescriptorType descriptorType; + const VkDescriptorImageInfo *pImageInfo; + const VkDescriptorBufferInfo *pBufferInfo; + const VkBufferView *pTexelBufferView; +} VkWriteDescriptorSet; + +typedef struct VkCopyDescriptorSet { + VkStructureType sType; + const void *pNext; + VkDescriptorSet srcSet; + uint32_t srcBinding; + uint32_t srcArrayElement; + VkDescriptorSet dstSet; + uint32_t dstBinding; + uint32_t dstArrayElement; + uint32_t descriptorCount; +} VkCopyDescriptorSet; + +typedef struct VkFramebufferCreateInfo { + VkStructureType sType; + const void *pNext; + VkFramebufferCreateFlags flags; + VkRenderPass renderPass; + uint32_t attachmentCount; + const VkImageView *pAttachments; + uint32_t width; + uint32_t height; + uint32_t layers; +} VkFramebufferCreateInfo; + +typedef struct VkAttachmentDescription { + VkAttachmentDescriptionFlags flags; + VkFormat format; + VkSampleCountFlagBits samples; + VkAttachmentLoadOp loadOp; + VkAttachmentStoreOp storeOp; + VkAttachmentLoadOp stencilLoadOp; + VkAttachmentStoreOp stencilStoreOp; + VkImageLayout initialLayout; + VkImageLayout finalLayout; +} VkAttachmentDescription; + +typedef struct VkAttachmentReference { + uint32_t attachment; + VkImageLayout layout; +} VkAttachmentReference; + +typedef struct VkSubpassDescription { + VkSubpassDescriptionFlags flags; + VkPipelineBindPoint pipelineBindPoint; + uint32_t inputAttachmentCount; + const VkAttachmentReference *pInputAttachments; + uint32_t colorAttachmentCount; + const VkAttachmentReference *pColorAttachments; + const VkAttachmentReference *pResolveAttachments; + const VkAttachmentReference *pDepthStencilAttachment; + uint32_t preserveAttachmentCount; + const uint32_t *pPreserveAttachments; +} VkSubpassDescription; + +typedef struct VkSubpassDependency { + uint32_t srcSubpass; + uint32_t dstSubpass; + VkPipelineStageFlags srcStageMask; + VkPipelineStageFlags dstStageMask; + VkAccessFlags srcAccessMask; + VkAccessFlags dstAccessMask; + VkDependencyFlags dependencyFlags; +} VkSubpassDependency; + +typedef struct VkRenderPassCreateInfo { + VkStructureType sType; + const void *pNext; + VkRenderPassCreateFlags flags; + uint32_t attachmentCount; + const VkAttachmentDescription *pAttachments; + uint32_t subpassCount; + const VkSubpassDescription *pSubpasses; + uint32_t dependencyCount; + const VkSubpassDependency *pDependencies; +} VkRenderPassCreateInfo; + +typedef struct VkCommandPoolCreateInfo { + VkStructureType sType; + const void *pNext; + VkCommandPoolCreateFlags flags; + uint32_t queueFamilyIndex; +} VkCommandPoolCreateInfo; + +typedef struct VkCommandBufferAllocateInfo { + VkStructureType sType; + const void *pNext; + VkCommandPool commandPool; + VkCommandBufferLevel level; + uint32_t commandBufferCount; +} VkCommandBufferAllocateInfo; + +typedef struct 
VkCommandBufferInheritanceInfo { + VkStructureType sType; + const void *pNext; + VkRenderPass renderPass; + uint32_t subpass; + VkFramebuffer framebuffer; + VkBool32 occlusionQueryEnable; + VkQueryControlFlags queryFlags; + VkQueryPipelineStatisticFlags pipelineStatistics; +} VkCommandBufferInheritanceInfo; + +typedef struct VkCommandBufferBeginInfo { + VkStructureType sType; + const void *pNext; + VkCommandBufferUsageFlags flags; + const VkCommandBufferInheritanceInfo *pInheritanceInfo; +} VkCommandBufferBeginInfo; + +typedef struct VkBufferCopy { + VkDeviceSize srcOffset; + VkDeviceSize dstOffset; + VkDeviceSize size; +} VkBufferCopy; + +typedef struct VkImageSubresourceLayers { + VkImageAspectFlags aspectMask; + uint32_t mipLevel; + uint32_t baseArrayLayer; + uint32_t layerCount; +} VkImageSubresourceLayers; + +typedef struct VkImageCopy { + VkImageSubresourceLayers srcSubresource; + VkOffset3D srcOffset; + VkImageSubresourceLayers dstSubresource; + VkOffset3D dstOffset; + VkExtent3D extent; +} VkImageCopy; + +typedef struct VkImageBlit { + VkImageSubresourceLayers srcSubresource; + VkOffset3D srcOffsets[2]; + VkImageSubresourceLayers dstSubresource; + VkOffset3D dstOffsets[2]; +} VkImageBlit; + +typedef struct VkBufferImageCopy { + VkDeviceSize bufferOffset; + uint32_t bufferRowLength; + uint32_t bufferImageHeight; + VkImageSubresourceLayers imageSubresource; + VkOffset3D imageOffset; + VkExtent3D imageExtent; +} VkBufferImageCopy; + +typedef union VkClearColorValue { + float float32[4]; + int32_t int32[4]; + uint32_t uint32[4]; +} VkClearColorValue; + +typedef struct VkClearDepthStencilValue { + float depth; + uint32_t stencil; +} VkClearDepthStencilValue; + +typedef union VkClearValue { + VkClearColorValue color; + VkClearDepthStencilValue depthStencil; +} VkClearValue; + +typedef struct VkClearAttachment { + VkImageAspectFlags aspectMask; + uint32_t colorAttachment; + VkClearValue clearValue; +} VkClearAttachment; + +typedef struct VkClearRect { + VkRect2D rect; + uint32_t baseArrayLayer; + uint32_t layerCount; +} VkClearRect; + +typedef struct VkImageResolve { + VkImageSubresourceLayers srcSubresource; + VkOffset3D srcOffset; + VkImageSubresourceLayers dstSubresource; + VkOffset3D dstOffset; + VkExtent3D extent; +} VkImageResolve; + +typedef struct VkMemoryBarrier { + VkStructureType sType; + const void *pNext; + VkAccessFlags srcAccessMask; + VkAccessFlags dstAccessMask; +} VkMemoryBarrier; + +typedef struct VkBufferMemoryBarrier { + VkStructureType sType; + const void *pNext; + VkAccessFlags srcAccessMask; + VkAccessFlags dstAccessMask; + uint32_t srcQueueFamilyIndex; + uint32_t dstQueueFamilyIndex; + VkBuffer buffer; + VkDeviceSize offset; + VkDeviceSize size; +} VkBufferMemoryBarrier; + +typedef struct VkImageMemoryBarrier { + VkStructureType sType; + const void *pNext; + VkAccessFlags srcAccessMask; + VkAccessFlags dstAccessMask; + VkImageLayout oldLayout; + VkImageLayout newLayout; + uint32_t srcQueueFamilyIndex; + uint32_t dstQueueFamilyIndex; + VkImage image; + VkImageSubresourceRange subresourceRange; +} VkImageMemoryBarrier; + +typedef struct VkRenderPassBeginInfo { + VkStructureType sType; + const void *pNext; + VkRenderPass renderPass; + VkFramebuffer framebuffer; + VkRect2D renderArea; + uint32_t clearValueCount; + const VkClearValue *pClearValues; +} VkRenderPassBeginInfo; + +typedef struct VkDispatchIndirectCommand { + uint32_t x; + uint32_t y; + uint32_t z; +} VkDispatchIndirectCommand; + +typedef struct VkDrawIndexedIndirectCommand { + uint32_t indexCount; + 
uint32_t instanceCount; + uint32_t firstIndex; + int32_t vertexOffset; + uint32_t firstInstance; +} VkDrawIndexedIndirectCommand; + +typedef struct VkDrawIndirectCommand { + uint32_t vertexCount; + uint32_t instanceCount; + uint32_t firstVertex; + uint32_t firstInstance; +} VkDrawIndirectCommand; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateInstance)(const VkInstanceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkInstance *pInstance); +typedef void(VKAPI_PTR *PFN_vkDestroyInstance)(VkInstance instance, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkEnumeratePhysicalDevices)(VkInstance instance, uint32_t *pPhysicalDeviceCount, VkPhysicalDevice *pPhysicalDevices); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceFeatures)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures *pFeatures); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceFormatProperties)(VkPhysicalDevice physicalDevice, VkFormat format, VkFormatProperties *pFormatProperties); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceImageFormatProperties)(VkPhysicalDevice physicalDevice, VkFormat format, VkImageType type, VkImageTiling tiling, VkImageUsageFlags usage, VkImageCreateFlags flags, VkImageFormatProperties *pImageFormatProperties); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceProperties)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties *pProperties); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceQueueFamilyProperties)(VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, VkQueueFamilyProperties *pQueueFamilyProperties); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceMemoryProperties)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties *pMemoryProperties); +typedef PFN_vkVoidFunction(VKAPI_PTR *PFN_vkGetInstanceProcAddr)(VkInstance instance, const char *pName); +typedef PFN_vkVoidFunction(VKAPI_PTR *PFN_vkGetDeviceProcAddr)(VkDevice device, const char *pName); +typedef VkResult(VKAPI_PTR *PFN_vkCreateDevice)(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkDevice *pDevice); +typedef void(VKAPI_PTR *PFN_vkDestroyDevice)(VkDevice device, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkEnumerateInstanceExtensionProperties)(const char *pLayerName, uint32_t *pPropertyCount, VkExtensionProperties *pProperties); +typedef VkResult(VKAPI_PTR *PFN_vkEnumerateDeviceExtensionProperties)(VkPhysicalDevice physicalDevice, const char *pLayerName, uint32_t *pPropertyCount, VkExtensionProperties *pProperties); +typedef VkResult(VKAPI_PTR *PFN_vkEnumerateInstanceLayerProperties)(uint32_t *pPropertyCount, VkLayerProperties *pProperties); +typedef VkResult(VKAPI_PTR *PFN_vkEnumerateDeviceLayerProperties)(VkPhysicalDevice physicalDevice, uint32_t *pPropertyCount, VkLayerProperties *pProperties); +typedef void(VKAPI_PTR *PFN_vkGetDeviceQueue)(VkDevice device, uint32_t queueFamilyIndex, uint32_t queueIndex, VkQueue *pQueue); +typedef VkResult(VKAPI_PTR *PFN_vkQueueSubmit)(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence); +typedef VkResult(VKAPI_PTR *PFN_vkQueueWaitIdle)(VkQueue queue); +typedef VkResult(VKAPI_PTR *PFN_vkDeviceWaitIdle)(VkDevice device); +typedef VkResult(VKAPI_PTR *PFN_vkAllocateMemory)(VkDevice device, const VkMemoryAllocateInfo *pAllocateInfo, const VkAllocationCallbacks *pAllocator, VkDeviceMemory *pMemory); +typedef void(VKAPI_PTR *PFN_vkFreeMemory)(VkDevice device, VkDeviceMemory memory, const 
VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkMapMemory)(VkDevice device, VkDeviceMemory memory, VkDeviceSize offset, VkDeviceSize size, VkMemoryMapFlags flags, void **ppData); +typedef void(VKAPI_PTR *PFN_vkUnmapMemory)(VkDevice device, VkDeviceMemory memory); +typedef VkResult(VKAPI_PTR *PFN_vkFlushMappedMemoryRanges)(VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges); +typedef VkResult(VKAPI_PTR *PFN_vkInvalidateMappedMemoryRanges)(VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges); +typedef void(VKAPI_PTR *PFN_vkGetDeviceMemoryCommitment)(VkDevice device, VkDeviceMemory memory, VkDeviceSize *pCommittedMemoryInBytes); +typedef VkResult(VKAPI_PTR *PFN_vkBindBufferMemory)(VkDevice device, VkBuffer buffer, VkDeviceMemory memory, VkDeviceSize memoryOffset); +typedef VkResult(VKAPI_PTR *PFN_vkBindImageMemory)(VkDevice device, VkImage image, VkDeviceMemory memory, VkDeviceSize memoryOffset); +typedef void(VKAPI_PTR *PFN_vkGetBufferMemoryRequirements)(VkDevice device, VkBuffer buffer, VkMemoryRequirements *pMemoryRequirements); +typedef void(VKAPI_PTR *PFN_vkGetImageMemoryRequirements)(VkDevice device, VkImage image, VkMemoryRequirements *pMemoryRequirements); +typedef void(VKAPI_PTR *PFN_vkGetImageSparseMemoryRequirements)(VkDevice device, VkImage image, uint32_t *pSparseMemoryRequirementCount, VkSparseImageMemoryRequirements *pSparseMemoryRequirements); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceSparseImageFormatProperties)(VkPhysicalDevice physicalDevice, VkFormat format, VkImageType type, VkSampleCountFlagBits samples, VkImageUsageFlags usage, VkImageTiling tiling, uint32_t *pPropertyCount, VkSparseImageFormatProperties *pProperties); +typedef VkResult(VKAPI_PTR *PFN_vkQueueBindSparse)(VkQueue queue, uint32_t bindInfoCount, const VkBindSparseInfo *pBindInfo, VkFence fence); +typedef VkResult(VKAPI_PTR *PFN_vkCreateFence)(VkDevice device, const VkFenceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkFence *pFence); +typedef void(VKAPI_PTR *PFN_vkDestroyFence)(VkDevice device, VkFence fence, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkResetFences)(VkDevice device, uint32_t fenceCount, const VkFence *pFences); +typedef VkResult(VKAPI_PTR *PFN_vkGetFenceStatus)(VkDevice device, VkFence fence); +typedef VkResult(VKAPI_PTR *PFN_vkWaitForFences)(VkDevice device, uint32_t fenceCount, const VkFence *pFences, VkBool32 waitAll, uint64_t timeout); +typedef VkResult(VKAPI_PTR *PFN_vkCreateSemaphore)(VkDevice device, const VkSemaphoreCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSemaphore *pSemaphore); +typedef void(VKAPI_PTR *PFN_vkDestroySemaphore)(VkDevice device, VkSemaphore semaphore, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateEvent)(VkDevice device, const VkEventCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkEvent *pEvent); +typedef void(VKAPI_PTR *PFN_vkDestroyEvent)(VkDevice device, VkEvent event, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkGetEventStatus)(VkDevice device, VkEvent event); +typedef VkResult(VKAPI_PTR *PFN_vkSetEvent)(VkDevice device, VkEvent event); +typedef VkResult(VKAPI_PTR *PFN_vkResetEvent)(VkDevice device, VkEvent event); +typedef VkResult(VKAPI_PTR *PFN_vkCreateQueryPool)(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool); +typedef 
void(VKAPI_PTR *PFN_vkDestroyQueryPool)(VkDevice device, VkQueryPool queryPool, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkGetQueryPoolResults)(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags); +typedef VkResult(VKAPI_PTR *PFN_vkCreateBuffer)(VkDevice device, const VkBufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkBuffer *pBuffer); +typedef void(VKAPI_PTR *PFN_vkDestroyBuffer)(VkDevice device, VkBuffer buffer, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateBufferView)(VkDevice device, const VkBufferViewCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkBufferView *pView); +typedef void(VKAPI_PTR *PFN_vkDestroyBufferView)(VkDevice device, VkBufferView bufferView, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateImage)(VkDevice device, const VkImageCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkImage *pImage); +typedef void(VKAPI_PTR *PFN_vkDestroyImage)(VkDevice device, VkImage image, const VkAllocationCallbacks *pAllocator); +typedef void(VKAPI_PTR *PFN_vkGetImageSubresourceLayout)(VkDevice device, VkImage image, const VkImageSubresource *pSubresource, VkSubresourceLayout *pLayout); +typedef VkResult(VKAPI_PTR *PFN_vkCreateImageView)(VkDevice device, const VkImageViewCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkImageView *pView); +typedef void(VKAPI_PTR *PFN_vkDestroyImageView)(VkDevice device, VkImageView imageView, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateShaderModule)(VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule); +typedef void(VKAPI_PTR *PFN_vkDestroyShaderModule)(VkDevice device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreatePipelineCache)(VkDevice device, const VkPipelineCacheCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkPipelineCache *pPipelineCache); +typedef void(VKAPI_PTR *PFN_vkDestroyPipelineCache)(VkDevice device, VkPipelineCache pipelineCache, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkGetPipelineCacheData)(VkDevice device, VkPipelineCache pipelineCache, size_t *pDataSize, void *pData); +typedef VkResult(VKAPI_PTR *PFN_vkMergePipelineCaches)(VkDevice device, VkPipelineCache dstCache, uint32_t srcCacheCount, const VkPipelineCache *pSrcCaches); +typedef VkResult(VKAPI_PTR *PFN_vkCreateGraphicsPipelines)(VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkGraphicsPipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines); +typedef VkResult(VKAPI_PTR *PFN_vkCreateComputePipelines)(VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines); +typedef void(VKAPI_PTR *PFN_vkDestroyPipeline)(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreatePipelineLayout)(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout); +typedef void(VKAPI_PTR *PFN_vkDestroyPipelineLayout)(VkDevice device, 
VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateSampler)(VkDevice device, const VkSamplerCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSampler *pSampler); +typedef void(VKAPI_PTR *PFN_vkDestroySampler)(VkDevice device, VkSampler sampler, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateDescriptorSetLayout)(VkDevice device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkDescriptorSetLayout *pSetLayout); +typedef void(VKAPI_PTR *PFN_vkDestroyDescriptorSetLayout)(VkDevice device, VkDescriptorSetLayout descriptorSetLayout, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateDescriptorPool)(VkDevice device, const VkDescriptorPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkDescriptorPool *pDescriptorPool); +typedef void(VKAPI_PTR *PFN_vkDestroyDescriptorPool)(VkDevice device, VkDescriptorPool descriptorPool, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkResetDescriptorPool)(VkDevice device, VkDescriptorPool descriptorPool, VkDescriptorPoolResetFlags flags); +typedef VkResult(VKAPI_PTR *PFN_vkAllocateDescriptorSets)(VkDevice device, const VkDescriptorSetAllocateInfo *pAllocateInfo, VkDescriptorSet *pDescriptorSets); +typedef VkResult(VKAPI_PTR *PFN_vkFreeDescriptorSets)(VkDevice device, VkDescriptorPool descriptorPool, uint32_t descriptorSetCount, const VkDescriptorSet *pDescriptorSets); +typedef void(VKAPI_PTR *PFN_vkUpdateDescriptorSets)(VkDevice device, uint32_t descriptorWriteCount, const VkWriteDescriptorSet *pDescriptorWrites, uint32_t descriptorCopyCount, const VkCopyDescriptorSet *pDescriptorCopies); +typedef VkResult(VKAPI_PTR *PFN_vkCreateFramebuffer)(VkDevice device, const VkFramebufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkFramebuffer *pFramebuffer); +typedef void(VKAPI_PTR *PFN_vkDestroyFramebuffer)(VkDevice device, VkFramebuffer framebuffer, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateRenderPass)(VkDevice device, const VkRenderPassCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkRenderPass *pRenderPass); +typedef void(VKAPI_PTR *PFN_vkDestroyRenderPass)(VkDevice device, VkRenderPass renderPass, const VkAllocationCallbacks *pAllocator); +typedef void(VKAPI_PTR *PFN_vkGetRenderAreaGranularity)(VkDevice device, VkRenderPass renderPass, VkExtent2D *pGranularity); +typedef VkResult(VKAPI_PTR *PFN_vkCreateCommandPool)(VkDevice device, const VkCommandPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkCommandPool *pCommandPool); +typedef void(VKAPI_PTR *PFN_vkDestroyCommandPool)(VkDevice device, VkCommandPool commandPool, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkResetCommandPool)(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags); +typedef VkResult(VKAPI_PTR *PFN_vkAllocateCommandBuffers)(VkDevice device, const VkCommandBufferAllocateInfo *pAllocateInfo, VkCommandBuffer *pCommandBuffers); +typedef void(VKAPI_PTR *PFN_vkFreeCommandBuffers)(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount, const VkCommandBuffer *pCommandBuffers); +typedef VkResult(VKAPI_PTR *PFN_vkBeginCommandBuffer)(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo); +typedef VkResult(VKAPI_PTR *PFN_vkEndCommandBuffer)(VkCommandBuffer commandBuffer); +typedef 
VkResult(VKAPI_PTR *PFN_vkResetCommandBuffer)(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags); +typedef void(VKAPI_PTR *PFN_vkCmdBindPipeline)(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline); +typedef void(VKAPI_PTR *PFN_vkCmdSetViewport)(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, const VkViewport *pViewports); +typedef void(VKAPI_PTR *PFN_vkCmdSetScissor)(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, const VkRect2D *pScissors); +typedef void(VKAPI_PTR *PFN_vkCmdSetLineWidth)(VkCommandBuffer commandBuffer, float lineWidth); +typedef void(VKAPI_PTR *PFN_vkCmdSetDepthBias)(VkCommandBuffer commandBuffer, float depthBiasConstantFactor, float depthBiasClamp, float depthBiasSlopeFactor); +typedef void(VKAPI_PTR *PFN_vkCmdSetBlendConstants)(VkCommandBuffer commandBuffer, const float blendConstants[4]); +typedef void(VKAPI_PTR *PFN_vkCmdSetDepthBounds)(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds); +typedef void(VKAPI_PTR *PFN_vkCmdSetStencilCompareMask)(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t compareMask); +typedef void(VKAPI_PTR *PFN_vkCmdSetStencilWriteMask)(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t writeMask); +typedef void(VKAPI_PTR *PFN_vkCmdSetStencilReference)(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t reference); +typedef void(VKAPI_PTR *PFN_vkCmdBindDescriptorSets)(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout layout, uint32_t firstSet, uint32_t descriptorSetCount, const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount, const uint32_t *pDynamicOffsets); +typedef void(VKAPI_PTR *PFN_vkCmdBindIndexBuffer)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkIndexType indexType); +typedef void(VKAPI_PTR *PFN_vkCmdBindVertexBuffers)(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount, const VkBuffer *pBuffers, const VkDeviceSize *pOffsets); +typedef void(VKAPI_PTR *PFN_vkCmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance); +typedef void(VKAPI_PTR *PFN_vkCmdDrawIndexed)(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance); +typedef void(VKAPI_PTR *PFN_vkCmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride); +typedef void(VKAPI_PTR *PFN_vkCmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride); +typedef void(VKAPI_PTR *PFN_vkCmdDispatch)(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ); +typedef void(VKAPI_PTR *PFN_vkCmdDispatchIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset); +typedef void(VKAPI_PTR *PFN_vkCmdCopyBuffer)(VkCommandBuffer commandBuffer, VkBuffer srcBuffer, VkBuffer dstBuffer, uint32_t regionCount, const VkBufferCopy *pRegions); +typedef void(VKAPI_PTR *PFN_vkCmdCopyImage)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout srcImageLayout, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkImageCopy *pRegions); +typedef void(VKAPI_PTR *PFN_vkCmdBlitImage)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout 
srcImageLayout, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkImageBlit *pRegions, VkFilter filter); +typedef void(VKAPI_PTR *PFN_vkCmdCopyBufferToImage)(VkCommandBuffer commandBuffer, VkBuffer srcBuffer, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkBufferImageCopy *pRegions); +typedef void(VKAPI_PTR *PFN_vkCmdCopyImageToBuffer)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout srcImageLayout, VkBuffer dstBuffer, uint32_t regionCount, const VkBufferImageCopy *pRegions); +typedef void(VKAPI_PTR *PFN_vkCmdUpdateBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize dataSize, const void *pData); +typedef void(VKAPI_PTR *PFN_vkCmdFillBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize size, uint32_t data); +typedef void(VKAPI_PTR *PFN_vkCmdClearColorImage)(VkCommandBuffer commandBuffer, VkImage image, VkImageLayout imageLayout, const VkClearColorValue *pColor, uint32_t rangeCount, const VkImageSubresourceRange *pRanges); +typedef void(VKAPI_PTR *PFN_vkCmdClearDepthStencilImage)(VkCommandBuffer commandBuffer, VkImage image, VkImageLayout imageLayout, const VkClearDepthStencilValue *pDepthStencil, uint32_t rangeCount, const VkImageSubresourceRange *pRanges); +typedef void(VKAPI_PTR *PFN_vkCmdClearAttachments)(VkCommandBuffer commandBuffer, uint32_t attachmentCount, const VkClearAttachment *pAttachments, uint32_t rectCount, const VkClearRect *pRects); +typedef void(VKAPI_PTR *PFN_vkCmdResolveImage)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout srcImageLayout, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkImageResolve *pRegions); +typedef void(VKAPI_PTR *PFN_vkCmdSetEvent)(VkCommandBuffer commandBuffer, VkEvent event, VkPipelineStageFlags stageMask); +typedef void(VKAPI_PTR *PFN_vkCmdResetEvent)(VkCommandBuffer commandBuffer, VkEvent event, VkPipelineStageFlags stageMask); +typedef void(VKAPI_PTR *PFN_vkCmdWaitEvents)(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents, VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, const VkImageMemoryBarrier *pImageMemoryBarriers); +typedef void(VKAPI_PTR *PFN_vkCmdPipelineBarrier)(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags, uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, const VkImageMemoryBarrier *pImageMemoryBarriers); +typedef void(VKAPI_PTR *PFN_vkCmdBeginQuery)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, VkQueryControlFlags flags); +typedef void(VKAPI_PTR *PFN_vkCmdEndQuery)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query); +typedef void(VKAPI_PTR *PFN_vkCmdResetQueryPool)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount); +typedef void(VKAPI_PTR *PFN_vkCmdWriteTimestamp)(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage, VkQueryPool queryPool, uint32_t query); +typedef void(VKAPI_PTR *PFN_vkCmdCopyQueryPoolResults)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t 
firstQuery, uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags); +typedef void(VKAPI_PTR *PFN_vkCmdPushConstants)(VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size, const void *pValues); +typedef void(VKAPI_PTR *PFN_vkCmdBeginRenderPass)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo *pRenderPassBegin, VkSubpassContents contents); +typedef void(VKAPI_PTR *PFN_vkCmdNextSubpass)(VkCommandBuffer commandBuffer, VkSubpassContents contents); +typedef void(VKAPI_PTR *PFN_vkCmdEndRenderPass)(VkCommandBuffer commandBuffer); +typedef void(VKAPI_PTR *PFN_vkCmdExecuteCommands)(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer *pCommandBuffers); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateInstance( + const VkInstanceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkInstance *pInstance); + +// Moved this to be the only prototype depended on. +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr( + VkInstance instance, + const char *pName); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkDestroyInstance( + VkInstance instance, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDevices( + VkInstance instance, + uint32_t *pPhysicalDeviceCount, + VkPhysicalDevice *pPhysicalDevices); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFeatures( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures *pFeatures); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties *pFormatProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkImageCreateFlags flags, + VkImageFormatProperties *pImageFormatProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties *pProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceQueueFamilyProperties( + VkPhysicalDevice physicalDevice, + uint32_t *pQueueFamilyPropertyCount, + VkQueueFamilyProperties *pQueueFamilyProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceMemoryProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties *pMemoryProperties); + +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetDeviceProcAddr( + VkDevice device, + const char *pName); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDevice( + VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDevice *pDevice); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDevice( + VkDevice device, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateInstanceExtensionProperties( + const char *pLayerName, + uint32_t *pPropertyCount, + VkExtensionProperties *pProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateDeviceExtensionProperties( + VkPhysicalDevice physicalDevice, + const char *pLayerName, + uint32_t *pPropertyCount, + VkExtensionProperties *pProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateInstanceLayerProperties( + uint32_t *pPropertyCount, + VkLayerProperties *pProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateDeviceLayerProperties( + VkPhysicalDevice physicalDevice, + uint32_t *pPropertyCount, 
+ VkLayerProperties *pProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetDeviceQueue( + VkDevice device, + uint32_t queueFamilyIndex, + uint32_t queueIndex, + VkQueue *pQueue); + +VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit( + VkQueue queue, + uint32_t submitCount, + const VkSubmitInfo *pSubmits, + VkFence fence); + +VKAPI_ATTR VkResult VKAPI_CALL vkQueueWaitIdle( + VkQueue queue); + +VKAPI_ATTR VkResult VKAPI_CALL vkDeviceWaitIdle( + VkDevice device); + +VKAPI_ATTR VkResult VKAPI_CALL vkAllocateMemory( + VkDevice device, + const VkMemoryAllocateInfo *pAllocateInfo, + const VkAllocationCallbacks *pAllocator, + VkDeviceMemory *pMemory); + +VKAPI_ATTR void VKAPI_CALL vkFreeMemory( + VkDevice device, + VkDeviceMemory memory, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkMapMemory( + VkDevice device, + VkDeviceMemory memory, + VkDeviceSize offset, + VkDeviceSize size, + VkMemoryMapFlags flags, + void **ppData); + +VKAPI_ATTR void VKAPI_CALL vkUnmapMemory( + VkDevice device, + VkDeviceMemory memory); + +VKAPI_ATTR VkResult VKAPI_CALL vkFlushMappedMemoryRanges( + VkDevice device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges); + +VKAPI_ATTR VkResult VKAPI_CALL vkInvalidateMappedMemoryRanges( + VkDevice device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges); + +VKAPI_ATTR void VKAPI_CALL vkGetDeviceMemoryCommitment( + VkDevice device, + VkDeviceMemory memory, + VkDeviceSize *pCommittedMemoryInBytes); + +VKAPI_ATTR VkResult VKAPI_CALL vkBindBufferMemory( + VkDevice device, + VkBuffer buffer, + VkDeviceMemory memory, + VkDeviceSize memoryOffset); + +VKAPI_ATTR VkResult VKAPI_CALL vkBindImageMemory( + VkDevice device, + VkImage image, + VkDeviceMemory memory, + VkDeviceSize memoryOffset); + +VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements( + VkDevice device, + VkBuffer buffer, + VkMemoryRequirements *pMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetImageMemoryRequirements( + VkDevice device, + VkImage image, + VkMemoryRequirements *pMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetImageSparseMemoryRequirements( + VkDevice device, + VkImage image, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements *pSparseMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceSparseImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + VkSampleCountFlagBits samples, + VkImageUsageFlags usage, + VkImageTiling tiling, + uint32_t *pPropertyCount, + VkSparseImageFormatProperties *pProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkQueueBindSparse( + VkQueue queue, + uint32_t bindInfoCount, + const VkBindSparseInfo *pBindInfo, + VkFence fence); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateFence( + VkDevice device, + const VkFenceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkFence *pFence); + +VKAPI_ATTR void VKAPI_CALL vkDestroyFence( + VkDevice device, + VkFence fence, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkResetFences( + VkDevice device, + uint32_t fenceCount, + const VkFence *pFences); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetFenceStatus( + VkDevice device, + VkFence fence); + +VKAPI_ATTR VkResult VKAPI_CALL vkWaitForFences( + VkDevice device, + uint32_t fenceCount, + const VkFence *pFences, + VkBool32 waitAll, + uint64_t timeout); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateSemaphore( + VkDevice device, + const VkSemaphoreCreateInfo *pCreateInfo, + const 
VkAllocationCallbacks *pAllocator, + VkSemaphore *pSemaphore); + +VKAPI_ATTR void VKAPI_CALL vkDestroySemaphore( + VkDevice device, + VkSemaphore semaphore, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateEvent( + VkDevice device, + const VkEventCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkEvent *pEvent); + +VKAPI_ATTR void VKAPI_CALL vkDestroyEvent( + VkDevice device, + VkEvent event, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetEventStatus( + VkDevice device, + VkEvent event); + +VKAPI_ATTR VkResult VKAPI_CALL vkSetEvent( + VkDevice device, + VkEvent event); + +VKAPI_ATTR VkResult VKAPI_CALL vkResetEvent( + VkDevice device, + VkEvent event); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateQueryPool( + VkDevice device, + const VkQueryPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkQueryPool *pQueryPool); + +VKAPI_ATTR void VKAPI_CALL vkDestroyQueryPool( + VkDevice device, + VkQueryPool queryPool, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetQueryPoolResults( + VkDevice device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + size_t dataSize, + void *pData, + VkDeviceSize stride, + VkQueryResultFlags flags); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateBuffer( + VkDevice device, + const VkBufferCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkBuffer *pBuffer); + +VKAPI_ATTR void VKAPI_CALL vkDestroyBuffer( + VkDevice device, + VkBuffer buffer, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateBufferView( + VkDevice device, + const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkBufferView *pView); + +VKAPI_ATTR void VKAPI_CALL vkDestroyBufferView( + VkDevice device, + VkBufferView bufferView, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateImage( + VkDevice device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImage *pImage); + +VKAPI_ATTR void VKAPI_CALL vkDestroyImage( + VkDevice device, + VkImage image, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkGetImageSubresourceLayout( + VkDevice device, + VkImage image, + const VkImageSubresource *pSubresource, + VkSubresourceLayout *pLayout); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateImageView( + VkDevice device, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView); + +VKAPI_ATTR void VKAPI_CALL vkDestroyImageView( + VkDevice device, + VkImageView imageView, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateShaderModule( + VkDevice device, + const VkShaderModuleCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkShaderModule *pShaderModule); + +VKAPI_ATTR void VKAPI_CALL vkDestroyShaderModule( + VkDevice device, + VkShaderModule shaderModule, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreatePipelineCache( + VkDevice device, + const VkPipelineCacheCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipelineCache *pPipelineCache); + +VKAPI_ATTR void VKAPI_CALL vkDestroyPipelineCache( + VkDevice device, + VkPipelineCache pipelineCache, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineCacheData( + VkDevice device, + VkPipelineCache pipelineCache, + size_t 
*pDataSize, + void *pData); + +VKAPI_ATTR VkResult VKAPI_CALL vkMergePipelineCaches( + VkDevice device, + VkPipelineCache dstCache, + uint32_t srcCacheCount, + const VkPipelineCache *pSrcCaches); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateGraphicsPipelines( + VkDevice device, + VkPipelineCache pipelineCache, + uint32_t createInfoCount, + const VkGraphicsPipelineCreateInfo *pCreateInfos, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipelines); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateComputePipelines( + VkDevice device, + VkPipelineCache pipelineCache, + uint32_t createInfoCount, + const VkComputePipelineCreateInfo *pCreateInfos, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipelines); + +VKAPI_ATTR void VKAPI_CALL vkDestroyPipeline( + VkDevice device, + VkPipeline pipeline, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreatePipelineLayout( + VkDevice device, + const VkPipelineLayoutCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipelineLayout *pPipelineLayout); + +VKAPI_ATTR void VKAPI_CALL vkDestroyPipelineLayout( + VkDevice device, + VkPipelineLayout pipelineLayout, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateSampler( + VkDevice device, + const VkSamplerCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkSampler *pSampler); + +VKAPI_ATTR void VKAPI_CALL vkDestroySampler( + VkDevice device, + VkSampler sampler, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDescriptorSetLayout( + VkDevice device, + const VkDescriptorSetLayoutCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorSetLayout *pSetLayout); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDescriptorSetLayout( + VkDevice device, + VkDescriptorSetLayout descriptorSetLayout, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDescriptorPool( + VkDevice device, + const VkDescriptorPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorPool *pDescriptorPool); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDescriptorPool( + VkDevice device, + VkDescriptorPool descriptorPool, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkResetDescriptorPool( + VkDevice device, + VkDescriptorPool descriptorPool, + VkDescriptorPoolResetFlags flags); + +VKAPI_ATTR VkResult VKAPI_CALL vkAllocateDescriptorSets( + VkDevice device, + const VkDescriptorSetAllocateInfo *pAllocateInfo, + VkDescriptorSet *pDescriptorSets); + +VKAPI_ATTR VkResult VKAPI_CALL vkFreeDescriptorSets( + VkDevice device, + VkDescriptorPool descriptorPool, + uint32_t descriptorSetCount, + const VkDescriptorSet *pDescriptorSets); + +VKAPI_ATTR void VKAPI_CALL vkUpdateDescriptorSets( + VkDevice device, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet *pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet *pDescriptorCopies); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateFramebuffer( + VkDevice device, + const VkFramebufferCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkFramebuffer *pFramebuffer); + +VKAPI_ATTR void VKAPI_CALL vkDestroyFramebuffer( + VkDevice device, + VkFramebuffer framebuffer, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateRenderPass( + VkDevice device, + const VkRenderPassCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkRenderPass *pRenderPass); + +VKAPI_ATTR 
void VKAPI_CALL vkDestroyRenderPass( + VkDevice device, + VkRenderPass renderPass, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkGetRenderAreaGranularity( + VkDevice device, + VkRenderPass renderPass, + VkExtent2D *pGranularity); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateCommandPool( + VkDevice device, + const VkCommandPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkCommandPool *pCommandPool); + +VKAPI_ATTR void VKAPI_CALL vkDestroyCommandPool( + VkDevice device, + VkCommandPool commandPool, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkResetCommandPool( + VkDevice device, + VkCommandPool commandPool, + VkCommandPoolResetFlags flags); + +VKAPI_ATTR VkResult VKAPI_CALL vkAllocateCommandBuffers( + VkDevice device, + const VkCommandBufferAllocateInfo *pAllocateInfo, + VkCommandBuffer *pCommandBuffers); + +VKAPI_ATTR void VKAPI_CALL vkFreeCommandBuffers( + VkDevice device, + VkCommandPool commandPool, + uint32_t commandBufferCount, + const VkCommandBuffer *pCommandBuffers); + +VKAPI_ATTR VkResult VKAPI_CALL vkBeginCommandBuffer( + VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo *pBeginInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkEndCommandBuffer( + VkCommandBuffer commandBuffer); + +VKAPI_ATTR VkResult VKAPI_CALL vkResetCommandBuffer( + VkCommandBuffer commandBuffer, + VkCommandBufferResetFlags flags); + +VKAPI_ATTR void VKAPI_CALL vkCmdBindPipeline( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline pipeline); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetViewport( + VkCommandBuffer commandBuffer, + uint32_t firstViewport, + uint32_t viewportCount, + const VkViewport *pViewports); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetScissor( + VkCommandBuffer commandBuffer, + uint32_t firstScissor, + uint32_t scissorCount, + const VkRect2D *pScissors); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetLineWidth( + VkCommandBuffer commandBuffer, + float lineWidth); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetDepthBias( + VkCommandBuffer commandBuffer, + float depthBiasConstantFactor, + float depthBiasClamp, + float depthBiasSlopeFactor); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetBlendConstants( + VkCommandBuffer commandBuffer, + const float blendConstants[4]); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetDepthBounds( + VkCommandBuffer commandBuffer, + float minDepthBounds, + float maxDepthBounds); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetStencilCompareMask( + VkCommandBuffer commandBuffer, + VkStencilFaceFlags faceMask, + uint32_t compareMask); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetStencilWriteMask( + VkCommandBuffer commandBuffer, + VkStencilFaceFlags faceMask, + uint32_t writeMask); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetStencilReference( + VkCommandBuffer commandBuffer, + VkStencilFaceFlags faceMask, + uint32_t reference); + +VKAPI_ATTR void VKAPI_CALL vkCmdBindDescriptorSets( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout layout, + uint32_t firstSet, + uint32_t descriptorSetCount, + const VkDescriptorSet *pDescriptorSets, + uint32_t dynamicOffsetCount, + const uint32_t *pDynamicOffsets); + +VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkIndexType indexType); + +VKAPI_ATTR void VKAPI_CALL vkCmdBindVertexBuffers( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets); + +VKAPI_ATTR void 
VKAPI_CALL vkCmdDraw( + VkCommandBuffer commandBuffer, + uint32_t vertexCount, + uint32_t instanceCount, + uint32_t firstVertex, + uint32_t firstInstance); + +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexed( + VkCommandBuffer commandBuffer, + uint32_t indexCount, + uint32_t instanceCount, + uint32_t firstIndex, + int32_t vertexOffset, + uint32_t firstInstance); + +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirect( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride); + +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirect( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride); + +VKAPI_ATTR void VKAPI_CALL vkCmdDispatch( + VkCommandBuffer commandBuffer, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ); + +VKAPI_ATTR void VKAPI_CALL vkCmdDispatchIndirect( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset); + +VKAPI_ATTR void VKAPI_CALL vkCmdCopyBuffer( + VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkBuffer dstBuffer, + uint32_t regionCount, + const VkBufferCopy *pRegions); + +VKAPI_ATTR void VKAPI_CALL vkCmdCopyImage( + VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkImageCopy *pRegions); + +VKAPI_ATTR void VKAPI_CALL vkCmdBlitImage( + VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkImageBlit *pRegions, + VkFilter filter); + +VKAPI_ATTR void VKAPI_CALL vkCmdCopyBufferToImage( + VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkBufferImageCopy *pRegions); + +VKAPI_ATTR void VKAPI_CALL vkCmdCopyImageToBuffer( + VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkBuffer dstBuffer, + uint32_t regionCount, + const VkBufferImageCopy *pRegions); + +VKAPI_ATTR void VKAPI_CALL vkCmdUpdateBuffer( + VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize dataSize, + const void *pData); + +VKAPI_ATTR void VKAPI_CALL vkCmdFillBuffer( + VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize size, + uint32_t data); + +VKAPI_ATTR void VKAPI_CALL vkCmdClearColorImage( + VkCommandBuffer commandBuffer, + VkImage image, + VkImageLayout imageLayout, + const VkClearColorValue *pColor, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges); + +VKAPI_ATTR void VKAPI_CALL vkCmdClearDepthStencilImage( + VkCommandBuffer commandBuffer, + VkImage image, + VkImageLayout imageLayout, + const VkClearDepthStencilValue *pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges); + +VKAPI_ATTR void VKAPI_CALL vkCmdClearAttachments( + VkCommandBuffer commandBuffer, + uint32_t attachmentCount, + const VkClearAttachment *pAttachments, + uint32_t rectCount, + const VkClearRect *pRects); + +VKAPI_ATTR void VKAPI_CALL vkCmdResolveImage( + VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkImageResolve *pRegions); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetEvent( + VkCommandBuffer commandBuffer, + VkEvent event, + VkPipelineStageFlags stageMask); + +VKAPI_ATTR void VKAPI_CALL 
vkCmdResetEvent( + VkCommandBuffer commandBuffer, + VkEvent event, + VkPipelineStageFlags stageMask); + +VKAPI_ATTR void VKAPI_CALL vkCmdWaitEvents( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent *pEvents, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + uint32_t memoryBarrierCount, + const VkMemoryBarrier *pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier *pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier *pImageMemoryBarriers); + +VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier( + VkCommandBuffer commandBuffer, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + VkDependencyFlags dependencyFlags, + uint32_t memoryBarrierCount, + const VkMemoryBarrier *pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier *pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier *pImageMemoryBarriers); + +VKAPI_ATTR void VKAPI_CALL vkCmdBeginQuery( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags); + +VKAPI_ATTR void VKAPI_CALL vkCmdEndQuery( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query); + +VKAPI_ATTR void VKAPI_CALL vkCmdResetQueryPool( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount); + +VKAPI_ATTR void VKAPI_CALL vkCmdWriteTimestamp( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkQueryPool queryPool, + uint32_t query); + +VKAPI_ATTR void VKAPI_CALL vkCmdCopyQueryPoolResults( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize stride, + VkQueryResultFlags flags); + +VKAPI_ATTR void VKAPI_CALL vkCmdPushConstants( + VkCommandBuffer commandBuffer, + VkPipelineLayout layout, + VkShaderStageFlags stageFlags, + uint32_t offset, + uint32_t size, + const void *pValues); + +VKAPI_ATTR void VKAPI_CALL vkCmdBeginRenderPass( + VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo *pRenderPassBegin, + VkSubpassContents contents); + +VKAPI_ATTR void VKAPI_CALL vkCmdNextSubpass( + VkCommandBuffer commandBuffer, + VkSubpassContents contents); + +VKAPI_ATTR void VKAPI_CALL vkCmdEndRenderPass( + VkCommandBuffer commandBuffer); + +VKAPI_ATTR void VKAPI_CALL vkCmdExecuteCommands( + VkCommandBuffer commandBuffer, + uint32_t commandBufferCount, + const VkCommandBuffer *pCommandBuffers); +#endif + +#define VK_KHR_surface 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSurfaceKHR) + +#define VK_KHR_SURFACE_SPEC_VERSION 25 +#define VK_KHR_SURFACE_EXTENSION_NAME "VK_KHR_surface" +#define VK_COLORSPACE_SRGB_NONLINEAR_KHR VK_COLOR_SPACE_SRGB_NONLINEAR_KHR + +typedef enum VkColorSpaceKHR { + VK_COLOR_SPACE_SRGB_NONLINEAR_KHR = 0, + VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT = 1000104001, + VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT = 1000104002, + VK_COLOR_SPACE_DCI_P3_LINEAR_EXT = 1000104003, + VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT = 1000104004, + VK_COLOR_SPACE_BT709_LINEAR_EXT = 1000104005, + VK_COLOR_SPACE_BT709_NONLINEAR_EXT = 1000104006, + VK_COLOR_SPACE_BT2020_LINEAR_EXT = 1000104007, + VK_COLOR_SPACE_HDR10_ST2084_EXT = 1000104008, + VK_COLOR_SPACE_DOLBYVISION_EXT = 1000104009, + VK_COLOR_SPACE_HDR10_HLG_EXT = 1000104010, + VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT = 1000104011, + VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT = 1000104012, + 
VK_COLOR_SPACE_PASS_THROUGH_EXT = 1000104013, + VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT = 1000104014, + VK_COLOR_SPACE_BEGIN_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + VK_COLOR_SPACE_END_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + VK_COLOR_SPACE_RANGE_SIZE_KHR = (VK_COLOR_SPACE_SRGB_NONLINEAR_KHR - VK_COLOR_SPACE_SRGB_NONLINEAR_KHR + 1), + VK_COLOR_SPACE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkColorSpaceKHR; + +typedef enum VkPresentModeKHR { + VK_PRESENT_MODE_IMMEDIATE_KHR = 0, + VK_PRESENT_MODE_MAILBOX_KHR = 1, + VK_PRESENT_MODE_FIFO_KHR = 2, + VK_PRESENT_MODE_FIFO_RELAXED_KHR = 3, + VK_PRESENT_MODE_SHARED_DEMAND_REFRESH_KHR = 1000111000, + VK_PRESENT_MODE_SHARED_CONTINUOUS_REFRESH_KHR = 1000111001, + VK_PRESENT_MODE_BEGIN_RANGE_KHR = VK_PRESENT_MODE_IMMEDIATE_KHR, + VK_PRESENT_MODE_END_RANGE_KHR = VK_PRESENT_MODE_FIFO_RELAXED_KHR, + VK_PRESENT_MODE_RANGE_SIZE_KHR = (VK_PRESENT_MODE_FIFO_RELAXED_KHR - VK_PRESENT_MODE_IMMEDIATE_KHR + 1), + VK_PRESENT_MODE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkPresentModeKHR; + +typedef enum VkSurfaceTransformFlagBitsKHR { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR = 0x00000001, + VK_SURFACE_TRANSFORM_ROTATE_90_BIT_KHR = 0x00000002, + VK_SURFACE_TRANSFORM_ROTATE_180_BIT_KHR = 0x00000004, + VK_SURFACE_TRANSFORM_ROTATE_270_BIT_KHR = 0x00000008, + VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_BIT_KHR = 0x00000010, + VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_90_BIT_KHR = 0x00000020, + VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_180_BIT_KHR = 0x00000040, + VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_270_BIT_KHR = 0x00000080, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR = 0x00000100, + VK_SURFACE_TRANSFORM_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkSurfaceTransformFlagBitsKHR; +typedef VkFlags VkSurfaceTransformFlagsKHR; + +typedef enum VkCompositeAlphaFlagBitsKHR { + VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR = 0x00000001, + VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR = 0x00000002, + VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR = 0x00000004, + VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR = 0x00000008, + VK_COMPOSITE_ALPHA_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkCompositeAlphaFlagBitsKHR; +typedef VkFlags VkCompositeAlphaFlagsKHR; + +typedef struct VkSurfaceCapabilitiesKHR { + uint32_t minImageCount; + uint32_t maxImageCount; + VkExtent2D currentExtent; + VkExtent2D minImageExtent; + VkExtent2D maxImageExtent; + uint32_t maxImageArrayLayers; + VkSurfaceTransformFlagsKHR supportedTransforms; + VkSurfaceTransformFlagBitsKHR currentTransform; + VkCompositeAlphaFlagsKHR supportedCompositeAlpha; + VkImageUsageFlags supportedUsageFlags; +} VkSurfaceCapabilitiesKHR; + +typedef struct VkSurfaceFormatKHR { + VkFormat format; + VkColorSpaceKHR colorSpace; +} VkSurfaceFormatKHR; + +typedef void(VKAPI_PTR *PFN_vkDestroySurfaceKHR)(VkInstance instance, VkSurfaceKHR surface, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, VkSurfaceKHR surface, VkBool32 *pSupported); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR *pSurfaceCapabilities); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t *pSurfaceFormatCount, VkSurfaceFormatKHR *pSurfaceFormats); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t 
*pPresentModeCount, VkPresentModeKHR *pPresentModes); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR( + VkInstance instance, + VkSurfaceKHR surface, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + VkSurfaceKHR surface, + VkBool32 *pSupported); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilitiesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + VkSurfaceCapabilitiesKHR *pSurfaceCapabilities); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormatsKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t *pSurfaceFormatCount, + VkSurfaceFormatKHR *pSurfaceFormats); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfacePresentModesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t *pPresentModeCount, + VkPresentModeKHR *pPresentModes); +#endif + +#define VK_KHR_swapchain 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSwapchainKHR) + +#define VK_KHR_SWAPCHAIN_SPEC_VERSION 68 +#define VK_KHR_SWAPCHAIN_EXTENSION_NAME "VK_KHR_swapchain" + +typedef enum VkSwapchainCreateFlagBitsKHR { + VK_SWAPCHAIN_CREATE_BIND_SFR_BIT_KHX = 0x00000001, + VK_SWAPCHAIN_CREATE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkSwapchainCreateFlagBitsKHR; +typedef VkFlags VkSwapchainCreateFlagsKHR; + +typedef struct VkSwapchainCreateInfoKHR { + VkStructureType sType; + const void *pNext; + VkSwapchainCreateFlagsKHR flags; + VkSurfaceKHR surface; + uint32_t minImageCount; + VkFormat imageFormat; + VkColorSpaceKHR imageColorSpace; + VkExtent2D imageExtent; + uint32_t imageArrayLayers; + VkImageUsageFlags imageUsage; + VkSharingMode imageSharingMode; + uint32_t queueFamilyIndexCount; + const uint32_t *pQueueFamilyIndices; + VkSurfaceTransformFlagBitsKHR preTransform; + VkCompositeAlphaFlagBitsKHR compositeAlpha; + VkPresentModeKHR presentMode; + VkBool32 clipped; + VkSwapchainKHR oldSwapchain; +} VkSwapchainCreateInfoKHR; + +typedef struct VkPresentInfoKHR { + VkStructureType sType; + const void *pNext; + uint32_t waitSemaphoreCount; + const VkSemaphore *pWaitSemaphores; + uint32_t swapchainCount; + const VkSwapchainKHR *pSwapchains; + const uint32_t *pImageIndices; + VkResult *pResults; +} VkPresentInfoKHR; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateSwapchainKHR)(VkDevice device, const VkSwapchainCreateInfoKHR *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSwapchainKHR *pSwapchain); +typedef void(VKAPI_PTR *PFN_vkDestroySwapchainKHR)(VkDevice device, VkSwapchainKHR swapchain, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkGetSwapchainImagesKHR)(VkDevice device, VkSwapchainKHR swapchain, uint32_t *pSwapchainImageCount, VkImage *pSwapchainImages); +typedef VkResult(VKAPI_PTR *PFN_vkAcquireNextImageKHR)(VkDevice device, VkSwapchainKHR swapchain, uint64_t timeout, VkSemaphore semaphore, VkFence fence, uint32_t *pImageIndex); +typedef VkResult(VKAPI_PTR *PFN_vkQueuePresentKHR)(VkQueue queue, const VkPresentInfoKHR *pPresentInfo); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateSwapchainKHR( + VkDevice device, + const VkSwapchainCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkSwapchainKHR *pSwapchain); + +VKAPI_ATTR void VKAPI_CALL vkDestroySwapchainKHR( + VkDevice device, + VkSwapchainKHR swapchain, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainImagesKHR( + 
VkDevice device, + VkSwapchainKHR swapchain, + uint32_t *pSwapchainImageCount, + VkImage *pSwapchainImages); + +VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImageKHR( + VkDevice device, + VkSwapchainKHR swapchain, + uint64_t timeout, + VkSemaphore semaphore, + VkFence fence, + uint32_t *pImageIndex); + +VKAPI_ATTR VkResult VKAPI_CALL vkQueuePresentKHR( + VkQueue queue, + const VkPresentInfoKHR *pPresentInfo); +#endif + +#define VK_KHR_display 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDisplayKHR) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDisplayModeKHR) + +#define VK_KHR_DISPLAY_SPEC_VERSION 21 +#define VK_KHR_DISPLAY_EXTENSION_NAME "VK_KHR_display" + +typedef enum VkDisplayPlaneAlphaFlagBitsKHR { + VK_DISPLAY_PLANE_ALPHA_OPAQUE_BIT_KHR = 0x00000001, + VK_DISPLAY_PLANE_ALPHA_GLOBAL_BIT_KHR = 0x00000002, + VK_DISPLAY_PLANE_ALPHA_PER_PIXEL_BIT_KHR = 0x00000004, + VK_DISPLAY_PLANE_ALPHA_PER_PIXEL_PREMULTIPLIED_BIT_KHR = 0x00000008, + VK_DISPLAY_PLANE_ALPHA_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkDisplayPlaneAlphaFlagBitsKHR; +typedef VkFlags VkDisplayPlaneAlphaFlagsKHR; +typedef VkFlags VkDisplayModeCreateFlagsKHR; +typedef VkFlags VkDisplaySurfaceCreateFlagsKHR; + +typedef struct VkDisplayPropertiesKHR { + VkDisplayKHR display; + const char *displayName; + VkExtent2D physicalDimensions; + VkExtent2D physicalResolution; + VkSurfaceTransformFlagsKHR supportedTransforms; + VkBool32 planeReorderPossible; + VkBool32 persistentContent; +} VkDisplayPropertiesKHR; + +typedef struct VkDisplayModeParametersKHR { + VkExtent2D visibleRegion; + uint32_t refreshRate; +} VkDisplayModeParametersKHR; + +typedef struct VkDisplayModePropertiesKHR { + VkDisplayModeKHR displayMode; + VkDisplayModeParametersKHR parameters; +} VkDisplayModePropertiesKHR; + +typedef struct VkDisplayModeCreateInfoKHR { + VkStructureType sType; + const void *pNext; + VkDisplayModeCreateFlagsKHR flags; + VkDisplayModeParametersKHR parameters; +} VkDisplayModeCreateInfoKHR; + +typedef struct VkDisplayPlaneCapabilitiesKHR { + VkDisplayPlaneAlphaFlagsKHR supportedAlpha; + VkOffset2D minSrcPosition; + VkOffset2D maxSrcPosition; + VkExtent2D minSrcExtent; + VkExtent2D maxSrcExtent; + VkOffset2D minDstPosition; + VkOffset2D maxDstPosition; + VkExtent2D minDstExtent; + VkExtent2D maxDstExtent; +} VkDisplayPlaneCapabilitiesKHR; + +typedef struct VkDisplayPlanePropertiesKHR { + VkDisplayKHR currentDisplay; + uint32_t currentStackIndex; +} VkDisplayPlanePropertiesKHR; + +typedef struct VkDisplaySurfaceCreateInfoKHR { + VkStructureType sType; + const void *pNext; + VkDisplaySurfaceCreateFlagsKHR flags; + VkDisplayModeKHR displayMode; + uint32_t planeIndex; + uint32_t planeStackIndex; + VkSurfaceTransformFlagBitsKHR transform; + float globalAlpha; + VkDisplayPlaneAlphaFlagBitsKHR alphaMode; + VkExtent2D imageExtent; +} VkDisplaySurfaceCreateInfoKHR; + +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceDisplayPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t *pPropertyCount, VkDisplayPropertiesKHR *pProperties); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceDisplayPlanePropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t *pPropertyCount, VkDisplayPlanePropertiesKHR *pProperties); +typedef VkResult(VKAPI_PTR *PFN_vkGetDisplayPlaneSupportedDisplaysKHR)(VkPhysicalDevice physicalDevice, uint32_t planeIndex, uint32_t *pDisplayCount, VkDisplayKHR *pDisplays); +typedef VkResult(VKAPI_PTR *PFN_vkGetDisplayModePropertiesKHR)(VkPhysicalDevice physicalDevice, VkDisplayKHR display, uint32_t *pPropertyCount, VkDisplayModePropertiesKHR *pProperties); 
+typedef VkResult(VKAPI_PTR *PFN_vkCreateDisplayModeKHR)(VkPhysicalDevice physicalDevice, VkDisplayKHR display, const VkDisplayModeCreateInfoKHR *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkDisplayModeKHR *pMode); +typedef VkResult(VKAPI_PTR *PFN_vkGetDisplayPlaneCapabilitiesKHR)(VkPhysicalDevice physicalDevice, VkDisplayModeKHR mode, uint32_t planeIndex, VkDisplayPlaneCapabilitiesKHR *pCapabilities); +typedef VkResult(VKAPI_PTR *PFN_vkCreateDisplayPlaneSurfaceKHR)(VkInstance instance, const VkDisplaySurfaceCreateInfoKHR *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSurfaceKHR *pSurface); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceDisplayPropertiesKHR( + VkPhysicalDevice physicalDevice, + uint32_t *pPropertyCount, + VkDisplayPropertiesKHR *pProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceDisplayPlanePropertiesKHR( + VkPhysicalDevice physicalDevice, + uint32_t *pPropertyCount, + VkDisplayPlanePropertiesKHR *pProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetDisplayPlaneSupportedDisplaysKHR( + VkPhysicalDevice physicalDevice, + uint32_t planeIndex, + uint32_t *pDisplayCount, + VkDisplayKHR *pDisplays); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetDisplayModePropertiesKHR( + VkPhysicalDevice physicalDevice, + VkDisplayKHR display, + uint32_t *pPropertyCount, + VkDisplayModePropertiesKHR *pProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDisplayModeKHR( + VkPhysicalDevice physicalDevice, + VkDisplayKHR display, + const VkDisplayModeCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDisplayModeKHR *pMode); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetDisplayPlaneCapabilitiesKHR( + VkPhysicalDevice physicalDevice, + VkDisplayModeKHR mode, + uint32_t planeIndex, + VkDisplayPlaneCapabilitiesKHR *pCapabilities); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDisplayPlaneSurfaceKHR( + VkInstance instance, + const VkDisplaySurfaceCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkSurfaceKHR *pSurface); +#endif + +#define VK_KHR_display_swapchain 1 +#define VK_KHR_DISPLAY_SWAPCHAIN_SPEC_VERSION 9 +#define VK_KHR_DISPLAY_SWAPCHAIN_EXTENSION_NAME "VK_KHR_display_swapchain" + +typedef struct VkDisplayPresentInfoKHR { + VkStructureType sType; + const void *pNext; + VkRect2D srcRect; + VkRect2D dstRect; + VkBool32 persistent; +} VkDisplayPresentInfoKHR; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateSharedSwapchainsKHR)(VkDevice device, uint32_t swapchainCount, const VkSwapchainCreateInfoKHR *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkSwapchainKHR *pSwapchains); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateSharedSwapchainsKHR( + VkDevice device, + uint32_t swapchainCount, + const VkSwapchainCreateInfoKHR *pCreateInfos, + const VkAllocationCallbacks *pAllocator, + VkSwapchainKHR *pSwapchains); +#endif + +#define VK_KHR_sampler_mirror_clamp_to_edge 1 +#define VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_SPEC_VERSION 1 +#define VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME "VK_KHR_sampler_mirror_clamp_to_edge" + +#define VK_KHR_get_physical_device_properties2 1 +#define VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_SPEC_VERSION 1 +#define VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME "VK_KHR_get_physical_device_properties2" + +typedef struct VkPhysicalDeviceFeatures2KHR { + VkStructureType sType; + void *pNext; + VkPhysicalDeviceFeatures features; +} VkPhysicalDeviceFeatures2KHR; + +typedef struct VkPhysicalDeviceProperties2KHR { + VkStructureType sType; + void 
*pNext; + VkPhysicalDeviceProperties properties; +} VkPhysicalDeviceProperties2KHR; + +typedef struct VkFormatProperties2KHR { + VkStructureType sType; + void *pNext; + VkFormatProperties formatProperties; +} VkFormatProperties2KHR; + +typedef struct VkImageFormatProperties2KHR { + VkStructureType sType; + void *pNext; + VkImageFormatProperties imageFormatProperties; +} VkImageFormatProperties2KHR; + +typedef struct VkPhysicalDeviceImageFormatInfo2KHR { + VkStructureType sType; + const void *pNext; + VkFormat format; + VkImageType type; + VkImageTiling tiling; + VkImageUsageFlags usage; + VkImageCreateFlags flags; +} VkPhysicalDeviceImageFormatInfo2KHR; + +typedef struct VkQueueFamilyProperties2KHR { + VkStructureType sType; + void *pNext; + VkQueueFamilyProperties queueFamilyProperties; +} VkQueueFamilyProperties2KHR; + +typedef struct VkPhysicalDeviceMemoryProperties2KHR { + VkStructureType sType; + void *pNext; + VkPhysicalDeviceMemoryProperties memoryProperties; +} VkPhysicalDeviceMemoryProperties2KHR; + +typedef struct VkSparseImageFormatProperties2KHR { + VkStructureType sType; + void *pNext; + VkSparseImageFormatProperties properties; +} VkSparseImageFormatProperties2KHR; + +typedef struct VkPhysicalDeviceSparseImageFormatInfo2KHR { + VkStructureType sType; + const void *pNext; + VkFormat format; + VkImageType type; + VkSampleCountFlagBits samples; + VkImageUsageFlags usage; + VkImageTiling tiling; +} VkPhysicalDeviceSparseImageFormatInfo2KHR; + +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceFeatures2KHR)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures2KHR *pFeatures); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceProperties2KHR)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties2KHR *pProperties); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceFormatProperties2KHR)(VkPhysicalDevice physicalDevice, VkFormat format, VkFormatProperties2KHR *pFormatProperties); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceImageFormatInfo2KHR *pImageFormatInfo, VkImageFormatProperties2KHR *pImageFormatProperties); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)(VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, VkQueueFamilyProperties2KHR *pQueueFamilyProperties); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceMemoryProperties2KHR)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties2KHR *pMemoryProperties); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSparseImageFormatInfo2KHR *pFormatInfo, uint32_t *pPropertyCount, VkSparseImageFormatProperties2KHR *pProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFeatures2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures2KHR *pFeatures); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceProperties2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties2KHR *pProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFormatProperties2KHR( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties2KHR *pFormatProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceImageFormatProperties2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceImageFormatInfo2KHR *pImageFormatInfo, + VkImageFormatProperties2KHR *pImageFormatProperties); + +VKAPI_ATTR void VKAPI_CALL 
vkGetPhysicalDeviceQueueFamilyProperties2KHR( + VkPhysicalDevice physicalDevice, + uint32_t *pQueueFamilyPropertyCount, + VkQueueFamilyProperties2KHR *pQueueFamilyProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceMemoryProperties2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2KHR *pMemoryProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceSparseImageFormatProperties2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSparseImageFormatInfo2KHR *pFormatInfo, + uint32_t *pPropertyCount, + VkSparseImageFormatProperties2KHR *pProperties); +#endif + +#define VK_KHR_shader_draw_parameters 1 +#define VK_KHR_SHADER_DRAW_PARAMETERS_SPEC_VERSION 1 +#define VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME "VK_KHR_shader_draw_parameters" + +#define VK_KHR_maintenance1 1 +#define VK_KHR_MAINTENANCE1_SPEC_VERSION 1 +#define VK_KHR_MAINTENANCE1_EXTENSION_NAME "VK_KHR_maintenance1" + +typedef VkFlags VkCommandPoolTrimFlagsKHR; + +typedef void(VKAPI_PTR *PFN_vkTrimCommandPoolKHR)(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlagsKHR flags); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkTrimCommandPoolKHR( + VkDevice device, + VkCommandPool commandPool, + VkCommandPoolTrimFlagsKHR flags); +#endif + +#define VK_KHR_external_memory_capabilities 1 +#define VK_LUID_SIZE_KHR 8 +#define VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME "VK_KHR_external_memory_capabilities" + +typedef enum VkExternalMemoryHandleTypeFlagBitsKHR { + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR = 0x00000001, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR = 0x00000002, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR = 0x00000004, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_BIT_KHR = 0x00000008, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_KMT_BIT_KHR = 0x00000010, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP_BIT_KHR = 0x00000020, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE_BIT_KHR = 0x00000040, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalMemoryHandleTypeFlagBitsKHR; +typedef VkFlags VkExternalMemoryHandleTypeFlagsKHR; + +typedef enum VkExternalMemoryFeatureFlagBitsKHR { + VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT_KHR = 0x00000001, + VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR = 0x00000002, + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR = 0x00000004, + VK_EXTERNAL_MEMORY_FEATURE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalMemoryFeatureFlagBitsKHR; +typedef VkFlags VkExternalMemoryFeatureFlagsKHR; + +typedef struct VkExternalMemoryPropertiesKHR { + VkExternalMemoryFeatureFlagsKHR externalMemoryFeatures; + VkExternalMemoryHandleTypeFlagsKHR exportFromImportedHandleTypes; + VkExternalMemoryHandleTypeFlagsKHR compatibleHandleTypes; +} VkExternalMemoryPropertiesKHR; + +typedef struct VkPhysicalDeviceExternalImageFormatInfoKHR { + VkStructureType sType; + const void *pNext; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; +} VkPhysicalDeviceExternalImageFormatInfoKHR; + +typedef struct VkExternalImageFormatPropertiesKHR { + VkStructureType sType; + void *pNext; + VkExternalMemoryPropertiesKHR externalMemoryProperties; +} VkExternalImageFormatPropertiesKHR; + +typedef struct VkPhysicalDeviceExternalBufferInfoKHR { + VkStructureType sType; + const void *pNext; + VkBufferCreateFlags flags; + VkBufferUsageFlags usage; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; +} VkPhysicalDeviceExternalBufferInfoKHR; + +typedef 
struct VkExternalBufferPropertiesKHR { + VkStructureType sType; + void *pNext; + VkExternalMemoryPropertiesKHR externalMemoryProperties; +} VkExternalBufferPropertiesKHR; + +typedef struct VkPhysicalDeviceIDPropertiesKHR { + VkStructureType sType; + void *pNext; + uint8_t deviceUUID[VK_UUID_SIZE]; + uint8_t driverUUID[VK_UUID_SIZE]; + uint8_t deviceLUID[VK_LUID_SIZE_KHR]; + uint32_t deviceNodeMask; + VkBool32 deviceLUIDValid; +} VkPhysicalDeviceIDPropertiesKHR; + +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalBufferInfoKHR *pExternalBufferInfo, VkExternalBufferPropertiesKHR *pExternalBufferProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceExternalBufferPropertiesKHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalBufferInfoKHR *pExternalBufferInfo, + VkExternalBufferPropertiesKHR *pExternalBufferProperties); +#endif + +#define VK_KHR_external_memory 1 +#define VK_KHR_EXTERNAL_MEMORY_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME "VK_KHR_external_memory" +#define VK_QUEUE_FAMILY_EXTERNAL_KHR (~0U - 1) + +typedef struct VkExternalMemoryImageCreateInfoKHR { + VkStructureType sType; + const void *pNext; + VkExternalMemoryHandleTypeFlagsKHR handleTypes; +} VkExternalMemoryImageCreateInfoKHR; + +typedef struct VkExternalMemoryBufferCreateInfoKHR { + VkStructureType sType; + const void *pNext; + VkExternalMemoryHandleTypeFlagsKHR handleTypes; +} VkExternalMemoryBufferCreateInfoKHR; + +typedef struct VkExportMemoryAllocateInfoKHR { + VkStructureType sType; + const void *pNext; + VkExternalMemoryHandleTypeFlagsKHR handleTypes; +} VkExportMemoryAllocateInfoKHR; + +#define VK_KHR_external_memory_fd 1 +#define VK_KHR_EXTERNAL_MEMORY_FD_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME "VK_KHR_external_memory_fd" + +typedef struct VkImportMemoryFdInfoKHR { + VkStructureType sType; + const void *pNext; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; + int fd; +} VkImportMemoryFdInfoKHR; + +typedef struct VkMemoryFdPropertiesKHR { + VkStructureType sType; + void *pNext; + uint32_t memoryTypeBits; +} VkMemoryFdPropertiesKHR; + +typedef struct VkMemoryGetFdInfoKHR { + VkStructureType sType; + const void *pNext; + VkDeviceMemory memory; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; +} VkMemoryGetFdInfoKHR; + +typedef VkResult(VKAPI_PTR *PFN_vkGetMemoryFdKHR)(VkDevice device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFd); +typedef VkResult(VKAPI_PTR *PFN_vkGetMemoryFdPropertiesKHR)(VkDevice device, VkExternalMemoryHandleTypeFlagBitsKHR handleType, int fd, VkMemoryFdPropertiesKHR *pMemoryFdProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryFdKHR( + VkDevice device, + const VkMemoryGetFdInfoKHR *pGetFdInfo, + int *pFd); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryFdPropertiesKHR( + VkDevice device, + VkExternalMemoryHandleTypeFlagBitsKHR handleType, + int fd, + VkMemoryFdPropertiesKHR *pMemoryFdProperties); +#endif + +#define VK_KHR_external_semaphore_capabilities 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME "VK_KHR_external_semaphore_capabilities" + +typedef enum VkExternalSemaphoreHandleTypeFlagBitsKHR { + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR = 0x00000001, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR = 0x00000002, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR = 
0x00000004, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE_BIT_KHR = 0x00000008, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR = 0x00000010, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalSemaphoreHandleTypeFlagBitsKHR; +typedef VkFlags VkExternalSemaphoreHandleTypeFlagsKHR; + +typedef enum VkExternalSemaphoreFeatureFlagBitsKHR { + VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR = 0x00000001, + VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR = 0x00000002, + VK_EXTERNAL_SEMAPHORE_FEATURE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalSemaphoreFeatureFlagBitsKHR; +typedef VkFlags VkExternalSemaphoreFeatureFlagsKHR; + +typedef struct VkPhysicalDeviceExternalSemaphoreInfoKHR { + VkStructureType sType; + const void *pNext; + VkExternalSemaphoreHandleTypeFlagBitsKHR handleType; +} VkPhysicalDeviceExternalSemaphoreInfoKHR; + +typedef struct VkExternalSemaphorePropertiesKHR { + VkStructureType sType; + void *pNext; + VkExternalSemaphoreHandleTypeFlagsKHR exportFromImportedHandleTypes; + VkExternalSemaphoreHandleTypeFlagsKHR compatibleHandleTypes; + VkExternalSemaphoreFeatureFlagsKHR externalSemaphoreFeatures; +} VkExternalSemaphorePropertiesKHR; + +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalSemaphorePropertiesKHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalSemaphoreInfoKHR *pExternalSemaphoreInfo, VkExternalSemaphorePropertiesKHR *pExternalSemaphoreProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceExternalSemaphorePropertiesKHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalSemaphoreInfoKHR *pExternalSemaphoreInfo, + VkExternalSemaphorePropertiesKHR *pExternalSemaphoreProperties); +#endif + +#define VK_KHR_external_semaphore 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME "VK_KHR_external_semaphore" + +typedef enum VkSemaphoreImportFlagBitsKHR { + VK_SEMAPHORE_IMPORT_TEMPORARY_BIT_KHR = 0x00000001, + VK_SEMAPHORE_IMPORT_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkSemaphoreImportFlagBitsKHR; +typedef VkFlags VkSemaphoreImportFlagsKHR; + +typedef struct VkExportSemaphoreCreateInfoKHR { + VkStructureType sType; + const void *pNext; + VkExternalSemaphoreHandleTypeFlagsKHR handleTypes; +} VkExportSemaphoreCreateInfoKHR; + +#define VK_KHR_external_semaphore_fd 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_FD_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME "VK_KHR_external_semaphore_fd" + +typedef struct VkImportSemaphoreFdInfoKHR { + VkStructureType sType; + const void *pNext; + VkSemaphore semaphore; + VkSemaphoreImportFlagsKHR flags; + VkExternalSemaphoreHandleTypeFlagBitsKHR handleType; + int fd; +} VkImportSemaphoreFdInfoKHR; + +typedef struct VkSemaphoreGetFdInfoKHR { + VkStructureType sType; + const void *pNext; + VkSemaphore semaphore; + VkExternalSemaphoreHandleTypeFlagBitsKHR handleType; +} VkSemaphoreGetFdInfoKHR; + +typedef VkResult(VKAPI_PTR *PFN_vkImportSemaphoreFdKHR)(VkDevice device, const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo); +typedef VkResult(VKAPI_PTR *PFN_vkGetSemaphoreFdKHR)(VkDevice device, const VkSemaphoreGetFdInfoKHR *pGetFdInfo, int *pFd); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkImportSemaphoreFdKHR( + VkDevice device, + const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetSemaphoreFdKHR( + VkDevice device, + const VkSemaphoreGetFdInfoKHR *pGetFdInfo, + int *pFd); +#endif + +#define 
VK_KHR_push_descriptor 1 +#define VK_KHR_PUSH_DESCRIPTOR_SPEC_VERSION 1 +#define VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME "VK_KHR_push_descriptor" + +typedef struct VkPhysicalDevicePushDescriptorPropertiesKHR { + VkStructureType sType; + void *pNext; + uint32_t maxPushDescriptors; +} VkPhysicalDevicePushDescriptorPropertiesKHR; + +typedef void(VKAPI_PTR *PFN_vkCmdPushDescriptorSetKHR)(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout layout, uint32_t set, uint32_t descriptorWriteCount, const VkWriteDescriptorSet *pDescriptorWrites); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdPushDescriptorSetKHR( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet *pDescriptorWrites); +#endif + +#define VK_KHR_16bit_storage 1 +#define VK_KHR_16BIT_STORAGE_SPEC_VERSION 1 +#define VK_KHR_16BIT_STORAGE_EXTENSION_NAME "VK_KHR_16bit_storage" + +typedef struct VkPhysicalDevice16BitStorageFeaturesKHR { + VkStructureType sType; + void *pNext; + VkBool32 storageBuffer16BitAccess; + VkBool32 uniformAndStorageBuffer16BitAccess; + VkBool32 storagePushConstant16; + VkBool32 storageInputOutput16; +} VkPhysicalDevice16BitStorageFeaturesKHR; + +#define VK_KHR_incremental_present 1 +#define VK_KHR_INCREMENTAL_PRESENT_SPEC_VERSION 1 +#define VK_KHR_INCREMENTAL_PRESENT_EXTENSION_NAME "VK_KHR_incremental_present" + +typedef struct VkRectLayerKHR { + VkOffset2D offset; + VkExtent2D extent; + uint32_t layer; +} VkRectLayerKHR; + +typedef struct VkPresentRegionKHR { + uint32_t rectangleCount; + const VkRectLayerKHR *pRectangles; +} VkPresentRegionKHR; + +typedef struct VkPresentRegionsKHR { + VkStructureType sType; + const void *pNext; + uint32_t swapchainCount; + const VkPresentRegionKHR *pRegions; +} VkPresentRegionsKHR; + +#define VK_KHR_descriptor_update_template 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDescriptorUpdateTemplateKHR) + +#define VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_SPEC_VERSION 1 +#define VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME "VK_KHR_descriptor_update_template" + +typedef enum VkDescriptorUpdateTemplateTypeKHR { + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR = 0, + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR = 1, + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_BEGIN_RANGE_KHR = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_END_RANGE_KHR = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR, + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_RANGE_SIZE_KHR = (VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR - VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR + 1), + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkDescriptorUpdateTemplateTypeKHR; + +typedef VkFlags VkDescriptorUpdateTemplateCreateFlagsKHR; + +typedef struct VkDescriptorUpdateTemplateEntryKHR { + uint32_t dstBinding; + uint32_t dstArrayElement; + uint32_t descriptorCount; + VkDescriptorType descriptorType; + size_t offset; + size_t stride; +} VkDescriptorUpdateTemplateEntryKHR; + +typedef struct VkDescriptorUpdateTemplateCreateInfoKHR { + VkStructureType sType; + void *pNext; + VkDescriptorUpdateTemplateCreateFlagsKHR flags; + uint32_t descriptorUpdateEntryCount; + const VkDescriptorUpdateTemplateEntryKHR *pDescriptorUpdateEntries; + VkDescriptorUpdateTemplateTypeKHR templateType; + VkDescriptorSetLayout descriptorSetLayout; + VkPipelineBindPoint pipelineBindPoint; + VkPipelineLayout 
pipelineLayout; + uint32_t set; +} VkDescriptorUpdateTemplateCreateInfoKHR; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateDescriptorUpdateTemplateKHR)(VkDevice device, const VkDescriptorUpdateTemplateCreateInfoKHR *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkDescriptorUpdateTemplateKHR *pDescriptorUpdateTemplate); +typedef void(VKAPI_PTR *PFN_vkDestroyDescriptorUpdateTemplateKHR)(VkDevice device, VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, const VkAllocationCallbacks *pAllocator); +typedef void(VKAPI_PTR *PFN_vkUpdateDescriptorSetWithTemplateKHR)(VkDevice device, VkDescriptorSet descriptorSet, VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, const void *pData); +typedef void(VKAPI_PTR *PFN_vkCmdPushDescriptorSetWithTemplateKHR)(VkCommandBuffer commandBuffer, VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, VkPipelineLayout layout, uint32_t set, const void *pData); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDescriptorUpdateTemplateKHR( + VkDevice device, + const VkDescriptorUpdateTemplateCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorUpdateTemplateKHR *pDescriptorUpdateTemplate); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDescriptorUpdateTemplateKHR( + VkDevice device, + VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkUpdateDescriptorSetWithTemplateKHR( + VkDevice device, + VkDescriptorSet descriptorSet, + VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, + const void *pData); + +VKAPI_ATTR void VKAPI_CALL vkCmdPushDescriptorSetWithTemplateKHR( + VkCommandBuffer commandBuffer, + VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, + VkPipelineLayout layout, + uint32_t set, + const void *pData); +#endif + +#define VK_KHR_shared_presentable_image 1 +#define VK_KHR_SHARED_PRESENTABLE_IMAGE_SPEC_VERSION 1 +#define VK_KHR_SHARED_PRESENTABLE_IMAGE_EXTENSION_NAME "VK_KHR_shared_presentable_image" + +typedef struct VkSharedPresentSurfaceCapabilitiesKHR { + VkStructureType sType; + void *pNext; + VkImageUsageFlags sharedPresentSupportedUsageFlags; +} VkSharedPresentSurfaceCapabilitiesKHR; + +typedef VkResult(VKAPI_PTR *PFN_vkGetSwapchainStatusKHR)(VkDevice device, VkSwapchainKHR swapchain); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainStatusKHR( + VkDevice device, + VkSwapchainKHR swapchain); +#endif + +#define VK_KHR_external_fence_capabilities 1 +#define VK_KHR_EXTERNAL_FENCE_CAPABILITIES_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME "VK_KHR_external_fence_capabilities" + +typedef enum VkExternalFenceHandleTypeFlagBitsKHR { + VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR = 0x00000001, + VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR = 0x00000002, + VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR = 0x00000004, + VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT_KHR = 0x00000008, + VK_EXTERNAL_FENCE_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalFenceHandleTypeFlagBitsKHR; +typedef VkFlags VkExternalFenceHandleTypeFlagsKHR; + +typedef enum VkExternalFenceFeatureFlagBitsKHR { + VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT_KHR = 0x00000001, + VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT_KHR = 0x00000002, + VK_EXTERNAL_FENCE_FEATURE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalFenceFeatureFlagBitsKHR; +typedef VkFlags VkExternalFenceFeatureFlagsKHR; + +typedef struct VkPhysicalDeviceExternalFenceInfoKHR { + VkStructureType sType; + const void 
*pNext; + VkExternalFenceHandleTypeFlagBitsKHR handleType; +} VkPhysicalDeviceExternalFenceInfoKHR; + +typedef struct VkExternalFencePropertiesKHR { + VkStructureType sType; + void *pNext; + VkExternalFenceHandleTypeFlagsKHR exportFromImportedHandleTypes; + VkExternalFenceHandleTypeFlagsKHR compatibleHandleTypes; + VkExternalFenceFeatureFlagsKHR externalFenceFeatures; +} VkExternalFencePropertiesKHR; + +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalFencePropertiesKHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalFenceInfoKHR *pExternalFenceInfo, VkExternalFencePropertiesKHR *pExternalFenceProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceExternalFencePropertiesKHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalFenceInfoKHR *pExternalFenceInfo, + VkExternalFencePropertiesKHR *pExternalFenceProperties); +#endif + +#define VK_KHR_external_fence 1 +#define VK_KHR_EXTERNAL_FENCE_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_FENCE_EXTENSION_NAME "VK_KHR_external_fence" + +typedef enum VkFenceImportFlagBitsKHR { + VK_FENCE_IMPORT_TEMPORARY_BIT_KHR = 0x00000001, + VK_FENCE_IMPORT_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkFenceImportFlagBitsKHR; +typedef VkFlags VkFenceImportFlagsKHR; + +typedef struct VkExportFenceCreateInfoKHR { + VkStructureType sType; + const void *pNext; + VkExternalFenceHandleTypeFlagsKHR handleTypes; +} VkExportFenceCreateInfoKHR; + +#define VK_KHR_external_fence_fd 1 +#define VK_KHR_EXTERNAL_FENCE_FD_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME "VK_KHR_external_fence_fd" + +typedef struct VkImportFenceFdInfoKHR { + VkStructureType sType; + const void *pNext; + VkFence fence; + VkFenceImportFlagsKHR flags; + VkExternalFenceHandleTypeFlagBitsKHR handleType; + int fd; +} VkImportFenceFdInfoKHR; + +typedef struct VkFenceGetFdInfoKHR { + VkStructureType sType; + const void *pNext; + VkFence fence; + VkExternalFenceHandleTypeFlagBitsKHR handleType; +} VkFenceGetFdInfoKHR; + +typedef VkResult(VKAPI_PTR *PFN_vkImportFenceFdKHR)(VkDevice device, const VkImportFenceFdInfoKHR *pImportFenceFdInfo); +typedef VkResult(VKAPI_PTR *PFN_vkGetFenceFdKHR)(VkDevice device, const VkFenceGetFdInfoKHR *pGetFdInfo, int *pFd); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkImportFenceFdKHR( + VkDevice device, + const VkImportFenceFdInfoKHR *pImportFenceFdInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetFenceFdKHR( + VkDevice device, + const VkFenceGetFdInfoKHR *pGetFdInfo, + int *pFd); +#endif + +#define VK_KHR_get_surface_capabilities2 1 +#define VK_KHR_GET_SURFACE_CAPABILITIES_2_SPEC_VERSION 1 +#define VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME "VK_KHR_get_surface_capabilities2" + +typedef struct VkPhysicalDeviceSurfaceInfo2KHR { + VkStructureType sType; + const void *pNext; + VkSurfaceKHR surface; +} VkPhysicalDeviceSurfaceInfo2KHR; + +typedef struct VkSurfaceCapabilities2KHR { + VkStructureType sType; + void *pNext; + VkSurfaceCapabilitiesKHR surfaceCapabilities; +} VkSurfaceCapabilities2KHR; + +typedef struct VkSurfaceFormat2KHR { + VkStructureType sType; + void *pNext; + VkSurfaceFormatKHR surfaceFormat; +} VkSurfaceFormat2KHR; + +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, VkSurfaceCapabilities2KHR *pSurfaceCapabilities); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSurfaceInfo2KHR 
*pSurfaceInfo, uint32_t *pSurfaceFormatCount, VkSurfaceFormat2KHR *pSurfaceFormats); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilities2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, + VkSurfaceCapabilities2KHR *pSurfaceCapabilities); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormats2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, + uint32_t *pSurfaceFormatCount, + VkSurfaceFormat2KHR *pSurfaceFormats); +#endif + +#define VK_KHR_variable_pointers 1 +#define VK_KHR_VARIABLE_POINTERS_SPEC_VERSION 1 +#define VK_KHR_VARIABLE_POINTERS_EXTENSION_NAME "VK_KHR_variable_pointers" + +typedef struct VkPhysicalDeviceVariablePointerFeaturesKHR { + VkStructureType sType; + void *pNext; + VkBool32 variablePointersStorageBuffer; + VkBool32 variablePointers; +} VkPhysicalDeviceVariablePointerFeaturesKHR; + +#define VK_KHR_dedicated_allocation 1 +#define VK_KHR_DEDICATED_ALLOCATION_SPEC_VERSION 3 +#define VK_KHR_DEDICATED_ALLOCATION_EXTENSION_NAME "VK_KHR_dedicated_allocation" + +typedef struct VkMemoryDedicatedRequirementsKHR { + VkStructureType sType; + void *pNext; + VkBool32 prefersDedicatedAllocation; + VkBool32 requiresDedicatedAllocation; +} VkMemoryDedicatedRequirementsKHR; + +typedef struct VkMemoryDedicatedAllocateInfoKHR { + VkStructureType sType; + const void *pNext; + VkImage image; + VkBuffer buffer; +} VkMemoryDedicatedAllocateInfoKHR; + +#define VK_KHR_storage_buffer_storage_class 1 +#define VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_SPEC_VERSION 1 +#define VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME "VK_KHR_storage_buffer_storage_class" + +#define VK_KHR_relaxed_block_layout 1 +#define VK_KHR_RELAXED_BLOCK_LAYOUT_SPEC_VERSION 1 +#define VK_KHR_RELAXED_BLOCK_LAYOUT_EXTENSION_NAME "VK_KHR_relaxed_block_layout" + +#define VK_KHR_get_memory_requirements2 1 +#define VK_KHR_GET_MEMORY_REQUIREMENTS_2_SPEC_VERSION 1 +#define VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME "VK_KHR_get_memory_requirements2" + +typedef struct VkBufferMemoryRequirementsInfo2KHR { + VkStructureType sType; + const void *pNext; + VkBuffer buffer; +} VkBufferMemoryRequirementsInfo2KHR; + +typedef struct VkImageMemoryRequirementsInfo2KHR { + VkStructureType sType; + const void *pNext; + VkImage image; +} VkImageMemoryRequirementsInfo2KHR; + +typedef struct VkImageSparseMemoryRequirementsInfo2KHR { + VkStructureType sType; + const void *pNext; + VkImage image; +} VkImageSparseMemoryRequirementsInfo2KHR; + +typedef struct VkMemoryRequirements2KHR { + VkStructureType sType; + void *pNext; + VkMemoryRequirements memoryRequirements; +} VkMemoryRequirements2KHR; + +typedef struct VkSparseImageMemoryRequirements2KHR { + VkStructureType sType; + void *pNext; + VkSparseImageMemoryRequirements memoryRequirements; +} VkSparseImageMemoryRequirements2KHR; + +typedef void(VKAPI_PTR *PFN_vkGetImageMemoryRequirements2KHR)(VkDevice device, const VkImageMemoryRequirementsInfo2KHR *pInfo, VkMemoryRequirements2KHR *pMemoryRequirements); +typedef void(VKAPI_PTR *PFN_vkGetBufferMemoryRequirements2KHR)(VkDevice device, const VkBufferMemoryRequirementsInfo2KHR *pInfo, VkMemoryRequirements2KHR *pMemoryRequirements); +typedef void(VKAPI_PTR *PFN_vkGetImageSparseMemoryRequirements2KHR)(VkDevice device, const VkImageSparseMemoryRequirementsInfo2KHR *pInfo, uint32_t *pSparseMemoryRequirementCount, VkSparseImageMemoryRequirements2KHR *pSparseMemoryRequirements); + +#ifndef VK_NO_PROTOTYPES 
+VKAPI_ATTR void VKAPI_CALL vkGetImageMemoryRequirements2KHR( + VkDevice device, + const VkImageMemoryRequirementsInfo2KHR *pInfo, + VkMemoryRequirements2KHR *pMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements2KHR( + VkDevice device, + const VkBufferMemoryRequirementsInfo2KHR *pInfo, + VkMemoryRequirements2KHR *pMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetImageSparseMemoryRequirements2KHR( + VkDevice device, + const VkImageSparseMemoryRequirementsInfo2KHR *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2KHR *pSparseMemoryRequirements); +#endif + +#define VK_EXT_debug_report 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDebugReportCallbackEXT) + +#define VK_EXT_DEBUG_REPORT_SPEC_VERSION 8 +#define VK_EXT_DEBUG_REPORT_EXTENSION_NAME "VK_EXT_debug_report" +#define VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT +#define VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_CALLBACK_EXT_EXT + +typedef enum VkDebugReportObjectTypeEXT { + VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT = 0, + VK_DEBUG_REPORT_OBJECT_TYPE_INSTANCE_EXT = 1, + VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT = 2, + VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT = 3, + VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT = 4, + VK_DEBUG_REPORT_OBJECT_TYPE_SEMAPHORE_EXT = 5, + VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT = 6, + VK_DEBUG_REPORT_OBJECT_TYPE_FENCE_EXT = 7, + VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_MEMORY_EXT = 8, + VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT = 9, + VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT = 10, + VK_DEBUG_REPORT_OBJECT_TYPE_EVENT_EXT = 11, + VK_DEBUG_REPORT_OBJECT_TYPE_QUERY_POOL_EXT = 12, + VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_VIEW_EXT = 13, + VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_VIEW_EXT = 14, + VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT = 15, + VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT = 16, + VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_LAYOUT_EXT = 17, + VK_DEBUG_REPORT_OBJECT_TYPE_RENDER_PASS_EXT = 18, + VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_EXT = 19, + VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT_EXT = 20, + VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_EXT = 21, + VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_POOL_EXT = 22, + VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_EXT = 23, + VK_DEBUG_REPORT_OBJECT_TYPE_FRAMEBUFFER_EXT = 24, + VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_POOL_EXT = 25, + VK_DEBUG_REPORT_OBJECT_TYPE_SURFACE_KHR_EXT = 26, + VK_DEBUG_REPORT_OBJECT_TYPE_SWAPCHAIN_KHR_EXT = 27, + VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_CALLBACK_EXT_EXT = 28, + VK_DEBUG_REPORT_OBJECT_TYPE_DISPLAY_KHR_EXT = 29, + VK_DEBUG_REPORT_OBJECT_TYPE_DISPLAY_MODE_KHR_EXT = 30, + VK_DEBUG_REPORT_OBJECT_TYPE_OBJECT_TABLE_NVX_EXT = 31, + VK_DEBUG_REPORT_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NVX_EXT = 32, + VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT = 33, + VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR_EXT = 1000085000, + VK_DEBUG_REPORT_OBJECT_TYPE_BEGIN_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, + VK_DEBUG_REPORT_OBJECT_TYPE_END_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT, + VK_DEBUG_REPORT_OBJECT_TYPE_RANGE_SIZE_EXT = (VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT - VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT + 1), + VK_DEBUG_REPORT_OBJECT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDebugReportObjectTypeEXT; + +typedef enum VkDebugReportFlagBitsEXT { + VK_DEBUG_REPORT_INFORMATION_BIT_EXT = 0x00000001, + VK_DEBUG_REPORT_WARNING_BIT_EXT = 0x00000002, + 
VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT = 0x00000004, + VK_DEBUG_REPORT_ERROR_BIT_EXT = 0x00000008, + VK_DEBUG_REPORT_DEBUG_BIT_EXT = 0x00000010, + VK_DEBUG_REPORT_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDebugReportFlagBitsEXT; +typedef VkFlags VkDebugReportFlagsEXT; + +typedef VkBool32(VKAPI_PTR *PFN_vkDebugReportCallbackEXT)( + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char *pLayerPrefix, + const char *pMessage, + void *pUserData); + +typedef struct VkDebugReportCallbackCreateInfoEXT { + VkStructureType sType; + const void *pNext; + VkDebugReportFlagsEXT flags; + PFN_vkDebugReportCallbackEXT pfnCallback; + void *pUserData; +} VkDebugReportCallbackCreateInfoEXT; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateDebugReportCallbackEXT)(VkInstance instance, const VkDebugReportCallbackCreateInfoEXT *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkDebugReportCallbackEXT *pCallback); +typedef void(VKAPI_PTR *PFN_vkDestroyDebugReportCallbackEXT)(VkInstance instance, VkDebugReportCallbackEXT callback, const VkAllocationCallbacks *pAllocator); +typedef void(VKAPI_PTR *PFN_vkDebugReportMessageEXT)(VkInstance instance, VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, uint64_t object, size_t location, int32_t messageCode, const char *pLayerPrefix, const char *pMessage); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDebugReportCallbackEXT( + VkInstance instance, + const VkDebugReportCallbackCreateInfoEXT *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDebugReportCallbackEXT *pCallback); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDebugReportCallbackEXT( + VkInstance instance, + VkDebugReportCallbackEXT callback, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkDebugReportMessageEXT( + VkInstance instance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char *pLayerPrefix, + const char *pMessage); +#endif + +#define VK_NV_glsl_shader 1 +#define VK_NV_GLSL_SHADER_SPEC_VERSION 1 +#define VK_NV_GLSL_SHADER_EXTENSION_NAME "VK_NV_glsl_shader" + +#define VK_EXT_depth_range_unrestricted 1 +#define VK_EXT_DEPTH_RANGE_UNRESTRICTED_SPEC_VERSION 1 +#define VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME "VK_EXT_depth_range_unrestricted" + +#define VK_IMG_filter_cubic 1 +#define VK_IMG_FILTER_CUBIC_SPEC_VERSION 1 +#define VK_IMG_FILTER_CUBIC_EXTENSION_NAME "VK_IMG_filter_cubic" + +#define VK_AMD_rasterization_order 1 +#define VK_AMD_RASTERIZATION_ORDER_SPEC_VERSION 1 +#define VK_AMD_RASTERIZATION_ORDER_EXTENSION_NAME "VK_AMD_rasterization_order" + +typedef enum VkRasterizationOrderAMD { + VK_RASTERIZATION_ORDER_STRICT_AMD = 0, + VK_RASTERIZATION_ORDER_RELAXED_AMD = 1, + VK_RASTERIZATION_ORDER_BEGIN_RANGE_AMD = VK_RASTERIZATION_ORDER_STRICT_AMD, + VK_RASTERIZATION_ORDER_END_RANGE_AMD = VK_RASTERIZATION_ORDER_RELAXED_AMD, + VK_RASTERIZATION_ORDER_RANGE_SIZE_AMD = (VK_RASTERIZATION_ORDER_RELAXED_AMD - VK_RASTERIZATION_ORDER_STRICT_AMD + 1), + VK_RASTERIZATION_ORDER_MAX_ENUM_AMD = 0x7FFFFFFF +} VkRasterizationOrderAMD; + +typedef struct VkPipelineRasterizationStateRasterizationOrderAMD { + VkStructureType sType; + const void *pNext; + VkRasterizationOrderAMD rasterizationOrder; +} VkPipelineRasterizationStateRasterizationOrderAMD; + +#define VK_AMD_shader_trinary_minmax 1 +#define VK_AMD_SHADER_TRINARY_MINMAX_SPEC_VERSION 1 +#define 
VK_AMD_SHADER_TRINARY_MINMAX_EXTENSION_NAME "VK_AMD_shader_trinary_minmax" + +#define VK_AMD_shader_explicit_vertex_parameter 1 +#define VK_AMD_SHADER_EXPLICIT_VERTEX_PARAMETER_SPEC_VERSION 1 +#define VK_AMD_SHADER_EXPLICIT_VERTEX_PARAMETER_EXTENSION_NAME "VK_AMD_shader_explicit_vertex_parameter" + +#define VK_EXT_debug_marker 1 +#define VK_EXT_DEBUG_MARKER_SPEC_VERSION 4 +#define VK_EXT_DEBUG_MARKER_EXTENSION_NAME "VK_EXT_debug_marker" + +typedef struct VkDebugMarkerObjectNameInfoEXT { + VkStructureType sType; + const void *pNext; + VkDebugReportObjectTypeEXT objectType; + uint64_t object; + const char *pObjectName; +} VkDebugMarkerObjectNameInfoEXT; + +typedef struct VkDebugMarkerObjectTagInfoEXT { + VkStructureType sType; + const void *pNext; + VkDebugReportObjectTypeEXT objectType; + uint64_t object; + uint64_t tagName; + size_t tagSize; + const void *pTag; +} VkDebugMarkerObjectTagInfoEXT; + +typedef struct VkDebugMarkerMarkerInfoEXT { + VkStructureType sType; + const void *pNext; + const char *pMarkerName; + float color[4]; +} VkDebugMarkerMarkerInfoEXT; + +typedef VkResult(VKAPI_PTR *PFN_vkDebugMarkerSetObjectTagEXT)(VkDevice device, const VkDebugMarkerObjectTagInfoEXT *pTagInfo); +typedef VkResult(VKAPI_PTR *PFN_vkDebugMarkerSetObjectNameEXT)(VkDevice device, const VkDebugMarkerObjectNameInfoEXT *pNameInfo); +typedef void(VKAPI_PTR *PFN_vkCmdDebugMarkerBeginEXT)(VkCommandBuffer commandBuffer, const VkDebugMarkerMarkerInfoEXT *pMarkerInfo); +typedef void(VKAPI_PTR *PFN_vkCmdDebugMarkerEndEXT)(VkCommandBuffer commandBuffer); +typedef void(VKAPI_PTR *PFN_vkCmdDebugMarkerInsertEXT)(VkCommandBuffer commandBuffer, const VkDebugMarkerMarkerInfoEXT *pMarkerInfo); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkDebugMarkerSetObjectTagEXT( + VkDevice device, + const VkDebugMarkerObjectTagInfoEXT *pTagInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkDebugMarkerSetObjectNameEXT( + VkDevice device, + const VkDebugMarkerObjectNameInfoEXT *pNameInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerBeginEXT( + VkCommandBuffer commandBuffer, + const VkDebugMarkerMarkerInfoEXT *pMarkerInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerEndEXT( + VkCommandBuffer commandBuffer); + +VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerInsertEXT( + VkCommandBuffer commandBuffer, + const VkDebugMarkerMarkerInfoEXT *pMarkerInfo); +#endif + +#define VK_AMD_gcn_shader 1 +#define VK_AMD_GCN_SHADER_SPEC_VERSION 1 +#define VK_AMD_GCN_SHADER_EXTENSION_NAME "VK_AMD_gcn_shader" + +#define VK_NV_dedicated_allocation 1 +#define VK_NV_DEDICATED_ALLOCATION_SPEC_VERSION 1 +#define VK_NV_DEDICATED_ALLOCATION_EXTENSION_NAME "VK_NV_dedicated_allocation" + +typedef struct VkDedicatedAllocationImageCreateInfoNV { + VkStructureType sType; + const void *pNext; + VkBool32 dedicatedAllocation; +} VkDedicatedAllocationImageCreateInfoNV; + +typedef struct VkDedicatedAllocationBufferCreateInfoNV { + VkStructureType sType; + const void *pNext; + VkBool32 dedicatedAllocation; +} VkDedicatedAllocationBufferCreateInfoNV; + +typedef struct VkDedicatedAllocationMemoryAllocateInfoNV { + VkStructureType sType; + const void *pNext; + VkImage image; + VkBuffer buffer; +} VkDedicatedAllocationMemoryAllocateInfoNV; + +#define VK_AMD_draw_indirect_count 1 +#define VK_AMD_DRAW_INDIRECT_COUNT_SPEC_VERSION 1 +#define VK_AMD_DRAW_INDIRECT_COUNT_EXTENSION_NAME "VK_AMD_draw_indirect_count" + +typedef void(VKAPI_PTR *PFN_vkCmdDrawIndirectCountAMD)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize 
countBufferOffset, uint32_t maxDrawCount, uint32_t stride); +typedef void(VKAPI_PTR *PFN_vkCmdDrawIndexedIndirectCountAMD)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirectCountAMD( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride); + +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirectCountAMD( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride); +#endif + +#define VK_AMD_negative_viewport_height 1 +#define VK_AMD_NEGATIVE_VIEWPORT_HEIGHT_SPEC_VERSION 1 +#define VK_AMD_NEGATIVE_VIEWPORT_HEIGHT_EXTENSION_NAME "VK_AMD_negative_viewport_height" + +#define VK_AMD_gpu_shader_half_float 1 +#define VK_AMD_GPU_SHADER_HALF_FLOAT_SPEC_VERSION 1 +#define VK_AMD_GPU_SHADER_HALF_FLOAT_EXTENSION_NAME "VK_AMD_gpu_shader_half_float" + +#define VK_AMD_shader_ballot 1 +#define VK_AMD_SHADER_BALLOT_SPEC_VERSION 1 +#define VK_AMD_SHADER_BALLOT_EXTENSION_NAME "VK_AMD_shader_ballot" + +#define VK_AMD_texture_gather_bias_lod 1 +#define VK_AMD_TEXTURE_GATHER_BIAS_LOD_SPEC_VERSION 1 +#define VK_AMD_TEXTURE_GATHER_BIAS_LOD_EXTENSION_NAME "VK_AMD_texture_gather_bias_lod" + +typedef struct VkTextureLODGatherFormatPropertiesAMD { + VkStructureType sType; + void *pNext; + VkBool32 supportsTextureGatherLODBiasAMD; +} VkTextureLODGatherFormatPropertiesAMD; + +#define VK_KHX_multiview 1 +#define VK_KHX_MULTIVIEW_SPEC_VERSION 1 +#define VK_KHX_MULTIVIEW_EXTENSION_NAME "VK_KHX_multiview" + +typedef struct VkRenderPassMultiviewCreateInfoKHX { + VkStructureType sType; + const void *pNext; + uint32_t subpassCount; + const uint32_t *pViewMasks; + uint32_t dependencyCount; + const int32_t *pViewOffsets; + uint32_t correlationMaskCount; + const uint32_t *pCorrelationMasks; +} VkRenderPassMultiviewCreateInfoKHX; + +typedef struct VkPhysicalDeviceMultiviewFeaturesKHX { + VkStructureType sType; + void *pNext; + VkBool32 multiview; + VkBool32 multiviewGeometryShader; + VkBool32 multiviewTessellationShader; +} VkPhysicalDeviceMultiviewFeaturesKHX; + +typedef struct VkPhysicalDeviceMultiviewPropertiesKHX { + VkStructureType sType; + void *pNext; + uint32_t maxMultiviewViewCount; + uint32_t maxMultiviewInstanceIndex; +} VkPhysicalDeviceMultiviewPropertiesKHX; + +#define VK_IMG_format_pvrtc 1 +#define VK_IMG_FORMAT_PVRTC_SPEC_VERSION 1 +#define VK_IMG_FORMAT_PVRTC_EXTENSION_NAME "VK_IMG_format_pvrtc" + +#define VK_NV_external_memory_capabilities 1 +#define VK_NV_EXTERNAL_MEMORY_CAPABILITIES_SPEC_VERSION 1 +#define VK_NV_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME "VK_NV_external_memory_capabilities" + +typedef enum VkExternalMemoryHandleTypeFlagBitsNV { + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_NV = 0x00000001, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_NV = 0x00000002, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_IMAGE_BIT_NV = 0x00000004, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_IMAGE_KMT_BIT_NV = 0x00000008, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_NV = 0x7FFFFFFF +} VkExternalMemoryHandleTypeFlagBitsNV; +typedef VkFlags VkExternalMemoryHandleTypeFlagsNV; + +typedef enum VkExternalMemoryFeatureFlagBitsNV { + VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT_NV = 0x00000001, + 
VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_NV = 0x00000002, + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_NV = 0x00000004, + VK_EXTERNAL_MEMORY_FEATURE_FLAG_BITS_MAX_ENUM_NV = 0x7FFFFFFF +} VkExternalMemoryFeatureFlagBitsNV; +typedef VkFlags VkExternalMemoryFeatureFlagsNV; + +typedef struct VkExternalImageFormatPropertiesNV { + VkImageFormatProperties imageFormatProperties; + VkExternalMemoryFeatureFlagsNV externalMemoryFeatures; + VkExternalMemoryHandleTypeFlagsNV exportFromImportedHandleTypes; + VkExternalMemoryHandleTypeFlagsNV compatibleHandleTypes; +} VkExternalImageFormatPropertiesNV; + +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalImageFormatPropertiesNV)(VkPhysicalDevice physicalDevice, VkFormat format, VkImageType type, VkImageTiling tiling, VkImageUsageFlags usage, VkImageCreateFlags flags, VkExternalMemoryHandleTypeFlagsNV externalHandleType, VkExternalImageFormatPropertiesNV *pExternalImageFormatProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceExternalImageFormatPropertiesNV( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkImageCreateFlags flags, + VkExternalMemoryHandleTypeFlagsNV externalHandleType, + VkExternalImageFormatPropertiesNV *pExternalImageFormatProperties); +#endif + +#define VK_NV_external_memory 1 +#define VK_NV_EXTERNAL_MEMORY_SPEC_VERSION 1 +#define VK_NV_EXTERNAL_MEMORY_EXTENSION_NAME "VK_NV_external_memory" + +typedef struct VkExternalMemoryImageCreateInfoNV { + VkStructureType sType; + const void *pNext; + VkExternalMemoryHandleTypeFlagsNV handleTypes; +} VkExternalMemoryImageCreateInfoNV; + +typedef struct VkExportMemoryAllocateInfoNV { + VkStructureType sType; + const void *pNext; + VkExternalMemoryHandleTypeFlagsNV handleTypes; +} VkExportMemoryAllocateInfoNV; + +#define VK_KHX_device_group 1 +#define VK_KHX_DEVICE_GROUP_SPEC_VERSION 1 +#define VK_KHX_DEVICE_GROUP_EXTENSION_NAME "VK_KHX_device_group" +#define VK_MAX_DEVICE_GROUP_SIZE_KHX 32 + +typedef enum VkPeerMemoryFeatureFlagBitsKHX { + VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT_KHX = 0x00000001, + VK_PEER_MEMORY_FEATURE_COPY_DST_BIT_KHX = 0x00000002, + VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT_KHX = 0x00000004, + VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT_KHX = 0x00000008, + VK_PEER_MEMORY_FEATURE_FLAG_BITS_MAX_ENUM_KHX = 0x7FFFFFFF +} VkPeerMemoryFeatureFlagBitsKHX; +typedef VkFlags VkPeerMemoryFeatureFlagsKHX; + +typedef enum VkMemoryAllocateFlagBitsKHX { + VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT_KHX = 0x00000001, + VK_MEMORY_ALLOCATE_FLAG_BITS_MAX_ENUM_KHX = 0x7FFFFFFF +} VkMemoryAllocateFlagBitsKHX; +typedef VkFlags VkMemoryAllocateFlagsKHX; + +typedef enum VkDeviceGroupPresentModeFlagBitsKHX { + VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHX = 0x00000001, + VK_DEVICE_GROUP_PRESENT_MODE_REMOTE_BIT_KHX = 0x00000002, + VK_DEVICE_GROUP_PRESENT_MODE_SUM_BIT_KHX = 0x00000004, + VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_MULTI_DEVICE_BIT_KHX = 0x00000008, + VK_DEVICE_GROUP_PRESENT_MODE_FLAG_BITS_MAX_ENUM_KHX = 0x7FFFFFFF +} VkDeviceGroupPresentModeFlagBitsKHX; +typedef VkFlags VkDeviceGroupPresentModeFlagsKHX; + +typedef struct VkMemoryAllocateFlagsInfoKHX { + VkStructureType sType; + const void *pNext; + VkMemoryAllocateFlagsKHX flags; + uint32_t deviceMask; +} VkMemoryAllocateFlagsInfoKHX; + +typedef struct VkBindBufferMemoryInfoKHX { + VkStructureType sType; + const void *pNext; + VkBuffer buffer; + VkDeviceMemory memory; + VkDeviceSize memoryOffset; + uint32_t deviceIndexCount; + const uint32_t 
*pDeviceIndices; +} VkBindBufferMemoryInfoKHX; + +typedef struct VkBindImageMemoryInfoKHX { + VkStructureType sType; + const void *pNext; + VkImage image; + VkDeviceMemory memory; + VkDeviceSize memoryOffset; + uint32_t deviceIndexCount; + const uint32_t *pDeviceIndices; + uint32_t SFRRectCount; + const VkRect2D *pSFRRects; +} VkBindImageMemoryInfoKHX; + +typedef struct VkDeviceGroupRenderPassBeginInfoKHX { + VkStructureType sType; + const void *pNext; + uint32_t deviceMask; + uint32_t deviceRenderAreaCount; + const VkRect2D *pDeviceRenderAreas; +} VkDeviceGroupRenderPassBeginInfoKHX; + +typedef struct VkDeviceGroupCommandBufferBeginInfoKHX { + VkStructureType sType; + const void *pNext; + uint32_t deviceMask; +} VkDeviceGroupCommandBufferBeginInfoKHX; + +typedef struct VkDeviceGroupSubmitInfoKHX { + VkStructureType sType; + const void *pNext; + uint32_t waitSemaphoreCount; + const uint32_t *pWaitSemaphoreDeviceIndices; + uint32_t commandBufferCount; + const uint32_t *pCommandBufferDeviceMasks; + uint32_t signalSemaphoreCount; + const uint32_t *pSignalSemaphoreDeviceIndices; +} VkDeviceGroupSubmitInfoKHX; + +typedef struct VkDeviceGroupBindSparseInfoKHX { + VkStructureType sType; + const void *pNext; + uint32_t resourceDeviceIndex; + uint32_t memoryDeviceIndex; +} VkDeviceGroupBindSparseInfoKHX; + +typedef struct VkDeviceGroupPresentCapabilitiesKHX { + VkStructureType sType; + const void *pNext; + uint32_t presentMask[VK_MAX_DEVICE_GROUP_SIZE_KHX]; + VkDeviceGroupPresentModeFlagsKHX modes; +} VkDeviceGroupPresentCapabilitiesKHX; + +typedef struct VkImageSwapchainCreateInfoKHX { + VkStructureType sType; + const void *pNext; + VkSwapchainKHR swapchain; +} VkImageSwapchainCreateInfoKHX; + +typedef struct VkBindImageMemorySwapchainInfoKHX { + VkStructureType sType; + const void *pNext; + VkSwapchainKHR swapchain; + uint32_t imageIndex; +} VkBindImageMemorySwapchainInfoKHX; + +typedef struct VkAcquireNextImageInfoKHX { + VkStructureType sType; + const void *pNext; + VkSwapchainKHR swapchain; + uint64_t timeout; + VkSemaphore semaphore; + VkFence fence; + uint32_t deviceMask; +} VkAcquireNextImageInfoKHX; + +typedef struct VkDeviceGroupPresentInfoKHX { + VkStructureType sType; + const void *pNext; + uint32_t swapchainCount; + const uint32_t *pDeviceMasks; + VkDeviceGroupPresentModeFlagBitsKHX mode; +} VkDeviceGroupPresentInfoKHX; + +typedef struct VkDeviceGroupSwapchainCreateInfoKHX { + VkStructureType sType; + const void *pNext; + VkDeviceGroupPresentModeFlagsKHX modes; +} VkDeviceGroupSwapchainCreateInfoKHX; + +typedef void(VKAPI_PTR *PFN_vkGetDeviceGroupPeerMemoryFeaturesKHX)(VkDevice device, uint32_t heapIndex, uint32_t localDeviceIndex, uint32_t remoteDeviceIndex, VkPeerMemoryFeatureFlagsKHX *pPeerMemoryFeatures); +typedef VkResult(VKAPI_PTR *PFN_vkBindBufferMemory2KHX)(VkDevice device, uint32_t bindInfoCount, const VkBindBufferMemoryInfoKHX *pBindInfos); +typedef VkResult(VKAPI_PTR *PFN_vkBindImageMemory2KHX)(VkDevice device, uint32_t bindInfoCount, const VkBindImageMemoryInfoKHX *pBindInfos); +typedef void(VKAPI_PTR *PFN_vkCmdSetDeviceMaskKHX)(VkCommandBuffer commandBuffer, uint32_t deviceMask); +typedef void(VKAPI_PTR *PFN_vkCmdDispatchBaseKHX)(VkCommandBuffer commandBuffer, uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ); +typedef VkResult(VKAPI_PTR *PFN_vkGetDeviceGroupPresentCapabilitiesKHX)(VkDevice device, VkDeviceGroupPresentCapabilitiesKHX *pDeviceGroupPresentCapabilities); +typedef VkResult(VKAPI_PTR 
*PFN_vkGetDeviceGroupSurfacePresentModesKHX)(VkDevice device, VkSurfaceKHR surface, VkDeviceGroupPresentModeFlagsKHX *pModes); +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDevicePresentRectanglesKHX)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t *pRectCount, VkRect2D *pRects); +typedef VkResult(VKAPI_PTR *PFN_vkAcquireNextImage2KHX)(VkDevice device, const VkAcquireNextImageInfoKHX *pAcquireInfo, uint32_t *pImageIndex); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetDeviceGroupPeerMemoryFeaturesKHX( + VkDevice device, + uint32_t heapIndex, + uint32_t localDeviceIndex, + uint32_t remoteDeviceIndex, + VkPeerMemoryFeatureFlagsKHX *pPeerMemoryFeatures); + +VKAPI_ATTR VkResult VKAPI_CALL vkBindBufferMemory2KHX( + VkDevice device, + uint32_t bindInfoCount, + const VkBindBufferMemoryInfoKHX *pBindInfos); + +VKAPI_ATTR VkResult VKAPI_CALL vkBindImageMemory2KHX( + VkDevice device, + uint32_t bindInfoCount, + const VkBindImageMemoryInfoKHX *pBindInfos); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetDeviceMaskKHX( + VkCommandBuffer commandBuffer, + uint32_t deviceMask); + +VKAPI_ATTR void VKAPI_CALL vkCmdDispatchBaseKHX( + VkCommandBuffer commandBuffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetDeviceGroupPresentCapabilitiesKHX( + VkDevice device, + VkDeviceGroupPresentCapabilitiesKHX *pDeviceGroupPresentCapabilities); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetDeviceGroupSurfacePresentModesKHX( + VkDevice device, + VkSurfaceKHR surface, + VkDeviceGroupPresentModeFlagsKHX *pModes); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDevicePresentRectanglesKHX( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t *pRectCount, + VkRect2D *pRects); + +VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImage2KHX( + VkDevice device, + const VkAcquireNextImageInfoKHX *pAcquireInfo, + uint32_t *pImageIndex); +#endif + +#define VK_EXT_validation_flags 1 +#define VK_EXT_VALIDATION_FLAGS_SPEC_VERSION 1 +#define VK_EXT_VALIDATION_FLAGS_EXTENSION_NAME "VK_EXT_validation_flags" + +typedef enum VkValidationCheckEXT { + VK_VALIDATION_CHECK_ALL_EXT = 0, + VK_VALIDATION_CHECK_SHADERS_EXT = 1, + VK_VALIDATION_CHECK_BEGIN_RANGE_EXT = VK_VALIDATION_CHECK_ALL_EXT, + VK_VALIDATION_CHECK_END_RANGE_EXT = VK_VALIDATION_CHECK_SHADERS_EXT, + VK_VALIDATION_CHECK_RANGE_SIZE_EXT = (VK_VALIDATION_CHECK_SHADERS_EXT - VK_VALIDATION_CHECK_ALL_EXT + 1), + VK_VALIDATION_CHECK_MAX_ENUM_EXT = 0x7FFFFFFF +} VkValidationCheckEXT; + +typedef struct VkValidationFlagsEXT { + VkStructureType sType; + const void *pNext; + uint32_t disabledValidationCheckCount; + VkValidationCheckEXT *pDisabledValidationChecks; +} VkValidationFlagsEXT; + +#ifdef VK_USE_PLATFORM_VI_NN +#define VK_NN_vi_surface 1 +#define VK_NN_VI_SURFACE_SPEC_VERSION 1 +#define VK_NN_VI_SURFACE_EXTENSION_NAME "VK_NN_vi_surface" + +typedef VkFlags VkViSurfaceCreateFlagsNN; + +typedef struct VkViSurfaceCreateInfoNN { + VkStructureType sType; + const void *pNext; + VkViSurfaceCreateFlagsNN flags; + void *window; +} VkViSurfaceCreateInfoNN; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateViSurfaceNN)(VkInstance instance, const VkViSurfaceCreateInfoNN *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSurfaceKHR *pSurface); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateViSurfaceNN( + VkInstance instance, + const VkViSurfaceCreateInfoNN *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + 
VkSurfaceKHR *pSurface); +#endif +#endif /* VK_USE_PLATFORM_VI_NN */ + +#define VK_EXT_shader_subgroup_ballot 1 +#define VK_EXT_SHADER_SUBGROUP_BALLOT_SPEC_VERSION 1 +#define VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME "VK_EXT_shader_subgroup_ballot" + +#define VK_EXT_shader_subgroup_vote 1 +#define VK_EXT_SHADER_SUBGROUP_VOTE_SPEC_VERSION 1 +#define VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME "VK_EXT_shader_subgroup_vote" + +#define VK_KHX_device_group_creation 1 +#define VK_KHX_DEVICE_GROUP_CREATION_SPEC_VERSION 1 +#define VK_KHX_DEVICE_GROUP_CREATION_EXTENSION_NAME "VK_KHX_device_group_creation" + +typedef struct VkPhysicalDeviceGroupPropertiesKHX { + VkStructureType sType; + void *pNext; + uint32_t physicalDeviceCount; + VkPhysicalDevice physicalDevices[VK_MAX_DEVICE_GROUP_SIZE_KHX]; + VkBool32 subsetAllocation; +} VkPhysicalDeviceGroupPropertiesKHX; + +typedef struct VkDeviceGroupDeviceCreateInfoKHX { + VkStructureType sType; + const void *pNext; + uint32_t physicalDeviceCount; + const VkPhysicalDevice *pPhysicalDevices; +} VkDeviceGroupDeviceCreateInfoKHX; + +typedef VkResult(VKAPI_PTR *PFN_vkEnumeratePhysicalDeviceGroupsKHX)(VkInstance instance, uint32_t *pPhysicalDeviceGroupCount, VkPhysicalDeviceGroupPropertiesKHX *pPhysicalDeviceGroupProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDeviceGroupsKHX( + VkInstance instance, + uint32_t *pPhysicalDeviceGroupCount, + VkPhysicalDeviceGroupPropertiesKHX *pPhysicalDeviceGroupProperties); +#endif + +#define VK_NVX_device_generated_commands 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkObjectTableNVX) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkIndirectCommandsLayoutNVX) + +#define VK_NVX_DEVICE_GENERATED_COMMANDS_SPEC_VERSION 3 +#define VK_NVX_DEVICE_GENERATED_COMMANDS_EXTENSION_NAME "VK_NVX_device_generated_commands" + +typedef enum VkIndirectCommandsTokenTypeNVX { + VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX = 0, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DESCRIPTOR_SET_NVX = 1, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NVX = 2, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NVX = 3, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NVX = 4, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NVX = 5, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NVX = 6, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX = 7, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_BEGIN_RANGE_NVX = VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_END_RANGE_NVX = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_RANGE_SIZE_NVX = (VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX - VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX + 1), + VK_INDIRECT_COMMANDS_TOKEN_TYPE_MAX_ENUM_NVX = 0x7FFFFFFF +} VkIndirectCommandsTokenTypeNVX; + +typedef enum VkObjectEntryTypeNVX { + VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX = 0, + VK_OBJECT_ENTRY_TYPE_PIPELINE_NVX = 1, + VK_OBJECT_ENTRY_TYPE_INDEX_BUFFER_NVX = 2, + VK_OBJECT_ENTRY_TYPE_VERTEX_BUFFER_NVX = 3, + VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX = 4, + VK_OBJECT_ENTRY_TYPE_BEGIN_RANGE_NVX = VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX, + VK_OBJECT_ENTRY_TYPE_END_RANGE_NVX = VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX, + VK_OBJECT_ENTRY_TYPE_RANGE_SIZE_NVX = (VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX - VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX + 1), + VK_OBJECT_ENTRY_TYPE_MAX_ENUM_NVX = 0x7FFFFFFF +} VkObjectEntryTypeNVX; + +typedef enum VkIndirectCommandsLayoutUsageFlagBitsNVX { + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_UNORDERED_SEQUENCES_BIT_NVX = 0x00000001, + 
VK_INDIRECT_COMMANDS_LAYOUT_USAGE_SPARSE_SEQUENCES_BIT_NVX = 0x00000002, + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EMPTY_EXECUTIONS_BIT_NVX = 0x00000004, + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_INDEXED_SEQUENCES_BIT_NVX = 0x00000008, + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_FLAG_BITS_MAX_ENUM_NVX = 0x7FFFFFFF +} VkIndirectCommandsLayoutUsageFlagBitsNVX; +typedef VkFlags VkIndirectCommandsLayoutUsageFlagsNVX; + +typedef enum VkObjectEntryUsageFlagBitsNVX { + VK_OBJECT_ENTRY_USAGE_GRAPHICS_BIT_NVX = 0x00000001, + VK_OBJECT_ENTRY_USAGE_COMPUTE_BIT_NVX = 0x00000002, + VK_OBJECT_ENTRY_USAGE_FLAG_BITS_MAX_ENUM_NVX = 0x7FFFFFFF +} VkObjectEntryUsageFlagBitsNVX; +typedef VkFlags VkObjectEntryUsageFlagsNVX; + +typedef struct VkDeviceGeneratedCommandsFeaturesNVX { + VkStructureType sType; + const void *pNext; + VkBool32 computeBindingPointSupport; +} VkDeviceGeneratedCommandsFeaturesNVX; + +typedef struct VkDeviceGeneratedCommandsLimitsNVX { + VkStructureType sType; + const void *pNext; + uint32_t maxIndirectCommandsLayoutTokenCount; + uint32_t maxObjectEntryCounts; + uint32_t minSequenceCountBufferOffsetAlignment; + uint32_t minSequenceIndexBufferOffsetAlignment; + uint32_t minCommandsTokenBufferOffsetAlignment; +} VkDeviceGeneratedCommandsLimitsNVX; + +typedef struct VkIndirectCommandsTokenNVX { + VkIndirectCommandsTokenTypeNVX tokenType; + VkBuffer buffer; + VkDeviceSize offset; +} VkIndirectCommandsTokenNVX; + +typedef struct VkIndirectCommandsLayoutTokenNVX { + VkIndirectCommandsTokenTypeNVX tokenType; + uint32_t bindingUnit; + uint32_t dynamicCount; + uint32_t divisor; +} VkIndirectCommandsLayoutTokenNVX; + +typedef struct VkIndirectCommandsLayoutCreateInfoNVX { + VkStructureType sType; + const void *pNext; + VkPipelineBindPoint pipelineBindPoint; + VkIndirectCommandsLayoutUsageFlagsNVX flags; + uint32_t tokenCount; + const VkIndirectCommandsLayoutTokenNVX *pTokens; +} VkIndirectCommandsLayoutCreateInfoNVX; + +typedef struct VkCmdProcessCommandsInfoNVX { + VkStructureType sType; + const void *pNext; + VkObjectTableNVX objectTable; + VkIndirectCommandsLayoutNVX indirectCommandsLayout; + uint32_t indirectCommandsTokenCount; + const VkIndirectCommandsTokenNVX *pIndirectCommandsTokens; + uint32_t maxSequencesCount; + VkCommandBuffer targetCommandBuffer; + VkBuffer sequencesCountBuffer; + VkDeviceSize sequencesCountOffset; + VkBuffer sequencesIndexBuffer; + VkDeviceSize sequencesIndexOffset; +} VkCmdProcessCommandsInfoNVX; + +typedef struct VkCmdReserveSpaceForCommandsInfoNVX { + VkStructureType sType; + const void *pNext; + VkObjectTableNVX objectTable; + VkIndirectCommandsLayoutNVX indirectCommandsLayout; + uint32_t maxSequencesCount; +} VkCmdReserveSpaceForCommandsInfoNVX; + +typedef struct VkObjectTableCreateInfoNVX { + VkStructureType sType; + const void *pNext; + uint32_t objectCount; + const VkObjectEntryTypeNVX *pObjectEntryTypes; + const uint32_t *pObjectEntryCounts; + const VkObjectEntryUsageFlagsNVX *pObjectEntryUsageFlags; + uint32_t maxUniformBuffersPerDescriptor; + uint32_t maxStorageBuffersPerDescriptor; + uint32_t maxStorageImagesPerDescriptor; + uint32_t maxSampledImagesPerDescriptor; + uint32_t maxPipelineLayouts; +} VkObjectTableCreateInfoNVX; + +typedef struct VkObjectTableEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; +} VkObjectTableEntryNVX; + +typedef struct VkObjectTablePipelineEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkPipeline pipeline; +} VkObjectTablePipelineEntryNVX; + +typedef struct VkObjectTableDescriptorSetEntryNVX 
{ + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkPipelineLayout pipelineLayout; + VkDescriptorSet descriptorSet; +} VkObjectTableDescriptorSetEntryNVX; + +typedef struct VkObjectTableVertexBufferEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkBuffer buffer; +} VkObjectTableVertexBufferEntryNVX; + +typedef struct VkObjectTableIndexBufferEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkBuffer buffer; + VkIndexType indexType; +} VkObjectTableIndexBufferEntryNVX; + +typedef struct VkObjectTablePushConstantEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkPipelineLayout pipelineLayout; + VkShaderStageFlags stageFlags; +} VkObjectTablePushConstantEntryNVX; + +typedef void(VKAPI_PTR *PFN_vkCmdProcessCommandsNVX)(VkCommandBuffer commandBuffer, const VkCmdProcessCommandsInfoNVX *pProcessCommandsInfo); +typedef void(VKAPI_PTR *PFN_vkCmdReserveSpaceForCommandsNVX)(VkCommandBuffer commandBuffer, const VkCmdReserveSpaceForCommandsInfoNVX *pReserveSpaceInfo); +typedef VkResult(VKAPI_PTR *PFN_vkCreateIndirectCommandsLayoutNVX)(VkDevice device, const VkIndirectCommandsLayoutCreateInfoNVX *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkIndirectCommandsLayoutNVX *pIndirectCommandsLayout); +typedef void(VKAPI_PTR *PFN_vkDestroyIndirectCommandsLayoutNVX)(VkDevice device, VkIndirectCommandsLayoutNVX indirectCommandsLayout, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkCreateObjectTableNVX)(VkDevice device, const VkObjectTableCreateInfoNVX *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkObjectTableNVX *pObjectTable); +typedef void(VKAPI_PTR *PFN_vkDestroyObjectTableNVX)(VkDevice device, VkObjectTableNVX objectTable, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkRegisterObjectsNVX)(VkDevice device, VkObjectTableNVX objectTable, uint32_t objectCount, const VkObjectTableEntryNVX *const *ppObjectTableEntries, const uint32_t *pObjectIndices); +typedef VkResult(VKAPI_PTR *PFN_vkUnregisterObjectsNVX)(VkDevice device, VkObjectTableNVX objectTable, uint32_t objectCount, const VkObjectEntryTypeNVX *pObjectEntryTypes, const uint32_t *pObjectIndices); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceGeneratedCommandsPropertiesNVX)(VkPhysicalDevice physicalDevice, VkDeviceGeneratedCommandsFeaturesNVX *pFeatures, VkDeviceGeneratedCommandsLimitsNVX *pLimits); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdProcessCommandsNVX( + VkCommandBuffer commandBuffer, + const VkCmdProcessCommandsInfoNVX *pProcessCommandsInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdReserveSpaceForCommandsNVX( + VkCommandBuffer commandBuffer, + const VkCmdReserveSpaceForCommandsInfoNVX *pReserveSpaceInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateIndirectCommandsLayoutNVX( + VkDevice device, + const VkIndirectCommandsLayoutCreateInfoNVX *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkIndirectCommandsLayoutNVX *pIndirectCommandsLayout); + +VKAPI_ATTR void VKAPI_CALL vkDestroyIndirectCommandsLayoutNVX( + VkDevice device, + VkIndirectCommandsLayoutNVX indirectCommandsLayout, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateObjectTableNVX( + VkDevice device, + const VkObjectTableCreateInfoNVX *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkObjectTableNVX *pObjectTable); + +VKAPI_ATTR void VKAPI_CALL vkDestroyObjectTableNVX( + VkDevice device, + VkObjectTableNVX objectTable, + const 
VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkRegisterObjectsNVX( + VkDevice device, + VkObjectTableNVX objectTable, + uint32_t objectCount, + const VkObjectTableEntryNVX *const *ppObjectTableEntries, + const uint32_t *pObjectIndices); + +VKAPI_ATTR VkResult VKAPI_CALL vkUnregisterObjectsNVX( + VkDevice device, + VkObjectTableNVX objectTable, + uint32_t objectCount, + const VkObjectEntryTypeNVX *pObjectEntryTypes, + const uint32_t *pObjectIndices); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceGeneratedCommandsPropertiesNVX( + VkPhysicalDevice physicalDevice, + VkDeviceGeneratedCommandsFeaturesNVX *pFeatures, + VkDeviceGeneratedCommandsLimitsNVX *pLimits); +#endif + +#define VK_NV_clip_space_w_scaling 1 +#define VK_NV_CLIP_SPACE_W_SCALING_SPEC_VERSION 1 +#define VK_NV_CLIP_SPACE_W_SCALING_EXTENSION_NAME "VK_NV_clip_space_w_scaling" + +typedef struct VkViewportWScalingNV { + float xcoeff; + float ycoeff; +} VkViewportWScalingNV; + +typedef struct VkPipelineViewportWScalingStateCreateInfoNV { + VkStructureType sType; + const void *pNext; + VkBool32 viewportWScalingEnable; + uint32_t viewportCount; + const VkViewportWScalingNV *pViewportWScalings; +} VkPipelineViewportWScalingStateCreateInfoNV; + +typedef void(VKAPI_PTR *PFN_vkCmdSetViewportWScalingNV)(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, const VkViewportWScalingNV *pViewportWScalings); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdSetViewportWScalingNV( + VkCommandBuffer commandBuffer, + uint32_t firstViewport, + uint32_t viewportCount, + const VkViewportWScalingNV *pViewportWScalings); +#endif + +#define VK_EXT_direct_mode_display 1 +#define VK_EXT_DIRECT_MODE_DISPLAY_SPEC_VERSION 1 +#define VK_EXT_DIRECT_MODE_DISPLAY_EXTENSION_NAME "VK_EXT_direct_mode_display" + +typedef VkResult(VKAPI_PTR *PFN_vkReleaseDisplayEXT)(VkPhysicalDevice physicalDevice, VkDisplayKHR display); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkReleaseDisplayEXT( + VkPhysicalDevice physicalDevice, + VkDisplayKHR display); +#endif + +#define VK_EXT_display_surface_counter 1 +#define VK_EXT_DISPLAY_SURFACE_COUNTER_SPEC_VERSION 1 +#define VK_EXT_DISPLAY_SURFACE_COUNTER_EXTENSION_NAME "VK_EXT_display_surface_counter" +#define VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES2_EXT VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_EXT + +typedef enum VkSurfaceCounterFlagBitsEXT { + VK_SURFACE_COUNTER_VBLANK_EXT = 0x00000001, + VK_SURFACE_COUNTER_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF +} VkSurfaceCounterFlagBitsEXT; +typedef VkFlags VkSurfaceCounterFlagsEXT; + +typedef struct VkSurfaceCapabilities2EXT { + VkStructureType sType; + void *pNext; + uint32_t minImageCount; + uint32_t maxImageCount; + VkExtent2D currentExtent; + VkExtent2D minImageExtent; + VkExtent2D maxImageExtent; + uint32_t maxImageArrayLayers; + VkSurfaceTransformFlagsKHR supportedTransforms; + VkSurfaceTransformFlagBitsKHR currentTransform; + VkCompositeAlphaFlagsKHR supportedCompositeAlpha; + VkImageUsageFlags supportedUsageFlags; + VkSurfaceCounterFlagsEXT supportedSurfaceCounters; +} VkSurfaceCapabilities2EXT; + +typedef VkResult(VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceCapabilities2EXT)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, VkSurfaceCapabilities2EXT *pSurfaceCapabilities); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilities2EXT( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + VkSurfaceCapabilities2EXT *pSurfaceCapabilities); +#endif + +#define 
VK_EXT_display_control 1 +#define VK_EXT_DISPLAY_CONTROL_SPEC_VERSION 1 +#define VK_EXT_DISPLAY_CONTROL_EXTENSION_NAME "VK_EXT_display_control" + +typedef enum VkDisplayPowerStateEXT { + VK_DISPLAY_POWER_STATE_OFF_EXT = 0, + VK_DISPLAY_POWER_STATE_SUSPEND_EXT = 1, + VK_DISPLAY_POWER_STATE_ON_EXT = 2, + VK_DISPLAY_POWER_STATE_BEGIN_RANGE_EXT = VK_DISPLAY_POWER_STATE_OFF_EXT, + VK_DISPLAY_POWER_STATE_END_RANGE_EXT = VK_DISPLAY_POWER_STATE_ON_EXT, + VK_DISPLAY_POWER_STATE_RANGE_SIZE_EXT = (VK_DISPLAY_POWER_STATE_ON_EXT - VK_DISPLAY_POWER_STATE_OFF_EXT + 1), + VK_DISPLAY_POWER_STATE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDisplayPowerStateEXT; + +typedef enum VkDeviceEventTypeEXT { + VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT = 0, + VK_DEVICE_EVENT_TYPE_BEGIN_RANGE_EXT = VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT, + VK_DEVICE_EVENT_TYPE_END_RANGE_EXT = VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT, + VK_DEVICE_EVENT_TYPE_RANGE_SIZE_EXT = (VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT - VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT + 1), + VK_DEVICE_EVENT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDeviceEventTypeEXT; + +typedef enum VkDisplayEventTypeEXT { + VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT = 0, + VK_DISPLAY_EVENT_TYPE_BEGIN_RANGE_EXT = VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT, + VK_DISPLAY_EVENT_TYPE_END_RANGE_EXT = VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT, + VK_DISPLAY_EVENT_TYPE_RANGE_SIZE_EXT = (VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT - VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT + 1), + VK_DISPLAY_EVENT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDisplayEventTypeEXT; + +typedef struct VkDisplayPowerInfoEXT { + VkStructureType sType; + const void *pNext; + VkDisplayPowerStateEXT powerState; +} VkDisplayPowerInfoEXT; + +typedef struct VkDeviceEventInfoEXT { + VkStructureType sType; + const void *pNext; + VkDeviceEventTypeEXT deviceEvent; +} VkDeviceEventInfoEXT; + +typedef struct VkDisplayEventInfoEXT { + VkStructureType sType; + const void *pNext; + VkDisplayEventTypeEXT displayEvent; +} VkDisplayEventInfoEXT; + +typedef struct VkSwapchainCounterCreateInfoEXT { + VkStructureType sType; + const void *pNext; + VkSurfaceCounterFlagsEXT surfaceCounters; +} VkSwapchainCounterCreateInfoEXT; + +typedef VkResult(VKAPI_PTR *PFN_vkDisplayPowerControlEXT)(VkDevice device, VkDisplayKHR display, const VkDisplayPowerInfoEXT *pDisplayPowerInfo); +typedef VkResult(VKAPI_PTR *PFN_vkRegisterDeviceEventEXT)(VkDevice device, const VkDeviceEventInfoEXT *pDeviceEventInfo, const VkAllocationCallbacks *pAllocator, VkFence *pFence); +typedef VkResult(VKAPI_PTR *PFN_vkRegisterDisplayEventEXT)(VkDevice device, VkDisplayKHR display, const VkDisplayEventInfoEXT *pDisplayEventInfo, const VkAllocationCallbacks *pAllocator, VkFence *pFence); +typedef VkResult(VKAPI_PTR *PFN_vkGetSwapchainCounterEXT)(VkDevice device, VkSwapchainKHR swapchain, VkSurfaceCounterFlagBitsEXT counter, uint64_t *pCounterValue); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkDisplayPowerControlEXT( + VkDevice device, + VkDisplayKHR display, + const VkDisplayPowerInfoEXT *pDisplayPowerInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkRegisterDeviceEventEXT( + VkDevice device, + const VkDeviceEventInfoEXT *pDeviceEventInfo, + const VkAllocationCallbacks *pAllocator, + VkFence *pFence); + +VKAPI_ATTR VkResult VKAPI_CALL vkRegisterDisplayEventEXT( + VkDevice device, + VkDisplayKHR display, + const VkDisplayEventInfoEXT *pDisplayEventInfo, + const VkAllocationCallbacks *pAllocator, + VkFence *pFence); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainCounterEXT( + VkDevice 
device, + VkSwapchainKHR swapchain, + VkSurfaceCounterFlagBitsEXT counter, + uint64_t *pCounterValue); +#endif + +#define VK_GOOGLE_display_timing 1 +#define VK_GOOGLE_DISPLAY_TIMING_SPEC_VERSION 1 +#define VK_GOOGLE_DISPLAY_TIMING_EXTENSION_NAME "VK_GOOGLE_display_timing" + +typedef struct VkRefreshCycleDurationGOOGLE { + uint64_t refreshDuration; +} VkRefreshCycleDurationGOOGLE; + +typedef struct VkPastPresentationTimingGOOGLE { + uint32_t presentID; + uint64_t desiredPresentTime; + uint64_t actualPresentTime; + uint64_t earliestPresentTime; + uint64_t presentMargin; +} VkPastPresentationTimingGOOGLE; + +typedef struct VkPresentTimeGOOGLE { + uint32_t presentID; + uint64_t desiredPresentTime; +} VkPresentTimeGOOGLE; + +typedef struct VkPresentTimesInfoGOOGLE { + VkStructureType sType; + const void *pNext; + uint32_t swapchainCount; + const VkPresentTimeGOOGLE *pTimes; +} VkPresentTimesInfoGOOGLE; + +typedef VkResult(VKAPI_PTR *PFN_vkGetRefreshCycleDurationGOOGLE)(VkDevice device, VkSwapchainKHR swapchain, VkRefreshCycleDurationGOOGLE *pDisplayTimingProperties); +typedef VkResult(VKAPI_PTR *PFN_vkGetPastPresentationTimingGOOGLE)(VkDevice device, VkSwapchainKHR swapchain, uint32_t *pPresentationTimingCount, VkPastPresentationTimingGOOGLE *pPresentationTimings); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetRefreshCycleDurationGOOGLE( + VkDevice device, + VkSwapchainKHR swapchain, + VkRefreshCycleDurationGOOGLE *pDisplayTimingProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPastPresentationTimingGOOGLE( + VkDevice device, + VkSwapchainKHR swapchain, + uint32_t *pPresentationTimingCount, + VkPastPresentationTimingGOOGLE *pPresentationTimings); +#endif + +#define VK_NV_sample_mask_override_coverage 1 +#define VK_NV_SAMPLE_MASK_OVERRIDE_COVERAGE_SPEC_VERSION 1 +#define VK_NV_SAMPLE_MASK_OVERRIDE_COVERAGE_EXTENSION_NAME "VK_NV_sample_mask_override_coverage" + +#define VK_NV_geometry_shader_passthrough 1 +#define VK_NV_GEOMETRY_SHADER_PASSTHROUGH_SPEC_VERSION 1 +#define VK_NV_GEOMETRY_SHADER_PASSTHROUGH_EXTENSION_NAME "VK_NV_geometry_shader_passthrough" + +#define VK_NV_viewport_array2 1 +#define VK_NV_VIEWPORT_ARRAY2_SPEC_VERSION 1 +#define VK_NV_VIEWPORT_ARRAY2_EXTENSION_NAME "VK_NV_viewport_array2" + +#define VK_NVX_multiview_per_view_attributes 1 +#define VK_NVX_MULTIVIEW_PER_VIEW_ATTRIBUTES_SPEC_VERSION 1 +#define VK_NVX_MULTIVIEW_PER_VIEW_ATTRIBUTES_EXTENSION_NAME "VK_NVX_multiview_per_view_attributes" + +typedef struct VkPhysicalDeviceMultiviewPerViewAttributesPropertiesNVX { + VkStructureType sType; + void *pNext; + VkBool32 perViewPositionAllComponents; +} VkPhysicalDeviceMultiviewPerViewAttributesPropertiesNVX; + +#define VK_NV_viewport_swizzle 1 +#define VK_NV_VIEWPORT_SWIZZLE_SPEC_VERSION 1 +#define VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME "VK_NV_viewport_swizzle" + +typedef enum VkViewportCoordinateSwizzleNV { + VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV = 0, + VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_X_NV = 1, + VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_Y_NV = 2, + VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_Y_NV = 3, + VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_Z_NV = 4, + VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_Z_NV = 5, + VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_W_NV = 6, + VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV = 7, + VK_VIEWPORT_COORDINATE_SWIZZLE_BEGIN_RANGE_NV = VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV, + VK_VIEWPORT_COORDINATE_SWIZZLE_END_RANGE_NV = VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV, + VK_VIEWPORT_COORDINATE_SWIZZLE_RANGE_SIZE_NV = 
(VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV - VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV + 1), + VK_VIEWPORT_COORDINATE_SWIZZLE_MAX_ENUM_NV = 0x7FFFFFFF +} VkViewportCoordinateSwizzleNV; + +typedef VkFlags VkPipelineViewportSwizzleStateCreateFlagsNV; + +typedef struct VkViewportSwizzleNV { + VkViewportCoordinateSwizzleNV x; + VkViewportCoordinateSwizzleNV y; + VkViewportCoordinateSwizzleNV z; + VkViewportCoordinateSwizzleNV w; +} VkViewportSwizzleNV; + +typedef struct VkPipelineViewportSwizzleStateCreateInfoNV { + VkStructureType sType; + const void *pNext; + VkPipelineViewportSwizzleStateCreateFlagsNV flags; + uint32_t viewportCount; + const VkViewportSwizzleNV *pViewportSwizzles; +} VkPipelineViewportSwizzleStateCreateInfoNV; + +#define VK_EXT_discard_rectangles 1 +#define VK_EXT_DISCARD_RECTANGLES_SPEC_VERSION 1 +#define VK_EXT_DISCARD_RECTANGLES_EXTENSION_NAME "VK_EXT_discard_rectangles" + +typedef enum VkDiscardRectangleModeEXT { + VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT = 0, + VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT = 1, + VK_DISCARD_RECTANGLE_MODE_BEGIN_RANGE_EXT = VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT, + VK_DISCARD_RECTANGLE_MODE_END_RANGE_EXT = VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT, + VK_DISCARD_RECTANGLE_MODE_RANGE_SIZE_EXT = (VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT - VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT + 1), + VK_DISCARD_RECTANGLE_MODE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDiscardRectangleModeEXT; + +typedef VkFlags VkPipelineDiscardRectangleStateCreateFlagsEXT; + +typedef struct VkPhysicalDeviceDiscardRectanglePropertiesEXT { + VkStructureType sType; + void *pNext; + uint32_t maxDiscardRectangles; +} VkPhysicalDeviceDiscardRectanglePropertiesEXT; + +typedef struct VkPipelineDiscardRectangleStateCreateInfoEXT { + VkStructureType sType; + const void *pNext; + VkPipelineDiscardRectangleStateCreateFlagsEXT flags; + VkDiscardRectangleModeEXT discardRectangleMode; + uint32_t discardRectangleCount; + const VkRect2D *pDiscardRectangles; +} VkPipelineDiscardRectangleStateCreateInfoEXT; + +typedef void(VKAPI_PTR *PFN_vkCmdSetDiscardRectangleEXT)(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle, uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdSetDiscardRectangleEXT( + VkCommandBuffer commandBuffer, + uint32_t firstDiscardRectangle, + uint32_t discardRectangleCount, + const VkRect2D *pDiscardRectangles); +#endif + +#define VK_EXT_swapchain_colorspace 1 +#define VK_EXT_SWAPCHAIN_COLOR_SPACE_SPEC_VERSION 3 +#define VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME "VK_EXT_swapchain_colorspace" + +#define VK_EXT_hdr_metadata 1 +#define VK_EXT_HDR_METADATA_SPEC_VERSION 1 +#define VK_EXT_HDR_METADATA_EXTENSION_NAME "VK_EXT_hdr_metadata" + +typedef struct VkXYColorEXT { + float x; + float y; +} VkXYColorEXT; + +typedef struct VkHdrMetadataEXT { + VkStructureType sType; + const void *pNext; + VkXYColorEXT displayPrimaryRed; + VkXYColorEXT displayPrimaryGreen; + VkXYColorEXT displayPrimaryBlue; + VkXYColorEXT whitePoint; + float maxLuminance; + float minLuminance; + float maxContentLightLevel; + float maxFrameAverageLightLevel; +} VkHdrMetadataEXT; + +typedef void(VKAPI_PTR *PFN_vkSetHdrMetadataEXT)(VkDevice device, uint32_t swapchainCount, const VkSwapchainKHR *pSwapchains, const VkHdrMetadataEXT *pMetadata); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkSetHdrMetadataEXT( + VkDevice device, + uint32_t swapchainCount, + const VkSwapchainKHR *pSwapchains, + const VkHdrMetadataEXT *pMetadata); 
+#endif + +#ifdef VK_USE_PLATFORM_IOS_MVK +#define VK_MVK_ios_surface 1 +#define VK_MVK_IOS_SURFACE_SPEC_VERSION 2 +#define VK_MVK_IOS_SURFACE_EXTENSION_NAME "VK_MVK_ios_surface" + +typedef VkFlags VkIOSSurfaceCreateFlagsMVK; + +typedef struct VkIOSSurfaceCreateInfoMVK { + VkStructureType sType; + const void *pNext; + VkIOSSurfaceCreateFlagsMVK flags; + const void *pView; +} VkIOSSurfaceCreateInfoMVK; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateIOSSurfaceMVK)(VkInstance instance, const VkIOSSurfaceCreateInfoMVK *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSurfaceKHR *pSurface); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateIOSSurfaceMVK( + VkInstance instance, + const VkIOSSurfaceCreateInfoMVK *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkSurfaceKHR *pSurface); +#endif +#endif /* VK_USE_PLATFORM_IOS_MVK */ + +#ifdef VK_USE_PLATFORM_MACOS_MVK +#define VK_MVK_macos_surface 1 +#define VK_MVK_MACOS_SURFACE_SPEC_VERSION 2 +#define VK_MVK_MACOS_SURFACE_EXTENSION_NAME "VK_MVK_macos_surface" + +typedef VkFlags VkMacOSSurfaceCreateFlagsMVK; + +typedef struct VkMacOSSurfaceCreateInfoMVK { + VkStructureType sType; + const void *pNext; + VkMacOSSurfaceCreateFlagsMVK flags; + const void *pView; +} VkMacOSSurfaceCreateInfoMVK; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateMacOSSurfaceMVK)(VkInstance instance, const VkMacOSSurfaceCreateInfoMVK *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSurfaceKHR *pSurface); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateMacOSSurfaceMVK( + VkInstance instance, + const VkMacOSSurfaceCreateInfoMVK *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkSurfaceKHR *pSurface); +#endif +#endif /* VK_USE_PLATFORM_MACOS_MVK */ + +#define VK_EXT_sampler_filter_minmax 1 +#define VK_EXT_SAMPLER_FILTER_MINMAX_SPEC_VERSION 1 +#define VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME "VK_EXT_sampler_filter_minmax" + +typedef enum VkSamplerReductionModeEXT { + VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT = 0, + VK_SAMPLER_REDUCTION_MODE_MIN_EXT = 1, + VK_SAMPLER_REDUCTION_MODE_MAX_EXT = 2, + VK_SAMPLER_REDUCTION_MODE_BEGIN_RANGE_EXT = VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT, + VK_SAMPLER_REDUCTION_MODE_END_RANGE_EXT = VK_SAMPLER_REDUCTION_MODE_MAX_EXT, + VK_SAMPLER_REDUCTION_MODE_RANGE_SIZE_EXT = (VK_SAMPLER_REDUCTION_MODE_MAX_EXT - VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT + 1), + VK_SAMPLER_REDUCTION_MODE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkSamplerReductionModeEXT; + +typedef struct VkSamplerReductionModeCreateInfoEXT { + VkStructureType sType; + const void *pNext; + VkSamplerReductionModeEXT reductionMode; +} VkSamplerReductionModeCreateInfoEXT; + +typedef struct VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT { + VkStructureType sType; + void *pNext; + VkBool32 filterMinmaxSingleComponentFormats; + VkBool32 filterMinmaxImageComponentMapping; +} VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT; + +#define VK_AMD_gpu_shader_int16 1 +#define VK_AMD_GPU_SHADER_INT16_SPEC_VERSION 1 +#define VK_AMD_GPU_SHADER_INT16_EXTENSION_NAME "VK_AMD_gpu_shader_int16" + +#define VK_AMD_mixed_attachment_samples 1 +#define VK_AMD_MIXED_ATTACHMENT_SAMPLES_SPEC_VERSION 1 +#define VK_AMD_MIXED_ATTACHMENT_SAMPLES_EXTENSION_NAME "VK_AMD_mixed_attachment_samples" + +#define VK_AMD_shader_fragment_mask 1 +#define VK_AMD_SHADER_FRAGMENT_MASK_SPEC_VERSION 1 +#define VK_AMD_SHADER_FRAGMENT_MASK_EXTENSION_NAME "VK_AMD_shader_fragment_mask" + +#define VK_EXT_shader_stencil_export 1 +#define 
VK_EXT_SHADER_STENCIL_EXPORT_SPEC_VERSION 1 +#define VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME "VK_EXT_shader_stencil_export" + +#define VK_EXT_sample_locations 1 +#define VK_EXT_SAMPLE_LOCATIONS_SPEC_VERSION 1 +#define VK_EXT_SAMPLE_LOCATIONS_EXTENSION_NAME "VK_EXT_sample_locations" + +typedef struct VkSampleLocationEXT { + float x; + float y; +} VkSampleLocationEXT; + +typedef struct VkSampleLocationsInfoEXT { + VkStructureType sType; + const void *pNext; + VkSampleCountFlagBits sampleLocationsPerPixel; + VkExtent2D sampleLocationGridSize; + uint32_t sampleLocationsCount; + const VkSampleLocationEXT *pSampleLocations; +} VkSampleLocationsInfoEXT; + +typedef struct VkAttachmentSampleLocationsEXT { + uint32_t attachmentIndex; + VkSampleLocationsInfoEXT sampleLocationsInfo; +} VkAttachmentSampleLocationsEXT; + +typedef struct VkSubpassSampleLocationsEXT { + uint32_t subpassIndex; + VkSampleLocationsInfoEXT sampleLocationsInfo; +} VkSubpassSampleLocationsEXT; + +typedef struct VkRenderPassSampleLocationsBeginInfoEXT { + VkStructureType sType; + const void *pNext; + uint32_t attachmentInitialSampleLocationsCount; + const VkAttachmentSampleLocationsEXT *pAttachmentInitialSampleLocations; + uint32_t postSubpassSampleLocationsCount; + const VkSubpassSampleLocationsEXT *pSubpassSampleLocations; +} VkRenderPassSampleLocationsBeginInfoEXT; + +typedef struct VkPipelineSampleLocationsStateCreateInfoEXT { + VkStructureType sType; + const void *pNext; + VkBool32 sampleLocationsEnable; + VkSampleLocationsInfoEXT sampleLocationsInfo; +} VkPipelineSampleLocationsStateCreateInfoEXT; + +typedef struct VkPhysicalDeviceSampleLocationsPropertiesEXT { + VkStructureType sType; + void *pNext; + VkSampleCountFlags sampleLocationSampleCounts; + VkExtent2D maxSampleLocationGridSize; + float sampleLocationCoordinateRange[2]; + uint32_t sampleLocationSubPixelBits; + VkBool32 variableSampleLocations; +} VkPhysicalDeviceSampleLocationsPropertiesEXT; + +typedef struct VkMultisamplePropertiesEXT { + VkStructureType sType; + void *pNext; + VkExtent2D maxSampleLocationGridSize; +} VkMultisamplePropertiesEXT; + +typedef void(VKAPI_PTR *PFN_vkCmdSetSampleLocationsEXT)(VkCommandBuffer commandBuffer, const VkSampleLocationsInfoEXT *pSampleLocationsInfo); +typedef void(VKAPI_PTR *PFN_vkGetPhysicalDeviceMultisamplePropertiesEXT)(VkPhysicalDevice physicalDevice, VkSampleCountFlagBits samples, VkMultisamplePropertiesEXT *pMultisampleProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdSetSampleLocationsEXT( + VkCommandBuffer commandBuffer, + const VkSampleLocationsInfoEXT *pSampleLocationsInfo); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceMultisamplePropertiesEXT( + VkPhysicalDevice physicalDevice, + VkSampleCountFlagBits samples, + VkMultisamplePropertiesEXT *pMultisampleProperties); +#endif + +#define VK_EXT_blend_operation_advanced 1 +#define VK_EXT_BLEND_OPERATION_ADVANCED_SPEC_VERSION 2 +#define VK_EXT_BLEND_OPERATION_ADVANCED_EXTENSION_NAME "VK_EXT_blend_operation_advanced" + +typedef enum VkBlendOverlapEXT { + VK_BLEND_OVERLAP_UNCORRELATED_EXT = 0, + VK_BLEND_OVERLAP_DISJOINT_EXT = 1, + VK_BLEND_OVERLAP_CONJOINT_EXT = 2, + VK_BLEND_OVERLAP_BEGIN_RANGE_EXT = VK_BLEND_OVERLAP_UNCORRELATED_EXT, + VK_BLEND_OVERLAP_END_RANGE_EXT = VK_BLEND_OVERLAP_CONJOINT_EXT, + VK_BLEND_OVERLAP_RANGE_SIZE_EXT = (VK_BLEND_OVERLAP_CONJOINT_EXT - VK_BLEND_OVERLAP_UNCORRELATED_EXT + 1), + VK_BLEND_OVERLAP_MAX_ENUM_EXT = 0x7FFFFFFF +} VkBlendOverlapEXT; + +typedef struct VkPhysicalDeviceBlendOperationAdvancedFeaturesEXT { 
+ VkStructureType sType; + void *pNext; + VkBool32 advancedBlendCoherentOperations; +} VkPhysicalDeviceBlendOperationAdvancedFeaturesEXT; + +typedef struct VkPhysicalDeviceBlendOperationAdvancedPropertiesEXT { + VkStructureType sType; + void *pNext; + uint32_t advancedBlendMaxColorAttachments; + VkBool32 advancedBlendIndependentBlend; + VkBool32 advancedBlendNonPremultipliedSrcColor; + VkBool32 advancedBlendNonPremultipliedDstColor; + VkBool32 advancedBlendCorrelatedOverlap; + VkBool32 advancedBlendAllOperations; +} VkPhysicalDeviceBlendOperationAdvancedPropertiesEXT; + +typedef struct VkPipelineColorBlendAdvancedStateCreateInfoEXT { + VkStructureType sType; + const void *pNext; + VkBool32 srcPremultiplied; + VkBool32 dstPremultiplied; + VkBlendOverlapEXT blendOverlap; +} VkPipelineColorBlendAdvancedStateCreateInfoEXT; + +#define VK_NV_fragment_coverage_to_color 1 +#define VK_NV_FRAGMENT_COVERAGE_TO_COLOR_SPEC_VERSION 1 +#define VK_NV_FRAGMENT_COVERAGE_TO_COLOR_EXTENSION_NAME "VK_NV_fragment_coverage_to_color" + +typedef VkFlags VkPipelineCoverageToColorStateCreateFlagsNV; + +typedef struct VkPipelineCoverageToColorStateCreateInfoNV { + VkStructureType sType; + const void *pNext; + VkPipelineCoverageToColorStateCreateFlagsNV flags; + VkBool32 coverageToColorEnable; + uint32_t coverageToColorLocation; +} VkPipelineCoverageToColorStateCreateInfoNV; + +#define VK_NV_framebuffer_mixed_samples 1 +#define VK_NV_FRAMEBUFFER_MIXED_SAMPLES_SPEC_VERSION 1 +#define VK_NV_FRAMEBUFFER_MIXED_SAMPLES_EXTENSION_NAME "VK_NV_framebuffer_mixed_samples" + +typedef enum VkCoverageModulationModeNV { + VK_COVERAGE_MODULATION_MODE_NONE_NV = 0, + VK_COVERAGE_MODULATION_MODE_RGB_NV = 1, + VK_COVERAGE_MODULATION_MODE_ALPHA_NV = 2, + VK_COVERAGE_MODULATION_MODE_RGBA_NV = 3, + VK_COVERAGE_MODULATION_MODE_BEGIN_RANGE_NV = VK_COVERAGE_MODULATION_MODE_NONE_NV, + VK_COVERAGE_MODULATION_MODE_END_RANGE_NV = VK_COVERAGE_MODULATION_MODE_RGBA_NV, + VK_COVERAGE_MODULATION_MODE_RANGE_SIZE_NV = (VK_COVERAGE_MODULATION_MODE_RGBA_NV - VK_COVERAGE_MODULATION_MODE_NONE_NV + 1), + VK_COVERAGE_MODULATION_MODE_MAX_ENUM_NV = 0x7FFFFFFF +} VkCoverageModulationModeNV; + +typedef VkFlags VkPipelineCoverageModulationStateCreateFlagsNV; + +typedef struct VkPipelineCoverageModulationStateCreateInfoNV { + VkStructureType sType; + const void *pNext; + VkPipelineCoverageModulationStateCreateFlagsNV flags; + VkCoverageModulationModeNV coverageModulationMode; + VkBool32 coverageModulationTableEnable; + uint32_t coverageModulationTableCount; + const float *pCoverageModulationTable; +} VkPipelineCoverageModulationStateCreateInfoNV; + +#define VK_NV_fill_rectangle 1 +#define VK_NV_FILL_RECTANGLE_SPEC_VERSION 1 +#define VK_NV_FILL_RECTANGLE_EXTENSION_NAME "VK_NV_fill_rectangle" + +#define VK_EXT_post_depth_coverage 1 +#define VK_EXT_POST_DEPTH_COVERAGE_SPEC_VERSION 1 +#define VK_EXT_POST_DEPTH_COVERAGE_EXTENSION_NAME "VK_EXT_post_depth_coverage" + +#define VK_EXT_validation_cache 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkValidationCacheEXT) + +#define VK_EXT_VALIDATION_CACHE_SPEC_VERSION 1 +#define VK_EXT_VALIDATION_CACHE_EXTENSION_NAME "VK_EXT_validation_cache" + +typedef enum VkValidationCacheHeaderVersionEXT { + VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT = 1, + VK_VALIDATION_CACHE_HEADER_VERSION_BEGIN_RANGE_EXT = VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT, + VK_VALIDATION_CACHE_HEADER_VERSION_END_RANGE_EXT = VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT, + VK_VALIDATION_CACHE_HEADER_VERSION_RANGE_SIZE_EXT = (VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT - 
VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT + 1), + VK_VALIDATION_CACHE_HEADER_VERSION_MAX_ENUM_EXT = 0x7FFFFFFF +} VkValidationCacheHeaderVersionEXT; + +typedef VkFlags VkValidationCacheCreateFlagsEXT; + +typedef struct VkValidationCacheCreateInfoEXT { + VkStructureType sType; + const void *pNext; + VkValidationCacheCreateFlagsEXT flags; + size_t initialDataSize; + const void *pInitialData; +} VkValidationCacheCreateInfoEXT; + +typedef struct VkShaderModuleValidationCacheCreateInfoEXT { + VkStructureType sType; + const void *pNext; + VkValidationCacheEXT validationCache; +} VkShaderModuleValidationCacheCreateInfoEXT; + +typedef VkResult(VKAPI_PTR *PFN_vkCreateValidationCacheEXT)(VkDevice device, const VkValidationCacheCreateInfoEXT *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkValidationCacheEXT *pValidationCache); +typedef void(VKAPI_PTR *PFN_vkDestroyValidationCacheEXT)(VkDevice device, VkValidationCacheEXT validationCache, const VkAllocationCallbacks *pAllocator); +typedef VkResult(VKAPI_PTR *PFN_vkMergeValidationCachesEXT)(VkDevice device, VkValidationCacheEXT dstCache, uint32_t srcCacheCount, const VkValidationCacheEXT *pSrcCaches); +typedef VkResult(VKAPI_PTR *PFN_vkGetValidationCacheDataEXT)(VkDevice device, VkValidationCacheEXT validationCache, size_t *pDataSize, void *pData); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateValidationCacheEXT( + VkDevice device, + const VkValidationCacheCreateInfoEXT *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkValidationCacheEXT *pValidationCache); + +VKAPI_ATTR void VKAPI_CALL vkDestroyValidationCacheEXT( + VkDevice device, + VkValidationCacheEXT validationCache, + const VkAllocationCallbacks *pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkMergeValidationCachesEXT( + VkDevice device, + VkValidationCacheEXT dstCache, + uint32_t srcCacheCount, + const VkValidationCacheEXT *pSrcCaches); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetValidationCacheDataEXT( + VkDevice device, + VkValidationCacheEXT validationCache, + size_t *pDataSize, + void *pData); +#endif + +#define VK_EXT_shader_viewport_index_layer 1 +#define VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_SPEC_VERSION 1 +#define VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME "VK_EXT_shader_viewport_index_layer" + +} // extern "C" + +#endif // HALIDE_MINI_VULKAN_H diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp new file mode 100644 index 000000000000..bd8e935d771f --- /dev/null +++ b/src/runtime/vulkan.cpp @@ -0,0 +1,847 @@ +#include "HalideRuntimeVulkan.h" + +#include "device_buffer_utils.h" +#include "device_interface.h" +#include "runtime_internal.h" +#include "vulkan_resources.h" +#include "vulkan_context.h" +#include "vulkan_extensions.h" +#include "vulkan_internal.h" +#include "vulkan_memory.h" + +using namespace Halide::Runtime::Internal::Vulkan; + +// -------------------------------------------------------------------------- + +extern "C" { + +// -------------------------------------------------------------------------- + +// The default implementation of halide_acquire_vulkan_context uses +// the global pointers above, and serializes access with a spin lock. +// Overriding implementations of acquire/release must implement the +// following behavior: + +// - halide_acquire_vulkan_context should always store a valid +// instance/device/queue in the corresponding out parameters, +// or return an error code. +// - A call to halide_acquire_vulkan_context is followed by a matching +// call to halide_release_vulkan_context. 
halide_acquire_vulkan_context +// should block while a previous call (if any) has not yet been +// released via halide_release_vulkan_context. +WEAK int halide_vulkan_acquire_context(void *user_context, + halide_vulkan_memory_allocator **allocator, + VkInstance *instance, + VkDevice *device, + VkPhysicalDevice *physical_device, + VkCommandPool *command_pool, + VkQueue *queue, + uint32_t *queue_family_index, + bool create) { + + halide_abort_if_false(user_context, instance != nullptr); + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, queue != nullptr); + halide_abort_if_false(user_context, &thread_lock != nullptr); + while (__atomic_test_and_set(&thread_lock, __ATOMIC_ACQUIRE)) {} + + // If the context has not been initialized, initialize it now. + halide_abort_if_false(user_context, &cached_instance != nullptr); + halide_abort_if_false(user_context, &cached_device != nullptr); + halide_abort_if_false(user_context, &cached_queue != nullptr); + halide_abort_if_false(user_context, &cached_physical_device != nullptr); + if ((cached_instance == nullptr) && create) { + int result = vk_create_context(user_context, + reinterpret_cast(&cached_allocator), + &cached_instance, + &cached_device, + &cached_physical_device, + &cached_command_pool, + &cached_queue, + &cached_queue_family_index); + if (result != halide_error_code_success) { + __atomic_clear(&thread_lock, __ATOMIC_RELEASE); + return result; + } + } + + *allocator = cached_allocator; + *instance = cached_instance; + *device = cached_device; + *physical_device = cached_physical_device; + *command_pool = cached_command_pool; + *queue = cached_queue; + *queue_family_index = cached_queue_family_index; + return 0; +} + +WEAK int halide_vulkan_release_context(void *user_context, VkInstance instance, VkDevice device, VkQueue queue) { + __atomic_clear(&thread_lock, __ATOMIC_RELEASE); + return 0; +} + +WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_buffer) { + // halide_vulkan_device_free, at present, can be exposed to clients and they + // should be allowed to call halide_vulkan_device_free on any halide_buffer_t + // including ones that have never been used with a GPU. 
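+// (A device field of zero means there is no Vulkan allocation associated with this
+// buffer, so the check below simply returns success without touching the context.)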
+ if (halide_buffer->device == 0) { + return 0; + } + + VulkanContext ctx(user_context); + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + // get the allocated region for the device + MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); + if (ctx.allocator && device_region && device_region->handle) { + ctx.allocator->reclaim(user_context, device_region); + } + halide_buffer->device = 0; + halide_buffer->device_interface->impl->release_module(); + halide_buffer->device_interface = nullptr; + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return 0; +} + +WEAK int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, const char *src, int size) { + debug(user_context) + << "Vulkan: halide_vulkan_init_kernels (user_context: " << user_context + << ", state_ptr: " << state_ptr + << ", program: " << (void *)src + << ", size: " << size << "\n"; + + VulkanContext ctx(user_context); + if (ctx.error != VK_SUCCESS) { + return ctx.error; + } + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + debug(user_context) << "halide_vulkan_initialize_kernels got compilation_cache mutex.\n"; + VkShaderModule *shader_module = nullptr; + if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.device, shader_module, + Halide::Runtime::Internal::Vulkan::vk_compile_shader_module, + user_context, ctx.allocator, src, size)) { + return halide_error_code_generic_error; + } + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return 0; +} + +WEAK void halide_vulkan_finalize_kernels(void *user_context, void *state_ptr) { + debug(user_context) + << "Vulkan: halide_vulkan_finalize_kernels (user_context: " << user_context + << ", state_ptr: " << state_ptr << "\n"; + VulkanContext ctx(user_context); + if (ctx.error == VK_SUCCESS) { + compilation_cache.release_hold(user_context, ctx.device, state_ptr); + } +} + +// Used to generate correct timings when tracing +WEAK int halide_vulkan_device_sync(void *user_context, halide_buffer_t *) { + debug(user_context) << "Vulkan: halide_vulkan_device_sync (user_context: " << user_context << ")\n"; + + VulkanContext ctx(user_context); + halide_debug_assert(user_context, ctx.error == VK_SUCCESS); + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + vkQueueWaitIdle(ctx.queue); + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return VK_SUCCESS; +} + +WEAK int halide_vulkan_device_release(void *user_context) { + debug(user_context) + << "Vulkan: halide_vulkan_device_release (user_context: " << user_context << ")\n"; + + VulkanMemoryAllocator *allocator; + VkInstance instance; + VkDevice device; + VkCommandPool command_pool; + VkPhysicalDevice physical_device; + VkQueue queue; + uint32_t _throwaway; + + int acquire_status = halide_vulkan_acquire_context(user_context, + reinterpret_cast(&allocator), + &instance, &device, &physical_device, &command_pool, &queue, &_throwaway, false); + halide_debug_assert(user_context, acquire_status == VK_SUCCESS); + (void)acquire_status; + if (instance != nullptr) { + + vkQueueWaitIdle(queue); + 
vk_destroy_command_pool(user_context, allocator, command_pool); + vk_destroy_shader_modules(user_context, allocator); + vk_destroy_memory_allocator(user_context, allocator); + + if (device == cached_device) { + cached_device = nullptr; + cached_physical_device = nullptr; + cached_queue = nullptr; + } + vkDestroyDevice(device, nullptr); + + if (instance == cached_instance) { + cached_instance = nullptr; + } + vkDestroyInstance(instance, nullptr); + halide_vulkan_release_context(user_context, instance, device, queue); + } + + return 0; +} + +WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { + debug(user_context) + << "halide_vulkan_device_malloc (user_context: " << user_context + << ", buf: " << buf << ")\n"; + + VulkanContext ctx(user_context); + if (ctx.error != VK_SUCCESS) { + return -1; + } + + size_t size = buf->size_in_bytes(); + halide_debug_assert(user_context, size != 0); + if (buf->device) { + return 0; + } + + for (int i = 0; i < buf->dimensions; i++) { + halide_debug_assert(user_context, buf->dim[i].stride >= 0); + } + + debug(user_context) << " allocating " << *buf << "\n"; + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + // request uncached device only memory + MemoryRequest request = {0}; + request.size = size; + request.properties.usage = MemoryUsage::TransferSrcDst; + request.properties.caching = MemoryCaching::Uncached; + request.properties.visibility = MemoryVisibility::DeviceOnly; + + // allocate a new region + MemoryRegion *device_region = ctx.allocator->reserve(user_context, request); + if ((device_region == nullptr) || (device_region->handle == nullptr)) { + error(user_context) << "Vulkan: Failed to allocate device memory!\n"; + return -1; + } + + buf->device = (uint64_t)device_region; + buf->device_interface = &vulkan_device_interface; + buf->device_interface->impl->use_module(); + + debug(user_context) + << " Allocated device buffer " << (void *)buf->device + << " for buffer " << buf << "\n"; + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return 0; +} + +WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halide_buffer) { + int err = halide_vulkan_device_malloc(user_context, halide_buffer); + if (err) { + return err; + } + + debug(user_context) + << "Vulkan: halide_vulkan_copy_to_device (user_context: " << user_context + << ", halide_buffer: " << halide_buffer << ")\n"; + + // Acquire the context so we can use the command queue. + VulkanContext ctx(user_context); + if (ctx.error != VK_SUCCESS) { + return ctx.error; + } + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + halide_abort_if_false(user_context, halide_buffer->host && halide_buffer->device); + + device_copy copy_helper = make_host_to_device_copy(halide_buffer); + + // We construct a staging buffer to copy into from host memory. Then, + // we use vkCmdCopyBuffer() to copy from the staging buffer into the + // the actual device memory. 
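+    // (The device-only allocation made by halide_vulkan_device_malloc is not assumed to be
+    //  host-visible, so the host data is first written into a host-visible, coherent staging
+    //  region and then transferred on the queue with a buffer-to-buffer copy.)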
+ MemoryRequest request = {0}; + request.size = halide_buffer->size_in_bytes(); + request.properties.usage = MemoryUsage::TransferSrc; + request.properties.caching = MemoryCaching::UncachedCoherent; + request.properties.visibility = MemoryVisibility::HostToDevice; + + // allocate a new region + MemoryRegion *staging_region = ctx.allocator->reserve(user_context, request); + if ((staging_region == nullptr) || (staging_region->handle == nullptr)) { + error(user_context) << "Vulkan: Failed to allocate device memory!\n"; + return -1; + } + + // map the region to a host ptr + uint8_t *stage_host_ptr = (uint8_t *)ctx.allocator->map(user_context, staging_region); + if (stage_host_ptr == nullptr) { + error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; + return halide_error_code_internal_error; + } + + // copy to the (host-visible/coherent) staging buffer + copy_helper.dst = (uint64_t)(stage_host_ptr); + copy_memory(copy_helper, user_context); + + // retrieve the buffer from the region + VkBuffer *staging_buffer = reinterpret_cast(staging_region->handle); + if (staging_buffer == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve staging buffer for device memory!\n"; + return halide_error_code_internal_error; + } + + // unmap the pointer + ctx.allocator->unmap(user_context, staging_region); + + // get the allocated region for the device + MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); + + // retrieve the buffer from the region + VkBuffer *device_buffer = reinterpret_cast(device_region->handle); + if (device_buffer == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + return halide_error_code_internal_error; + } + + // create a command buffer + VkCommandBuffer command_buffer; + VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); + if (result != VK_SUCCESS) { + debug(user_context) << "Vulkan: vkCreateCommandBuffer returned: " << vk_get_error_name(result) << "\n"; + return result; + } + + // begin the command buffer + VkCommandBufferBeginInfo command_buffer_begin_info = + { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags + nullptr // pointer to parent command buffer + }; + + result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + if (result != VK_SUCCESS) { + debug(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + // enqueue the copy operation, using the allocated buffers + copy_helper.src = (uint64_t)(staging_buffer); + copy_helper.dst = (uint64_t)(device_buffer); + uint64_t src_offset = copy_helper.src_begin; + uint64_t dst_offset = 0; + vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, src_offset, dst_offset, halide_buffer->dimensions); + + // end the command buffer + result = vkEndCommandBuffer(command_buffer); + if (result != VK_SUCCESS) { + debug(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + //// 13. 
Submit the command buffer to our command queue
+    VkSubmitInfo submit_info =
+        {
+            VK_STRUCTURE_TYPE_SUBMIT_INFO,  // struct type
+            nullptr,                        // pointer to struct extending this
+            0,                              // wait semaphore count
+            nullptr,                        // semaphores
+            nullptr,                        // pipeline stages where semaphore waits occur
+            1,                              // how many command buffers to execute
+            &command_buffer,                // the command buffers
+            0,                              // number of semaphores to signal
+            nullptr                         // the semaphores to signal
+        };
+
+    result = vkQueueSubmit(ctx.queue, 1, &submit_info, 0);
+    if (result != VK_SUCCESS) {
+        debug(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    //// 14. Wait until the queue is done with the command buffer
+    result = vkQueueWaitIdle(ctx.queue);
+    if (result != VK_SUCCESS) {
+        debug(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    //// 15. Reclaim the staging buffer
+    ctx.allocator->reclaim(user_context, staging_region);
+
+#ifdef DEBUG_RUNTIME
+    uint64_t t_after = halide_current_time_ns(user_context);
+    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
+#endif
+
+    return 0;
+}
+
+WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_buffer) {
+
+#ifdef DEBUG_RUNTIME
+    debug(user_context)
+        << "Vulkan: halide_vulkan_copy_to_host (user_context: " << user_context
+        << ", halide_buffer: " << halide_buffer << ")\n";
+#endif
+
+    // Acquire the context so we can use the command queue. This also serializes access
+    // so that multiple threads trying to copy the same buffer do not issue redundant
+    // transfers.
+    VulkanContext ctx(user_context);
+    if (ctx.error != VK_SUCCESS) {
+        return ctx.error;
+    }
+
+#ifdef DEBUG_RUNTIME
+    uint64_t t_before = halide_current_time_ns(user_context);
+#endif
+
+    halide_abort_if_false(user_context, halide_buffer->host && halide_buffer->device);
+
+    device_copy copy_helper = make_device_to_host_copy(halide_buffer);
+
+    // This is the inverse of copy_to_device: we create a staging buffer, copy into
+    // it, map it so the host can see it, then copy into the host buffer
+
+    MemoryRequest request = {0};
+    request.size = halide_buffer->size_in_bytes();
+    request.properties.usage = MemoryUsage::TransferDst;
+    request.properties.caching = MemoryCaching::UncachedCoherent;
+    request.properties.visibility = MemoryVisibility::DeviceToHost;
+
+    // allocate a new region for staging the transfer
+    MemoryRegion *staging_region = ctx.allocator->reserve(user_context, request);
+    if ((staging_region == nullptr) || (staging_region->handle == nullptr)) {
+        error(user_context) << "Vulkan: Failed to allocate device memory!\n";
+        return -1;
+    }
+
+    // retrieve the buffer from the region
+    VkBuffer *staging_buffer = reinterpret_cast<VkBuffer *>(staging_region->handle);
+    if (staging_buffer == nullptr) {
+        error(user_context) << "Vulkan: Failed to retrieve staging buffer for device memory!\n";
+        return halide_error_code_internal_error;
+    }
+
+    // get the allocated region for the device
+    MemoryRegion *device_region = reinterpret_cast<MemoryRegion *>(halide_buffer->device);
+
+    // retrieve the buffer from the region
+    VkBuffer *device_buffer = reinterpret_cast<VkBuffer *>(device_region->handle);
+    if (device_buffer == nullptr) {
+        error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n";
+        return halide_error_code_internal_error;
+    }
+
+    // create a command buffer
+    VkCommandBuffer command_buffer;
+    VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, 
&command_buffer); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vk_create_command_buffer returned: " << vk_get_error_name(result) << "\n"; + return -1; + } + + // begin the command buffer + VkCommandBufferBeginInfo command_buffer_begin_info = + { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags + nullptr // pointer to parent command buffer + }; + + result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + if (result != VK_SUCCESS) { + error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + // enqueue the copy operation, using the allocated buffers + uint64_t copy_dst = copy_helper.dst; + copy_helper.src = (uint64_t)(device_buffer); + copy_helper.dst = (uint64_t)(staging_buffer); + uint64_t src_offset = copy_helper.src_begin; + uint64_t dst_offset = 0; + vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, src_offset, dst_offset, halide_buffer->dimensions); + + // end the command buffer + result = vkEndCommandBuffer(command_buffer); + if (result != VK_SUCCESS) { + error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + //// 13. Submit the command buffer to our command queue + VkSubmitInfo submit_info = + { + VK_STRUCTURE_TYPE_SUBMIT_INFO, // struct type + nullptr, // pointer to struct extending this + 0, // wait semaphore count + nullptr, // semaphores + nullptr, // pipeline stages where semaphore waits occur + 1, // how many command buffers to execute + &command_buffer, // the command buffers + 0, // number of semaphores to signal + nullptr // the semaphores to signal + }; + + result = vkQueueSubmit(ctx.queue, 1, &submit_info, 0); + if (result != VK_SUCCESS) { + error(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return result; + } + + //// 14. 
Wait until the queue is done with the command buffer + result = vkQueueWaitIdle(ctx.queue); + if (result != VK_SUCCESS) { + error(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return result; + } + + // map the staging region to a host ptr + uint8_t *stage_host_ptr = (uint8_t *)ctx.allocator->map(user_context, staging_region); + if (stage_host_ptr == nullptr) { + error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; + return halide_error_code_internal_error; + } + + // copy to the (host-visible/coherent) staging buffer + copy_helper.dst = copy_dst; + copy_helper.src = (uint64_t)(stage_host_ptr); + copy_memory(copy_helper, user_context); + + // unmap the pointer and reclaim the staging region + ctx.allocator->unmap(user_context, staging_region); + ctx.allocator->reclaim(user_context, staging_region); + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return 0; +} + +WEAK int halide_vulkan_run(void *user_context, + void *state_ptr, + const char *entry_name, + int blocksX, int blocksY, int blocksZ, + int threadsX, int threadsY, int threadsZ, + int shared_mem_bytes, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: halide_vulkan_run (user_context: " << user_context << ", " + << "entry: " << entry_name << ", " + << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", " + << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", " + << "shmem: " << shared_mem_bytes << "\n"; +#endif + + VulkanContext ctx(user_context); + if (ctx.error != VK_SUCCESS) { + return ctx.error; + } + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + // Running a Vulkan pipeline requires a large number of steps + // and boilerplate: + // 1. Create a descriptor set layout + // 1a. Create the buffer for the scalar params + // 2. Create a pipeline layout + // 3. Create a compute pipeline + // --- The above can be cached between invocations --- + // 4. Create a descriptor set + // 5. Set bindings for buffers in the descriptor set + // 6. Create a command buffer from the command pool + // 7. Fill the command buffer with a dispatch call + // 7a. Bind the compute pipeline from #3 + // 7b. Bind the descriptor set + // 7c. Add a dispatch to the command buffer + // 7d. End the command buffer + // 8. Submit the command buffer to our command queue + // --- The following isn't best practice, but it's in line + // with what we do in Metal etc. --- + // 9. Wait until the queue is done with the command buffer + // 10. Cleanup all temporary objects + + uint32_t num_bindings = vk_count_bindings_for_descriptor_set(user_context, arg_sizes, args, arg_is_buffer); + + //// 1. Create a descriptor set layout + VkDescriptorSetLayout descriptor_set_layout; + VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, arg_sizes, args, arg_is_buffer, &descriptor_set_layout); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; + return result; + } + + //// 1a. 
Create a buffer for the scalar parameters
+    // First allocate memory, then map it and copy params, then create a buffer and bind the allocation
+    MemoryRegion *scalar_args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, arg_sizes, args, arg_is_buffer);
+    if (scalar_args_region == nullptr) {
+        error(user_context) << "Vulkan: vk_create_scalar_uniform_buffer() failed! Unable to proceed!\n";
+        return halide_error_code_internal_error;
+    }
+
+    VkBuffer *scalar_args_buffer = reinterpret_cast<VkBuffer *>(scalar_args_region->handle);
+    if (scalar_args_buffer == nullptr) {
+        error(user_context) << "Vulkan: Failed to retrieve scalar args buffer for device memory!\n";
+        return halide_error_code_internal_error;
+    }
+
+    //// 2. Create a pipeline layout
+    VkPipelineLayout pipeline_layout;
+    result = vk_create_pipeline_layout(user_context, ctx.allocator, &descriptor_set_layout, &pipeline_layout);
+    if (result != VK_SUCCESS) {
+        error(user_context) << "Vulkan: vk_create_pipeline_layout() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    //// 3. Create a compute pipeline
+    // Get the shader module
+    VkShaderModule *shader_module = nullptr;
+    bool found = compilation_cache.lookup(ctx.device, state_ptr, shader_module);
+    halide_abort_if_false(user_context, found);
+    if (shader_module == nullptr) {
+        error(user_context) << "Vulkan: Failed to locate shader module! Unable to proceed!\n";
+        return halide_error_code_internal_error;
+    }
+
+    // Construct the pipeline
+    VkPipeline compute_pipeline;
+    result = vk_create_compute_pipeline(user_context, ctx.allocator, entry_name, *shader_module, pipeline_layout, &compute_pipeline);
+    if (result != VK_SUCCESS) {
+        error(user_context) << "Vulkan: vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    //// 4. Create a descriptor set
+    // Construct a descriptor pool
+    VkDescriptorPool descriptor_pool;
+    uint32_t storage_buffer_count = num_bindings - 1;
+    result = vk_create_descriptor_pool(user_context, ctx.allocator, storage_buffer_count, &descriptor_pool);
+    if (result != VK_SUCCESS) {
+        error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    // Create the descriptor set
+    VkDescriptorSet descriptor_set;
+    result = vk_create_descriptor_set(user_context, ctx.allocator, descriptor_set_layout, descriptor_pool, &descriptor_set);
+    if (result != VK_SUCCESS) {
+        error(user_context) << "Vulkan: vk_create_descriptor_set() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    //// 5. Set bindings for buffers in the descriptor set
+    result = vk_update_descriptor_set(user_context, ctx.allocator, *scalar_args_buffer, storage_buffer_count, arg_sizes, args, arg_is_buffer, descriptor_set);
+    if (result != VK_SUCCESS) {
+        debug(user_context) << "Vulkan: vk_update_descriptor_set() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    //// 6. Create a command buffer from the command pool
+    VkCommandBuffer command_buffer;
+    result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer);
+    if (result != VK_SUCCESS) {
+        debug(user_context) << "Vulkan: vk_create_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    //// 7. 
Begin the command buffer + result = vk_fill_command_buffer_with_dispatch_call(user_context, + ctx.device, command_buffer, compute_pipeline, pipeline_layout, descriptor_set, + blocksX, blocksY, blocksZ); + if (result != VK_SUCCESS) { + debug(user_context) << "Vulkan: vk_fill_command_buffer_with_dispatch_call() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + return result; + } + + //// 8. Submit the command buffer to our command queue + result = vk_submit_command_buffer(user_context, ctx.queue, command_buffer); + if (result != VK_SUCCESS) { + debug(user_context) << "Vulkan: vk_submit_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + return result; + } + + //// 9. Wait until the queue is done with the command buffer + result = vkQueueWaitIdle(ctx.queue); + if (result != VK_SUCCESS) { + debug(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return result; + } + + //// 10. Cleanup + // Release all temporary objects for this run + vk_destroy_scalar_uniform_buffer(user_context, ctx.allocator, scalar_args_region); + vk_destroy_descriptor_set_layout(user_context, ctx.allocator, descriptor_set_layout); + vk_destroy_descriptor_pool(user_context, ctx.allocator, descriptor_pool); + vk_destroy_pipeline_layout(user_context, ctx.allocator, pipeline_layout); + vk_destroy_compute_pipeline(user_context, ctx.allocator, compute_pipeline); + + vkResetCommandPool(ctx.device, ctx.command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + return 0; +} + +WEAK int halide_vulkan_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) { + return halide_default_device_and_host_malloc(user_context, buf, &vulkan_device_interface); +} + +WEAK int halide_vulkan_device_and_host_free(void *user_context, struct halide_buffer_t *buf) { + return halide_default_device_and_host_free(user_context, buf, &vulkan_device_interface); +} + +WEAK int halide_vulkan_wrap_vk_buffer(void *user_context, struct halide_buffer_t *buf, uint64_t vk_buffer) { + halide_debug_assert(user_context, buf->device == 0); + if (buf->device != 0) { + return -2; + } + buf->device = vk_buffer; + buf->device_interface = &vulkan_device_interface; + buf->device_interface->impl->use_module(); + + return 0; +} + +WEAK int halide_vulkan_detach_vk_buffer(void *user_context, halide_buffer_t *buf) { + if (buf->device == 0) { + return 0; + } + halide_debug_assert(user_context, buf->device_interface == &vulkan_device_interface); + buf->device = 0; + buf->device_interface->impl->release_module(); + buf->device_interface = nullptr; + return 0; +} + +WEAK uintptr_t halide_vulkan_get_vk_buffer(void *user_context, halide_buffer_t *buf) { + if (buf->device == 0) { + return 0; + } + halide_debug_assert(user_context, buf->device_interface == &vulkan_device_interface); + return (uintptr_t)buf->device; +} + +WEAK const struct halide_device_interface_t *halide_vulkan_device_interface() { + return &vulkan_device_interface; +} + +namespace { + +__attribute__((destructor)) +WEAK void +halide_vulkan_cleanup() { + halide_vulkan_device_release(nullptr); +} + +// -------------------------------------------------------------------------- + +} // namespace + +// -------------------------------------------------------------------------- + +} // extern "C" linkage + +// 
-------------------------------------------------------------------------- + +namespace Halide { +namespace Runtime { +namespace Internal { +namespace Vulkan { + +// -------------------------------------------------------------------------- + +WEAK halide_device_interface_impl_t vulkan_device_interface_impl = { + halide_use_jit_module, + halide_release_jit_module, + halide_vulkan_device_malloc, + halide_vulkan_device_free, + halide_vulkan_device_sync, + halide_vulkan_device_release, + halide_vulkan_copy_to_host, + halide_vulkan_copy_to_device, + halide_vulkan_device_and_host_malloc, + halide_vulkan_device_and_host_free, + halide_default_buffer_copy, + halide_default_device_crop, + halide_default_device_slice, + halide_default_device_release_crop, + halide_vulkan_wrap_vk_buffer, + halide_vulkan_detach_vk_buffer, +}; + +WEAK halide_device_interface_t vulkan_device_interface = { + halide_device_malloc, + halide_device_free, + halide_device_sync, + halide_device_release, + halide_copy_to_host, + halide_copy_to_device, + halide_device_and_host_malloc, + halide_device_and_host_free, + halide_buffer_copy, + halide_device_crop, + halide_device_slice, + halide_device_release_crop, + halide_device_wrap_native, + halide_device_detach_native, + nullptr, // target capabilities. + &vulkan_device_interface_impl}; + +// -------------------------------------------------------------------------- + +} // namespace Vulkan +} // namespace Internal +} // namespace Runtime +} // namespace Halide diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h new file mode 100644 index 000000000000..84ff5c6a5dd6 --- /dev/null +++ b/src/runtime/vulkan_context.h @@ -0,0 +1,329 @@ +#ifndef HALIDE_RUNTIME_VULKAN_CONTEXT_H +#define HALIDE_RUNTIME_VULKAN_CONTEXT_H + +#include "printer.h" +#include "runtime_internal.h" +#include "scoped_spin_lock.h" + +#include "vulkan_extensions.h" +#include "vulkan_internal.h" +#include "vulkan_memory.h" + +// -------------------------------------------------------------------------- + +namespace Halide { +namespace Runtime { +namespace Internal { +namespace Vulkan { + +// -------------------------------------------------------------------------- + +// An Vulkan context/queue/synchronization lock defined in this module with weak linkage +// Vulkan Memory allocator for host-device allocations +halide_vulkan_memory_allocator *WEAK cached_allocator = nullptr; +VkInstance WEAK cached_instance = nullptr; +VkDevice WEAK cached_device = nullptr; +VkCommandPool WEAK cached_command_pool = 0; +VkQueue WEAK cached_queue = nullptr; +VkPhysicalDevice WEAK cached_physical_device = nullptr; +uint32_t WEAK cached_queue_family_index = 0; +volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0; + +// -------------------------------------------------------------------------- + +// Helper object to acquire and release the Vulkan context. 
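+// A typical runtime entry point uses it as a scoped guard (illustrative sketch only;
+// this mirrors the pattern used by the API functions in vulkan.cpp above):
+//
+//     VulkanContext ctx(user_context);
+//     if (ctx.error != VK_SUCCESS) {
+//         return ctx.error;
+//     }
+//     // ... use ctx.device, ctx.queue, ctx.allocator, ctx.command_pool ...
+//     // the destructor releases the context when it goes out of scope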
+class VulkanContext { + void *user_context; + +public: + VulkanMemoryAllocator *allocator; + VkInstance instance; + VkDevice device; + VkCommandPool command_pool; + VkPhysicalDevice physical_device; + VkQueue queue; + uint32_t queue_family_index; // used for operations requiring queue family + VkResult error; + + HALIDE_ALWAYS_INLINE VulkanContext(void *user_context) + : user_context(user_context), + allocator(nullptr), + instance(nullptr), + device(nullptr), + command_pool(0), + physical_device(nullptr), + queue(nullptr), + queue_family_index(0), + error(VK_SUCCESS) { + + int result = halide_vulkan_acquire_context(user_context, + reinterpret_cast(&allocator), + &instance, &device, &physical_device, &command_pool, &queue, &queue_family_index); + halide_abort_if_false(user_context, result == 0); + halide_abort_if_false(user_context, allocator != nullptr); + halide_abort_if_false(user_context, instance != nullptr); + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, command_pool != 0); + halide_abort_if_false(user_context, queue != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + } + + HALIDE_ALWAYS_INLINE ~VulkanContext() { + halide_vulkan_release_context(user_context, instance, device, queue); + } + + // For now, this is always nullptr + HALIDE_ALWAYS_INLINE const VkAllocationCallbacks *allocation_callbacks() { + return nullptr; + } +}; + +// -------------------------------------------------------------------------- + +namespace { + +// Initializes the instance (used by the default vk_create_context) +int vk_create_instance(void *user_context, const StringTable &requested_layers, VkInstance *instance, const VkAllocationCallbacks *alloc_callbacks) { + debug(user_context) << " vk_create_instance (user_context: " << user_context << ")\n"; + + StringTable required_instance_extensions; + vk_get_required_instance_extensions(user_context, required_instance_extensions); + + StringTable supported_instance_extensions; + vk_get_supported_instance_extensions(user_context, supported_instance_extensions); + + bool valid_instance = vk_validate_required_extension_support(user_context, required_instance_extensions, supported_instance_extensions); + halide_abort_if_false(user_context, valid_instance); + + debug(user_context) << "Vulkan: Found " << (uint32_t)required_instance_extensions.size() << " required extensions for instance!\n"; + for (int n = 0; n < (int)required_instance_extensions.size(); ++n) { + debug(user_context) << " extension: " << required_instance_extensions[n] << "\n"; + } + + VkApplicationInfo app_info = { + VK_STRUCTURE_TYPE_APPLICATION_INFO, // struct type + nullptr, // Next + "Runtime", // application name + VK_MAKE_API_VERSION(0, 1, 0, 0), // app version + "Halide", // engine name + VK_MAKE_API_VERSION(0, HALIDE_VERSION_MAJOR, HALIDE_VERSION_MINOR, HALIDE_VERSION_PATCH), // engine version + VK_API_VERSION_1_0}; + + VkInstanceCreateInfo create_info = { + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + nullptr, // Next + 0, // Flags + &app_info, // ApplicationInfo + (uint32_t)requested_layers.size(), requested_layers.data(), // Layers + (uint32_t)required_instance_extensions.size(), required_instance_extensions.data() // Extensions + }; + + VkResult result = vkCreateInstance(&create_info, alloc_callbacks, instance); + if (result != VK_SUCCESS) { + debug(user_context) << "Vulkan: vkCreateInstance failed with return code: " << vk_get_error_name(result) << "\n"; + return halide_error_code_incompatible_device_interface; + 
} + + return halide_error_code_success; +} + +int vk_select_device_for_context(void *user_context, + VkInstance *instance, VkDevice *device, + VkPhysicalDevice *physical_device, + uint32_t *queue_family_index) { + + // For now handle more than 16 devices by just looking at the first 16. + VkPhysicalDevice chosen_device = nullptr; + VkPhysicalDevice avail_devices[16]; + uint32_t device_count = sizeof(avail_devices) / sizeof(avail_devices[0]); + VkResult result = vkEnumeratePhysicalDevices(*instance, &device_count, avail_devices); + if ((result != VK_SUCCESS) && (result != VK_INCOMPLETE)) { + debug(user_context) << "Vulkan: vkEnumeratePhysicalDevices failed with return code: " << vk_get_error_name(result) << "\n"; + return halide_error_code_incompatible_device_interface; + } + + if (device_count == 0) { + debug(user_context) << "Vulkan: No devices found.\n"; + return halide_error_code_incompatible_device_interface; + } + + const char *dev_type = halide_vulkan_get_device_type(user_context); + + // Try to find a device that supports compute. + uint32_t queue_family = 0; + for (uint32_t i = 0; (chosen_device == nullptr) && (i < device_count); i++) { + VkPhysicalDeviceProperties properties; + vkGetPhysicalDeviceProperties(avail_devices[i], &properties); + debug(user_context) << "Vulkan: Checking device #" << i << "='" << properties.deviceName << "'\n"; + + int matching_device = 0; + if ((dev_type != nullptr) && (*dev_type != '\0')) { + if (strstr(dev_type, "cpu") && (properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU)) { + matching_device = 1; + } else if (strstr(dev_type, "integrated-gpu") && ((properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU))) { + matching_device = 1; + } else if (strstr(dev_type, "discrete-gpu") && ((properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU))) { + matching_device = 1; + } else if (strstr(dev_type, "virtual-gpu") && (properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU)) { + matching_device = 1; + } else if (strstr(dev_type, "gpu") && ((properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) || (properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU))) { + matching_device = 1; + } + } else { + // use a non-virtual gpu device by default + if ((properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) || + (properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU)) { + matching_device = 1; + } + } + + if (matching_device) { + VkQueueFamilyProperties queue_properties[16]; + uint32_t queue_properties_count = sizeof(queue_properties) / sizeof(queue_properties[0]); + vkGetPhysicalDeviceQueueFamilyProperties(avail_devices[i], &queue_properties_count, queue_properties); + for (uint32_t j = 0; (chosen_device == nullptr) && (j < queue_properties_count); j++) { + if (queue_properties[j].queueCount > 0 && + queue_properties[j].queueFlags & VK_QUEUE_COMPUTE_BIT) { + chosen_device = avail_devices[i]; + queue_family = j; + + debug(user_context) << "Vulkan: Found matching compute device '" << properties.deviceName << "'\n"; + } + } + } + } + // If nothing, just try the first one for now. 
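+    // (device_count is at least one here, since the empty-device case returned early above.)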
+ if (chosen_device == nullptr) { + queue_family = 0; + chosen_device = avail_devices[0]; + VkPhysicalDeviceProperties properties; + vkGetPhysicalDeviceProperties(chosen_device, &properties); + debug(user_context) << "Vulkan: Defaulting to first compute device '" << properties.deviceName << "'\n"; + } + + *queue_family_index = queue_family; + *physical_device = chosen_device; + return halide_error_code_success; +} + +int vk_create_device(void *user_context, const StringTable &requested_layers, VkInstance *instance, VkDevice *device, VkQueue *queue, + VkPhysicalDevice *physical_device, uint32_t *queue_family_index, const VkAllocationCallbacks *alloc_callbacks) { + + StringTable required_device_extensions; + vk_get_required_device_extensions(user_context, required_device_extensions); + + StringTable optional_device_extensions; + vk_get_optional_device_extensions(user_context, optional_device_extensions); + + StringTable supported_device_extensions; + vk_get_supported_device_extensions(user_context, *physical_device, supported_device_extensions); + + bool valid_device = vk_validate_required_extension_support(user_context, required_device_extensions, supported_device_extensions); + halide_abort_if_false(user_context, valid_device); + + debug(user_context) << "Vulkan: Found " << (uint32_t)required_device_extensions.size() << " required extensions for device!\n"; + for (int n = 0; n < (int)required_device_extensions.size(); ++n) { + debug(user_context) << " required extension: " << required_device_extensions[n] << "\n"; + } + + // enable all available optional extensions + debug(user_context) << "Vulkan: Found " << (uint32_t)optional_device_extensions.size() << " optional extensions for device!\n"; + for (int n = 0; n < (int)optional_device_extensions.size(); ++n) { + if (supported_device_extensions.contains(optional_device_extensions[n])) { + debug(user_context) << " optional extension: " << optional_device_extensions[n] << "\n"; + required_device_extensions.append(user_context, optional_device_extensions[n]); + } + } + + float queue_priority = 1.0f; + VkDeviceQueueCreateInfo device_queue_create_info = { + VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + nullptr, // Next + 0, // Flags + *queue_family_index, + 1, + &queue_priority, + }; + + VkDeviceCreateInfo device_create_info = { + VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + nullptr, // Next + 0, // Flags + 1, // Count of queues to create + &device_queue_create_info, + (uint32_t)requested_layers.size(), requested_layers.data(), // Layers + (uint32_t)required_device_extensions.size(), required_device_extensions.data(), // Enabled extensions + nullptr, // VkPhysicalDeviceFeatures + }; + + VkResult result = vkCreateDevice(*physical_device, &device_create_info, alloc_callbacks, device); + if (result != VK_SUCCESS) { + debug(user_context) << "Vulkan: vkCreateDevice failed with return code: " << vk_get_error_name(result) << "\n"; + return halide_error_code_incompatible_device_interface; + } + + vkGetDeviceQueue(cached_device, *queue_family_index, 0, queue); + return halide_error_code_success; +} + +// Initializes the context (used by the default implementation of halide_acquire_context) +int vk_create_context(void *user_context, VulkanMemoryAllocator **allocator, + VkInstance *instance, VkDevice *device, VkPhysicalDevice *physical_device, + VkCommandPool *command_pool, VkQueue *queue, uint32_t *queue_family_index) { + + debug(user_context) << " vk_create_context (user_context: " << user_context << ")\n"; + + StringTable requested_layers; + uint32_t 
requested_layer_count = vk_get_requested_layers(user_context, requested_layers); + debug(user_context) << "Vulkan: Requested " << requested_layer_count << " layers for instance!\n"; + for (int n = 0; n < (int)requested_layer_count; ++n) { + debug(user_context) << " layer: " << requested_layers[n] << "\n"; + } + + const VkAllocationCallbacks *alloc_callbacks = halide_vulkan_get_allocation_callbacks(user_context); + int status = vk_create_instance(user_context, requested_layers, instance, alloc_callbacks); + if (status != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create instance for context!\n"; + return halide_error_code_generic_error; + } + + if (vkCreateDevice == nullptr) { + vk_load_vulkan_functions(*instance); + } + + status = vk_select_device_for_context(user_context, instance, device, physical_device, queue_family_index); + if (status != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to select device for context!\n"; + return halide_error_code_generic_error; + } + + status = vk_create_device(user_context, requested_layers, instance, device, queue, physical_device, queue_family_index, alloc_callbacks); + if (status != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create device for context!\n"; + return halide_error_code_generic_error; + } + + *allocator = vk_create_memory_allocator(user_context, *device, *physical_device, alloc_callbacks); + if (*allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create memory allocator for device!\n"; + return halide_error_code_generic_error; + } + + VkResult result = vk_create_command_pool(user_context, *allocator, *queue_family_index, command_pool); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: Failed to create command pool for context! 
Error: " << vk_get_error_name(result) << "\n"; + return result; + } + + return halide_error_code_success; +} + +// -------------------------------------------------------------------------- + +} // namespace: (anonymous) +} // namespace: Vulkan +} // namespace: Internal +} // namespace: Runtime +} // namespace: Halide + +#endif /// HALIDE_RUNTIME_VULKAN_CONTEXT_H diff --git a/src/runtime/vulkan_extensions.h b/src/runtime/vulkan_extensions.h new file mode 100644 index 000000000000..26796b61c116 --- /dev/null +++ b/src/runtime/vulkan_extensions.h @@ -0,0 +1,296 @@ +#ifndef HALIDE_RUNTIME_VULKAN_EXTENSIONS_H +#define HALIDE_RUNTIME_VULKAN_EXTENSIONS_H + +#include "vulkan_internal.h" + +// -------------------------------------------------------------------------- + +namespace Halide { +namespace Runtime { +namespace Internal { +namespace Vulkan { + +// -------------------------------------------------------------------------- + +WEAK char layer_names[1024]; +WEAK ScopedSpinLock::AtomicFlag layer_names_lock = 0; +WEAK bool layer_names_initialized = false; + +WEAK char extension_names[1024]; +WEAK ScopedSpinLock::AtomicFlag extension_names_lock = 0; +WEAK bool extension_names_initialized = false; + +WEAK char device_type[256]; +WEAK ScopedSpinLock::AtomicFlag device_type_lock = 0; +WEAK bool device_type_initialized = false; + +WEAK char build_options[1024]; +WEAK ScopedSpinLock::AtomicFlag build_options_lock = 0; +WEAK bool build_options_initialized = false; + +// -------------------------------------------------------------------------- +namespace { + +void vk_set_layer_names_internal(const char *n) { + if (n) { + size_t buffer_size = sizeof(layer_names) / sizeof(layer_names[0]); + strncpy(layer_names, n, buffer_size); + layer_names[buffer_size - 1] = '\0'; + } else { + layer_names[0] = '\0'; + } + layer_names_initialized = true; +} + +const char *vk_get_layer_names_internal(void *user_context) { + if (!layer_names_initialized) { + const char *value = getenv("HL_VK_LAYERS"); + if (value == nullptr) { value = getenv("VK_INSTANCE_LAYERS"); } + vk_set_layer_names_internal(value); + } + return layer_names; +} + +void vk_set_extension_names_internal(const char *n) { + if (n) { + size_t buffer_size = sizeof(extension_names) / sizeof(extension_names[0]); + strncpy(extension_names, n, buffer_size); + extension_names[buffer_size - 1] = 0; + } else { + extension_names[0] = 0; + } + extension_names_initialized = true; +} + +const char *vk_get_extension_names_internal(void *user_context) { + if (!extension_names_initialized) { + const char *name = getenv("HL_VK_EXTENSIONS"); + vk_set_extension_names_internal(name); + } + return extension_names; +} + +void vk_set_device_type_internal(const char *n) { + if (n) { + size_t buffer_size = sizeof(device_type) / sizeof(device_type[0]); + strncpy(device_type, n, buffer_size); + device_type[buffer_size - 1] = 0; + } else { + device_type[0] = 0; + } + device_type_initialized = true; +} + +const char *vk_get_device_type_internal(void *user_context) { + if (!device_type_initialized) { + const char *name = getenv("HL_VK_DEVICE_TYPE"); + vk_set_device_type_internal(name); + } + return device_type; +} + +void vk_set_build_options_internal(const char *n) { + if (n) { + size_t buffer_size = sizeof(build_options) / sizeof(build_options[0]); + strncpy(build_options, n, buffer_size); + build_options[buffer_size - 1] = 0; + } else { + build_options[0] = 0; + } + build_options_initialized = true; +} + +const char *vk_get_build_options_internal(void *user_context) { + if 
(!build_options_initialized) { + const char *name = getenv("HL_VK_BUILD_OPTIONS"); + vk_set_build_options_internal(name); + } + return build_options; +} + +// -------------------------------------------------------------------------- + +uint32_t vk_get_requested_layers(void *user_context, StringTable &layer_table) { + ScopedSpinLock lock(&layer_names_lock); + const char *layer_names = vk_get_layer_names_internal(user_context); + return layer_table.parse(user_context, layer_names, HL_VK_ENV_DELIM); +} + +uint32_t vk_get_required_instance_extensions(void *user_context, StringTable &ext_table) { + const char *required_ext_table[] = {"VK_KHR_get_physical_device_properties2"}; + const uint32_t required_ext_count = sizeof(required_ext_table) / sizeof(required_ext_table[0]); + ext_table.fill(user_context, (const char **)required_ext_table, required_ext_count); + return required_ext_count; +} + +uint32_t vk_get_supported_instance_extensions(void *user_context, StringTable &ext_table) { + + PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties = (PFN_vkEnumerateInstanceExtensionProperties) + vkGetInstanceProcAddr(nullptr, "vkEnumerateInstanceExtensionProperties"); + + if (vkEnumerateInstanceExtensionProperties == nullptr) { + debug(user_context) << "Vulkan: Missing vkEnumerateInstanceExtensionProperties proc address! Invalid loader?!\n"; + return 0; + } + + uint32_t avail_ext_count = 0; + vkEnumerateInstanceExtensionProperties(nullptr, &avail_ext_count, nullptr); + debug(user_context) << "Vulkan: vkEnumerateInstanceExtensionProperties found " << avail_ext_count << " extensions ...\n"; + + if (avail_ext_count) { + BlockStorage::Config config; + config.entry_size = sizeof(VkExtensionProperties); + config.minimum_capacity = avail_ext_count; + + BlockStorage extension_properties(user_context, config); + extension_properties.resize(user_context, avail_ext_count); + + vkEnumerateInstanceExtensionProperties(nullptr, + &avail_ext_count, static_cast(extension_properties.data())); + + for (uint32_t n = 0; n < avail_ext_count; ++n) { + const VkExtensionProperties *properties = static_cast(extension_properties[n]); + debug(user_context) << " extension: " << properties->extensionName << "\n"; + } + + ext_table.resize(user_context, avail_ext_count); + for (uint32_t n = 0; n < avail_ext_count; ++n) { + const VkExtensionProperties *properties = static_cast(extension_properties[n]); + ext_table.assign(user_context, n, properties->extensionName); + } + } + + return avail_ext_count; +} + +uint32_t vk_get_required_device_extensions(void *user_context, StringTable &ext_table) { + const char *required_ext_table[] = {"VK_KHR_8bit_storage", "VK_KHR_storage_buffer_storage_class"}; + const uint32_t required_ext_count = sizeof(required_ext_table) / sizeof(required_ext_table[0]); + ext_table.fill(user_context, (const char **)required_ext_table, required_ext_count); + return required_ext_count; +} + +uint32_t vk_get_optional_device_extensions(void *user_context, StringTable &ext_table) { + const uint32_t optional_ext_count = 1; + const char *optional_ext_table[] = {"VK_KHR_portability_subset"}; + + ext_table.resize(user_context, optional_ext_count); + for (uint32_t n = 0; n < optional_ext_count; ++n) { + ext_table.assign(user_context, n, optional_ext_table[n]); + } + return optional_ext_count; +} +uint32_t vk_get_supported_device_extensions(void *user_context, VkPhysicalDevice physical_device, StringTable &ext_table) { + + if (vkEnumerateDeviceExtensionProperties == nullptr) { + debug(user_context) 
<< "Vulkan: Missing vkEnumerateDeviceExtensionProperties proc address! Invalid loader?!\n"; + return 0; + } + + uint32_t avail_ext_count = 0; + vkEnumerateDeviceExtensionProperties(physical_device, nullptr, &avail_ext_count, nullptr); + debug(user_context) << "Vulkan: vkEnumerateDeviceExtensionProperties found " << avail_ext_count << " extensions ...\n"; + + if (avail_ext_count) { + BlockStorage::Config config; + config.entry_size = sizeof(VkExtensionProperties); + config.minimum_capacity = avail_ext_count; + + BlockStorage extension_properties(user_context, config); + extension_properties.resize(user_context, avail_ext_count); + + vkEnumerateDeviceExtensionProperties(physical_device, nullptr, + &avail_ext_count, static_cast(extension_properties.data())); + + for (uint32_t n = 0; n < avail_ext_count; ++n) { + const VkExtensionProperties *properties = static_cast(extension_properties[n]); + debug(user_context) << " extension: " << properties->extensionName << "\n"; + } + + ext_table.resize(user_context, avail_ext_count); + for (uint32_t n = 0; n < avail_ext_count; ++n) { + const VkExtensionProperties *properties = static_cast(extension_properties[n]); + ext_table.assign(user_context, n, properties->extensionName); + } + } + + return avail_ext_count; +} + +bool vk_validate_required_extension_support(void *user_context, + const StringTable &required_extensions, + const StringTable &supported_extensions) { + bool validated = true; + for (uint32_t n = 0; n < required_extensions.size(); ++n) { + const char *extension = required_extensions[n]; + if (!supported_extensions.contains(extension)) { + debug(user_context) << "Vulkan: Missing required extension: '" << extension << "'! \n"; + validated = false; + } + } + return validated; +} + +// -------------------------------------------------------------------------- + +} // namespace: (anonymous) +} // namespace: Vulkan +} // namespace: Internal +} // namespace: Runtime +} // namespace: Halide + +// -------------------------------------------------------------------------- + +using namespace Halide::Runtime::Internal::Vulkan; + +// -------------------------------------------------------------------------- + +extern "C" { + +// -------------------------------------------------------------------------- + +WEAK void halide_vulkan_set_layer_names(const char *n) { + ScopedSpinLock lock(&layer_names_lock); + vk_set_layer_names_internal(n); +} + +WEAK const char *halide_vulkan_get_layer_names(void *user_context) { + ScopedSpinLock lock(&layer_names_lock); + return vk_get_layer_names_internal(user_context); +} + +WEAK void halide_vulkan_set_extension_names(const char *n) { + ScopedSpinLock lock(&extension_names_lock); + vk_set_extension_names_internal(n); +} + +WEAK const char *halide_vulkan_get_extension_names(void *user_context) { + ScopedSpinLock lock(&extension_names_lock); + return vk_get_extension_names_internal(user_context); +} + +WEAK void halide_vulkan_set_device_type(const char *n) { + ScopedSpinLock lock(&device_type_lock); + vk_set_device_type_internal(n); +} + +WEAK const char *halide_vulkan_get_device_type(void *user_context) { + ScopedSpinLock lock(&device_type_lock); + return vk_get_device_type_internal(user_context); +} + +WEAK void halide_vulkan_set_build_options(const char *n) { + ScopedSpinLock lock(&build_options_lock); + vk_set_build_options_internal(n); +} + +WEAK const char *halide_vulkan_get_build_options(void *user_context) { + ScopedSpinLock lock(&build_options_lock); + return vk_get_build_options_internal(user_context); +} + +// 
-------------------------------------------------------------------------- + +} // extern "C" + +#endif // HALIDE_RUNTIME_VULKAN_EXTENSIONS_H \ No newline at end of file diff --git a/src/runtime/vulkan_functions.h b/src/runtime/vulkan_functions.h new file mode 100644 index 000000000000..a790d7453c24 --- /dev/null +++ b/src/runtime/vulkan_functions.h @@ -0,0 +1,41 @@ +//VULKAN_FN(vkCreateInstance) +VULKAN_FN(vkDestroyInstance) +VULKAN_FN(vkCreateDevice) +VULKAN_FN(vkDestroyDevice) +VULKAN_FN(vkGetDeviceQueue) +VULKAN_FN(vkCreateBuffer) +VULKAN_FN(vkDestroyBuffer) +VULKAN_FN(vkDestroyCommandPool) +VULKAN_FN(vkDestroyDescriptorPool) +VULKAN_FN(vkDestroyDescriptorSetLayout) +VULKAN_FN(vkDestroyPipeline) +VULKAN_FN(vkDestroyPipelineLayout) +VULKAN_FN(vkDestroyShaderModule) +VULKAN_FN(vkEnumerateDeviceExtensionProperties) +VULKAN_FN(vkEnumeratePhysicalDevices) +VULKAN_FN(vkGetPhysicalDeviceProperties) +VULKAN_FN(vkGetPhysicalDeviceQueueFamilyProperties) +VULKAN_FN(vkCreateShaderModule) +VULKAN_FN(vkCreateDescriptorSetLayout) +VULKAN_FN(vkCreatePipelineLayout) +VULKAN_FN(vkCreateComputePipelines) +VULKAN_FN(vkCreateDescriptorPool) +VULKAN_FN(vkAllocateDescriptorSets) +VULKAN_FN(vkGetPhysicalDeviceMemoryProperties) +VULKAN_FN(vkAllocateMemory) +VULKAN_FN(vkBindBufferMemory) +VULKAN_FN(vkMapMemory) +VULKAN_FN(vkUnmapMemory) +VULKAN_FN(vkFreeMemory) +VULKAN_FN(vkUpdateDescriptorSets) +VULKAN_FN(vkCreateCommandPool) +VULKAN_FN(vkResetCommandPool) +VULKAN_FN(vkAllocateCommandBuffers) +VULKAN_FN(vkBeginCommandBuffer) +VULKAN_FN(vkCmdBindPipeline) +VULKAN_FN(vkCmdBindDescriptorSets) +VULKAN_FN(vkCmdDispatch) +VULKAN_FN(vkQueueSubmit) +VULKAN_FN(vkQueueWaitIdle) +VULKAN_FN(vkEndCommandBuffer) +VULKAN_FN(vkCmdCopyBuffer) \ No newline at end of file diff --git a/src/runtime/vulkan_interface.h b/src/runtime/vulkan_interface.h new file mode 100644 index 000000000000..12be41fc4711 --- /dev/null +++ b/src/runtime/vulkan_interface.h @@ -0,0 +1,67 @@ +#ifndef HALIDE_RUNTIME_VULKAN_INTERFACE_H +#define HALIDE_RUNTIME_VULKAN_INTERFACE_H + +#include "runtime_internal.h" + +// -------------------------------------------------------------------------- +// Vulkan Specific Definitions +// -------------------------------------------------------------------------- + +// Vulkan API version identifier macro +#define VK_MAKE_API_VERSION(variant, major, minor, patch) \ + ((((uint32_t)(variant)) << 29) | (((uint32_t)(major)) << 22) | (((uint32_t)(minor)) << 12) | ((uint32_t)(patch))) + +// Vulkan API version 1.0.0 +#define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0) // Patch version should always be set to 0 + +// Environment variable string delimiter +#ifdef WINDOWS +#define HL_VK_ENV_DELIM ";" +#else +#define HL_VK_ENV_DELIM ":" +#endif + +// Prototypes for the subset of the Vulkan API we need +#define VK_NO_PROTOTYPES +// NOLINTNEXTLINE +#include "mini_vulkan.h" + +// -------------------------------------------------------------------------- +// Vulkan API Definition +// -------------------------------------------------------------------------- + +namespace Halide { +namespace Runtime { +namespace Internal { +namespace Vulkan { + +// -------------------------------------------------------------------------- + +// Halide device interface struct for runtime specific funtion table +extern WEAK halide_device_interface_t vulkan_device_interface; + +// -------------------------------------------------------------------------- + +// clang-format off +#define VULKAN_FN(fn) WEAK PFN_##fn fn; +#include "vulkan_functions.h" 
+#undef VULKAN_FN +// clang-format on + +void WEAK vk_load_vulkan_functions(VkInstance instance) { +#define VULKAN_FN(fn) fn = (PFN_##fn)vkGetInstanceProcAddr(instance, #fn); +#include "vulkan_functions.h" +#undef VULKAN_FN +} + +// -- + + +// -------------------------------------------------------------------------- + +} // namespace: Vulkan +} // namespace: Internal +} // namespace: Runtime +} // namespace: Halide + +#endif // HALIDE_RUNTIME_VULKAN_INTERFACE_H diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h new file mode 100644 index 000000000000..e21d877954f2 --- /dev/null +++ b/src/runtime/vulkan_internal.h @@ -0,0 +1,260 @@ +#ifndef HALIDE_RUNTIME_VULKAN_INTERNAL_H +#define HALIDE_RUNTIME_VULKAN_INTERNAL_H + +#include "gpu_context_common.h" +#include "printer.h" +#include "runtime_internal.h" +#include "scoped_spin_lock.h" + +#include "internal/block_storage.h" +#include "internal/linked_list.h" +#include "internal/memory_arena.h" +#include "internal/string_storage.h" +#include "internal/string_table.h" + +#include "vulkan_interface.h" + +// -- + +namespace Halide { +namespace Runtime { +namespace Internal { +namespace Vulkan { + +// Declarations +class VulkanMemoryAllocator; + +// -------------------------------------------------------------------------- + +namespace { // internalize + +// -------------------------------------------------------------------------- +// Memory +// -------------------------------------------------------------------------- +void *vk_host_malloc(void *user_context, size_t size, size_t alignment, VkSystemAllocationScope scope, const VkAllocationCallbacks *callbacks = nullptr); +void vk_host_free(void *user_context, void *ptr, const VkAllocationCallbacks *callbacks = nullptr); + +VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, VkDevice device, VkPhysicalDevice physical_device, + const VkAllocationCallbacks *alloc_callbacks); + +int vk_destroy_memory_allocator(void *user_context, VulkanMemoryAllocator *allocator); + +// -------------------------------------------------------------------------- +// Context +// -------------------------------------------------------------------------- +int vk_create_context( + void *user_context, + VulkanMemoryAllocator **allocator, + VkInstance *instance, + VkDevice *device, + VkPhysicalDevice *physical_device, + VkCommandPool *command_pool, + VkQueue *queue, uint32_t *queue_family_index); + +int vk_create_instance(void *user_context, const StringTable &requested_layers, VkInstance *instance, const VkAllocationCallbacks *alloc_callbacks); + +int vk_select_device_for_context(void *user_context, + VkInstance *instance, VkDevice *device, + VkPhysicalDevice *physical_device, + uint32_t *queue_family_index); + +int vk_create_device(void *user_context, const StringTable &requested_layers, VkInstance *instance, VkDevice *device, VkQueue *queue, + VkPhysicalDevice *physical_device, uint32_t *queue_family_index, const VkAllocationCallbacks *alloc_callbacks); + +// -------------------------------------------------------------------------- +// Extensions +// -------------------------------------------------------------------------- +uint32_t vk_get_requested_layers(void *user_context, StringTable &layer_table); +uint32_t vk_get_required_instance_extensions(void *user_context, StringTable &ext_table); +uint32_t vk_get_supported_instance_extensions(void *user_context, StringTable &ext_table); +uint32_t vk_get_required_device_extensions(void *user_context, StringTable &ext_table); +uint32_t 
vk_get_optional_device_extensions(void *user_context, StringTable &ext_table); +uint32_t vk_get_supported_device_extensions(void *user_context, VkPhysicalDevice physical_device, StringTable &ext_table); +bool vk_validate_required_extension_support(void *user_context, + const StringTable &required_extensions, + const StringTable &supported_extensions); + +// -------------------------------------------------------------------------- +// Resources +// -------------------------------------------------------------------------- + +// -- Command Pool +VkResult vk_create_command_pool(void* user_context, VulkanMemoryAllocator* allocator, uint32_t queue_index, VkCommandPool *command_pool); +VkResult vk_destroy_command_pool(void* user_context, VulkanMemoryAllocator* allocator, VkCommandPool command_pool); + +// -- Command Buffer +VkResult vk_create_command_buffer(void* user_context, VulkanMemoryAllocator* allocator, VkCommandPool pool, VkCommandBuffer *command_buffer); + +VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, + VkDevice device, + VkCommandBuffer command_buffer, + VkPipeline compute_pipeline, + VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, + int blocksX, int blocksY, int blocksZ); + +VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer); + +// -- Scalar Uniform Buffer +size_t vk_estimate_scalar_uniform_buffer_size(void *user_context, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); + +MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, + VulkanMemoryAllocator *allocator, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); + +void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, + MemoryRegion *scalar_args_region); +// -- Descriptor Pool +VkResult vk_create_descriptor_pool(void *user_context, + VulkanMemoryAllocator *allocator, + uint32_t storage_buffer_count, + VkDescriptorPool *descriptor_pool); + +VkResult vk_destroy_descriptor_pool(void* user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorPool descriptor_pool); + +// -- Descriptor Set Layout +uint32_t vk_count_bindings_for_descriptor_set(void *user_context, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); + +VkResult vk_create_descriptor_set_layout(void *user_context, + VkDevice device, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[], + VkDescriptorSetLayout *layout); + +VkResult vk_destroy_descriptor_set_layout(void* user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout descriptor_set_layout); + +// -- Descriptor Set +VkResult vk_create_descriptor_set(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout descriptor_set_layout, + VkDescriptorPool descriptor_pool, + VkDescriptorSet *descriptor_set); + +VkResult vk_update_descriptor_set(void *user_context, + VulkanMemoryAllocator *allocator, + VkBuffer scalar_args_buffer, + size_t storage_buffer_count, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[], + VkDescriptorSet descriptor_set); + +// -- Pipeline Layout +VkResult vk_create_pipeline_layout(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout *descriptor_set_layout, + VkPipelineLayout *pipeline_layout); + +VkResult vk_destroy_pipeline_layout(void* user_context, + VulkanMemoryAllocator *allocator, + VkPipelineLayout pipeline_layout); +// -- Compute Pipeline +VkResult vk_create_compute_pipeline(void *user_context, + VulkanMemoryAllocator 
*allocator, + const char *pipeline_name, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + VkPipeline *compute_pipeline); + +VkResult vk_destroy_compute_pipeline(void* user_context, + VulkanMemoryAllocator *allocator, + VkPipeline compute_pipeline); + +// -- Shader Module +VkShaderModule *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, + const char *src, int size); + +int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocator); + +// -- Copy Buffer +int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buffer, + const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d); + +// -------------------------------------------------------------------------- +// Errors +// -------------------------------------------------------------------------- + +// Returns the corresponding string for a given vulkan error code +const char *vk_get_error_name(VkResult error) { + switch (error) { + case VK_SUCCESS: + return "VK_SUCCESS"; + case VK_NOT_READY: + return "VK_NOT_READY"; + case VK_TIMEOUT: + return "VK_TIMEOUT"; + case VK_EVENT_SET: + return "VK_EVENT_SET"; + case VK_EVENT_RESET: + return "VK_EVENT_RESET"; + case VK_INCOMPLETE: + return "VK_INCOMPLETE"; + case VK_ERROR_OUT_OF_HOST_MEMORY: + return "VK_ERROR_OUT_OF_HOST_MEMORY"; + case VK_ERROR_OUT_OF_DEVICE_MEMORY: + return "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + case VK_ERROR_INITIALIZATION_FAILED: + return "VK_ERROR_INITIALIZATION_FAILED"; + case VK_ERROR_DEVICE_LOST: + return "VK_ERROR_DEVICE_LOST"; + case VK_ERROR_MEMORY_MAP_FAILED: + return "VK_ERROR_MEMORY_MAP_FAILED"; + case VK_ERROR_LAYER_NOT_PRESENT: + return "VK_ERROR_LAYER_NOT_PRESENT"; + case VK_ERROR_EXTENSION_NOT_PRESENT: + return "VK_ERROR_EXTENSION_NOT_PRESENT"; + case VK_ERROR_FEATURE_NOT_PRESENT: + return "VK_ERROR_FEATURE_NOT_PRESENT"; + case VK_ERROR_INCOMPATIBLE_DRIVER: + return "VK_ERROR_INCOMPATIBLE_DRIVER"; + case VK_ERROR_TOO_MANY_OBJECTS: + return "VK_ERROR_TOO_MANY_OBJECTS"; + case VK_ERROR_FORMAT_NOT_SUPPORTED: + return "VK_ERROR_FORMAT_NOT_SUPPORTED"; + case VK_ERROR_FRAGMENTED_POOL: + return "VK_ERROR_FRAGMENTED_POOL"; + case VK_ERROR_SURFACE_LOST_KHR: + return "VK_ERROR_SURFACE_LOST_KHR"; + case VK_ERROR_NATIVE_WINDOW_IN_USE_KHR: + return "VK_ERROR_NATIVE_WINDOW_IN_USE_KHR"; + case VK_SUBOPTIMAL_KHR: + return "VK_SUBOPTIMAL_KHR"; + case VK_ERROR_OUT_OF_DATE_KHR: + return "VK_ERROR_OUT_OF_DATE_KHR"; + case VK_ERROR_INCOMPATIBLE_DISPLAY_KHR: + return "VK_ERROR_INCOMPATIBLE_DISPLAY_KHR"; + case VK_ERROR_VALIDATION_FAILED_EXT: + return "VK_ERROR_VALIDATION_FAILED_EXT"; + case VK_ERROR_INVALID_SHADER_NV: + return "VK_ERROR_INVALID_SHADER_NV"; + case VK_ERROR_OUT_OF_POOL_MEMORY_KHR: + return "VK_ERROR_OUT_OF_POOL_MEMORY_KHR"; + case VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR: + return "VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR"; + default: + return ""; + } +} + +// -------------------------------------------------------------------------- + +} // namespace: (anonymous) +} // namespace: Vulkan +} // namespace: Internal +} // namespace: Runtime +} // namespace: Halide + +#endif // HALIDE_RUNTIME_VULKAN_INTERNAL_H diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h new file mode 100644 index 000000000000..c63492e45508 --- /dev/null +++ b/src/runtime/vulkan_memory.h @@ -0,0 +1,722 @@ +#ifndef HALIDE_RUNTIME_VULKAN_MEMORY_H +#define HALIDE_RUNTIME_VULKAN_MEMORY_H + +#include "internal/block_allocator.h" +#include "vulkan_internal.h" + +namespace Halide { +namespace 
Runtime {
+namespace Internal {
+namespace Vulkan {
+
+// --------------------------------------------------------------------------
+
+// Enable external client to override Vulkan allocation callbacks (if they so desire)
+WEAK ScopedSpinLock::AtomicFlag custom_allocation_callbacks_lock = 0;
+static const VkAllocationCallbacks *custom_allocation_callbacks = nullptr; // nullptr => use Vulkan runtime implementation
+
+// --------------------------------------------------------------------------
+
+// Runtime configuration parameters to adjust the behaviour of the block allocator
+struct VulkanMemoryConfig {
+    size_t minimum_block_size = 32 * 1024 * 1024;  // 32MB
+    size_t maximum_block_size = 0;                 //< zero means no constraint
+    size_t maximum_block_count = 0;                //< zero means no constraint
+};
+WEAK VulkanMemoryConfig memory_allocator_config;
+
+// --------------------------------------------------------------------------
+
+/** Vulkan Memory Allocator class interface for managing large
+ * memory requests stored as contiguous blocks of memory, which
+ * are then sub-allocated into smaller regions of
+ * memory to avoid the excessive cost of vkAllocate and the limited
+ * number of available allocation calls through the API.
+*/
+class VulkanMemoryAllocator {
+public:
+    // disable copy constructors and assignment
+    VulkanMemoryAllocator(const VulkanMemoryAllocator &) = delete;
+    VulkanMemoryAllocator &operator=(const VulkanMemoryAllocator &) = delete;
+
+    // disable non-factory construction
+    VulkanMemoryAllocator() = delete;
+    ~VulkanMemoryAllocator() = delete;
+
+    // Factory methods for creation / destruction
+    static VulkanMemoryAllocator *create(void *user_context, const VulkanMemoryConfig &config,
+                                         VkDevice dev, VkPhysicalDevice phys_dev,
+                                         const SystemMemoryAllocatorFns &system_allocator,
+                                         const VkAllocationCallbacks *alloc_callbacks = nullptr);
+
+    static void destroy(void *user_context, VulkanMemoryAllocator *allocator);
+
+    // Public interface methods
+    MemoryRegion *reserve(void *user_context, MemoryRequest &request);
+    void reclaim(void *user_context, MemoryRegion *region);
+    bool collect(void *user_context);  //< returns true if any blocks were removed
+    void release(void *user_context);
+    void destroy(void *user_context);
+
+    void *map(void *user_context, MemoryRegion *region);
+    void unmap(void *user_context, MemoryRegion *region);
+
+    VkDevice current_device() const {
+        return this->device;
+    }
+    VkPhysicalDevice current_physical_device() const {
+        return this->physical_device;
+    }
+    const VkAllocationCallbacks *callbacks() const {
+        return this->alloc_callbacks;
+    }
+
+    static const VulkanMemoryConfig &default_config();
+
+    static void allocate_block(void *user_context, MemoryBlock *block);
+    static void deallocate_block(void *user_context, MemoryBlock *block);
+
+    static void allocate_region(void *user_context, MemoryRegion *region);
+    static void deallocate_region(void *user_context, MemoryRegion *region);
+
+    size_t bytes_allocated_for_blocks() const;
+    size_t blocks_allocated() const;
+
+    size_t bytes_allocated_for_regions() const;
+    size_t regions_allocated() const;
+
+private:
+    static const uint32_t invalid_usage_flags = uint32_t(-1);
+    static const uint32_t invalid_memory_type = uint32_t(VK_MAX_MEMORY_TYPES);
+
+    // Initializes a new instance
+    void initialize(void *user_context, const VulkanMemoryConfig &config,
+                    VkDevice dev, VkPhysicalDevice phys_dev,
+                    const SystemMemoryAllocatorFns &system_allocator,
+                    const VkAllocationCallbacks *alloc_callbacks = nullptr);
+
uint32_t select_memory_usage(void *user_context, MemoryProperties properties) const; + + uint32_t select_memory_type(void *user_context, + VkPhysicalDevice physical_device, + MemoryProperties properties, + uint32_t required_flags) const; + + + size_t block_byte_count = 0; + size_t block_count = 0; + size_t region_byte_count = 0; + size_t region_count = 0; + VulkanMemoryConfig config; + VkDevice device = nullptr; + VkPhysicalDevice physical_device = nullptr; + const VkAllocationCallbacks *alloc_callbacks = nullptr; + BlockAllocator *block_allocator = nullptr; +}; + +VulkanMemoryAllocator *VulkanMemoryAllocator::create(void *user_context, + const VulkanMemoryConfig &cfg, VkDevice dev, VkPhysicalDevice phys_dev, + const SystemMemoryAllocatorFns &system_allocator, + const VkAllocationCallbacks *alloc_callbacks) { + + halide_abort_if_false(user_context, system_allocator.allocate != nullptr); + VulkanMemoryAllocator *result = reinterpret_cast( + system_allocator.allocate(user_context, sizeof(VulkanMemoryAllocator))); + + if (result == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Failed to create instance! Out of memory!\n"; + return nullptr; + } + + result->initialize(user_context, cfg, dev, phys_dev, system_allocator, alloc_callbacks); + return result; +} + +void VulkanMemoryAllocator::destroy(void *user_context, VulkanMemoryAllocator *instance) { + halide_abort_if_false(user_context, instance != nullptr); + const BlockAllocator::MemoryAllocators &allocators = instance->block_allocator->current_allocators(); + instance->destroy(user_context); + BlockAllocator::destroy(user_context, instance->block_allocator); + halide_abort_if_false(user_context, allocators.system.deallocate != nullptr); + allocators.system.deallocate(user_context, instance); +} + +void VulkanMemoryAllocator::initialize(void *user_context, + const VulkanMemoryConfig &cfg, VkDevice dev, VkPhysicalDevice phys_dev, + const SystemMemoryAllocatorFns &system_allocator, + const VkAllocationCallbacks *callbacks) { + + config = cfg; + device = dev; + physical_device = phys_dev; + alloc_callbacks = callbacks; + BlockAllocator::MemoryAllocators allocators; + allocators.system = system_allocator; + allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block}; + allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region}; + BlockAllocator::Config block_allocator_config = {0}; + block_allocator_config.maximum_block_count = cfg.maximum_block_count; + block_allocator_config.maximum_block_size = cfg.maximum_block_size; + block_allocator_config.minimum_block_size = cfg.minimum_block_size; + block_allocator = BlockAllocator::create(user_context, block_allocator_config, allocators); + halide_abort_if_false(user_context, block_allocator != nullptr); +} + +MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { + debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" + << "user_context=" << user_context << " " + << "block_allocator=" << (void *)(block_allocator) << " " + << "request_size=" << (uint32_t)(request.size) << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << ") ...\n"; + + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + halide_abort_if_false(user_context, block_allocator != nullptr); + return block_allocator->reserve(this, request); +} + +void *VulkanMemoryAllocator::map(void 
*user_context, MemoryRegion *region) { + debug(nullptr) << "VulkanMemoryAllocator: Mapping region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << ") ...\n"; + + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + halide_abort_if_false(user_context, block_allocator != nullptr); + + RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, region); + if (region_allocator == nullptr) { + error(nullptr) << "VulkanMemoryAllocator: Unable to map region! Invalid region allocator handle!\n"; + return nullptr; + } + + BlockResource *block_resource = region_allocator->block_resource(); + if (block_resource == nullptr) { + error(nullptr) << "VulkanMemoryAllocator: Unable to map region! Invalid block resource handle!\n"; + return nullptr; + } + + VkDeviceMemory *device_memory = reinterpret_cast(block_resource->memory.handle); + if (device_memory == nullptr) { + error(nullptr) << "VulkanMemoryAllocator: Unable to map region! Invalid device memory handle!\n"; + return nullptr; + } + + uint8_t *mapped_ptr = nullptr; + VkResult result = vkMapMemory(device, *device_memory, region->offset, region->size, 0, (void **)(&mapped_ptr)); + if (result != VK_SUCCESS) { + error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n"; + return nullptr; + } + + return mapped_ptr; +} + +void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { + debug(nullptr) << "VulkanMemoryAllocator: Unmapping region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << ") ...\n"; + + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + + RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, region); + if (region_allocator == nullptr) { + error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid region allocator handle!\n"; + return; + } + + BlockResource *block_resource = region_allocator->block_resource(); + if (block_resource == nullptr) { + error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid block resource handle!\n"; + return; + } + + VkDeviceMemory *device_memory = reinterpret_cast(block_resource->memory.handle); + if (device_memory == nullptr) { + error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid device memory handle!\n"; + return; + } + + vkUnmapMemory(device, *device_memory); +} + +void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { + debug(nullptr) << "VulkanMemoryAllocator: Reclaiming region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; + + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + + return block_allocator->reclaim(this, region); +} + +bool VulkanMemoryAllocator::collect(void *user_context) { + debug(nullptr) << "VulkanMemoryAllocator: Collecting unused memory (" + << "user_context=" << user_context << ") ... 
\n"; + + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + + return block_allocator->collect(this); +} + +void VulkanMemoryAllocator::release(void *user_context) { + debug(nullptr) << "VulkanMemoryAllocator: Releasing (" + << "user_context=" << user_context << ") ... \n"; + + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + + block_allocator->release(this); +} + +void VulkanMemoryAllocator::destroy(void *user_context) { + debug(nullptr) << "VulkanMemoryAllocator: Destroying allocator (" + << "user_context=" << user_context << ") ... \n"; + + block_allocator->destroy(this); +} + +const VulkanMemoryConfig & +VulkanMemoryAllocator::default_config() { + static VulkanMemoryConfig result; + return result; +} + +// -- + +void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *block) { + debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << ") ... \n"; + + VulkanMemoryAllocator *instance = reinterpret_cast(user_context); + halide_abort_if_false(user_context, instance != nullptr); + halide_abort_if_false(user_context, instance->device != nullptr); + halide_abort_if_false(user_context, instance->physical_device != nullptr); + halide_abort_if_false(user_context, block != nullptr); + + debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" + << "size=" << (uint32_t)block->size << ", " + << "dedicated=" << (block->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(block->properties.usage) << " " + << "caching=" << halide_memory_caching_name(block->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; + + // Find an appropriate memory type given the flags + uint32_t memory_type = instance->select_memory_type(user_context, instance->physical_device, block->properties, 0); + if (memory_type == invalid_memory_type) { + debug(nullptr) << "VulkanMemoryAllocator: Unable to find appropriate memory type for device!\n"; + return; + } + + // Allocate memory + VkMemoryAllocateInfo alloc_info = { + VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, // struct type + nullptr, // struct extending this + block->size, // size of allocation in bytes + memory_type // memory type index from physical device + }; + + VkDeviceMemory *device_memory = (VkDeviceMemory *)vk_host_malloc(nullptr, sizeof(VkDeviceMemory), 0, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, instance->alloc_callbacks); + if (device_memory == nullptr) { + error(nullptr) << "VulkanBlockAllocator: Unable to allocate block! Failed to allocate device memory handle!\n"; + return; + } + + VkResult result = vkAllocateMemory(instance->device, &alloc_info, instance->alloc_callbacks, device_memory); + if (result != VK_SUCCESS) { + error(nullptr) << "VulkanMemoryAllocator: Allocation failed! vkAllocateMemory returned: " << vk_get_error_name(result) << "\n"; + return; + } + + block->handle = (void *)device_memory; + instance->block_byte_count += block->size; + instance->block_count++; +} + +void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *block) { + debug(nullptr) << "VulkanMemoryAllocator: Deallocating block (" + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << ") ... 
\n"; + + VulkanMemoryAllocator *instance = reinterpret_cast(user_context); + halide_abort_if_false(user_context, instance != nullptr); + halide_abort_if_false(user_context, instance->device != nullptr); + halide_abort_if_false(user_context, instance->physical_device != nullptr); + halide_abort_if_false(user_context, block != nullptr); + + debug(nullptr) << "VulkanBlockAllocator: deallocating block (" + << "size=" << (uint32_t)block->size << ", " + << "dedicated=" << (block->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(block->properties.usage) << " " + << "caching=" << halide_memory_caching_name(block->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; + + if (block->handle == nullptr) { + debug(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid handle!\n"; + return; + } + + VkDeviceMemory *device_memory = reinterpret_cast(block->handle); + if (device_memory == nullptr) { + debug(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid device memory handle!\n"; + return; + } + + vkFreeMemory(instance->device, *device_memory, instance->alloc_callbacks); + instance->block_byte_count -= block->size; + instance->block_count--; + + vk_host_free(nullptr, device_memory, instance->alloc_callbacks); + device_memory = nullptr; +} + +size_t VulkanMemoryAllocator::blocks_allocated() const { + return block_count; +} + +size_t VulkanMemoryAllocator::bytes_allocated_for_blocks() const { + return block_byte_count; +} + +uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, + VkPhysicalDevice physical_device, + MemoryProperties properties, + uint32_t required_flags) const { + + uint32_t want_flags = 0; //< preferred memory flags for requested access type + uint32_t need_flags = 0; //< must have in order to enable requested access + switch (properties.visibility) { + case MemoryVisibility::HostOnly: + want_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + break; + case MemoryVisibility::DeviceOnly: + need_flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case MemoryVisibility::DeviceToHost: + need_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + want_flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case MemoryVisibility::HostToDevice: + need_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + break; + case MemoryVisibility::DefaultVisibility: + case MemoryVisibility::InvalidVisibility: + default: + debug(nullptr) << "VulkanMemoryAllocator: Unable to convert type! Invalid memory visibility request!\n\t" + << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + return invalid_memory_type; + }; + + switch (properties.caching) { + case MemoryCaching::CachedCoherent: + if (need_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + want_flags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + } + break; + case MemoryCaching::UncachedCoherent: + if (need_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + want_flags |= VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + } + break; + case MemoryCaching::Cached: + if (need_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + want_flags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + break; + case MemoryCaching::Uncached: + case MemoryCaching::DefaultCaching: + break; + case MemoryCaching::InvalidCaching: + default: + debug(nullptr) << "VulkanMemoryAllocator: Unable to convert type! 
Invalid memory caching request!\n\t" + << "caching=" << halide_memory_caching_name(properties.caching) << "\n"; + return invalid_memory_type; + }; + + VkPhysicalDeviceMemoryProperties device_memory_properties; + vkGetPhysicalDeviceMemoryProperties(physical_device, &device_memory_properties); + + uint32_t result = invalid_memory_type; + for (uint32_t i = 0; i < device_memory_properties.memoryTypeCount; ++i) { + + // if required flags are given, see if the memory type matches the requirement + if (required_flags) { + if (((required_flags >> i) & 1) == 0) { + continue; + } + } + + const VkMemoryPropertyFlags properties = device_memory_properties.memoryTypes[i].propertyFlags; + if (need_flags) { + if ((properties & need_flags) != need_flags) { + continue; + } + } + + if (want_flags) { + if ((properties & want_flags) != want_flags) { + continue; + } + } + + result = i; + break; + } + + if (result == invalid_memory_type) { + debug(nullptr) << "VulkanBlockAllocator: Failed to find appropriate memory type for given properties:\n\t" + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + return invalid_memory_type; + } + + return result; +} + +// -- + +void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *region) { + debug(nullptr) << "VulkanMemoryAllocator: Allocating region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; + + VulkanMemoryAllocator *instance = reinterpret_cast(user_context); + halide_abort_if_false(user_context, instance != nullptr); + halide_abort_if_false(user_context, instance->device != nullptr); + halide_abort_if_false(user_context, instance->physical_device != nullptr); + halide_abort_if_false(user_context, region != nullptr); + + debug(nullptr) << "VulkanRegionAllocator: Allocating region (" + << "size=" << (uint32_t)region->size << ", " + << "offset=" << (uint32_t)region->offset << ", " + << "dedicated=" << (region->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(region->properties.usage) << " " + << "caching=" << halide_memory_caching_name(region->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; + + uint32_t usage_flags = instance->select_memory_usage(user_context, region->properties); + + VkBufferCreateInfo create_info = { + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type + nullptr, // struct extending this + 0, // create flags + region->size, // buffer size (in bytes) + usage_flags, // buffer usage flags + VK_SHARING_MODE_EXCLUSIVE, // sharing mode + 0, nullptr}; + + VkBuffer *buffer = (VkBuffer *)vk_host_malloc(nullptr, sizeof(VkBuffer), 0, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, instance->alloc_callbacks); + if (buffer == nullptr) { + error(nullptr) << "VulkanRegionAllocator: Unable to allocate region! 
Failed to allocate buffer handle!\n"; + return; + } + + VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); + if (result != VK_SUCCESS) { + error(nullptr) << "VulkanRegionAllocator: Failed to create buffer!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; + return; + } + + RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, region); + BlockResource *block_resource = region_allocator->block_resource(); + if (block_resource == nullptr) { + error(nullptr) << "VulkanBlockAllocator: Unable to allocate region! Invalid block resource handle!\n"; + return; + } + + VkDeviceMemory *device_memory = reinterpret_cast(block_resource->memory.handle); + if (device_memory == nullptr) { + error(nullptr) << "VulkanBlockAllocator: Unable to allocate region! Invalid device memory handle!\n"; + return; + } + + // Finally, bind buffer to the device memory + result = vkBindBufferMemory(instance->device, *buffer, *device_memory, region->offset); + if (result != VK_SUCCESS) { + error(nullptr) << "VulkanRegionAllocator: Failed to bind buffer!\n\t" + << "vkBindBufferMemory returned: " << vk_get_error_name(result) << "\n"; + return; + } + + region->handle = (void *)buffer; + instance->region_byte_count += region->size; + instance->region_count++; +} + +void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion *region) { + debug(nullptr) << "VulkanMemoryAllocator: Deallocating region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; + + VulkanMemoryAllocator *instance = reinterpret_cast(user_context); + halide_abort_if_false(user_context, instance != nullptr); + halide_abort_if_false(user_context, instance->device != nullptr); + halide_abort_if_false(user_context, instance->physical_device != nullptr); + halide_abort_if_false(user_context, region != nullptr); + debug(nullptr) << "VulkanRegionAllocator: Deallocating region (" + << "size=" << (uint32_t)region->size << ", " + << "offset=" << (uint32_t)region->offset << ", " + << "dedicated=" << (region->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(region->properties.usage) << " " + << "caching=" << halide_memory_caching_name(region->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; + + if (region->handle == nullptr) { + debug(nullptr) << "VulkanRegionAllocator: Unable to deallocate region! Invalid handle!\n"; + return; + } + + VkBuffer *buffer = reinterpret_cast(region->handle); + if (buffer == nullptr) { + error(nullptr) << "VulkanRegionAllocator: Unable to deallocate region! 
Invalid buffer handle!\n"; + return; + } + + vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); + region->handle = nullptr; + instance->region_byte_count -= region->size; + instance->region_count--; + + vk_host_free(nullptr, buffer, instance->alloc_callbacks); + buffer = nullptr; +} + +size_t VulkanMemoryAllocator::regions_allocated() const { + return region_count; +} + +size_t VulkanMemoryAllocator::bytes_allocated_for_regions() const { + return region_byte_count; +} + +uint32_t VulkanMemoryAllocator::select_memory_usage(void *user_context, MemoryProperties properties) const { + uint32_t result = 0; + switch (properties.usage) { + case MemoryUsage::UniformStorage: + result |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + break; + case MemoryUsage::DynamicStorage: + case MemoryUsage::StaticStorage: + result |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + break; + case MemoryUsage::TransferSrc: + result |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + break; + case MemoryUsage::TransferDst: + result |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + break; + case MemoryUsage::TransferSrcDst: + result |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + break; + case MemoryUsage::DefaultUsage: + case MemoryUsage::InvalidUsage: + default: + error(user_context) << "VulkanRegionAllocator: Unable to convert type! Invalid memory usage request!\n\t" + << "usage=" << halide_memory_usage_name(properties.usage) << "\n"; + return invalid_usage_flags; + }; + + if (result == invalid_usage_flags) { + error(user_context) << "VulkanRegionAllocator: Failed to find appropriate memory usage for given properties:\n\t" + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + return invalid_usage_flags; + } + + return result; +} + +// -------------------------------------------------------------------------- + +namespace { + +// -------------------------------------------------------------------------- +// Halide System allocator for host allocations +void *vk_system_malloc(void *user_context, size_t size) { + return malloc(size); +} + +void vk_system_free(void *user_context, void *ptr) { + free(ptr); +} + +// Vulkan host-side allocation +void *vk_host_malloc(void *user_context, size_t size, size_t alignment, VkSystemAllocationScope scope, const VkAllocationCallbacks *callbacks) { + if (callbacks) { + return callbacks->pfnAllocation(user_context, size, alignment, scope); + } else { + return vk_system_malloc(user_context, size); + } +} + +void vk_host_free(void *user_context, void *ptr, const VkAllocationCallbacks *callbacks) { + if (callbacks) { + return callbacks->pfnFree(user_context, ptr); + } else { + return vk_system_free(user_context, ptr); + } +} + +VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, + VkDevice device, + VkPhysicalDevice physical_device, + const VkAllocationCallbacks *alloc_callbacks) { + + SystemMemoryAllocatorFns system_allocator = {vk_system_malloc, vk_system_free}; + + return VulkanMemoryAllocator::create(user_context, + memory_allocator_config, device, physical_device, + system_allocator, alloc_callbacks); +} + +int vk_destroy_memory_allocator(void *user_context, VulkanMemoryAllocator *allocator) { + if (allocator != nullptr) { + VulkanMemoryAllocator::destroy(user_context, allocator); + 
allocator = nullptr; + } + return halide_error_code_success; +} + +// -------------------------------------------------------------------------- + +} // namespace: (anonymous) +} // namespace: Vulkan +} // namespace: Internal +} // namespace: Runtime +} // namespace: Halide + +// -------------------------------------------------------------------------- + +extern "C" { + +// -------------------------------------------------------------------------- + +WEAK void halide_vulkan_set_allocation_callbacks(const VkAllocationCallbacks *callbacks) { + using namespace Halide::Runtime::Internal::Vulkan; + ScopedSpinLock lock(&custom_allocation_callbacks_lock); + custom_allocation_callbacks = callbacks; +} + +WEAK const VkAllocationCallbacks *halide_vulkan_get_allocation_callbacks(void *user_context) { + using namespace Halide::Runtime::Internal::Vulkan; + ScopedSpinLock lock(&custom_allocation_callbacks_lock); + return custom_allocation_callbacks; +} + +// -------------------------------------------------------------------------- + +} // extern "C" + +#endif // HALIDE_RUNTIME_VULKAN_MEMORY_H \ No newline at end of file diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h new file mode 100644 index 000000000000..3051d1c4ed89 --- /dev/null +++ b/src/runtime/vulkan_resources.h @@ -0,0 +1,761 @@ +#ifndef HALIDE_RUNTIME_VULKAN_RESOURCES_H +#define HALIDE_RUNTIME_VULKAN_RESOURCES_H + +#include "vulkan_internal.h" +#include "vulkan_memory.h" + +// -------------------------------------------------------------------------- + +namespace Halide { +namespace Runtime { +namespace Internal { +namespace Vulkan { + +// Compilation cache for compiled shader modules +WEAK Halide::Internal::GPUCompilationCache compilation_cache; + +// -------------------------------------------------------------------------- + +namespace { // internalize + +// -------------------------------------------------------------------------- + +VkResult vk_create_command_pool(void* user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_create_command_pool (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "queue_index: " << queue_index << ")\n"; +#endif + + VkCommandPoolCreateInfo command_pool_info = + { + VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type + nullptr, // pointer to struct extending this + 0, // flags. 
may consider VK_COMMAND_POOL_CREATE_TRANSIENT_BIT + queue_index // queue family index corresponding to the compute command queue + }; + return vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); +} + +VkResult vk_destroy_command_pool(void* user_context, VulkanMemoryAllocator* allocator, VkCommandPool command_pool) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_destroy_command_pool (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "command_pool: " << (void *)command_pool << ")\n"; +#endif + vkDestroyCommandPool(allocator->current_device(), command_pool, allocator->callbacks()); + return VK_SUCCESS; +} + +// -- + +VkResult vk_create_command_buffer(void* user_context, VulkanMemoryAllocator* allocator, VkCommandPool command_pool, VkCommandBuffer *command_buffer) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_create_command_buffer (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "command_pool: " << (void *)command_pool << ")\n"; +#endif + VkCommandBufferAllocateInfo command_buffer_info = + { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, // struct type + nullptr, // pointer to struct extending this + command_pool, // command pool for allocation + VK_COMMAND_BUFFER_LEVEL_PRIMARY, // command buffer level + 1 // number to allocate + }; + + return vkAllocateCommandBuffers(allocator->current_device(), &command_buffer_info, command_buffer); +} + +VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, + VkDevice device, + VkCommandBuffer command_buffer, + VkPipeline compute_pipeline, + VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, + int blocksX, int blocksY, int blocksZ) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_fill_command_buffer_with_dispatch_call (user_context: " << user_context << ", " + << "device: " << (void *)device << ", " + << "command_buffer: " << (void *)command_buffer << ", " + << "pipeline_layout: " << (void *)pipeline_layout << ", " + << "descriptor_set: " << (void *)descriptor_set << ", " + << "blocks: " << blocksX << ", " << blocksY << ", " << blocksZ << ")\n"; +#endif + + VkCommandBufferBeginInfo command_buffer_begin_info = { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags + nullptr // pointer to parent command buffer + }; + + VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + if (result != VK_SUCCESS) { + error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute_pipeline); + vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_layout, + 0, 1, &descriptor_set, 0, nullptr); + vkCmdDispatch(command_buffer, blocksX, blocksY, blocksZ); // TODO: make sure this is right! 
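+    // Note: vkCmdDispatch takes *workgroup* counts, not global thread counts. The local
+    // workgroup size is fixed inside the SPIR-V module (via its LocalSize execution mode),
+    // so blocksX/blocksY/blocksZ are expected to be the number of workgroups along each
+    // axis (Halide's gpu_blocks extents), with the gpu_threads extents baked into the shader.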
+ + result = vkEndCommandBuffer(command_buffer); + if (result != VK_SUCCESS) { + error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + return VK_SUCCESS; +} + +VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_submit_command_buffer (user_context: " << user_context << ", " + << "queue: " << (void *)queue << ", " + << "command_buffer: " << (void *)command_buffer << ")\n"; +#endif + + VkSubmitInfo submit_info = + { + VK_STRUCTURE_TYPE_SUBMIT_INFO, // struct type + nullptr, // pointer to struct extending this + 0, // wait semaphore count + nullptr, // semaphores + nullptr, // pipeline stages where semaphore waits occur + 1, // how many command buffers to execute + &command_buffer, // the command buffers + 0, // number of semaphores to signal + nullptr // the semaphores to signal + }; + + VkResult result = vkQueueSubmit(queue, 1, &submit_info, 0); + if (result != VK_SUCCESS) { + debug(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return result; + } + return VK_SUCCESS; +} + +// -- + +uint32_t vk_count_bindings_for_descriptor_set(void *user_context, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { + int i = 0; + uint32_t num_bindings = 1; // first binding is for passing scalar parameters in a buffer + while (arg_sizes[i] > 0) { + if (arg_is_buffer[i]) { + num_bindings++; + } + i++; + } + return num_bindings; +} + +// -- + +VkResult vk_create_descriptor_pool(void *user_context, + VulkanMemoryAllocator *allocator, + uint32_t storage_buffer_count, + VkDescriptorPool *descriptor_pool) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_create_descriptor_pool (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; +#endif + static const uint32_t uniform_buffer_count = 1; // all scalar args are packed into one uniform buffer + + VkDescriptorPoolSize descriptor_pool_sizes[2] = { + { + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type + uniform_buffer_count // how many + }, + { + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // descriptor type + storage_buffer_count // how many + }}; + + uint32_t descriptor_set_count = (uniform_buffer_count + storage_buffer_count); + VkDescriptorPoolCreateInfo descriptor_pool_info = + { + VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, // struct type + nullptr, // point to struct extending this + 0, // flags + descriptor_set_count, // max number of sets that can be allocated TODO:should this be 1? + 2, // pool size count + descriptor_pool_sizes // ptr to descriptr pool sizes + }; + + VkResult result = vkCreateDescriptorPool(allocator->current_device(), &descriptor_pool_info, allocator->callbacks(), descriptor_pool); + if (result != VK_SUCCESS) { + debug(user_context) << "Vulkan: Failed to create descriptor pool! 
vkCreateDescriptorPool returned " << vk_get_error_name(result) << "\n"; + return result; + } + return VK_SUCCESS; +} + + +VkResult vk_destroy_descriptor_pool(void* user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorPool descriptor_pool) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_destroy_descriptor_pool (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; +#endif + vkDestroyDescriptorPool(allocator->current_device(), descriptor_pool, allocator->callbacks()); + return VK_SUCCESS; +} + +// -- + +VkResult vk_create_descriptor_set_layout(void *user_context, + VulkanMemoryAllocator *allocator, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[], + VkDescriptorSetLayout *layout) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_create_descriptor_set_layout (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "layout: " << (void *)layout << ")\n"; +#endif + + // The first binding is used for scalar parameters + uint32_t num_bindings = vk_count_bindings_for_descriptor_set(user_context, arg_sizes, args, arg_is_buffer); + + BlockStorage::Config layout_config; + layout_config.entry_size = sizeof(VkDescriptorSetLayoutBinding); + layout_config.minimum_capacity = num_bindings; + + BlockStorage layout_bindings(user_context, layout_config); + + // First binding is reserved for passing scalar parameters as a uniform buffer + VkDescriptorSetLayoutBinding scalar_uniform_layout = { + 0, // binding index + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type + 1, // descriptor count + VK_SHADER_STAGE_COMPUTE_BIT, // stage flags + nullptr // immutable samplers + }; + layout_bindings.append(user_context, &scalar_uniform_layout); + + // Add all other bindings for buffer data + int i = 0; + while (arg_sizes[i] > 0) { + if (arg_is_buffer[i]) { + // TODO: I don't quite understand why STORAGE_BUFFER is valid + // here, but examples all across the docs seem to do this + VkDescriptorSetLayoutBinding storage_buffer_layout = { + (uint32_t)layout_bindings.size(), // binding index + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // descriptor type + 1, // descriptor count + VK_SHADER_STAGE_COMPUTE_BIT, // stage flags + nullptr // immutable samplers + }; + layout_bindings.append(user_context, &storage_buffer_layout); + } + i++; + } + // Create the LayoutInfo struct + VkDescriptorSetLayoutCreateInfo layout_info = { + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, // structure type + nullptr, // pointer to a struct extending this info + 0, // flags + (uint32_t)layout_bindings.size(), // binding count + (VkDescriptorSetLayoutBinding *)layout_bindings.data() // pointer to layout bindings array + }; + + // Create the descriptor set layout + VkResult result = vkCreateDescriptorSetLayout(allocator->current_device(), &layout_info, allocator->callbacks(), layout); + if (result != VK_SUCCESS) { + error(user_context) << "vkCreateDescriptorSetLayout returned " << vk_get_error_name(result) << "\n"; + return result; + } + + return VK_SUCCESS; +} + +VkResult vk_destroy_descriptor_set_layout(void* user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout descriptor_set_layout) { + + vkDestroyDescriptorSetLayout(allocator->current_device(), descriptor_set_layout, allocator->callbacks()); + return VK_SUCCESS; +} + 
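+// For reference: the binding convention assumed by the descriptor set helpers in this file
+// is that binding 0 holds all scalar arguments packed into a single uniform buffer, and each
+// halide_buffer_t argument occupies the next storage buffer binding in argument order. A
+// compute shader written by hand against that convention would declare roughly the following
+// (illustrative GLSL only; the block and member names here are hypothetical, and the real
+// interface is whatever CodeGen_Vulkan_Dev emits):
+//
+//     layout(set = 0, binding = 0) uniform ScalarArgs { int width; float scale; };
+//     layout(set = 0, binding = 1) buffer InputBuf  { float in_data[]; };
+//     layout(set = 0, binding = 2) buffer OutputBuf { float out_data[]; };
+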
+// -- + +VkResult vk_create_descriptor_set(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout descriptor_set_layout, + VkDescriptorPool descriptor_pool, + VkDescriptorSet *descriptor_set) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_create_descriptor_set (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "descriptor_set_layout: " << (void *)descriptor_set_layout << ", " + << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; +#endif + + VkDescriptorSetAllocateInfo descriptor_set_info = + { + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, // struct type + nullptr, // pointer to struct extending this + descriptor_pool, // pool from which to allocate sets + 1, // number of descriptor sets + &descriptor_set_layout // pointer to array of descriptor set layouts + }; + + VkResult result = vkAllocateDescriptorSets(allocator->current_device(), &descriptor_set_info, descriptor_set); + if (result != VK_SUCCESS) { + debug(user_context) << "vkAllocateDescriptorSets returned " << vk_get_error_name(result) << "\n"; + return result; + } + + return VK_SUCCESS; +} + +VkResult vk_update_descriptor_set(void *user_context, + VulkanMemoryAllocator *allocator, + VkBuffer scalar_args_buffer, + size_t storage_buffer_count, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[], + VkDescriptorSet descriptor_set) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_update_descriptor_set (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ", " + << "descriptor_set: " << (void *)descriptor_set << ")\n"; +#endif + + static const int uniform_buffer_count = 1; // scalar args are always packed into one uniform buffer + + BlockStorage::Config dbi_config; + dbi_config.minimum_capacity = storage_buffer_count + uniform_buffer_count; + dbi_config.entry_size = sizeof(VkDescriptorBufferInfo); + BlockStorage descriptor_buffer_info(user_context, dbi_config); + + BlockStorage::Config wds_config; + wds_config.minimum_capacity = storage_buffer_count + uniform_buffer_count; + wds_config.entry_size = sizeof(VkWriteDescriptorSet); + BlockStorage write_descriptor_set(user_context, wds_config); + + // First binding will be the scalar params buffer + VkDescriptorBufferInfo scalar_args_descriptor_buffer_info = { + scalar_args_buffer, // the buffer + 0, // offset + VK_WHOLE_SIZE // range + }; + descriptor_buffer_info.append(user_context, &scalar_args_descriptor_buffer_info); + VkDescriptorBufferInfo *scalar_args_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); + + VkWriteDescriptorSet scalar_args_write_descriptor_set = { + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type + nullptr, // pointer to struct extending this + descriptor_set, // descriptor set to update + 0, // binding slot + 0, // array elem + 1, // num to update + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type + nullptr, // for images + scalar_args_entry, // info for buffer + nullptr // for texel buffers + }; + write_descriptor_set.append(user_context, &scalar_args_write_descriptor_set); + + // Add all the other device buffers + for (size_t i = 0; arg_sizes[i] > 0; i++) { + if (arg_is_buffer[i]) { + + // get the allocated region for the buffer + MemoryRegion *device_region = reinterpret_cast(((halide_buffer_t 
*)args[i])->device); + + // retrieve the buffer from the region + VkBuffer *device_buffer = reinterpret_cast(device_region->handle); + if (device_buffer == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + return VK_ERROR_INITIALIZATION_FAILED; + } + + VkDescriptorBufferInfo device_buffer_info = { + *device_buffer, // the buffer + 0, // offset + VK_WHOLE_SIZE // range + }; + descriptor_buffer_info.append(user_context, &device_buffer_info); + VkDescriptorBufferInfo *device_buffer_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); + + VkWriteDescriptorSet scalar_args_write_descriptor_set = { + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type + nullptr, // pointer to struct extending this + descriptor_set, // descriptor set to update + (uint32_t)write_descriptor_set.size(), // binding slot + 0, // array elem + 1, // num to update + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // descriptor type + nullptr, // for images + device_buffer_entry, // info for buffer + nullptr // for texel buffers + }; + write_descriptor_set.append(user_context, &scalar_args_write_descriptor_set); + } + } + + // issue the update call to populate the descriptor set + vkUpdateDescriptorSets(allocator->current_device(), (uint32_t)write_descriptor_set.size(), (const VkWriteDescriptorSet *)write_descriptor_set.data(), 0, nullptr); + return VK_SUCCESS; +} + +// -- + +size_t vk_estimate_scalar_uniform_buffer_size(void *user_context, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { + int i = 0; + int scalar_uniform_buffer_size = 0; + while (arg_sizes[i] > 0) { + if (!arg_is_buffer[i]) { + scalar_uniform_buffer_size += arg_sizes[i]; + } + i++; + } + return scalar_uniform_buffer_size; +} + +MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, + VulkanMemoryAllocator *allocator, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { + + + size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, + arg_sizes, args, arg_is_buffer); + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_create_scalar_uniform_buffer (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "scalar_buffer_size: " << (uint32_t)scalar_buffer_size << ")\n"; +#endif + + MemoryRequest request = {0}; + request.size = scalar_buffer_size; + request.properties.usage = MemoryUsage::UniformStorage; + request.properties.caching = MemoryCaching::UncachedCoherent; + request.properties.visibility = MemoryVisibility::HostToDevice; + + // allocate a new region + MemoryRegion *region = allocator->reserve(user_context, request); + if ((region == nullptr) || (region->handle == nullptr)) { + error(user_context) << "Vulkan: Failed to allocate device memory!\n"; + return nullptr; + } + + // map the region to a host ptr + uint8_t *scalar_buffer_host_ptr = (uint8_t *)allocator->map(user_context, region); + if (scalar_buffer_host_ptr == nullptr) { + error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; + return nullptr; + } + + // copy to the (host-visible/coherent) scalar uniform buffer + size_t scalar_arg_offset = 0; + for (size_t i = 0; arg_sizes[i] > 0; i++) { + if (!arg_is_buffer[i]) { + memcpy(scalar_buffer_host_ptr + scalar_arg_offset, args[i], arg_sizes[i]); + scalar_arg_offset += arg_sizes[i]; + } + } + + // unmap the pointer to the buffer for the region + allocator->unmap(user_context, region); + + // return the 
allocated region for the uniform buffer + return region; +} + +void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, + MemoryRegion *scalar_args_region) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_destroy_scalar_uniform_buffer (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "scalar_args_region: " << (void *)scalar_args_region << ")\n"; +#endif + + if (!scalar_args_region) { return; } + allocator->reclaim(user_context, scalar_args_region); +} + +// -- + +VkResult vk_create_pipeline_layout(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout *descriptor_set_layout, + VkPipelineLayout *pipeline_layout) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_create_pipeline_layout (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "descriptor_set_layout: " << (void *)descriptor_set_layout << ", " + << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; +#endif + + VkPipelineLayoutCreateInfo pipeline_layout_info = { + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // structure type + nullptr, // pointer to a structure extending this + 0, // flags + 1, // number of descriptor sets + descriptor_set_layout, // pointer to the descriptor sets + 0, // number of push constant ranges + nullptr // pointer to push constant range structs + }; + + VkResult result = vkCreatePipelineLayout(allocator->current_device(), &pipeline_layout_info, allocator->callbacks(), pipeline_layout); + if (result != VK_SUCCESS) { + debug(user_context) << "vkCreatePipelineLayout returned " << vk_get_error_name(result) << "\n"; + return result; + } + return VK_SUCCESS; +} + +VkResult vk_destroy_pipeline_layout(void* user_context, + VulkanMemoryAllocator *allocator, + VkPipelineLayout pipeline_layout) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_destroy_pipeline_layout (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; +#endif + + vkDestroyPipelineLayout(allocator->current_device(), pipeline_layout, allocator->callbacks()); + return VK_SUCCESS; +} + +// -- + +VkResult vk_create_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + const char *pipeline_name, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + VkPipeline *compute_pipeline) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_create_compute_pipeline (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "shader_module: " << (void *)shader_module << ", " + << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; +#endif + + VkComputePipelineCreateInfo compute_pipeline_info = + { + VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // structure type + nullptr, // pointer to a structure extending this + 0, // flags + // VkPipelineShaderStageCreatInfo + { + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // structure type + nullptr, //pointer to a structure extending this + 0, // flags + VK_SHADER_STAGE_COMPUTE_BIT, // compute stage shader + shader_module, // shader module + pipeline_name, // entry point name + nullptr // 
pointer to VkSpecializationInfo struct
+        },
+        pipeline_layout,  // pipeline layout
+        0,                // base pipeline handle for derived pipeline
+        0                 // base pipeline index for derived pipeline
+    };
+
+    VkResult result = vkCreateComputePipelines(allocator->current_device(), 0, 1, &compute_pipeline_info, allocator->callbacks(), compute_pipeline);
+    if (result != VK_SUCCESS) {
+        error(user_context) << "Vulkan: Failed to create compute pipeline! vkCreateComputePipelines returned " << vk_get_error_name(result) << "\n";
+        return result;
+    }
+
+    return VK_SUCCESS;
+}
+
+VkResult vk_destroy_compute_pipeline(void* user_context,
+                                     VulkanMemoryAllocator *allocator,
+                                     VkPipeline compute_pipeline) {
+#ifdef DEBUG_RUNTIME
+    debug(user_context)
+        << "Vulkan: vk_destroy_compute_pipeline (user_context: " << user_context << ", "
+        << "allocator: " << (void *)allocator << ", "
+        << "device: " << (void *)allocator->current_device() << ", "
+        << "compute_pipeline: " << (void *)compute_pipeline << ")\n";
+#endif
+    vkDestroyPipeline(allocator->current_device(), compute_pipeline, allocator->callbacks());
+    return VK_SUCCESS;
+}
+
+// --------------------------------------------------------------------------
+
+VkShaderModule *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator,
+                                         const char *src, int size) {
+#ifdef DEBUG_RUNTIME
+    debug(user_context)
+        << "Vulkan: vk_compile_shader_module (user_context: " << user_context << ", "
+        << "allocator: " << (void *)allocator << ", "
+        << "device: " << (void *)allocator->current_device() << ", "
+        << "source: " << (void *)src << ", "
+        << "size: " << size << ")\n";
+
+    uint64_t t_before = halide_current_time_ns(user_context);
+#endif
+
+    VkShaderModuleCreateInfo shader_info = {
+        VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        nullptr,               // pointer to structure extending this
+        0,                     // flags (currently unused)
+        (size_t)size,          // code size in bytes
+        (const uint32_t *)src  // source
+    };
+
+    VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT;
+    VkShaderModule *shader_module = (VkShaderModule *)vk_host_malloc(user_context, sizeof(VkShaderModule), 0, alloc_scope, allocator->callbacks());
+    if (shader_module == nullptr) {
+        error(user_context) << "Vulkan: Failed to allocate shader module! Out of memory!\n";
+        return nullptr;
+    }
+
+    VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), shader_module);
+    if ((result != VK_SUCCESS) || (shader_module == nullptr)) {
+        error(user_context) << "Vulkan: vkCreateShaderModule Failed! 
Error returned: " << vk_get_error_name(result) << "\n"; + vk_host_free(user_context, shader_module, allocator->callbacks()); + return nullptr; + } + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return shader_module; +} + +int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocator) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_destroy_shader_modules (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ")\n"; + + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + // Functor to match compilation cache destruction call with scoped params + struct DestroyShaderModule { + void *user_context = nullptr; + VkDevice device = nullptr; + const VkAllocationCallbacks *allocation_callbacks = nullptr; + + DestroyShaderModule(void *ctx, VkDevice dev, const VkAllocationCallbacks *callbacks) + : user_context(ctx), device(dev), allocation_callbacks(callbacks) { + } + + void operator()(VkShaderModule *shader_module) { + if (shader_module != nullptr) { + vkDestroyShaderModule(device, *shader_module, allocation_callbacks); + vk_host_free(user_context, shader_module, allocation_callbacks); + } + } + }; + + DestroyShaderModule module_destructor(user_context, allocator->current_device(), allocator->callbacks()); + compilation_cache.delete_context(user_context, allocator->current_device(), module_destructor); + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + return VK_SUCCESS; +} + +// -------------------------------------------------------------------------- + +int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buffer, + const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d) { + if (d == 0) { + + VkBufferCopy buffer_copy = { + c.src_begin + src_offset, // srcOffset + dst_offset, // dstOffset + c.chunk_size // size + }; + + VkBuffer *src_buffer = reinterpret_cast(c.src); + VkBuffer *dst_buffer = reinterpret_cast(c.dst); + if (!src_buffer || !dst_buffer) { + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + return -1; + } + + vkCmdCopyBuffer(command_buffer, *src_buffer, *dst_buffer, 1, &buffer_copy); + + } else { + // TODO: deal with negative strides. Currently the code in + // device_buffer_utils.h does not do so either. 
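+        // Recurse one dimension at a time: each iteration copies one slice of
+        // dimension (d - 1), advancing the source and destination offsets by the
+        // corresponding byte strides. The d == 0 case above issues the actual
+        // vkCmdCopyBuffer for a single contiguous chunk.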
+ uint64_t src_off = 0, dst_off = 0; + for (uint64_t i = 0; i < c.extent[d - 1]; i++) { + int err = vk_do_multidimensional_copy(user_context, command_buffer, c, src_offset + src_off, dst_offset + dst_off, d - 1); + dst_off += c.dst_stride_bytes[d - 1]; + src_off += c.src_stride_bytes[d - 1]; + if (err) { + return err; + } + } + } + return 0; +} + +// -------------------------------------------------------------------------- + +} // namespace: (anonymous) +} // namespace: Vulkan +} // namespace: Internal +} // namespace: Runtime +} // namespace: Halide + +#endif // HALIDE_RUNTIME_VULKAN_RESOURCES_H From 7a0cf7c2ebb7b748a1b67a7a9d5d8fd7e5531ecd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 3 Aug 2022 15:14:32 -0700 Subject: [PATCH 002/166] Fix build to work with latest changes in main --- src/CMakeLists.txt | 2 +- src/CodeGen_Vulkan_Dev.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1281038f48f2..3804b335f1b7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -40,7 +40,7 @@ set(HEADER_FILES CodeGen_PTX_Dev.h CodeGen_PyTorch.h CodeGen_Targets.h - CodeGen_Vulkan.h + CodeGen_Vulkan_Dev.h CompilerLogger.h ConciseCasts.h CPlusPlusMangle.h diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 2b568ae7aa93..1a0d74596301 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -11,7 +11,7 @@ #include "Scope.h" #include "Target.h" -#include "spirv/spirv.h" +#include // Temporary: #include @@ -560,7 +560,6 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { uint32_t arg_id = id; id = next_id++; add_instruction(SpvOpNot, { type_id, id, arg_id }); - } else if (op->is_intrinsic(Call::reinterpret)) { } else if (op->is_intrinsic(Call::if_then_else)) { if (op->type.is_vector()) { scalarize(op); From ae6050777173e1acdb1f04b4fcf78b5d898a2ab2 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 3 Aug 2022 15:53:29 -0700 Subject: [PATCH 003/166] Hookup Vulkan into Target, DeviceInterface and OffloadGPULoops --- src/DeviceAPI.h | 4 +++- src/DeviceInterface.cpp | 7 +++++++ src/IRPrinter.cpp | 3 +++ src/OffloadGPULoops.cpp | 4 ++++ src/Target.cpp | 16 +++++++++++++++- src/Target.h | 1 + src/runtime/HalideRuntime.h | 1 + src/runtime/runtime_api.cpp | 6 ++++++ 8 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/DeviceAPI.h b/src/DeviceAPI.h index e75711592558..672fb05e0157 100644 --- a/src/DeviceAPI.h +++ b/src/DeviceAPI.h @@ -23,6 +23,7 @@ enum class DeviceAPI { Hexagon, HexagonDma, D3D12Compute, + Vulkan, }; /** An array containing all the device apis. 
Useful for iterating @@ -36,7 +37,8 @@ const DeviceAPI all_device_apis[] = {DeviceAPI::None, DeviceAPI::Metal, DeviceAPI::Hexagon, DeviceAPI::HexagonDma, - DeviceAPI::D3D12Compute}; + DeviceAPI::D3D12Compute, + DeviceAPI::Vulkan}; } // namespace Halide diff --git a/src/DeviceInterface.cpp b/src/DeviceInterface.cpp index b4b208c26e3f..b21580c18f3a 100644 --- a/src/DeviceInterface.cpp +++ b/src/DeviceInterface.cpp @@ -102,6 +102,8 @@ const halide_device_interface_t *get_device_interface_for_device_api(DeviceAPI d name = "hexagon_dma"; } else if (d == DeviceAPI::D3D12Compute) { name = "d3d12compute"; + } else if (d == DeviceAPI::Vulkan) { + name = "vulkan"; } else { if (error_site) { user_error @@ -150,6 +152,8 @@ DeviceAPI get_default_device_api_for_target(const Target &target) { return DeviceAPI::OpenCL; } else if (target.has_feature(Target::CUDA)) { return DeviceAPI::CUDA; + } else if (target.has_feature(Target::Vulkan)) { + return DeviceAPI::Vulkan; } else if (target.has_feature(Target::OpenGLCompute)) { return DeviceAPI::OpenGLCompute; } else if (target.arch != Target::Hexagon && target.has_feature(Target::HVX)) { @@ -196,6 +200,9 @@ Expr make_device_interface_call(DeviceAPI device_api, MemoryType memory_type) { case DeviceAPI::D3D12Compute: interface_name = "halide_d3d12compute_device_interface"; break; + case DeviceAPI::Vulkan: + interface_name = "halide_vulkan_device_interface"; + break; case DeviceAPI::Default_GPU: // Will be resolved later interface_name = "halide_default_device_interface"; diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 38f57e46649e..6b6dbaef7fcf 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -117,6 +117,9 @@ ostream &operator<<(ostream &out, const DeviceAPI &api) { case DeviceAPI::D3D12Compute: out << ""; break; + case DeviceAPI::Vulkan: + out << ""; + break; } return out; } diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 09c457d8ddee..4642c6cea802 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -7,6 +7,7 @@ #include "CodeGen_OpenCL_Dev.h" #include "CodeGen_OpenGLCompute_Dev.h" #include "CodeGen_PTX_Dev.h" +#include "CodeGen_Vulkan_Dev.h" #include "ExprUsesVar.h" #include "IRMutator.h" #include "IROperator.h" @@ -268,6 +269,9 @@ class InjectGpuOffload : public IRMutator { if (target.has_feature(Target::D3D12Compute)) { cgdev[DeviceAPI::D3D12Compute] = new_CodeGen_D3D12Compute_Dev(target); } + if (target.has_feature(Target::Vulkan)) { + cgdev[DeviceAPI::Vulkan] = new_CodeGen_Vulkan_Dev(target); + } internal_assert(!cgdev.empty()) << "Requested unknown GPU target: " << target.to_string() << "\n"; } diff --git a/src/Target.cpp b/src/Target.cpp index dd0e0d5ab416..d0d40c4de3a0 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -529,6 +529,7 @@ const std::map feature_name_map = { {"sanitizer_coverage", Target::SanitizerCoverage}, {"profile_by_timer", Target::ProfileByTimer}, {"spirv", Target::SPIRV}, + {"vulkan", Target::Vulkan}, // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well. 
}; @@ -866,6 +867,9 @@ bool Target::supported() const { #endif #if !defined(WITH_D3D12) bad |= has_feature(Target::D3D12Compute); +#endif +#if !defined(WITH_VULKAN) + bad |= has_feature(Target::Vulkan); #endif return !bad; } @@ -931,6 +935,7 @@ bool Target::has_gpu_feature() const { has_feature(OpenCL) || has_feature(Metal) || has_feature(D3D12Compute) || + has_feature(Vulkan) || has_feature(OpenGLCompute)); } @@ -1007,6 +1012,9 @@ bool Target::supports_type(const Type &t, DeviceAPI device) const { // Shader Model 5.x can optionally support double-precision; 64-bit int // types are not supported. return t.bits() < 64; + } else if (device == DeviceAPI::Vulkan) { + // TODO(shoaibkamil): Is this correct? + return t.bits() < 64; } else if (device == DeviceAPI::OpenGLCompute) { return t.bits() < 64; } @@ -1053,6 +1061,9 @@ DeviceAPI Target::get_required_device_api() const { if (has_feature(Target::OpenGLCompute)) { return DeviceAPI::OpenGLCompute; } + if (has_feature(Target::Vulkan)) { + return DeviceAPI::Vulkan; + } return DeviceAPI::None; } @@ -1070,6 +1081,8 @@ Target::Feature target_feature_for_device_api(DeviceAPI api) { return Target::HVX; case DeviceAPI::D3D12Compute: return Target::D3D12Compute; + case DeviceAPI::Vulkan: + return Target::Vulkan; default: return Target::FeatureEnd; } @@ -1152,7 +1165,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) // (c) must match across both targets; it is an error if one target has the feature and the other doesn't // clang-format off - const std::array union_features = {{ + const std::array union_features = {{ // These are true union features. CUDA, D3D12Compute, @@ -1160,6 +1173,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) NoNEON, OpenCL, OpenGLCompute, + Vulkan, // These features are actually intersection-y, but because targets only record the _highest_, // we have to put their union in the result and then take a lower bound. diff --git a/src/Target.h b/src/Target.h index 8678bfefbb90..080e24c641cb 100644 --- a/src/Target.h +++ b/src/Target.h @@ -162,6 +162,7 @@ struct Target { SanitizerCoverage = halide_target_feature_sanitizer_coverage, ProfileByTimer = halide_target_feature_profile_by_timer, SPIRV = halide_target_feature_spirv, + Vulkan = halide_target_feature_vulkan, FeatureEnd = halide_target_feature_end }; Target() = default; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 62fc35640eb2..baf3bb64b892 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1364,6 +1364,7 @@ typedef enum halide_target_feature_t { halide_target_feature_sanitizer_coverage, ///< Enable hooks for SanitizerCoverage support. halide_target_feature_profile_by_timer, ///< Alternative to halide_target_feature_profile using timer interrupt for systems without threads or applicartions that need to avoid them. halide_target_feature_spirv, ///< Enable SPIR-V code generation support. + halide_target_feature_vulkan, ///< Enable Vulkan runtime support. halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. 
} halide_target_feature_t; diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index c06141dcf267..750aac20eac3 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -9,6 +9,7 @@ #include "HalideRuntimeOpenCL.h" #include "HalideRuntimeOpenGLCompute.h" #include "HalideRuntimeQurt.h" +#include "HalideRuntimeVulkan.h" #include "cpu_features.h" // This runtime module will contain extern declarations of the Halide @@ -211,5 +212,10 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_d3d12compute_finalize_kernels, (void *)&halide_d3d12compute_release_context, (void *)&halide_d3d12compute_run, + (void *)&halide_vulkan_acquire_context, + (void *)&halide_vulkan_device_interface, + (void *)&halide_vulkan_initialize_kernels, + (void *)&halide_vulkan_release_context, + (void *)&halide_vulkan_run, (void *)&halide_unused_force_include_types, }; From 745525c3e1ebd6e678f645e7fc65f9ca52855c50 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 11:09:01 -0700 Subject: [PATCH 004/166] Add Vulkan runtime to Makefile --- Makefile | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Makefile b/Makefile index 30b7fb9d7916..4eec9ab282e7 100644 --- a/Makefile +++ b/Makefile @@ -125,6 +125,7 @@ WITH_OPENCL ?= not-empty WITH_METAL ?= not-empty WITH_OPENGLCOMPUTE ?= not-empty WITH_D3D12 ?= not-empty +WITH_VULKAN ?= not-empty WITH_INTROSPECTION ?= not-empty WITH_EXCEPTIONS ?= WITH_LLVM_INSIDE_SHARED_LIBHALIDE ?= not-empty @@ -177,6 +178,9 @@ EXCEPTIONS_CXX_FLAGS=$(if $(WITH_EXCEPTIONS), -DHALIDE_WITH_EXCEPTIONS -fexcepti HEXAGON_CXX_FLAGS=$(if $(WITH_HEXAGON), -DWITH_HEXAGON, ) HEXAGON_LLVM_CONFIG_LIB=$(if $(WITH_HEXAGON), hexagon, ) +VULKAN_CXX_FLAGS=$(if $(WITH_VULKAN), -DWITH_VULKAN, ) +VULKAN_LLVM_CONFIG_LIB=$(if $(WITH_VULKAN), , ) + WEBASSEMBLY_CXX_FLAGS=$(if $(WITH_WEBASSEMBLY), -DWITH_WEBASSEMBLY, ) WEBASSEMBLY_LLVM_CONFIG_LIB=$(if $(WITH_WEBASSEMBLY), webassembly, ) @@ -216,6 +220,7 @@ CXX_FLAGS += $(INTROSPECTION_CXX_FLAGS) CXX_FLAGS += $(EXCEPTIONS_CXX_FLAGS) CXX_FLAGS += $(AMDGPU_CXX_FLAGS) CXX_FLAGS += $(RISCV_CXX_FLAGS) +CXX_FLAGS += $(VULKAN_CXX_FLAGS) CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) # This is required on some hosts like powerpc64le-linux-gnu because we may build @@ -243,6 +248,7 @@ LLVM_STATIC_LIBFILES = \ $(POWERPC_LLVM_CONFIG_LIB) \ $(HEXAGON_LLVM_CONFIG_LIB) \ $(AMDGPU_LLVM_CONFIG_LIB) \ + $(VULKAN_LLVM_CONFIG_LIB) \ $(WEBASSEMBLY_LLVM_CONFIG_LIB) \ $(RISCV_LLVM_CONFIG_LIB) @@ -307,6 +313,12 @@ TEST_METAL = 1 endif endif +ifneq ($(WITH_VULKAN), ) +ifneq (,$(findstring vulkan,$(HL_TARGET))) +TEST_VULKAN = 1 +endif +endif + ifeq ($(UNAME), Linux) ifneq ($(TEST_CUDA), ) CUDA_LD_FLAGS ?= -L/usr/lib/nvidia-current -lcuda @@ -314,6 +326,9 @@ endif ifneq ($(TEST_OPENCL), ) OPENCL_LD_FLAGS ?= -lOpenCL endif +ifneq ($(TEST_VULKAN), ) +VULKAN_LD_FLAGS ?= -lvulkan +endif OPENGL_LD_FLAGS ?= -lGL HOST_OS=linux endif @@ -326,6 +341,10 @@ endif ifneq ($(TEST_OPENCL), ) OPENCL_LD_FLAGS ?= -framework OpenCL endif +ifneq ($(TEST_VULKAN), ) +# The Vulkan loader is distributed as a dylib on OSX (not a framework) +VULKAN_LD_FLAGS ?= -lvulkan +endif ifneq ($(TEST_METAL), ) METAL_LD_FLAGS ?= -framework Metal -framework Foundation endif @@ -337,6 +356,10 @@ ifneq ($(TEST_OPENCL), ) TEST_CXX_FLAGS += -DTEST_OPENCL endif +ifneq ($(TEST_VULKAN), ) +TEST_CXX_FLAGS += -DTEST_VULKAN +endif + ifneq ($(TEST_METAL), ) # Using Metal APIs requires writing Objective-C++ (or Swift). 
Add ObjC++ # to allow tests to create and destroy Metal contexts, etc. This requires @@ -436,6 +459,7 @@ SOURCE_FILES = \ CodeGen_Metal_Dev.cpp \ CodeGen_MIPS.cpp \ CodeGen_OpenCL_Dev.cpp \ + CodeGen_Vulkan_Dev.cpp \ CodeGen_OpenGLCompute_Dev.cpp \ CodeGen_Posix.cpp \ CodeGen_PowerPC.cpp \ @@ -614,6 +638,7 @@ HEADER_FILES = \ CodeGen_LLVM.h \ CodeGen_Metal_Dev.h \ CodeGen_OpenCL_Dev.h \ + CodeGen_Vulkan_Dev.h \ CodeGen_OpenGLCompute_Dev.h \ CodeGen_Posix.h \ CodeGen_PTX_Dev.h \ @@ -838,8 +863,10 @@ RUNTIME_CPP_COMPONENTS = \ windows_profiler \ windows_threads \ windows_threads_tsan \ + windows_vulkan \ windows_yield \ write_debug_image \ + vulkan \ x86_cpu_features \ RUNTIME_LL_COMPONENTS = \ @@ -869,6 +896,7 @@ RUNTIME_EXPORTED_INCLUDES = $(INCLUDE_DIR)/HalideRuntime.h \ $(INCLUDE_DIR)/HalideRuntimeOpenGLCompute.h \ $(INCLUDE_DIR)/HalideRuntimeMetal.h \ $(INCLUDE_DIR)/HalideRuntimeQurt.h \ + $(INCLUDE_DIR)/HalideRuntimeVulkan.h \ $(INCLUDE_DIR)/HalideBuffer.h \ $(INCLUDE_DIR)/HalidePyTorchHelpers.h \ $(INCLUDE_DIR)/HalidePyTorchCudaHelpers.h From 57ae62b7fe81579dafe60128421715aaa662a76c Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 11:09:36 -0700 Subject: [PATCH 005/166] Add Vulkan target to Python bindings --- python_bindings/src/halide/halide_/PyEnums.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index c04478211676..c67906a73574 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -14,6 +14,7 @@ void define_enums(py::module &m) { .value("Host", DeviceAPI::Host) .value("Default_GPU", DeviceAPI::Default_GPU) .value("CUDA", DeviceAPI::CUDA) + .value("Vulkan", DeviceAPI::Vulkan) .value("OpenCL", DeviceAPI::OpenCL) .value("OpenGLCompute", DeviceAPI::OpenGLCompute) .value("Metal", DeviceAPI::Metal) @@ -175,6 +176,7 @@ void define_enums(py::module &m) { .value("SanitizerCoverage", Target::Feature::SanitizerCoverage) .value("ProfileByTimer", Target::Feature::ProfileByTimer) .value("SPIRV", Target::Feature::SPIRV) + .value("Vulkan", Target::Feature::Vulkan) .value("FeatureEnd", Target::Feature::FeatureEnd); py::enum_(m, "TypeCode") From 5d3d955bf4ac696e0e066fcc819f55764ab340d6 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 11:10:25 -0700 Subject: [PATCH 006/166] Add runtime linker support to target Vulkan CodeGen --- src/CodeGen_Internal.cpp | 2 ++ src/DeviceInterface.cpp | 4 ++-- src/FuseGPUThreadLoops.cpp | 3 ++- src/JITModule.cpp | 43 ++++++++++++++++++++++++++++++++++++- src/LLVM_Runtime_Linker.cpp | 15 +++++++++++++ 5 files changed, 63 insertions(+), 4 deletions(-) diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index f2e628af3fe5..75a971330ecc 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -120,6 +120,7 @@ bool function_takes_user_context(const std::string &name) { "halide_openglcompute_run", "halide_metal_run", "halide_d3d12compute_run", + "halide_vulkan_run", "halide_msan_annotate_buffer_is_initialized_as_destructor", "halide_msan_annotate_buffer_is_initialized", "halide_msan_annotate_memory_is_initialized", @@ -143,6 +144,7 @@ bool function_takes_user_context(const std::string &name) { "halide_openglcompute_initialize_kernels", "halide_metal_initialize_kernels", "halide_d3d12compute_initialize_kernels", + "halide_vulkan_initialize_kernels", "halide_get_gpu_device", "_halide_buffer_crop", "_halide_buffer_retire_crop_after_extern_stage", 
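With the target feature, DeviceAPI enum value, and runtime hooks above in place, a pipeline can opt into the new backend the same way it does for the other GPU APIs. A minimal sketch of what that looks like from user code (the schedule, sizes, and variable names here are illustrative only and are not part of this patch; the "vulkan" feature string comes from the feature map added earlier, so HL_TARGET=host-vulkan works as well):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Func f("f");
        Var x("x"), y("y"), xo, yo, xi, yi;
        f(x, y) = x + y;

        // Request the Vulkan feature explicitly on the host target.
        Target t = get_host_target().with_feature(Target::Vulkan);

        // Map the loop nest onto GPU blocks/threads and pin the device API to Vulkan.
        f.gpu_tile(x, y, xo, yo, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::Vulkan);

        Buffer<int> out = f.realize({256, 256}, t);
        return 0;
    }
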
diff --git a/src/DeviceInterface.cpp b/src/DeviceInterface.cpp index b21580c18f3a..fa264a33dcae 100644 --- a/src/DeviceInterface.cpp +++ b/src/DeviceInterface.cpp @@ -152,8 +152,6 @@ DeviceAPI get_default_device_api_for_target(const Target &target) { return DeviceAPI::OpenCL; } else if (target.has_feature(Target::CUDA)) { return DeviceAPI::CUDA; - } else if (target.has_feature(Target::Vulkan)) { - return DeviceAPI::Vulkan; } else if (target.has_feature(Target::OpenGLCompute)) { return DeviceAPI::OpenGLCompute; } else if (target.arch != Target::Hexagon && target.has_feature(Target::HVX)) { @@ -162,6 +160,8 @@ DeviceAPI get_default_device_api_for_target(const Target &target) { return DeviceAPI::HexagonDma; } else if (target.has_feature(Target::D3D12Compute)) { return DeviceAPI::D3D12Compute; + } else if (target.has_feature(Target::Vulkan)) { + return DeviceAPI::Vulkan; } else { return DeviceAPI::Host; } diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index cd65a32618f5..7bf5707765a6 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -1489,7 +1489,8 @@ class ZeroGPULoopMins : public IRMutator { in_non_glsl_gpu = (in_non_glsl_gpu && op->device_api == DeviceAPI::None) || (op->device_api == DeviceAPI::CUDA) || (op->device_api == DeviceAPI::OpenCL) || (op->device_api == DeviceAPI::Metal) || - (op->device_api == DeviceAPI::D3D12Compute); + (op->device_api == DeviceAPI::D3D12Compute) || + (op->device_api == DeviceAPI::Vulkan); Stmt stmt = IRMutator::visit(op); if (CodeGen_GPU_Dev::is_gpu_var(op->name) && !is_const_zero(op->min)) { diff --git a/src/JITModule.cpp b/src/JITModule.cpp index e595613ffb5e..730c6bc94924 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -115,6 +115,27 @@ void load_metal() { #endif } +void load_vulkan() { + if (have_symbol("vkGetInstanceProcAddr")) { + debug(1) << "Vulkan support code already linked in...\n"; + } else { + debug(1) << "Looking for Vulkan support code...\n"; + string error; +#if defined(__linux__) + llvm::sys::DynamicLibrary::LoadLibraryPermanently("libvulkan.so.1", &error); + user_assert(error.empty()) << "Could not find libvulkan.so.1\n"; +#elif defined(__APPLE__) + llvm::sys::DynamicLibrary::LoadLibraryPermanently("libvulkan.1.dylib", &error); + user_assert(error.empty()) << "Could not find libvulkan.1.dylib\n"; +#elif defined(_WIN32) + llvm::sys::DynamicLibrary::LoadLibraryPermanently("vulkan-1.dll", &error); + user_assert(error.empty()) << "Could not find vulkan-1.dll\n"; +#else + internal_error << "JIT support for Vulkan only available on Linux, OS X and Windows!\n"; +#endif + } +} + } // namespace using namespace llvm; @@ -648,12 +669,14 @@ enum RuntimeKind { OpenGLCompute, Hexagon, D3D12Compute, + Vulkan, OpenCLDebug, MetalDebug, CUDADebug, OpenGLComputeDebug, HexagonDebug, D3D12ComputeDebug, + VulkanDebug, MaxRuntimeKind }; @@ -689,6 +712,7 @@ JITModule &make_module(llvm::Module *for_module, Target target, one_gpu.set_feature(Target::HVX, false); one_gpu.set_feature(Target::OpenGLCompute, false); one_gpu.set_feature(Target::D3D12Compute, false); + one_gpu.set_feature(Target::Vulkan, false); string module_name; switch (runtime_kind) { case OpenCLDebug: @@ -752,6 +776,17 @@ JITModule &make_module(llvm::Module *for_module, Target target, internal_error << "JIT support for Direct3D 12 is only implemented on Windows 10 and above.\n"; #endif break; + case VulkanDebug: + one_gpu.set_feature(Target::Debug); + one_gpu.set_feature(Target::Vulkan); + load_vulkan(); + module_name = "debug_vulkan"; + break; + 
case Vulkan: + one_gpu.set_feature(Target::Vulkan); + load_vulkan(); + module_name += "vulkan"; + break; default: module_name = "shared runtime"; break; @@ -937,7 +972,13 @@ std::vector JITSharedRuntime::get(llvm::Module *for_module, const Tar result.push_back(m); } } - + if (target.has_feature(Target::Vulkan)) { + auto kind = target.has_feature(Target::Debug) ? VulkanDebug : Vulkan; + JITModule m = make_module(for_module, target, kind, result, create); + if (m.compiled()) { + result.push_back(m); + } + } return result; } diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index 4995b197b433..df22a69c8640 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -228,6 +228,14 @@ DECLARE_CPP_INITMOD_LOOKUP(windows_d3d12compute_arm) DECLARE_NO_INITMOD(windows_d3d12compute_arm) #endif // WITH_D3D12 +#ifdef WITH_VULKAN +DECLARE_CPP_INITMOD(vulkan) +DECLARE_CPP_INITMOD(windows_vulkan) +#else +DECLARE_NO_INITMOD(vulkan) +DECLARE_NO_INITMOD(windows_vulkan) +#endif // WITH_VULKAN + #ifdef WITH_X86 DECLARE_LL_INITMOD(x86_amx) DECLARE_LL_INITMOD(x86_avx512) @@ -1197,6 +1205,13 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM user_error << "Direct3D 12 can only be used on ARM or X86 architectures.\n"; } } + if (t.has_feature(Target::Vulkan)) { + if (t.os == Target::Windows) { + modules.push_back(get_initmod_windows_vulkan(c, bits_64, debug)); + } else { + modules.push_back(get_initmod_vulkan(c, bits_64, debug)); + } + } if (t.arch != Target::Hexagon && t.has_feature(Target::HVX)) { modules.push_back(get_initmod_module_jit_ref_count(c, bits_64, debug)); modules.push_back(get_initmod_hexagon_host(c, bits_64, debug)); From 76f636d48aa8e1cb1633fc9d957e27c484b391eb Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 11:11:20 -0700 Subject: [PATCH 007/166] Add Vulkan windows decorator to runtime targets --- src/runtime/CMakeLists.txt | 1 + src/runtime/windows_vulkan.cpp | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 src/runtime/windows_vulkan.cpp diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt index 5704556d72e3..67c6221be270 100644 --- a/src/runtime/CMakeLists.txt +++ b/src/runtime/CMakeLists.txt @@ -92,6 +92,7 @@ set(RUNTIME_CPP windows_profiler windows_threads windows_threads_tsan + windows_vulkan windows_yield write_debug_image x86_cpu_features diff --git a/src/runtime/windows_vulkan.cpp b/src/runtime/windows_vulkan.cpp new file mode 100644 index 000000000000..cd12ca0aa74b --- /dev/null +++ b/src/runtime/windows_vulkan.cpp @@ -0,0 +1,2 @@ +#define WINDOWS +#include "vulkan.cpp" From 522ea43346aa7845db88cb761bf9d105fba737f6 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 11:14:44 -0700 Subject: [PATCH 008/166] Wrap debug messages for internal runtime classes with DEBUG_INTERNAL Error on failed string termination --- src/runtime/internal/block_allocator.h | 20 ++++++++-------- src/runtime/internal/block_storage.h | 10 ++++++++ src/runtime/internal/pointer_table.h | 9 ++++--- src/runtime/internal/region_allocator.h | 32 +++++++++++++------------ src/runtime/internal/string_storage.h | 4 +++- 5 files changed, 44 insertions(+), 31 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 8dd7e4fc6dfa..fbdf6f52b395 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -142,7 +142,7 @@ void BlockAllocator::initialize(void *user_context, const Config &cfg, const Mem } MemoryRegion 
*BlockAllocator::reserve(void *user_context, const MemoryRequest &request) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Reserve (" << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)request.offset << " " @@ -239,7 +239,7 @@ void BlockAllocator::destroy(void *user_context) { MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAllocator *allocator, const MemoryRequest &request) { MemoryRegion *result = allocator->reserve(user_context, request); if (result == nullptr) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Failed to allocate region of size (" << (int32_t)(request.size) << " bytes)!\n"; #endif @@ -274,7 +274,7 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro size_t available = (block->memory.size - block->reserved); if (available >= size) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: find_block_entry (FOUND) (" << "user_context=" << (void *)(user_context) << " " << "block_entry=" << (void *)(block_entry) << " " @@ -310,7 +310,7 @@ BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties & RegionAllocator * BlockAllocator::create_region_allocator(void *user_context, BlockResource *block) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Creating region allocator (" << "user_context=" << (void *)(user_context) << " " << "block_resource=" << (void *)(block) << ")...\n"; @@ -328,7 +328,7 @@ BlockAllocator::create_region_allocator(void *user_context, BlockResource *block } void BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator *region_allocator) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Destroying region allocator (" << "user_context=" << (void *)(user_context) << " " << "region_allocator=" << (void *)(region_allocator) << ")...\n"; @@ -353,7 +353,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p return nullptr; } -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Creating block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " @@ -371,7 +371,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p } void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Releasing block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << ")...\n"; @@ -383,7 +383,7 @@ void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::Blo } void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Destroying block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " @@ -399,7 +399,7 @@ void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Blo } void BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; 
#endif halide_abort_if_false(user_context, allocators.block.allocate != nullptr); @@ -409,7 +409,7 @@ void BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block } void BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; #endif halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); diff --git a/src/runtime/internal/block_storage.h b/src/runtime/internal/block_storage.h index a552c0a438d9..8c17f700040f 100644 --- a/src/runtime/internal/block_storage.h +++ b/src/runtime/internal/block_storage.h @@ -57,6 +57,8 @@ class BlockStorage { void destroy(void *user_context); bool empty() const; + bool full() const; + bool is_valid(size_t index) const; size_t stride() const; size_t size() const; @@ -321,6 +323,14 @@ bool BlockStorage::empty() const { return count == 0; } +bool BlockStorage::full() const { + return (count >= capacity); +} + +bool BlockStorage::is_valid(size_t index) const { + return (index < capacity); +} + size_t BlockStorage::size() const { return count; } diff --git a/src/runtime/internal/pointer_table.h b/src/runtime/internal/pointer_table.h index b37a86338028..7f16f11ea49f 100644 --- a/src/runtime/internal/pointer_table.h +++ b/src/runtime/internal/pointer_table.h @@ -184,7 +184,7 @@ void PointerTable::resize(void *user_context, size_t entry_count, bool realloc) size_t actual_size = current_size; count = requested_size; -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "PointerTable: Resize (" << "requested_size=" << (int32_t)requested_size << " " << "current_size=" << (int32_t)current_size << " " @@ -234,7 +234,7 @@ void PointerTable::remove(void *user_context, size_t index, size_t entry_count) size_t src_offset = (index + entry_count) * sizeof(void *); size_t bytes = (last_index - index - entry_count) * sizeof(void *); -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "PointerTable: Remove (" << "index=" << (int32_t)index << " " << "entry_count=" << (int32_t)entry_count << " " @@ -253,8 +253,7 @@ void PointerTable::replace(void *user_context, size_t index, const void **array, size_t remaining = count - index; size_t copy_count = min(remaining, array_size); -#ifdef DEBUG_RUNTIME - +#ifdef DEBUG_INTERNAL debug(user_context) << "PointerTable: Replace (" << "index=" << (int32_t)index << " " << "array_size=" << (int32_t)array_size << " " @@ -332,7 +331,7 @@ void PointerTable::allocate(void *user_context, size_t new_capacity) { halide_abort_if_false(user_context, allocator.allocate != nullptr); size_t bytes = new_capacity * sizeof(void *); -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "PointerTable: Allocating (bytes=" << (int32_t)bytes << " allocator=" << (void *)allocator.allocate << ")...\n"; #endif diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 5deba8c644fc..e01a94699ed6 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -133,7 +133,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & halide_abort_if_false(user_context, request.size > 0); size_t remaining = block->memory.size - block->reserved; if (remaining < request.size) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve 
more memory from block " << "-- requested size (" << (int32_t)(request.size) << " bytes) " << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; @@ -143,7 +143,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & BlockRegion *block_region = find_block_region(user_context, request); if (block_region == nullptr) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" << (int32_t)(request.size) << " bytes)!\n"; #endif @@ -151,7 +151,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & } if (can_split(block_region, request.size)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; #endif @@ -230,7 +230,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe if (block_region->prev_ptr && (block_region->prev_ptr->status == AllocationStatus::Available)) { BlockRegion *prev_region = block_region->prev_ptr; -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; @@ -248,7 +248,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe if (block_region->next_ptr && (block_region->next_ptr->status == AllocationStatus::Available)) { BlockRegion *next_region = block_region->next_ptr; -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; @@ -276,7 +276,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion size_t empty_offset = adjusted_offset + size; size_t empty_size = block_region->memory.size - adjusted_size; -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " << "to create empty region (offset=" << (int32_t)empty_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; @@ -299,7 +299,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion } BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Creating block region (" << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)offset << " " @@ -317,7 +317,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo return nullptr; } -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Added block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; @@ 
-333,7 +333,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo } void RegionAllocator::release_block_region(void *user_context, BlockRegion *block_region) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Releasing block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; @@ -342,7 +342,7 @@ void RegionAllocator::release_block_region(void *user_context, BlockRegion *bloc } void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block_region) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Destroying block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; @@ -353,7 +353,7 @@ void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *bloc } void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_region) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Allocating region (size=" << (int32_t)(block_region->memory.size) << ", offset=" << (int32_t)block_region->memory.offset << ")!\n"; #endif halide_abort_if_false(user_context, allocators.region.allocate != nullptr); @@ -365,14 +365,16 @@ void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_ } void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_region) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Freeing block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif if ((block_region->status == AllocationStatus::InUse) || (block_region->status == AllocationStatus::Dedicated)) { +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Deallocating region (size=" << (int32_t)(block_region->memory.size) << ", offset=" << (int32_t)block_region->memory.offset << ")!\n"; +#endif halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); allocators.region.deallocate(user_context, memory_region); @@ -383,7 +385,7 @@ void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_r } void RegionAllocator::release(void *user_context) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Releasing all regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; #endif @@ -393,7 +395,7 @@ void RegionAllocator::release(void *user_context) { } bool RegionAllocator::collect(void *user_context) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Collecting free block regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; #endif @@ -410,7 +412,7 @@ bool RegionAllocator::collect(void *user_context) { } void RegionAllocator::destroy(void *user_context) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_INTERNAL debug(user_context) << "RegionAllocator: Destroying all block regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; #endif diff --git a/src/runtime/internal/string_storage.h b/src/runtime/internal/string_storage.h index ac7dac69215c..21f20981dc33 100644 --- a/src/runtime/internal/string_storage.h +++ b/src/runtime/internal/string_storage.h @@ -256,9 +256,11 @@ void StringStorage::prepend(void *user_context, char ch) { } void StringStorage::terminate(void 
*user_context, size_t length) { - if (contents.data() && (length < contents.size())) { + if (contents.is_valid(length)) { char *end_ptr = static_cast(contents[length]); (*end_ptr) = '\0'; + } else { + halide_error(user_context, "StringStorage: Failed to terminate string! Out of bounds!\n"); } } From acc13ee2a1b0f0e619bf43f3ead77d16c9a4a2ac Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 12:26:39 -0700 Subject: [PATCH 009/166] Silence clang-tidy warnings for redundant expressions on Vulkan enum values --- src/runtime/mini_vulkan.h | 106 +++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index f0f10539a162..fc501cfcc72f 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -120,7 +120,7 @@ typedef enum VkPipelineCacheHeaderVersion { VK_PIPELINE_CACHE_HEADER_VERSION_ONE = 1, VK_PIPELINE_CACHE_HEADER_VERSION_BEGIN_RANGE = VK_PIPELINE_CACHE_HEADER_VERSION_ONE, VK_PIPELINE_CACHE_HEADER_VERSION_END_RANGE = VK_PIPELINE_CACHE_HEADER_VERSION_ONE, - VK_PIPELINE_CACHE_HEADER_VERSION_RANGE_SIZE = (VK_PIPELINE_CACHE_HEADER_VERSION_ONE - VK_PIPELINE_CACHE_HEADER_VERSION_ONE + 1), + VK_PIPELINE_CACHE_HEADER_VERSION_RANGE_SIZE = (VK_PIPELINE_CACHE_HEADER_VERSION_ONE - VK_PIPELINE_CACHE_HEADER_VERSION_ONE + 1), // NOLINT: misc-redundant-expression VK_PIPELINE_CACHE_HEADER_VERSION_MAX_ENUM = 0x7FFFFFFF } VkPipelineCacheHeaderVersion; @@ -154,7 +154,7 @@ typedef enum VkResult { VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = -1000072003, VK_RESULT_BEGIN_RANGE = VK_ERROR_FRAGMENTED_POOL, VK_RESULT_END_RANGE = VK_INCOMPLETE, - VK_RESULT_RANGE_SIZE = (VK_INCOMPLETE - VK_ERROR_FRAGMENTED_POOL + 1), + VK_RESULT_RANGE_SIZE = (VK_INCOMPLETE - VK_ERROR_FRAGMENTED_POOL + 1), // NOLINT: misc-redundant-expression VK_RESULT_MAX_ENUM = 0x7FFFFFFF } VkResult; @@ -347,7 +347,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_SHADER_MODULE_VALIDATION_CACHE_CREATE_INFO_EXT = 1000160001, VK_STRUCTURE_TYPE_BEGIN_RANGE = VK_STRUCTURE_TYPE_APPLICATION_INFO, VK_STRUCTURE_TYPE_END_RANGE = VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO, - VK_STRUCTURE_TYPE_RANGE_SIZE = (VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO - VK_STRUCTURE_TYPE_APPLICATION_INFO + 1), + VK_STRUCTURE_TYPE_RANGE_SIZE = (VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO - VK_STRUCTURE_TYPE_APPLICATION_INFO + 1), // NOLINT: misc-redundant-expression VK_STRUCTURE_TYPE_MAX_ENUM = 0x7FFFFFFF } VkStructureType; @@ -359,7 +359,7 @@ typedef enum VkSystemAllocationScope { VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE = 4, VK_SYSTEM_ALLOCATION_SCOPE_BEGIN_RANGE = VK_SYSTEM_ALLOCATION_SCOPE_COMMAND, VK_SYSTEM_ALLOCATION_SCOPE_END_RANGE = VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE, - VK_SYSTEM_ALLOCATION_SCOPE_RANGE_SIZE = (VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND + 1), + VK_SYSTEM_ALLOCATION_SCOPE_RANGE_SIZE = (VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND + 1), // NOLINT: misc-redundant-expression VK_SYSTEM_ALLOCATION_SCOPE_MAX_ENUM = 0x7FFFFFFF } VkSystemAllocationScope; @@ -367,7 +367,7 @@ typedef enum VkInternalAllocationType { VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE = 0, VK_INTERNAL_ALLOCATION_TYPE_BEGIN_RANGE = VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE, VK_INTERNAL_ALLOCATION_TYPE_END_RANGE = VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE, - VK_INTERNAL_ALLOCATION_TYPE_RANGE_SIZE = (VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE - VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE + 1), + VK_INTERNAL_ALLOCATION_TYPE_RANGE_SIZE = 
(VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE - VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE + 1), // NOLINT: misc-redundant-expression VK_INTERNAL_ALLOCATION_TYPE_MAX_ENUM = 0x7FFFFFFF } VkInternalAllocationType; @@ -567,7 +567,7 @@ typedef enum VkFormat { VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG = 1000054007, VK_FORMAT_BEGIN_RANGE = VK_FORMAT_UNDEFINED, VK_FORMAT_END_RANGE = VK_FORMAT_ASTC_12x12_SRGB_BLOCK, - VK_FORMAT_RANGE_SIZE = (VK_FORMAT_ASTC_12x12_SRGB_BLOCK - VK_FORMAT_UNDEFINED + 1), + VK_FORMAT_RANGE_SIZE = (VK_FORMAT_ASTC_12x12_SRGB_BLOCK - VK_FORMAT_UNDEFINED + 1), // NOLINT: misc-redundant-expression VK_FORMAT_MAX_ENUM = 0x7FFFFFFF } VkFormat; @@ -577,7 +577,7 @@ typedef enum VkImageType { VK_IMAGE_TYPE_3D = 2, VK_IMAGE_TYPE_BEGIN_RANGE = VK_IMAGE_TYPE_1D, VK_IMAGE_TYPE_END_RANGE = VK_IMAGE_TYPE_3D, - VK_IMAGE_TYPE_RANGE_SIZE = (VK_IMAGE_TYPE_3D - VK_IMAGE_TYPE_1D + 1), + VK_IMAGE_TYPE_RANGE_SIZE = (VK_IMAGE_TYPE_3D - VK_IMAGE_TYPE_1D + 1), // NOLINT: misc-redundant-expression VK_IMAGE_TYPE_MAX_ENUM = 0x7FFFFFFF } VkImageType; @@ -586,7 +586,7 @@ typedef enum VkImageTiling { VK_IMAGE_TILING_LINEAR = 1, VK_IMAGE_TILING_BEGIN_RANGE = VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_TILING_END_RANGE = VK_IMAGE_TILING_LINEAR, - VK_IMAGE_TILING_RANGE_SIZE = (VK_IMAGE_TILING_LINEAR - VK_IMAGE_TILING_OPTIMAL + 1), + VK_IMAGE_TILING_RANGE_SIZE = (VK_IMAGE_TILING_LINEAR - VK_IMAGE_TILING_OPTIMAL + 1), // NOLINT: misc-redundant-expression VK_IMAGE_TILING_MAX_ENUM = 0x7FFFFFFF } VkImageTiling; @@ -598,7 +598,7 @@ typedef enum VkPhysicalDeviceType { VK_PHYSICAL_DEVICE_TYPE_CPU = 4, VK_PHYSICAL_DEVICE_TYPE_BEGIN_RANGE = VK_PHYSICAL_DEVICE_TYPE_OTHER, VK_PHYSICAL_DEVICE_TYPE_END_RANGE = VK_PHYSICAL_DEVICE_TYPE_CPU, - VK_PHYSICAL_DEVICE_TYPE_RANGE_SIZE = (VK_PHYSICAL_DEVICE_TYPE_CPU - VK_PHYSICAL_DEVICE_TYPE_OTHER + 1), + VK_PHYSICAL_DEVICE_TYPE_RANGE_SIZE = (VK_PHYSICAL_DEVICE_TYPE_CPU - VK_PHYSICAL_DEVICE_TYPE_OTHER + 1), // NOLINT: misc-redundant-expression VK_PHYSICAL_DEVICE_TYPE_MAX_ENUM = 0x7FFFFFFF } VkPhysicalDeviceType; @@ -608,7 +608,7 @@ typedef enum VkQueryType { VK_QUERY_TYPE_TIMESTAMP = 2, VK_QUERY_TYPE_BEGIN_RANGE = VK_QUERY_TYPE_OCCLUSION, VK_QUERY_TYPE_END_RANGE = VK_QUERY_TYPE_TIMESTAMP, - VK_QUERY_TYPE_RANGE_SIZE = (VK_QUERY_TYPE_TIMESTAMP - VK_QUERY_TYPE_OCCLUSION + 1), + VK_QUERY_TYPE_RANGE_SIZE = (VK_QUERY_TYPE_TIMESTAMP - VK_QUERY_TYPE_OCCLUSION + 1), // NOLINT: misc-redundant-expression VK_QUERY_TYPE_MAX_ENUM = 0x7FFFFFFF } VkQueryType; @@ -617,7 +617,7 @@ typedef enum VkSharingMode { VK_SHARING_MODE_CONCURRENT = 1, VK_SHARING_MODE_BEGIN_RANGE = VK_SHARING_MODE_EXCLUSIVE, VK_SHARING_MODE_END_RANGE = VK_SHARING_MODE_CONCURRENT, - VK_SHARING_MODE_RANGE_SIZE = (VK_SHARING_MODE_CONCURRENT - VK_SHARING_MODE_EXCLUSIVE + 1), + VK_SHARING_MODE_RANGE_SIZE = (VK_SHARING_MODE_CONCURRENT - VK_SHARING_MODE_EXCLUSIVE + 1), // NOLINT: misc-redundant-expression VK_SHARING_MODE_MAX_ENUM = 0x7FFFFFFF } VkSharingMode; @@ -635,7 +635,7 @@ typedef enum VkImageLayout { VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR = 1000111000, VK_IMAGE_LAYOUT_BEGIN_RANGE = VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_END_RANGE = VK_IMAGE_LAYOUT_PREINITIALIZED, - VK_IMAGE_LAYOUT_RANGE_SIZE = (VK_IMAGE_LAYOUT_PREINITIALIZED - VK_IMAGE_LAYOUT_UNDEFINED + 1), + VK_IMAGE_LAYOUT_RANGE_SIZE = (VK_IMAGE_LAYOUT_PREINITIALIZED - VK_IMAGE_LAYOUT_UNDEFINED + 1), // NOLINT: misc-redundant-expression VK_IMAGE_LAYOUT_MAX_ENUM = 0x7FFFFFFF } VkImageLayout; @@ -649,7 +649,7 @@ typedef enum VkImageViewType { VK_IMAGE_VIEW_TYPE_CUBE_ARRAY = 6, 
VK_IMAGE_VIEW_TYPE_BEGIN_RANGE = VK_IMAGE_VIEW_TYPE_1D, VK_IMAGE_VIEW_TYPE_END_RANGE = VK_IMAGE_VIEW_TYPE_CUBE_ARRAY, - VK_IMAGE_VIEW_TYPE_RANGE_SIZE = (VK_IMAGE_VIEW_TYPE_CUBE_ARRAY - VK_IMAGE_VIEW_TYPE_1D + 1), + VK_IMAGE_VIEW_TYPE_RANGE_SIZE = (VK_IMAGE_VIEW_TYPE_CUBE_ARRAY - VK_IMAGE_VIEW_TYPE_1D + 1), // NOLINT: misc-redundant-expression VK_IMAGE_VIEW_TYPE_MAX_ENUM = 0x7FFFFFFF } VkImageViewType; @@ -663,7 +663,7 @@ typedef enum VkComponentSwizzle { VK_COMPONENT_SWIZZLE_A = 6, VK_COMPONENT_SWIZZLE_BEGIN_RANGE = VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_END_RANGE = VK_COMPONENT_SWIZZLE_A, - VK_COMPONENT_SWIZZLE_RANGE_SIZE = (VK_COMPONENT_SWIZZLE_A - VK_COMPONENT_SWIZZLE_IDENTITY + 1), + VK_COMPONENT_SWIZZLE_RANGE_SIZE = (VK_COMPONENT_SWIZZLE_A - VK_COMPONENT_SWIZZLE_IDENTITY + 1), // NOLINT: misc-redundant-expression VK_COMPONENT_SWIZZLE_MAX_ENUM = 0x7FFFFFFF } VkComponentSwizzle; @@ -672,7 +672,7 @@ typedef enum VkVertexInputRate { VK_VERTEX_INPUT_RATE_INSTANCE = 1, VK_VERTEX_INPUT_RATE_BEGIN_RANGE = VK_VERTEX_INPUT_RATE_VERTEX, VK_VERTEX_INPUT_RATE_END_RANGE = VK_VERTEX_INPUT_RATE_INSTANCE, - VK_VERTEX_INPUT_RATE_RANGE_SIZE = (VK_VERTEX_INPUT_RATE_INSTANCE - VK_VERTEX_INPUT_RATE_VERTEX + 1), + VK_VERTEX_INPUT_RATE_RANGE_SIZE = (VK_VERTEX_INPUT_RATE_INSTANCE - VK_VERTEX_INPUT_RATE_VERTEX + 1), // NOLINT: misc-redundant-expression VK_VERTEX_INPUT_RATE_MAX_ENUM = 0x7FFFFFFF } VkVertexInputRate; @@ -690,7 +690,7 @@ typedef enum VkPrimitiveTopology { VK_PRIMITIVE_TOPOLOGY_PATCH_LIST = 10, VK_PRIMITIVE_TOPOLOGY_BEGIN_RANGE = VK_PRIMITIVE_TOPOLOGY_POINT_LIST, VK_PRIMITIVE_TOPOLOGY_END_RANGE = VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, - VK_PRIMITIVE_TOPOLOGY_RANGE_SIZE = (VK_PRIMITIVE_TOPOLOGY_PATCH_LIST - VK_PRIMITIVE_TOPOLOGY_POINT_LIST + 1), + VK_PRIMITIVE_TOPOLOGY_RANGE_SIZE = (VK_PRIMITIVE_TOPOLOGY_PATCH_LIST - VK_PRIMITIVE_TOPOLOGY_POINT_LIST + 1), // NOLINT: misc-redundant-expression VK_PRIMITIVE_TOPOLOGY_MAX_ENUM = 0x7FFFFFFF } VkPrimitiveTopology; @@ -701,7 +701,7 @@ typedef enum VkPolygonMode { VK_POLYGON_MODE_FILL_RECTANGLE_NV = 1000153000, VK_POLYGON_MODE_BEGIN_RANGE = VK_POLYGON_MODE_FILL, VK_POLYGON_MODE_END_RANGE = VK_POLYGON_MODE_POINT, - VK_POLYGON_MODE_RANGE_SIZE = (VK_POLYGON_MODE_POINT - VK_POLYGON_MODE_FILL + 1), + VK_POLYGON_MODE_RANGE_SIZE = (VK_POLYGON_MODE_POINT - VK_POLYGON_MODE_FILL + 1), // NOLINT: misc-redundant-expression VK_POLYGON_MODE_MAX_ENUM = 0x7FFFFFFF } VkPolygonMode; @@ -710,7 +710,7 @@ typedef enum VkFrontFace { VK_FRONT_FACE_CLOCKWISE = 1, VK_FRONT_FACE_BEGIN_RANGE = VK_FRONT_FACE_COUNTER_CLOCKWISE, VK_FRONT_FACE_END_RANGE = VK_FRONT_FACE_CLOCKWISE, - VK_FRONT_FACE_RANGE_SIZE = (VK_FRONT_FACE_CLOCKWISE - VK_FRONT_FACE_COUNTER_CLOCKWISE + 1), + VK_FRONT_FACE_RANGE_SIZE = (VK_FRONT_FACE_CLOCKWISE - VK_FRONT_FACE_COUNTER_CLOCKWISE + 1), // NOLINT: misc-redundant-expression VK_FRONT_FACE_MAX_ENUM = 0x7FFFFFFF } VkFrontFace; @@ -725,7 +725,7 @@ typedef enum VkCompareOp { VK_COMPARE_OP_ALWAYS = 7, VK_COMPARE_OP_BEGIN_RANGE = VK_COMPARE_OP_NEVER, VK_COMPARE_OP_END_RANGE = VK_COMPARE_OP_ALWAYS, - VK_COMPARE_OP_RANGE_SIZE = (VK_COMPARE_OP_ALWAYS - VK_COMPARE_OP_NEVER + 1), + VK_COMPARE_OP_RANGE_SIZE = (VK_COMPARE_OP_ALWAYS - VK_COMPARE_OP_NEVER + 1), // NOLINT: misc-redundant-expression VK_COMPARE_OP_MAX_ENUM = 0x7FFFFFFF } VkCompareOp; @@ -740,7 +740,7 @@ typedef enum VkStencilOp { VK_STENCIL_OP_DECREMENT_AND_WRAP = 7, VK_STENCIL_OP_BEGIN_RANGE = VK_STENCIL_OP_KEEP, VK_STENCIL_OP_END_RANGE = VK_STENCIL_OP_DECREMENT_AND_WRAP, - VK_STENCIL_OP_RANGE_SIZE = 
(VK_STENCIL_OP_DECREMENT_AND_WRAP - VK_STENCIL_OP_KEEP + 1), + VK_STENCIL_OP_RANGE_SIZE = (VK_STENCIL_OP_DECREMENT_AND_WRAP - VK_STENCIL_OP_KEEP + 1), // NOLINT: misc-redundant-expression VK_STENCIL_OP_MAX_ENUM = 0x7FFFFFFF } VkStencilOp; @@ -763,7 +763,7 @@ typedef enum VkLogicOp { VK_LOGIC_OP_SET = 15, VK_LOGIC_OP_BEGIN_RANGE = VK_LOGIC_OP_CLEAR, VK_LOGIC_OP_END_RANGE = VK_LOGIC_OP_SET, - VK_LOGIC_OP_RANGE_SIZE = (VK_LOGIC_OP_SET - VK_LOGIC_OP_CLEAR + 1), + VK_LOGIC_OP_RANGE_SIZE = (VK_LOGIC_OP_SET - VK_LOGIC_OP_CLEAR + 1), // NOLINT: misc-redundant-expression VK_LOGIC_OP_MAX_ENUM = 0x7FFFFFFF } VkLogicOp; @@ -789,7 +789,7 @@ typedef enum VkBlendFactor { VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA = 18, VK_BLEND_FACTOR_BEGIN_RANGE = VK_BLEND_FACTOR_ZERO, VK_BLEND_FACTOR_END_RANGE = VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA, - VK_BLEND_FACTOR_RANGE_SIZE = (VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA - VK_BLEND_FACTOR_ZERO + 1), + VK_BLEND_FACTOR_RANGE_SIZE = (VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA - VK_BLEND_FACTOR_ZERO + 1), // NOLINT: misc-redundant-expression VK_BLEND_FACTOR_MAX_ENUM = 0x7FFFFFFF } VkBlendFactor; @@ -847,7 +847,7 @@ typedef enum VkBlendOp { VK_BLEND_OP_BLUE_EXT = 1000148045, VK_BLEND_OP_BEGIN_RANGE = VK_BLEND_OP_ADD, VK_BLEND_OP_END_RANGE = VK_BLEND_OP_MAX, - VK_BLEND_OP_RANGE_SIZE = (VK_BLEND_OP_MAX - VK_BLEND_OP_ADD + 1), + VK_BLEND_OP_RANGE_SIZE = (VK_BLEND_OP_MAX - VK_BLEND_OP_ADD + 1), // NOLINT: misc-redundant-expression VK_BLEND_OP_MAX_ENUM = 0x7FFFFFFF } VkBlendOp; @@ -866,7 +866,7 @@ typedef enum VkDynamicState { VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT = 1000143000, VK_DYNAMIC_STATE_BEGIN_RANGE = VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_END_RANGE = VK_DYNAMIC_STATE_STENCIL_REFERENCE, - VK_DYNAMIC_STATE_RANGE_SIZE = (VK_DYNAMIC_STATE_STENCIL_REFERENCE - VK_DYNAMIC_STATE_VIEWPORT + 1), + VK_DYNAMIC_STATE_RANGE_SIZE = (VK_DYNAMIC_STATE_STENCIL_REFERENCE - VK_DYNAMIC_STATE_VIEWPORT + 1), // NOLINT: misc-redundant-expression VK_DYNAMIC_STATE_MAX_ENUM = 0x7FFFFFFF } VkDynamicState; @@ -876,7 +876,7 @@ typedef enum VkFilter { VK_FILTER_CUBIC_IMG = 1000015000, VK_FILTER_BEGIN_RANGE = VK_FILTER_NEAREST, VK_FILTER_END_RANGE = VK_FILTER_LINEAR, - VK_FILTER_RANGE_SIZE = (VK_FILTER_LINEAR - VK_FILTER_NEAREST + 1), + VK_FILTER_RANGE_SIZE = (VK_FILTER_LINEAR - VK_FILTER_NEAREST + 1), // NOLINT: misc-redundant-expression VK_FILTER_MAX_ENUM = 0x7FFFFFFF } VkFilter; @@ -885,7 +885,7 @@ typedef enum VkSamplerMipmapMode { VK_SAMPLER_MIPMAP_MODE_LINEAR = 1, VK_SAMPLER_MIPMAP_MODE_BEGIN_RANGE = VK_SAMPLER_MIPMAP_MODE_NEAREST, VK_SAMPLER_MIPMAP_MODE_END_RANGE = VK_SAMPLER_MIPMAP_MODE_LINEAR, - VK_SAMPLER_MIPMAP_MODE_RANGE_SIZE = (VK_SAMPLER_MIPMAP_MODE_LINEAR - VK_SAMPLER_MIPMAP_MODE_NEAREST + 1), + VK_SAMPLER_MIPMAP_MODE_RANGE_SIZE = (VK_SAMPLER_MIPMAP_MODE_LINEAR - VK_SAMPLER_MIPMAP_MODE_NEAREST + 1), // NOLINT: misc-redundant-expression VK_SAMPLER_MIPMAP_MODE_MAX_ENUM = 0x7FFFFFFF } VkSamplerMipmapMode; @@ -897,7 +897,7 @@ typedef enum VkSamplerAddressMode { VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE = 4, VK_SAMPLER_ADDRESS_MODE_BEGIN_RANGE = VK_SAMPLER_ADDRESS_MODE_REPEAT, VK_SAMPLER_ADDRESS_MODE_END_RANGE = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, - VK_SAMPLER_ADDRESS_MODE_RANGE_SIZE = (VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER - VK_SAMPLER_ADDRESS_MODE_REPEAT + 1), + VK_SAMPLER_ADDRESS_MODE_RANGE_SIZE = (VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER - VK_SAMPLER_ADDRESS_MODE_REPEAT + 1), // NOLINT: misc-redundant-expression VK_SAMPLER_ADDRESS_MODE_MAX_ENUM = 0x7FFFFFFF } VkSamplerAddressMode; @@ -910,7 
+910,7 @@ typedef enum VkBorderColor { VK_BORDER_COLOR_INT_OPAQUE_WHITE = 5, VK_BORDER_COLOR_BEGIN_RANGE = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, VK_BORDER_COLOR_END_RANGE = VK_BORDER_COLOR_INT_OPAQUE_WHITE, - VK_BORDER_COLOR_RANGE_SIZE = (VK_BORDER_COLOR_INT_OPAQUE_WHITE - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK + 1), + VK_BORDER_COLOR_RANGE_SIZE = (VK_BORDER_COLOR_INT_OPAQUE_WHITE - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK + 1), // NOLINT: misc-redundant-expression VK_BORDER_COLOR_MAX_ENUM = 0x7FFFFFFF } VkBorderColor; @@ -928,7 +928,7 @@ typedef enum VkDescriptorType { VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT = 10, VK_DESCRIPTOR_TYPE_BEGIN_RANGE = VK_DESCRIPTOR_TYPE_SAMPLER, VK_DESCRIPTOR_TYPE_END_RANGE = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, - VK_DESCRIPTOR_TYPE_RANGE_SIZE = (VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT - VK_DESCRIPTOR_TYPE_SAMPLER + 1), + VK_DESCRIPTOR_TYPE_RANGE_SIZE = (VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT - VK_DESCRIPTOR_TYPE_SAMPLER + 1), // NOLINT: misc-redundant-expression VK_DESCRIPTOR_TYPE_MAX_ENUM = 0x7FFFFFFF } VkDescriptorType; @@ -938,7 +938,7 @@ typedef enum VkAttachmentLoadOp { VK_ATTACHMENT_LOAD_OP_DONT_CARE = 2, VK_ATTACHMENT_LOAD_OP_BEGIN_RANGE = VK_ATTACHMENT_LOAD_OP_LOAD, VK_ATTACHMENT_LOAD_OP_END_RANGE = VK_ATTACHMENT_LOAD_OP_DONT_CARE, - VK_ATTACHMENT_LOAD_OP_RANGE_SIZE = (VK_ATTACHMENT_LOAD_OP_DONT_CARE - VK_ATTACHMENT_LOAD_OP_LOAD + 1), + VK_ATTACHMENT_LOAD_OP_RANGE_SIZE = (VK_ATTACHMENT_LOAD_OP_DONT_CARE - VK_ATTACHMENT_LOAD_OP_LOAD + 1), // NOLINT: misc-redundant-expression VK_ATTACHMENT_LOAD_OP_MAX_ENUM = 0x7FFFFFFF } VkAttachmentLoadOp; @@ -947,7 +947,7 @@ typedef enum VkAttachmentStoreOp { VK_ATTACHMENT_STORE_OP_DONT_CARE = 1, VK_ATTACHMENT_STORE_OP_BEGIN_RANGE = VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_STORE_OP_END_RANGE = VK_ATTACHMENT_STORE_OP_DONT_CARE, - VK_ATTACHMENT_STORE_OP_RANGE_SIZE = (VK_ATTACHMENT_STORE_OP_DONT_CARE - VK_ATTACHMENT_STORE_OP_STORE + 1), + VK_ATTACHMENT_STORE_OP_RANGE_SIZE = (VK_ATTACHMENT_STORE_OP_DONT_CARE - VK_ATTACHMENT_STORE_OP_STORE + 1), // NOLINT: misc-redundant-expression VK_ATTACHMENT_STORE_OP_MAX_ENUM = 0x7FFFFFFF } VkAttachmentStoreOp; @@ -956,7 +956,7 @@ typedef enum VkPipelineBindPoint { VK_PIPELINE_BIND_POINT_COMPUTE = 1, VK_PIPELINE_BIND_POINT_BEGIN_RANGE = VK_PIPELINE_BIND_POINT_GRAPHICS, VK_PIPELINE_BIND_POINT_END_RANGE = VK_PIPELINE_BIND_POINT_COMPUTE, - VK_PIPELINE_BIND_POINT_RANGE_SIZE = (VK_PIPELINE_BIND_POINT_COMPUTE - VK_PIPELINE_BIND_POINT_GRAPHICS + 1), + VK_PIPELINE_BIND_POINT_RANGE_SIZE = (VK_PIPELINE_BIND_POINT_COMPUTE - VK_PIPELINE_BIND_POINT_GRAPHICS + 1), // NOLINT: misc-redundant-expression VK_PIPELINE_BIND_POINT_MAX_ENUM = 0x7FFFFFFF } VkPipelineBindPoint; @@ -965,7 +965,7 @@ typedef enum VkCommandBufferLevel { VK_COMMAND_BUFFER_LEVEL_SECONDARY = 1, VK_COMMAND_BUFFER_LEVEL_BEGIN_RANGE = VK_COMMAND_BUFFER_LEVEL_PRIMARY, VK_COMMAND_BUFFER_LEVEL_END_RANGE = VK_COMMAND_BUFFER_LEVEL_SECONDARY, - VK_COMMAND_BUFFER_LEVEL_RANGE_SIZE = (VK_COMMAND_BUFFER_LEVEL_SECONDARY - VK_COMMAND_BUFFER_LEVEL_PRIMARY + 1), + VK_COMMAND_BUFFER_LEVEL_RANGE_SIZE = (VK_COMMAND_BUFFER_LEVEL_SECONDARY - VK_COMMAND_BUFFER_LEVEL_PRIMARY + 1), // NOLINT: misc-redundant-expression VK_COMMAND_BUFFER_LEVEL_MAX_ENUM = 0x7FFFFFFF } VkCommandBufferLevel; @@ -974,7 +974,7 @@ typedef enum VkIndexType { VK_INDEX_TYPE_UINT32 = 1, VK_INDEX_TYPE_BEGIN_RANGE = VK_INDEX_TYPE_UINT16, VK_INDEX_TYPE_END_RANGE = VK_INDEX_TYPE_UINT32, - VK_INDEX_TYPE_RANGE_SIZE = (VK_INDEX_TYPE_UINT32 - VK_INDEX_TYPE_UINT16 + 1), + VK_INDEX_TYPE_RANGE_SIZE = 
(VK_INDEX_TYPE_UINT32 - VK_INDEX_TYPE_UINT16 + 1), // NOLINT: misc-redundant-expression VK_INDEX_TYPE_MAX_ENUM = 0x7FFFFFFF } VkIndexType; @@ -983,7 +983,7 @@ typedef enum VkSubpassContents { VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS = 1, VK_SUBPASS_CONTENTS_BEGIN_RANGE = VK_SUBPASS_CONTENTS_INLINE, VK_SUBPASS_CONTENTS_END_RANGE = VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS, - VK_SUBPASS_CONTENTS_RANGE_SIZE = (VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS - VK_SUBPASS_CONTENTS_INLINE + 1), + VK_SUBPASS_CONTENTS_RANGE_SIZE = (VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS - VK_SUBPASS_CONTENTS_INLINE + 1), // NOLINT: misc-redundant-expression VK_SUBPASS_CONTENTS_MAX_ENUM = 0x7FFFFFFF } VkSubpassContents; @@ -1025,7 +1025,7 @@ typedef enum VkObjectType { VK_OBJECT_TYPE_VALIDATION_CACHE_EXT = 1000160000, VK_OBJECT_TYPE_BEGIN_RANGE = VK_OBJECT_TYPE_UNKNOWN, VK_OBJECT_TYPE_END_RANGE = VK_OBJECT_TYPE_COMMAND_POOL, - VK_OBJECT_TYPE_RANGE_SIZE = (VK_OBJECT_TYPE_COMMAND_POOL - VK_OBJECT_TYPE_UNKNOWN + 1), + VK_OBJECT_TYPE_RANGE_SIZE = (VK_OBJECT_TYPE_COMMAND_POOL - VK_OBJECT_TYPE_UNKNOWN + 1), // NOLINT: misc-redundant-expression VK_OBJECT_TYPE_MAX_ENUM = 0x7FFFFFFF } VkObjectType; @@ -3449,7 +3449,7 @@ typedef enum VkColorSpaceKHR { VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT = 1000104014, VK_COLOR_SPACE_BEGIN_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, VK_COLOR_SPACE_END_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, - VK_COLOR_SPACE_RANGE_SIZE_KHR = (VK_COLOR_SPACE_SRGB_NONLINEAR_KHR - VK_COLOR_SPACE_SRGB_NONLINEAR_KHR + 1), + VK_COLOR_SPACE_RANGE_SIZE_KHR = (VK_COLOR_SPACE_SRGB_NONLINEAR_KHR - VK_COLOR_SPACE_SRGB_NONLINEAR_KHR + 1), // NOLINT: misc-redundant-expression VK_COLOR_SPACE_MAX_ENUM_KHR = 0x7FFFFFFF } VkColorSpaceKHR; @@ -3462,7 +3462,7 @@ typedef enum VkPresentModeKHR { VK_PRESENT_MODE_SHARED_CONTINUOUS_REFRESH_KHR = 1000111001, VK_PRESENT_MODE_BEGIN_RANGE_KHR = VK_PRESENT_MODE_IMMEDIATE_KHR, VK_PRESENT_MODE_END_RANGE_KHR = VK_PRESENT_MODE_FIFO_RELAXED_KHR, - VK_PRESENT_MODE_RANGE_SIZE_KHR = (VK_PRESENT_MODE_FIFO_RELAXED_KHR - VK_PRESENT_MODE_IMMEDIATE_KHR + 1), + VK_PRESENT_MODE_RANGE_SIZE_KHR = (VK_PRESENT_MODE_FIFO_RELAXED_KHR - VK_PRESENT_MODE_IMMEDIATE_KHR + 1), // NOLINT: misc-redundant-expression VK_PRESENT_MODE_MAX_ENUM_KHR = 0x7FFFFFFF } VkPresentModeKHR; @@ -4206,7 +4206,7 @@ typedef enum VkDescriptorUpdateTemplateTypeKHR { VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR = 1, VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_BEGIN_RANGE_KHR = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_END_RANGE_KHR = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR, - VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_RANGE_SIZE_KHR = (VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR - VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR + 1), + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_RANGE_SIZE_KHR = (VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR - VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR + 1), // NOLINT: misc-redundant-expression VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF } VkDescriptorUpdateTemplateTypeKHR; @@ -4553,7 +4553,7 @@ typedef enum VkDebugReportObjectTypeEXT { VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR_EXT = 1000085000, VK_DEBUG_REPORT_OBJECT_TYPE_BEGIN_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, VK_DEBUG_REPORT_OBJECT_TYPE_END_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT, - VK_DEBUG_REPORT_OBJECT_TYPE_RANGE_SIZE_EXT = 
(VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT - VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT + 1), + VK_DEBUG_REPORT_OBJECT_TYPE_RANGE_SIZE_EXT = (VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT - VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT + 1), // NOLINT: misc-redundant-expression VK_DEBUG_REPORT_OBJECT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF } VkDebugReportObjectTypeEXT; @@ -4633,7 +4633,7 @@ typedef enum VkRasterizationOrderAMD { VK_RASTERIZATION_ORDER_RELAXED_AMD = 1, VK_RASTERIZATION_ORDER_BEGIN_RANGE_AMD = VK_RASTERIZATION_ORDER_STRICT_AMD, VK_RASTERIZATION_ORDER_END_RANGE_AMD = VK_RASTERIZATION_ORDER_RELAXED_AMD, - VK_RASTERIZATION_ORDER_RANGE_SIZE_AMD = (VK_RASTERIZATION_ORDER_RELAXED_AMD - VK_RASTERIZATION_ORDER_STRICT_AMD + 1), + VK_RASTERIZATION_ORDER_RANGE_SIZE_AMD = (VK_RASTERIZATION_ORDER_RELAXED_AMD - VK_RASTERIZATION_ORDER_STRICT_AMD + 1), // NOLINT: misc-redundant-expression VK_RASTERIZATION_ORDER_MAX_ENUM_AMD = 0x7FFFFFFF } VkRasterizationOrderAMD; @@ -5080,7 +5080,7 @@ typedef enum VkValidationCheckEXT { VK_VALIDATION_CHECK_SHADERS_EXT = 1, VK_VALIDATION_CHECK_BEGIN_RANGE_EXT = VK_VALIDATION_CHECK_ALL_EXT, VK_VALIDATION_CHECK_END_RANGE_EXT = VK_VALIDATION_CHECK_SHADERS_EXT, - VK_VALIDATION_CHECK_RANGE_SIZE_EXT = (VK_VALIDATION_CHECK_SHADERS_EXT - VK_VALIDATION_CHECK_ALL_EXT + 1), + VK_VALIDATION_CHECK_RANGE_SIZE_EXT = (VK_VALIDATION_CHECK_SHADERS_EXT - VK_VALIDATION_CHECK_ALL_EXT + 1), // NOLINT: misc-redundant-expression VK_VALIDATION_CHECK_MAX_ENUM_EXT = 0x7FFFFFFF } VkValidationCheckEXT; @@ -5170,7 +5170,7 @@ typedef enum VkIndirectCommandsTokenTypeNVX { VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX = 7, VK_INDIRECT_COMMANDS_TOKEN_TYPE_BEGIN_RANGE_NVX = VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX, VK_INDIRECT_COMMANDS_TOKEN_TYPE_END_RANGE_NVX = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX, - VK_INDIRECT_COMMANDS_TOKEN_TYPE_RANGE_SIZE_NVX = (VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX - VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX + 1), + VK_INDIRECT_COMMANDS_TOKEN_TYPE_RANGE_SIZE_NVX = (VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX - VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX + 1), // NOLINT: misc-redundant-expression VK_INDIRECT_COMMANDS_TOKEN_TYPE_MAX_ENUM_NVX = 0x7FFFFFFF } VkIndirectCommandsTokenTypeNVX; @@ -5182,7 +5182,7 @@ typedef enum VkObjectEntryTypeNVX { VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX = 4, VK_OBJECT_ENTRY_TYPE_BEGIN_RANGE_NVX = VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX, VK_OBJECT_ENTRY_TYPE_END_RANGE_NVX = VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX, - VK_OBJECT_ENTRY_TYPE_RANGE_SIZE_NVX = (VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX - VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX + 1), + VK_OBJECT_ENTRY_TYPE_RANGE_SIZE_NVX = (VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX - VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX + 1), // NOLINT: misc-redundant-expression VK_OBJECT_ENTRY_TYPE_MAX_ENUM_NVX = 0x7FFFFFFF } VkObjectEntryTypeNVX; @@ -5461,7 +5461,7 @@ typedef enum VkDisplayPowerStateEXT { VK_DISPLAY_POWER_STATE_ON_EXT = 2, VK_DISPLAY_POWER_STATE_BEGIN_RANGE_EXT = VK_DISPLAY_POWER_STATE_OFF_EXT, VK_DISPLAY_POWER_STATE_END_RANGE_EXT = VK_DISPLAY_POWER_STATE_ON_EXT, - VK_DISPLAY_POWER_STATE_RANGE_SIZE_EXT = (VK_DISPLAY_POWER_STATE_ON_EXT - VK_DISPLAY_POWER_STATE_OFF_EXT + 1), + VK_DISPLAY_POWER_STATE_RANGE_SIZE_EXT = (VK_DISPLAY_POWER_STATE_ON_EXT - VK_DISPLAY_POWER_STATE_OFF_EXT + 1), // NOLINT: misc-redundant-expression VK_DISPLAY_POWER_STATE_MAX_ENUM_EXT = 0x7FFFFFFF } VkDisplayPowerStateEXT; @@ -5469,7 +5469,7 @@ typedef enum VkDeviceEventTypeEXT { 
VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT = 0, VK_DEVICE_EVENT_TYPE_BEGIN_RANGE_EXT = VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT, VK_DEVICE_EVENT_TYPE_END_RANGE_EXT = VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT, - VK_DEVICE_EVENT_TYPE_RANGE_SIZE_EXT = (VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT - VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT + 1), + VK_DEVICE_EVENT_TYPE_RANGE_SIZE_EXT = (VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT - VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT + 1), // NOLINT: misc-redundant-expression VK_DEVICE_EVENT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF } VkDeviceEventTypeEXT; @@ -5477,7 +5477,7 @@ typedef enum VkDisplayEventTypeEXT { VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT = 0, VK_DISPLAY_EVENT_TYPE_BEGIN_RANGE_EXT = VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT, VK_DISPLAY_EVENT_TYPE_END_RANGE_EXT = VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT, - VK_DISPLAY_EVENT_TYPE_RANGE_SIZE_EXT = (VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT - VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT + 1), + VK_DISPLAY_EVENT_TYPE_RANGE_SIZE_EXT = (VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT - VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT + 1), // NOLINT: misc-redundant-expression VK_DISPLAY_EVENT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF } VkDisplayEventTypeEXT; @@ -5617,7 +5617,7 @@ typedef enum VkViewportCoordinateSwizzleNV { VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV = 7, VK_VIEWPORT_COORDINATE_SWIZZLE_BEGIN_RANGE_NV = VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV, VK_VIEWPORT_COORDINATE_SWIZZLE_END_RANGE_NV = VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV, - VK_VIEWPORT_COORDINATE_SWIZZLE_RANGE_SIZE_NV = (VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV - VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV + 1), + VK_VIEWPORT_COORDINATE_SWIZZLE_RANGE_SIZE_NV = (VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV - VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV + 1), // NOLINT: misc-redundant-expression VK_VIEWPORT_COORDINATE_SWIZZLE_MAX_ENUM_NV = 0x7FFFFFFF } VkViewportCoordinateSwizzleNV; @@ -5647,7 +5647,7 @@ typedef enum VkDiscardRectangleModeEXT { VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT = 1, VK_DISCARD_RECTANGLE_MODE_BEGIN_RANGE_EXT = VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT, VK_DISCARD_RECTANGLE_MODE_END_RANGE_EXT = VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT, - VK_DISCARD_RECTANGLE_MODE_RANGE_SIZE_EXT = (VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT - VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT + 1), + VK_DISCARD_RECTANGLE_MODE_RANGE_SIZE_EXT = (VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT - VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT + 1), // NOLINT: misc-redundant-expression VK_DISCARD_RECTANGLE_MODE_MAX_ENUM_EXT = 0x7FFFFFFF } VkDiscardRectangleModeEXT; @@ -5774,7 +5774,7 @@ typedef enum VkSamplerReductionModeEXT { VK_SAMPLER_REDUCTION_MODE_MAX_EXT = 2, VK_SAMPLER_REDUCTION_MODE_BEGIN_RANGE_EXT = VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT, VK_SAMPLER_REDUCTION_MODE_END_RANGE_EXT = VK_SAMPLER_REDUCTION_MODE_MAX_EXT, - VK_SAMPLER_REDUCTION_MODE_RANGE_SIZE_EXT = (VK_SAMPLER_REDUCTION_MODE_MAX_EXT - VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT + 1), + VK_SAMPLER_REDUCTION_MODE_RANGE_SIZE_EXT = (VK_SAMPLER_REDUCTION_MODE_MAX_EXT - VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT + 1), // NOLINT: misc-redundant-expression VK_SAMPLER_REDUCTION_MODE_MAX_ENUM_EXT = 0x7FFFFFFF } VkSamplerReductionModeEXT; @@ -5891,7 +5891,7 @@ typedef enum VkBlendOverlapEXT { VK_BLEND_OVERLAP_CONJOINT_EXT = 2, VK_BLEND_OVERLAP_BEGIN_RANGE_EXT = VK_BLEND_OVERLAP_UNCORRELATED_EXT, VK_BLEND_OVERLAP_END_RANGE_EXT = VK_BLEND_OVERLAP_CONJOINT_EXT, - VK_BLEND_OVERLAP_RANGE_SIZE_EXT = 
(VK_BLEND_OVERLAP_CONJOINT_EXT - VK_BLEND_OVERLAP_UNCORRELATED_EXT + 1), + VK_BLEND_OVERLAP_RANGE_SIZE_EXT = (VK_BLEND_OVERLAP_CONJOINT_EXT - VK_BLEND_OVERLAP_UNCORRELATED_EXT + 1), // NOLINT: misc-redundant-expression VK_BLEND_OVERLAP_MAX_ENUM_EXT = 0x7FFFFFFF } VkBlendOverlapEXT; @@ -5945,7 +5945,7 @@ typedef enum VkCoverageModulationModeNV { VK_COVERAGE_MODULATION_MODE_RGBA_NV = 3, VK_COVERAGE_MODULATION_MODE_BEGIN_RANGE_NV = VK_COVERAGE_MODULATION_MODE_NONE_NV, VK_COVERAGE_MODULATION_MODE_END_RANGE_NV = VK_COVERAGE_MODULATION_MODE_RGBA_NV, - VK_COVERAGE_MODULATION_MODE_RANGE_SIZE_NV = (VK_COVERAGE_MODULATION_MODE_RGBA_NV - VK_COVERAGE_MODULATION_MODE_NONE_NV + 1), + VK_COVERAGE_MODULATION_MODE_RANGE_SIZE_NV = (VK_COVERAGE_MODULATION_MODE_RGBA_NV - VK_COVERAGE_MODULATION_MODE_NONE_NV + 1), // NOLINT: misc-redundant-expression VK_COVERAGE_MODULATION_MODE_MAX_ENUM_NV = 0x7FFFFFFF } VkCoverageModulationModeNV; @@ -5979,7 +5979,7 @@ typedef enum VkValidationCacheHeaderVersionEXT { VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT = 1, VK_VALIDATION_CACHE_HEADER_VERSION_BEGIN_RANGE_EXT = VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT, VK_VALIDATION_CACHE_HEADER_VERSION_END_RANGE_EXT = VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT, - VK_VALIDATION_CACHE_HEADER_VERSION_RANGE_SIZE_EXT = (VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT - VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT + 1), + VK_VALIDATION_CACHE_HEADER_VERSION_RANGE_SIZE_EXT = (VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT - VK_VALIDATION_CACHE_HEADER_VERSION_ONE_EXT + 1), // NOLINT: misc-redundant-expression VK_VALIDATION_CACHE_HEADER_VERSION_MAX_ENUM_EXT = 0x7FFFFFFF } VkValidationCacheHeaderVersionEXT; From afc0bb6477360214b025efe9a2cc8303e8f09efd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 12:28:08 -0700 Subject: [PATCH 010/166] Clang tidy & format pass --- src/CodeGen_Vulkan_Dev.cpp | 268 ++++++++++++++---------------- src/runtime/HalideRuntimeVulkan.h | 8 +- src/runtime/vulkan.cpp | 10 +- src/runtime/vulkan_context.h | 24 +-- src/runtime/vulkan_extensions.h | 14 +- src/runtime/vulkan_functions.h | 2 +- src/runtime/vulkan_interface.h | 9 +- src/runtime/vulkan_internal.h | 82 ++++----- src/runtime/vulkan_memory.h | 137 ++++++++------- src/runtime/vulkan_resources.h | 90 +++++----- 10 files changed, 309 insertions(+), 335 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 1a0d74596301..3ebc058af8e0 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1,17 +1,17 @@ #include #include -#include "CodeGen_Vulkan_Dev.h" -#include "CodeGen_Internal.h" #include "CodeGen_GPU_Dev.h" -#include "Deinterleave.h" +#include "CodeGen_Internal.h" +#include "CodeGen_Vulkan_Dev.h" #include "Debug.h" +#include "Deinterleave.h" #include "IROperator.h" #include "IRPrinter.h" #include "Scope.h" #include "Target.h" -#include +#include // Temporary: #include @@ -21,12 +21,12 @@ namespace Internal { class CodeGen_LLVM; -namespace { // anonymous +namespace { // anonymous -template +template ValueT lower_int_uint_div(CodeGenT *cg, Expr a, Expr b); -template +template ValueT lower_int_uint_mod(CodeGenT *cg, Expr a, Expr b); class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { @@ -53,14 +53,15 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { std::string print_gpu_name(const std::string &name) override; - std::string api_unique_name() override { return "vulkan"; } + std::string api_unique_name() override { + return "vulkan"; + } protected: - class SPIRVEmitter : public IRVisitor { - + public: 
- SPIRVEmitter() { } + SPIRVEmitter() = default; using IRVisitor::visit; @@ -108,7 +109,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const Fork *) override; void visit(const Acquire *) override; - void visit_binop(Type t, Expr a, Expr b, uint32_t opcode); + void visit_binop(Type t, const Expr &a, const Expr &b, uint32_t opcode); // ID of last generated Expr. uint32_t id; @@ -138,11 +139,11 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { uint32_t current_function_id; // Top-level function for adding kernels - void add_kernel(Stmt s, const std::string &name, const std::vector &args); + void add_kernel(const Stmt &s, const std::string &name, const std::vector &args); // Function for allocating variables in function scope, with optional initializer. // These will appear at the beginning of the function, as required by SPIR-V - void add_allocation(uint32_t result_type_id, uint32_t result_id, uint32_t storage_class, uint32_t initializer=0); + void add_allocation(uint32_t result_type_id, uint32_t result_id, uint32_t storage_class, uint32_t initializer = 0); std::map type_map; std::map, uint32_t> pointer_type_map; @@ -153,13 +154,13 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { std::initializer_list words); void add_instruction(uint32_t opcode, std::initializer_list words); void add_instruction(std::vector ®ion, uint32_t opcode, - std::vector words); + std::vector words); void add_instruction(uint32_t opcode, std::vector words); uint32_t map_type(const Type &type); - uint32_t map_pointer_type(const Type &type, const uint32_t storage_class); + uint32_t map_pointer_type(const Type &type, uint32_t storage_class); uint32_t map_type_to_pair(const Type &t); uint32_t emit_constant(const Type &t, const void *data); - void scalarize(Expr e); + void scalarize(const Expr &e); // The scope contains both the symbol and its storage class Scope> symbol_table; @@ -171,8 +172,8 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { uint32_t ids[4]; }; // Returns Phi node inputs. 
- template - PhiNodeInputs emit_if_then_else(Expr condition, StmtOrExpr then_case, StmtOrExpr else_case); + template + PhiNodeInputs emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case); } emitter; std::string current_kernel_name; @@ -191,14 +192,14 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(uint32_t opcode, std::ini spir_v_kernels.insert(spir_v_kernels.end(), words.begin(), words.end()); } void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(std::vector ®ion, uint32_t opcode, - std::vector words) { - region.push_back(((1 + words.size()) << 16) | opcode); - region.insert(region.end(), words.begin(), words.end()); + std::vector words) { + region.push_back(((1 + words.size()) << 16) | opcode); + region.insert(region.end(), words.begin(), words.end()); } void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(uint32_t opcode, std::vector words) { - spir_v_kernels.push_back(((1 + words.size()) << 16) | opcode); - spir_v_kernels.insert(spir_v_kernels.end(), words.begin(), words.end()); + spir_v_kernels.push_back(((1 + words.size()) << 16) | opcode); + spir_v_kernels.insert(spir_v_kernels.end(), words.begin(), words.end()); } uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::emit_constant(const Type &t, const void *data) { @@ -215,7 +216,7 @@ uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::emit_constant(const Type &t, const vo debug(3) << "emit_constant for type " << t << "\n"; auto item = constant_map.find(key); - if (item == constant_map.end()) { + if (item == constant_map.end()) { uint32_t type_id = map_type(t); uint32_t extra_words = (t.bytes() + 3) / 4; uint32_t constant_id = next_id++; @@ -239,17 +240,17 @@ uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::emit_constant(const Type &t, const vo } } -void CodeGen_Vulkan_Dev::SPIRVEmitter::scalarize(Expr e) { +void CodeGen_Vulkan_Dev::SPIRVEmitter::scalarize(const Expr &e) { internal_assert(e.type().is_vector()) << "CodeGen_Vulkan_Dev::SPIRVEmitter::scalarize must be called with an expression of vector type.\n"; uint32_t type_id = map_type(e.type()); uint32_t result_id = next_id++; - add_instruction(SpvOpConstantNull, { type_id, result_id } ); + add_instruction(SpvOpConstantNull, {type_id, result_id}); for (int i = 0; i < e.type().lanes(); i++) { extract_lane(e, i).accept(this); uint32_t composite_vec = next_id++; - add_instruction(SpvOpVectorInsertDynamic, { type_id, composite_vec, (uint32_t)i, result_id, id }); + add_instruction(SpvOpVectorInsertDynamic, {type_id, composite_vec, (uint32_t)i, result_id, id}); result_id = composite_vec; } id = result_id; @@ -261,24 +262,24 @@ uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_type(const Type &t) { Type t_key(key_typecode, t.bits(), t.lanes()); auto item = type_map.find(t_key); - if (item == type_map.end()) { + if (item == type_map.end()) { // TODO, handle arrays, pointers, halide_buffer_t uint32_t type_id = 0; if (t.lanes() != 1) { uint32_t base_id = map_type(t.with_lanes(1)); type_id = next_id++; - add_instruction(spir_v_types, SpvOpTypeVector, { type_id, base_id, (uint32_t)t.lanes() }); + add_instruction(spir_v_types, SpvOpTypeVector, {type_id, base_id, (uint32_t)t.lanes()}); } else { if (t.is_float()) { type_id = next_id++; - add_instruction(spir_v_types, SpvOpTypeFloat, { type_id, (uint32_t)t.bits() }); + add_instruction(spir_v_types, SpvOpTypeFloat, {type_id, (uint32_t)t.bits()}); } else if (t.is_bool()) { type_id = next_id++; - add_instruction(spir_v_types, SpvOpTypeBool, { type_id }); + add_instruction(spir_v_types, SpvOpTypeBool, {type_id}); } else if 
(t.is_int_or_uint()) { type_id = next_id++; uint32_t signedness = t.is_uint() ? 0 : 1; - add_instruction(spir_v_types, SpvOpTypeInt, { type_id, (uint32_t)t.bits(), signedness }); + add_instruction(spir_v_types, SpvOpTypeInt, {type_id, (uint32_t)t.bits(), signedness}); } else { internal_error << "Unsupported type in Vulkan backend " << t << "\n"; } @@ -293,24 +294,24 @@ uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_type(const Type &t) { uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_type_to_pair(const Type &t) { uint32_t &ref = pair_type_map[t]; - if (ref == 0) { + if (ref == 0) { uint32_t base_type = map_type(t); uint32_t type_id = next_id++; - add_instruction(spir_v_types, SpvOpTypeStruct, { type_id, base_type, base_type }); + add_instruction(spir_v_types, SpvOpTypeStruct, {type_id, base_type, base_type}); ref = type_id; } return ref; } -uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_pointer_type(const Type &type, const uint32_t storage_class) { +uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_pointer_type(const Type &type, uint32_t storage_class) { auto key = std::make_pair(type, storage_class); uint32_t &ref = pointer_type_map[key]; if (ref == 0) { uint32_t base_type_id = map_type(type); ref = next_id++; - add_instruction(spir_v_types, SpvOpTypePointer, { ref, storage_class, base_type_id }); + add_instruction(spir_v_types, SpvOpTypePointer, {ref, storage_class, base_type_id}); pointer_type_map[key] = ref; } @@ -330,20 +331,19 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const UIntImm *imm) { } namespace { -void encode_string(std::vector& section, const uint32_t words, - const size_t str_size, const char* str) { +void encode_string(std::vector §ion, uint32_t words, + const size_t str_size, const char *str) { size_t bytes_copied = 0; for (uint32_t i = 0; i < words; i++) { - uint32_t word; - size_t to_copy = std::min(str_size + 1 - bytes_copied, (size_t)4); - memcpy(&word, str, to_copy); - bytes_copied += to_copy; - section.push_back(word); - str += 4; + uint32_t word; + size_t to_copy = std::min(str_size + 1 - bytes_copied, (size_t)4); + memcpy(&word, str, to_copy); + bytes_copied += to_copy; + section.push_back(word); + str += 4; } - -} } +} // namespace void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const StringImm *imm) { uint32_t extra_words = (imm->value.size() + 1 + 3) / 4; id = next_id++; @@ -401,7 +401,7 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Cast *op) { op->value.accept(this); uint32_t src_id = id; id = next_id++; - add_instruction(opcode, { type_id, id, src_id }); + add_instruction(opcode, {type_id, id, src_id}); } void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Add *op) { @@ -534,7 +534,7 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Not *op) { op->a.accept(this); uint32_t a_id = id; id = next_id++; - add_instruction(SpvOpLogicalNot, { type_id, id, a_id }); + add_instruction(SpvOpLogicalNot, {type_id, id, a_id}); } void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { @@ -542,8 +542,8 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { // TODO: Check the scopes here and figure out if this is the // right memory barrier. Might be able to use // SpvMemorySemanticsMaskNone instead. 
- add_instruction(SpvOpControlBarrier, { current_function_id, current_function_id, - SpvMemorySemanticsAcquireReleaseMask }); + add_instruction(SpvOpControlBarrier, {current_function_id, current_function_id, + SpvMemorySemanticsAcquireReleaseMask}); } else if (op->is_intrinsic(Call::bitwise_and)) { internal_assert(op->args.size() == 2); visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseAnd); @@ -559,7 +559,7 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { op->args[0]->accept(this); uint32_t arg_id = id; id = next_id++; - add_instruction(SpvOpNot, { type_id, id, arg_id }); + add_instruction(SpvOpNot, {type_id, id, arg_id}); } else if (op->is_intrinsic(Call::if_then_else)) { if (op->type.is_vector()) { scalarize(op); @@ -574,7 +574,7 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { spir_v_kernels.push_back(type_id); spir_v_kernels.push_back(id); spir_v_kernels.insert(spir_v_kernels.end(), phi_inputs.ids, phi_inputs.ids + 4); - } + } } else if (op->is_intrinsic(Call::IntrinsicOp::div_round_to_zero)) { internal_assert(op->args.size() == 2); uint32_t opcode = 0; @@ -658,7 +658,7 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Select *op) { op->false_value.accept(this); uint32_t false_id = id; id = next_id++; - add_instruction(SpvOpSelect, { type_id, id, cond_id, true_id, false_id }); + add_instruction(SpvOpSelect, {type_id, id, cond_id, true_id, false_id}); } void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Load *op) { @@ -717,7 +717,6 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Store *op) { emit_constant(UInt(32), &zero), index_id}); add_instruction(SpvOpStore, {access_chain_id, value_id}); - } void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Let *let) { @@ -757,19 +756,19 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const ProducerConsumer *) { namespace { std::pair simt_intrinsic(const std::string &name) { if (ends_with(name, ".__thread_id_x")) { - return {"LocalInvocationId", 0}; + return {"LocalInvocationId", 0}; } else if (ends_with(name, ".__thread_id_y")) { - return {"LocalInvocationId", 1}; + return {"LocalInvocationId", 1}; } else if (ends_with(name, ".__thread_id_z")) { - return {"LocalInvocationId", 2}; + return {"LocalInvocationId", 2}; } else if (ends_with(name, ".__block_id_x")) { - return {"WorkgroupId", 0}; + return {"WorkgroupId", 0}; } else if (ends_with(name, ".__block_id_y")) { - return {"WorkgroupId", 1}; + return {"WorkgroupId", 1}; } else if (ends_with(name, ".__block_id_z")) { - return {"WorkgroupId", 2}; + return {"WorkgroupId", 2}; } else if (ends_with(name, "id_w")) { - user_error << "Vulkan only supports <=3 dimensions for gpu blocks"; + user_error << "Vulkan only supports <=3 dimensions for gpu blocks"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return {"", -1}; @@ -783,7 +782,7 @@ int thread_loop_workgroup_index(const std::string &name) { } return -1; } -} // anonymous namespace +} // anonymous namespace void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { @@ -800,16 +799,14 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { const IntImm *wsize = op->extent.as(); user_assert(wsize != nullptr) << "Vulkan requires statically-known workgroup size.\n"; uint32_t new_wsize = wsize->value; - user_assert(workgroup_size[idx] == 0 || workgroup_size[idx] == new_wsize) << - "Vulkan requires all kernels have the same workgroup size, but two different ones " - "were encountered " << workgroup_size[idx] << " and " << new_wsize << - " in dimension " 
<< idx << "\n"; + user_assert(workgroup_size[idx] == 0 || workgroup_size[idx] == new_wsize) << "Vulkan requires all kernels have the same workgroup size, but two different ones " + "were encountered " + << workgroup_size[idx] << " and " << new_wsize << " in dimension " << idx << "\n"; workgroup_size[idx] = new_wsize; } auto intrinsic = simt_intrinsic(op->name); - // Intrinsics are inserted when adding the kernel internal_assert(symbol_table.contains(intrinsic.first)); @@ -831,7 +828,7 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { // TODO: Loop vars are alway int32_t right? uint32_t index_type_id = map_type(Int(32)); - uint32_t index_var_type_id = map_pointer_type(Int(32), SpvStorageClassFunction); + uint32_t index_var_type_id = map_pointer_type(Int(32), SpvStorageClassFunction); op->min.accept(this); uint32_t min_id = id; @@ -840,7 +837,7 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { // Compute max. uint32_t max_id = next_id++; - add_instruction(SpvOpIAdd, { index_type_id, max_id, min_id, extent_id }); + add_instruction(SpvOpIAdd, {index_type_id, max_id, min_id, extent_id}); // Declare loop var // TODO: Can we use the phi node for this? @@ -852,20 +849,20 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { uint32_t body_label_id = next_id++; uint32_t continue_label_id = next_id++; uint32_t merge_label_id = next_id++; - add_instruction(SpvOpLabel, { header_label_id }); - add_instruction(SpvOpLoopMerge, { merge_label_id, continue_label_id, SpvLoopControlMaskNone }); - add_instruction(SpvOpBranch, { loop_top_label_id }); - add_instruction(SpvOpLabel, { loop_top_label_id }); + add_instruction(SpvOpLabel, {header_label_id}); + add_instruction(SpvOpLoopMerge, {merge_label_id, continue_label_id, SpvLoopControlMaskNone}); + add_instruction(SpvOpBranch, {loop_top_label_id}); + add_instruction(SpvOpLabel, {loop_top_label_id}); // loop test. uint32_t cur_index_id = next_id++; - add_instruction(SpvOpLoad, { index_type_id, cur_index_id, loop_var_id }); + add_instruction(SpvOpLoad, {index_type_id, cur_index_id, loop_var_id}); uint32_t loop_test_id = next_id++; - add_instruction(SpvOpSLessThanEqual, { loop_test_id, cur_index_id, max_id }); - add_instruction(SpvOpBranchConditional, { loop_test_id, body_label_id, merge_label_id }); + add_instruction(SpvOpSLessThanEqual, {loop_test_id, cur_index_id, max_id}); + add_instruction(SpvOpBranchConditional, {loop_test_id, body_label_id, merge_label_id}); - add_instruction(SpvOpLabel, { body_label_id }); + add_instruction(SpvOpLabel, {body_label_id}); { ScopedBinding> binding(symbol_table, op->name, {cur_index_id, SpvStorageClassFunction}); @@ -873,17 +870,17 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { op->body.accept(this); } - add_instruction(SpvOpBranch, { continue_label_id }); - add_instruction(SpvOpLabel, { continue_label_id }); + add_instruction(SpvOpBranch, {continue_label_id}); + add_instruction(SpvOpLabel, {continue_label_id}); // Loop var update? 
uint32_t next_index_id = next_id++; int32_t one = 1; uint32_t constant_one_id = emit_constant(Int(32), &one); - add_instruction(SpvOpIAdd, { index_type_id, next_index_id, cur_index_id, constant_one_id}); - add_instruction(SpvOpStore, { index_type_id, next_index_id, loop_var_id }); - add_instruction(SpvOpBranch, { header_label_id }); - add_instruction(SpvOpLabel, { merge_label_id }); + add_instruction(SpvOpIAdd, {index_type_id, next_index_id, cur_index_id, constant_one_id}); + add_instruction(SpvOpStore, {index_type_id, next_index_id, loop_var_id}); + add_instruction(SpvOpBranch, {header_label_id}); + add_instruction(SpvOpLabel, {merge_label_id}); } } @@ -901,10 +898,10 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Ramp *op) { uint32_t first_id = next_id; for (int i = 1; i < op->lanes; i++) { uint32_t this_id = next_id++; - add_instruction(add_opcode, { base_type_id, this_id, prev_id, stride_id }); + add_instruction(add_opcode, {base_type_id, this_id, prev_id, stride_id}); prev_id = this_id; } - + id = next_id++; spir_v_kernels.push_back(((op->lanes + 3) << 16) | SpvOpCompositeConstruct); spir_v_kernels.push_back(type_id); @@ -941,9 +938,9 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Realize *) { internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Realize *): Realize encountered during codegen\n"; } -template +template CodeGen_Vulkan_Dev::SPIRVEmitter::PhiNodeInputs -CodeGen_Vulkan_Dev::SPIRVEmitter::emit_if_then_else(Expr condition, +CodeGen_Vulkan_Dev::SPIRVEmitter::emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case) { condition.accept(this); uint32_t cond_id = id; @@ -952,14 +949,14 @@ CodeGen_Vulkan_Dev::SPIRVEmitter::emit_if_then_else(Expr condition, uint32_t merge_label_id = next_id++; add_instruction(SpvOpSelectionMerge, {merge_label_id, SpvSelectionControlMaskNone}); - add_instruction(SpvOpBranchConditional, { cond_id, then_label_id, else_label_id }); - add_instruction(SpvOpLabel, { then_label_id }); + add_instruction(SpvOpBranchConditional, {cond_id, then_label_id, else_label_id}); + add_instruction(SpvOpLabel, {then_label_id}); then_case.accept(this); uint32_t then_id = id; - add_instruction(SpvOpBranch, { merge_label_id }); - add_instruction(SpvOpLabel, { else_label_id }); + add_instruction(SpvOpBranch, {merge_label_id}); + add_instruction(SpvOpLabel, {else_label_id}); else_case.accept(this); uint32_t else_id = id; @@ -967,9 +964,9 @@ CodeGen_Vulkan_Dev::SPIRVEmitter::emit_if_then_else(Expr condition, // Every basic block must end with a branch instruction add_instruction(SpvOpBranch, {merge_label_id}); - add_instruction(SpvOpLabel, { merge_label_id }); + add_instruction(SpvOpLabel, {merge_label_id}); - return {{ then_id, then_label_id, else_id, else_label_id }}; + return {{then_id, then_label_id, else_id, else_label_id}}; } void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const IfThenElse *op) { @@ -1010,34 +1007,28 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Acquire *) { } // TODO: fast math decorations. 
-void CodeGen_Vulkan_Dev::SPIRVEmitter::visit_binop(Type t, Expr a, Expr b, uint32_t opcode) { +void CodeGen_Vulkan_Dev::SPIRVEmitter::visit_binop(Type t, const Expr &a, const Expr &b, uint32_t opcode) { uint32_t type_id = map_type(t); a.accept(this); uint32_t a_id = id; b.accept(this); uint32_t b_id = id; id = next_id++; - add_instruction(opcode, { type_id, id, a_id, b_id }); + add_instruction(opcode, {type_id, id, a_id, b_id}); } - void CodeGen_Vulkan_Dev::SPIRVEmitter::add_allocation(uint32_t result_type_id, uint32_t result_id, uint32_t storage_class, uint32_t initializer) { if (initializer) { - add_instruction(spir_v_kernel_allocations, SpvOpVariable, {result_type_id, - result_id, - storage_class, - initializer}); + add_instruction(spir_v_kernel_allocations, SpvOpVariable, {result_type_id, result_id, storage_class, initializer}); } else { - add_instruction(spir_v_kernel_allocations, SpvOpVariable, {result_type_id, - result_id, - storage_class}); + add_instruction(spir_v_kernel_allocations, SpvOpVariable, {result_type_id, result_id, storage_class}); } } -void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(Stmt s, +void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(const Stmt &s, const std::string &name, const std::vector &args) { debug(2) << "Adding Vulkan kernel " << name << "\n"; @@ -1070,13 +1061,11 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(Stmt s, entry_point_interface.push_back(SpvExecutionModelGLCompute); entry_point_interface.push_back(current_function_id); // Add the string name of the function - encode_string(entry_point_interface, (name.size() + 1 + 3)/4, name.size(), name.c_str()); - - + encode_string(entry_point_interface, (name.size() + 1 + 3) / 4, name.size(), name.c_str()); // TODO: only add the SIMT intrinsics used auto intrinsics = {"WorkgroupId", "LocalInvocationId"}; - for (auto intrinsic: intrinsics) { + for (const std::string &intrinsic : intrinsics) { uint32_t intrinsic_id = next_id++; uint32_t intrinsic_loaded_id = next_id++; // The builtins are pointers to vec3 @@ -1094,7 +1083,6 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(Stmt s, entry_point_interface.push_back(intrinsic_id); } - // Add the entry point and exection mode add_instruction(spir_v_entrypoints, SpvOpEntryPoint, entry_point_interface); @@ -1108,17 +1096,14 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(Stmt s, uint32_t param_pack_ptr_type_id = next_id++; uint32_t param_pack_id = next_id++; scalar_types.push_back(param_pack_type_id); - for (size_t i = 0; i < args.size(); i++) { - if (!args[i].is_buffer) { + for (const DeviceArgument &arg : args) { + if (!arg.is_buffer) { // record the type for later constructing the params struct type - scalar_types.push_back(map_type(args[i].type)); + scalar_types.push_back(map_type(arg.type)); // Add a decoration describing the offset - add_instruction(spir_v_annotations, SpvOpMemberDecorate, {param_pack_type_id, - (uint32_t)(scalar_types.size()-2), - SpvDecorationOffset, - offset}); - offset += args[i].type.bytes(); + add_instruction(spir_v_annotations, SpvOpMemberDecorate, {param_pack_type_id, (uint32_t)(scalar_types.size() - 2), SpvDecorationOffset, offset}); + offset += arg.type.bytes(); } } @@ -1134,27 +1119,22 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(Stmt s, // Add a variable for the parameter pack add_instruction(spir_v_types, SpvOpVariable, {param_pack_ptr_type_id, param_pack_id, SpvStorageClassUniform}); - uint32_t binding_counter = 1; uint32_t scalar_index = 0; - for (size_t i = 0; i < args.size(); i++) { + for (const 
DeviceArgument &arg : args) { uint32_t param_id = next_id++; - if (args[i].is_buffer) { - uint32_t element_type = map_type(args[i].type); + if (arg.is_buffer) { + uint32_t element_type = map_type(arg.type); uint32_t runtime_arr_type = next_id++; uint32_t struct_type = next_id++; uint32_t ptr_struct_type = next_id++; add_instruction(spir_v_types, SpvOpTypeRuntimeArray, {runtime_arr_type, element_type}); add_instruction(spir_v_types, SpvOpTypeStruct, {struct_type, runtime_arr_type}); - add_instruction(spir_v_types, SpvOpTypePointer, {ptr_struct_type, - SpvStorageClassUniform, - struct_type}); + add_instruction(spir_v_types, SpvOpTypePointer, {ptr_struct_type, SpvStorageClassUniform, struct_type}); // Annotate the struct to indicate it's passed in a GLSL-style buffer block add_instruction(spir_v_annotations, SpvOpDecorate, {struct_type, SpvDecorationBufferBlock}); // Annotate the array with its stride - add_instruction(spir_v_annotations, SpvOpDecorate, {runtime_arr_type, - SpvDecorationArrayStride, - (uint32_t)(args[i].type.bytes())}); + add_instruction(spir_v_annotations, SpvOpDecorate, {runtime_arr_type, SpvDecorationArrayStride, (uint32_t)(arg.type.bytes())}); // Annotate the offset for the array add_instruction(spir_v_annotations, SpvOpMemberDecorate, {struct_type, 0, SpvDecorationOffset, (uint32_t)0}); @@ -1165,17 +1145,16 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(Stmt s, add_instruction(spir_v_types, SpvOpVariable, {ptr_struct_type, param_id, SpvStorageClassUniform}); } else { uint32_t access_chain_id = next_id++; - add_instruction(SpvOpInBoundsAccessChain, {map_pointer_type(args[i].type, SpvStorageClassUniform), - access_chain_id, + add_instruction(SpvOpInBoundsAccessChain, {map_pointer_type(arg.type, SpvStorageClassUniform), + access_chain_id, param_pack_id, emit_constant(UInt(32), &scalar_index)}); scalar_index++; - add_instruction(SpvOpLoad, {map_type(args[i].type), param_id, access_chain_id}); + add_instruction(SpvOpLoad, {map_type(arg.type), param_id, access_chain_id}); } - symbol_table.push(args[i].name, {param_id, SpvStorageClassUniform}); + symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); } - s.accept(this); // Insert return and function end delimiter @@ -1196,7 +1175,7 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(Stmt s, workgroup_size[0], workgroup_size[1], workgroup_size[2]}); // Pop scope - for (auto arg: args) { + for (const DeviceArgument &arg : args) { symbol_table.pop(arg.name); } @@ -1216,7 +1195,7 @@ void add_extension(const std::string &extension_name, std::vector &sec const size_t data_size = extension_name.size(); encode_string(section, extra_words, data_size, data_temp); } -} +} // namespace void CodeGen_Vulkan_Dev::init_module() { debug(2) << "Vulkan device codegen init_module\n"; @@ -1224,19 +1203,19 @@ void CodeGen_Vulkan_Dev::init_module() { emitter.spir_v_header.push_back(SpvMagicNumber); emitter.spir_v_header.push_back(SpvVersion); emitter.spir_v_header.push_back(SpvSourceLanguageUnknown); - emitter.spir_v_header.push_back(0); // Bound placeholder - emitter.spir_v_header.push_back(0); // Reserved for schema. + emitter.spir_v_header.push_back(0); // Bound placeholder + emitter.spir_v_header.push_back(0); // Reserved for schema. 
// the unique void type - emitter.next_id++; // 0 is not a valid id + emitter.next_id++; // 0 is not a valid id emitter.void_id = emitter.next_id++; emitter.add_instruction(emitter.spir_v_types, SpvOpTypeVoid, {emitter.void_id}); // Capabilities // TODO: only add those required by the generated code emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityShader}); - //emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityInt8}); - //emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityUniformAndStorageBuffer8BitAccess}); + // emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityInt8}); + // emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityUniformAndStorageBuffer8BitAccess}); // Extensions // TODO: only add those required by the generated code @@ -1248,7 +1227,6 @@ void CodeGen_Vulkan_Dev::init_module() { emitter.add_instruction(emitter.spir_v_header, SpvOpMemoryModel, {SpvAddressingModelLogical, SpvMemoryModelGLSL450}); - // OpCapability instructions // Enumerate type maps and add subwidth integer types if used // OpExtensions instructions @@ -1270,7 +1248,7 @@ void CodeGen_Vulkan_Dev::add_kernel(Stmt stmt, const std::vector &args) { current_kernel_name = name; emitter.add_kernel(stmt, name, args); - //dump(); + // dump(); } std::vector CodeGen_Vulkan_Dev::compile_to_src() { @@ -1289,7 +1267,7 @@ std::vector CodeGen_Vulkan_Dev::compile_to_src() { final_module.insert(final_module.end(), (const char *)emitter.spir_v_kernels.data(), (const char *)(emitter.spir_v_kernels.data() + emitter.spir_v_kernels.size())); assert(final_module.size() == total_size); std::ofstream f("/home/skamil/out.spv", std::ios::out | std::ios::binary); - f.write((char*)(final_module.data()), final_module.size()); + f.write((char *)(final_module.data()), final_module.size()); f.close(); return final_module; @@ -1310,11 +1288,11 @@ void CodeGen_Vulkan_Dev::dump() { // For now: dump to file so source can be consumed by validator auto module = compile_to_src(); std::ofstream f("out.spv", std::ios::out | std::ios::binary); - f.write((char*)(module.data()), module.size()); + f.write((char *)(module.data()), module.size()); f.close(); } -} // namespace +} // namespace std::unique_ptr new_CodeGen_Vulkan_Dev(const Target &target) { return std::make_unique(target); diff --git a/src/runtime/HalideRuntimeVulkan.h b/src/runtime/HalideRuntimeVulkan.h index ac649194f1f0..88b8d3a8d714 100644 --- a/src/runtime/HalideRuntimeVulkan.h +++ b/src/runtime/HalideRuntimeVulkan.h @@ -57,16 +57,16 @@ struct halide_vulkan_memory_allocator; extern int halide_vulkan_acquire_context(void *user_context, struct halide_vulkan_memory_allocator **allocator, struct VkInstance_T **instance, - struct VkDevice_T **device, + struct VkDevice_T **device, struct VkPhysicalDevice_T **physical_device, uint64_t *command_pool, struct VkQueue_T **queue, uint32_t *queue_family_index, bool create = true); -extern int halide_vulkan_release_context(void *user_context, - struct VkInstance_T *instance, - struct VkDevice_T *device, +extern int halide_vulkan_release_context(void *user_context, + struct VkInstance_T *instance, + struct VkDevice_T *device, struct VkQueue_T *queue); // -- diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index bd8e935d771f..16e9c8c5e42f 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -3,11 +3,11 @@ #include "device_buffer_utils.h" #include "device_interface.h" #include "runtime_internal.h" 
-#include "vulkan_resources.h" #include "vulkan_context.h" #include "vulkan_extensions.h" #include "vulkan_internal.h" #include "vulkan_memory.h" +#include "vulkan_resources.h" using namespace Halide::Runtime::Internal::Vulkan; @@ -32,7 +32,7 @@ extern "C" { WEAK int halide_vulkan_acquire_context(void *user_context, halide_vulkan_memory_allocator **allocator, VkInstance *instance, - VkDevice *device, + VkDevice *device, VkPhysicalDevice *physical_device, VkCommandPool *command_pool, VkQueue *queue, @@ -408,8 +408,8 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid //// 15. Reclaim the staging buffer ctx.allocator->reclaim(user_context, staging_region); - - #ifdef DEBUG_RUNTIME + +#ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif @@ -717,7 +717,7 @@ WEAK int halide_vulkan_run(void *user_context, return result; } - //// 10. Cleanup + //// 10. Cleanup // Release all temporary objects for this run vk_destroy_scalar_uniform_buffer(user_context, ctx.allocator, scalar_args_region); vk_destroy_descriptor_set_layout(user_context, ctx.allocator, descriptor_set_layout); diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 84ff5c6a5dd6..bfa68200b098 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -81,7 +81,7 @@ class VulkanContext { // -------------------------------------------------------------------------- namespace { - + // Initializes the instance (used by the default vk_create_context) int vk_create_instance(void *user_context, const StringTable &requested_layers, VkInstance *instance, const VkAllocationCallbacks *alloc_callbacks) { debug(user_context) << " vk_create_instance (user_context: " << user_context << ")\n"; @@ -128,9 +128,9 @@ int vk_create_instance(void *user_context, const StringTable &requested_layers, } int vk_select_device_for_context(void *user_context, - VkInstance *instance, VkDevice *device, - VkPhysicalDevice *physical_device, - uint32_t *queue_family_index) { + VkInstance *instance, VkDevice *device, + VkPhysicalDevice *physical_device, + uint32_t *queue_family_index) { // For now handle more than 16 devices by just looking at the first 16. 
VkPhysicalDevice chosen_device = nullptr; @@ -207,7 +207,7 @@ int vk_select_device_for_context(void *user_context, } int vk_create_device(void *user_context, const StringTable &requested_layers, VkInstance *instance, VkDevice *device, VkQueue *queue, - VkPhysicalDevice *physical_device, uint32_t *queue_family_index, const VkAllocationCallbacks *alloc_callbacks) { + VkPhysicalDevice *physical_device, uint32_t *queue_family_index, const VkAllocationCallbacks *alloc_callbacks) { StringTable required_device_extensions; vk_get_required_device_extensions(user_context, required_device_extensions); @@ -268,8 +268,8 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk // Initializes the context (used by the default implementation of halide_acquire_context) int vk_create_context(void *user_context, VulkanMemoryAllocator **allocator, - VkInstance *instance, VkDevice *device, VkPhysicalDevice *physical_device, - VkCommandPool *command_pool, VkQueue *queue, uint32_t *queue_family_index) { + VkInstance *instance, VkDevice *device, VkPhysicalDevice *physical_device, + VkCommandPool *command_pool, VkQueue *queue, uint32_t *queue_family_index) { debug(user_context) << " vk_create_context (user_context: " << user_context << ")\n"; @@ -320,10 +320,10 @@ int vk_create_context(void *user_context, VulkanMemoryAllocator **allocator, // -------------------------------------------------------------------------- -} // namespace: (anonymous) -} // namespace: Vulkan -} // namespace: Internal -} // namespace: Runtime -} // namespace: Halide +} // namespace +} // namespace Vulkan +} // namespace Internal +} // namespace Runtime +} // namespace Halide #endif /// HALIDE_RUNTIME_VULKAN_CONTEXT_H diff --git a/src/runtime/vulkan_extensions.h b/src/runtime/vulkan_extensions.h index 26796b61c116..6541fb787f12 100644 --- a/src/runtime/vulkan_extensions.h +++ b/src/runtime/vulkan_extensions.h @@ -218,8 +218,8 @@ uint32_t vk_get_supported_device_extensions(void *user_context, VkPhysicalDevice } bool vk_validate_required_extension_support(void *user_context, - const StringTable &required_extensions, - const StringTable &supported_extensions) { + const StringTable &required_extensions, + const StringTable &supported_extensions) { bool validated = true; for (uint32_t n = 0; n < required_extensions.size(); ++n) { const char *extension = required_extensions[n]; @@ -233,11 +233,11 @@ bool vk_validate_required_extension_support(void *user_context, // -------------------------------------------------------------------------- -} // namespace: (anonymous) -} // namespace: Vulkan -} // namespace: Internal -} // namespace: Runtime -} // namespace: Halide +} // namespace +} // namespace Vulkan +} // namespace Internal +} // namespace Runtime +} // namespace Halide // -------------------------------------------------------------------------- diff --git a/src/runtime/vulkan_functions.h b/src/runtime/vulkan_functions.h index a790d7453c24..8d2e996145ab 100644 --- a/src/runtime/vulkan_functions.h +++ b/src/runtime/vulkan_functions.h @@ -1,4 +1,4 @@ -//VULKAN_FN(vkCreateInstance) +// VULKAN_FN(vkCreateInstance) VULKAN_FN(vkDestroyInstance) VULKAN_FN(vkCreateDevice) VULKAN_FN(vkDestroyDevice) diff --git a/src/runtime/vulkan_interface.h b/src/runtime/vulkan_interface.h index 12be41fc4711..676c8548f6fc 100644 --- a/src/runtime/vulkan_interface.h +++ b/src/runtime/vulkan_interface.h @@ -56,12 +56,11 @@ void WEAK vk_load_vulkan_functions(VkInstance instance) { // -- - // 
-------------------------------------------------------------------------- -} // namespace: Vulkan -} // namespace: Internal -} // namespace: Runtime -} // namespace: Halide +} // namespace Vulkan +} // namespace Internal +} // namespace Runtime +} // namespace Halide #endif // HALIDE_RUNTIME_VULKAN_INTERFACE_H diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index e21d877954f2..d472bf26eb9a 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -26,7 +26,7 @@ class VulkanMemoryAllocator; // -------------------------------------------------------------------------- -namespace { // internalize +namespace { // internalize // -------------------------------------------------------------------------- // Memory @@ -35,7 +35,7 @@ void *vk_host_malloc(void *user_context, size_t size, size_t alignment, VkSystem void vk_host_free(void *user_context, void *ptr, const VkAllocationCallbacks *callbacks = nullptr); VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, VkDevice device, VkPhysicalDevice physical_device, - const VkAllocationCallbacks *alloc_callbacks); + const VkAllocationCallbacks *alloc_callbacks); int vk_destroy_memory_allocator(void *user_context, VulkanMemoryAllocator *allocator); @@ -46,7 +46,7 @@ int vk_create_context( void *user_context, VulkanMemoryAllocator **allocator, VkInstance *instance, - VkDevice *device, + VkDevice *device, VkPhysicalDevice *physical_device, VkCommandPool *command_pool, VkQueue *queue, uint32_t *queue_family_index); @@ -54,12 +54,12 @@ int vk_create_context( int vk_create_instance(void *user_context, const StringTable &requested_layers, VkInstance *instance, const VkAllocationCallbacks *alloc_callbacks); int vk_select_device_for_context(void *user_context, - VkInstance *instance, VkDevice *device, - VkPhysicalDevice *physical_device, - uint32_t *queue_family_index); + VkInstance *instance, VkDevice *device, + VkPhysicalDevice *physical_device, + uint32_t *queue_family_index); int vk_create_device(void *user_context, const StringTable &requested_layers, VkInstance *instance, VkDevice *device, VkQueue *queue, - VkPhysicalDevice *physical_device, uint32_t *queue_family_index, const VkAllocationCallbacks *alloc_callbacks); + VkPhysicalDevice *physical_device, uint32_t *queue_family_index, const VkAllocationCallbacks *alloc_callbacks); // -------------------------------------------------------------------------- // Extensions @@ -71,51 +71,51 @@ uint32_t vk_get_required_device_extensions(void *user_context, StringTable &ext_ uint32_t vk_get_optional_device_extensions(void *user_context, StringTable &ext_table); uint32_t vk_get_supported_device_extensions(void *user_context, VkPhysicalDevice physical_device, StringTable &ext_table); bool vk_validate_required_extension_support(void *user_context, - const StringTable &required_extensions, - const StringTable &supported_extensions); + const StringTable &required_extensions, + const StringTable &supported_extensions); // -------------------------------------------------------------------------- // Resources // -------------------------------------------------------------------------- // -- Command Pool -VkResult vk_create_command_pool(void* user_context, VulkanMemoryAllocator* allocator, uint32_t queue_index, VkCommandPool *command_pool); -VkResult vk_destroy_command_pool(void* user_context, VulkanMemoryAllocator* allocator, VkCommandPool command_pool); +VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t 
queue_index, VkCommandPool *command_pool); +VkResult vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool); // -- Command Buffer -VkResult vk_create_command_buffer(void* user_context, VulkanMemoryAllocator* allocator, VkCommandPool pool, VkCommandBuffer *command_buffer); +VkResult vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool pool, VkCommandBuffer *command_buffer); VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, - VkDevice device, - VkCommandBuffer command_buffer, - VkPipeline compute_pipeline, - VkPipelineLayout pipeline_layout, - VkDescriptorSet descriptor_set, - int blocksX, int blocksY, int blocksZ); + VkDevice device, + VkCommandBuffer command_buffer, + VkPipeline compute_pipeline, + VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, + int blocksX, int blocksY, int blocksZ); VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer); // -- Scalar Uniform Buffer size_t vk_estimate_scalar_uniform_buffer_size(void *user_context, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[]); + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, - VulkanMemoryAllocator *allocator, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[]); + VulkanMemoryAllocator *allocator, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, - MemoryRegion *scalar_args_region); + MemoryRegion *scalar_args_region); // -- Descriptor Pool VkResult vk_create_descriptor_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t storage_buffer_count, VkDescriptorPool *descriptor_pool); -VkResult vk_destroy_descriptor_pool(void* user_context, +VkResult vk_destroy_descriptor_pool(void *user_context, VulkanMemoryAllocator *allocator, VkDescriptorPool descriptor_pool); @@ -126,13 +126,13 @@ uint32_t vk_count_bindings_for_descriptor_set(void *user_context, int8_t arg_is_buffer[]); VkResult vk_create_descriptor_set_layout(void *user_context, - VkDevice device, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[], - VkDescriptorSetLayout *layout); + VkDevice device, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[], + VkDescriptorSetLayout *layout); -VkResult vk_destroy_descriptor_set_layout(void* user_context, +VkResult vk_destroy_descriptor_set_layout(void *user_context, VulkanMemoryAllocator *allocator, VkDescriptorSetLayout descriptor_set_layout); @@ -158,7 +158,7 @@ VkResult vk_create_pipeline_layout(void *user_context, VkDescriptorSetLayout *descriptor_set_layout, VkPipelineLayout *pipeline_layout); -VkResult vk_destroy_pipeline_layout(void* user_context, +VkResult vk_destroy_pipeline_layout(void *user_context, VulkanMemoryAllocator *allocator, VkPipelineLayout pipeline_layout); // -- Compute Pipeline @@ -169,7 +169,7 @@ VkResult vk_create_compute_pipeline(void *user_context, VkPipelineLayout pipeline_layout, VkPipeline *compute_pipeline); -VkResult vk_destroy_compute_pipeline(void* user_context, +VkResult vk_destroy_compute_pipeline(void *user_context, VulkanMemoryAllocator *allocator, VkPipeline compute_pipeline); @@ -181,7 +181,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat // -- Copy Buffer int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buffer, - 
const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d); + const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d); // -------------------------------------------------------------------------- // Errors @@ -251,10 +251,10 @@ const char *vk_get_error_name(VkResult error) { // -------------------------------------------------------------------------- -} // namespace: (anonymous) -} // namespace: Vulkan -} // namespace: Internal -} // namespace: Runtime -} // namespace: Halide +} // namespace +} // namespace Vulkan +} // namespace Internal +} // namespace Runtime +} // namespace Halide #endif // HALIDE_RUNTIME_VULKAN_INTERNAL_H diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index c63492e45508..d911389e0715 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -27,12 +27,12 @@ WEAK VulkanMemoryConfig memory_allocator_config; // -------------------------------------------------------------------------- -/** Vulkan Memory Allocator class interface for managing large - * memory requests stored as contiguous blocks of memory, which - * are then sub-allocated into smaller regions of +/** Vulkan Memory Allocator class interface for managing large + * memory requests stored as contiguous blocks of memory, which + * are then sub-allocated into smaller regions of * memory to avoid the excessive cost of vkAllocate and the limited - * number of available allocation calls through the API. -*/ + * number of available allocation calls through the API. + */ class VulkanMemoryAllocator { public: // disable copy constructors and assignment @@ -102,7 +102,6 @@ class VulkanMemoryAllocator { MemoryProperties properties, uint32_t required_flags) const; - size_t block_byte_count = 0; size_t block_count = 0; size_t region_byte_count = 0; @@ -164,11 +163,11 @@ void VulkanMemoryAllocator::initialize(void *user_context, MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" - << "user_context=" << user_context << " " - << "block_allocator=" << (void *)(block_allocator) << " " - << "request_size=" << (uint32_t)(request.size) << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << ") ...\n"; + << "user_context=" << user_context << " " + << "block_allocator=" << (void *)(block_allocator) << " " + << "request_size=" << (uint32_t)(request.size) << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << ") ...\n"; halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -178,10 +177,10 @@ MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest & void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { debug(nullptr) << "VulkanMemoryAllocator: Mapping region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << ") ...\n"; + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << ") ...\n"; halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -217,10 +216,10 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion 
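// Illustrative sketch (not part of this patch): the allocator interface documented above
// sub-allocates small regions out of one large contiguous device allocation, because
// vkAllocateMemory is costly and implementations cap the number of live allocations.
// Toy names only -- this is not Halide's BlockAllocator API. Alignment is assumed to be
// a nonzero power of two.
#include <vulkan/vulkan.h>

struct ToyBlock {
    VkDeviceMemory memory = VK_NULL_HANDLE;  // one large vkAllocateMemory result
    VkDeviceSize capacity = 0;
    VkDeviceSize used = 0;
};

// Returns an offset inside the block, or VK_WHOLE_SIZE if the block is exhausted
// (in which case the caller would allocate a fresh block).
VkDeviceSize toy_reserve(ToyBlock &block, VkDeviceSize size, VkDeviceSize alignment) {
    VkDeviceSize offset = (block.used + alignment - 1) & ~(alignment - 1);
    if (offset + size > block.capacity) {
        return VK_WHOLE_SIZE;
    }
    block.used = offset + size;
    return offset;  // the caller binds a VkBuffer at this offset via vkBindBufferMemory
}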
*region) { void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { debug(nullptr) << "VulkanMemoryAllocator: Unmapping region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << ") ...\n"; + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << ") ...\n"; halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -248,8 +247,8 @@ void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { debug(nullptr) << "VulkanMemoryAllocator: Reclaiming region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -259,7 +258,7 @@ void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { bool VulkanMemoryAllocator::collect(void *user_context) { debug(nullptr) << "VulkanMemoryAllocator: Collecting unused memory (" - << "user_context=" << user_context << ") ... \n"; + << "user_context=" << user_context << ") ... \n"; halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -269,7 +268,7 @@ bool VulkanMemoryAllocator::collect(void *user_context) { void VulkanMemoryAllocator::release(void *user_context) { debug(nullptr) << "VulkanMemoryAllocator: Releasing (" - << "user_context=" << user_context << ") ... \n"; + << "user_context=" << user_context << ") ... \n"; halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -279,7 +278,7 @@ void VulkanMemoryAllocator::release(void *user_context) { void VulkanMemoryAllocator::destroy(void *user_context) { debug(nullptr) << "VulkanMemoryAllocator: Destroying allocator (" - << "user_context=" << user_context << ") ... \n"; + << "user_context=" << user_context << ") ... \n"; block_allocator->destroy(this); } @@ -294,8 +293,8 @@ VulkanMemoryAllocator::default_config() { void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *block) { debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" - << "user_context=" << user_context << " " - << "block=" << (void *)(block) << ") ... \n"; + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << ") ... \n"; VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); @@ -304,11 +303,11 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc halide_abort_if_false(user_context, block != nullptr); debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" - << "size=" << (uint32_t)block->size << ", " - << "dedicated=" << (block->dedicated ? 
"true" : "false") << " " - << "usage=" << halide_memory_usage_name(block->properties.usage) << " " - << "caching=" << halide_memory_caching_name(block->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; + << "size=" << (uint32_t)block->size << ", " + << "dedicated=" << (block->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(block->properties.usage) << " " + << "caching=" << halide_memory_caching_name(block->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; // Find an appropriate memory type given the flags uint32_t memory_type = instance->select_memory_type(user_context, instance->physical_device, block->properties, 0); @@ -344,8 +343,8 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *block) { debug(nullptr) << "VulkanMemoryAllocator: Deallocating block (" - << "user_context=" << user_context << " " - << "block=" << (void *)(block) << ") ... \n"; + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << ") ... \n"; VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); @@ -354,11 +353,11 @@ void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *bl halide_abort_if_false(user_context, block != nullptr); debug(nullptr) << "VulkanBlockAllocator: deallocating block (" - << "size=" << (uint32_t)block->size << ", " - << "dedicated=" << (block->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(block->properties.usage) << " " - << "caching=" << halide_memory_caching_name(block->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; + << "size=" << (uint32_t)block->size << ", " + << "dedicated=" << (block->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(block->properties.usage) << " " + << "caching=" << halide_memory_caching_name(block->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; if (block->handle == nullptr) { debug(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid handle!\n"; @@ -412,7 +411,7 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, case MemoryVisibility::InvalidVisibility: default: debug(nullptr) << "VulkanMemoryAllocator: Unable to convert type! Invalid memory visibility request!\n\t" - << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; return invalid_memory_type; }; @@ -438,7 +437,7 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, case MemoryCaching::InvalidCaching: default: debug(nullptr) << "VulkanMemoryAllocator: Unable to convert type! 
Invalid memory caching request!\n\t" - << "caching=" << halide_memory_caching_name(properties.caching) << "\n"; + << "caching=" << halide_memory_caching_name(properties.caching) << "\n"; return invalid_memory_type; }; @@ -474,9 +473,9 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, if (result == invalid_memory_type) { debug(nullptr) << "VulkanBlockAllocator: Failed to find appropriate memory type for given properties:\n\t" - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; return invalid_memory_type; } @@ -487,8 +486,8 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *region) { debug(nullptr) << "VulkanMemoryAllocator: Allocating region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); @@ -497,12 +496,12 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re halide_abort_if_false(user_context, region != nullptr); debug(nullptr) << "VulkanRegionAllocator: Allocating region (" - << "size=" << (uint32_t)region->size << ", " - << "offset=" << (uint32_t)region->offset << ", " - << "dedicated=" << (region->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(region->properties.usage) << " " - << "caching=" << halide_memory_caching_name(region->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; + << "size=" << (uint32_t)region->size << ", " + << "offset=" << (uint32_t)region->offset << ", " + << "dedicated=" << (region->dedicated ? 
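// Illustrative sketch (not part of this patch): allocate_region pairs a VkBuffer with a
// sub-range of already-allocated device memory. A minimal create/query/bind sequence for
// a storage buffer; `device`, `memory` and `offset` are assumed to come from the allocator,
// and a production allocator would validate the queried requirements against the region's
// size and alignment.
#include <vulkan/vulkan.h>

VkBuffer toy_create_storage_buffer(VkDevice device, VkDeviceMemory memory,
                                   VkDeviceSize offset, VkDeviceSize size) {
    VkBufferCreateInfo create_info = {};
    create_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    create_info.size = size;
    create_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                        VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
                        VK_BUFFER_USAGE_TRANSFER_DST_BIT;
    create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;

    VkBuffer buffer = VK_NULL_HANDLE;
    if (vkCreateBuffer(device, &create_info, nullptr, &buffer) != VK_SUCCESS) {
        return VK_NULL_HANDLE;
    }

    VkMemoryRequirements requirements;
    vkGetBufferMemoryRequirements(device, buffer, &requirements);

    // Bind the buffer to its slice of the pre-allocated block.
    if (vkBindBufferMemory(device, buffer, memory, offset) != VK_SUCCESS) {
        vkDestroyBuffer(device, buffer, nullptr);
        return VK_NULL_HANDLE;
    }
    return buffer;
}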
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(region->properties.usage) << " " + << "caching=" << halide_memory_caching_name(region->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; uint32_t usage_flags = instance->select_memory_usage(user_context, region->properties); @@ -524,7 +523,7 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); if (result != VK_SUCCESS) { error(nullptr) << "VulkanRegionAllocator: Failed to create buffer!\n\t" - << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; return; } @@ -545,7 +544,7 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re result = vkBindBufferMemory(instance->device, *buffer, *device_memory, region->offset); if (result != VK_SUCCESS) { error(nullptr) << "VulkanRegionAllocator: Failed to bind buffer!\n\t" - << "vkBindBufferMemory returned: " << vk_get_error_name(result) << "\n"; + << "vkBindBufferMemory returned: " << vk_get_error_name(result) << "\n"; return; } @@ -556,8 +555,8 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion *region) { debug(nullptr) << "VulkanMemoryAllocator: Deallocating region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); @@ -565,12 +564,12 @@ void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion * halide_abort_if_false(user_context, instance->physical_device != nullptr); halide_abort_if_false(user_context, region != nullptr); debug(nullptr) << "VulkanRegionAllocator: Deallocating region (" - << "size=" << (uint32_t)region->size << ", " - << "offset=" << (uint32_t)region->offset << ", " - << "dedicated=" << (region->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(region->properties.usage) << " " - << "caching=" << halide_memory_caching_name(region->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; + << "size=" << (uint32_t)region->size << ", " + << "offset=" << (uint32_t)region->offset << ", " + << "dedicated=" << (region->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(region->properties.usage) << " " + << "caching=" << halide_memory_caching_name(region->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; if (region->handle == nullptr) { debug(nullptr) << "VulkanRegionAllocator: Unable to deallocate region! 
Invalid handle!\n"; @@ -670,9 +669,9 @@ void vk_host_free(void *user_context, void *ptr, const VkAllocationCallbacks *ca } VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, - VkDevice device, - VkPhysicalDevice physical_device, - const VkAllocationCallbacks *alloc_callbacks) { + VkDevice device, + VkPhysicalDevice physical_device, + const VkAllocationCallbacks *alloc_callbacks) { SystemMemoryAllocatorFns system_allocator = {vk_system_malloc, vk_system_free}; @@ -691,11 +690,11 @@ int vk_destroy_memory_allocator(void *user_context, VulkanMemoryAllocator *alloc // -------------------------------------------------------------------------- -} // namespace: (anonymous) -} // namespace: Vulkan -} // namespace: Internal -} // namespace: Runtime -} // namespace: Halide +} // namespace +} // namespace Vulkan +} // namespace Internal +} // namespace Runtime +} // namespace Halide // -------------------------------------------------------------------------- diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 3051d1c4ed89..91284b9d307f 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -20,14 +20,14 @@ namespace { // internalize // -------------------------------------------------------------------------- -VkResult vk_create_command_pool(void* user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool) { +VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool) { #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: vk_create_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "queue_index: " << queue_index << ")\n"; -#endif +#endif VkCommandPoolCreateInfo command_pool_info = { @@ -39,21 +39,21 @@ VkResult vk_create_command_pool(void* user_context, VulkanMemoryAllocator *alloc return vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); } -VkResult vk_destroy_command_pool(void* user_context, VulkanMemoryAllocator* allocator, VkCommandPool command_pool) { +VkResult vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool) { #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: vk_destroy_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "command_pool: " << (void *)command_pool << ")\n"; -#endif +#endif vkDestroyCommandPool(allocator->current_device(), command_pool, allocator->callbacks()); - return VK_SUCCESS; + return VK_SUCCESS; } // -- -VkResult vk_create_command_buffer(void* user_context, VulkanMemoryAllocator* allocator, VkCommandPool command_pool, VkCommandBuffer *command_buffer) { +VkResult vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer *command_buffer) { #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: vk_create_command_buffer (user_context: " << user_context << ", " @@ -74,12 +74,12 @@ VkResult vk_create_command_buffer(void* user_context, VulkanMemoryAllocator* all } VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, - VkDevice device, - VkCommandBuffer command_buffer, - VkPipeline compute_pipeline, - VkPipelineLayout pipeline_layout, - VkDescriptorSet descriptor_set, - int 
blocksX, int blocksY, int blocksZ) { + VkDevice device, + VkCommandBuffer command_buffer, + VkPipeline compute_pipeline, + VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, + int blocksX, int blocksY, int blocksZ) { #ifdef DEBUG_RUNTIME debug(user_context) @@ -150,9 +150,9 @@ VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBu // -- uint32_t vk_count_bindings_for_descriptor_set(void *user_context, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[]) { + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { int i = 0; uint32_t num_bindings = 1; // first binding is for passing scalar parameters in a buffer while (arg_sizes[i] > 0) { @@ -208,8 +208,7 @@ VkResult vk_create_descriptor_pool(void *user_context, return VK_SUCCESS; } - -VkResult vk_destroy_descriptor_pool(void* user_context, +VkResult vk_destroy_descriptor_pool(void *user_context, VulkanMemoryAllocator *allocator, VkDescriptorPool descriptor_pool) { #ifdef DEBUG_RUNTIME @@ -295,7 +294,7 @@ VkResult vk_create_descriptor_set_layout(void *user_context, return VK_SUCCESS; } -VkResult vk_destroy_descriptor_set_layout(void* user_context, +VkResult vk_destroy_descriptor_set_layout(void *user_context, VulkanMemoryAllocator *allocator, VkDescriptorSetLayout descriptor_set_layout) { @@ -435,9 +434,9 @@ VkResult vk_update_descriptor_set(void *user_context, // -- size_t vk_estimate_scalar_uniform_buffer_size(void *user_context, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[]) { + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { int i = 0; int scalar_uniform_buffer_size = 0; while (arg_sizes[i] > 0) { @@ -450,11 +449,10 @@ size_t vk_estimate_scalar_uniform_buffer_size(void *user_context, } MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, - VulkanMemoryAllocator *allocator, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[]) { - + VulkanMemoryAllocator *allocator, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, arg_sizes, args, arg_is_buffer); @@ -504,7 +502,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, } void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, - MemoryRegion *scalar_args_region) { + MemoryRegion *scalar_args_region) { #ifdef DEBUG_RUNTIME debug(user_context) @@ -552,7 +550,7 @@ VkResult vk_create_pipeline_layout(void *user_context, return VK_SUCCESS; } -VkResult vk_destroy_pipeline_layout(void* user_context, +VkResult vk_destroy_pipeline_layout(void *user_context, VulkanMemoryAllocator *allocator, VkPipelineLayout pipeline_layout) { @@ -594,7 +592,7 @@ VkResult vk_create_compute_pipeline(void *user_context, // VkPipelineShaderStageCreatInfo { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // structure type - nullptr, //pointer to a structure extending this + nullptr, // pointer to a structure extending this 0, // flags VK_SHADER_STAGE_COMPUTE_BIT, // compute stage shader shader_module, // shader module @@ -615,9 +613,9 @@ VkResult vk_create_compute_pipeline(void *user_context, return VK_SUCCESS; } -VkResult vk_destroy_compute_pipeline(void* user_context, - VulkanMemoryAllocator *allocator, - VkPipeline compute_pipeline) { +VkResult vk_destroy_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + VkPipeline compute_pipeline) { #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: vk_destroy_compute_pipeline (user_context: 
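// Illustrative sketch (not part of this patch): vk_fill_command_buffer_with_dispatch_call
// and vk_submit_command_buffer wrap the usual compute-dispatch recording pattern. A compact
// standalone version of that pattern (synchronous wait for simplicity; all handles assumed
// valid; the function name is illustrative).
#include <vulkan/vulkan.h>

VkResult toy_record_and_submit_dispatch(VkCommandBuffer cmd, VkQueue queue,
                                        VkPipeline pipeline, VkPipelineLayout layout,
                                        VkDescriptorSet descriptor_set,
                                        uint32_t blocks_x, uint32_t blocks_y, uint32_t blocks_z) {
    VkCommandBufferBeginInfo begin_info = {};
    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

    VkResult result = vkBeginCommandBuffer(cmd, &begin_info);
    if (result != VK_SUCCESS) {
        return result;
    }

    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layout,
                            0, 1, &descriptor_set, 0, nullptr);
    vkCmdDispatch(cmd, blocks_x, blocks_y, blocks_z);  // one workgroup per GPU block

    result = vkEndCommandBuffer(cmd);
    if (result != VK_SUCCESS) {
        return result;
    }

    VkSubmitInfo submit_info = {};
    submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submit_info.commandBufferCount = 1;
    submit_info.pCommandBuffers = &cmd;

    result = vkQueueSubmit(queue, 1, &submit_info, VK_NULL_HANDLE);
    if (result != VK_SUCCESS) {
        return result;
    }
    return vkQueueWaitIdle(queue);
}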
" << user_context << ", " @@ -632,13 +630,13 @@ VkResult vk_destroy_compute_pipeline(void* user_context, // -------------------------------------------------------------------------- VkShaderModule *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, - const char *src, int size) { + const char *src, int size) { #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: vk_compile_shader_module (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " - << "source: " << (void *)src << ", " + << "source: " << (void *)src << ", " << "size: " << size << ")\n"; uint64_t t_before = halide_current_time_ns(user_context); @@ -654,10 +652,10 @@ VkShaderModule *vk_compile_shader_module(void *user_context, VulkanMemoryAllocat VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; VkShaderModule *shader_module = (VkShaderModule *)vk_host_malloc(user_context, sizeof(VkShaderModule), 0, alloc_scope, allocator->callbacks()); - if(shader_module == nullptr) { + if (shader_module == nullptr) { error(user_context) << "Vulkan: Failed to allocatoe shader module! Out of memory!\n"; return nullptr; - } + } VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), shader_module); if ((result != VK_SUCCESS) || (shader_module == nullptr)) { @@ -716,13 +714,13 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat // -------------------------------------------------------------------------- int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buffer, - const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d) { + const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d) { if (d == 0) { VkBufferCopy buffer_copy = { - c.src_begin + src_offset, // srcOffset - dst_offset, // dstOffset - c.chunk_size // size + c.src_begin + src_offset, // srcOffset + dst_offset, // dstOffset + c.chunk_size // size }; VkBuffer *src_buffer = reinterpret_cast(c.src); @@ -731,7 +729,7 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; return -1; } - + vkCmdCopyBuffer(command_buffer, *src_buffer, *dst_buffer, 1, &buffer_copy); } else { @@ -752,10 +750,10 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff // -------------------------------------------------------------------------- -} // namespace: (anonymous) -} // namespace: Vulkan -} // namespace: Internal -} // namespace: Runtime -} // namespace: Halide +} // namespace +} // namespace Vulkan +} // namespace Internal +} // namespace Runtime +} // namespace Halide #endif // HALIDE_RUNTIME_VULKAN_RESOURCES_H From 4b3460c0606906c6e766b71b1e33c86db30ff5ff Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 12:45:44 -0700 Subject: [PATCH 011/166] Fix formatting for single line statements --- src/CodeGen_Vulkan_Dev.cpp | 4 +++- src/runtime/vulkan_extensions.h | 4 +++- src/runtime/vulkan_resources.h | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 3ebc058af8e0..34ca7938d3f6 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -778,7 +778,9 @@ int thread_loop_workgroup_index(const std::string &name) { ".__thread_id_y", ".__thread_id_z"}; for (size_t i = 0; i 
< sizeof(ids) / sizeof(std::string); i++) { - if (ends_with(name, ids[i])) { return i; } + if (ends_with(name, ids[i])) { + return i; + } } return -1; } diff --git a/src/runtime/vulkan_extensions.h b/src/runtime/vulkan_extensions.h index 6541fb787f12..343e66238ac5 100644 --- a/src/runtime/vulkan_extensions.h +++ b/src/runtime/vulkan_extensions.h @@ -45,7 +45,9 @@ void vk_set_layer_names_internal(const char *n) { const char *vk_get_layer_names_internal(void *user_context) { if (!layer_names_initialized) { const char *value = getenv("HL_VK_LAYERS"); - if (value == nullptr) { value = getenv("VK_INSTANCE_LAYERS"); } + if (value == nullptr) { + value = getenv("VK_INSTANCE_LAYERS"); + } vk_set_layer_names_internal(value); } return layer_names; diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 91284b9d307f..6fa8f9472d6a 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -512,7 +512,9 @@ void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator << "scalar_args_region: " << (void *)scalar_args_region << ")\n"; #endif - if (!scalar_args_region) { return; } + if (!scalar_args_region) { + return; + } allocator->reclaim(user_context, scalar_args_region); } From eb3f9c4a5f8690b28a6c7cb36687995b60b58a8a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 14:16:37 -0700 Subject: [PATCH 012/166] Move Vulkan option to top-level CMakeLists.txt and enable SPIR-V as needed --- CMakeLists.txt | 4 ++++ src/CMakeLists.txt | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b73ce26e19f5..a2600bdb306b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,10 @@ endif () # Enable the SPIR-V target if requested (must declare before processing dependencies) option(TARGET_SPIRV "Include SPIR-V target" OFF) +option(TARGET_VULKAN "Include Vulkan target" ON) +if (TARGET_VULKAN) + set(TARGET_SPIRV ON) # required +endif() ## # Import dependencies diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3804b335f1b7..e2c6a8bc80f7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -536,7 +536,6 @@ if (TARGET_OPENGLCOMPUTE) target_compile_definitions(Halide PRIVATE WITH_OPENGLCOMPUTE) endif () -cmake_dependent_option(TARGET_VULKAN "Include Vulkan target" ON TARGET_SPIRV OFF) if (TARGET_VULKAN) message(STATUS "Enabling Vulkan target") target_compile_definitions(Halide PRIVATE WITH_VULKAN) From ba8920208a27dcf4f6cc3e163682b48359b2856b Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 14:32:12 -0700 Subject: [PATCH 013/166] Fix Vulkan & SPIRV dependencies for makefile --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 4eec9ab282e7..f41c2d3a22b5 100644 --- a/Makefile +++ b/Makefile @@ -126,6 +126,7 @@ WITH_METAL ?= not-empty WITH_OPENGLCOMPUTE ?= not-empty WITH_D3D12 ?= not-empty WITH_VULKAN ?= not-empty +WITH_SPIRV ?= not-empty WITH_INTROSPECTION ?= not-empty WITH_EXCEPTIONS ?= WITH_LLVM_INSIDE_SHARED_LIBHALIDE ?= not-empty @@ -178,6 +179,9 @@ EXCEPTIONS_CXX_FLAGS=$(if $(WITH_EXCEPTIONS), -DHALIDE_WITH_EXCEPTIONS -fexcepti HEXAGON_CXX_FLAGS=$(if $(WITH_HEXAGON), -DWITH_HEXAGON, ) HEXAGON_LLVM_CONFIG_LIB=$(if $(WITH_HEXAGON), hexagon, ) +SPIRV_CXX_FLAGS=$(if $(WITH_SPIRV), -DWITH_SPIRV -isystem $(ROOT_DIR)/dependencies/spirv/include, ) +SPIRV_LLVM_CONFIG_LIB=$(if $(WITH_SPIRV), , ) + VULKAN_CXX_FLAGS=$(if $(WITH_VULKAN), -DWITH_VULKAN, ) VULKAN_LLVM_CONFIG_LIB=$(if $(WITH_VULKAN), , ) @@ -220,6 
+224,7 @@ CXX_FLAGS += $(INTROSPECTION_CXX_FLAGS) CXX_FLAGS += $(EXCEPTIONS_CXX_FLAGS) CXX_FLAGS += $(AMDGPU_CXX_FLAGS) CXX_FLAGS += $(RISCV_CXX_FLAGS) +CXX_FLAGS += $(SPIRV_CXX_FLAGS) CXX_FLAGS += $(VULKAN_CXX_FLAGS) CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) @@ -248,6 +253,7 @@ LLVM_STATIC_LIBFILES = \ $(POWERPC_LLVM_CONFIG_LIB) \ $(HEXAGON_LLVM_CONFIG_LIB) \ $(AMDGPU_LLVM_CONFIG_LIB) \ + $(SPIRV_LLVM_CONFIG_LIB) \ $(VULKAN_LLVM_CONFIG_LIB) \ $(WEBASSEMBLY_LLVM_CONFIG_LIB) \ $(RISCV_LLVM_CONFIG_LIB) From 010d3e8299855236bc88ec5dc3f6d86bee7769f5 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 15:00:50 -0700 Subject: [PATCH 014/166] Add Halide version info to Makefile Add HALIDE_VERSION compiler definitions to compilation --- Makefile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f41c2d3a22b5..e14b53cac341 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,12 @@ # For correctness and performance tests this include halide build time and run time. For # the tests in test/generator/ this times only the halide build time. +# Halide project version +HALIDE_VERSION_MAJOR ?= 15 +HALIDE_VERSION_MINOR ?= 0 +HALIDE_VERSION_PATCH ?= 0 +HALIDE_VERSION=$(HALIDE_VERSION_MAJOR).$(HALIDE_VERSION_MINOR).$(HALIDE_VERSION_PATCH) + # Disable built-in makefile rules for all apps to avoid pointless file-system # scanning and general weirdness resulting from implicit rules. MAKEFLAGS += --no-builtin-rules @@ -136,6 +142,8 @@ WITH_LLVM_INSIDE_SHARED_LIBHALIDE ?= not-empty HL_TARGET ?= host HL_JIT_TARGET ?= host +HL_VERSION_FLAGS=-DHALIDE_VERSION="$(HALIDE_VERSION)" -DHALIDE_VERSION_MAJOR=$(HALIDE_VERSION_MAJOR) -DHALIDE_VERSION_MINOR=$(HALIDE_VERSION_MINOR) -DHALIDE_VERSION_PATCH=$(HALIDE_VERSION_PATCH) + X86_CXX_FLAGS=$(if $(WITH_X86), -DWITH_X86, ) X86_LLVM_CONFIG_LIB=$(if $(WITH_X86), x86, ) @@ -207,7 +215,7 @@ LLVM_CXX_FLAGS_LIBCPP := $(findstring -stdlib=libc++, $(LLVM_CXX_FLAGS)) endif CXX_FLAGS = $(CXXFLAGS) $(CXX_WARNING_FLAGS) $(RTTI_CXX_FLAGS) -Woverloaded-virtual $(FPIC) $(OPTIMIZE) -fno-omit-frame-pointer -DCOMPILING_HALIDE - +CXX_FLAGS += $(HL_VERSION_FLAGS) CXX_FLAGS += $(LLVM_CXX_FLAGS) CXX_FLAGS += $(PTX_CXX_FLAGS) CXX_FLAGS += $(ARM_CXX_FLAGS) @@ -279,6 +287,7 @@ TEST_LD_FLAGS = -L$(BIN_DIR) -lHalide $(COMMON_LD_FLAGS) # In the tests, some of our expectations change depending on the llvm version TEST_CXX_FLAGS += -DLLVM_VERSION=$(LLVM_VERSION_TIMES_10) +TEST_CXX_FLAGS += $(HL_VERSION_FLAGS) # In the tests, default to exporting no symbols that aren't explicitly exported TEST_CXX_FLAGS += -fvisibility=hidden -fvisibility-inlines-hidden From e3d3313567833d0283c879a6a3f3346404d515ff Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 5 Aug 2022 15:18:46 -0700 Subject: [PATCH 015/166] Add HL_VERSION_FLAGS to RUNTIME_CXX_FLAGS --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index e14b53cac341..a385252acdf4 100644 --- a/Makefile +++ b/Makefile @@ -1056,6 +1056,7 @@ RUNTIME_TRIPLE_WIN_GENERIC_64 = "le64-unknown-windows-unknown" # # `-fno-rtti` is necessary to allow us to use classes with virtual functions in the runtime code RUNTIME_CXX_FLAGS = -std=c++17 -O3 -fno-vectorize -ffreestanding -fno-blocks -fno-exceptions -fno-unwind-tables -fno-threadsafe-statics -fno-rtti +RUNTIME_CXX_FLAGS += $(HL_VERSION_FLAGS) $(BUILD_DIR)/initmod.windows_%_x86_32.ll: $(SRC_DIR)/runtime/windows_%_x86.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) From edfd0766b909108eb0b54551e22181cd463616ac Mon Sep 17 00:00:00 2001 
From: Derek Gerstmann Date: Mon, 26 Sep 2022 15:36:18 -0700 Subject: [PATCH 016/166] Finish refactoring of Vulkan CodeGen to use SpirV-IR. Added splitmix64 based hashing scheme for types and constants. Numerous fixes to instruction packing. Added debug symbols to all variables. --- src/CodeGen_Vulkan_Dev.cpp | 1295 ++++++++++++++++-------------------- src/CodeGen_Vulkan_Dev.h | 3 +- src/SpirvIR.cpp | 785 +++++++++++++++++----- src/SpirvIR.h | 125 ++-- 4 files changed, 1309 insertions(+), 899 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 34ca7938d3f6..09f0e3f8a365 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1,6 +1,11 @@ #include #include +// XXX +#ifndef WITH_SPIRV +#define WITH_SPIRV +#endif + #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Vulkan_Dev.h" @@ -10,12 +15,13 @@ #include "IRPrinter.h" #include "Scope.h" #include "Target.h" - -#include +#include "SpirvIR.h" // Temporary: #include +#ifdef WITH_SPIRV + namespace Halide { namespace Internal { @@ -23,6 +29,8 @@ class CodeGen_LLVM; namespace { // anonymous +// -- + template ValueT lower_int_uint_div(CodeGenT *cg, Expr a, Expr b); @@ -58,10 +66,10 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { } protected: - class SPIRVEmitter : public IRVisitor { + class SPIRV_Emitter : public IRVisitor { public: - SPIRVEmitter() = default; + SPIRV_Emitter() = default; using IRVisitor::visit; @@ -109,314 +117,198 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const Fork *) override; void visit(const Acquire *) override; - void visit_binop(Type t, const Expr &a, const Expr &b, uint32_t opcode); - - // ID of last generated Expr. - uint32_t id; - // IDs are allocated in numerical order of use. - uint32_t next_id{0}; - - // The void type does not map to a Halide type, but must be unique - uint32_t void_id; - - // SPIR-V instructions in a module must be in a specific - // order. This order doesn't correspond to the order in which they - // are created. Hence we generate into a set of blocks, each of - // which is added to at its end. In compile_to_src, these are - // concatenated to form a complete SPIR-V module. We also - // represent the temporaries as vectors of uint32_t rather than - // char for ease of adding words to them. - std::vector spir_v_header; - std::vector spir_v_entrypoints; - std::vector spir_v_execution_modes; - std::vector spir_v_annotations; - std::vector spir_v_types; - std::vector spir_v_kernels; - // The next one is cleared in between kernels, and tracks the allocations - std::vector spir_v_kernel_allocations; - - // Id of entry point for kernel currently being compiled. - uint32_t current_function_id; + void visit_binop(Type t, const Expr &a, const Expr &b, SpvOp op_code); + // The SPIRV-IR builder + SpvBuilder builder; + // Top-level function for adding kernels void add_kernel(const Stmt &s, const std::string &name, const std::vector &args); + void init_module(); + void compile(std::vector& binary); - // Function for allocating variables in function scope, with optional initializer. 
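// Illustrative sketch (not part of this patch): the commit message above mentions a
// splitmix64-based hashing scheme for caching types and constants in the SPIR-V builder.
// Below is the well-known splitmix64 mixer plus a hypothetical way to key a type cache
// with it; hash_type_key and its field packing are illustrative, not the exact scheme
// used in SpirvIR.cpp.
#include <cstdint>

// Standard splitmix64 mixer.
uint64_t splitmix64(uint64_t x) {
    x += 0x9e3779b97f4a7c15ULL;
    x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
    x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
    return x ^ (x >> 31);
}

// Hypothetical cache key for a scalar/vector type: pack the type's code, bit width
// and lane count into one word, then mix it.
uint64_t hash_type_key(uint8_t type_code, uint8_t bits, uint16_t lanes) {
    uint64_t packed = (uint64_t(type_code) << 32) | (uint64_t(bits) << 16) | uint64_t(lanes);
    return splitmix64(packed);
}
// A builder could then keep, e.g., std::unordered_map<uint64_t, SpvId> keyed by this hash
// so previously declared type and constant ids are reused instead of re-emitted.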
- // These will appear at the beginning of the function, as required by SPIR-V - void add_allocation(uint32_t result_type_id, uint32_t result_id, uint32_t storage_class, uint32_t initializer = 0); - - std::map type_map; - std::map, uint32_t> pointer_type_map; - std::map pair_type_map; - std::map constant_map; - - void add_instruction(std::vector ®ion, uint32_t opcode, - std::initializer_list words); - void add_instruction(uint32_t opcode, std::initializer_list words); - void add_instruction(std::vector ®ion, uint32_t opcode, - std::vector words); - void add_instruction(uint32_t opcode, std::vector words); - uint32_t map_type(const Type &type); - uint32_t map_pointer_type(const Type &type, uint32_t storage_class); - uint32_t map_type_to_pair(const Type &t); - uint32_t emit_constant(const Type &t, const void *data); + // Scalarize expressions void scalarize(const Expr &e); + SpvId map_type_to_pair(const Type &t); - // The scope contains both the symbol and its storage class - Scope> symbol_table; + // The scope contains both the symbol id and its storage class + using SymbolIdStorageClassPair = std::pair; + using SymbolScope = Scope; + using ScopedSymbolBinding = ScopedBinding; + SymbolScope symbol_table; // The workgroup size. Must be the same for all kernels. uint32_t workgroup_size[3]; - struct PhiNodeInputs { - uint32_t ids[4]; - }; // Returns Phi node inputs. template - PhiNodeInputs emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case); + SpvFactory::BlockVariables emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case); + } emitter; std::string current_kernel_name; }; -// -- -void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(std::vector ®ion, uint32_t opcode, - std::initializer_list words) { - region.push_back(((1 + words.size()) << 16) | opcode); - region.insert(region.end(), words.begin(), words.end()); -} - -void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(uint32_t opcode, std::initializer_list words) { - spir_v_kernels.push_back(((1 + words.size()) << 16) | opcode); - spir_v_kernels.insert(spir_v_kernels.end(), words.begin(), words.end()); -} -void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(std::vector ®ion, uint32_t opcode, - std::vector words) { - region.push_back(((1 + words.size()) << 16) | opcode); - region.insert(region.end(), words.begin(), words.end()); -} - -void CodeGen_Vulkan_Dev::SPIRVEmitter::add_instruction(uint32_t opcode, std::vector words) { - spir_v_kernels.push_back(((1 + words.size()) << 16) | opcode); - spir_v_kernels.insert(spir_v_kernels.end(), words.begin(), words.end()); -} - -uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::emit_constant(const Type &t, const void *data) { - // TODO: this needs to emit OpConstantComposite for constants with lane > 1 - std::string key(t.bytes() + 4, ' '); - key[0] = t.code(); - key[1] = t.bits(); - key[2] = t.lanes() & 0xff; - key[3] = (t.lanes() >> 8) & 0xff; - const char *data_char = (const char *)data; - for (int i = 0; i < t.bytes(); i++) { - key[i + 4] = data_char[i]; - } - - debug(3) << "emit_constant for type " << t << "\n"; - auto item = constant_map.find(key); - if (item == constant_map.end()) { - uint32_t type_id = map_type(t); - uint32_t extra_words = (t.bytes() + 3) / 4; - uint32_t constant_id = next_id++; - spir_v_types.push_back(((3 + extra_words) << 16) | SpvOpConstant); - spir_v_types.push_back(type_id); - spir_v_types.push_back(constant_id); - - const uint8_t *data_temp = (const uint8_t *)data; - size_t bytes_copied = 0; - for (uint32_t 
i = 0; i < extra_words; i++) { - uint32_t word; - size_t to_copy = std::min(t.bytes() - bytes_copied, (size_t)4); - memcpy(&word, data_temp, to_copy); - bytes_copied += to_copy; - spir_v_types.push_back(word); - data_temp++; - } - return constant_id; - } else { - return item->second; - } -} - -void CodeGen_Vulkan_Dev::SPIRVEmitter::scalarize(const Expr &e) { - internal_assert(e.type().is_vector()) << "CodeGen_Vulkan_Dev::SPIRVEmitter::scalarize must be called with an expression of vector type.\n"; - uint32_t type_id = map_type(e.type()); - - uint32_t result_id = next_id++; - add_instruction(SpvOpConstantNull, {type_id, result_id}); +void CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(const Expr &e) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(): " << (Expr)e << "\n"; + internal_assert(e.type().is_vector()) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize must be called with an expression of vector type.\n"; + SpvId type_id = builder.declare_type(e.type()); + SpvId value_id = builder.declare_null_constant(e.type()); + SpvId result_id = value_id; for (int i = 0; i < e.type().lanes(); i++) { extract_lane(e, i).accept(this); - uint32_t composite_vec = next_id++; - add_instruction(SpvOpVectorInsertDynamic, {type_id, composite_vec, (uint32_t)i, result_id, id}); - result_id = composite_vec; + SpvId vector_id = builder.current_id(); + SpvId composite_vector_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::vector_insert_dynamic(type_id, composite_vector_id, vector_id, value_id, i)); + result_id = composite_vector_id; } - id = result_id; -} - -uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_type(const Type &t) { - auto key_typecode = t.code(); - - Type t_key(key_typecode, t.bits(), t.lanes()); - - auto item = type_map.find(t_key); - if (item == type_map.end()) { - // TODO, handle arrays, pointers, halide_buffer_t - uint32_t type_id = 0; - if (t.lanes() != 1) { - uint32_t base_id = map_type(t.with_lanes(1)); - type_id = next_id++; - add_instruction(spir_v_types, SpvOpTypeVector, {type_id, base_id, (uint32_t)t.lanes()}); - } else { - if (t.is_float()) { - type_id = next_id++; - add_instruction(spir_v_types, SpvOpTypeFloat, {type_id, (uint32_t)t.bits()}); - } else if (t.is_bool()) { - type_id = next_id++; - add_instruction(spir_v_types, SpvOpTypeBool, {type_id}); - } else if (t.is_int_or_uint()) { - type_id = next_id++; - uint32_t signedness = t.is_uint() ? 
0 : 1; - add_instruction(spir_v_types, SpvOpTypeInt, {type_id, (uint32_t)t.bits(), signedness}); - } else { - internal_error << "Unsupported type in Vulkan backend " << t << "\n"; - } - } - type_map[t_key] = type_id; - return type_id; + builder.update_id(result_id); +} + +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::map_type_to_pair(const Type &t) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::map_type_to_pair(): " << t << "\n"; + SpvId base_type_id = builder.declare_type(t); + const std::string& type_name = type_to_c_type(t, false, false) + std::string("_pair"); + SpvBuilder::StructMemberTypes member_type_ids = {base_type_id, base_type_id}; + SpvId struct_type_id = builder.declare_struct(type_name, member_type_ids); + return struct_type_id; +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Variable *var) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Variable): " << var->type << " " << var->name << "\n"; + SpvId variable_id = symbol_table.get(var->name).first; + user_assert(variable_id != SpvInvalidId) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Variable): Invalid symbol name!\n"; + builder.update_id(variable_id); +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IntImm *imm) { + if(imm->type.bits() == 8) { + const int8_t value = (int8_t)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else if(imm->type.bits() == 16) { + const int16_t value = (int16_t)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else if(imm->type.bits() == 32) { + const int32_t value = (int32_t)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else if(imm->type.bits() == 64) { + const int64_t value = (int64_t)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); } else { - return item->second; - } -} - -uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_type_to_pair(const Type &t) { - uint32_t &ref = pair_type_map[t]; - - if (ref == 0) { - uint32_t base_type = map_type(t); - - uint32_t type_id = next_id++; - - add_instruction(spir_v_types, SpvOpTypeStruct, {type_id, base_type, base_type}); - ref = type_id; + internal_error << "Vulkan backend currently only supports 8-bit, 16-bit, 32-bit or 64-bit signed integers!\n"; } - return ref; } -uint32_t CodeGen_Vulkan_Dev::SPIRVEmitter::map_pointer_type(const Type &type, uint32_t storage_class) { - auto key = std::make_pair(type, storage_class); - uint32_t &ref = pointer_type_map[key]; - if (ref == 0) { - uint32_t base_type_id = map_type(type); - ref = next_id++; - add_instruction(spir_v_types, SpvOpTypePointer, {ref, storage_class, base_type_id}); - pointer_type_map[key] = ref; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const UIntImm *imm) { + if(imm->type.bits() == 8) { + const uint8_t value = (uint8_t)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else if(imm->type.bits() == 16) { + const uint16_t value = (uint16_t)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else if(imm->type.bits() == 32) { + const uint32_t value = (uint32_t)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else if(imm->type.bits() == 64) { + const uint64_t value = (uint64_t)(imm->value); + SpvId 
constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else { + internal_error << "Vulkan backend currently only supports 8-bit, 16-bit, 32-bit or 64-bit unsigned integers!\n"; } - - return ref; -} - -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Variable *var) { - id = symbol_table.get(var->name).first; -} - -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const IntImm *imm) { - id = emit_constant(imm->type, &imm->value); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const UIntImm *imm) { - id = emit_constant(imm->type, &imm->value); +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const StringImm *imm) { + SpvId constant_id = builder.declare_string_constant(imm->value); + builder.update_id(constant_id); } -namespace { -void encode_string(std::vector §ion, uint32_t words, - const size_t str_size, const char *str) { - size_t bytes_copied = 0; - for (uint32_t i = 0; i < words; i++) { - uint32_t word; - size_t to_copy = std::min(str_size + 1 - bytes_copied, (size_t)4); - memcpy(&word, str, to_copy); - bytes_copied += to_copy; - section.push_back(word); - str += 4; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { + if(imm->type.bits() == 32) { + const float value = (float)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else if(imm->type.bits() == 64) { + const double value = (double)(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else { + internal_error << "Vulkan backend currently only supports 32-bit or 64-bit floats\n"; } } -} // namespace -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const StringImm *imm) { - uint32_t extra_words = (imm->value.size() + 1 + 3) / 4; - id = next_id++; - spir_v_kernels.push_back(((2 + extra_words) << 16) | SpvOpString); - spir_v_kernels.push_back(id); - - const char *data_temp = (const char *)imm->value.c_str(); - const size_t data_size = imm->value.size(); - encode_string(spir_v_kernels, extra_words, data_size, data_temp); -} -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const FloatImm *imm) { - user_assert(imm->type.bits() == 32) << "Vulkan backend currently only supports 32-bit floats\n"; - float float_val = (float)(imm->value); - id = emit_constant(imm->type, &float_val); -} +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): " << op->value.type() << " to " << op->type << "\n"; -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Cast *op) { - uint32_t opcode = 0; + SpvOp op_code = SpvOpNop; if (op->value.type().is_float()) { if (op->type.is_float()) { - opcode = SpvOpFConvert; + op_code = SpvOpFConvert; } else if (op->type.is_uint()) { - opcode = SpvOpConvertFToU; + op_code = SpvOpConvertFToU; } else if (op->type.is_int()) { - opcode = SpvOpConvertFToS; + op_code = SpvOpConvertFToS; } else { - internal_error << "Vulkan cast unhandled case " << op->value.type() << " to " << op->type << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; } } else if (op->value.type().is_uint()) { if (op->type.is_float()) { - opcode = SpvOpConvertUToF; + op_code = SpvOpConvertUToF; } else if (op->type.is_uint()) { - opcode = SpvOpUConvert; + op_code = SpvOpUConvert; } else if (op->type.is_int()) { - opcode = SpvOpSatConvertUToS; + op_code = SpvOpSatConvertUToS; } else { - internal_error << "Vulkan cast 
unhandled case " << op->value.type() << " to " << op->type << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; } } else if (op->value.type().is_int()) { if (op->type.is_float()) { - opcode = SpvOpConvertSToF; + op_code = SpvOpConvertSToF; } else if (op->type.is_uint()) { - opcode = SpvOpSatConvertSToU; + op_code = SpvOpSatConvertSToU; } else if (op->type.is_int()) { - opcode = SpvOpSConvert; + op_code = SpvOpSConvert; } else { - internal_error << "Vulkan cast unhandled case " << op->value.type() << " to " << op->type << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; } } else { - internal_error << "Vulkan cast unhandled case " << op->value.type() << " to " << op->type << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; } - uint32_t type_id = map_type(op->type); + SpvId type_id = builder.declare_type(op->type); op->value.accept(this); - uint32_t src_id = id; - id = next_id++; - add_instruction(opcode, {type_id, id, src_id}); + SpvId src_id = builder.current_id(); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::convert(op_code, type_id, result_id, src_id)); + builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Add *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Add *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Add): " << op->type << " ((" << op->a << ") + (" << op->b << "))\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFAdd : SpvOpIAdd); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Sub *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Sub *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Sub): " << op->type << " ((" << op->a << ") - (" << op->b << "))\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFSub : SpvOpISub); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Mul *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mul *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mul): " << op->type << " ((" << op->a << ") * (" << op->b << "))\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? 
SpvOpFMul : SpvOpIMul); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Div *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Div): " << op->type << " ((" << op->a << ") / (" << op->b << "))\n"; user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n"; if (op->type.is_float()) { @@ -427,7 +319,8 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Div *op) { } } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Mod *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; if (op->type.is_float()) { // Takes sign of result from op->b visit_binop(op->type, op->a, op->b, SpvOpFMod); @@ -437,7 +330,9 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Mod *op) { } } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Max *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Max *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Max): " << op->type << " Max((" << op->a << "), (" << op->b << "))\n"; + std::string a_name = unique_name('a'); std::string b_name = unique_name('b'); Expr a = Variable::make(op->a.type(), a_name); @@ -447,7 +342,8 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Max *op) { temp.accept(this); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Min *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Min): " << op->type << " Min((" << op->a << "), (" << op->b << "))\n"; std::string a_name = unique_name('a'); std::string b_name = unique_name('b'); Expr a = Variable::make(op->a.type(), a_name); @@ -457,93 +353,107 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Min *op) { temp.accept(this); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const EQ *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(EQ): " << op->type << " (" << op->a << ") == (" << op->b << ")\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFOrdEqual : SpvOpIEqual); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const NE *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const NE *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(NE): " << op->type << " (" << op->a << ") != (" << op->b << ")\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? 
SpvOpFOrdNotEqual : SpvOpINotEqual); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LT *op) { - uint32_t opcode = 0; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LT): " << op->type << " (" << op->a << ") < (" << op->b << ")\n"; + SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { - opcode = SpvOpFOrdLessThan; + op_code = SpvOpFOrdLessThan; } else if (op->a.type().is_int()) { - opcode = SpvOpSLessThan; + op_code = SpvOpSLessThan; } else if (op->a.type().is_uint()) { - opcode = SpvOpULessThan; + op_code = SpvOpULessThan; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LT *op): unhandled type: " << op->a.type() << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op): unhandled type: " << op->a.type() << "\n"; } - visit_binop(op->type, op->a, op->b, opcode); + visit_binop(op->type, op->a, op->b, op_code); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LE *op) { - uint32_t opcode = 0; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LE): " << op->type << " (" << op->a << ") <= (" << op->b << ")\n"; + SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { - opcode = SpvOpFOrdLessThanEqual; + op_code = SpvOpFOrdLessThanEqual; } else if (op->a.type().is_int()) { - opcode = SpvOpSLessThanEqual; + op_code = SpvOpSLessThanEqual; } else if (op->a.type().is_uint()) { - opcode = SpvOpULessThanEqual; + op_code = SpvOpULessThanEqual; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const LE *op): unhandled type: " << op->a.type() << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op): unhandled type: " << op->a.type() << "\n"; } - visit_binop(op->type, op->a, op->b, opcode); + visit_binop(op->type, op->a, op->b, op_code); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const GT *op) { - uint32_t opcode = 0; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(GT): " << op->type << " (" << op->a << ") > (" << op->b << ")\n"; + SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { - opcode = SpvOpFOrdGreaterThan; + op_code = SpvOpFOrdGreaterThan; } else if (op->a.type().is_int()) { - opcode = SpvOpSGreaterThan; + op_code = SpvOpSGreaterThan; } else if (op->a.type().is_uint()) { - opcode = SpvOpUGreaterThan; + op_code = SpvOpUGreaterThan; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const GT *op): unhandled type: " << op->a.type() << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op): unhandled type: " << op->a.type() << "\n"; } - visit_binop(op->type, op->a, op->b, opcode); + visit_binop(op->type, op->a, op->b, op_code); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const GE *op) { - uint32_t opcode = 0; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(GE): " << op->type << " (" << op->a << ") >= (" << op->b << ")\n"; + SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { - opcode = SpvOpFOrdGreaterThanEqual; + op_code = SpvOpFOrdGreaterThanEqual; } else if (op->a.type().is_int()) { - opcode = SpvOpSGreaterThanEqual; + op_code = SpvOpSGreaterThanEqual; } else if (op->a.type().is_uint()) { - opcode = SpvOpUGreaterThanEqual; + op_code = SpvOpUGreaterThanEqual; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const GE *op): 
unhandled type: " << op->a.type() << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op): unhandled type: " << op->a.type() << "\n"; } - visit_binop(op->type, op->a, op->b, opcode); + visit_binop(op->type, op->a, op->b, op_code); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const And *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const And *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(And): " << op->type << " (" << op->a << ") && (" << op->b << ")\n"; visit_binop(op->type, op->a, op->b, SpvOpLogicalAnd); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Or *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Or *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Or): " << op->type << " (" << op->a << ") || (" << op->b << ")\n"; visit_binop(op->type, op->a, op->b, SpvOpLogicalOr); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Not *op) { - uint32_t type_id = map_type(op->type); +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Not *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Not): " << op->type << " !(" << op->a << ")\n"; + + SpvId type_id = builder.declare_type(op->type); op->a.accept(this); - uint32_t a_id = id; - id = next_id++; - add_instruction(SpvOpLogicalNot, {type_id, id, a_id}); + SpvId src_id = builder.current_id(); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::logical_not(type_id, result_id, src_id)); + builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Call): " << op->type << " " << op->name << " args=" << (uint32_t)op->args.size() << "\n"; + if (op->is_intrinsic(Call::gpu_thread_barrier)) { // TODO: Check the scopes here and figure out if this is the // right memory barrier. Might be able to use // SpvMemorySemanticsMaskNone instead. - add_instruction(SpvOpControlBarrier, {current_function_id, current_function_id, - SpvMemorySemanticsAcquireReleaseMask}); + SpvId current_function_id = builder.current_function().id(); + builder.append(SpvFactory::control_barrier(current_function_id, current_function_id, + SpvMemorySemanticsAcquireReleaseMask)); } else if (op->is_intrinsic(Call::bitwise_and)) { internal_assert(op->args.size() == 2); visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseAnd); @@ -555,91 +465,80 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseOr); } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); - uint32_t type_id = map_type(op->type); + SpvId type_id = builder.declare_type(op->type); op->args[0]->accept(this); - uint32_t arg_id = id; - id = next_id++; - add_instruction(SpvOpNot, {type_id, id, arg_id}); + SpvId arg_id = builder.current_id(); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::logical_not(type_id, result_id, arg_id)); + builder.update_id(result_id); } else if (op->is_intrinsic(Call::if_then_else)) { if (op->type.is_vector()) { scalarize(op); } else { - internal_assert(op->args.size() == 3); - auto phi_inputs = emit_if_then_else(op->args[0], op->args[1], op->args[2]); // Generate Phi node if used as an expression. 
- - uint32_t type_id = map_type(op->type); - id = next_id++; - spir_v_kernels.push_back((7 << 16) | SpvOpPhi); - spir_v_kernels.push_back(type_id); - spir_v_kernels.push_back(id); - spir_v_kernels.insert(spir_v_kernels.end(), phi_inputs.ids, phi_inputs.ids + 4); + internal_assert(op->args.size() == 3); + SpvFactory::BlockVariables block_vars = emit_if_then_else(op->args[0], op->args[1], op->args[2]); + SpvId type_id = builder.declare_type(op->type); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::phi(type_id, result_id, block_vars)); + builder.update_id(result_id); } } else if (op->is_intrinsic(Call::IntrinsicOp::div_round_to_zero)) { internal_assert(op->args.size() == 2); - uint32_t opcode = 0; + SpvOp op_code = SpvOpNop; if (op->type.is_int()) { - opcode = SpvOpSDiv; + op_code = SpvOpSDiv; } else if (op->type.is_uint()) { - opcode = SpvOpUDiv; + op_code = SpvOpUDiv; } else { internal_error << "div_round_to_zero of non-integer type.\n"; } - visit_binop(op->type, op->args[0], op->args[1], opcode); + visit_binop(op->type, op->args[0], op->args[1], op_code); } else if (op->is_intrinsic(Call::IntrinsicOp::mod_round_to_zero)) { internal_assert(op->args.size() == 2); - uint32_t opcode = 0; + SpvOp op_code = SpvOpNop; if (op->type.is_int()) { - opcode = SpvOpSMod; + op_code = SpvOpSMod; } else if (op->type.is_uint()) { - opcode = SpvOpUMod; + op_code = SpvOpUMod; } else { internal_error << "mod_round_to_zero of non-integer type.\n"; } - visit_binop(op->type, op->args[0], op->args[1], opcode); + visit_binop(op->type, op->args[0], op->args[1], op_code); } else if (op->is_intrinsic(Call::IntrinsicOp::mul_shift_right)) { internal_assert(op->args.size() == 3); - uint32_t type_id = map_type(op->type); + uint32_t type_id = builder.declare_type(op->type); op->args[0].accept(this); - uint32_t a_id = id; + SpvId src_a_id = builder.current_id(); op->args[1].accept(this); - uint32_t b_id = id; + SpvId src_b_id = builder.current_id(); - uint32_t pair_type_id = map_type_to_pair(op->type); + SpvId pair_type_id = map_type_to_pair(op->type); // Double width multiply - uint32_t product_pair = next_id++; - spir_v_kernels.push_back((5 << 16) | (op->type.is_uint() ? SpvOpUMulExtended : SpvOpSMulExtended)); - spir_v_kernels.push_back(pair_type_id); - spir_v_kernels.push_back(a_id); - spir_v_kernels.push_back(b_id); - - uint32_t high_item_id = next_id++; - spir_v_kernels.push_back((5 << 16) | SpvOpCompositeExtract); - spir_v_kernels.push_back(type_id); - spir_v_kernels.push_back(high_item_id); - spir_v_kernels.push_back(product_pair); - spir_v_kernels.push_back(1); + SpvId product_pair_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::multiply_extended(pair_type_id, product_pair_id, src_a_id, src_b_id, op->type.is_uint() ? false : true)); + + SpvFactory::Indices indices = {1}; + uint32_t high_item_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_extract(type_id, high_item_id, product_pair_id, indices)); const UIntImm *shift = op->args[2].as(); internal_assert(shift != nullptr) << "Third argument to mul_shift_right intrinsic must be an unsigned integer immediate.\n"; - uint32_t result_id; + SpvId result_id = high_item_id; if (shift->value != 0) { // TODO: This code depends on compilation happening on a little-endian host. - uint32_t shr_id = emit_constant(shift->type, &shift->value); - result_id = next_id++; - spir_v_kernels.push_back((5 << 16) | (op->type.is_uint() ? 
SpvOpShiftRightLogical : SpvOpShiftRightArithmetic)); - spir_v_kernels.push_back(type_id); - spir_v_kernels.push_back(result_id); - spir_v_kernels.push_back(high_item_id); - spir_v_kernels.push_back(shr_id); - } else { - result_id = high_item_id; + SpvId shift_amount_id = builder.declare_constant(shift->type, &shift->value); + result_id = builder.reserve_id(SpvResultId); + if(op->type.is_uint()) { + builder.append(SpvFactory::shift_right_logical(type_id, result_id, high_item_id, shift_amount_id)); + } else { + builder.append(SpvFactory::shift_right_arithmetic(type_id, result_id, high_item_id, shift_amount_id)); + } } - - id = result_id; + builder.update_id(result_id); } else if (op->is_intrinsic(Call::IntrinsicOp::sorted_avg)) { internal_assert(op->args.size() == 2); // b > a, so the following works without widening: @@ -649,21 +548,23 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Call *op) { } } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Select *op) { - uint32_t type_id = map_type(op->type); +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Select *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Select): " << op->type << " (" << op->condition << ") ? (" << op->true_value << ") : (" << op->false_value << ")\n"; + SpvId type_id = builder.declare_type(op->type); op->condition.accept(this); - uint32_t cond_id = id; + SpvId cond_id = builder.current_id(); op->true_value.accept(this); - uint32_t true_id = id; + SpvId true_id = builder.current_id(); op->false_value.accept(this); - uint32_t false_id = id; - id = next_id++; - add_instruction(SpvOpSelect, {type_id, id, cond_id, true_id, false_id}); + SpvId false_id = builder.current_id(); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::select(type_id, result_id, cond_id, true_id, false_id)); + builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Load *op) { - debug(2) << "Vulkan codegen: Load: " << (Expr)op << "\n"; - user_assert(is_const_one(op->predicate)) << "Predicated loads not supported by the Vulkan backend\n"; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): " << (Expr)op << "\n"; + user_assert(is_const_one(op->predicate)) << "Predicated loads not supported by SPIR-V codegen\n"; // TODO: implement vector loads // TODO: correct casting to the appropriate memory space @@ -672,27 +573,33 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Load *op) { internal_assert(op->param.defined() && op->param.is_buffer()); // Construct the pointer to read from - auto id_and_storage_class = symbol_table.get(op->name); - uint32_t base_id = id_and_storage_class.first; - uint32_t storage_class = id_and_storage_class.second; + internal_assert(symbol_table.contains(op->name)); + SymbolIdStorageClassPair id_and_storage_class = symbol_table.get(op->name); + SpvId base_id = id_and_storage_class.first; + SpvStorageClass storage_class = id_and_storage_class.second; + internal_assert(base_id != SpvInvalidId); + internal_assert(storage_class != SpvInvalidId); op->index.accept(this); - uint32_t index_id = id; - uint32_t ptr_type_id = map_pointer_type(op->type, storage_class); - uint32_t access_chain_id = next_id++; - auto zero = 0; - add_instruction(SpvOpInBoundsAccessChain, {ptr_type_id, access_chain_id, base_id, - emit_constant(UInt(32), &zero), index_id}); - - id = next_id++; - uint32_t result_type_id = map_type(op->type); - add_instruction(SpvOpLoad, {result_type_id, id, 
access_chain_id}); -} + SpvId index_id = builder.current_id(); -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Store *op) { - debug(2) << "Vulkan codegen: Store: " << (Stmt)op << "\n"; + uint32_t zero = 0; + SpvId type_id = builder.declare_type(op->type); + SpvId zero_id = builder.declare_constant(UInt(32), &zero); + SpvId ptr_type_id = builder.declare_pointer_type(type_id, storage_class); + SpvId access_chain_id = builder.reserve_id(SpvResultId); + SpvFactory::Indices indices = {index_id}; + builder.append(SpvFactory::in_bounds_access_chain(ptr_type_id, access_chain_id, base_id, zero_id, indices)); - user_assert(is_const_one(op->predicate)) << "Predicated stores not supported by the Vulkan backend\n"; + SpvId result_id = builder.reserve_id(SpvResultId); + SpvId result_type_id = builder.declare_type(op->type); + builder.append(SpvFactory::load(result_type_id, result_id, access_chain_id)); + builder.update_id(result_id); +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): " << op->name << "[" << op->index << "] = (" << op->value << ")\n"; + user_assert(is_const_one(op->predicate)) << "Predicated stores not supported by SPIR-V codegen!\n"; // TODO: implement vector writes // TODO: correct casting to the appropriate memory space @@ -701,56 +608,54 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Store *op) { internal_assert(op->param.defined() && op->param.is_buffer()); op->value.accept(this); - uint32_t value_id = id; + SpvId value_id = builder.current_id(); // Construct the pointer to write to - auto id_and_storage_class = symbol_table.get(op->name); - uint32_t base_id = id_and_storage_class.first; - uint32_t storage_class = id_and_storage_class.second; + internal_assert(symbol_table.contains(op->name)); + SymbolIdStorageClassPair id_and_storage_class = symbol_table.get(op->name); + SpvId base_id = id_and_storage_class.first; + SpvStorageClass storage_class = id_and_storage_class.second; + internal_assert(base_id != SpvInvalidId); + internal_assert(storage_class != SpvInvalidId); op->index.accept(this); - uint32_t index_id = id; - uint32_t ptr_type_id = map_pointer_type(op->value.type(), storage_class); - uint32_t access_chain_id = next_id++; - auto zero = 0; - add_instruction(SpvOpInBoundsAccessChain, {ptr_type_id, access_chain_id, base_id, - emit_constant(UInt(32), &zero), index_id}); - - add_instruction(SpvOpStore, {access_chain_id, value_id}); + SpvId index_id = builder.current_id(); + SpvId type_id = builder.declare_type(op->value.type()); + SpvId ptr_type_id = builder.declare_pointer_type(type_id, storage_class); + SpvId access_chain_id = builder.reserve_id(SpvResultId); + + SpvId zero = 0; + SpvId zero_id = builder.declare_constant(UInt(32), &zero); + SpvFactory::Indices indices = {index_id}; + builder.append(SpvFactory::in_bounds_access_chain(ptr_type_id, access_chain_id, base_id, zero_id, indices)); + builder.append(SpvFactory::store(access_chain_id, value_id)); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Let *let) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Let *let) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Let): " << (Expr)let << "\n"; let->value.accept(this); - ScopedBinding> binding(symbol_table, let->name, {id, SpvStorageClassFunction}); + SpvId current_id = builder.current_id(); + ScopedSymbolBinding binding(symbol_table, let->name, {current_id, SpvStorageClassFunction}); let->body.accept(this); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const 
LetStmt *let) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LetStmt *let) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LetStmt): " << let->name << "\n"; let->value.accept(this); - ScopedBinding> binding(symbol_table, let->name, {id, SpvStorageClassFunction}); + SpvId current_id = builder.current_id(); + ScopedSymbolBinding binding(symbol_table, let->name, {current_id, SpvStorageClassFunction}); let->body.accept(this); + // TODO: Figure out undef here? - id = 0xffffffff; + builder.update_id(SpvInvalidId); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const AssertStmt *) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const AssertStmt *) { // TODO: Fill this in. } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const ProducerConsumer *) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const ProducerConsumer *) { // I believe these nodes are solely for annotation purposes. -#if 0 - string name; - if (op->is_producer) { - name = std::string("produce ") + op->name; - } else { - name = std::string("consume ") + op->name; - } - BasicBlock *produce = BasicBlock::Create(*context, name, function); - builder->CreateBr(produce); - builder->SetInsertPoint(produce); - codegen(op->body); -#endif } namespace { @@ -778,15 +683,14 @@ int thread_loop_workgroup_index(const std::string &name) { ".__thread_id_y", ".__thread_id_z"}; for (size_t i = 0; i < sizeof(ids) / sizeof(std::string); i++) { - if (ends_with(name, ids[i])) { - return i; - } + if (ends_with(name, ids[i])) { return i; } } return -1; } } // anonymous namespace -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(For): " << op->name << "\n"; if (is_gpu_var(op->name)) { internal_assert((op->for_type == ForType::GPUBlock) || @@ -811,226 +715,255 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const For *op) { // Intrinsics are inserted when adding the kernel internal_assert(symbol_table.contains(intrinsic.first)); - - uint32_t intrinsic_id = symbol_table.get(intrinsic.first).first; - uint32_t gpu_var_id = next_id++; - uint32_t unsigned_gpu_var_id = next_id++; - add_instruction(SpvOpCompositeExtract, {map_type(UInt(32)), unsigned_gpu_var_id, intrinsic_id, intrinsic.second}); - // cast to int, which is what's expected by Halide's for loops - add_instruction(SpvOpBitcast, {map_type(Int(32)), gpu_var_id, unsigned_gpu_var_id}); - + SpvId intrinsic_id = symbol_table.get(intrinsic.first).first; + + // extract and cast to int (which is what's expected by Halide's for loops) + SpvId unsigned_type_id = builder.declare_type(UInt(32)); + SpvId unsigned_gpu_var_id = builder.reserve_id(SpvResultId); + SpvId signed_type_id = builder.declare_type(Int(32)); + SpvId signed_gpu_var_id = builder.reserve_id(SpvResultId); + SpvFactory::Indices indices = { intrinsic.second }; + builder.append(SpvFactory::composite_extract(unsigned_type_id, unsigned_gpu_var_id, intrinsic_id, indices)); + builder.append(SpvFactory::bitcast(signed_type_id, signed_gpu_var_id, unsigned_gpu_var_id)); { - ScopedBinding> binding(symbol_table, op->name, {gpu_var_id, SpvStorageClassUniform}); + ScopedSymbolBinding binding(symbol_table, op->name, {signed_gpu_var_id, SpvStorageClassUniform}); op->body.accept(this); } } else { - internal_assert(op->for_type == ForType::Serial) << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit unhandled For type: " << op->for_type << "\n"; + internal_assert(op->for_type == ForType::Serial) << 
"CodeGen_Vulkan_Dev::SPIRV_Emitter::visit unhandled For type: " << op->for_type << "\n"; // TODO: Loop vars are alway int32_t right? - uint32_t index_type_id = map_type(Int(32)); - uint32_t index_var_type_id = map_pointer_type(Int(32), SpvStorageClassFunction); + SpvId index_type_id = builder.declare_type(Int(32)); + SpvId index_var_type_id = builder.declare_pointer_type(index_type_id, SpvStorageClassFunction); op->min.accept(this); - uint32_t min_id = id; + SpvId min_id = builder.current_id(); op->extent.accept(this); - uint32_t extent_id = id; + SpvId extent_id = builder.current_id(); // Compute max. - uint32_t max_id = next_id++; - add_instruction(SpvOpIAdd, {index_type_id, max_id, min_id, extent_id}); + SpvId max_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::integer_add(index_type_id, max_id, min_id, extent_id)); // Declare loop var - // TODO: Can we use the phi node for this? - uint32_t loop_var_id = next_id++; - add_allocation(index_var_type_id, loop_var_id, SpvStorageClassFunction, min_id); - - uint32_t header_label_id = next_id++; - uint32_t loop_top_label_id = next_id++; - uint32_t body_label_id = next_id++; - uint32_t continue_label_id = next_id++; - uint32_t merge_label_id = next_id++; - add_instruction(SpvOpLabel, {header_label_id}); - add_instruction(SpvOpLoopMerge, {merge_label_id, continue_label_id, SpvLoopControlMaskNone}); - add_instruction(SpvOpBranch, {loop_top_label_id}); - add_instruction(SpvOpLabel, {loop_top_label_id}); + SpvId loop_var_id = builder.declare_variable(unique_name("loop_index"), index_var_type_id, SpvStorageClassFunction, min_id); - // loop test. - uint32_t cur_index_id = next_id++; - add_instruction(SpvOpLoad, {index_type_id, cur_index_id, loop_var_id}); + SpvId header_label_id = builder.reserve_id(SpvLabelId); + SpvId loop_top_label_id = builder.reserve_id(SpvLabelId); + SpvId body_label_id = builder.reserve_id(SpvLabelId); + SpvId continue_label_id = builder.reserve_id(SpvLabelId); + SpvId merge_label_id = builder.reserve_id(SpvLabelId); + + builder.append(SpvFactory::label(header_label_id)); + builder.append(SpvFactory::loop_merge(merge_label_id, continue_label_id, SpvLoopControlMaskNone)); + builder.append(SpvFactory::branch(loop_top_label_id)); + builder.append(SpvFactory::label(loop_top_label_id)); - uint32_t loop_test_id = next_id++; - add_instruction(SpvOpSLessThanEqual, {loop_test_id, cur_index_id, max_id}); - add_instruction(SpvOpBranchConditional, {loop_test_id, body_label_id, merge_label_id}); + // loop test. + SpvId cur_index_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(index_type_id, cur_index_id, loop_var_id)); - add_instruction(SpvOpLabel, {body_label_id}); + SpvId loop_test_type_id = builder.declare_type(Bool()); + SpvId loop_test_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::less_than_equal(loop_test_type_id, loop_test_id, cur_index_id, max_id, true)); + builder.append(SpvFactory::conditional_branch(loop_test_id, body_label_id, merge_label_id)); + builder.append(SpvFactory::label(body_label_id)); { - ScopedBinding> binding(symbol_table, op->name, {cur_index_id, SpvStorageClassFunction}); - + ScopedSymbolBinding binding(symbol_table, op->name, {cur_index_id, SpvStorageClassFunction}); op->body.accept(this); } + builder.append(SpvFactory::branch(continue_label_id)); + builder.append(SpvFactory::label(continue_label_id)); - add_instruction(SpvOpBranch, {continue_label_id}); - add_instruction(SpvOpLabel, {continue_label_id}); - - // Loop var update? 
- uint32_t next_index_id = next_id++; + // Update loop variable int32_t one = 1; - uint32_t constant_one_id = emit_constant(Int(32), &one); - add_instruction(SpvOpIAdd, {index_type_id, next_index_id, cur_index_id, constant_one_id}); - add_instruction(SpvOpStore, {index_type_id, next_index_id, loop_var_id}); - add_instruction(SpvOpBranch, {header_label_id}); - add_instruction(SpvOpLabel, {merge_label_id}); + SpvId next_index_id = builder.reserve_id(SpvResultId); + SpvId constant_one_id = builder.declare_constant(Int(32), &one); + builder.append(SpvFactory::integer_add(index_type_id, next_index_id, cur_index_id, constant_one_id)); + builder.append(SpvFactory::store(loop_var_id, next_index_id)); + + builder.append(SpvFactory::branch(header_label_id)); + builder.append(SpvFactory::label(merge_label_id)); } } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Ramp *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Ramp *op) { // TODO: Is there a way to do this that doesn't require duplicating lane values? - uint32_t base_type_id = map_type(op->base.type()); - uint32_t type_id = map_type(op->type); + SpvId base_type_id = builder.declare_type(op->base.type()); + SpvId type_id = builder.declare_type(op->type); op->base.accept(this); - uint32_t base_id = id; + SpvId base_id = builder.current_id(); op->stride.accept(this); - uint32_t stride_id = id; - uint32_t add_opcode = op->base.type().is_float() ? SpvOpFAdd : SpvOpIAdd; + SpvId stride_id = builder.current_id(); + // Generate adds to make the elements of the ramp. - uint32_t prev_id = base_id; - uint32_t first_id = next_id; + SpvId prev_id = base_id; + SpvFactory::Components constituents = {base_id}; for (int i = 1; i < op->lanes; i++) { - uint32_t this_id = next_id++; - add_instruction(add_opcode, {base_type_id, this_id, prev_id, stride_id}); + SpvId this_id = builder.reserve_id(SpvResultId); + if(op->base.type().is_float()) { + builder.append(SpvFactory::float_add(base_type_id, this_id, prev_id, stride_id)); + } + else { + builder.append(SpvFactory::integer_add(base_type_id, this_id, prev_id, stride_id)); + } + constituents.push_back(this_id); prev_id = this_id; } - id = next_id++; - spir_v_kernels.push_back(((op->lanes + 3) << 16) | SpvOpCompositeConstruct); - spir_v_kernels.push_back(type_id); - spir_v_kernels.push_back(id); - spir_v_kernels.push_back(base_id); - for (int i = 1; i < op->lanes; i++) { - spir_v_kernels.push_back(first_id++); - } + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_construct(type_id, result_id, constituents)); + builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Broadcast *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Broadcast *op) { // TODO: Is there a way to do this that doesn't require duplicating lane values? 
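The Ramp lowering above chains scalar adds and then assembles the lanes with a composite construct. As a plain C++ sketch (independent of the emitter itself) of the lane values a ramp(base, stride, lanes) node denotes:

    #include <cassert>
    #include <vector>

    // ramp(base, stride, lanes) denotes the vector
    // [base, base + stride, base + 2*stride, ..., base + (lanes-1)*stride].
    std::vector<int> ramp(int base, int stride, int lanes) {
        std::vector<int> result;
        int value = base;
        for (int i = 0; i < lanes; i++) {
            result.push_back(value);  // mirrors the chain of adds emitted above
            value += stride;
        }
        return result;
    }

    int main() {
        const std::vector<int> expected = {3, 5, 7, 9};
        assert(ramp(3, 2, 4) == expected);
        return 0;
    }
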
- uint32_t type_id = map_type(op->type); + SpvId type_id = builder.declare_type(op->type); op->value.accept(this); - uint32_t value_id = id; - id = next_id++; - spir_v_kernels.push_back(((op->lanes + 3) << 16) | SpvOpCompositeConstruct); - spir_v_kernels.push_back(type_id); - spir_v_kernels.push_back(id); - spir_v_kernels.insert(spir_v_kernels.end(), op->lanes, value_id); + SpvId value_id = builder.current_id(); + SpvId result_id = builder.reserve_id(SpvResultId); + + SpvFactory::Components constituents; + constituents.insert(constituents.end(), op->lanes, value_id); + builder.append(SpvFactory::composite_construct(type_id, result_id, constituents)); + builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Provide *) { - internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Provide *): Provide encountered during codegen\n"; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Provide *) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Provide *): Provide encountered during codegen\n"; } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Allocate *) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *) { } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Free *) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Free *) { } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Realize *) { - internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Realize *): Realize encountered during codegen\n"; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Realize *) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Realize *): Realize encountered during codegen\n"; } template -CodeGen_Vulkan_Dev::SPIRVEmitter::PhiNodeInputs -CodeGen_Vulkan_Dev::SPIRVEmitter::emit_if_then_else(const Expr &condition, +SpvFactory::BlockVariables +CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case) { condition.accept(this); - uint32_t cond_id = id; - uint32_t then_label_id = next_id++; - uint32_t else_label_id = next_id++; - uint32_t merge_label_id = next_id++; + SpvId cond_id = builder.current_id(); + SpvId then_label_id = builder.reserve_id(SpvLabelId); + SpvId else_label_id = builder.reserve_id(SpvLabelId); + SpvId merge_label_id = builder.reserve_id(SpvLabelId); - add_instruction(SpvOpSelectionMerge, {merge_label_id, SpvSelectionControlMaskNone}); - add_instruction(SpvOpBranchConditional, {cond_id, then_label_id, else_label_id}); - add_instruction(SpvOpLabel, {then_label_id}); + // If Conditional + builder.append(SpvFactory::selection_merge(merge_label_id, SpvSelectionControlMaskNone)); + builder.append(SpvFactory::conditional_branch(cond_id, then_label_id, else_label_id)); + // Then block + builder.append(SpvFactory::label(then_label_id)); then_case.accept(this); - uint32_t then_id = id; - - add_instruction(SpvOpBranch, {merge_label_id}); - add_instruction(SpvOpLabel, {else_label_id}); - - else_case.accept(this); - uint32_t else_id = id; - - // Every basic block must end with a branch instruction - add_instruction(SpvOpBranch, {merge_label_id}); + SpvId then_id = builder.current_id(); + builder.append(SpvFactory::branch(merge_label_id)); + + SpvFactory::BlockVariables block_vars = { + { then_id, then_label_id } + }; + + // Else block (optional) + builder.append(SpvFactory::label(else_label_id)); + if(else_case.defined()) { + else_case.accept(this); + SpvId else_id = builder.current_id(); + block_vars.push_back({ else_id, else_label_id }); 
+ } + builder.append(SpvFactory::branch(merge_label_id)); - add_instruction(SpvOpLabel, {merge_label_id}); + // Merge label + builder.append(SpvFactory::label(merge_label_id)); - return {{then_id, then_label_id, else_id, else_label_id}}; + return block_vars; } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const IfThenElse *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IfThenElse *op) { emit_if_then_else(op->condition, op->then_case, op->else_case); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Evaluate *op) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Evaluate *op) { op->value.accept(this); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Shuffle *op) { - internal_assert(op->vectors.size() == 2) << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Shuffle *op): SPIR-V codegen currently only supports shuffles of vector pairs.\n"; - uint32_t type_id = map_type(op->type); +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { + internal_assert(op->vectors.size() == 2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op): SPIR-V codegen currently only supports shuffles of vector pairs.\n"; + SpvId type_id = builder.declare_type(op->type); op->vectors[0].accept(this); - uint32_t vector0_id = id; + SpvId vector0_id = builder.current_id(); op->vectors[1].accept(this); - uint32_t vector1_id = id; - - id = next_id++; - spir_v_kernels.push_back(((5 + op->indices.size()) << 16) | SpvOpPhi); - spir_v_kernels.push_back(type_id); - spir_v_kernels.push_back(id); - spir_v_kernels.push_back(vector0_id); - spir_v_kernels.push_back(vector1_id); - spir_v_kernels.insert(spir_v_kernels.end(), op->indices.begin(), op->indices.end()); + SpvId vector1_id = builder.current_id(); + + SpvFactory::Indices indices; + indices.insert(indices.end(), op->indices.begin(), op->indices.end()); + + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::vector_shuffle(type_id, result_id, vector0_id, vector1_id, indices)); + builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Prefetch *) { - internal_error << "CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Prefetch *): Prefetch encountered during codegen\n"; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Prefetch *) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Prefetch *): Prefetch encountered during codegen\n"; } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Fork *) { - internal_error << "void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Fork *) not supported yet."; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Fork *) { + internal_error << "void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Fork *) not supported yet."; } -void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Acquire *) { - internal_error << "void CodeGen_Vulkan_Dev::SPIRVEmitter::visit(const Acquire *) not supported yet."; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Acquire *) { + internal_error << "void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Acquire *) not supported yet."; } // TODO: fast math decorations. 
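The Shuffle lowering above forwards the Halide shuffle indices to a SPIR-V vector shuffle, whose indices address the concatenation of the two input vectors. A minimal C++ model of that indexing convention:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Each output lane is selected from the concatenation of the two inputs:
    // index i < a.size() reads a[i], otherwise it reads b[i - a.size()].
    std::vector<int> shuffle(const std::vector<int> &a,
                             const std::vector<int> &b,
                             const std::vector<std::size_t> &indices) {
        std::vector<int> out;
        for (std::size_t idx : indices) {
            out.push_back(idx < a.size() ? a[idx] : b[idx - a.size()]);
        }
        return out;
    }

    int main() {
        const std::vector<int> a = {10, 11, 12, 13};
        const std::vector<int> b = {20, 21, 22, 23};
        const std::vector<int> expected = {10, 20, 11, 21};  // interleave low halves
        assert(shuffle(a, b, {0, 4, 1, 5}) == expected);
        return 0;
    }
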
-void CodeGen_Vulkan_Dev::SPIRVEmitter::visit_binop(Type t, const Expr &a, const Expr &b, uint32_t opcode) { - uint32_t type_id = map_type(t); +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_binop(Type t, const Expr &a, const Expr &b, SpvOp op_code) { + SpvId type_id = builder.declare_type(t); a.accept(this); - uint32_t a_id = id; + SpvId src_a_id = builder.current_id(); b.accept(this); - uint32_t b_id = id; - id = next_id++; - add_instruction(opcode, {type_id, id, a_id, b_id}); + SpvId src_b_id = builder.current_id(); + + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(op_code, type_id, result_id, src_a_id, src_b_id)); + builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::add_allocation(uint32_t result_type_id, - uint32_t result_id, - uint32_t storage_class, - uint32_t initializer) { - if (initializer) { - add_instruction(spir_v_kernel_allocations, SpvOpVariable, {result_type_id, result_id, storage_class, initializer}); - } else { - add_instruction(spir_v_kernel_allocations, SpvOpVariable, {result_type_id, result_id, storage_class}); - } +void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { + + builder.reset(); + + // NOTE: Source language is irrelevant. We encode the binary directly + builder.set_source_language(SpvSourceLanguageUnknown); + + // TODO: Should we autodetect and/or force 32bit or 64bit? + builder.set_addressing_model(SpvAddressingModelLogical); + + // TODO: Is there a better memory model to use? + builder.set_memory_model(SpvMemoryModelGLSL450); + + // Capabilities + builder.require_capability(SpvCapabilityShader); + + // NOTE: Extensions are handled in finalize +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::compile\n"; + SpvBinary spirv_binary; + builder.finalize(); + builder.encode(spirv_binary); + module.reserve(spirv_binary.size() * sizeof(uint32_t)); + module.insert(module.end(), (const char*)spirv_binary.data(), (const char*)(spirv_binary.data() + spirv_binary.size())); } -void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(const Stmt &s, + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, const std::string &name, const std::vector &args) { debug(2) << "Adding Vulkan kernel " << name << "\n"; @@ -1043,238 +976,172 @@ void CodeGen_Vulkan_Dev::SPIRVEmitter::add_kernel(const Stmt &s, workgroup_size[1] = 0; workgroup_size[2] = 0; - // Declare the function type. TODO: should this be unique? - uint32_t function_type_id = next_id++; - - add_instruction(spir_v_types, SpvOpTypeFunction, {function_type_id, void_id}); - - // Add definition and parameters - current_function_id = next_id++; - add_instruction(SpvOpFunction, {void_id, current_function_id, SpvFunctionControlMaskNone, function_type_id}); - - // Insert the starting label - add_instruction(SpvOpLabel, {next_id++}); - - // TODO: what about variables that need the SIMT intrinsics for their initializer? 
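The compile() method above serializes the builder's 32-bit word stream into the std::vector<char> that the rest of the Halide pipeline expects. A small standalone sketch of that byte-level reinterpretation; the 0x07230203 magic word comes from the SPIR-V specification, and the remaining header words are elided:

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    int main() {
        // A SPIR-V module is a stream of 32-bit words; the first word is the
        // magic number. Version, generator, bound and schema words follow.
        const std::vector<uint32_t> words = {0x07230203u};

        // Reinterpret the word stream as bytes for the runtime.
        std::vector<char> bytes;
        bytes.reserve(words.size() * sizeof(uint32_t));
        bytes.insert(bytes.end(),
                     (const char *)words.data(),
                     (const char *)(words.data() + words.size()));
        assert(bytes.size() == words.size() * sizeof(uint32_t));

        // A same-endian consumer can recover the words and check the magic number.
        uint32_t magic = 0;
        std::memcpy(&magic, bytes.data(), sizeof(magic));
        assert(magic == 0x07230203u);
        return 0;
    }
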
- // Save the location where we'll insert OpVariable instructions - size_t index = spir_v_kernels.size(); - - std::vector entry_point_interface; - entry_point_interface.push_back(SpvExecutionModelGLCompute); - entry_point_interface.push_back(current_function_id); - // Add the string name of the function - encode_string(entry_point_interface, (name.size() + 1 + 3) / 4, name.size(), name.c_str()); + // Declare the kernel function + SpvId void_type_id = builder.declare_void_type(); + SpvId kernel_func_id = builder.add_function(name, void_type_id); + SpvFunction kernel_func = builder.lookup_function(kernel_func_id); + builder.enter_function(kernel_func); + builder.enter_block(kernel_func.entry_block()); +// SpvId start_label_id = kernel_func.entry_block().id(); // TODO: only add the SIMT intrinsics used + SpvFactory::Variables entry_point_variables; auto intrinsics = {"WorkgroupId", "LocalInvocationId"}; - for (const std::string &intrinsic : intrinsics) { - uint32_t intrinsic_id = next_id++; - uint32_t intrinsic_loaded_id = next_id++; - // The builtins are pointers to vec3 - uint32_t intrinsic_type_id = map_pointer_type(Type(Type::UInt, 32, 3), SpvStorageClassInput); + for (const std::string& intrinsic_name : intrinsics) { - add_instruction(spir_v_types, SpvOpVariable, {intrinsic_type_id, intrinsic_id, SpvStorageClassInput}); - add_instruction(SpvOpLoad, {map_type(Type(Type::UInt, 32, 3)), intrinsic_loaded_id, intrinsic_id}); - symbol_table.push(intrinsic, {intrinsic_loaded_id, SpvStorageClassInput}); + // The builtins are pointers to vec3 + SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); + SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, SpvStorageClassInput); + SpvId intrinsic_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, SpvStorageClassInput); + SpvId intrinsic_loaded_id = builder.reserve_id(); + builder.append( SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id) ); + symbol_table.push(intrinsic_name, {intrinsic_loaded_id, SpvStorageClassInput}); // Annotate that this is the specific builtin - auto built_in_kind = starts_with(intrinsic, "Workgroup") ? SpvBuiltInWorkgroupId : SpvBuiltInLocalInvocationId; - add_instruction(spir_v_annotations, SpvOpDecorate, {intrinsic_id, SpvDecorationBuiltIn, built_in_kind}); + SpvBuiltIn built_in_kind = starts_with(intrinsic_name, "Workgroup") ? 
SpvBuiltInWorkgroupId : SpvBuiltInLocalInvocationId; + SpvBuilder::Literals annotation_literals = { (uint32_t)built_in_kind }; + builder.add_annotation( intrinsic_id, SpvDecorationBuiltIn, annotation_literals ); // Add the builtin to the interface - entry_point_interface.push_back(intrinsic_id); + entry_point_variables.push_back(intrinsic_id); } // Add the entry point and exection mode - add_instruction(spir_v_entrypoints, - SpvOpEntryPoint, entry_point_interface); + builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); // GLSL-style: each input buffer is a runtime array in a buffer struct // All other params get passed in as a single uniform block // First, need to count scalar parameters to construct the uniform struct - std::vector scalar_types; - uint32_t offset = 0; - uint32_t param_pack_type_id = next_id++; - uint32_t param_pack_ptr_type_id = next_id++; - uint32_t param_pack_id = next_id++; - scalar_types.push_back(param_pack_type_id); - for (const DeviceArgument &arg : args) { + SpvBuilder::StructMemberTypes param_struct_members; + for (const auto &arg : args) { if (!arg.is_buffer) { - // record the type for later constructing the params struct type - scalar_types.push_back(map_type(arg.type)); + SpvId arg_type_id = builder.declare_type(arg.type); + param_struct_members.push_back(arg_type_id); + } + } + SpvId param_struct_type_id = builder.declare_struct( unique_name("param_struct"), param_struct_members); - // Add a decoration describing the offset - add_instruction(spir_v_annotations, SpvOpMemberDecorate, {param_pack_type_id, (uint32_t)(scalar_types.size() - 2), SpvDecorationOffset, offset}); - offset += arg.type.bytes(); + // Add a decoration describing the offset for each parameter struct member + uint32_t param_member_index = 0; + uint32_t param_member_offset = 0; + for (const auto &arg : args) { + if (!arg.is_buffer) { + SpvBuilder::Literals param_offset_literals = { param_member_offset }; + builder.add_struct_annotation( param_struct_type_id, param_member_index, SpvDecorationOffset, param_offset_literals ); + param_member_offset += arg.type.bytes(); + param_member_index++; } } // Add a Block decoration for the parameter pack itself - add_instruction(spir_v_annotations, SpvOpDecorate, {param_pack_type_id, SpvDecorationBlock}); - // We always pass in the parameter pack as the first binding - add_instruction(spir_v_annotations, SpvOpDecorate, {param_pack_id, SpvDecorationDescriptorSet, 0}); - add_instruction(spir_v_annotations, SpvOpDecorate, {param_pack_id, SpvDecorationBinding, 0}); + builder.add_annotation( param_struct_type_id, SpvDecorationBlock ); - // Add a struct type for the parameter pack and a pointer to it - add_instruction(spir_v_types, SpvOpTypeStruct, scalar_types); - add_instruction(spir_v_types, SpvOpTypePointer, {param_pack_ptr_type_id, SpvStorageClassUniform, param_pack_type_id}); // Add a variable for the parameter pack - add_instruction(spir_v_types, SpvOpVariable, {param_pack_ptr_type_id, param_pack_id, SpvStorageClassUniform}); + SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); + SpvId param_pack_var_id = builder.declare_global_variable(unique_name("kernel_params"), param_pack_ptr_type_id, SpvStorageClassUniform); + + // We always pass in the parameter pack as the first binding + SpvBuilder::Literals zero_literal = {0}; + builder.add_annotation( param_pack_var_id, SpvDecorationDescriptorSet, zero_literal ); + builder.add_annotation( param_pack_var_id, 
SpvDecorationBinding, zero_literal ); uint32_t binding_counter = 1; uint32_t scalar_index = 0; - for (const DeviceArgument &arg : args) { - uint32_t param_id = next_id++; + for (const auto &arg : args) { if (arg.is_buffer) { - uint32_t element_type = map_type(arg.type); - uint32_t runtime_arr_type = next_id++; - uint32_t struct_type = next_id++; - uint32_t ptr_struct_type = next_id++; - add_instruction(spir_v_types, SpvOpTypeRuntimeArray, {runtime_arr_type, element_type}); - add_instruction(spir_v_types, SpvOpTypeStruct, {struct_type, runtime_arr_type}); - add_instruction(spir_v_types, SpvOpTypePointer, {ptr_struct_type, SpvStorageClassUniform, struct_type}); + SpvId element_type_id = builder.declare_type(arg.type); + SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); + SpvBuilder::StructMemberTypes struct_member_types = { runtime_arr_type_id }; + SpvId struct_type_id = builder.declare_struct( unique_name("param_buffer_" + std::to_string(binding_counter)), struct_member_types ); + SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, SpvStorageClassUniform); + SpvId param_id = builder.declare_global_variable( unique_name("param_" + arg.name), ptr_struct_type_id, SpvStorageClassUniform); + // Annotate the struct to indicate it's passed in a GLSL-style buffer block - add_instruction(spir_v_annotations, SpvOpDecorate, {struct_type, SpvDecorationBufferBlock}); + builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); + // Annotate the array with its stride - add_instruction(spir_v_annotations, SpvOpDecorate, {runtime_arr_type, SpvDecorationArrayStride, (uint32_t)(arg.type.bytes())}); + SpvBuilder::Literals array_stride = { (uint32_t)(arg.type.bytes()) }; + builder.add_annotation(runtime_arr_type_id, SpvDecorationArrayStride, array_stride); + // Annotate the offset for the array - add_instruction(spir_v_annotations, SpvOpMemberDecorate, {struct_type, 0, SpvDecorationOffset, (uint32_t)0}); + SpvBuilder::Literals zero_literal = { uint32_t(0) }; + builder.add_struct_annotation(struct_type_id, 0, SpvDecorationOffset, zero_literal); // Set DescriptorSet and Binding - add_instruction(spir_v_annotations, SpvOpDecorate, {param_id, SpvDecorationDescriptorSet, 0}); - add_instruction(spir_v_annotations, SpvOpDecorate, {param_id, SpvDecorationBinding, binding_counter++}); + SpvBuilder::Literals binding_index = { uint32_t(binding_counter++) }; + builder.add_annotation(param_id, SpvDecorationDescriptorSet, zero_literal); + builder.add_annotation(param_id, SpvDecorationBinding, binding_index); + symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); - add_instruction(spir_v_types, SpvOpVariable, {ptr_struct_type, param_id, SpvStorageClassUniform}); } else { - uint32_t access_chain_id = next_id++; - add_instruction(SpvOpInBoundsAccessChain, {map_pointer_type(arg.type, SpvStorageClassUniform), - access_chain_id, - param_pack_id, - emit_constant(UInt(32), &scalar_index)}); + + SpvId arg_type_id = builder.declare_type(arg.type); + SpvId access_index_id = builder.declare_constant(UInt(32), &scalar_index); + SpvId pointer_type_id = builder.declare_pointer_type(arg_type_id, SpvStorageClassUniform); + SpvId access_chain_id = builder.declare_access_chain(pointer_type_id, param_pack_var_id, access_index_id, {}); scalar_index++; - add_instruction(SpvOpLoad, {map_type(arg.type), param_id, access_chain_id}); + + SpvId param_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(arg_type_id, param_id, access_chain_id)); + 
symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); } - symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); } s.accept(this); - // Insert return and function end delimiter - add_instruction(SpvOpReturn, {}); - add_instruction(SpvOpFunctionEnd, {}); - - // Insert the allocations in the right place - auto it = spir_v_kernels.begin() + index; - spir_v_kernels.insert(it, spir_v_kernel_allocations.begin(), spir_v_kernel_allocations.end()); - spir_v_kernel_allocations.clear(); + // Insert return statement end delimiter + kernel_func.entry_block().add_instruction( SpvFactory::return_stmt() ); workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); workgroup_size[2] = std::max(workgroup_size[2], (uint32_t)1); + // Add workgroup size to execution mode - add_instruction(spir_v_execution_modes, SpvOpExecutionMode, - {current_function_id, SpvExecutionModeLocalSize, - workgroup_size[0], workgroup_size[1], workgroup_size[2]}); + SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size( kernel_func_id, workgroup_size[0], workgroup_size[1], workgroup_size[2] ); + builder.current_module().add_execution_mode( exec_mode_inst ); // Pop scope - for (const DeviceArgument &arg : args) { + for (const auto &arg : args) { symbol_table.pop(arg.name); } - - // Reset to an invalid value for safety. - current_function_id = 0; + builder.leave_block(); + builder.leave_function(); } CodeGen_Vulkan_Dev::CodeGen_Vulkan_Dev(Target t) { } -namespace { -void add_extension(const std::string &extension_name, std::vector §ion) { - uint32_t extra_words = (extension_name.size() + 1 + 3) / 4; - section.push_back(((1 + extra_words) << 16) | SpvOpExtension); - - const char *data_temp = (const char *)extension_name.c_str(); - const size_t data_size = extension_name.size(); - encode_string(section, extra_words, data_size, data_temp); -} -} // namespace void CodeGen_Vulkan_Dev::init_module() { - debug(2) << "Vulkan device codegen init_module\n"; - - // Header. - emitter.spir_v_header.push_back(SpvMagicNumber); - emitter.spir_v_header.push_back(SpvVersion); - emitter.spir_v_header.push_back(SpvSourceLanguageUnknown); - emitter.spir_v_header.push_back(0); // Bound placeholder - emitter.spir_v_header.push_back(0); // Reserved for schema. - - // the unique void type - emitter.next_id++; // 0 is not a valid id - emitter.void_id = emitter.next_id++; - emitter.add_instruction(emitter.spir_v_types, SpvOpTypeVoid, {emitter.void_id}); - - // Capabilities - // TODO: only add those required by the generated code - emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityShader}); - // emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityInt8}); - // emitter.add_instruction(emitter.spir_v_header, SpvOpCapability, {SpvCapabilityUniformAndStorageBuffer8BitAccess}); - - // Extensions - // TODO: only add those required by the generated code - add_extension(std::string("SPV_KHR_8bit_storage"), emitter.spir_v_header); - - // Memory model - // TODO: 32-bit or 64-bit? - // TODO: Which memory model? - emitter.add_instruction(emitter.spir_v_header, SpvOpMemoryModel, - {SpvAddressingModelLogical, SpvMemoryModelGLSL450}); - - // OpCapability instructions - // Enumerate type maps and add subwidth integer types if used - // OpExtensions instructions - // OpExtImport instructions - // One OpMemoryModelInstruction - // OpEntryPoint instructions -- tricky as we don't know them until the kernels are added. 
May need to insert as we go. - // OpExecutionMode or OpExecutionModeId -- are these also added at add_kernel time? - // debug -- empty? - // annotation - // I believe alignment info for load/store/etc. is done with annotations. - // Also need various annotations for SIMT intrinsics, struct layouts, etc - // OpType instructions. Contained in spir_v_types member. - // Function declarations. Are there any? - // Function bodies -- one per add_kernel + debug(2) << "CodeGen_Vulkan_Dev::init_module\n"; + emitter.init_module(); } void CodeGen_Vulkan_Dev::add_kernel(Stmt stmt, const std::string &name, const std::vector &args) { + + debug(2) << "CodeGen_Vulkan_Dev::add_kernel " << name << "\n"; + + // We need to scalarize/de-predicate any loads/stores, since Vulkan does not support predication. + stmt = scalarize_predicated_loads_stores(stmt); + + debug(2) << "CodeGen_Vulkan_Dev: after removing predication: \n" + << stmt; + current_kernel_name = name; emitter.add_kernel(stmt, name, args); - // dump(); + + // dump the SPIRV file if requested + if(getenv("HL_SPIRV_DUMP_FILE")) { + dump(); + } } std::vector CodeGen_Vulkan_Dev::compile_to_src() { - //#ifdef WITH_VULKAN - - emitter.spir_v_header[3] = emitter.next_id; - - std::vector final_module; - size_t total_size = (emitter.spir_v_header.size() + emitter.spir_v_entrypoints.size() + emitter.spir_v_execution_modes.size() + emitter.spir_v_annotations.size() + emitter.spir_v_types.size() + emitter.spir_v_kernels.size()) * sizeof(uint32_t); - final_module.reserve(total_size); - final_module.insert(final_module.end(), (const char *)emitter.spir_v_header.data(), (const char *)(emitter.spir_v_header.data() + emitter.spir_v_header.size())); - final_module.insert(final_module.end(), (const char *)emitter.spir_v_entrypoints.data(), (const char *)(emitter.spir_v_entrypoints.data() + emitter.spir_v_entrypoints.size())); - final_module.insert(final_module.end(), (const char *)emitter.spir_v_execution_modes.data(), (const char *)(emitter.spir_v_execution_modes.data() + emitter.spir_v_execution_modes.size())); - final_module.insert(final_module.end(), (const char *)emitter.spir_v_annotations.data(), (const char *)(emitter.spir_v_annotations.data() + emitter.spir_v_annotations.size())); - final_module.insert(final_module.end(), (const char *)emitter.spir_v_types.data(), (const char *)(emitter.spir_v_types.data() + emitter.spir_v_types.size())); - final_module.insert(final_module.end(), (const char *)emitter.spir_v_kernels.data(), (const char *)(emitter.spir_v_kernels.data() + emitter.spir_v_kernels.size())); - assert(final_module.size() == total_size); - std::ofstream f("/home/skamil/out.spv", std::ios::out | std::ios::binary); - f.write((char *)(final_module.data()), final_module.size()); - f.close(); - - return final_module; - - //#endif + debug(2) << "CodeGen_Vulkan_Dev::compile_to_src\n"; + std::vector module; + emitter.compile(module); + return module; } std::string CodeGen_Vulkan_Dev::get_current_kernel_name() { @@ -1286,10 +1153,10 @@ std::string CodeGen_Vulkan_Dev::print_gpu_name(const std::string &name) { } void CodeGen_Vulkan_Dev::dump() { - // TODO: Figure out what goes here. - // For now: dump to file so source can be consumed by validator - auto module = compile_to_src(); - std::ofstream f("out.spv", std::ios::out | std::ios::binary); + std::vector module = compile_to_src(); + const char *filename = getenv("HL_SPIRV_DUMP_FILE") ? 
getenv("HL_SPIRV_DUMP_FILE") : "out.spv"; + debug(1) << "Vulkan: Dumping SPIRV module to file: '" << filename << "'\n"; + std::ofstream f(filename, std::ios::out | std::ios::binary); f.write((char *)(module.data()), module.size()); f.close(); } @@ -1302,3 +1169,17 @@ std::unique_ptr new_CodeGen_Vulkan_Dev(const Target &target) { } // namespace Internal } // namespace Halide + +#else // WITH_SPIRV + +namespace Halide { +namespace Internal { + +std::unique_ptr new_CodeGen_Vulkan_Dev(const Target &target) { + return nullptr; +} + +} // namespace Internal +} // namespace Halide + +#endif // WITH_SPIRV diff --git a/src/CodeGen_Vulkan_Dev.h b/src/CodeGen_Vulkan_Dev.h index f67d6d9859f2..d4830309e653 100644 --- a/src/CodeGen_Vulkan_Dev.h +++ b/src/CodeGen_Vulkan_Dev.h @@ -2,7 +2,8 @@ #define HALIDE_CODEGEN_VULKAN_DEV_H /** \file - * Defines the code-generator for producing Vulkan SPIR-V kernel code + * Defines the code-generator for producing SPIR-V binary modules for + * use with the Vulkan runtime */ #include diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 621e79de7c62..db6749fed020 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -6,6 +6,32 @@ namespace Halide { namespace Internal { +namespace { + + template + typename std::enable_if::value,T>::type + constexpr rotl(const T n, const S i){ + const T m = (std::numeric_limits::digits-1); + const T c = i & m; + return (n<>((T(0)-c)&m)); + } + + static uint64_t hash_splitmix64(uint64_t x) { + // http://xorshift.di.unimi.it/splitmix64.c + x += uint64_t(0x9e3779b97f4a7c15); + x = (x ^ (x >> 30)) * uint64_t(0xbf58476d1ce4e5b9); + x = (x ^ (x >> 27)) * uint64_t(0x94d049bb133111eb); + return x ^ (x >> 31); + } + + inline uint64_t hash_combine(uint64_t& seed, const uint64_t& value) { + // mix using a cheap asymmetric binary rotation + const uint64_t r = std::numeric_limits::digits/3; + return rotl(seed, r) ^ hash_splitmix64(value); + } +} + + /** SpvInstruction implementation **/ SpvInstruction SpvInstruction::make(SpvOp op_code) { SpvInstruction instance; @@ -112,7 +138,7 @@ void SpvInstruction::add_data(uint32_t bytes, const void *data) { memcpy(&entry, ptr, copy_size); bytes_copied += copy_size; add_immediate(entry); - ptr++; + ptr += 4; } } @@ -380,9 +406,14 @@ bool SpvModule::is_defined() const { return contents.defined(); } -void SpvModule::add_debug(const SpvInstruction &val) { +void SpvModule::add_debug_string(SpvId result_id, const std::string& string) { + check_defined(); + contents->debug_source.push_back(SpvFactory::debug_string(result_id, string)); +} + +void SpvModule::add_debug_symbol(SpvId id, const std::string& symbol) { check_defined(); - contents->debug.push_back(val); + contents->debug_symbols.push_back(SpvFactory::debug_symbol(id, symbol)); } void SpvModule::add_annotation(const SpvInstruction &val) { @@ -426,6 +457,11 @@ void SpvModule::add_entry_point(const std::string &name, SpvInstruction inst) { contents->entry_points[name] = std::move(inst); } +void SpvModule::set_binding_count(SpvId val) { + check_defined(); + contents->binding_count = val; +} + void SpvModule::set_source_language(SpvSourceLanguage val) { check_defined(); contents->source_language = val; @@ -441,6 +477,11 @@ void SpvModule::set_memory_model(SpvMemoryModel val) { contents->memory_model = val; } +SpvId SpvModule::binding_count() const { + check_defined(); + return contents->binding_count; +} + SpvSourceLanguage SpvModule::source_language() const { check_defined(); return contents->source_language; @@ -526,7 +567,7 @@ void SpvModule::encode(SpvBinary 
&binary) const { binary.push_back(SpvMagicNumber); binary.push_back(SpvVersion); binary.push_back(contents->source_language); - binary.push_back(0); // Bound placeholder (aka last id used) + binary.push_back(contents->binding_count); // last id bound to this module (aka last id used) binary.push_back(0); // Reserved for schema. // 1. Capabilities @@ -562,8 +603,11 @@ void SpvModule::encode(SpvBinary &binary) const { inst.encode(binary); } - // 7. Debug - for (const SpvInstruction &inst : contents->debug) { + // 7. Debug Source & Names + for (const SpvInstruction &inst : contents->debug_source) { + inst.encode(binary); + } + for (const SpvInstruction &inst : contents->debug_symbols) { inst.encode(binary); } @@ -596,22 +640,117 @@ void SpvModule::encode(SpvBinary &binary) const { // -- SpvBuilder::SpvBuilder() { - SpvId module_id = declare_id(SpvModuleId); + reset(); +} + +void SpvBuilder::reset() { + + kind_map.clear(); + type_map.clear(); + struct_map.clear(); + scope_map.clear(); + string_map.clear(); + constant_map.clear(); + function_map.clear(); + id_symbol_map.clear(); + symbol_id_map.clear(); + instruction_map.clear(); + storage_class_map.clear(); + pointer_type_map.clear(); + variable_type_map.clear(); + function_type_map.clear(); + + FunctionStack empty_fs; + function_stack.swap(empty_fs); + + BlockStack empty_bs; + block_stack.swap(empty_bs); + + scope_id = SpvInvalidId; + active_id = SpvInvalidId; + SpvId module_id = make_id(SpvModuleId); module = SpvModule::make(module_id); } SpvId SpvBuilder::reserve_id(SpvKind kind) { - return declare_id(kind); + return make_id(kind); } -SpvId SpvBuilder::declare_id(SpvKind kind) { +SpvId SpvBuilder::make_id(SpvKind kind) { // use type-agnostic non-overlapping increasing ids SpvId item_id = kind_map.size() + 1; + debug(3) << " make_id: %" << item_id << " kind=" << kind_name(kind) << "\n"; kind_map[item_id] = kind; return item_id; } -SpvKind SpvBuilder::kind_of(SpvId item_id) { +std::string SpvBuilder::kind_name(SpvKind kind) const { + switch(kind) { + case SpvInvalidItem: + { return "InvalidItem"; } + case SpvTypeId: + { return "TypeId"; } + case SpvVoidTypeId: + { return "VoidTypeId"; } + case SpvBoolTypeId: + { return "BoolTypeId"; } + case SpvIntTypeId: + { return "IntTypeId"; } + case SpvFloatTypeId: + { return "FloatTypeId"; } + case SpvVectorTypeId: + { return "VectorTypeId"; } + case SpvArrayTypeId: + { return "ArrayTypeId"; } + case SpvRuntimeArrayTypeId: + { return "RuntimeArrayTypeId"; } + case SpvStringTypeId: + { return "StringTypeId"; } + case SpvPointerTypeId: + { return "PointerTypeId"; } + case SpvStructTypeId: + { return "StructTypeId"; } + case SpvFunctionTypeId: + { return "FunctionTypeId"; } + case SpvAccessChainId: + { return "AccessChainId"; } + case SpvConstantId: + { return "ConstantId"; } + case SpvBoolConstantId: + { return "BoolConstantId"; } + case SpvIntConstantId: + { return "IntConstantId"; } + case SpvFloatConstantId: + { return "FloatConstantId"; } + case SpvStringConstantId: + { return "StringConstantId"; } + case SpvCompositeConstantId: + { return "CompositeConstantId"; } + case SpvResultId: + { return "ResultId"; } + case SpvVariableId: + { return "VariableId"; } + case SpvInstructionId: + { return "InstructionId"; } + case SpvFunctionId: + { return "FunctionId"; } + case SpvBlockId: + { return "BlockId"; } + case SpvLabelId: + { return "LabelId"; } + case SpvParameterId: + { return "ParameterId"; } + case SpvModuleId: + { return "ModuleId"; } + case SpvUnknownItem: + { return "UnknownItem"; } + default: + { 
return "InvalidItem"; } + }; + return "InvalidItem"; +} + +SpvKind SpvBuilder::kind_of(SpvId item_id) const { KindMap::const_iterator it = kind_map.find(item_id); if (it != kind_map.end()) { return SpvInvalidItem; @@ -619,97 +758,200 @@ SpvKind SpvBuilder::kind_of(SpvId item_id) { return it->second; } +SpvId SpvBuilder::type_of(SpvId variable_id) const { + VariableTypeMap::const_iterator it = variable_type_map.find(variable_id); + if (it != variable_type_map.end()) { + return SpvInvalidId; + } + return it->second; +} + +void SpvBuilder::finalize() { + SpvId last_id = (SpvId)(kind_map.size() + 1); + module.set_binding_count(last_id); + + if(module.is_capability_required(SpvCapabilityInt8)) { + module.require_extension("SPV_KHR_8bit_storage"); + } + + if(module.is_capability_required(SpvCapabilityInt16)) { + module.require_extension("SPV_KHR_16bit_storage"); + } +} + void SpvBuilder::encode(SpvBinary &binary) const { // Encode the module module.encode(binary); } -SpvId SpvBuilder::map_type(const Type &type, uint32_t array_size) { +SpvId SpvBuilder::declare_type(const Type &type, uint32_t array_size) { SpvId type_id = lookup_type(type, array_size); if (type_id == SpvInvalidId) { - type_id = declare_type(type, array_size); + type_id = add_type(type, array_size); } return type_id; } -SpvId SpvBuilder::map_pointer_type(const Type &type, SpvStorageClass storage_class) { +SpvId SpvBuilder::declare_pointer_type(const Type &type, SpvStorageClass storage_class) { SpvId ptr_type_id = lookup_pointer_type(type, storage_class); if (ptr_type_id == SpvInvalidId) { - ptr_type_id = declare_pointer_type(ptr_type_id, storage_class); + ptr_type_id = add_pointer_type(ptr_type_id, storage_class); } return ptr_type_id; } -SpvId SpvBuilder::map_pointer_type(SpvId type_id, SpvStorageClass storage_class) { +SpvId SpvBuilder::declare_pointer_type(SpvId type_id, SpvStorageClass storage_class) { SpvId ptr_type_id = lookup_pointer_type(type_id, storage_class); if (ptr_type_id == SpvInvalidId) { - ptr_type_id = declare_pointer_type(type_id, storage_class); + ptr_type_id = add_pointer_type(type_id, storage_class); } return ptr_type_id; } -SpvId SpvBuilder::map_function_type(SpvId return_type, const ParamTypes ¶m_types) { +SpvId SpvBuilder::declare_function_type(SpvId return_type, const ParamTypes ¶m_types) { SpvId type_id = lookup_function_type(return_type, param_types); if (type_id == SpvInvalidId) { - type_id = declare_function_type(return_type, param_types); + type_id = add_function_type(return_type, param_types); } return type_id; } -SpvId SpvBuilder::map_constant(const Type &type, const void *data) { +SpvId SpvBuilder::declare_function(const std::string& name, SpvId function_type) { + SpvId existing_id = lookup_id(name); + if(existing_id != SpvInvalidId) { + if(kind_of(existing_id) == SpvFunctionId) { + SpvFunction existing_func = lookup_function(existing_id); + if(existing_func.type_id() == function_type) { + return existing_id; + } + } + } + return add_function(name, function_type); +} + +SpvId SpvBuilder::declare_constant(const Type &type, const void *data) { SpvId result_id = lookup_constant(type, data); if (result_id == SpvInvalidId) { - result_id = declare_constant(type, data); + result_id = add_constant(type, data); } return result_id; } -void SpvBuilder::add_entry_point(const std::string &name, - SpvId func_id, SpvExecutionModel exec_model, +SpvId SpvBuilder::declare_symbol(const std::string& symbol, SpvId id, SpvId scope_id) { + SpvId existing_id = lookup_id(symbol); + if(existing_id != SpvInvalidId) { + 
SpvId existing_scope = lookup_scope(existing_id); + if(existing_scope == scope_id) { + return existing_id; + } + } + add_symbol(symbol, id, scope_id); + return id; +} + +SpvStorageClass SpvBuilder::lookup_storage_class(SpvId id) const { + SpvStorageClass result = SpvInvalidStorageClass; + StorageClassMap::const_iterator it = storage_class_map.find(id); + if (it != storage_class_map.end()) { + result = it->second; + } + return result; +} + +SpvId SpvBuilder::lookup_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id ) const { + SpvId existing_id = lookup_id(name); + if(existing_id != SpvInvalidId) { + if( (kind_of(existing_id) == SpvVariableId) && + (type_of(existing_id) == type_id) && + (lookup_storage_class(existing_id) == storage_class) && + (lookup_scope(existing_id) == scope_id)) { + return existing_id; + } + } + return SpvInvalidId; +} + +bool SpvBuilder::has_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id ) const { + return (lookup_variable(name, type_id, storage_class, scope_id ) != SpvInvalidId); +} + +SpvId SpvBuilder::declare_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId init_id) { + SpvId block_id = current_block().id(); + SpvId existing_id = lookup_variable(name, type_id, storage_class, block_id); + if(existing_id != SpvInvalidId) { + return existing_id; + } + + SpvId var_id = reserve_id(SpvVariableId); + debug(3) << " declare_variable: %" << var_id << " name='" << name << "' type_id=" << type_id << " storage_class=" << (uint32_t)storage_class << " init_id=" << init_id << "\n"; + current_block().add_variable(SpvFactory::variable(var_id, type_id, storage_class, init_id)); + declare_symbol(name, var_id, block_id); + storage_class_map[var_id] = storage_class; + return var_id; +} + +SpvId SpvBuilder::declare_global_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId init_id) { + SpvId var_id = reserve_id(SpvVariableId); + debug(3) << " declare_global_variable: %" << var_id << " name='" << name << "' type_id=" << type_id << " storage_class=" << (uint32_t)storage_class << " init_id=" << init_id << "\n"; + module.add_global(SpvFactory::variable(var_id, type_id, storage_class, init_id)); + declare_symbol(name, var_id, module.id()); + storage_class_map[var_id] = storage_class; + return var_id; +} + +void SpvBuilder::add_entry_point(SpvId func_id, SpvExecutionModel exec_model, const Variables &variables) { - SpvInstruction inst = SpvFactory::entry_point(exec_model, func_id, name, variables); - module.add_entry_point(name, inst); + const std::string& func_name = lookup_symbol(func_id); + if(func_name.empty()) { + internal_error << "SPIRV: Function missing name definition: " << func_id << "\n"; + } else { + debug(3) << " add_entry_point: %" << func_id << " func_name='" << func_name << "' exec_model=" << (uint32_t)exec_model << " variable_count=" << (uint32_t)variables.size() << "\n"; + SpvInstruction inst = SpvFactory::entry_point(exec_model, func_id, func_name, variables); + module.add_entry_point(func_name, inst); + } } -SpvFunction SpvBuilder::add_function(SpvId return_type_id, const ParamTypes ¶m_types) { - SpvId func_id = declare_id(SpvFunctionId); - SpvId func_type_id = map_function_type(return_type_id, param_types); +SpvId SpvBuilder::add_function(const std::string& name, SpvId return_type_id, const ParamTypes ¶m_types) { + SpvId func_id = make_id(SpvFunctionId); + SpvId func_type_id = 
declare_function_type(return_type_id, param_types); + + debug(3) << " add_function: %" << func_id << " func_type_id=" << func_type_id << " return_type_id=" << return_type_id << " parameter_count=" << (uint32_t)param_types.size() << "\n"; SpvFunction func = SpvFunction::make(func_type_id, func_id, return_type_id); for (SpvId param_type_id : param_types) { - SpvId param_id = declare_id(SpvParameterId); + SpvId param_id = make_id(SpvParameterId); SpvInstruction param_inst = SpvFactory::function_parameter(param_type_id, param_id); func.add_parameter(param_inst); map_instruction(param_inst); } - SpvId block_id = declare_id(SpvBlockId); + SpvId block_id = make_id(SpvBlockId); SpvBlock entry_block = SpvBlock::make(func, block_id); func.add_block(entry_block); module.add_function(func); function_map[func_id] = func; + declare_symbol(name, func_id, module.id()); map_instruction(func.declaration()); - return func; + return func_id; } -SpvId SpvBuilder::add_global_variable(SpvId type_id, uint32_t storage_class, SpvId init_id) { - SpvId var_id = reserve_id(SpvVariableId); - module.add_global(SpvFactory::variable(var_id, type_id, storage_class, init_id)); - return var_id; -} - -SpvId SpvBuilder::add_variable(SpvId type_id, uint32_t storage_class, SpvId init_id) { - SpvId var_id = reserve_id(SpvVariableId); - current_block().add_variable(SpvFactory::variable(var_id, type_id, storage_class, init_id)); - return var_id; -} void SpvBuilder::add_annotation(SpvId target_id, SpvDecoration decoration_type, const Literals &literals) { SpvInstruction inst = SpvFactory::decorate(target_id, decoration_type, literals); + debug(3) << " add_annotation: %" << target_id << " decoration_type=" << uint32_t(decoration_type) << " literals=["; + for(uint32_t v : literals) { + debug(3) << " " << v; + } + debug(3) << " ]\n"; current_module().add_annotation(inst); } void SpvBuilder::add_struct_annotation(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const Literals &literals) { SpvInstruction inst = SpvFactory::decorate_member(struct_type_id, member_index, decoration_type, literals); + debug(3) << " add_struct_annotation: %" << struct_type_id << " member_index=" << member_index << " decoration_type=" << uint32_t(decoration_type) << " literals=["; + for(uint32_t v : literals) { + debug(3) << " " << v; + } + debug(3) << " ]\n"; current_module().add_annotation(inst); } @@ -754,6 +996,41 @@ SpvFunction SpvBuilder::lookup_function(SpvId func_id) const { return func; } +std::string SpvBuilder::lookup_symbol(SpvId id) const { + std::string name; + IdSymbolMap::const_iterator it = id_symbol_map.find(id); + if (it != id_symbol_map.end()) { + name = it->second; + } + return name; +} + +SpvId SpvBuilder::lookup_id(const std::string& symbol) const { + SpvId result = SpvInvalidId; + SymbolIdMap::const_iterator it = symbol_id_map.find(symbol); + if (it != symbol_id_map.end()) { + result = it->second; + } + return result; +} + +void SpvBuilder::add_symbol(const std::string& symbol, SpvId id, SpvId scope_id) { + symbol_id_map[symbol] = id; + id_symbol_map[id] = symbol; + scope_map[id] = scope_id; + debug(3) << " add_symbol: %" << id << " symbol='" << symbol << "' scope_id=" << scope_id << "\n"; + module.add_debug_symbol(id, symbol); +} + +SpvId SpvBuilder::lookup_scope(SpvId id) const { + SpvId result = SpvInvalidId; + ScopeMap::const_iterator it = scope_map.find(id); + if (it != scope_map.end()) { + result = it->second; + } + return result; +} + void SpvBuilder::enter_function(const SpvFunction &func) { 
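+    // Push the function and make its entry block the current insertion point for subsequent instructions.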
function_stack.push(func); enter_block(func.entry_block()); @@ -777,18 +1054,42 @@ SpvFunction SpvBuilder::leave_function() { return func; } -void SpvBuilder::set_current_id(SpvId val) { - scope_id = val; +SpvId SpvBuilder::current_id() const { + return active_id; } -SpvId SpvBuilder::current_id() const { - return scope_id; +void SpvBuilder::update_id(SpvId id) { + active_id = id; } SpvModule SpvBuilder::current_module() const { return module; } +void SpvBuilder::set_source_language(SpvSourceLanguage val) { + module.set_source_language(val); +} + +void SpvBuilder::set_addressing_model(SpvAddressingModel val) { + module.set_addressing_model(val); +} + +void SpvBuilder::set_memory_model(SpvMemoryModel val) { + module.set_memory_model(val); +} + +SpvSourceLanguage SpvBuilder::source_language() const { + return module.source_language(); +} + +SpvAddressingModel SpvBuilder::addressing_model() const { + return module.addressing_model(); +} + +SpvMemoryModel SpvBuilder::memory_model() const { + return module.memory_model(); +} + void SpvBuilder::require_capability(SpvCapability capability) { if (!module.is_capability_required(capability)) { module.require_capability(capability); @@ -810,15 +1111,11 @@ bool SpvBuilder::is_extension_required(const std::string &extension) const { } SpvBuilder::TypeKey SpvBuilder::make_type_key(const Type &type, uint32_t array_size) const { - TypeKey key(4 + sizeof(uint32_t), ' '); - key[0] = type.code(); - key[1] = type.bits(); - key[2] = type.lanes() & 0xff; - key[3] = (type.lanes() >> 8) & 0xff; - for (size_t i = 0; i < sizeof(uint32_t); i++) { - key[i + 4] = (array_size & 0xff); - array_size >>= 8; - } + TypeKey key = hash_splitmix64(type.code()); + key = hash_combine(key, type.bits()); + key = hash_combine(key, type.lanes()); + key = hash_combine(key, type.bytes()); + key = hash_combine(key, array_size); return key; } @@ -831,7 +1128,7 @@ SpvId SpvBuilder::lookup_type(const Type &type, uint32_t array_size) const { return it->second; } -SpvId SpvBuilder::declare_type(const Type &type, uint32_t array_size) { +SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { SpvBuilder::TypeKey type_key = make_type_key(type, array_size); TypeMap::const_iterator it = type_map.find(type_key); if (it != type_map.end()) { @@ -839,8 +1136,9 @@ SpvId SpvBuilder::declare_type(const Type &type, uint32_t array_size) { } if (array_size > 1) { - SpvId array_type_id = declare_id(SpvArrayTypeId); - SpvId element_type_id = declare_type(type, 1); + SpvId array_type_id = make_id(SpvArrayTypeId); + SpvId element_type_id = add_type(type, 1); + debug(3) << " add_array_type: %" << array_type_id << " element_type_id='" << element_type_id << " array_size='" << array_size << "\n"; SpvInstruction inst = SpvFactory::array_type(array_type_id, element_type_id, array_size); module.add_type(inst); type_map[type_key] = array_type_id; @@ -849,28 +1147,45 @@ SpvId SpvBuilder::declare_type(const Type &type, uint32_t array_size) { SpvId type_id = SpvInvalidId; if (type.is_vector()) { - type_id = declare_id(SpvVectorTypeId); - SpvId element_type_id = declare_type(type.with_lanes(1)); + type_id = make_id(SpvVectorTypeId); + SpvId element_type_id = add_type(type.with_lanes(1)); + debug(3) << " add_vector_type: %" << type_id << " element_type_id='" << element_type_id << " lanes='" << type.lanes() << "\n"; SpvInstruction inst = SpvFactory::vector_type(type_id, element_type_id, type.lanes()); module.add_type(inst); } else { if (type.is_handle()) { - type_id = declare_id(SpvVoidTypeId); + type_id = 
make_id(SpvVoidTypeId); SpvInstruction inst = SpvFactory::void_type(type_id); + debug(3) << " add_void_type: %" << type_id << "\n"; module.add_type(inst); } else if (type.is_bool()) { - type_id = declare_id(SpvBoolTypeId); + type_id = make_id(SpvBoolTypeId); + debug(3) << " add_bool_type: %" << type_id << "\n"; SpvInstruction inst = SpvFactory::bool_type(type_id); module.add_type(inst); } else if (type.is_float()) { - type_id = declare_id(SpvFloatTypeId); + type_id = make_id(SpvFloatTypeId); + debug(3) << " add_float_type: %" << type_id << " bits=" << type.bits() << "\n"; SpvInstruction inst = SpvFactory::float_type(type_id, type.bits()); module.add_type(inst); + if(type.bits() == 16) { + module.require_capability(SpvCapabilityFloat16); + } else if(type.bits() == 64) { + module.require_capability(SpvCapabilityFloat64); + } } else if (type.is_int_or_uint()) { - type_id = declare_id(SpvIntTypeId); + type_id = make_id(SpvIntTypeId); SpvId signedness = type.is_uint() ? 0 : 1; + debug(3) << " add_integer_type: %" << type_id << " bits=" << type.bits() << " signed=" << (signedness ? "true" : "false") << "\n"; SpvInstruction inst = SpvFactory::integer_type(type_id, type.bits(), signedness); module.add_type(inst); + if(type.bits() == 8) { + module.require_capability(SpvCapabilityInt8); + } else if(type.bits() == 16) { + module.require_capability(SpvCapabilityInt16); + } else if(type.bits() == 64) { + module.require_capability(SpvCapabilityInt64); + } } else { internal_error << "SPIRV: Unsupported type " << type << "\n"; } @@ -880,41 +1195,59 @@ SpvId SpvBuilder::declare_type(const Type &type, uint32_t array_size) { return type_id; } +SpvId SpvBuilder::declare_void_type() { + return declare_type(Handle()); +} + SpvBuilder::TypeKey SpvBuilder::make_struct_type_key(const StructMemberTypes &member_type_ids) const { - TypeKey key(member_type_ids.size() * sizeof(SpvId), ' '); - uint32_t index = 0; + TypeKey key = hash_splitmix64(member_type_ids.size()); for (SpvId type_id : member_type_ids) { - for (size_t i = 0; i < sizeof(uint32_t); i++, index++) { - key[index] = (type_id & 0xff); - type_id >>= 8; - } + key = hash_combine(key, type_id); } return key; } -SpvId SpvBuilder::lookup_struct(const StructMemberTypes &member_type_ids) const { +SpvId SpvBuilder::lookup_struct(const std::string& struct_name, const StructMemberTypes &member_type_ids) const { TypeKey key = make_struct_type_key(member_type_ids); TypeMap::const_iterator it = struct_map.find(key); if (it != struct_map.end()) { - return it->second; + if(struct_name == lookup_symbol(it->second)) { + return it->second; + } } return SpvInvalidId; } -SpvId SpvBuilder::declare_struct(const StructMemberTypes &member_type_ids) { +SpvId SpvBuilder::add_struct(const std::string& struct_name, const StructMemberTypes &member_type_ids) { TypeKey key = make_struct_type_key(member_type_ids); TypeMap::const_iterator it = struct_map.find(key); if (it != struct_map.end()) { - return it->second; + if(struct_name == lookup_symbol(it->second)) { + return it->second; + } } - SpvId struct_type_id = declare_id(SpvStructTypeId); + SpvId struct_type_id = make_id(SpvStructTypeId); + debug(3) << " add_struct_type: %" << struct_type_id << " name=" << struct_name << " member_type_ids=["; + for(SpvId m : member_type_ids) { + debug(3) << " " << m; + } + debug(3) << " ]\n"; SpvInstruction inst = SpvFactory::struct_type(struct_type_id, member_type_ids); module.add_type(inst); struct_map[key] = struct_type_id; + add_symbol(struct_name, struct_type_id, module.id()); return 
struct_type_id; } +SpvId SpvBuilder::declare_struct(const std::string& struct_name, const StructMemberTypes &member_types) { + SpvId struct_id = lookup_struct(struct_name, member_types); + if (struct_id == SpvInvalidId) { + struct_id = add_struct(struct_name, member_types); + } + return struct_id; +} + SpvBuilder::PointerTypeKey SpvBuilder::make_pointer_type_key(const Type &type, SpvStorageClass storage_class) const { SpvId base_type_id = lookup_type(type); if (base_type_id == SpvInvalidId) { @@ -944,54 +1277,57 @@ SpvId SpvBuilder::lookup_pointer_type(SpvId base_type_id, SpvStorageClass storag return SpvInvalidId; } -SpvId SpvBuilder::declare_pointer_type(const Type &type, SpvStorageClass storage_class) { - SpvId base_type_id = map_type(type); - return declare_pointer_type(base_type_id, storage_class); +SpvId SpvBuilder::add_pointer_type(const Type &type, SpvStorageClass storage_class) { + SpvId base_type_id = declare_type(type); + debug(3) << " add_pointer_type: base_type=" << type << " base_type_id=" << base_type_id << " storage_class=" << (uint32_t)(storage_class) << "\n"; + return add_pointer_type(base_type_id, storage_class); } -SpvId SpvBuilder::declare_pointer_type(SpvId base_type_id, SpvStorageClass storage_class) { +SpvId SpvBuilder::add_pointer_type(SpvId base_type_id, SpvStorageClass storage_class) { PointerTypeKey key = make_pointer_type_key(base_type_id, storage_class); PointerTypeMap::const_iterator it = pointer_type_map.find(key); if (it != pointer_type_map.end()) { return it->second; } - SpvId pointer_type_id = declare_id(SpvPointerTypeId); + SpvId pointer_type_id = make_id(SpvPointerTypeId); + debug(3) << " add_pointer_type: %" << pointer_type_id << " base_type_id=" << base_type_id << " storage_class=" << (uint32_t)(storage_class) << "\n"; SpvInstruction inst = SpvFactory::pointer_type(pointer_type_id, storage_class, base_type_id); module.add_type(inst); pointer_type_map[key] = pointer_type_id; + storage_class_map[pointer_type_id] = storage_class; return pointer_type_id; } -SpvBuilder::ConstantKey SpvBuilder::make_constant_key(const Type &type, const void *data) const { - ConstantKey key(type.bytes() + 4, ' '); - key[0] = type.code(); - key[1] = type.bits(); - key[2] = type.lanes() & 0xff; - key[3] = (type.lanes() >> 8) & 0xff; - const char *data_char = (const char *)data; - for (int i = 0; i < type.bytes(); i++) { - key[i + 4] = data_char[i]; +SpvBuilder::ConstantKey SpvBuilder::make_constant_key( uint8_t code, uint8_t bits, int lanes, size_t bytes, const void* data) const { + ConstantKey key = hash_splitmix64(code); + key = hash_combine(key, bits); + key = hash_combine(key, lanes); + key = hash_combine(key, bytes); + + const int8_t* ptr = reinterpret_bits(data); + for(size_t i = 0; i < bytes; ++i) { + key = hash_combine(key, uint64_t(ptr[i])); } return key; } +SpvBuilder::ConstantKey SpvBuilder::make_constant_key(const Type &type, const void *data) const { + return make_constant_key(type.code(), type.bits(), type.lanes(), type.bytes(), data); +} + SpvBuilder::ConstantKey SpvBuilder::make_bool_constant_key(bool value) const { Type type = Bool(); bool data = value; return make_constant_key(type, &data); } +SpvBuilder::ConstantKey SpvBuilder::make_string_constant_key(const std::string& value) const { + return make_constant_key(halide_type_handle, 8, 1, value.length(), (const char*)(value.c_str())); +} + SpvBuilder::ConstantKey SpvBuilder::make_null_constant_key(const Type &type) const { - ConstantKey key(type.bytes() + 4, ' '); - key[0] = type.code(); - key[1] = 
type.bits(); - key[2] = type.lanes() & 0xff; - key[3] = (type.lanes() >> 8) & 0xff; - for (int i = 0; i < type.bytes(); i++) { - key[i + 4] = 0; - } - return key; + return make_constant_key(type.code(), type.bits(), type.lanes(), type.bytes(), nullptr); } SpvId SpvBuilder::lookup_null_constant(const Type &type) const { @@ -1010,8 +1346,10 @@ SpvId SpvBuilder::declare_null_constant(const Type &type) { return it->second; } - SpvId result_id = declare_id(SpvConstantId); - SpvId type_id = declare_type(type); + SpvId result_id = make_id(SpvConstantId); + SpvId type_id = add_type(type); + + debug(3) << " declare_null_constant: %" << result_id << " " << type << "\n"; SpvInstruction inst = SpvFactory::null_constant(result_id, type_id); module.add_constant(inst); constant_map[key] = result_id; @@ -1019,30 +1357,45 @@ SpvId SpvBuilder::declare_null_constant(const Type &type) { } SpvId SpvBuilder::declare_bool_constant(bool value) { - const std::string key = make_bool_constant_key(value); + ConstantKey key = make_bool_constant_key(value); ConstantMap::const_iterator it = constant_map.find(key); if (it != constant_map.end()) { return it->second; } - debug(3) << "declare_bool_constant for " << value << "\n"; - Type type = Bool(); - SpvId result_id = declare_id(SpvBoolConstantId); - SpvId type_id = declare_type(type); + SpvId result_id = make_id(SpvBoolConstantId); + SpvId type_id = add_type(type); + + debug(3) << " declare_bool_constant: %" << result_id << " bool " << value << "\n"; SpvInstruction inst = SpvFactory::bool_constant(result_id, type_id, value); module.add_constant(inst); constant_map[key] = result_id; return result_id; } +SpvId SpvBuilder::declare_string_constant(const std::string& value) { + ConstantKey key = make_string_constant_key(value); + ConstantMap::const_iterator it = constant_map.find(key); + if (it != constant_map.end()) { + return it->second; + } + + SpvId result_id = make_id(SpvStringConstantId); + debug(3) << " declare_string_constant: %" << result_id << " string '" << value << "'\n"; + SpvInstruction inst = SpvFactory::string_constant(result_id, value); + module.add_constant(inst); + constant_map[key] = result_id; + return result_id; +} + SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *data) { if (scalar_type.lanes() != 1) { internal_error << "SPIRV: Invalid type provided for scalar constant!" 
<< scalar_type << "\n"; return SpvInvalidId; } - const std::string constant_key = make_constant_key(scalar_type, data); + ConstantKey constant_key = make_constant_key(scalar_type, data); ConstantMap::const_iterator it = constant_map.find(constant_key); if (it != constant_map.end()) { return it->second; @@ -1053,21 +1406,19 @@ SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *d return declare_bool_constant(value); } - debug(3) << "declare_scalar_constant for type " << scalar_type << "\n"; - SpvId result_id = SpvInvalidId; if (scalar_type.is_float()) { - result_id = declare_id(SpvFloatConstantId); - } else if (scalar_type.is_bool()) { - result_id = declare_id(SpvBoolConstantId); + result_id = make_id(SpvFloatConstantId); + debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const float*)(data)) << "\n"; } else if (scalar_type.is_int_or_uint()) { - result_id = declare_id(SpvIntConstantId); + result_id = make_id(SpvIntConstantId); + debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const int32_t*)(data)) << "\n"; } else { internal_error << "SPIRV: Unsupported type:" << scalar_type << "\n"; return SpvInvalidId; } - SpvId type_id = declare_type(scalar_type); + SpvId type_id = add_type(scalar_type); SpvInstruction inst = SpvFactory::constant(result_id, type_id, scalar_type.bytes(), data); module.add_constant(inst); constant_map[constant_key] = result_id; @@ -1080,7 +1431,7 @@ SpvId SpvBuilder::declare_vector_constant(const Type &type, const void *data) { return SpvInvalidId; } - const std::string key = make_constant_key(type, data); + ConstantKey key = make_constant_key(type, data); ConstantMap::const_iterator it = constant_map.find(key); if (it != constant_map.end()) { return it->second; @@ -1132,8 +1483,9 @@ SpvId SpvBuilder::declare_vector_constant(const Type &type, const void *data) { return SpvInvalidId; } - SpvId result_id = declare_id(SpvCompositeConstantId); - SpvId type_id = declare_type(type); + SpvId result_id = make_id(SpvCompositeConstantId); + SpvId type_id = add_type(type); + debug(3) << " declare_vector_constant: %" << result_id << " key=" << key << " type=" << type << " data=" << data << "\n"; SpvInstruction inst = SpvFactory::composite_constant(result_id, type_id, components); module.add_constant(inst); constant_map[key] = result_id; @@ -1149,15 +1501,14 @@ SpvId SpvBuilder::lookup_constant(const Type &type, const void *data) const { return SpvInvalidId; } -SpvId SpvBuilder::declare_constant(const Type &type, const void *data) { +SpvId SpvBuilder::add_constant(const Type &type, const void *data) { - const std::string key = make_constant_key(type, data); + ConstantKey key = make_constant_key(type, data); ConstantMap::const_iterator it = constant_map.find(key); if (it != constant_map.end()) { return it->second; } - debug(3) << "declare_constant for type " << type << "\n"; if (type.lanes() == 1) { return declare_scalar_constant(type, data); } else { @@ -1166,7 +1517,7 @@ SpvId SpvBuilder::declare_constant(const Type &type, const void *data) { } SpvId SpvBuilder::declare_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId element_id, const Indices &indices) { - SpvId access_chain_id = declare_id(SpvAccessChainId); + SpvId access_chain_id = make_id(SpvAccessChainId); append(SpvFactory::in_bounds_access_chain(ptr_type_id, access_chain_id, base_id, element_id, indices)); return access_chain_id; } @@ -1190,18 +1541,9 @@ SpvInstruction 
SpvBuilder::lookup_instruction(SpvId result_id) const { } SpvBuilder::FunctionTypeKey SpvBuilder::make_function_type_key(SpvId return_type_id, const ParamTypes ¶m_type_ids) const { - TypeKey key((1 + param_type_ids.size()) * sizeof(SpvId), ' '); - - uint32_t index = 0; - for (size_t i = 0; i < sizeof(uint32_t); i++, index++) { - key[index] = (return_type_id & 0xff); - return_type_id >>= 8; - } + TypeKey key = hash_splitmix64(return_type_id); for (SpvId type_id : param_type_ids) { - for (size_t i = 0; i < sizeof(uint32_t); i++, index++) { - key[index] = (type_id & 0xff); - type_id >>= 8; - } + key = hash_combine(key, type_id); } return key; } @@ -1215,22 +1557,27 @@ SpvId SpvBuilder::lookup_function_type(SpvId return_type_id, const ParamTypes &p return SpvInvalidId; } -SpvId SpvBuilder::declare_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids) { +SpvId SpvBuilder::add_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids) { FunctionTypeKey func_type_key = make_function_type_key(return_type_id, param_type_ids); FunctionTypeMap::const_iterator it = function_type_map.find(func_type_key); if (it != function_type_map.end()) { return it->second; } - SpvId function_type_id = declare_id(SpvFunctionTypeId); + SpvId function_type_id = make_id(SpvFunctionTypeId); + debug(3) << " add_function_type: %" << function_type_id << " return_type_id=" << return_type_id << " param_type_ids=["; + for(SpvId p : param_type_ids) { + debug(3) << " " << p; + } + debug(3) << " ]\n"; SpvInstruction inst = SpvFactory::function_type(function_type_id, return_type_id, param_type_ids); module.add_type(inst); function_type_map[func_type_key] = function_type_id; return function_type_id; } -SpvId SpvBuilder::declare_runtime_array(SpvId base_type_id) { - SpvId runtime_array_id = declare_id(SpvRuntimeArrayTypeId); +SpvId SpvBuilder::add_runtime_array(SpvId base_type_id) { + SpvId runtime_array_id = make_id(SpvRuntimeArrayTypeId); SpvInstruction inst = SpvFactory::runtime_array_type(runtime_array_id, base_type_id); module.add_type(inst); return runtime_array_id; @@ -1248,12 +1595,39 @@ void SpvBuilder::append(SpvInstruction inst) { // -- Factory Methods for Specific Instructions +SpvInstruction SpvFactory::no_op(SpvId result_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpNop); + return inst; +} + SpvInstruction SpvFactory::label(SpvId result_id) { SpvInstruction inst = SpvInstruction::make(SpvOpLabel); inst.set_result_id(result_id); return inst; } +SpvInstruction SpvFactory::debug_line(SpvId string_id, uint32_t line, uint32_t column) { + SpvInstruction inst = SpvInstruction::make(SpvOpLine); + inst.add_operand(string_id); + inst.add_immediate(line); + inst.add_immediate(column); + return inst; +} + +SpvInstruction SpvFactory::debug_string(SpvId result_id, const std::string& string) { + SpvInstruction inst = SpvInstruction::make(SpvOpString); + inst.set_result_id(result_id); + inst.add_string(string); + return inst; +} + +SpvInstruction SpvFactory::debug_symbol(SpvId target_id, const std::string& symbol) { + SpvInstruction inst = SpvInstruction::make(SpvOpName); + inst.set_result_id(target_id); + inst.add_string(symbol); + return inst; +} + SpvInstruction SpvFactory::decorate(SpvId target_id, SpvDecoration decoration_type, const SpvFactory::Literals &literals) { SpvInstruction inst = SpvInstruction::make(SpvOpDecorate); inst.add_operand(target_id); @@ -1267,6 +1641,7 @@ SpvInstruction SpvFactory::decorate(SpvId target_id, SpvDecoration decoration_ty SpvInstruction 
SpvFactory::decorate_member(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const SpvFactory::Literals &literals) { SpvInstruction inst = SpvInstruction::make(SpvOpMemberDecorate); inst.add_operand(struct_type_id); + inst.add_immediate(member_index); inst.add_immediate(decoration_type); for (uint32_t l : literals) { inst.add_immediate(l); @@ -1368,8 +1743,8 @@ SpvInstruction SpvFactory::pointer_type(SpvId pointer_type_id, SpvStorageClass s SpvInstruction SpvFactory::function_type(SpvId function_type_id, SpvId return_type_id, const SpvFactory::ParamTypes ¶m_type_ids) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeFunction); - inst.set_type_id(return_type_id); inst.set_result_id(function_type_id); + inst.add_operand(return_type_id); for (SpvId type_id : param_type_ids) { inst.add_operand(type_id); } @@ -1399,6 +1774,13 @@ SpvInstruction SpvFactory::bool_constant(SpvId result_id, SpvId type_id, bool va return inst; } +SpvInstruction SpvFactory::string_constant(SpvId result_id, const std::string& value) { + SpvInstruction inst = SpvInstruction::make(SpvOpString); + inst.set_result_id(result_id); + inst.add_string(value); + return inst; +} + SpvInstruction SpvFactory::composite_constant(SpvId result_id, SpvId type_id, const SpvFactory::Components &components) { SpvInstruction inst = SpvInstruction::make(SpvOpConstantComposite); inst.set_type_id(type_id); @@ -1490,6 +1872,24 @@ SpvInstruction SpvFactory::logical_not(SpvId type_id, SpvId result_id, SpvId src return unary_op(SpvOpNot, type_id, result_id, src_id); } +SpvInstruction SpvFactory::shift_right_logical(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpShiftRightLogical); + inst.add_operand(type_id); + inst.add_operand(result_id); + inst.add_operand(src_id); + inst.add_operand(shift_id); + return inst; +} + +SpvInstruction SpvFactory::shift_right_arithmetic(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpShiftRightArithmetic); + inst.add_operand(type_id); + inst.add_operand(result_id); + inst.add_operand(src_id); + inst.add_operand(shift_id); + return inst; +} + SpvInstruction SpvFactory::multiply_extended(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { return binary_op(is_signed ? 
SpvOpSMulExtended : SpvOpUMulExtended, type_id, result_id, src_a_id, src_b_id); } @@ -1544,9 +1944,19 @@ SpvInstruction SpvFactory::composite_extract(SpvId type_id, SpvId result_id, Spv return inst; } -SpvInstruction SpvFactory::vector_insert_dynamic(SpvId result_id, SpvId vector_id, SpvId value_id, uint32_t index) { +SpvInstruction SpvFactory::composite_construct(SpvId type_id, SpvId result_id, const Components &constituents) { + SpvInstruction inst = SpvInstruction::make(SpvOpCompositeConstruct); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + for (SpvId id : constituents) { + inst.add_operand(id); + } + return inst; +} + +SpvInstruction SpvFactory::vector_insert_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, uint32_t index) { SpvInstruction inst = SpvInstruction::make(SpvOpVectorInsertDynamic); - inst.set_type_id(SpvOpTypeVector); + inst.set_type_id(type_id); inst.set_result_id(result_id); inst.add_operand(vector_id); inst.add_operand(value_id); @@ -1554,6 +1964,18 @@ SpvInstruction SpvFactory::vector_insert_dynamic(SpvId result_id, SpvId vector_i return inst; } +SpvInstruction SpvFactory::vector_shuffle(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, const Indices &indices) { + SpvInstruction inst = SpvInstruction::make(SpvOpVectorShuffle); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operand(src_a_id); + inst.add_operand(src_b_id); + for (SpvId i : indices) { + inst.add_immediate(i); + } + return inst; +} + SpvInstruction SpvFactory::bitcast(SpvId type_id, SpvId result_id, SpvId src_id) { SpvInstruction inst = SpvInstruction::make(SpvOpBitcast); inst.set_type_id(type_id); @@ -1566,6 +1988,10 @@ SpvInstruction SpvFactory::integer_add(SpvId type_id, SpvId result_id, SpvId src return binary_op(SpvOpIAdd, type_id, result_id, src_a_id, src_b_id); } +SpvInstruction SpvFactory::float_add(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { + return binary_op(SpvOpFAdd, type_id, result_id, src_a_id, src_b_id); +} + SpvInstruction SpvFactory::branch(SpvId target_label_id) { SpvInstruction inst = SpvInstruction::make(SpvOpBranch); inst.add_operand(target_label_id); @@ -1573,14 +1999,68 @@ SpvInstruction SpvFactory::branch(SpvId target_label_id) { } SpvInstruction SpvFactory::conditional_branch(SpvId condition_label_id, SpvId true_label_id, SpvId false_label_id, const SpvFactory::BranchWeights &weights) { - SpvInstruction inst = SpvInstruction::make(SpvOpBranch); + SpvInstruction inst = SpvInstruction::make(SpvOpBranchConditional); inst.add_operand(condition_label_id); inst.add_operand(true_label_id); inst.add_operand(false_label_id); for (uint32_t w : weights) { inst.add_immediate(w); } - return inst; + return inst; +} + +SpvInstruction SpvFactory::equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpIEqual); + inst.add_operand(type_id); + inst.add_operand(result_id); + inst.add_operand(src_a_id); + inst.add_operand(src_b_id); + return inst; +} + +SpvInstruction SpvFactory::not_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpINotEqual); + inst.add_operand(type_id); + inst.add_operand(result_id); + inst.add_operand(src_a_id); + inst.add_operand(src_b_id); + return inst; +} + +SpvInstruction SpvFactory::less_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { + SpvInstruction inst = SpvInstruction::make(is_signed ? 
SpvOpSLessThan : SpvOpULessThan); + inst.add_operand(type_id); + inst.add_operand(result_id); + inst.add_operand(src_a_id); + inst.add_operand(src_b_id); + return inst; +} + +SpvInstruction SpvFactory::less_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { + SpvInstruction inst = SpvInstruction::make(is_signed ? SpvOpSLessThanEqual : SpvOpULessThanEqual); + inst.add_operand(type_id); + inst.add_operand(result_id); + inst.add_operand(src_a_id); + inst.add_operand(src_b_id); + return inst; +} + +SpvInstruction SpvFactory::greater_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { + SpvInstruction inst = SpvInstruction::make(is_signed ? SpvOpSGreaterThan : SpvOpUGreaterThan); + inst.add_operand(type_id); + inst.add_operand(result_id); + inst.add_operand(src_a_id); + inst.add_operand(src_b_id); + return inst; +} + +SpvInstruction SpvFactory::greater_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { + SpvInstruction inst = SpvInstruction::make(is_signed ? SpvOpSGreaterThanEqual : SpvOpUGreaterThanEqual); + inst.add_operand(type_id); + inst.add_operand(result_id); + inst.add_operand(src_a_id); + inst.add_operand(src_b_id); + return inst; } SpvInstruction SpvFactory::loop_merge(SpvId merge_label_id, SpvId continue_label_id, uint32_t loop_control_mask) { @@ -1691,24 +2171,23 @@ void spirv_ir_test() { SpvInstruction void_inst = SpvFactory::void_type(void_type_id); builder.current_module().add_type(void_inst); - SpvId int_type_id = builder.map_type(Int(32)); - SpvId uint_type_id = builder.map_type(UInt(32)); - SpvId float_type_id = builder.map_type(Float(32)); + SpvId int_type_id = builder.declare_type(Int(32)); + SpvId uint_type_id = builder.declare_type(UInt(32)); + SpvId float_type_id = builder.declare_type(Float(32)); SpvBuilder::ParamTypes param_types = {int_type_id, uint_type_id, float_type_id}; - SpvFunction function = builder.add_function(void_type_id, param_types); + SpvId kernel_func_id = builder.add_function("kernel_func", void_type_id, param_types); + SpvFunction kernel_func = builder.lookup_function(kernel_func_id); - builder.enter_function(function); - SpvId intrinsic_type_id = builder.map_type(Type(Type::UInt, 32, 3)); - SpvId intrinsic_id = builder.add_global_variable(intrinsic_type_id, SpvStorageClassInput); + builder.enter_function(kernel_func); + SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); + SpvId intrinsic_id = builder.declare_global_variable("InputVar", intrinsic_type_id, SpvStorageClassInput); - SpvId output_type_id = builder.map_type(Type(Type::UInt, 32, 1)); - SpvId output_id = builder.add_global_variable(output_type_id, SpvStorageClassOutput); + SpvId output_type_id = builder.declare_type(Type(Type::UInt, 32, 1)); + SpvId output_id = builder.declare_global_variable("OutputVar", output_type_id, SpvStorageClassOutput); - SpvBuilder::Variables entry_point_variables; - entry_point_variables.push_back(intrinsic_id); - entry_point_variables.push_back(output_id); - builder.add_entry_point("entry_func", function.id(), SpvExecutionModelKernel, entry_point_variables); + SpvBuilder::Variables entry_point_variables = { intrinsic_id, output_id }; + builder.add_entry_point(kernel_func_id, SpvExecutionModelKernel, entry_point_variables); SpvBuilder::Literals annotation_literals = {SpvBuiltInWorkgroupId}; builder.add_annotation(intrinsic_id, SpvDecorationBuiltIn, annotation_literals); @@ -1717,7 +2196,7 @@ void spirv_ir_test() { 
builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id)); float float_value = 32.0f; - SpvId float_src_id = builder.declare_constant(Float(32), &float_value); + SpvId float_src_id = builder.add_constant(Float(32), &float_value); SpvId converted_value_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::convert(SpvOpConvertFToU, uint_type_id, converted_value_id, float_src_id)); builder.append(SpvFactory::store(output_id, converted_value_id)); diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 0c3356162820..448afa7d3449 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -78,6 +78,7 @@ enum SpvKind { using SpvId = uint32_t; using SpvBinary = std::vector; +static constexpr SpvStorageClass SpvInvalidStorageClass = SpvStorageClassMax; // sentinel for invalid storage class static constexpr SpvId SpvInvalidId = SpvId(-1); static constexpr SpvId SpvNoResult = 0; static constexpr SpvId SpvNoType = 0; @@ -229,7 +230,8 @@ class SpvModule { SpvModule(SpvModule &&) = default; SpvModule &operator=(SpvModule &&) = default; - void add_debug(const SpvInstruction &val); + void add_debug_string(SpvId result_id, const std::string& string); + void add_debug_symbol(SpvId id, const std::string& symbol); void add_annotation(const SpvInstruction &val); void add_type(const SpvInstruction &val); void add_constant(const SpvInstruction &val); @@ -245,12 +247,15 @@ class SpvModule { void set_source_language(SpvSourceLanguage val); void set_addressing_model(SpvAddressingModel val); void set_memory_model(SpvMemoryModel val); + void set_binding_count(SpvId count); + SpvSourceLanguage source_language() const; SpvAddressingModel addressing_model() const; SpvMemoryModel memory_model() const; SpvInstruction entry_point(const std::string &name) const; EntryPointNames entry_point_names() const; const Instructions &execution_modes() const; + SpvId binding_count() const; SpvModule module() const; bool is_capability_required(SpvCapability val) const; @@ -288,21 +293,15 @@ class SpvBuilder { SpvBuilder &operator=(const SpvBuilder &) = delete; SpvId reserve_id(SpvKind = SpvResultId); - SpvKind kind_of(SpvId id); - SpvId map_type(const Type &type, uint32_t array_size = 1); - SpvId map_pointer_type(const Type &type, SpvStorageClass storage_class); - SpvId map_pointer_type(SpvId type_id, SpvStorageClass storage_class); - SpvId map_constant(const Type &type, const void *data); - SpvId map_null_constant(const Type &type); - SpvId map_bool_constant(bool value); - SpvId map_function_type(SpvId return_type, const ParamTypes ¶m_types = {}); + SpvKind kind_of(SpvId id) const; + std::string kind_name(SpvKind kind) const; + SpvId type_of(SpvId variable_id) const; + SpvId declare_void_type(); SpvId declare_type(const Type &type, uint32_t array_size = 1); - SpvId declare_struct(const StructMemberTypes &member_types); - SpvId declare_runtime_array(SpvId base_type_id); SpvId declare_pointer_type(const Type &type, SpvStorageClass storage_class); - SpvId declare_pointer_type(SpvId base_type_id, SpvStorageClass storage_class); + SpvId declare_pointer_type(SpvId type_id, SpvStorageClass storage_class); SpvId declare_constant(const Type &type, const void *data); SpvId declare_null_constant(const Type &type); SpvId declare_bool_constant(bool value); @@ -310,20 +309,26 @@ class SpvBuilder { SpvId declare_scalar_constant(const Type &type, const void *data); SpvId declare_vector_constant(const Type &type, const void *data); SpvId declare_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId element_id, const Indices 
&indices); - SpvId declare_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids); - - SpvFunction add_function(SpvId return_type, const ParamTypes ¶m_types = {}); + SpvId declare_function_type(SpvId return_type, const ParamTypes ¶m_types = {}); + SpvId declare_function(const std::string& name, SpvId function_type); + SpvId declare_struct(const std::string& name, const StructMemberTypes &member_types); + SpvId declare_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId initializer_id = SpvInvalidId); + SpvId declare_global_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId initializer_id = SpvInvalidId); + + SpvId add_type(const Type &type, uint32_t array_size = 1); + SpvId add_struct(const std::string& name, const StructMemberTypes &member_types); + SpvId add_runtime_array(SpvId base_type_id); + SpvId add_pointer_type(const Type &type, SpvStorageClass storage_class); + SpvId add_pointer_type(SpvId base_type_id, SpvStorageClass storage_class); + SpvId add_constant(const Type &type, const void *data); + SpvId add_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids); + SpvId add_function(const std::string& name, SpvId return_type, const ParamTypes ¶m_types = {}); SpvId add_instruction(SpvInstruction val); + void add_annotation(SpvId target_id, SpvDecoration decoration_type, const Literals &literals = {}); void add_struct_annotation(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const Literals &literals = {}); - SpvId add_variable(SpvId type_id, uint32_t storage_class, SpvId initializer_id = SpvInvalidId); - SpvId add_global_variable(SpvId type_id, uint32_t storage_class, SpvId initializer_id = SpvInvalidId); - - SpvId map_struct(const StructMemberTypes &member_types); - - void add_entry_point(const std::string &name, - SpvId func_id, SpvExecutionModel exec_model, + void add_entry_point(SpvId func_id, SpvExecutionModel exec_model, const Variables &variables = {}); void add_execution_mode_local_size(SpvId entry_point_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z); @@ -331,10 +336,16 @@ class SpvBuilder { void set_source_language(SpvSourceLanguage val); void set_addressing_model(SpvAddressingModel val); void set_memory_model(SpvMemoryModel val); - + SpvSourceLanguage source_language() const; SpvAddressingModel addressing_model() const; SpvMemoryModel memory_model() const; + + void require_extension(const std::string &extension); + void require_capability(SpvCapability); + + bool is_extension_required(const std::string &extension) const; + bool is_capability_required(SpvCapability) const; void enter_block(const SpvBlock &block); SpvBlock current_block() const; @@ -345,43 +356,53 @@ class SpvBuilder { SpvFunction current_function() const; SpvFunction leave_function(); - void set_current_id(SpvId id); SpvId current_id() const; + void update_id(SpvId id); - SpvModule current_module() const; - - void require_extension(const std::string &extension); - void require_capability(SpvCapability); + SpvStorageClass lookup_storage_class(SpvId id) const; + SpvId lookup_id(const std::string& symbol) const; + SpvId lookup_scope(SpvId id) const; + std::string lookup_symbol(SpvId id) const; + SpvId declare_symbol(const std::string& symbol, SpvId id, SpvId scope_id); + void add_symbol(const std::string& symbol, SpvId id, SpvId scope_id); - bool is_extension_required(const std::string &extension) const; - bool is_capability_required(SpvCapability) const; + SpvModule 
current_module() const; void append(SpvInstruction inst); + void finalize(); void encode(SpvBinary &binary) const; + void reset(); protected: - using TypeKey = std::string; + + using TypeKey = uint64_t; using TypeMap = std::unordered_map; using KindMap = std::unordered_map; using PointerTypeKey = std::pair; using PointerTypeMap = std::map; - using ConstantKey = std::string; + using VariableTypeMap = std::unordered_map; + using StorageClassMap = std::unordered_map; + using ConstantKey = uint64_t; using ConstantMap = std::unordered_map; using StringMap = std::unordered_map; + using ScopeMap = std::unordered_map; + using IdSymbolMap = std::unordered_map; + using SymbolIdMap = std::unordered_map; using InstructionMap = std::unordered_map; - using FunctionTypeKey = std::string; + using FunctionTypeKey = uint64_t; using FunctionTypeMap = std::unordered_map; using FunctionMap = std::unordered_map; using FunctionStack = std::stack; using BlockStack = std::stack; + using IdStack = std::stack; - SpvId declare_id(SpvKind kind); + SpvId make_id(SpvKind kind); TypeKey make_type_key(const Type &type, uint32_t array_size = 1) const; SpvId lookup_type(const Type &type, uint32_t array_size = 1) const; TypeKey make_struct_type_key(const StructMemberTypes &member_types) const; - SpvId lookup_struct(const StructMemberTypes &member_types) const; + SpvId lookup_struct(const std::string& name, const StructMemberTypes &member_types) const; PointerTypeKey make_pointer_type_key(const Type &type, SpvStorageClass storage_class) const; SpvId lookup_pointer_type(const Type &type, SpvStorageClass storage_class) const; @@ -390,7 +411,8 @@ class SpvBuilder { SpvId lookup_pointer_type(SpvId base_type_id, SpvStorageClass storage_class) const; ConstantKey make_bool_constant_key(bool value) const; - + ConstantKey make_string_constant_key(const std::string& value) const; + ConstantKey make_constant_key( uint8_t code, uint8_t bits, int lanes, size_t bytes, const void* data) const; ConstantKey make_constant_key(const Type &type, const void *data) const; SpvId lookup_constant(const Type &type, const void *data) const; @@ -401,19 +423,28 @@ class SpvBuilder { SpvInstruction lookup_instruction(SpvId result_id) const; bool has_instruction(SpvId inst) const; + SpvId lookup_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id ) const; + bool has_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id ) const; + FunctionTypeKey make_function_type_key(SpvId return_type_id, const ParamTypes ¶m_type_ids) const; SpvId lookup_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids) const; SpvId scope_id = SpvInvalidId; + SpvId active_id = SpvInvalidId; SpvModule module; KindMap kind_map; TypeMap type_map; TypeMap struct_map; + ScopeMap scope_map; StringMap string_map; ConstantMap constant_map; FunctionMap function_map; + IdSymbolMap id_symbol_map; + SymbolIdMap symbol_id_map; InstructionMap instruction_map; + StorageClassMap storage_class_map; PointerTypeMap pointer_type_map; + VariableTypeMap variable_type_map; FunctionTypeMap function_type_map; FunctionStack function_stack; BlockStack block_stack; @@ -431,10 +462,14 @@ struct SpvFactory { using VariableBlockIdPair = std::pair; // (Variable Id, Block Id) using BlockVariables = std::vector; + static SpvInstruction no_op(SpvId result_id); static SpvInstruction capability(const SpvCapability &capability); static SpvInstruction extension(const std::string &extension); static SpvInstruction import(const 
std::string &import); static SpvInstruction label(SpvId result_id); + static SpvInstruction debug_line(SpvId string_id, uint32_t line, uint32_t column); + static SpvInstruction debug_string(SpvId result_id, const std::string& string); + static SpvInstruction debug_symbol(SpvId target_id, const std::string& symbol); static SpvInstruction decorate(SpvId target_id, SpvDecoration decoration_type, const Literals &literals = {}); static SpvInstruction decorate_member(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const Literals &literals = {}); static SpvInstruction void_type(SpvId void_type_id); @@ -450,6 +485,7 @@ struct SpvFactory { static SpvInstruction constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data); static SpvInstruction null_constant(SpvId result_id, SpvId type_id); static SpvInstruction bool_constant(SpvId result_id, SpvId type_id, bool value); + static SpvInstruction string_constant(SpvId result_id, const std::string& value); static SpvInstruction composite_constant(SpvId result_id, SpvId type_id, const Components &components); static SpvInstruction variable(SpvId result_id, SpvId result_type_id, uint32_t storage_class, SpvId initializer_id = SpvInvalidId); static SpvInstruction function(SpvId return_type_id, SpvId func_id, uint32_t control_mask, SpvId func_type_id); @@ -461,16 +497,27 @@ struct SpvFactory { static SpvInstruction exec_mode_local_size(SpvId function_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z); static SpvInstruction control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, uint32_t semantics_mask); static SpvInstruction logical_not(SpvId type_id, SpvId result_id, SpvId src_id); + static SpvInstruction shift_right_logical(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id); + static SpvInstruction shift_right_arithmetic(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id); static SpvInstruction multiply_extended(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); static SpvInstruction select(SpvId type_id, SpvId result_id, SpvId condition_id, SpvId true_id, SpvId false_id); static SpvInstruction in_bounds_access_chain(SpvId type_id, SpvId result_id, SpvId base_id, SpvId element_id, const Indices &indices); static SpvInstruction load(SpvId type_id, SpvId result_id, SpvId ptr_id, uint32_t access_mask = 0x0); static SpvInstruction store(SpvId ptr_id, SpvId obj_id, uint32_t access_mask = 0x0); - static SpvInstruction vector_insert_dynamic(SpvId result_id, SpvId vector_id, SpvId value_id, uint32_t index); + static SpvInstruction vector_insert_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, uint32_t index); + static SpvInstruction vector_shuffle(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, const Indices &indices); static SpvInstruction composite_extract(SpvId type_id, SpvId result_id, SpvId composite_id, const Indices &indices); + static SpvInstruction composite_construct(SpvId type_id, SpvId result_id, const Components &constituents); static SpvInstruction bitcast(SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction integer_add(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); + static SpvInstruction float_add(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); static SpvInstruction branch(SpvId target_label_id); + static SpvInstruction equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); + static SpvInstruction not_equal(SpvId type_id, SpvId 
result_id, SpvId src_a_id, SpvId src_b_id); + static SpvInstruction less_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); + static SpvInstruction less_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); + static SpvInstruction greater_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); + static SpvInstruction greater_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); static SpvInstruction conditional_branch(SpvId condition_label_id, SpvId true_label_id, SpvId false_label_id, const BranchWeights &weights = {}); static SpvInstruction loop_merge(SpvId merge_label_id, SpvId continue_label_id, uint32_t loop_control_mask = SpvLoopControlMaskNone); static SpvInstruction selection_merge(SpvId merge_label_id, uint32_t selection_control_mask = SpvSelectionControlMaskNone); @@ -536,6 +583,7 @@ struct SpvModuleContents { mutable RefCount ref_count; SpvId module_id = SpvInvalidId; + SpvId binding_count = 0; SpvSourceLanguage source_language = SpvSourceLanguageUnknown; SpvAddressingModel addressing_model = SpvAddressingModelLogical; SpvMemoryModel memory_model = SpvMemoryModelSimple; @@ -544,7 +592,8 @@ struct SpvModuleContents { Imports imports; EntryPoints entry_points; Instructions execution_modes; - Instructions debug; + Instructions debug_source; + Instructions debug_symbols; Instructions annotations; Instructions types; Instructions constants; From 9d6112ea92db420d6e176a01c79c884022b674ea Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 26 Sep 2022 15:44:09 -0700 Subject: [PATCH 017/166] Clang tidy/format pass. --- src/CodeGen_Vulkan_Dev.cpp | 158 +++++++++--------- src/SpirvIR.cpp | 330 ++++++++++++++++++++----------------- src/SpirvIR.h | 47 +++--- 3 files changed, 279 insertions(+), 256 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 09f0e3f8a365..8272aac6b0a4 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -14,8 +14,8 @@ #include "IROperator.h" #include "IRPrinter.h" #include "Scope.h" -#include "Target.h" #include "SpirvIR.h" +#include "Target.h" // Temporary: #include @@ -121,13 +121,13 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // The SPIRV-IR builder SpvBuilder builder; - + // Top-level function for adding kernels void add_kernel(const Stmt &s, const std::string &name, const std::vector &args); void init_module(); - void compile(std::vector& binary); + void compile(std::vector &binary); - // Scalarize expressions + // Scalarize expressions void scalarize(const Expr &e); SpvId map_type_to_pair(const Type &t); @@ -149,7 +149,6 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { std::string current_kernel_name; }; - void CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(const Expr &e) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(): " << (Expr)e << "\n"; internal_assert(e.type().is_vector()) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize must be called with an expression of vector type.\n"; @@ -170,7 +169,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(const Expr &e) { SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::map_type_to_pair(const Type &t) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::map_type_to_pair(): " << t << "\n"; SpvId base_type_id = builder.declare_type(t); - const std::string& type_name = type_to_c_type(t, false, false) + std::string("_pair"); + const std::string &type_name = type_to_c_type(t, false, false) + 
std::string("_pair"); SpvBuilder::StructMemberTypes member_type_ids = {base_type_id, base_type_id}; SpvId struct_type_id = builder.declare_struct(type_name, member_type_ids); return struct_type_id; @@ -184,46 +183,46 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Variable *var) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IntImm *imm) { - if(imm->type.bits() == 8) { + if (imm->type.bits() == 8) { const int8_t value = (int8_t)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); - } else if(imm->type.bits() == 16) { + } else if (imm->type.bits() == 16) { const int16_t value = (int16_t)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); - } else if(imm->type.bits() == 32) { + } else if (imm->type.bits() == 32) { const int32_t value = (int32_t)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); - } else if(imm->type.bits() == 64) { + } else if (imm->type.bits() == 64) { const int64_t value = (int64_t)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); } else { - internal_error << "Vulkan backend currently only supports 8-bit, 16-bit, 32-bit or 64-bit signed integers!\n"; + internal_error << "Vulkan backend currently only supports 8-bit, 16-bit, 32-bit or 64-bit signed integers!\n"; } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const UIntImm *imm) { - if(imm->type.bits() == 8) { + if (imm->type.bits() == 8) { const uint8_t value = (uint8_t)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); - } else if(imm->type.bits() == 16) { + } else if (imm->type.bits() == 16) { const uint16_t value = (uint16_t)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); - } else if(imm->type.bits() == 32) { + } else if (imm->type.bits() == 32) { const uint32_t value = (uint32_t)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); - } else if(imm->type.bits() == 64) { + } else if (imm->type.bits() == 64) { const uint64_t value = (uint64_t)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); } else { - internal_error << "Vulkan backend currently only supports 8-bit, 16-bit, 32-bit or 64-bit unsigned integers!\n"; + internal_error << "Vulkan backend currently only supports 8-bit, 16-bit, 32-bit or 64-bit unsigned integers!\n"; } } @@ -233,16 +232,16 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const StringImm *imm) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { - if(imm->type.bits() == 32) { + if (imm->type.bits() == 32) { const float value = (float)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); - } else if(imm->type.bits() == 64) { + } else if (imm->type.bits() == 64) { const double value = (double)(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); } else { - internal_error << "Vulkan backend currently only supports 32-bit or 64-bit floats\n"; + internal_error << "Vulkan backend currently only supports 32-bit or 64-bit floats\n"; } } @@ -293,22 +292,22 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Add *op) { - 
debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Add): " << op->type << " ((" << op->a << ") + (" << op->b << "))\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Add): " << op->type << " ((" << op->a << ") + (" << op->b << "))\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFAdd : SpvOpIAdd); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Sub *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Sub): " << op->type << " ((" << op->a << ") - (" << op->b << "))\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Sub): " << op->type << " ((" << op->a << ") - (" << op->b << "))\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFSub : SpvOpISub); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mul *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mul): " << op->type << " ((" << op->a << ") * (" << op->b << "))\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mul): " << op->type << " ((" << op->a << ") * (" << op->b << "))\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFMul : SpvOpIMul); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Div): " << op->type << " ((" << op->a << ") / (" << op->b << "))\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Div): " << op->type << " ((" << op->a << ") / (" << op->b << "))\n"; user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n"; if (op->type.is_float()) { @@ -320,7 +319,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; if (op->type.is_float()) { // Takes sign of result from op->b visit_binop(op->type, op->a, op->b, SpvOpFMod); @@ -331,7 +330,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Max *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Max): " << op->type << " Max((" << op->a << "), (" << op->b << "))\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Max): " << op->type << " Max((" << op->a << "), (" << op->b << "))\n"; std::string a_name = unique_name('a'); std::string b_name = unique_name('b'); @@ -343,7 +342,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Max *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Min): " << op->type << " Min((" << op->a << "), (" << op->b << "))\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Min): " << op->type << " Min((" << op->a << "), (" << op->b << "))\n"; std::string a_name = unique_name('a'); std::string b_name = unique_name('b'); Expr a = Variable::make(op->a.type(), a_name); @@ -354,17 +353,17 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(EQ): " << op->type << " (" << op->a << ") == (" << op->b << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(EQ): " << op->type << " (" << op->a << ") == (" << op->b << ")\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? 
SpvOpFOrdEqual : SpvOpIEqual); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const NE *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(NE): " << op->type << " (" << op->a << ") != (" << op->b << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(NE): " << op->type << " (" << op->a << ") != (" << op->b << ")\n"; visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFOrdNotEqual : SpvOpINotEqual); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LT): " << op->type << " (" << op->a << ") < (" << op->b << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LT): " << op->type << " (" << op->a << ") < (" << op->b << ")\n"; SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { op_code = SpvOpFOrdLessThan; @@ -379,7 +378,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LE): " << op->type << " (" << op->a << ") <= (" << op->b << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LE): " << op->type << " (" << op->a << ") <= (" << op->b << ")\n"; SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { op_code = SpvOpFOrdLessThanEqual; @@ -394,7 +393,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(GT): " << op->type << " (" << op->a << ") > (" << op->b << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(GT): " << op->type << " (" << op->a << ") > (" << op->b << ")\n"; SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { op_code = SpvOpFOrdGreaterThan; @@ -409,7 +408,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(GE): " << op->type << " (" << op->a << ") >= (" << op->b << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(GE): " << op->type << " (" << op->a << ") >= (" << op->b << ")\n"; SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { op_code = SpvOpFOrdGreaterThanEqual; @@ -424,17 +423,17 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const And *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(And): " << op->type << " (" << op->a << ") && (" << op->b << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(And): " << op->type << " (" << op->a << ") && (" << op->b << ")\n"; visit_binop(op->type, op->a, op->b, SpvOpLogicalAnd); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Or *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Or): " << op->type << " (" << op->a << ") || (" << op->b << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Or): " << op->type << " (" << op->a << ") || (" << op->b << ")\n"; visit_binop(op->type, op->a, op->b, SpvOpLogicalOr); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Not *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Not): " << op->type << " !(" << op->a << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Not): " << op->type << " !(" << op->a << ")\n"; SpvId type_id = builder.declare_type(op->type); op->a.accept(this); @@ -452,7 +451,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { // right memory barrier. 
Might be able to use // SpvMemorySemanticsMaskNone instead. SpvId current_function_id = builder.current_function().id(); - builder.append(SpvFactory::control_barrier(current_function_id, current_function_id, + builder.append(SpvFactory::control_barrier(current_function_id, current_function_id, SpvMemorySemanticsAcquireReleaseMask)); } else if (op->is_intrinsic(Call::bitwise_and)) { internal_assert(op->args.size() == 2); @@ -532,7 +531,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { // TODO: This code depends on compilation happening on a little-endian host. SpvId shift_amount_id = builder.declare_constant(shift->type, &shift->value); result_id = builder.reserve_id(SpvResultId); - if(op->type.is_uint()) { + if (op->type.is_uint()) { builder.append(SpvFactory::shift_right_logical(type_id, result_id, high_item_id, shift_amount_id)); } else { builder.append(SpvFactory::shift_right_arithmetic(type_id, result_id, high_item_id, shift_amount_id)); @@ -549,7 +548,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Select *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Select): " << op->type << " (" << op->condition << ") ? (" << op->true_value << ") : (" << op->false_value << ")\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Select): " << op->type << " (" << op->condition << ") ? (" << op->true_value << ") : (" << op->false_value << ")\n"; SpvId type_id = builder.declare_type(op->type); op->condition.accept(this); SpvId cond_id = builder.current_id(); @@ -722,7 +721,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { SpvId unsigned_gpu_var_id = builder.reserve_id(SpvResultId); SpvId signed_type_id = builder.declare_type(Int(32)); SpvId signed_gpu_var_id = builder.reserve_id(SpvResultId); - SpvFactory::Indices indices = { intrinsic.second }; + SpvFactory::Indices indices = {intrinsic.second}; builder.append(SpvFactory::composite_extract(unsigned_type_id, unsigned_gpu_var_id, intrinsic_id, indices)); builder.append(SpvFactory::bitcast(signed_type_id, signed_gpu_var_id, unsigned_gpu_var_id)); { @@ -804,10 +803,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Ramp *op) { SpvFactory::Components constituents = {base_id}; for (int i = 1; i < op->lanes; i++) { SpvId this_id = builder.reserve_id(SpvResultId); - if(op->base.type().is_float()) { + if (op->base.type().is_float()) { builder.append(SpvFactory::float_add(base_type_id, this_id, prev_id, stride_id)); - } - else { + } else { builder.append(SpvFactory::integer_add(base_type_id, this_id, prev_id, stride_id)); } constituents.push_back(this_id); @@ -849,7 +847,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Realize *) { template SpvFactory::BlockVariables CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, - StmtOrExpr then_case, StmtOrExpr else_case) { + StmtOrExpr then_case, StmtOrExpr else_case) { condition.accept(this); SpvId cond_id = builder.current_id(); SpvId then_label_id = builder.reserve_id(SpvLabelId); @@ -860,22 +858,21 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, builder.append(SpvFactory::selection_merge(merge_label_id, SpvSelectionControlMaskNone)); builder.append(SpvFactory::conditional_branch(cond_id, then_label_id, else_label_id)); - // Then block + // Then block builder.append(SpvFactory::label(then_label_id)); then_case.accept(this); SpvId then_id = builder.current_id(); builder.append(SpvFactory::branch(merge_label_id)); 
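    // Shape of the structured selection being emitted here (a sketch only; it
    // assumes selection_merge / conditional_branch / label / branch wrap the
    // OpSelectionMerge, OpBranchConditional, OpLabel and OpBranch instructions):
    //
    //          OpSelectionMerge %merge None
    //          OpBranchConditional %cond %then %else
    //   %then = OpLabel
    //          ... then_case ...
    //          OpBranch %merge
    //   %else = OpLabel
    //          ... else_case (when defined) ...
    //          OpBranch %merge
    //   %merge = OpLabel
    //
    // block_vars below records a (result id, label id) pair for each branch so
    // the caller can merge the per-branch values afterwards (e.g. with a phi).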
SpvFactory::BlockVariables block_vars = { - { then_id, then_label_id } - }; + {then_id, then_label_id}}; // Else block (optional) builder.append(SpvFactory::label(else_label_id)); - if(else_case.defined()) { + if (else_case.defined()) { else_case.accept(this); SpvId else_id = builder.current_id(); - block_vars.push_back({ else_id, else_label_id }); + block_vars.push_back({else_id, else_label_id}); } builder.append(SpvFactory::branch(merge_label_id)); @@ -928,7 +925,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_binop(Type t, const Expr &a, const SpvId src_a_id = builder.current_id(); b.accept(this); SpvId src_b_id = builder.current_id(); - + SpvId result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::binary_op(op_code, type_id, result_id, src_a_id, src_b_id)); builder.update_id(result_id); @@ -939,13 +936,13 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { builder.reset(); // NOTE: Source language is irrelevant. We encode the binary directly - builder.set_source_language(SpvSourceLanguageUnknown); + builder.set_source_language(SpvSourceLanguageUnknown); // TODO: Should we autodetect and/or force 32bit or 64bit? - builder.set_addressing_model(SpvAddressingModelLogical); - + builder.set_addressing_model(SpvAddressingModelLogical); + // TODO: Is there a better memory model to use? - builder.set_memory_model(SpvMemoryModelGLSL450); + builder.set_memory_model(SpvMemoryModelGLSL450); // Capabilities builder.require_capability(SpvCapabilityShader); @@ -959,13 +956,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { builder.finalize(); builder.encode(spirv_binary); module.reserve(spirv_binary.size() * sizeof(uint32_t)); - module.insert(module.end(), (const char*)spirv_binary.data(), (const char*)(spirv_binary.data() + spirv_binary.size())); + module.insert(module.end(), (const char *)spirv_binary.data(), (const char *)(spirv_binary.data() + spirv_binary.size())); } - void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, - const std::string &name, - const std::vector &args) { + const std::string &name, + const std::vector &args) { debug(2) << "Adding Vulkan kernel " << name << "\n"; // Add function definition @@ -976,31 +972,31 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, workgroup_size[1] = 0; workgroup_size[2] = 0; - // Declare the kernel function + // Declare the kernel function SpvId void_type_id = builder.declare_void_type(); SpvId kernel_func_id = builder.add_function(name, void_type_id); SpvFunction kernel_func = builder.lookup_function(kernel_func_id); builder.enter_function(kernel_func); builder.enter_block(kernel_func.entry_block()); -// SpvId start_label_id = kernel_func.entry_block().id(); + // SpvId start_label_id = kernel_func.entry_block().id(); // TODO: only add the SIMT intrinsics used SpvFactory::Variables entry_point_variables; auto intrinsics = {"WorkgroupId", "LocalInvocationId"}; - for (const std::string& intrinsic_name : intrinsics) { + for (const std::string &intrinsic_name : intrinsics) { // The builtins are pointers to vec3 SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, SpvStorageClassInput); SpvId intrinsic_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, SpvStorageClassInput); SpvId intrinsic_loaded_id = builder.reserve_id(); - builder.append( SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id) ); + 
builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id)); symbol_table.push(intrinsic_name, {intrinsic_loaded_id, SpvStorageClassInput}); // Annotate that this is the specific builtin SpvBuiltIn built_in_kind = starts_with(intrinsic_name, "Workgroup") ? SpvBuiltInWorkgroupId : SpvBuiltInLocalInvocationId; - SpvBuilder::Literals annotation_literals = { (uint32_t)built_in_kind }; - builder.add_annotation( intrinsic_id, SpvDecorationBuiltIn, annotation_literals ); + SpvBuilder::Literals annotation_literals = {(uint32_t)built_in_kind}; + builder.add_annotation(intrinsic_id, SpvDecorationBuiltIn, annotation_literals); // Add the builtin to the interface entry_point_variables.push_back(intrinsic_id); @@ -1019,22 +1015,22 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, param_struct_members.push_back(arg_type_id); } } - SpvId param_struct_type_id = builder.declare_struct( unique_name("param_struct"), param_struct_members); + SpvId param_struct_type_id = builder.declare_struct(unique_name("param_struct"), param_struct_members); // Add a decoration describing the offset for each parameter struct member uint32_t param_member_index = 0; uint32_t param_member_offset = 0; for (const auto &arg : args) { if (!arg.is_buffer) { - SpvBuilder::Literals param_offset_literals = { param_member_offset }; - builder.add_struct_annotation( param_struct_type_id, param_member_index, SpvDecorationOffset, param_offset_literals ); + SpvBuilder::Literals param_offset_literals = {param_member_offset}; + builder.add_struct_annotation(param_struct_type_id, param_member_index, SpvDecorationOffset, param_offset_literals); param_member_offset += arg.type.bytes(); param_member_index++; } } // Add a Block decoration for the parameter pack itself - builder.add_annotation( param_struct_type_id, SpvDecorationBlock ); + builder.add_annotation(param_struct_type_id, SpvDecorationBlock); // Add a variable for the parameter pack SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); @@ -1042,8 +1038,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // We always pass in the parameter pack as the first binding SpvBuilder::Literals zero_literal = {0}; - builder.add_annotation( param_pack_var_id, SpvDecorationDescriptorSet, zero_literal ); - builder.add_annotation( param_pack_var_id, SpvDecorationBinding, zero_literal ); + builder.add_annotation(param_pack_var_id, SpvDecorationDescriptorSet, zero_literal); + builder.add_annotation(param_pack_var_id, SpvDecorationBinding, zero_literal); uint32_t binding_counter = 1; uint32_t scalar_index = 0; @@ -1051,24 +1047,24 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, if (arg.is_buffer) { SpvId element_type_id = builder.declare_type(arg.type); SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); - SpvBuilder::StructMemberTypes struct_member_types = { runtime_arr_type_id }; - SpvId struct_type_id = builder.declare_struct( unique_name("param_buffer_" + std::to_string(binding_counter)), struct_member_types ); + SpvBuilder::StructMemberTypes struct_member_types = {runtime_arr_type_id}; + SpvId struct_type_id = builder.declare_struct(unique_name("param_buffer_" + std::to_string(binding_counter)), struct_member_types); SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, SpvStorageClassUniform); - SpvId param_id = builder.declare_global_variable( unique_name("param_" + arg.name), ptr_struct_type_id, SpvStorageClassUniform); + 
SpvId param_id = builder.declare_global_variable(unique_name("param_" + arg.name), ptr_struct_type_id, SpvStorageClassUniform); // Annotate the struct to indicate it's passed in a GLSL-style buffer block builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); // Annotate the array with its stride - SpvBuilder::Literals array_stride = { (uint32_t)(arg.type.bytes()) }; + SpvBuilder::Literals array_stride = {(uint32_t)(arg.type.bytes())}; builder.add_annotation(runtime_arr_type_id, SpvDecorationArrayStride, array_stride); // Annotate the offset for the array - SpvBuilder::Literals zero_literal = { uint32_t(0) }; + SpvBuilder::Literals zero_literal = {uint32_t(0)}; builder.add_struct_annotation(struct_type_id, 0, SpvDecorationOffset, zero_literal); // Set DescriptorSet and Binding - SpvBuilder::Literals binding_index = { uint32_t(binding_counter++) }; + SpvBuilder::Literals binding_index = {uint32_t(binding_counter++)}; builder.add_annotation(param_id, SpvDecorationDescriptorSet, zero_literal); builder.add_annotation(param_id, SpvDecorationBinding, binding_index); symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); @@ -1090,15 +1086,15 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, s.accept(this); // Insert return statement end delimiter - kernel_func.entry_block().add_instruction( SpvFactory::return_stmt() ); + kernel_func.entry_block().add_instruction(SpvFactory::return_stmt()); workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); workgroup_size[2] = std::max(workgroup_size[2], (uint32_t)1); // Add workgroup size to execution mode - SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size( kernel_func_id, workgroup_size[0], workgroup_size[1], workgroup_size[2] ); - builder.current_module().add_execution_mode( exec_mode_inst ); + SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(kernel_func_id, workgroup_size[0], workgroup_size[1], workgroup_size[2]); + builder.current_module().add_execution_mode(exec_mode_inst); // Pop scope for (const auto &arg : args) { @@ -1132,7 +1128,7 @@ void CodeGen_Vulkan_Dev::add_kernel(Stmt stmt, emitter.add_kernel(stmt, name, args); // dump the SPIRV file if requested - if(getenv("HL_SPIRV_DUMP_FILE")) { + if (getenv("HL_SPIRV_DUMP_FILE")) { dump(); } } @@ -1170,7 +1166,7 @@ std::unique_ptr new_CodeGen_Vulkan_Dev(const Target &target) { } // namespace Internal } // namespace Halide -#else // WITH_SPIRV +#else // WITH_SPIRV namespace Halide { namespace Internal { @@ -1182,4 +1178,4 @@ std::unique_ptr new_CodeGen_Vulkan_Dev(const Target &target) { } // namespace Internal } // namespace Halide -#endif // WITH_SPIRV +#endif // WITH_SPIRV diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index db6749fed020..36a4e996ff6b 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -7,30 +7,29 @@ namespace Halide { namespace Internal { namespace { - - template - typename std::enable_if::value,T>::type - constexpr rotl(const T n, const S i){ - const T m = (std::numeric_limits::digits-1); - const T c = i & m; - return (n<>((T(0)-c)&m)); - } - static uint64_t hash_splitmix64(uint64_t x) { - // http://xorshift.di.unimi.it/splitmix64.c - x += uint64_t(0x9e3779b97f4a7c15); - x = (x ^ (x >> 30)) * uint64_t(0xbf58476d1ce4e5b9); - x = (x ^ (x >> 27)) * uint64_t(0x94d049bb133111eb); - return x ^ (x >> 31); - } +template +typename std::enable_if::value, T>::type constexpr rotl(const T n, const S i) { + const T m = (std::numeric_limits::digits - 1); + 
const T c = i & m; + return (n << c) | (n >> ((T(0) - c) & m)); +} - inline uint64_t hash_combine(uint64_t& seed, const uint64_t& value) { - // mix using a cheap asymmetric binary rotation - const uint64_t r = std::numeric_limits::digits/3; - return rotl(seed, r) ^ hash_splitmix64(value); - } +inline uint64_t hash_splitmix64(uint64_t x) { + // http://xorshift.di.unimi.it/splitmix64.c + x += uint64_t(0x9e3779b97f4a7c15); + x = (x ^ (x >> 30)) * uint64_t(0xbf58476d1ce4e5b9); + x = (x ^ (x >> 27)) * uint64_t(0x94d049bb133111eb); + return x ^ (x >> 31); +} + +inline uint64_t hash_combine(uint64_t &seed, const uint64_t &value) { + // mix using a cheap asymmetric binary rotation + const uint64_t r = std::numeric_limits::digits / 3; + return rotl(seed, r) ^ hash_splitmix64(value); } +} // namespace /** SpvInstruction implementation **/ SpvInstruction SpvInstruction::make(SpvOp op_code) { @@ -406,12 +405,12 @@ bool SpvModule::is_defined() const { return contents.defined(); } -void SpvModule::add_debug_string(SpvId result_id, const std::string& string) { +void SpvModule::add_debug_string(SpvId result_id, const std::string &string) { check_defined(); contents->debug_source.push_back(SpvFactory::debug_string(result_id, string)); } -void SpvModule::add_debug_symbol(SpvId id, const std::string& symbol) { +void SpvModule::add_debug_symbol(SpvId id, const std::string &symbol) { check_defined(); contents->debug_symbols.push_back(SpvFactory::debug_symbol(id, symbol)); } @@ -568,7 +567,7 @@ void SpvModule::encode(SpvBinary &binary) const { binary.push_back(SpvVersion); binary.push_back(contents->source_language); binary.push_back(contents->binding_count); // last id bound to this module (aka last id used) - binary.push_back(0); // Reserved for schema. + binary.push_back(0); // Reserved for schema. // 1. 
Capabilities for (const SpvCapability &capability : contents->capabilities) { @@ -685,67 +684,97 @@ SpvId SpvBuilder::make_id(SpvKind kind) { } std::string SpvBuilder::kind_name(SpvKind kind) const { - switch(kind) { - case SpvInvalidItem: - { return "InvalidItem"; } - case SpvTypeId: - { return "TypeId"; } - case SpvVoidTypeId: - { return "VoidTypeId"; } - case SpvBoolTypeId: - { return "BoolTypeId"; } - case SpvIntTypeId: - { return "IntTypeId"; } - case SpvFloatTypeId: - { return "FloatTypeId"; } - case SpvVectorTypeId: - { return "VectorTypeId"; } - case SpvArrayTypeId: - { return "ArrayTypeId"; } - case SpvRuntimeArrayTypeId: - { return "RuntimeArrayTypeId"; } - case SpvStringTypeId: - { return "StringTypeId"; } - case SpvPointerTypeId: - { return "PointerTypeId"; } - case SpvStructTypeId: - { return "StructTypeId"; } - case SpvFunctionTypeId: - { return "FunctionTypeId"; } - case SpvAccessChainId: - { return "AccessChainId"; } - case SpvConstantId: - { return "ConstantId"; } - case SpvBoolConstantId: - { return "BoolConstantId"; } - case SpvIntConstantId: - { return "IntConstantId"; } - case SpvFloatConstantId: - { return "FloatConstantId"; } - case SpvStringConstantId: - { return "StringConstantId"; } - case SpvCompositeConstantId: - { return "CompositeConstantId"; } - case SpvResultId: - { return "ResultId"; } - case SpvVariableId: - { return "VariableId"; } - case SpvInstructionId: - { return "InstructionId"; } - case SpvFunctionId: - { return "FunctionId"; } - case SpvBlockId: - { return "BlockId"; } - case SpvLabelId: - { return "LabelId"; } - case SpvParameterId: - { return "ParameterId"; } - case SpvModuleId: - { return "ModuleId"; } - case SpvUnknownItem: - { return "UnknownItem"; } - default: - { return "InvalidItem"; } + switch (kind) { + case SpvInvalidItem: { + return "InvalidItem"; + } + case SpvTypeId: { + return "TypeId"; + } + case SpvVoidTypeId: { + return "VoidTypeId"; + } + case SpvBoolTypeId: { + return "BoolTypeId"; + } + case SpvIntTypeId: { + return "IntTypeId"; + } + case SpvFloatTypeId: { + return "FloatTypeId"; + } + case SpvVectorTypeId: { + return "VectorTypeId"; + } + case SpvArrayTypeId: { + return "ArrayTypeId"; + } + case SpvRuntimeArrayTypeId: { + return "RuntimeArrayTypeId"; + } + case SpvStringTypeId: { + return "StringTypeId"; + } + case SpvPointerTypeId: { + return "PointerTypeId"; + } + case SpvStructTypeId: { + return "StructTypeId"; + } + case SpvFunctionTypeId: { + return "FunctionTypeId"; + } + case SpvAccessChainId: { + return "AccessChainId"; + } + case SpvConstantId: { + return "ConstantId"; + } + case SpvBoolConstantId: { + return "BoolConstantId"; + } + case SpvIntConstantId: { + return "IntConstantId"; + } + case SpvFloatConstantId: { + return "FloatConstantId"; + } + case SpvStringConstantId: { + return "StringConstantId"; + } + case SpvCompositeConstantId: { + return "CompositeConstantId"; + } + case SpvResultId: { + return "ResultId"; + } + case SpvVariableId: { + return "VariableId"; + } + case SpvInstructionId: { + return "InstructionId"; + } + case SpvFunctionId: { + return "FunctionId"; + } + case SpvBlockId: { + return "BlockId"; + } + case SpvLabelId: { + return "LabelId"; + } + case SpvParameterId: { + return "ParameterId"; + } + case SpvModuleId: { + return "ModuleId"; + } + case SpvUnknownItem: { + return "UnknownItem"; + } + default: { + return "InvalidItem"; + } }; return "InvalidItem"; } @@ -770,11 +799,11 @@ void SpvBuilder::finalize() { SpvId last_id = (SpvId)(kind_map.size() + 1); module.set_binding_count(last_id); - 
if(module.is_capability_required(SpvCapabilityInt8)) { + if (module.is_capability_required(SpvCapabilityInt8)) { module.require_extension("SPV_KHR_8bit_storage"); } - if(module.is_capability_required(SpvCapabilityInt16)) { + if (module.is_capability_required(SpvCapabilityInt16)) { module.require_extension("SPV_KHR_16bit_storage"); } } @@ -816,12 +845,12 @@ SpvId SpvBuilder::declare_function_type(SpvId return_type, const ParamTypes &par return type_id; } -SpvId SpvBuilder::declare_function(const std::string& name, SpvId function_type) { +SpvId SpvBuilder::declare_function(const std::string &name, SpvId function_type) { SpvId existing_id = lookup_id(name); - if(existing_id != SpvInvalidId) { - if(kind_of(existing_id) == SpvFunctionId) { + if (existing_id != SpvInvalidId) { + if (kind_of(existing_id) == SpvFunctionId) { SpvFunction existing_func = lookup_function(existing_id); - if(existing_func.type_id() == function_type) { + if (existing_func.type_id() == function_type) { return existing_id; } } @@ -837,11 +866,11 @@ SpvId SpvBuilder::declare_constant(const Type &type, const void *data) { return result_id; } -SpvId SpvBuilder::declare_symbol(const std::string& symbol, SpvId id, SpvId scope_id) { +SpvId SpvBuilder::declare_symbol(const std::string &symbol, SpvId id, SpvId scope_id) { SpvId existing_id = lookup_id(symbol); - if(existing_id != SpvInvalidId) { + if (existing_id != SpvInvalidId) { SpvId existing_scope = lookup_scope(existing_id); - if(existing_scope == scope_id) { + if (existing_scope == scope_id) { return existing_id; } } @@ -850,7 +879,7 @@ SpvId SpvBuilder::declare_symbol(const std::string& symbol, SpvId id, SpvId scop } SpvStorageClass SpvBuilder::lookup_storage_class(SpvId id) const { - SpvStorageClass result = SpvInvalidStorageClass; + SpvStorageClass result = SpvInvalidStorageClass; StorageClassMap::const_iterator it = storage_class_map.find(id); if (it != storage_class_map.end()) { result = it->second; @@ -858,11 +887,11 @@ SpvStorageClass SpvBuilder::lookup_storage_class(SpvId id) const { return result; } -SpvId SpvBuilder::lookup_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id ) const { +SpvId SpvBuilder::lookup_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id) const { SpvId existing_id = lookup_id(name); - if(existing_id != SpvInvalidId) { - if( (kind_of(existing_id) == SpvVariableId) && - (type_of(existing_id) == type_id) && + if (existing_id != SpvInvalidId) { + if ((kind_of(existing_id) == SpvVariableId) && + (type_of(existing_id) == type_id) && (lookup_storage_class(existing_id) == storage_class) && (lookup_scope(existing_id) == scope_id)) { return existing_id; @@ -871,14 +900,14 @@ SpvId SpvBuilder::lookup_variable(const std::string& name, SpvId type_id, SpvSto return SpvInvalidId; } -bool SpvBuilder::has_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id ) const { - return (lookup_variable(name, type_id, storage_class, scope_id ) != SpvInvalidId); +bool SpvBuilder::has_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id) const { + return (lookup_variable(name, type_id, storage_class, scope_id) != SpvInvalidId); } -SpvId SpvBuilder::declare_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId init_id) { +SpvId SpvBuilder::declare_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId init_id) { SpvId block_id = 
current_block().id(); SpvId existing_id = lookup_variable(name, type_id, storage_class, block_id); - if(existing_id != SpvInvalidId) { + if (existing_id != SpvInvalidId) { return existing_id; } @@ -890,7 +919,7 @@ SpvId SpvBuilder::declare_variable(const std::string& name, SpvId type_id, SpvSt return var_id; } -SpvId SpvBuilder::declare_global_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId init_id) { +SpvId SpvBuilder::declare_global_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId init_id) { SpvId var_id = reserve_id(SpvVariableId); debug(3) << " declare_global_variable: %" << var_id << " name='" << name << "' type_id=" << type_id << " storage_class=" << (uint32_t)storage_class << " init_id=" << init_id << "\n"; module.add_global(SpvFactory::variable(var_id, type_id, storage_class, init_id)); @@ -902,8 +931,8 @@ SpvId SpvBuilder::declare_global_variable(const std::string& name, SpvId type_id void SpvBuilder::add_entry_point(SpvId func_id, SpvExecutionModel exec_model, const Variables &variables) { - const std::string& func_name = lookup_symbol(func_id); - if(func_name.empty()) { + const std::string &func_name = lookup_symbol(func_id); + if (func_name.empty()) { internal_error << "SPIRV: Function missing name definition: " << func_id << "\n"; } else { debug(3) << " add_entry_point: %" << func_id << " func_name='" << func_name << "' exec_model=" << (uint32_t)exec_model << " variable_count=" << (uint32_t)variables.size() << "\n"; @@ -912,7 +941,7 @@ void SpvBuilder::add_entry_point(SpvId func_id, SpvExecutionModel exec_model, } } -SpvId SpvBuilder::add_function(const std::string& name, SpvId return_type_id, const ParamTypes ¶m_types) { +SpvId SpvBuilder::add_function(const std::string &name, SpvId return_type_id, const ParamTypes ¶m_types) { SpvId func_id = make_id(SpvFunctionId); SpvId func_type_id = declare_function_type(return_type_id, param_types); @@ -934,11 +963,10 @@ SpvId SpvBuilder::add_function(const std::string& name, SpvId return_type_id, co return func_id; } - void SpvBuilder::add_annotation(SpvId target_id, SpvDecoration decoration_type, const Literals &literals) { SpvInstruction inst = SpvFactory::decorate(target_id, decoration_type, literals); debug(3) << " add_annotation: %" << target_id << " decoration_type=" << uint32_t(decoration_type) << " literals=["; - for(uint32_t v : literals) { + for (uint32_t v : literals) { debug(3) << " " << v; } debug(3) << " ]\n"; @@ -948,7 +976,7 @@ void SpvBuilder::add_annotation(SpvId target_id, SpvDecoration decoration_type, void SpvBuilder::add_struct_annotation(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const Literals &literals) { SpvInstruction inst = SpvFactory::decorate_member(struct_type_id, member_index, decoration_type, literals); debug(3) << " add_struct_annotation: %" << struct_type_id << " member_index=" << member_index << " decoration_type=" << uint32_t(decoration_type) << " literals=["; - for(uint32_t v : literals) { + for (uint32_t v : literals) { debug(3) << " " << v; } debug(3) << " ]\n"; @@ -1005,7 +1033,7 @@ std::string SpvBuilder::lookup_symbol(SpvId id) const { return name; } -SpvId SpvBuilder::lookup_id(const std::string& symbol) const { +SpvId SpvBuilder::lookup_id(const std::string &symbol) const { SpvId result = SpvInvalidId; SymbolIdMap::const_iterator it = symbol_id_map.find(symbol); if (it != symbol_id_map.end()) { @@ -1014,7 +1042,7 @@ SpvId SpvBuilder::lookup_id(const std::string& symbol) const { return 
result; } -void SpvBuilder::add_symbol(const std::string& symbol, SpvId id, SpvId scope_id) { +void SpvBuilder::add_symbol(const std::string &symbol, SpvId id, SpvId scope_id) { symbol_id_map[symbol] = id; id_symbol_map[id] = symbol; scope_map[id] = scope_id; @@ -1087,7 +1115,7 @@ SpvAddressingModel SpvBuilder::addressing_model() const { } SpvMemoryModel SpvBuilder::memory_model() const { - return module.memory_model(); + return module.memory_model(); } void SpvBuilder::require_capability(SpvCapability capability) { @@ -1168,10 +1196,10 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { debug(3) << " add_float_type: %" << type_id << " bits=" << type.bits() << "\n"; SpvInstruction inst = SpvFactory::float_type(type_id, type.bits()); module.add_type(inst); - if(type.bits() == 16) { - module.require_capability(SpvCapabilityFloat16); - } else if(type.bits() == 64) { - module.require_capability(SpvCapabilityFloat64); + if (type.bits() == 16) { + module.require_capability(SpvCapabilityFloat16); + } else if (type.bits() == 64) { + module.require_capability(SpvCapabilityFloat64); } } else if (type.is_int_or_uint()) { type_id = make_id(SpvIntTypeId); @@ -1179,12 +1207,12 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { debug(3) << " add_integer_type: %" << type_id << " bits=" << type.bits() << " signed=" << (signedness ? "true" : "false") << "\n"; SpvInstruction inst = SpvFactory::integer_type(type_id, type.bits(), signedness); module.add_type(inst); - if(type.bits() == 8) { - module.require_capability(SpvCapabilityInt8); - } else if(type.bits() == 16) { - module.require_capability(SpvCapabilityInt16); - } else if(type.bits() == 64) { - module.require_capability(SpvCapabilityInt64); + if (type.bits() == 8) { + module.require_capability(SpvCapabilityInt8); + } else if (type.bits() == 16) { + module.require_capability(SpvCapabilityInt16); + } else if (type.bits() == 64) { + module.require_capability(SpvCapabilityInt64); } } else { internal_error << "SPIRV: Unsupported type " << type << "\n"; @@ -1207,29 +1235,29 @@ SpvBuilder::TypeKey SpvBuilder::make_struct_type_key(const StructMemberTypes &me return key; } -SpvId SpvBuilder::lookup_struct(const std::string& struct_name, const StructMemberTypes &member_type_ids) const { +SpvId SpvBuilder::lookup_struct(const std::string &struct_name, const StructMemberTypes &member_type_ids) const { TypeKey key = make_struct_type_key(member_type_ids); TypeMap::const_iterator it = struct_map.find(key); if (it != struct_map.end()) { - if(struct_name == lookup_symbol(it->second)) { + if (struct_name == lookup_symbol(it->second)) { return it->second; } } return SpvInvalidId; } -SpvId SpvBuilder::add_struct(const std::string& struct_name, const StructMemberTypes &member_type_ids) { +SpvId SpvBuilder::add_struct(const std::string &struct_name, const StructMemberTypes &member_type_ids) { TypeKey key = make_struct_type_key(member_type_ids); TypeMap::const_iterator it = struct_map.find(key); if (it != struct_map.end()) { - if(struct_name == lookup_symbol(it->second)) { + if (struct_name == lookup_symbol(it->second)) { return it->second; } } SpvId struct_type_id = make_id(SpvStructTypeId); debug(3) << " add_struct_type: %" << struct_type_id << " name=" << struct_name << " member_type_ids=["; - for(SpvId m : member_type_ids) { + for (SpvId m : member_type_ids) { debug(3) << " " << m; } debug(3) << " ]\n"; @@ -1240,7 +1268,7 @@ SpvId SpvBuilder::add_struct(const std::string& struct_name, const StructMemberT return struct_type_id; } 
-SpvId SpvBuilder::declare_struct(const std::string& struct_name, const StructMemberTypes &member_types) { +SpvId SpvBuilder::declare_struct(const std::string &struct_name, const StructMemberTypes &member_types) { SpvId struct_id = lookup_struct(struct_name, member_types); if (struct_id == SpvInvalidId) { struct_id = add_struct(struct_name, member_types); @@ -1299,14 +1327,14 @@ SpvId SpvBuilder::add_pointer_type(SpvId base_type_id, SpvStorageClass storage_c return pointer_type_id; } -SpvBuilder::ConstantKey SpvBuilder::make_constant_key( uint8_t code, uint8_t bits, int lanes, size_t bytes, const void* data) const { +SpvBuilder::ConstantKey SpvBuilder::make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data) const { ConstantKey key = hash_splitmix64(code); key = hash_combine(key, bits); key = hash_combine(key, lanes); key = hash_combine(key, bytes); - - const int8_t* ptr = reinterpret_bits(data); - for(size_t i = 0; i < bytes; ++i) { + + const int8_t *ptr = reinterpret_bits(data); + for (size_t i = 0; i < bytes; ++i) { key = hash_combine(key, uint64_t(ptr[i])); } return key; @@ -1322,8 +1350,8 @@ SpvBuilder::ConstantKey SpvBuilder::make_bool_constant_key(bool value) const { return make_constant_key(type, &data); } -SpvBuilder::ConstantKey SpvBuilder::make_string_constant_key(const std::string& value) const { - return make_constant_key(halide_type_handle, 8, 1, value.length(), (const char*)(value.c_str())); +SpvBuilder::ConstantKey SpvBuilder::make_string_constant_key(const std::string &value) const { + return make_constant_key(halide_type_handle, 8, 1, value.length(), (const char *)(value.c_str())); } SpvBuilder::ConstantKey SpvBuilder::make_null_constant_key(const Type &type) const { @@ -1367,14 +1395,14 @@ SpvId SpvBuilder::declare_bool_constant(bool value) { SpvId result_id = make_id(SpvBoolConstantId); SpvId type_id = add_type(type); - debug(3) << " declare_bool_constant: %" << result_id << " bool " << value << "\n"; + debug(3) << " declare_bool_constant: %" << result_id << " bool " << value << "\n"; SpvInstruction inst = SpvFactory::bool_constant(result_id, type_id, value); module.add_constant(inst); constant_map[key] = result_id; return result_id; } -SpvId SpvBuilder::declare_string_constant(const std::string& value) { +SpvId SpvBuilder::declare_string_constant(const std::string &value) { ConstantKey key = make_string_constant_key(value); ConstantMap::const_iterator it = constant_map.find(key); if (it != constant_map.end()) { @@ -1382,7 +1410,7 @@ SpvId SpvBuilder::declare_string_constant(const std::string& value) { } SpvId result_id = make_id(SpvStringConstantId); - debug(3) << " declare_string_constant: %" << result_id << " string '" << value << "'\n"; + debug(3) << " declare_string_constant: %" << result_id << " string '" << value << "'\n"; SpvInstruction inst = SpvFactory::string_constant(result_id, value); module.add_constant(inst); constant_map[key] = result_id; @@ -1409,10 +1437,10 @@ SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *d SpvId result_id = SpvInvalidId; if (scalar_type.is_float()) { result_id = make_id(SpvFloatConstantId); - debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const float*)(data)) << "\n"; + debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const float *)(data)) << "\n"; } else if (scalar_type.is_int_or_uint()) { result_id = make_id(SpvIntConstantId); - debug(3) << " declare_scalar_constant: 
%" << result_id << " type=" << scalar_type << " data=" << (*(const int32_t*)(data)) << "\n"; + debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const int32_t *)(data)) << "\n"; } else { internal_error << "SPIRV: Unsupported type:" << scalar_type << "\n"; return SpvInvalidId; @@ -1566,7 +1594,7 @@ SpvId SpvBuilder::add_function_type(SpvId return_type_id, const ParamTypes ¶ SpvId function_type_id = make_id(SpvFunctionTypeId); debug(3) << " add_function_type: %" << function_type_id << " return_type_id=" << return_type_id << " param_type_ids=["; - for(SpvId p : param_type_ids) { + for (SpvId p : param_type_ids) { debug(3) << " " << p; } debug(3) << " ]\n"; @@ -1614,14 +1642,14 @@ SpvInstruction SpvFactory::debug_line(SpvId string_id, uint32_t line, uint32_t c return inst; } -SpvInstruction SpvFactory::debug_string(SpvId result_id, const std::string& string) { +SpvInstruction SpvFactory::debug_string(SpvId result_id, const std::string &string) { SpvInstruction inst = SpvInstruction::make(SpvOpString); inst.set_result_id(result_id); inst.add_string(string); return inst; } -SpvInstruction SpvFactory::debug_symbol(SpvId target_id, const std::string& symbol) { +SpvInstruction SpvFactory::debug_symbol(SpvId target_id, const std::string &symbol) { SpvInstruction inst = SpvInstruction::make(SpvOpName); inst.set_result_id(target_id); inst.add_string(symbol); @@ -1774,7 +1802,7 @@ SpvInstruction SpvFactory::bool_constant(SpvId result_id, SpvId type_id, bool va return inst; } -SpvInstruction SpvFactory::string_constant(SpvId result_id, const std::string& value) { +SpvInstruction SpvFactory::string_constant(SpvId result_id, const std::string &value) { SpvInstruction inst = SpvInstruction::make(SpvOpString); inst.set_result_id(result_id); inst.add_string(value); @@ -2006,7 +2034,7 @@ SpvInstruction SpvFactory::conditional_branch(SpvId condition_label_id, SpvId tr for (uint32_t w : weights) { inst.add_immediate(w); } - return inst; + return inst; } SpvInstruction SpvFactory::equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { @@ -2015,7 +2043,7 @@ SpvInstruction SpvFactory::equal(SpvId type_id, SpvId result_id, SpvId src_a_id, inst.add_operand(result_id); inst.add_operand(src_a_id); inst.add_operand(src_b_id); - return inst; + return inst; } SpvInstruction SpvFactory::not_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { @@ -2024,7 +2052,7 @@ SpvInstruction SpvFactory::not_equal(SpvId type_id, SpvId result_id, SpvId src_a inst.add_operand(result_id); inst.add_operand(src_a_id); inst.add_operand(src_b_id); - return inst; + return inst; } SpvInstruction SpvFactory::less_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { @@ -2033,7 +2061,7 @@ SpvInstruction SpvFactory::less_than(SpvId type_id, SpvId result_id, SpvId src_a inst.add_operand(result_id); inst.add_operand(src_a_id); inst.add_operand(src_b_id); - return inst; + return inst; } SpvInstruction SpvFactory::less_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { @@ -2042,7 +2070,7 @@ SpvInstruction SpvFactory::less_than_equal(SpvId type_id, SpvId result_id, SpvId inst.add_operand(result_id); inst.add_operand(src_a_id); inst.add_operand(src_b_id); - return inst; + return inst; } SpvInstruction SpvFactory::greater_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { @@ -2051,7 +2079,7 @@ SpvInstruction SpvFactory::greater_than(SpvId type_id, SpvId 
result_id, SpvId sr inst.add_operand(result_id); inst.add_operand(src_a_id); inst.add_operand(src_b_id); - return inst; + return inst; } SpvInstruction SpvFactory::greater_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { @@ -2060,7 +2088,7 @@ SpvInstruction SpvFactory::greater_than_equal(SpvId type_id, SpvId result_id, Sp inst.add_operand(result_id); inst.add_operand(src_a_id); inst.add_operand(src_b_id); - return inst; + return inst; } SpvInstruction SpvFactory::loop_merge(SpvId merge_label_id, SpvId continue_label_id, uint32_t loop_control_mask) { @@ -2186,7 +2214,7 @@ void spirv_ir_test() { SpvId output_type_id = builder.declare_type(Type(Type::UInt, 32, 1)); SpvId output_id = builder.declare_global_variable("OutputVar", output_type_id, SpvStorageClassOutput); - SpvBuilder::Variables entry_point_variables = { intrinsic_id, output_id }; + SpvBuilder::Variables entry_point_variables = {intrinsic_id, output_id}; builder.add_entry_point(kernel_func_id, SpvExecutionModelKernel, entry_point_variables); SpvBuilder::Literals annotation_literals = {SpvBuiltInWorkgroupId}; diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 448afa7d3449..3de612199eb8 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -78,7 +78,7 @@ enum SpvKind { using SpvId = uint32_t; using SpvBinary = std::vector; -static constexpr SpvStorageClass SpvInvalidStorageClass = SpvStorageClassMax; // sentinel for invalid storage class +static constexpr SpvStorageClass SpvInvalidStorageClass = SpvStorageClassMax; // sentinel for invalid storage class static constexpr SpvId SpvInvalidId = SpvId(-1); static constexpr SpvId SpvNoResult = 0; static constexpr SpvId SpvNoType = 0; @@ -230,8 +230,8 @@ class SpvModule { SpvModule(SpvModule &&) = default; SpvModule &operator=(SpvModule &&) = default; - void add_debug_string(SpvId result_id, const std::string& string); - void add_debug_symbol(SpvId id, const std::string& symbol); + void add_debug_string(SpvId result_id, const std::string &string); + void add_debug_symbol(SpvId id, const std::string &symbol); void add_annotation(const SpvInstruction &val); void add_type(const SpvInstruction &val); void add_constant(const SpvInstruction &val); @@ -310,19 +310,19 @@ class SpvBuilder { SpvId declare_vector_constant(const Type &type, const void *data); SpvId declare_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId element_id, const Indices &indices); SpvId declare_function_type(SpvId return_type, const ParamTypes ¶m_types = {}); - SpvId declare_function(const std::string& name, SpvId function_type); - SpvId declare_struct(const std::string& name, const StructMemberTypes &member_types); - SpvId declare_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId initializer_id = SpvInvalidId); - SpvId declare_global_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId initializer_id = SpvInvalidId); + SpvId declare_function(const std::string &name, SpvId function_type); + SpvId declare_struct(const std::string &name, const StructMemberTypes &member_types); + SpvId declare_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId initializer_id = SpvInvalidId); + SpvId declare_global_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId initializer_id = SpvInvalidId); SpvId add_type(const Type &type, uint32_t array_size = 1); - SpvId add_struct(const std::string& name, const StructMemberTypes &member_types); + SpvId add_struct(const 
std::string &name, const StructMemberTypes &member_types); SpvId add_runtime_array(SpvId base_type_id); SpvId add_pointer_type(const Type &type, SpvStorageClass storage_class); SpvId add_pointer_type(SpvId base_type_id, SpvStorageClass storage_class); SpvId add_constant(const Type &type, const void *data); SpvId add_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids); - SpvId add_function(const std::string& name, SpvId return_type, const ParamTypes ¶m_types = {}); + SpvId add_function(const std::string &name, SpvId return_type, const ParamTypes ¶m_types = {}); SpvId add_instruction(SpvInstruction val); void add_annotation(SpvId target_id, SpvDecoration decoration_type, const Literals &literals = {}); @@ -336,11 +336,11 @@ class SpvBuilder { void set_source_language(SpvSourceLanguage val); void set_addressing_model(SpvAddressingModel val); void set_memory_model(SpvMemoryModel val); - + SpvSourceLanguage source_language() const; SpvAddressingModel addressing_model() const; SpvMemoryModel memory_model() const; - + void require_extension(const std::string &extension); void require_capability(SpvCapability); @@ -360,11 +360,11 @@ class SpvBuilder { void update_id(SpvId id); SpvStorageClass lookup_storage_class(SpvId id) const; - SpvId lookup_id(const std::string& symbol) const; + SpvId lookup_id(const std::string &symbol) const; SpvId lookup_scope(SpvId id) const; std::string lookup_symbol(SpvId id) const; - SpvId declare_symbol(const std::string& symbol, SpvId id, SpvId scope_id); - void add_symbol(const std::string& symbol, SpvId id, SpvId scope_id); + SpvId declare_symbol(const std::string &symbol, SpvId id, SpvId scope_id); + void add_symbol(const std::string &symbol, SpvId id, SpvId scope_id); SpvModule current_module() const; @@ -374,7 +374,6 @@ class SpvBuilder { void reset(); protected: - using TypeKey = uint64_t; using TypeMap = std::unordered_map; using KindMap = std::unordered_map; @@ -402,7 +401,7 @@ class SpvBuilder { SpvId lookup_type(const Type &type, uint32_t array_size = 1) const; TypeKey make_struct_type_key(const StructMemberTypes &member_types) const; - SpvId lookup_struct(const std::string& name, const StructMemberTypes &member_types) const; + SpvId lookup_struct(const std::string &name, const StructMemberTypes &member_types) const; PointerTypeKey make_pointer_type_key(const Type &type, SpvStorageClass storage_class) const; SpvId lookup_pointer_type(const Type &type, SpvStorageClass storage_class) const; @@ -411,8 +410,8 @@ class SpvBuilder { SpvId lookup_pointer_type(SpvId base_type_id, SpvStorageClass storage_class) const; ConstantKey make_bool_constant_key(bool value) const; - ConstantKey make_string_constant_key(const std::string& value) const; - ConstantKey make_constant_key( uint8_t code, uint8_t bits, int lanes, size_t bytes, const void* data) const; + ConstantKey make_string_constant_key(const std::string &value) const; + ConstantKey make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data) const; ConstantKey make_constant_key(const Type &type, const void *data) const; SpvId lookup_constant(const Type &type, const void *data) const; @@ -423,9 +422,9 @@ class SpvBuilder { SpvInstruction lookup_instruction(SpvId result_id) const; bool has_instruction(SpvId inst) const; - SpvId lookup_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id ) const; - bool has_variable(const std::string& name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id ) const; - + SpvId 
lookup_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id) const; + bool has_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id) const; + FunctionTypeKey make_function_type_key(SpvId return_type_id, const ParamTypes ¶m_type_ids) const; SpvId lookup_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids) const; @@ -468,8 +467,8 @@ struct SpvFactory { static SpvInstruction import(const std::string &import); static SpvInstruction label(SpvId result_id); static SpvInstruction debug_line(SpvId string_id, uint32_t line, uint32_t column); - static SpvInstruction debug_string(SpvId result_id, const std::string& string); - static SpvInstruction debug_symbol(SpvId target_id, const std::string& symbol); + static SpvInstruction debug_string(SpvId result_id, const std::string &string); + static SpvInstruction debug_symbol(SpvId target_id, const std::string &symbol); static SpvInstruction decorate(SpvId target_id, SpvDecoration decoration_type, const Literals &literals = {}); static SpvInstruction decorate_member(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const Literals &literals = {}); static SpvInstruction void_type(SpvId void_type_id); @@ -485,7 +484,7 @@ struct SpvFactory { static SpvInstruction constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data); static SpvInstruction null_constant(SpvId result_id, SpvId type_id); static SpvInstruction bool_constant(SpvId result_id, SpvId type_id, bool value); - static SpvInstruction string_constant(SpvId result_id, const std::string& value); + static SpvInstruction string_constant(SpvId result_id, const std::string &value); static SpvInstruction composite_constant(SpvId result_id, SpvId type_id, const Components &components); static SpvInstruction variable(SpvId result_id, SpvId result_type_id, uint32_t storage_class, SpvId initializer_id = SpvInvalidId); static SpvInstruction function(SpvId return_type_id, SpvId func_id, uint32_t control_mask, SpvId func_type_id); From 6661c3f691310f6417b4af796055a0630071fe7a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 26 Sep 2022 15:50:07 -0700 Subject: [PATCH 018/166] Fix formatting --- src/CodeGen_Vulkan_Dev.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 8272aac6b0a4..589eff05269f 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -682,7 +682,9 @@ int thread_loop_workgroup_index(const std::string &name) { ".__thread_id_y", ".__thread_id_z"}; for (size_t i = 0; i < sizeof(ids) / sizeof(std::string); i++) { - if (ends_with(name, ids[i])) { return i; } + if (ends_with(name, ids[i])) { + return i; + } } return -1; } From 58d21ac342d153d0cb51ee0a57f6be9ecd416083 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 26 Sep 2022 16:11:15 -0700 Subject: [PATCH 019/166] Remove leftover ifdef --- src/CodeGen_Vulkan_Dev.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 589eff05269f..af3f6cbb12bc 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1,11 +1,6 @@ #include #include -// XXX -#ifndef WITH_SPIRV -#define WITH_SPIRV -#endif - #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Vulkan_Dev.h" From 80c1ef0fdb8d1c3e5316b23e521d866190eaa459 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 27 Sep 2022 11:36:58 -0700 Subject: [PATCH 
020/166] Fix build error for clang OSX for mismatched type comparison --- src/CodeGen_Vulkan_Dev.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index af3f6cbb12bc..d6cdd2ebaf7b 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -572,7 +572,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { SpvId base_id = id_and_storage_class.first; SpvStorageClass storage_class = id_and_storage_class.second; internal_assert(base_id != SpvInvalidId); - internal_assert(storage_class != SpvInvalidId); + internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); op->index.accept(this); SpvId index_id = builder.current_id(); @@ -610,7 +610,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { SpvId base_id = id_and_storage_class.first; SpvStorageClass storage_class = id_and_storage_class.second; internal_assert(base_id != SpvInvalidId); - internal_assert(storage_class != SpvInvalidId); + internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); op->index.accept(this); SpvId index_id = builder.current_id(); From 7ac73cf60cc9582c25d591e3021014db46694379 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 27 Sep 2022 14:20:23 -0700 Subject: [PATCH 021/166] Refactor loops and conditionals to use blocks --- src/CodeGen_Vulkan_Dev.cpp | 138 ++++++++++++++++++++++--------------- src/SpirvIR.cpp | 100 ++++++++++++--------------- src/SpirvIR.h | 17 ++--- 3 files changed, 131 insertions(+), 124 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index d6cdd2ebaf7b..3b2b4f6f781e 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -746,43 +746,56 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { // Declare loop var SpvId loop_var_id = builder.declare_variable(unique_name("loop_index"), index_var_type_id, SpvStorageClassFunction, min_id); - SpvId header_label_id = builder.reserve_id(SpvLabelId); - SpvId loop_top_label_id = builder.reserve_id(SpvLabelId); - SpvId body_label_id = builder.reserve_id(SpvLabelId); - SpvId continue_label_id = builder.reserve_id(SpvLabelId); - SpvId merge_label_id = builder.reserve_id(SpvLabelId); - - builder.append(SpvFactory::label(header_label_id)); - builder.append(SpvFactory::loop_merge(merge_label_id, continue_label_id, SpvLoopControlMaskNone)); - builder.append(SpvFactory::branch(loop_top_label_id)); - builder.append(SpvFactory::label(loop_top_label_id)); - - // loop test. 
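// For reference, the structured loop that this refactored hunk emits -- via the new
// header / top / body / continue / merge blocks -- has the following SPIR-V shape.
// This is a hedged sketch only; the %names below are illustrative and are not
// identifiers produced by the code generator:
//
//   %header:   OpLoopMerge %merge %continue None
//              OpBranch %top
//   %top:      %i    = OpLoad %int %loop_var
//              %cond = OpSLessThanEqual %bool %i %max
//              OpBranchConditional %cond %body %merge
//   %body:     ... loop body ...
//              OpBranch %continue
//   %continue: %next = OpIAdd %int %i %one
//              OpStore %loop_var %next
//              OpBranch %header
//   %merge:    ... code following the loop ...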
- SpvId cur_index_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::load(index_type_id, cur_index_id, loop_var_id)); - - SpvId loop_test_type_id = builder.declare_type(Bool()); - SpvId loop_test_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::less_than_equal(loop_test_type_id, loop_test_id, cur_index_id, max_id, true)); - builder.append(SpvFactory::conditional_branch(loop_test_id, body_label_id, merge_label_id)); - builder.append(SpvFactory::label(body_label_id)); + SpvId header_block_id = builder.reserve_id(SpvBlockId); + SpvId top_block_id = builder.reserve_id(SpvBlockId); + SpvId body_block_id = builder.reserve_id(SpvBlockId); + SpvId continue_block_id = builder.reserve_id(SpvBlockId); + SpvId merge_block_id = builder.reserve_id(SpvBlockId); + + SpvBlock header_block = builder.create_block(header_block_id); + builder.enter_block(header_block); + { + builder.append(SpvFactory::loop_merge(merge_block_id, continue_block_id, SpvLoopControlMaskNone)); + builder.append(SpvFactory::branch(top_block_id)); + } + builder.leave_block(); + + SpvId current_index_id = builder.reserve_id(SpvResultId); + SpvBlock top_block = builder.create_block(top_block_id); + builder.enter_block(top_block); + { + SpvId loop_test_type_id = builder.declare_type(Bool()); + SpvId loop_test_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(index_type_id, current_index_id, loop_var_id)); + builder.append(SpvFactory::less_than_equal(loop_test_type_id, loop_test_id, current_index_id, max_id, true)); + builder.append(SpvFactory::conditional_branch(loop_test_id, body_block_id, merge_block_id)); + } + builder.leave_block(); + SpvBlock body_block = builder.create_block(body_block_id); + builder.enter_block(body_block); { - ScopedSymbolBinding binding(symbol_table, op->name, {cur_index_id, SpvStorageClassFunction}); + ScopedSymbolBinding binding(symbol_table, op->name, {current_index_id, SpvStorageClassFunction}); op->body.accept(this); + builder.append(SpvFactory::branch(continue_block_id)); + } + builder.leave_block(); + + SpvBlock continue_block = builder.create_block(continue_block_id); + builder.enter_block(continue_block); + { + // Update loop variable + int32_t one = 1; + SpvId next_index_id = builder.reserve_id(SpvResultId); + SpvId constant_one_id = builder.declare_constant(Int(32), &one); + builder.append(SpvFactory::integer_add(index_type_id, next_index_id, current_index_id, constant_one_id)); + builder.append(SpvFactory::store(loop_var_id, next_index_id)); + builder.append(SpvFactory::branch(header_block_id)); } - builder.append(SpvFactory::branch(continue_label_id)); - builder.append(SpvFactory::label(continue_label_id)); - - // Update loop variable - int32_t one = 1; - SpvId next_index_id = builder.reserve_id(SpvResultId); - SpvId constant_one_id = builder.declare_constant(Int(32), &one); - builder.append(SpvFactory::integer_add(index_type_id, next_index_id, cur_index_id, constant_one_id)); - builder.append(SpvFactory::store(loop_var_id, next_index_id)); - - builder.append(SpvFactory::branch(header_label_id)); - builder.append(SpvFactory::label(merge_label_id)); + builder.leave_block(); + + SpvBlock merge_block = builder.create_block(merge_block_id); + builder.enter_block(merge_block); } } @@ -847,38 +860,53 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case) { condition.accept(this); SpvId cond_id = builder.current_id(); - SpvId then_label_id = builder.reserve_id(SpvLabelId); - SpvId 
else_label_id = builder.reserve_id(SpvLabelId); - SpvId merge_label_id = builder.reserve_id(SpvLabelId); + SpvId merge_block_id = builder.reserve_id(SpvBlockId); +// SpvId if_block_id = builder.reserve_id(SpvBlockId); + SpvId then_block_id = builder.reserve_id(SpvBlockId); + SpvId else_block_id = else_case.defined() ? builder.reserve_id(SpvBlockId) : merge_block_id; + + SpvFactory::BlockVariables block_vars; // If Conditional - builder.append(SpvFactory::selection_merge(merge_label_id, SpvSelectionControlMaskNone)); - builder.append(SpvFactory::conditional_branch(cond_id, then_label_id, else_label_id)); +// SpvBlock if_block = builder.create_block(if_block_id); +// builder.enter_block(if_block); +// { + builder.append(SpvFactory::selection_merge(merge_block_id, SpvSelectionControlMaskNone)); + builder.append(SpvFactory::conditional_branch(cond_id, then_block_id, else_block_id)); +// } +// builder.leave_block(); // Then block - builder.append(SpvFactory::label(then_label_id)); - then_case.accept(this); - SpvId then_id = builder.current_id(); - builder.append(SpvFactory::branch(merge_label_id)); - - SpvFactory::BlockVariables block_vars = { - {then_id, then_label_id}}; + SpvBlock then_block = builder.create_block(then_block_id); + builder.enter_block(then_block); + { + then_case.accept(this); + SpvId then_id = builder.current_id(); + builder.append(SpvFactory::branch(merge_block_id)); + block_vars.push_back({then_id, then_block_id}); + } + builder.leave_block(); // Else block (optional) - builder.append(SpvFactory::label(else_label_id)); if (else_case.defined()) { - else_case.accept(this); - SpvId else_id = builder.current_id(); - block_vars.push_back({else_id, else_label_id}); + SpvBlock else_block = builder.create_block(else_block_id); + builder.enter_block(else_block); + { + else_case.accept(this); + SpvId else_id = builder.current_id(); + builder.append(SpvFactory::branch(merge_block_id)); + block_vars.push_back({else_id, else_block_id}); + } + builder.leave_block(); } - builder.append(SpvFactory::branch(merge_label_id)); - - // Merge label - builder.append(SpvFactory::label(merge_label_id)); + // Merge block + SpvBlock merge_block = builder.create_block(merge_block_id); + builder.enter_block(merge_block); return block_vars; } + void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IfThenElse *op) { emit_if_then_else(op->condition, op->then_case, op->else_case); } @@ -974,8 +1002,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, SpvId kernel_func_id = builder.add_function(name, void_type_id); SpvFunction kernel_func = builder.lookup_function(kernel_func_id); builder.enter_function(kernel_func); - builder.enter_block(kernel_func.entry_block()); - // SpvId start_label_id = kernel_func.entry_block().id(); // TODO: only add the SIMT intrinsics used SpvFactory::Variables entry_point_variables; @@ -1083,7 +1109,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, s.accept(this); // Insert return statement end delimiter - kernel_func.entry_block().add_instruction(SpvFactory::return_stmt()); + kernel_func.last_block().add_instruction(SpvFactory::return_stmt()); workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 36a4e996ff6b..16ecdf093325 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -284,8 +284,27 @@ bool SpvFunction::is_defined() const { return contents.defined(); } +SpvBlock SpvFunction::create_block(SpvId block_id) { + 
check_defined(); + if(contents->blocks.size()) { + SpvBlock tail_block = last_block(); + if(!tail_block.is_terminated()) { + tail_block.add_instruction(SpvFactory::branch(block_id)); + } + } + SpvBlock block = SpvBlock::make(*this, block_id); + contents->blocks.push_back(block); + return block; +} + void SpvFunction::add_block(const SpvBlock &block) { check_defined(); + if(contents->blocks.size()) { + SpvBlock tail_block = last_block(); + if(!tail_block.is_terminated()) { + tail_block.add_instruction(SpvFactory::branch(block.id())); + } + } contents->blocks.push_back(block); } @@ -304,6 +323,11 @@ SpvBlock SpvFunction::entry_block() const { return contents->blocks.front(); } +SpvBlock SpvFunction::last_block() const { + check_defined(); + return contents->blocks.back(); +} + SpvPrecision SpvFunction::return_precision() const { check_defined(); SpvId return_id = contents->declaration.result_id(); @@ -653,20 +677,15 @@ void SpvBuilder::reset() { function_map.clear(); id_symbol_map.clear(); symbol_id_map.clear(); - instruction_map.clear(); storage_class_map.clear(); pointer_type_map.clear(); variable_type_map.clear(); function_type_map.clear(); - FunctionStack empty_fs; - function_stack.swap(empty_fs); - - BlockStack empty_bs; - block_stack.swap(empty_bs); - - scope_id = SpvInvalidId; active_id = SpvInvalidId; + active_block = SpvBlock(); + active_function = SpvFunction(); + SpvId module_id = make_id(SpvModuleId); module = SpvModule::make(module_id); } @@ -951,7 +970,6 @@ SpvId SpvBuilder::add_function(const std::string &name, SpvId return_type_id, co SpvId param_id = make_id(SpvParameterId); SpvInstruction param_inst = SpvFactory::function_parameter(param_type_id, param_id); func.add_parameter(param_inst); - map_instruction(param_inst); } SpvId block_id = make_id(SpvBlockId); SpvBlock entry_block = SpvBlock::make(func, block_id); @@ -959,7 +977,6 @@ SpvId SpvBuilder::add_function(const std::string &name, SpvId return_type_id, co module.add_function(func); function_map[func_id] = func; declare_symbol(name, func_id, module.id()); - map_instruction(func.declaration()); return func_id; } @@ -995,24 +1012,21 @@ void SpvBuilder::add_execution_mode_local_size(SpvId func_id, } void SpvBuilder::enter_block(const SpvBlock &block) { - block_stack.push(block); + active_block = block; } SpvBlock SpvBuilder::current_block() const { - SpvBlock block; - if (!block_stack.empty()) { - block = block_stack.top(); - } - return block; + return active_block; +} + +SpvBlock SpvBuilder::create_block(SpvId block_id) { + return current_function().create_block(block_id); } SpvBlock SpvBuilder::leave_block() { - SpvBlock block; - if (!block_stack.empty()) { - block = block_stack.top(); - block_stack.pop(); - } - return block; + SpvBlock prev_block = active_block; + active_block = SpvBlock(); + return prev_block; } SpvFunction SpvBuilder::lookup_function(SpvId func_id) const { @@ -1060,26 +1074,18 @@ SpvId SpvBuilder::lookup_scope(SpvId id) const { } void SpvBuilder::enter_function(const SpvFunction &func) { - function_stack.push(func); - enter_block(func.entry_block()); + active_function = func; + enter_block(active_function.entry_block()); } SpvFunction SpvBuilder::current_function() const { - SpvFunction func; - if (!function_stack.empty()) { - func = function_stack.top(); - } - return func; + return active_function; } SpvFunction SpvBuilder::leave_function() { - SpvFunction func; - leave_block(); - if (!function_stack.empty()) { - func = function_stack.top(); - function_stack.pop(); - } - return func; + SpvFunction 
prev_func = active_function; + active_function = SpvFunction(); + return prev_func; } SpvId SpvBuilder::current_id() const { @@ -1550,24 +1556,6 @@ SpvId SpvBuilder::declare_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId e return access_chain_id; } -SpvId SpvBuilder::map_instruction(const SpvInstruction &inst) { - const SpvId key = inst.result_id(); - if (instruction_map.find(key) == instruction_map.end()) { - instruction_map.insert({key, inst}); - } else { - instruction_map[key] = inst; - } - return key; -} - -SpvInstruction SpvBuilder::lookup_instruction(SpvId result_id) const { - InstructionMap::const_iterator it = instruction_map.find(result_id); - if (it == instruction_map.end()) { - return SpvInstruction(); - } - return it->second; -} - SpvBuilder::FunctionTypeKey SpvBuilder::make_function_type_key(SpvId return_type_id, const ParamTypes ¶m_type_ids) const { TypeKey key = hash_splitmix64(return_type_id); for (SpvId type_id : param_type_ids) { @@ -1612,8 +1600,8 @@ SpvId SpvBuilder::add_runtime_array(SpvId base_type_id) { } void SpvBuilder::append(SpvInstruction inst) { - if (!block_stack.empty()) { - current_block().add_instruction(std::move(inst)); + if (active_block.is_defined()) { + active_block.add_instruction(std::move(inst)); } else { internal_error << "SPIRV: Current block undefined! Unable to append!\n"; } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 3de612199eb8..99e1479f1d22 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -189,6 +189,7 @@ class SpvFunction { SpvFunction(SpvFunction &&) = default; SpvFunction &operator=(SpvFunction &&) = default; + SpvBlock create_block(SpvId block_id); void add_block(const SpvBlock &block); void add_parameter(const SpvInstruction ¶m); void set_module(SpvModule module); @@ -197,6 +198,7 @@ class SpvFunction { bool is_defined() const; SpvBlock entry_block() const; + SpvBlock last_block() const; SpvPrecision return_precision() const; SpvPrecision parameter_precision(uint32_t index) const; uint32_t parameter_count() const; @@ -348,6 +350,7 @@ class SpvBuilder { bool is_capability_required(SpvCapability) const; void enter_block(const SpvBlock &block); + SpvBlock create_block(SpvId block_id); SpvBlock current_block() const; SpvBlock leave_block(); @@ -387,13 +390,9 @@ class SpvBuilder { using ScopeMap = std::unordered_map; using IdSymbolMap = std::unordered_map; using SymbolIdMap = std::unordered_map; - using InstructionMap = std::unordered_map; using FunctionTypeKey = uint64_t; using FunctionTypeMap = std::unordered_map; using FunctionMap = std::unordered_map; - using FunctionStack = std::stack; - using BlockStack = std::stack; - using IdStack = std::stack; SpvId make_id(SpvKind kind); @@ -418,18 +417,15 @@ class SpvBuilder { ConstantKey make_null_constant_key(const Type &type) const; SpvId lookup_null_constant(const Type &type) const; - SpvId map_instruction(const SpvInstruction &inst); - SpvInstruction lookup_instruction(SpvId result_id) const; - bool has_instruction(SpvId inst) const; - SpvId lookup_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id) const; bool has_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId scope_id) const; FunctionTypeKey make_function_type_key(SpvId return_type_id, const ParamTypes ¶m_type_ids) const; SpvId lookup_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids) const; - SpvId scope_id = SpvInvalidId; SpvId active_id = SpvInvalidId; + SpvFunction active_function; + SpvBlock active_block; SpvModule module; 
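    // NOTE: the builder now tracks a single active function and a single active block
    // (the active_function/active_block members above) instead of the previous
    // stack-based scopes: enter_function()/enter_block() overwrite this state, and
    // leave_function()/leave_block() reset it to a default-constructed (undefined)
    // value, so only one function and one block are under construction at a time.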
KindMap kind_map; TypeMap type_map; @@ -440,13 +436,10 @@ class SpvBuilder { FunctionMap function_map; IdSymbolMap id_symbol_map; SymbolIdMap symbol_id_map; - InstructionMap instruction_map; StorageClassMap storage_class_map; PointerTypeMap pointer_type_map; VariableTypeMap variable_type_map; FunctionTypeMap function_type_map; - FunctionStack function_stack; - BlockStack block_stack; }; /** Factory interface for constructing specific SPIR-V instructions */ From 81e8839d0e4da21106cd678e14a2af01dcd8a038 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 27 Sep 2022 14:27:11 -0700 Subject: [PATCH 022/166] Clang tidy/format pass --- src/CodeGen_Vulkan_Dev.cpp | 21 ++++++++++----------- src/SpirvIR.cpp | 20 ++++++++++---------- src/SpirvIR.h | 2 +- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 3b2b4f6f781e..187e4e6ef118 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -861,20 +861,20 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, condition.accept(this); SpvId cond_id = builder.current_id(); SpvId merge_block_id = builder.reserve_id(SpvBlockId); -// SpvId if_block_id = builder.reserve_id(SpvBlockId); + // SpvId if_block_id = builder.reserve_id(SpvBlockId); SpvId then_block_id = builder.reserve_id(SpvBlockId); SpvId else_block_id = else_case.defined() ? builder.reserve_id(SpvBlockId) : merge_block_id; - + SpvFactory::BlockVariables block_vars; // If Conditional -// SpvBlock if_block = builder.create_block(if_block_id); -// builder.enter_block(if_block); -// { - builder.append(SpvFactory::selection_merge(merge_block_id, SpvSelectionControlMaskNone)); - builder.append(SpvFactory::conditional_branch(cond_id, then_block_id, else_block_id)); -// } -// builder.leave_block(); + // SpvBlock if_block = builder.create_block(if_block_id); + // builder.enter_block(if_block); + // { + builder.append(SpvFactory::selection_merge(merge_block_id, SpvSelectionControlMaskNone)); + builder.append(SpvFactory::conditional_branch(cond_id, then_block_id, else_block_id)); + // } + // builder.leave_block(); // Then block SpvBlock then_block = builder.create_block(then_block_id); @@ -906,7 +906,6 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, return block_vars; } - void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IfThenElse *op) { emit_if_then_else(op->condition, op->then_case, op->else_case); } @@ -1109,7 +1108,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, s.accept(this); // Insert return statement end delimiter - kernel_func.last_block().add_instruction(SpvFactory::return_stmt()); + kernel_func.tail_block().add_instruction(SpvFactory::return_stmt()); workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 16ecdf093325..70bb192beacc 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -286,10 +286,10 @@ bool SpvFunction::is_defined() const { SpvBlock SpvFunction::create_block(SpvId block_id) { check_defined(); - if(contents->blocks.size()) { - SpvBlock tail_block = last_block(); - if(!tail_block.is_terminated()) { - tail_block.add_instruction(SpvFactory::branch(block_id)); + if (!contents->blocks.empty()) { + SpvBlock last_block = tail_block(); + if (!last_block.is_terminated()) { + last_block.add_instruction(SpvFactory::branch(block_id)); } } SpvBlock block = SpvBlock::make(*this, block_id); @@ 
-299,10 +299,10 @@ SpvBlock SpvFunction::create_block(SpvId block_id) { void SpvFunction::add_block(const SpvBlock &block) { check_defined(); - if(contents->blocks.size()) { - SpvBlock tail_block = last_block(); - if(!tail_block.is_terminated()) { - tail_block.add_instruction(SpvFactory::branch(block.id())); + if (!contents->blocks.empty()) { + SpvBlock last_block = tail_block(); + if (!last_block.is_terminated()) { + last_block.add_instruction(SpvFactory::branch(block.id())); } } contents->blocks.push_back(block); @@ -323,7 +323,7 @@ SpvBlock SpvFunction::entry_block() const { return contents->blocks.front(); } -SpvBlock SpvFunction::last_block() const { +SpvBlock SpvFunction::tail_block() const { check_defined(); return contents->blocks.back(); } @@ -685,7 +685,7 @@ void SpvBuilder::reset() { active_id = SpvInvalidId; active_block = SpvBlock(); active_function = SpvFunction(); - + SpvId module_id = make_id(SpvModuleId); module = SpvModule::make(module_id); } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 99e1479f1d22..86e18f193acb 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -198,7 +198,7 @@ class SpvFunction { bool is_defined() const; SpvBlock entry_block() const; - SpvBlock last_block() const; + SpvBlock tail_block() const; SpvPrecision return_precision() const; SpvPrecision parameter_precision(uint32_t index) const; uint32_t parameter_count() const; From b150737f9a2e8f5aa51baf908973cbcf25781c0b Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 27 Sep 2022 15:26:54 -0700 Subject: [PATCH 023/166] Add detailed comments for acquire context parameters --- src/runtime/HalideRuntimeVulkan.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/runtime/HalideRuntimeVulkan.h b/src/runtime/HalideRuntimeVulkan.h index 88b8d3a8d714..4e52da4fb034 100644 --- a/src/runtime/HalideRuntimeVulkan.h +++ b/src/runtime/HalideRuntimeVulkan.h @@ -44,15 +44,22 @@ extern void halide_vulkan_finalize_kernels(void *user_context, void *state_ptr); // Overriding implementations of acquire/release must implement the // following behavior: -// - halide_acquire_vulkan_context should always store a valid +// - halide_acquire_vulkan_context should always store a valid // instance/device/queue in the corresponding out parameters, // or return an error code. // - A call to halide_acquire_vulkan_context is followed by a matching // call to halide_release_vulkan_context. halide_acquire_vulkan_context // should block while a previous call (if any) has not yet been // released via halide_release_vulkan_context. 
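// For example, a minimal override backed by application-owned Vulkan objects could
// look like the sketch below. This is purely illustrative: the my_* names are
// hypothetical, and the parameter list is paraphrased from the out-parameters
// documented here -- consult the declaration of halide_vulkan_acquire_context in
// this header for the exact signature.
//
//   int halide_vulkan_acquire_context(void *user_context,
//                                     struct halide_vulkan_memory_allocator **allocator,
//                                     VkInstance *instance, VkDevice *device,
//                                     VkPhysicalDevice *physical_device,
//                                     VkCommandPool *command_pool, VkQueue *queue,
//                                     uint32_t *queue_family_index, bool create) {
//       my_context_mutex.lock();    // released by the matching halide_vulkan_release_context
//       *allocator = my_allocator;
//       *instance = my_instance;
//       *device = my_device;
//       *physical_device = my_physical_device;
//       *command_pool = my_command_pool;
//       *queue = my_queue;
//       *queue_family_index = my_queue_family_index;
//       return 0;                   // or a nonzero halide error code on failure
//   }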
-// TODO: describe memory type index -// TODO: describe queue family index +// - Parameters: +// allocator: an internal halide type handle used for allocating resources +// instance: the vulkan instance handle +// device: the vulkan device handle +// physical_device: the vulkan physical device handle +// command_pool: the vulkan command pool handle (strangely doesn't have a VkCommandPool_T typedef) +// queue: the vulkan queue handle +// queue_family_index: the index corresponding to the device queue properties for the device (as described by vkGetPhysicalDeviceQueueFamilyProperties) +// create: if set to true, attempt to create a new vulkan context, otherwise acquire the current one struct halide_vulkan_memory_allocator; extern int halide_vulkan_acquire_context(void *user_context, struct halide_vulkan_memory_allocator **allocator, From 4816e5ab431ce1c2d5a4f52613bd06ea9be53e5b Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 27 Sep 2022 15:27:28 -0700 Subject: [PATCH 024/166] Add comments describing loader method exports and dynamically resolved function pointers Other minor cleanups --- src/runtime/mini_vulkan.h | 3 ++- src/runtime/vulkan.cpp | 16 ++++++++-------- src/runtime/vulkan_functions.h | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index fc501cfcc72f..4f1c8716558a 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -2624,12 +2624,13 @@ typedef void(VKAPI_PTR *PFN_vkCmdNextSubpass)(VkCommandBuffer commandBuffer, VkS typedef void(VKAPI_PTR *PFN_vkCmdEndRenderPass)(VkCommandBuffer commandBuffer); typedef void(VKAPI_PTR *PFN_vkCmdExecuteCommands)(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer *pCommandBuffers); +// This appears to be exported by the loader VKAPI_ATTR VkResult VKAPI_CALL vkCreateInstance( const VkInstanceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkInstance *pInstance); -// Moved this to be the only prototype depended on. +// Same as above ... these two methods are the only prototypes we depend upon VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr( VkInstance instance, const char *pName); diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 16e9c8c5e42f..84c65bb6ffb0 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -39,17 +39,17 @@ WEAK int halide_vulkan_acquire_context(void *user_context, uint32_t *queue_family_index, bool create) { - halide_abort_if_false(user_context, instance != nullptr); - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, queue != nullptr); - halide_abort_if_false(user_context, &thread_lock != nullptr); + halide_debug_assert(user_context, instance != nullptr); + halide_debug_assert(user_context, device != nullptr); + halide_debug_assert(user_context, queue != nullptr); + halide_debug_assert(user_context, &thread_lock != nullptr); while (__atomic_test_and_set(&thread_lock, __ATOMIC_ACQUIRE)) {} // If the context has not been initialized, initialize it now. 
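    // (The cached_* handles referenced below are process-wide globals: the first
    // successful acquire with create=true builds the instance/device/queue via
    // vk_create_context() and stores them there, and subsequent acquires reuse
    // the cached handles rather than creating a new context.)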
- halide_abort_if_false(user_context, &cached_instance != nullptr); - halide_abort_if_false(user_context, &cached_device != nullptr); - halide_abort_if_false(user_context, &cached_queue != nullptr); - halide_abort_if_false(user_context, &cached_physical_device != nullptr); + halide_debug_assert(user_context, &cached_instance != nullptr); + halide_debug_assert(user_context, &cached_device != nullptr); + halide_debug_assert(user_context, &cached_queue != nullptr); + halide_debug_assert(user_context, &cached_physical_device != nullptr); if ((cached_instance == nullptr) && create) { int result = vk_create_context(user_context, reinterpret_cast(&cached_allocator), diff --git a/src/runtime/vulkan_functions.h b/src/runtime/vulkan_functions.h index 8d2e996145ab..0cb42c2be080 100644 --- a/src/runtime/vulkan_functions.h +++ b/src/runtime/vulkan_functions.h @@ -1,4 +1,4 @@ -// VULKAN_FN(vkCreateInstance) +// NOTE: vkCreateInstance is already defined in the mini_vulkan header VULKAN_FN(vkDestroyInstance) VULKAN_FN(vkCreateDevice) VULKAN_FN(vkDestroyDevice) From e31247cdcfc1c90bf81aed354faecb6821ce7558 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 27 Sep 2022 15:28:40 -0700 Subject: [PATCH 025/166] Change aborts to debug asserts for context parameters. Add error handling to acquire context. --- src/runtime/vulkan_context.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index bfa68200b098..fd3f6a2444c8 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -59,13 +59,15 @@ class VulkanContext { int result = halide_vulkan_acquire_context(user_context, reinterpret_cast(&allocator), &instance, &device, &physical_device, &command_pool, &queue, &queue_family_index); - halide_abort_if_false(user_context, result == 0); - halide_abort_if_false(user_context, allocator != nullptr); - halide_abort_if_false(user_context, instance != nullptr); - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, command_pool != 0); - halide_abort_if_false(user_context, queue != nullptr); - halide_abort_if_false(user_context, physical_device != nullptr); + if(result != halide_error_code_success) { + halide_error_no_device_interface(user_context); + } + halide_debug_assert(user_context, allocator != nullptr); + halide_debug_assert(user_context, instance != nullptr); + halide_debug_assert(user_context, device != nullptr); + halide_debug_assert(user_context, command_pool != 0); + halide_debug_assert(user_context, queue != nullptr); + halide_debug_assert(user_context, physical_device != nullptr); } HALIDE_ALWAYS_INLINE ~VulkanContext() { From 66803caf8bf8daa2237c9322529ab96a38b99b76 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 29 Sep 2022 14:31:51 -0700 Subject: [PATCH 026/166] Cache Vulkan descriptor sets and other shader module objects in compilation cache for reuse --- src/runtime/vulkan.cpp | 168 ++++++++++++++++++--------------- src/runtime/vulkan_context.h | 4 +- src/runtime/vulkan_internal.h | 12 ++- src/runtime/vulkan_resources.h | 126 +++++++++++++++++-------- 4 files changed, 192 insertions(+), 118 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 84c65bb6ffb0..e448432d4708 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -128,8 +128,8 @@ WEAK int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, #endif debug(user_context) << "halide_vulkan_initialize_kernels got 
compilation_cache mutex.\n"; - VkShaderModule *shader_module = nullptr; - if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.device, shader_module, + VulkanCompilationCacheEntry *cache_entry = nullptr; + if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.device, cache_entry, Halide::Runtime::Internal::Vulkan::vk_compile_shader_module, user_context, ctx.allocator, src, size)) { return halide_error_code_generic_error; @@ -591,96 +591,117 @@ WEAK int halide_vulkan_run(void *user_context, #endif // Running a Vulkan pipeline requires a large number of steps - // and boilerplate: - // 1. Create a descriptor set layout - // 1a. Create the buffer for the scalar params - // 2. Create a pipeline layout - // 3. Create a compute pipeline - // --- The above can be cached between invocations --- - // 4. Create a descriptor set - // 5. Set bindings for buffers in the descriptor set - // 6. Create a command buffer from the command pool - // 7. Fill the command buffer with a dispatch call - // 7a. Bind the compute pipeline from #3 - // 7b. Bind the descriptor set - // 7c. Add a dispatch to the command buffer - // 7d. End the command buffer - // 8. Submit the command buffer to our command queue + // and boilerplate. We save pipeline specific objects alongside the + // shader module in the compilation cache to avoid re-creating these + // if used more than once. + // + // 1. Lookup the shader module cache entry in the compilation cache + // 1a. If shader module doesn't exist yet, then lookup invokes compile + // 2. If the rest of the cache entry is uninitialized, then create new objects: + // 2a. Create a descriptor set layout + // 2b. Create the buffer for the scalar params + // 2c. Create a pipeline layout + // 2d. Create a compute pipeline + // 2e. Create a descriptor set + // --- The above can be cached between invocations --- + // 3. Set bindings for buffers in the descriptor set + // 3a. Copy args into uniform buffer + // 3b. Update buffer bindings for descriptor set + // 4. Create a command buffer from the command pool + // 5. Fill the command buffer with a dispatch call + // 7a. Bind the compute pipeline + // 7b. Bind the descriptor set + // 7c. Add a dispatch to the command buffer + // 7d. End the command buffer + // 6. Submit the command buffer to our command queue // --- The following isn't best practice, but it's in line // with what we do in Metal etc. --- - // 9. Wait until the queue is done with the command buffer - // 10. Cleanup all temporary objects + // 7. Wait until the queue is done with the command buffer + // 8. Cleanup all temporary objects - uint32_t num_bindings = vk_count_bindings_for_descriptor_set(user_context, arg_sizes, args, arg_is_buffer); + //// 3. Create a compute pipeline + // Get the shader module + VulkanCompilationCacheEntry *cache_entry = nullptr; + bool found = compilation_cache.lookup(ctx.device, state_ptr, cache_entry); + halide_abort_if_false(user_context, found); + if (cache_entry == nullptr) { + error(user_context) << "Vulkan: Failed to locate shader module! Unable to proceed!\n"; + return halide_error_code_internal_error; + } //// 1. Create a descriptor set layout - VkDescriptorSetLayout descriptor_set_layout; - VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, arg_sizes, args, arg_is_buffer, &descriptor_set_layout); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_descriptor_set_layout() failed! Unable to create shader module! 
Error: " << vk_get_error_name(result) << "\n"; - return result; + if (cache_entry->descriptor_set_layout == 0) { + cache_entry->bindings_count = vk_count_bindings_for_descriptor_set(user_context, arg_sizes, args, arg_is_buffer); + cache_entry->buffer_count = (cache_entry->bindings_count - 1); // first binding is args packed into a param buffer, all others are halide buffers + VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, arg_sizes, args, arg_is_buffer, &(cache_entry->descriptor_set_layout)); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; + return result; + } } //// 1a. Create a buffer for the scalar parameters - // First allocate memory, then map it and copy params, then create a buffer and bind the allocation - MemoryRegion *scalar_args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, arg_sizes, args, arg_is_buffer); - if (scalar_args_region == nullptr) { - error(user_context) << "Vulkan: vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; - return result; - } - - VkBuffer *scalar_args_buffer = reinterpret_cast(scalar_args_region->handle); - if (scalar_args_buffer == nullptr) { - error(user_context) << "Vulkan: Failed to retrieve scalar args buffer for device memory!\n"; - return halide_error_code_internal_error; + if (cache_entry->args_region == nullptr) { + cache_entry->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, arg_sizes, args, arg_is_buffer); + if (cache_entry->args_region == nullptr) { + error(user_context) << "Vulkan: vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; + return halide_error_code_internal_error; + } } ///// 2. Create a pipeline layout - VkPipelineLayout pipeline_layout; - result = vk_create_pipeline_layout(user_context, ctx.allocator, &descriptor_set_layout, &pipeline_layout); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_pipeline_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; - return result; - } - - //// 3. Create a compute pipeline - // Get the shader module - VkShaderModule *shader_module = nullptr; - bool found = compilation_cache.lookup(ctx.device, state_ptr, shader_module); - halide_abort_if_false(user_context, found); - if (shader_module == nullptr) { - error(user_context) << "Vulkan: Failed to locate shader module! Unable to proceed!\n"; - return halide_error_code_internal_error; + if (cache_entry->pipeline_layout == 0) { + VkResult result = vk_create_pipeline_layout(user_context, ctx.allocator, &(cache_entry->descriptor_set_layout), &(cache_entry->pipeline_layout)); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vk_create_pipeline_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; + return halide_error_code_internal_error; + } } // Construct the pipeline - VkPipeline compute_pipeline; - result = vk_create_compute_pipeline(user_context, ctx.allocator, entry_name, *shader_module, pipeline_layout, &compute_pipeline); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_compute_pipeline() failed! Unable to proceed! 
Error: " << vk_get_error_name(result) << "\n"; - return result; + if (cache_entry->compute_pipeline == 0) { + VkResult result = vk_create_compute_pipeline(user_context, ctx.allocator, entry_name, cache_entry->shader_module, cache_entry->pipeline_layout, &(cache_entry->compute_pipeline)); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + return halide_error_code_internal_error; + } } //// 4. Create a descriptor set - // Construct a descriptor pool - VkDescriptorPool descriptor_pool; - uint32_t storage_buffer_count = num_bindings - 1; - result = vk_create_descriptor_pool(user_context, ctx.allocator, storage_buffer_count, &descriptor_pool); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return result; + if (cache_entry->descriptor_set == 0) { + + // Construct a descriptor pool + // + // NOTE: while this could be re-used across multiple pipelines, we only know the storage requirements of this kernel's + // inputs and outputs ... so create a pool specific to the number of buffers known at this time + VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, cache_entry->buffer_count, &(cache_entry->descriptor_pool)); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + return result; + } + + // Create the descriptor set + result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layout, cache_entry->descriptor_pool, &(cache_entry->descriptor_set)); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + return result; + } } - // Create the descriptor set - VkDescriptorSet descriptor_set; - result = vk_create_descriptor_set(user_context, ctx.allocator, descriptor_set_layout, descriptor_pool, &descriptor_set); + //// 5. Update uniform args and bindings for buffers in the descriptor set + VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, cache_entry->args_region, arg_sizes, args, arg_is_buffer); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + debug(user_context) << "Vulkan: vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } - //// 5. Set bindings for buffers in the descriptor set - result = vk_update_descriptor_set(user_context, ctx.allocator, *scalar_args_buffer, storage_buffer_count, arg_sizes, args, arg_is_buffer, descriptor_set); + VkBuffer *args_buffer = reinterpret_cast(cache_entry->args_region->handle); + if (args_buffer == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve scalar args buffer for device memory!\n"; + return halide_error_code_internal_error; + } + + result = vk_update_descriptor_set(user_context, ctx.allocator, *args_buffer, cache_entry->buffer_count, arg_sizes, args, arg_is_buffer, cache_entry->descriptor_set); if (result != VK_SUCCESS) { debug(user_context) << "Vulkan: vk_update_descriptor_set() failed! Unable to proceed! 
Error: " << vk_get_error_name(result) << "\n"; return result; @@ -696,7 +717,7 @@ WEAK int halide_vulkan_run(void *user_context, //// 7. Begin the command buffer result = vk_fill_command_buffer_with_dispatch_call(user_context, - ctx.device, command_buffer, compute_pipeline, pipeline_layout, descriptor_set, + ctx.device, command_buffer, cache_entry->compute_pipeline, cache_entry->pipeline_layout, cache_entry->descriptor_set, blocksX, blocksY, blocksZ); if (result != VK_SUCCESS) { debug(user_context) << "Vulkan: vk_fill_command_buffer_with_dispatch_call() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; @@ -718,13 +739,6 @@ WEAK int halide_vulkan_run(void *user_context, } //// 10. Cleanup - // Release all temporary objects for this run - vk_destroy_scalar_uniform_buffer(user_context, ctx.allocator, scalar_args_region); - vk_destroy_descriptor_set_layout(user_context, ctx.allocator, descriptor_set_layout); - vk_destroy_descriptor_pool(user_context, ctx.allocator, descriptor_pool); - vk_destroy_pipeline_layout(user_context, ctx.allocator, pipeline_layout); - vk_destroy_compute_pipeline(user_context, ctx.allocator, compute_pipeline); - vkResetCommandPool(ctx.device, ctx.command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); #ifdef DEBUG_RUNTIME diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index fd3f6a2444c8..2b8c8bb85240 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -59,9 +59,9 @@ class VulkanContext { int result = halide_vulkan_acquire_context(user_context, reinterpret_cast(&allocator), &instance, &device, &physical_device, &command_pool, &queue, &queue_family_index); - if(result != halide_error_code_success) { + if (result != halide_error_code_success) { halide_error_no_device_interface(user_context); - } + } halide_debug_assert(user_context, allocator != nullptr); halide_debug_assert(user_context, instance != nullptr); halide_debug_assert(user_context, device != nullptr); diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index d472bf26eb9a..3befa4dbf797 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -23,6 +23,7 @@ namespace Vulkan { // Declarations class VulkanMemoryAllocator; +struct VulkanCompilationCacheEntry; // -------------------------------------------------------------------------- @@ -107,6 +108,13 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, void *args[], int8_t arg_is_buffer[]); +VkResult vk_update_scalar_uniform_buffer(void *user_context, + VulkanMemoryAllocator *allocator, + MemoryRegion *region, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); + void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, MemoryRegion *scalar_args_region); // -- Descriptor Pool @@ -174,8 +182,8 @@ VkResult vk_destroy_compute_pipeline(void *user_context, VkPipeline compute_pipeline); // -- Shader Module -VkShaderModule *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, - const char *src, int size); +VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, + const char *src, int size); int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocator); diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 6fa8f9472d6a..7cdaeb111dfa 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -12,7 +12,19 @@ namespace Internal { 
namespace Vulkan { // Compilation cache for compiled shader modules -WEAK Halide::Internal::GPUCompilationCache compilation_cache; +struct VulkanCompilationCacheEntry { + VkShaderModule shader_module = {0}; + VkDescriptorPool descriptor_pool = {0}; + VkDescriptorSet descriptor_set = {0}; + VkDescriptorSetLayout descriptor_set_layout = {0}; + VkPipelineLayout pipeline_layout = {0}; + VkPipeline compute_pipeline = {0}; + MemoryRegion *args_region = nullptr; + uint32_t bindings_count = 0; + uint32_t buffer_count = 0; +}; + +WEAK Halide::Internal::GPUCompilationCache compilation_cache; // -------------------------------------------------------------------------- @@ -154,14 +166,14 @@ uint32_t vk_count_bindings_for_descriptor_set(void *user_context, void *args[], int8_t arg_is_buffer[]) { int i = 0; - uint32_t num_bindings = 1; // first binding is for passing scalar parameters in a buffer + uint32_t bindings_count = 1; // first binding is for passing scalar parameters in a buffer while (arg_sizes[i] > 0) { if (arg_is_buffer[i]) { - num_bindings++; + bindings_count++; } i++; } - return num_bindings; + return bindings_count; } // -- @@ -177,25 +189,22 @@ VkResult vk_create_descriptor_pool(void *user_context, << "device: " << (void *)allocator->current_device() << ", " << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; #endif - static const uint32_t uniform_buffer_count = 1; // all scalar args are packed into one uniform buffer - VkDescriptorPoolSize descriptor_pool_sizes[2] = { { VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type - uniform_buffer_count // how many + 1 // all kernel args are packed into one uniform buffer }, { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // descriptor type - storage_buffer_count // how many + storage_buffer_count // all halide buffers are passed as storage buffers }}; - uint32_t descriptor_set_count = (uniform_buffer_count + storage_buffer_count); VkDescriptorPoolCreateInfo descriptor_pool_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, // struct type nullptr, // point to struct extending this 0, // flags - descriptor_set_count, // max number of sets that can be allocated TODO:should this be 1? + 1, // this pool will only be used for creating one descriptor set! 
2, // pool size count descriptor_pool_sizes // ptr to descriptr pool sizes }; @@ -478,27 +487,47 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, return nullptr; } + // return the allocated region for the uniform buffer + return region; +} + +VkResult vk_update_scalar_uniform_buffer(void *user_context, + VulkanMemoryAllocator *allocator, + MemoryRegion *region, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_update_scalar_uniform_buffer (user_context: " << user_context << ", " + << "region: " << (void *)region << ")\n"; +#endif + + if ((region == nullptr) || (region->handle == nullptr)) { + error(user_context) << "Vulkan: Invalid memory region used for updating scalar uniform buffer!\n"; + return VK_INCOMPLETE; + } + // map the region to a host ptr - uint8_t *scalar_buffer_host_ptr = (uint8_t *)allocator->map(user_context, region); - if (scalar_buffer_host_ptr == nullptr) { + uint8_t *host_ptr = (uint8_t *)allocator->map(user_context, region); + if (host_ptr == nullptr) { error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; - return nullptr; + return VK_INCOMPLETE; } // copy to the (host-visible/coherent) scalar uniform buffer - size_t scalar_arg_offset = 0; + size_t arg_offset = 0; for (size_t i = 0; arg_sizes[i] > 0; i++) { if (!arg_is_buffer[i]) { - memcpy(scalar_buffer_host_ptr + scalar_arg_offset, args[i], arg_sizes[i]); - scalar_arg_offset += arg_sizes[i]; + memcpy(host_ptr + arg_offset, args[i], arg_sizes[i]); + arg_offset += arg_sizes[i]; } } // unmap the pointer to the buffer for the region allocator->unmap(user_context, region); - - // return the allocated region for the uniform buffer - return region; + return VK_SUCCESS; } void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, @@ -631,8 +660,8 @@ VkResult vk_destroy_compute_pipeline(void *user_context, // -------------------------------------------------------------------------- -VkShaderModule *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, - const char *src, int size) { +VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, + const char *src, int size) { #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: vk_compile_shader_module (user_context: " << user_context << ", " @@ -653,16 +682,17 @@ VkShaderModule *vk_compile_shader_module(void *user_context, VulkanMemoryAllocat }; VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; - VkShaderModule *shader_module = (VkShaderModule *)vk_host_malloc(user_context, sizeof(VkShaderModule), 0, alloc_scope, allocator->callbacks()); - if (shader_module == nullptr) { - error(user_context) << "Vulkan: Failed to allocatoe shader module! Out of memory!\n"; + VulkanCompilationCacheEntry *cache_entry = (VulkanCompilationCacheEntry *)vk_host_malloc(user_context, sizeof(VulkanCompilationCacheEntry), 0, alloc_scope, allocator->callbacks()); + if (cache_entry == nullptr) { + error(user_context) << "Vulkan: Failed to allocate compilation cache entry! 
Out of memory!\n"; return nullptr; } + memset(cache_entry, 0, sizeof(VulkanCompilationCacheEntry)); - VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), shader_module); - if ((result != VK_SUCCESS) || (shader_module == nullptr)) { + VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &cache_entry->shader_module); + if ((result != VK_SUCCESS)) { // } || (cache_entry->shader_module == nullptr)) { error(user_context) << "Vulkan: vkCreateShaderModule Failed! Error returned: " << vk_get_error_name(result) << "\n"; - vk_host_free(user_context, shader_module, allocator->callbacks()); + vk_host_free(user_context, cache_entry, allocator->callbacks()); return nullptr; } @@ -671,7 +701,7 @@ VkShaderModule *vk_compile_shader_module(void *user_context, VulkanMemoryAllocat debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return shader_module; + return cache_entry; } int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocator) { @@ -688,22 +718,44 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat // Functor to match compilation cache destruction call with scoped params struct DestroyShaderModule { void *user_context = nullptr; - VkDevice device = nullptr; - const VkAllocationCallbacks *allocation_callbacks = nullptr; + VulkanMemoryAllocator *allocator = nullptr; - DestroyShaderModule(void *ctx, VkDevice dev, const VkAllocationCallbacks *callbacks) - : user_context(ctx), device(dev), allocation_callbacks(callbacks) { + DestroyShaderModule(void *ctx, VulkanMemoryAllocator *allocator) + : user_context(ctx), allocator(allocator) { } - void operator()(VkShaderModule *shader_module) { - if (shader_module != nullptr) { - vkDestroyShaderModule(device, *shader_module, allocation_callbacks); - vk_host_free(user_context, shader_module, allocation_callbacks); + void operator()(VulkanCompilationCacheEntry *cache_entry) { + if (cache_entry != nullptr) { + if (cache_entry->shader_module) { + vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); + cache_entry->shader_module = {0}; + } + if (cache_entry->args_region) { + vk_destroy_scalar_uniform_buffer(user_context, allocator, cache_entry->args_region); + cache_entry->args_region = nullptr; + } + if (cache_entry->descriptor_set_layout) { + vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layout); + cache_entry->descriptor_set_layout = {0}; + } + if (cache_entry->descriptor_pool) { + vk_destroy_descriptor_pool(user_context, allocator, cache_entry->descriptor_pool); + cache_entry->descriptor_pool = {0}; + } + if (cache_entry->pipeline_layout) { + vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); + cache_entry->pipeline_layout = {0}; + } + if (cache_entry->compute_pipeline) { + vk_destroy_compute_pipeline(user_context, allocator, cache_entry->compute_pipeline); + cache_entry->compute_pipeline = {0}; + } + vk_host_free(user_context, cache_entry, allocator->callbacks()); } } }; - DestroyShaderModule module_destructor(user_context, allocator->current_device(), allocator->callbacks()); + DestroyShaderModule module_destructor(user_context, allocator); compilation_cache.delete_context(user_context, allocator->current_device(), module_destructor); #ifdef DEBUG_RUNTIME From 25788051af937cb817945846522bc0c618dd3ced Mon Sep 17 00:00:00 2001 From: Derek Gerstmann 
Date: Thu, 29 Sep 2022 14:49:39 -0700 Subject: [PATCH 027/166] Replace platform specific strncpy for grabbing Extension strings with StringUtils::copy_upto --- src/runtime/internal/string_storage.h | 7 +++++++ src/runtime/vulkan_extensions.h | 14 +++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/runtime/internal/string_storage.h b/src/runtime/internal/string_storage.h index 21f20981dc33..d5b16e8b1a84 100644 --- a/src/runtime/internal/string_storage.h +++ b/src/runtime/internal/string_storage.h @@ -67,6 +67,13 @@ struct StringUtils { } return size_t(ptr - str); } + + static size_t copy_upto(char* dst, const char* src, size_t max_chars) { + size_t length = count_length(src, max_chars); + memcpy(dst, src, length); + dst[length] = '\0'; + return length; + } }; // -- diff --git a/src/runtime/vulkan_extensions.h b/src/runtime/vulkan_extensions.h index 343e66238ac5..75bf2e415524 100644 --- a/src/runtime/vulkan_extensions.h +++ b/src/runtime/vulkan_extensions.h @@ -34,10 +34,9 @@ namespace { void vk_set_layer_names_internal(const char *n) { if (n) { size_t buffer_size = sizeof(layer_names) / sizeof(layer_names[0]); - strncpy(layer_names, n, buffer_size); - layer_names[buffer_size - 1] = '\0'; + StringUtils::copy_upto(layer_names, n, buffer_size); } else { - layer_names[0] = '\0'; + layer_names[0] = 0; } layer_names_initialized = true; } @@ -56,8 +55,7 @@ const char *vk_get_layer_names_internal(void *user_context) { void vk_set_extension_names_internal(const char *n) { if (n) { size_t buffer_size = sizeof(extension_names) / sizeof(extension_names[0]); - strncpy(extension_names, n, buffer_size); - extension_names[buffer_size - 1] = 0; + StringUtils::copy_upto(extension_names, n, buffer_size); } else { extension_names[0] = 0; } @@ -75,8 +73,7 @@ const char *vk_get_extension_names_internal(void *user_context) { void vk_set_device_type_internal(const char *n) { if (n) { size_t buffer_size = sizeof(device_type) / sizeof(device_type[0]); - strncpy(device_type, n, buffer_size); - device_type[buffer_size - 1] = 0; + StringUtils::copy_upto(device_type, n, buffer_size); } else { device_type[0] = 0; } @@ -94,8 +91,7 @@ const char *vk_get_device_type_internal(void *user_context) { void vk_set_build_options_internal(const char *n) { if (n) { size_t buffer_size = sizeof(build_options) / sizeof(build_options[0]); - strncpy(build_options, n, buffer_size); - build_options[buffer_size - 1] = 0; + StringUtils::copy_upto(build_options, n, buffer_size); } else { build_options[0] = 0; } From 23cc23b816b1132fb58affabb78cbb851e3db558 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 29 Sep 2022 16:50:33 -0700 Subject: [PATCH 028/166] Enable device features for selected device --- src/runtime/internal/string_storage.h | 2 +- src/runtime/vulkan_context.h | 9 +++++++-- src/runtime/vulkan_extensions.h | 8 ++++++-- src/runtime/vulkan_functions.h | 2 ++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/runtime/internal/string_storage.h b/src/runtime/internal/string_storage.h index d5b16e8b1a84..83b3acc13623 100644 --- a/src/runtime/internal/string_storage.h +++ b/src/runtime/internal/string_storage.h @@ -68,7 +68,7 @@ struct StringUtils { return size_t(ptr - str); } - static size_t copy_upto(char* dst, const char* src, size_t max_chars) { + static size_t copy_upto(char *dst, const char *src, size_t max_chars) { size_t length = count_length(src, max_chars); memcpy(dst, src, length); dst[length] = '\0'; diff --git a/src/runtime/vulkan_context.h 
b/src/runtime/vulkan_context.h index 2b8c8bb85240..15249d9cdce3 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -229,7 +229,7 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk } // enable all available optional extensions - debug(user_context) << "Vulkan: Found " << (uint32_t)optional_device_extensions.size() << " optional extensions for device!\n"; + debug(user_context) << "Vulkan: Checking for " << (uint32_t)optional_device_extensions.size() << " optional extensions for device ...\n"; for (int n = 0; n < (int)optional_device_extensions.size(); ++n) { if (supported_device_extensions.contains(optional_device_extensions[n])) { debug(user_context) << " optional extension: " << optional_device_extensions[n] << "\n"; @@ -247,6 +247,11 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk &queue_priority, }; + // Get the device features so that all supported features are enabled when device is created + // + VkPhysicalDeviceFeatures device_features; + vkGetPhysicalDeviceFeatures(*physical_device, &device_features); + VkDeviceCreateInfo device_create_info = { VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, nullptr, // Next @@ -255,7 +260,7 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk &device_queue_create_info, (uint32_t)requested_layers.size(), requested_layers.data(), // Layers (uint32_t)required_device_extensions.size(), required_device_extensions.data(), // Enabled extensions - nullptr, // VkPhysicalDeviceFeatures + &device_features, // Requested device features }; VkResult result = vkCreateDevice(*physical_device, &device_create_info, alloc_callbacks, device); diff --git a/src/runtime/vulkan_extensions.h b/src/runtime/vulkan_extensions.h index 75bf2e415524..7f69a17426ee 100644 --- a/src/runtime/vulkan_extensions.h +++ b/src/runtime/vulkan_extensions.h @@ -169,8 +169,12 @@ uint32_t vk_get_required_device_extensions(void *user_context, StringTable &ext_ } uint32_t vk_get_optional_device_extensions(void *user_context, StringTable &ext_table) { - const uint32_t optional_ext_count = 1; - const char *optional_ext_table[] = {"VK_KHR_portability_subset"}; + const char *optional_ext_table[] = { + "VK_KHR_portability_subset", //< necessary for running under Molten (aka Vulkan on Mac) + "VK_KHR_16bit_storage", + "VK_KHR_shader_float16_int8", + "VK_KHR_shader_float_controls"}; + const uint32_t optional_ext_count = sizeof(optional_ext_table) / sizeof(optional_ext_table[0]); ext_table.resize(user_context, optional_ext_count); for (uint32_t n = 0; n < optional_ext_count; ++n) { diff --git a/src/runtime/vulkan_functions.h b/src/runtime/vulkan_functions.h index 0cb42c2be080..6536811e71ee 100644 --- a/src/runtime/vulkan_functions.h +++ b/src/runtime/vulkan_functions.h @@ -13,8 +13,10 @@ VULKAN_FN(vkDestroyPipelineLayout) VULKAN_FN(vkDestroyShaderModule) VULKAN_FN(vkEnumerateDeviceExtensionProperties) VULKAN_FN(vkEnumeratePhysicalDevices) +VULKAN_FN(vkGetPhysicalDeviceFeatures) VULKAN_FN(vkGetPhysicalDeviceProperties) VULKAN_FN(vkGetPhysicalDeviceQueueFamilyProperties) +VULKAN_FN(vkGetBufferMemoryRequirements) VULKAN_FN(vkCreateShaderModule) VULKAN_FN(vkCreateDescriptorSetLayout) VULKAN_FN(vkCreatePipelineLayout) From 6e1e5d9b604d757eaa66122219b3dc4a07fe6853 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 30 Sep 2022 12:41:15 -0700 Subject: [PATCH 029/166] Fix alignment constraints for to match Vulkan buffer memory requirements. 
Add env vars to control Vulkan Memory Allocator config. --- src/runtime/internal/memory_resources.h | 8 ++++ src/runtime/internal/region_allocator.h | 17 +++++--- src/runtime/vulkan_memory.h | 52 ++++++++++++++++++++++++- 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index 455ce43ab277..ad72ec5dc689 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -55,6 +55,7 @@ struct MemoryProperties { MemoryVisibility visibility = MemoryVisibility::InvalidVisibility; MemoryUsage usage = MemoryUsage::InvalidUsage; MemoryCaching caching = MemoryCaching::InvalidCaching; + size_t alignment = 0; //< required alignment of allocations (zero for no constraint) }; // Client-facing struct for exchanging memory block allocation requests @@ -111,6 +112,13 @@ ALWAYS_INLINE size_t aligned_offset(size_t offset, size_t alignment) { return (offset + (alignment - 1)) & ~(alignment - 1); } +// Returns a suitable alignment such that requested alignment is a suitable +// integer multiple of the required alignment +ALWAYS_INLINE size_t conform_alignment(size_t requested, size_t required) { + size_t alignment = max(requested, required); + return (alignment > required) ? (required * ((alignment / required) + 1)) : alignment; +} + // Returns a padded size to accomodate an adjusted offset due to alignment constraints // -- Alignment must be power of two! ALWAYS_INLINE size_t aligned_size(size_t offset, size_t size, size_t alignment) { diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index e01a94699ed6..145a7093bacb 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -197,7 +197,8 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory continue; } - size_t actual_size = aligned_size(block_region->memory.offset, request.size, request.alignment); + size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); + size_t actual_size = aligned_size(block_region->memory.offset, request.size, actual_alignment); // is the adjusted size larger than the current region? 
if (actual_size > block_region->memory.size) { @@ -270,10 +271,15 @@ bool RegionAllocator::can_split(BlockRegion *block_region, size_t size) { } BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { - size_t adjusted_size = aligned_size(block_region->memory.offset, size, alignment); - size_t adjusted_offset = aligned_offset(block_region->memory.offset, alignment); + alignment = conform_alignment(alignment, block->memory.properties.alignment); + + debug(nullptr) << "RegionAllocator: Conforming alignment (" + << "requested=" << (uint32_t)alignment << " " + << "required=" << (uint32_t)block->memory.properties.alignment << " " + << "actual=" << (uint32_t)alignment << ")\n"; - size_t empty_offset = adjusted_offset + size; + size_t adjusted_size = aligned_size(block_region->memory.offset, size, alignment); + size_t adjusted_offset = aligned_offset(block_region->memory.offset + size, alignment); size_t empty_size = block_region->memory.size - adjusted_size; #ifdef DEBUG_INTERNAL @@ -285,7 +291,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion BlockRegion *next_region = block_region->next_ptr; BlockRegion *empty_region = create_block_region(user_context, block_region->memory.properties, - empty_offset, empty_size, + adjusted_offset, empty_size, block_region->memory.dedicated); halide_abort_if_false(user_context, empty_region != nullptr); @@ -304,6 +310,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)offset << " " << "size=" << (uint32_t)size << " " + << "alignment=" << (uint32_t)properties.alignment << " " << "dedicated=" << (dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(properties.usage) << " " << "caching=" << halide_memory_caching_name(properties.caching) << " " diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index d911389e0715..dc5b15ffeee6 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -336,6 +336,35 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc return; } + uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties); + + VkBufferCreateInfo create_info = { + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type + nullptr, // struct extending this + 0, // create flags + block->size, // buffer size (in bytes) + usage_flags, // buffer usage flags + VK_SHARING_MODE_EXCLUSIVE, // sharing mode + 0, nullptr}; + + // Create a buffer to determine alignment requirements + VkBuffer buffer = {0}; + result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, &buffer); + if (result != VK_SUCCESS) { + error(nullptr) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; + return; + } + + VkMemoryRequirements memory_requirements = {0}; + vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements); + vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); + debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" + << "size=" << (uint32_t)block->size << ", " + << "alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "dedicated=" << (block->dedicated ? 
"true" : "false") << ")\n"; + + block->properties.alignment = memory_requirements.alignment; block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; @@ -675,8 +704,29 @@ VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, SystemMemoryAllocatorFns system_allocator = {vk_system_malloc, vk_system_free}; + VulkanMemoryConfig config = memory_allocator_config; + + const char* min_block_size_env = getenv("HL_VK_MIN_BLOCK_SIZE"); + const char* max_block_size_env = getenv("HL_VK_MAX_BLOCK_SIZE"); + const char* max_block_count_env = getenv("HL_VK_MAX_BLOCK_COUNT"); + + if(!StringUtils::is_empty(min_block_size_env)) { + config.minimum_block_size = atoi(min_block_size_env) * 1024 * 1024; + debug(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.minimum_block_size << " for minimum block size (in bytes)\n"; + } + + if(!StringUtils::is_empty(max_block_size_env)) { + config.maximum_block_size = atoi(max_block_size_env) * 1024 * 1024; + debug(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_size << " for maximum block size (in bytes)\n"; + } + + if(!StringUtils::is_empty(max_block_count_env)) { + config.maximum_block_count = atoi(max_block_count_env) ; + debug(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_count << " for maximum block count\n"; + } + return VulkanMemoryAllocator::create(user_context, - memory_allocator_config, device, physical_device, + config, device, physical_device, system_allocator, alloc_callbacks); } From ec7fa05a8eb1ac5e12f58169f45a89f18544490a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 30 Sep 2022 12:42:56 -0700 Subject: [PATCH 030/166] Add Vulkan to list of supported APIs in README.md Add Vulkan specific README_vulkan.md --- README.md | 2 +- README_vulkan.md | 571 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 572 insertions(+), 1 deletion(-) create mode 100644 README_vulkan.md diff --git a/README.md b/README.md index 7ae0b66ecdff..8bfdd97630f9 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ currently targets: - CPU architectures: X86, ARM, MIPS, Hexagon, PowerPC, RISC-V - Operating systems: Linux, Windows, macOS, Android, iOS, Qualcomm QuRT - GPU Compute APIs: CUDA, OpenCL, OpenGL Compute Shaders, Apple Metal, Microsoft - Direct X 12 + Direct X 12, Vulkan Rather than being a standalone programming language, Halide is embedded in C++. This means you write C++ code that builds an in-memory representation of a diff --git a/README_vulkan.md b/README_vulkan.md new file mode 100644 index 000000000000..a2937cc2f175 --- /dev/null +++ b/README_vulkan.md @@ -0,0 +1,571 @@ +# Vulkan Support for Halide + +Halide supports the Khronos Vulkan framework as a compute API backend for GPU-like +devices, and compiles directly to a binary SPIR-V representation as part of its +code generation before submitting it to the Vulkan API. Both JIT and AOT usage +are supported via the `vulkan` target flag (eg `HL_JIT_TARGET=host-vulkan`). + +Vulkan support is actively under development, and considered *EXPERIMENTAL* +at this stage. Basic tests are passing, but there's still work to do to +until we have adequate feature parity for production use. + +See [below](#current-status) for details on specific test cases. + +# Compiling Halide w/Vulkan Support + +You'll need to configure Halide and enable the cmake option TARGET_VULKAN. 
+
+For example, on Linux & OSX:
+
+```
+% cmake -G Ninja -DTARGET_VULKAN=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_DIR=$LLVM_ROOT/lib/cmake/llvm -S . -B build
+% cmake --build build
+```
+
+
+# Vulkan Runtime Environment:
+
+Halide has no direct dependency on Vulkan for code-generation, but the runtime
+requires a working Vulkan environment to run Halide-generated code. Any valid
+Vulkan v1.0+ device driver should work.
+
+Specifically, you'll need:
+
+- A vendor-specific Vulkan device driver
+- The generic Vulkan loader library
+
+For AMD & NVIDIA & Intel devices, download and install the latest graphics driver
+for your platform. Vulkan support should be included.
+
+## Windows
+
+AMD:
+https://www.amd.com/en/technologies/vulkan
+
+NVIDIA:
+https://developer.nvidia.com/vulkan-driver
+
+INTEL:
+https://www.intel.com/content/www/us/en/download-center/home.html
+
+
+## Linux
+
+On Ubuntu Linux, proprietary drivers can be installed via 'apt' using
+PPA's for each vendor.
+
+For AMD:
+```
+$ sudo add-apt-repository ppa:oibaf/graphics-drivers
+$ sudo apt update
+$ sudo apt upgrade
+$ sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-utils
+```
+
+For NVIDIA:
+```
+$ sudo add-apt-repository ppa:graphics-drivers/ppa
+$ sudo apt update
+$ sudo apt upgrade
+# - replace ### with latest driver release (eg 515)
+$ sudo apt install nvidia-driver-### nvidia-settings vulkan vulkan-utils
+```
+
+Note that only valid drivers for your system should be installed since there have been
+reports of the Vulkan loader segfaulting just by having a non-supported driver present.
+Specifically, the seemingly generic `mesa-vulkan-drivers` actually includes the AMD
+graphics driver, which can cause problems if installed on an NVIDIA-only system.
+
+## Mac
+
+You're better off using Halide's Metal backend instead, but it is possible to run
+Vulkan apps on a Mac via the MoltenVK library:
+
+MoltenVK:
+https://github.com/KhronosGroup/MoltenVK
+
+# Testing Your Vulkan Environment
+
+You can validate that everything is configured correctly by running the `vulkaninfo`
+app (bundled in the vulkan-utils package) to make sure your device is detected (eg):
+
+```
+$ vulkaninfo
+==========
+VULKANINFO
+==========
+
+Vulkan Instance Version: 1.3.224
+
+
+Instance Extensions: count = 19
+===============================
+ ...
+
+Layers: count = 10
+==================
+VK_LAYER_KHRONOS_profiles (Khronos Profiles layer) Vulkan version 1.3.224, layer version 1:
+ Layer Extensions: count = 0
+ Devices: count = 1
+ GPU id = 0 (NVIDIA GeForce RTX 3070 Ti)
+ Layer-Device Extensions: count = 1
+
+...
+
+```
+
+Make sure everything looks correct before continuing!
+
+# Targeting Vulkan
+
+To generate Halide code for Vulkan, simply add the `vulkan` flag to your target.
+
+For AOT generators add `vulkan` to the target command line option:
+
+```
+$ ./lesson_15_generate -g my_first_generator -o . target=host-vulkan
+```
+
+For JIT apps use the `HL_JIT_TARGET` environment variable:
+
+```
+$ HL_JIT_TARGET=host-vulkan ./tutorial/lesson_01_basics
+```
+
+# Useful Environment Variables
+
+`HL_VK_LAYERS=...` will tell Halide to choose a suitable Vulkan instance
+that supports the given list of layers. If not set, `VK_INSTANCE_LAYERS=...`
+will be used instead. If neither are present, Halide will use the first
+Vulkan compute device it can find.
+
+`HL_VK_DEVICE_TYPE=...` will tell Halide to choose which type of device
+to select for creating the Vulkan instance. Valid options are 'gpu',
+'discrete-gpu', 'integrated-gpu', 'virtual-gpu', or 'cpu'. If not set,
+Halide will search for the first 'gpu'-like device it can find, or fall back
+to the first compute device it can find.
+
+`HL_VK_MIN_BLOCK_SIZE=N` will tell Halide to configure the Vulkan memory
+allocator to always request a minimum of N megabytes for a resource block,
+which will be used as a pool for suballocations. Increasing this value
+may improve performance while sacrificing the amount of available device
+memory. Default is 32MB.
+
+`HL_VK_MAX_BLOCK_SIZE=N` will tell Halide to configure the Vulkan memory
+allocator to never exceed a maximum of N megabytes for a resource block,
+which will be used as a pool for suballocations. Decreasing this value
+may free up more memory but may impact performance, and/or restrict
+allocations to be unusably small. Default is 0 ... meaning no limit.
+
+`HL_VK_MAX_BLOCK_COUNT=N` will tell Halide to configure the Vulkan memory
+allocator to never exceed a total of N block allocations. Decreasing this
+value may free up more memory but may impact performance, and/or restrict
+allocations. Default is 0 ... meaning no limit.
+
+`HL_DEBUG_CODEGEN=3` will print out debug info that includes the SPIR-V
+code generator used for Vulkan while it is compiling.
+
+`HL_SPIRV_DUMP_FILE=...` specifies a file to dump the binary SPIR-V generated
+during compilation. Useful for debugging CodeGen issues. Can be inspected,
+validated and disassembled via the SPIR-V tools:
+
+https://github.com/KhronosGroup/SPIRV-Tools
+
+
+# Known Limitations And Caveats
+
+- During CodeGen we enable capabilities in the SPIR-V binary for
+ Int8, Int16, Float16, and Float64 based on the Halide IR, assuming
+ the device will support these requirements. We may need to limit
+ these if targeting a lower class device.
+
+# Known TODO:
+
+- Fix as many tests as possible
+- Shared local memory and barriers need work
+- More platform support (e.g. Windows, Android, etc)
+- Better debugging utilities using the Vulkan debug hooks.
+- Allow debug symbols to be stripped from SPIR-V during codegen to reduce
+ memory overhead for large kernels.
+- Investigate floating point rounding and precision (v1.3 adds more controls)
+- Investigate memory model usage (are Halide's assumptions correct?)
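+
+As a combined example, the target flag and the allocator variables described
+above can be set together on one command line for any JIT app (the block size
+and count values here are only illustrative):
+
+```
+$ HL_JIT_TARGET=host-vulkan \
+  HL_VK_MIN_BLOCK_SIZE=64 \
+  HL_VK_MAX_BLOCK_COUNT=16 \
+  HL_DEBUG_CODEGEN=3 \
+  ./tutorial/lesson_01_basics
+```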
+ +# Current Status + +The following table outlines the state of the correctness tests (as of Sep-30) when +run with `HL_JIT_TARGET=host-vulkan` (NOTE: some tests may need additional +modifications to run under Vulkan): + +| Totals | *PASS* 🟢 | *FAIL* 🔴 | +| -- | -- | -- | +| | 300 | 65 | + + +| Test Name | Status | +| :-- | --: | +| correctness_cse_nan | *PASS* 🟢 | +| correctness_simd_op_check_hvx | *PASS* 🟢 | +| correctness_compute_with_in | *PASS* 🟢 | +| correctness_extern_stage | *PASS* 🟢 | +| correctness_pipeline_set_jit_externs_func | *PASS* 🟢 | +| correctness_likely | *PASS* 🟢 | +| correctness_vector_bounds_inference | *PASS* 🟢 | +| correctness_inline_reduction | *PASS* 🟢 | +| correctness_unsafe_promises | *PASS* 🟢 | +| correctness_reorder_rvars | *FAIL* 🔴 | +| correctness_lossless_cast | *PASS* 🟢 | +| correctness_gpu_reuse_shared_memory | *FAIL* 🔴 | +| correctness_boundary_conditions | *FAIL* 🔴 | +| correctness_min_extent | *PASS* 🟢 | +| correctness_gpu_sum_scan | *FAIL* 🔴 | +| correctness_dynamic_allocation_in_gpu_kernel | *FAIL* 🔴 | +| correctness_image_of_lists | *PASS* 🟢 | +| correctness_tracing_broadcast | *PASS* 🟢 | +| correctness_scatter | *PASS* 🟢 | +| correctness_stmt_to_html | *PASS* 🟢 | +| correctness_host_alignment | *PASS* 🟢 | +| correctness_custom_allocator | *PASS* 🟢 | +| correctness_issue_3926 | *PASS* 🟢 | +| correctness_compare_vars | *PASS* 🟢 | +| correctness_non_vector_aligned_embeded_buffer | *PASS* 🟢 | +| correctness_realize_larger_than_two_gigs | *PASS* 🟢 | +| correctness_gpu_transpose | *FAIL* 🔴 | +| correctness_side_effects | *PASS* 🟢 | +| correctness_logical | *FAIL* 🔴 | +| correctness_func_lifetime_2 | *PASS* 🟢 | +| correctness_device_crop | *FAIL* 🔴 | +| correctness_print_loop_nest | *PASS* 🟢 | +| correctness_bool_compute_root_vectorize | *FAIL* 🔴 | +| correctness_extract_concat_bits | *PASS* 🟢 | +| correctness_dead_realization_in_specialization | *PASS* 🟢 | +| correctness_undef | *FAIL* 🔴 | +| correctness_growing_stack | *PASS* 🟢 | +| correctness_parallel_scatter | *PASS* 🟢 | +| correctness_multi_splits_with_diff_tail_strategies | *PASS* 🟢 | +| correctness_gpu_arg_types | *PASS* 🟢 | +| correctness_cascaded_filters | *FAIL* 🔴 | +| correctness_trim_no_ops | *FAIL* 🔴 | +| correctness_float16_t_comparison | *PASS* 🟢 | +| correctness_legal_race_condition | *PASS* 🟢 | +| correctness_explicit_inline_reductions | *PASS* 🟢 | +| correctness_vector_tile | *PASS* 🟢 | +| correctness_skip_stages_memoize | *PASS* 🟢 | +| correctness_intrinsics | *PASS* 🟢 | +| correctness_strict_float | *PASS* 🟢 | +| correctness_bounds_query | *PASS* 🟢 | +| correctness_vector_reductions | *PASS* 🟢 | +| correctness_custom_lowering_| *PASS* 🟢 | | *PASS* 🟢 | +| correctness_gpu_assertion_in_kernel | *PASS* 🟢 | +| correctness_low_bit_depth_noise | *PASS* 🟢 | +| correctness_fuse | *FAIL* 🔴 | +| correctness_vector_cast | *FAIL* 🔴 | +| correctness_concat | *PASS* 🟢 | +| correctness_mod | *PASS* 🟢 | +| correctness_parallel_rvar | *PASS* 🟢 | +| correctness_make_struct | *PASS* 🟢 | +| correctness_reduction_predicate_racing | *PASS* 🟢 | +| correctness_bounds_inference_chunk | *PASS* 🟢 | +| correctness_realize_over_shifted_domain | *PASS* 🟢 | +| correctness_compute_at_split_rvar | *PASS* 🟢 | +| correctness_split_fuse_rvar | *PASS* 🟢 | +| correctness_memoize_cloned | *PASS* 🟢 | +| correctness_| *FAIL* 🔴 | _unroll | *PASS* 🟢 | +| correctness_gpu_vectorized_shared_memory | *PASS* 🟢 | +| correctness_bounds_inference_complex | *PASS* 🟢 | +| correctness_widening_reduction | *FAIL* 🔴 | +| 
correctness_extern_partial | *PASS* 🟢 | +| correctness_multi_output_pipeline_with_bad_sizes | *PASS* 🟢 | +| correctness_hoist_loop_invariant_if_statements | *PASS* 🟢 | +| correctness_extern_sort | *FAIL* 🔴 | +| correctness_multiple_outputs_extern | *PASS* 🟢 | +| correctness_tracing_bounds | *PASS* 🟢 | +| correctness_gpu_object_lifetime_1 | *PASS* 🟢 | +| correctness_nested_tail_strategies | *PASS* 🟢 | +| correctness_parallel_reductions | *PASS* 🟢 | +| correctness_custom_error_reporter | *PASS* 🟢 | +| correctness_many_dimensions | *PASS* 🟢 | +| correctness_predicated_store_load | *PASS* 🟢 | +| correctness_random | *PASS* 🟢 | +| correctness_partition_loops_bug | *PASS* 🟢 | +| correctness_stencil_chain_in_update_definitions | *PASS* 🟢 | +| correctness_inverse | *PASS* 🟢 | +| correctness_skip_stages | *PASS* 🟢 | +| correctness_cuda_8_bit_dot_product | *PASS* 🟢 | +| correctness_gpu_vectorize | *FAIL* 🔴 | +| correctness_gpu_object_lifetime_3 | *FAIL* 🔴 | +| correctness_histogram | *PASS* 🟢 | +| correctness_shared_self_references | *PASS* 🟢 | +| correctness_gpu_mixed_shared_mem_types | *FAIL* 🔴 | +| correctness_custom_cuda_context | *PASS* 🟢 | +| correctness_implicit_args_tests | *PASS* 🟢 | +| correctness_compile_to_lowered_stmt | *PASS* 🟢 | +| correctness_bounds_of_func | *PASS* 🟢 | +| correctness_interleave_rgb | *FAIL* 🔴 | +| correctness_multi_gpu_gpu_multi_device | *PASS* 🟢 | +| correctness_lambda | *PASS* 🟢 | +| correctness_interval | *PASS* 🟢 | +| correctness_unused_func | *PASS* 🟢 | +| correctness_fuzz_float_stores | *PASS* 🟢 | +| correctness_newtons_method | *FAIL* 🔴 | +| correctness_compile_to_bitcode | *PASS* 🟢 | +| correctness_lazy_convolution | *PASS* 🟢 | +| correctness_image_wrapper | *PASS* 🟢 | +| correctness_reduction_chain | *PASS* 🟢 | +| correctness_storage_folding | *PASS* 🟢 | +| correctness_reorder_storage | *PASS* 🟢 | +| correctness_bit_counting | *PASS* 🟢 | +| correctness_tiled_matmul | *PASS* 🟢 | +| correctness_async_device_copy | *FAIL* 🔴 | +| correctness_lots_of_dimensions | *PASS* 🟢 | +| correctness_interleave | *PASS* 🟢 | +| correctness_dynamic_reduction_bounds | *PASS* 🟢 | +| correctness_atomic_tuples | *PASS* 🟢 | +| correctness_named_updates | *PASS* 🟢 | +| correctness_unroll_dynamic_loop | *PASS* 🟢 | +| correctness_buffer_t | *PASS* 🟢 | +| correctness_hello_gpu | *PASS* 🟢 | +| correctness_gpu_object_lifetime_2 | *FAIL* 🔴 | +| correctness_update_chunk | *PASS* 🟢 | +| correctness_autodiff | *PASS* 🟢 | +| correctness_extern_consumer | *PASS* 🟢 | +| correctness_func_wrapper | *PASS* 🟢 | +| correctness_bounds_of_multiply | *PASS* 🟢 | +| correctness_gpu_store_in_register_with_no_lanes_loop | *FAIL* 🔴 | +| correctness_gpu_condition_lifting | *PASS* 🟢 | +| correctness_extern_consumer_tiled | *PASS* 🟢 | +| correctness_float16_t_neon_op_check | *PASS* 🟢 | +| correctness_split_by_non_factor | *PASS* 🟢 | +| correctness_parallel_fork | *PASS* 🟢 | +| correctness_hexagon_scatter | *PASS* 🟢 | +| correctness_partition_loops | *PASS* 🟢 | +| correctness_process_some_tiles | *PASS* 🟢 | +| correctness_parameter_constraints | *PASS* 🟢 | +| correctness_callable | *PASS* 🟢 | +| correctness_bounds_inference | *FAIL* 🔴 | +| correctness_indexing_access_undef | *PASS* 🟢 | +| correctness_partial_realization | *PASS* 🟢 | +| correctness_gpu_mixed_dimensionality | *FAIL* 🔴 | +| correctness_uninitialized_read | *PASS* 🟢 | +| correctness_unsafe_dedup_lets | *PASS* 🟢 | +| correctness_output_larger_than_two_gigs | *PASS* 🟢 | +| correctness_obscure_image_references | *PASS* 🟢 | +| correctness_chunk | 
*FAIL* 🔴 | +| correctness_vectorized_load_from_vectorized_allocation | *PASS* 🟢 | +| correctness_load_library | *PASS* 🟢 | +| correctness_compute_inside_guard | *PASS* 🟢 | +| correctness_multi_| *PASS* 🟢 |_reduction | *PASS* 🟢 | +| correctness_lerp | *PASS* 🟢 | +| correctness_realize_condition_depends_on_tuple | *PASS* 🟢 | +| correctness_vectorized_initialization | *PASS* 🟢 | +| correctness_loop_level_generator_param | *PASS* 🟢 | +| correctness_two_vector_args | *PASS* 🟢 | +| correctness_argmax | *FAIL* 🔴 | +| correctness_custom_auto_scheduler | *PASS* 🟢 | +| correctness_shadowed_bound | *PASS* 🟢 | +| correctness_inlined_generator | *PASS* 🟢 | +| correctness_math | *FAIL* 🔴 | +| correctness_gpu_different_blocks_threads_dimensions | *PASS* 🟢 | +| correctness_extern_stage_on_device | *FAIL* 🔴 | +| correctness_bound | *PASS* 🟢 | +| correctness_popc_clz_ctz_bounds | *PASS* 🟢 | +| correctness_bounds | *PASS* 🟢 | +| correctness_prefetch | *PASS* 🟢 | +| correctness_force_onto_stack | *PASS* 🟢 | +| correctness_input_image_bounds_check | *PASS* 🟢 | +| correctness_sort_exprs | *PASS* 🟢 | +| correctness_let_in_rdom_bound | *PASS* 🟢 | +| correctness_func_lifetime | *PASS* 🟢 | +| correctness_compute_outermost | *PASS* 🟢 | +| correctness_histogram_equalize | *PASS* 🟢 | +| correctness_func_clone | *PASS* 🟢 | +| correctness_tracing_stack | *PASS* 🟢 | +| correctness_simplify | *PASS* 🟢 | +| correctness_gameoflife | *PASS* 🟢 | +| correctness_thread_safety | *PASS* 🟢 | +| correctness_fuse_gpu_threads | *PASS* 🟢 | +| correctness_split_reuse_inner_name_bug | *PASS* 🟢 | +| correctness_gpu_jit_explicit_copy_to_device | *FAIL* 🔴 | +| correctness_tuple_select | *PASS* 🟢 | +| correctness_device_buffer_copy | *FAIL* 🔴 | +| correctness_pseudostack_shares_slots | *PASS* 🟢 | +| correctness_lots_of_loop_invariants | *PASS* 🟢 | +| correctness_fuzz_simplify | *PASS* 🟢 | +| correctness_div_round_to_zero | *PASS* 🟢 | +| correctness_rfactor | *PASS* 🟢 | +| correctness_custom_jit_context | *PASS* 🟢 | +| correctness_round | *PASS* 🟢 | +| correctness_device_slice | *FAIL* 🔴 | +| correctness_iterate_over_circle | *PASS* 🟢 | +| correctness_vector_print_bug | *PASS* 🟢 | +| correctness_mux | *PASS* 🟢 | +| correctness_vectorize_varying_allocation_size | *PASS* 🟢 | +| correctness_parallel_nested_1 | *PASS* 🟢 | +| correctness_compile_to_multitarget | *PASS* 🟢 | +| correctness_bounds_inference_outer_split | *PASS* 🟢 | +| correctness_leak_device_memory | *FAIL* 🔴 | +| correctness_reduction_schedule | *PASS* 🟢 | +| correctness_many_small_extern_stages | *PASS* 🟢 | +| correctness_parallel_alloc | *PASS* 🟢 | +| correctness_multiple_outputs | *FAIL* 🔴 | +| correctness_vectorize_nested | *PASS* 🟢 | +| correctness_bad_likely | *PASS* 🟢 | +| correctness_sliding_reduction | *PASS* 🟢 | +| correctness_bounds_of_split | *PASS* 🟢 | +| correctness_erf | *PASS* 🟢 | +| correctness_float16_t_image_type | *PASS* 🟢 | +| correctness_gpu_non_monotonic_shared_mem_size | *FAIL* 🔴 | +| correctness_extern_reorder_storage | *PASS* 🟢 | +| correctness_gather | *PASS* 🟢 | +| correctness_gpu_many_kernels | *PASS* 🟢 | +| correctness_early_out | *PASS* 🟢 | +| correctness_strict_float_bounds | *PASS* 🟢 | +| correctness_bounds_of_abs | *PASS* 🟢 | +| correctness_tuple_vector_reduce | *PASS* 🟢 | +| correctness_debug_to_file_reorder | *FAIL* 🔴 | +| correctness_vectorized_reduction_bug | *PASS* 🟢 | +| correctness_input_larger_than_two_gigs | *PASS* 🟢 | +| correctness_computed_index | *PASS* 🟢 | +| correctness_reduction_non_rectangular | *FAIL* 🔴 | +| 
correctness_left_shift_negative | *PASS* 🟢 | +| correctness_set_custom_trace | *PASS* 🟢 | +| correctness_vectorized_gpu_allocation | *FAIL* 🔴 | +| correctness_split_store_compute | *PASS* 🟢 | +| correctness_c_function | *PASS* 🟢 | +| correctness_specialize | *PASS* 🟢 | +| correctness_nested_shiftinwards | *PASS* 🟢 | +| correctness_assertion_failure_in_parallel_for | *PASS* 🟢 | +| correctness_plain_c_includes | *PASS* 🟢 | +| correctness_stream_compaction | *PASS* 🟢 | +| correctness_async | *PASS* 🟢 | +| correctness_atomics | *PASS* 🟢 | +| correctness_multi| *PASS* 🟢 |_constraints | *PASS* 🟢 | +| correctness_target | *PASS* 🟢 | +| correctness_tuple_reduction | *FAIL* 🔴 | +| correctness_dilate3x3 | *FAIL* 🔴 | +| correctness_image_io | *PASS* 🟢 | +| correctness_gpu_param_allocation | *FAIL* 🔴 | +| correctness_reschedule | *PASS* 🟢 | +| correctness_isnan | *FAIL* 🔴 | +| correctness_halide_buffer | *PASS* 🟢 | +| correctness_bounds_of_cast | *PASS* 🟢 | +| correctness_handle | *PASS* 🟢 | +| correctness_param | *PASS* 🟢 | +| correctness_saturating_casts | *PASS* 🟢 | +| correctness_extern_producer | *FAIL* 🔴 | +| correctness_shift_by_unsigned_negated | *PASS* 🟢 | +| correctness_circular_reference_leak | *PASS* 🟢 | +| correctness_specialize_to_gpu | *FAIL* 🔴 | +| correctness_device_copy_at_inner_loop | *FAIL* 🔴 | +| correctness_fit_function | *PASS* 🟢 | +| correctness_compute_at_reordered_update_stage | *PASS* 🟢 | +| correctness_non_nesting_extern_bounds_query | *PASS* 🟢 | +| correctness_bitwise_ops | *PASS* 🟢 | +| correctness_gpu_data_flows | *FAIL* 🔴 | +| correctness_cast | *PASS* 🟢 | +| correctness_stack_allocations | *PASS* 🟢 | +| correctness_sliding_backwards | *PASS* 🟢 | +| correctness_float16_t | *PASS* 🟢 | +| correctness_simd_op_check | *PASS* 🟢 | +| correctness_typed_func | *PASS* 🟢 | +| correctness_tuple_partial_update | *PASS* 🟢 | +| correctness_heap_cleanup | *PASS* 🟢 | +| correctness_implicit_args | *PASS* 🟢 | +| correctness_deferred_loop_level | *PASS* 🟢 | +| correctness_interleave_x | *PASS* 🟢 | +| correctness_fuzz_bounds | *PASS* 🟢 | +| correctness_strided_load | *PASS* 🟢 | +| correctness_bound_storage | *PASS* 🟢 | +| correctness_gpu_cpu_simultaneous_read | *FAIL* 🔴 | +| correctness_fast_trigonometric | *PASS* 🟢 | +| correctness_compute_with | *FAIL* 🔴 | +| correctness_gpu_allocation_cache | *FAIL* 🔴 | +| correctness_compile_to | *PASS* 🟢 | +| correctness_extern_output_expansion | *PASS* 🟢 | +| correctness_gpu_texture | *PASS* 🟢 | +| correctness_many_updates | *PASS* 🟢 | +| correctness_memoize | *PASS* 🟢 | +| correctness_gpu_multi_kernel | *FAIL* 🔴 | +| correctness_extern_error | *PASS* 🟢 | +| correctness_partition_max_filter | *PASS* 🟢 | +| correctness_bound_small_allocations | *PASS* 🟢 | +| correctness_median3x3 | *FAIL* 🔴 | +| correctness_reuse_stack_alloc | *PASS* 🟢 | +| correctness_debug_to_file | *FAIL* 🔴 | +| correctness_embed_bitcode | *PASS* 🟢 | +| correctness_gpu_large_alloc | *FAIL* 🔴 | +| correctness_pytorch | *PASS* 🟢 | +| correctness_in_place | *FAIL* 🔴 | +| correctness_exception | *PASS* 🟢 | +| correctness_python_extension_gen | *PASS* 🟢 | +| correctness_cross_compilation | *PASS* 🟢 | +| correctness_extern_bounds_inference | *PASS* 🟢 | +| correctness_bounds_of_monotonic_math | *PASS* 🟢 | +| correctness_loop_invariant_extern_calls | *PASS* 🟢 | +| correctness_skip_stages_external_array_functions | *PASS* 🟢 | +| correctness_chunk_sharing | *PASS* 🟢 | +| correctness_multi_way_select | *FAIL* 🔴 | +| correctness_async_copy_chain | *FAIL* 🔴 | +| 
correctness_gpu_give_input_buffers_device_allocations | *FAIL* 🔴 | +| correctness_oddly_sized_output | *PASS* 🟢 | +| correctness_fuzz_cse | *PASS* 🟢 | +| correctness_half_native_interleave | *PASS* 🟢 | +| correctness_introspection | *PASS* 🟢 | +| correctness_callable_generator | *PASS* 🟢 | +| correctness_fused_where_inner_extent_is_zero | *PASS* 🟢 | +| correctness_tuple_update_ops | *PASS* 🟢 | +| correctness_constraints | *PASS* 🟢 | +| correctness_multiple_scatter | *PASS* 🟢 | +| correctness_unrolled_reduction | *PASS* 🟢 | +| correctness_tracing | *PASS* 🟢 | +| correctness_simplified_away_embedded_image | *PASS* 🟢 | +| correctness_mul_div_mod | *FAIL* 🔴 | +| correctness_infer_arguments | *PASS* 🟢 | +| correctness_convolution | *FAIL* 🔴 | +| correctness_truncated_pyramid | *PASS* 🟢 | +| correctness_for_each_element | *PASS* 🟢 | +| correctness_store_in | *PASS* 🟢 | +| correctness_transitive_bounds | *PASS* 🟢 | +| correctness_vectorize_guard_with_if | *PASS* 🟢 | +| correctness_widening_lerp | *PASS* 🟢 | +| correctness_cast_handle | *PASS* 🟢 | +| correctness_tuple_undef | *PASS* 🟢 | +| correctness_partial_application | *PASS* 🟢 | +| correctness_vectorize_mixed_widths | *PASS* 🟢 | +| correctness_print | *PASS* 🟢 | +| correctness_fibonacci | *PASS* 🟢 | +| correctness_parallel_nested | *PASS* 🟢 | +| correctness_sliding_window | *PASS* 🟢 | +| correctness_integer_powers | *PASS* 🟢 | +| correctness_unique_func_image | *PASS* 🟢 | +| correctness_constant_type | *PASS* 🟢 | +| correctness_shifted_image | *PASS* 🟢 | +| correctness_vector_extern | *PASS* 🟢 | +| correctness_compute_with_inlined | *PASS* 🟢 | +| correctness_param_map | *PASS* 🟢 | +| correctness_float16_t_constants | *PASS* 🟢 | +| correctness_callable_typed | *PASS* 🟢 | +| correctness_unroll_huge_mux | *PASS* 🟢 | +| correctness_parallel | *PASS* 🟢 | +| correctness_code_explosion | *PASS* 🟢 | +| correctness_gpu_dynamic_shared | *FAIL* 🔴 | +| correctness_div_by_zero | *PASS* 🟢 | +| correctness_convolution_multiple_kernels | *FAIL* 🔴 | +| correctness_deinterleave4 | *PASS* 🟢 | +| correctness_align_bounds | *PASS* 🟢 | +| correctness_gpu_bounds_inference_failure | *PASS* 🟢 | +| correctness_interpreter | *FAIL* 🔴 | +| correctness_parallel_gpu_nested | *PASS* 🟢 | +| correctness_gpu_thread_barrier | *FAIL* 🔴 | +| correctness_debug_to_file_multiple_outputs | *PASS* 🟢 | +| correctness_gpu_free_sync | *PASS* 🟢 | +| correctness_out_constraint | *PASS* 🟢 | +| correctness_gpu_specialize | *PASS* 🟢| +| correctness_register_shuffle | *PASS* 🟢 | +| correctness_constant_expr | *PASS* 🟢 | +| correctness_out_of_memory | *PASS* 🟢 | +| correctness_gpu_non_contiguous_copy | *PASS* 🟢 | +| correctness_sliding_over_guard_with_if | *PASS* 🟢 | +| correctness_vector_math | *PASS* 🟢 | +| correctness_require | *PASS* 🟢 | +| correctness_callable_errors | *PASS* 🟢 | + From 68cccce266646dd1fff92ef399e94e260cc0f31e Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 30 Sep 2022 12:46:45 -0700 Subject: [PATCH 031/166] Clang tidy/format pass --- src/runtime/internal/memory_resources.h | 2 +- src/runtime/internal/region_allocator.h | 4 +++- src/runtime/vulkan_memory.h | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index ad72ec5dc689..aff265977d40 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -55,7 +55,7 @@ struct MemoryProperties { MemoryVisibility visibility = 
MemoryVisibility::InvalidVisibility; MemoryUsage usage = MemoryUsage::InvalidUsage; MemoryCaching caching = MemoryCaching::InvalidCaching; - size_t alignment = 0; //< required alignment of allocations (zero for no constraint) + size_t alignment = 0; //< required alignment of allocations (zero for no constraint) }; // Client-facing struct for exchanging memory block allocation requests diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 145a7093bacb..d0ad2c3a24f8 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -273,10 +273,12 @@ bool RegionAllocator::can_split(BlockRegion *block_region, size_t size) { BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { alignment = conform_alignment(alignment, block->memory.properties.alignment); - debug(nullptr) << "RegionAllocator: Conforming alignment (" +#ifdef DEBUG_INTERNAL + debug(user_context) << "RegionAllocator: Conforming alignment (" << "requested=" << (uint32_t)alignment << " " << "required=" << (uint32_t)block->memory.properties.alignment << " " << "actual=" << (uint32_t)alignment << ")\n"; +#endif size_t adjusted_size = aligned_size(block_region->memory.offset, size, alignment); size_t adjusted_offset = aligned_offset(block_region->memory.offset + size, alignment); diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index dc5b15ffeee6..18b50a0e8f17 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -337,7 +337,7 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc } uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties); - + VkBufferCreateInfo create_info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type nullptr, // struct extending this @@ -706,22 +706,22 @@ VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, VulkanMemoryConfig config = memory_allocator_config; - const char* min_block_size_env = getenv("HL_VK_MIN_BLOCK_SIZE"); - const char* max_block_size_env = getenv("HL_VK_MAX_BLOCK_SIZE"); - const char* max_block_count_env = getenv("HL_VK_MAX_BLOCK_COUNT"); + const char *min_block_size_env = getenv("HL_VK_MIN_BLOCK_SIZE"); + const char *max_block_size_env = getenv("HL_VK_MAX_BLOCK_SIZE"); + const char *max_block_count_env = getenv("HL_VK_MAX_BLOCK_COUNT"); - if(!StringUtils::is_empty(min_block_size_env)) { - config.minimum_block_size = atoi(min_block_size_env) * 1024 * 1024; + if (!StringUtils::is_empty(min_block_size_env)) { + config.minimum_block_size = atoi(min_block_size_env) * 1024 * 1024; debug(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.minimum_block_size << " for minimum block size (in bytes)\n"; } - if(!StringUtils::is_empty(max_block_size_env)) { - config.maximum_block_size = atoi(max_block_size_env) * 1024 * 1024; + if (!StringUtils::is_empty(max_block_size_env)) { + config.maximum_block_size = atoi(max_block_size_env) * 1024 * 1024; debug(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_size << " for maximum block size (in bytes)\n"; } - if(!StringUtils::is_empty(max_block_count_env)) { - config.maximum_block_count = atoi(max_block_count_env) ; + if (!StringUtils::is_empty(max_block_count_env)) { + config.maximum_block_count = atoi(max_block_count_env); debug(user_context) << "Vulkan: Configuring allocator with " << 
(uint32_t)config.maximum_block_count << " for maximum block count\n"; } From 5d783534f03dd0abd098fba06d827f7a9f244039 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 30 Sep 2022 14:19:50 -0700 Subject: [PATCH 032/166] Fix conform_alignment to handle zero values --- src/runtime/internal/memory_resources.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index aff265977d40..f06f850cc28b 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -116,7 +116,7 @@ ALWAYS_INLINE size_t aligned_offset(size_t offset, size_t alignment) { // integer multiple of the required alignment ALWAYS_INLINE size_t conform_alignment(size_t requested, size_t required) { size_t alignment = max(requested, required); - return (alignment > required) ? (required * ((alignment / required) + 1)) : alignment; + return ((required > 0) && (alignment > required)) ? (required * ((alignment / required) + 1)) : alignment; } // Returns a padded size to accomodate an adjusted offset due to alignment constraints From c27bcef6d297ebf386ce359ccd37a87f3c99598a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 30 Sep 2022 14:48:58 -0700 Subject: [PATCH 033/166] Fix declaration of custom_allocation_callbacks to be static. Change to constexpr for invalid values --- src/runtime/vulkan_memory.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 18b50a0e8f17..7e458d84fba8 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -13,7 +13,7 @@ namespace Vulkan { // Enable external client to override Vulkan allocation callbacks (if they so desire) WEAK ScopedSpinLock::AtomicFlag custom_allocation_callbacks_lock = 0; -static const VkAllocationCallbacks *custom_allocation_callbacks = nullptr; // nullptr => use Vulkan runtime implementation +WEAK const VkAllocationCallbacks *custom_allocation_callbacks = nullptr; // nullptr => use Vulkan runtime implementation // -------------------------------------------------------------------------- @@ -86,8 +86,8 @@ class VulkanMemoryAllocator { size_t regions_allocated() const; private: - static const uint32_t invalid_usage_flags = uint32_t(-1); - static const uint32_t invalid_memory_type = uint32_t(VK_MAX_MEMORY_TYPES); + static constexpr uint32_t invalid_usage_flags = uint32_t(-1); + static constexpr uint32_t invalid_memory_type = uint32_t(VK_MAX_MEMORY_TYPES); // Initializes a new instance void initialize(void *user_context, const VulkanMemoryConfig &config, From d2225c8160283633658f4d014e28e9d6f2285662 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 3 Oct 2022 14:29:43 -0700 Subject: [PATCH 034/166] Whitespace change to trigger build. --- README_vulkan.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README_vulkan.md b/README_vulkan.md index a2937cc2f175..b0bd8dec39f8 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -22,7 +22,6 @@ For example, on Linux & OSX: % cmake --build build ``` - # Vulkan Runtime Environment: Halide has no direct dependency on Vulkan for code-generation, but the runtime From b7155976be9f179f032bf59092bb2e8b5cd3182e Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 4 Oct 2022 12:42:09 -0700 Subject: [PATCH 035/166] Handle Vulkan kernels that don't require storage buffers. Updated test status. Fixes 7 test cases. 
--- README_vulkan.md | 33 ++++++------ src/runtime/vulkan.cpp | 34 +++++++----- src/runtime/vulkan_internal.h | 6 +-- src/runtime/vulkan_resources.h | 98 ++++++++++++++++++---------------- 4 files changed, 91 insertions(+), 80 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index b0bd8dec39f8..3fd4eb3a44d4 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -191,13 +191,13 @@ https://github.com/KhronosGroup/SPIRV-Tools # Current Status -The following table outlines the state of the correctness tests (as of Sep-30) when +The following table outlines the state of the correctness tests (as of Oct-04) when run with `HL_JIT_TARGET=host-vulkan` (NOTE: some tests may need additional modifications to run under Vulkan): | Totals | *PASS* 🟢 | *FAIL* 🔴 | | -- | -- | -- | -| | 300 | 65 | +| | 307 | 58 | | Test Name | Status | @@ -211,7 +211,7 @@ modifications to run under Vulkan): | correctness_vector_bounds_inference | *PASS* 🟢 | | correctness_inline_reduction | *PASS* 🟢 | | correctness_unsafe_promises | *PASS* 🟢 | -| correctness_reorder_rvars | *FAIL* 🔴 | +| correctness_reorder_rvars | *PASS* 🟢 | | correctness_lossless_cast | *PASS* 🟢 | | correctness_gpu_reuse_shared_memory | *FAIL* 🔴 | | correctness_boundary_conditions | *FAIL* 🔴 | @@ -237,12 +237,12 @@ modifications to run under Vulkan): | correctness_bool_compute_root_vectorize | *FAIL* 🔴 | | correctness_extract_concat_bits | *PASS* 🟢 | | correctness_dead_realization_in_specialization | *PASS* 🟢 | -| correctness_undef | *FAIL* 🔴 | +| correctness_undef | *PASS* 🟢 | | correctness_growing_stack | *PASS* 🟢 | | correctness_parallel_scatter | *PASS* 🟢 | | correctness_multi_splits_with_diff_tail_strategies | *PASS* 🟢 | | correctness_gpu_arg_types | *PASS* 🟢 | -| correctness_cascaded_filters | *FAIL* 🔴 | +| correctness_cascaded_filters | *PASS* 🟢 | | correctness_trim_no_ops | *FAIL* 🔴 | | correctness_float16_t_comparison | *PASS* 🟢 | | correctness_legal_race_condition | *PASS* 🟢 | @@ -253,10 +253,10 @@ modifications to run under Vulkan): | correctness_strict_float | *PASS* 🟢 | | correctness_bounds_query | *PASS* 🟢 | | correctness_vector_reductions | *PASS* 🟢 | -| correctness_custom_lowering_| *PASS* 🟢 | | *PASS* 🟢 | +| correctness_custom_lowering_pass | *PASS* 🟢 | | correctness_gpu_assertion_in_kernel | *PASS* 🟢 | | correctness_low_bit_depth_noise | *PASS* 🟢 | -| correctness_fuse | *FAIL* 🔴 | +| correctness_fuse | *PASS* 🟢 | | correctness_vector_cast | *FAIL* 🔴 | | correctness_concat | *PASS* 🟢 | | correctness_mod | *PASS* 🟢 | @@ -268,14 +268,14 @@ modifications to run under Vulkan): | correctness_compute_at_split_rvar | *PASS* 🟢 | | correctness_split_fuse_rvar | *PASS* 🟢 | | correctness_memoize_cloned | *PASS* 🟢 | -| correctness_| *FAIL* 🔴 | _unroll | *PASS* 🟢 | +| correctness_failed_unroll | *PASS* 🟢 | | correctness_gpu_vectorized_shared_memory | *PASS* 🟢 | | correctness_bounds_inference_complex | *PASS* 🟢 | | correctness_widening_reduction | *FAIL* 🔴 | | correctness_extern_partial | *PASS* 🟢 | | correctness_multi_output_pipeline_with_bad_sizes | *PASS* 🟢 | | correctness_hoist_loop_invariant_if_statements | *PASS* 🟢 | -| correctness_extern_sort | *FAIL* 🔴 | +| correctness_extern_sort | *PASS* 🟢 | | correctness_multiple_outputs_extern | *PASS* 🟢 | | correctness_tracing_bounds | *PASS* 🟢 | | correctness_gpu_object_lifetime_1 | *PASS* 🟢 | @@ -352,7 +352,7 @@ modifications to run under Vulkan): | correctness_vectorized_load_from_vectorized_allocation | *PASS* 🟢 | | correctness_load_library | *PASS* 🟢 | | correctness_compute_inside_guard | 
*PASS* 🟢 | -| correctness_multi_| *PASS* 🟢 |_reduction | *PASS* 🟢 | +| correctness_multi_pass_reduction | *PASS* 🟢 | | correctness_lerp | *PASS* 🟢 | | correctness_realize_condition_depends_on_tuple | *PASS* 🟢 | | correctness_vectorized_initialization | *PASS* 🟢 | @@ -392,7 +392,7 @@ modifications to run under Vulkan): | correctness_div_round_to_zero | *PASS* 🟢 | | correctness_rfactor | *PASS* 🟢 | | correctness_custom_jit_context | *PASS* 🟢 | -| correctness_round | *PASS* 🟢 | +| correctness_round | *FAIL* 🔴 | | correctness_device_slice | *FAIL* 🔴 | | correctness_iterate_over_circle | *PASS* 🟢 | | correctness_vector_print_bug | *PASS* 🟢 | @@ -437,7 +437,7 @@ modifications to run under Vulkan): | correctness_stream_compaction | *PASS* 🟢 | | correctness_async | *PASS* 🟢 | | correctness_atomics | *PASS* 🟢 | -| correctness_multi| *PASS* 🟢 |_constraints | *PASS* 🟢 | +| correctness_multipass_constraints | *PASS* 🟢 | | correctness_target | *PASS* 🟢 | | correctness_tuple_reduction | *FAIL* 🔴 | | correctness_dilate3x3 | *FAIL* 🔴 | @@ -450,7 +450,7 @@ modifications to run under Vulkan): | correctness_handle | *PASS* 🟢 | | correctness_param | *PASS* 🟢 | | correctness_saturating_casts | *PASS* 🟢 | -| correctness_extern_producer | *FAIL* 🔴 | +| correctness_extern_producer | *PASS* 🟢 | | correctness_shift_by_unsigned_negated | *PASS* 🟢 | | correctness_circular_reference_leak | *PASS* 🟢 | | correctness_specialize_to_gpu | *FAIL* 🔴 | @@ -493,7 +493,7 @@ modifications to run under Vulkan): | correctness_embed_bitcode | *PASS* 🟢 | | correctness_gpu_large_alloc | *FAIL* 🔴 | | correctness_pytorch | *PASS* 🟢 | -| correctness_in_place | *FAIL* 🔴 | +| correctness_in_place | *PASS* 🟢 | | correctness_exception | *PASS* 🟢 | | correctness_python_extension_gen | *PASS* 🟢 | | correctness_cross_compilation | *PASS* 🟢 | @@ -502,7 +502,7 @@ modifications to run under Vulkan): | correctness_loop_invariant_extern_calls | *PASS* 🟢 | | correctness_skip_stages_external_array_functions | *PASS* 🟢 | | correctness_chunk_sharing | *PASS* 🟢 | -| correctness_multi_way_select | *FAIL* 🔴 | +| correctness_multi_way_select | *PASS* 🟢 | | correctness_async_copy_chain | *FAIL* 🔴 | | correctness_gpu_give_input_buffers_device_allocations | *FAIL* 🔴 | | correctness_oddly_sized_output | *PASS* 🟢 | @@ -558,7 +558,7 @@ modifications to run under Vulkan): | correctness_debug_to_file_multiple_outputs | *PASS* 🟢 | | correctness_gpu_free_sync | *PASS* 🟢 | | correctness_out_constraint | *PASS* 🟢 | -| correctness_gpu_specialize | *PASS* 🟢| +| correctness_gpu_specialize | *FAIL* 🔴 | | correctness_register_shuffle | *PASS* 🟢 | | correctness_constant_expr | *PASS* 🟢 | | correctness_out_of_memory | *PASS* 🟢 | @@ -567,4 +567,3 @@ modifications to run under Vulkan): | correctness_vector_math | *PASS* 🟢 | | correctness_require | *PASS* 🟢 | | correctness_callable_errors | *PASS* 🟢 | - diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index e448432d4708..5b3083f31103 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -642,10 +642,13 @@ WEAK int halide_vulkan_run(void *user_context, //// 1a. Create a buffer for the scalar parameters if (cache_entry->args_region == nullptr) { - cache_entry->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, arg_sizes, args, arg_is_buffer); - if (cache_entry->args_region == nullptr) { - error(user_context) << "Vulkan: vk_create_scalar_uniform_buffer() failed! 
Unable to create shader module!\n"; - return halide_error_code_internal_error; + size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, arg_sizes, args, arg_is_buffer); + if (scalar_buffer_size > 0) { + cache_entry->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); + if (cache_entry->args_region == nullptr) { + error(user_context) << "Vulkan: vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; + return halide_error_code_internal_error; + } } } @@ -689,19 +692,22 @@ WEAK int halide_vulkan_run(void *user_context, } //// 5. Update uniform args and bindings for buffers in the descriptor set - VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, cache_entry->args_region, arg_sizes, args, arg_is_buffer); - if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return result; - } + VkBuffer *args_buffer = nullptr; + if (cache_entry->args_region != nullptr) { + VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, cache_entry->args_region, arg_sizes, args, arg_is_buffer); + if (result != VK_SUCCESS) { + debug(user_context) << "Vulkan: vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + return result; + } - VkBuffer *args_buffer = reinterpret_cast(cache_entry->args_region->handle); - if (args_buffer == nullptr) { - error(user_context) << "Vulkan: Failed to retrieve scalar args buffer for device memory!\n"; - return halide_error_code_internal_error; + args_buffer = reinterpret_cast(cache_entry->args_region->handle); + if (args_buffer == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve scalar args buffer for device memory!\n"; + return halide_error_code_internal_error; + } } - result = vk_update_descriptor_set(user_context, ctx.allocator, *args_buffer, cache_entry->buffer_count, arg_sizes, args, arg_is_buffer, cache_entry->descriptor_set); + VkResult result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, cache_entry->buffer_count, arg_sizes, args, arg_is_buffer, cache_entry->descriptor_set); if (result != VK_SUCCESS) { debug(user_context) << "Vulkan: vk_update_descriptor_set() failed! Unable to proceed! 
Error: " << vk_get_error_name(result) << "\n"; return result; diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 3befa4dbf797..8cb122f918fc 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -104,9 +104,7 @@ size_t vk_estimate_scalar_uniform_buffer_size(void *user_context, MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[]); + size_t scalar_buffer_size); VkResult vk_update_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, @@ -153,7 +151,7 @@ VkResult vk_create_descriptor_set(void *user_context, VkResult vk_update_descriptor_set(void *user_context, VulkanMemoryAllocator *allocator, - VkBuffer scalar_args_buffer, + VkBuffer *scalar_args_buffer, size_t storage_buffer_count, size_t arg_sizes[], void *args[], diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 7cdaeb111dfa..6e35101de5d5 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -189,25 +189,35 @@ VkResult vk_create_descriptor_pool(void *user_context, << "device: " << (void *)allocator->current_device() << ", " << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; #endif - VkDescriptorPoolSize descriptor_pool_sizes[2] = { - { - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type - 1 // all kernel args are packed into one uniform buffer - }, - { + + BlockStorage::Config pool_config; + pool_config.entry_size = sizeof(VkDescriptorPoolSize); + pool_config.minimum_capacity = 1 + (storage_buffer_count ? 1 : 0); + BlockStorage pool_sizes(user_context, pool_config); + + // First binding is reserved for passing scalar parameters as a uniform buffer + VkDescriptorPoolSize uniform_buffer_size = { + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type + 1 // all kernel args are packed into one uniform buffer + }; + pool_sizes.append(user_context, &uniform_buffer_size); + + if (storage_buffer_count > 0) { + VkDescriptorPoolSize storage_buffer_size = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // descriptor type storage_buffer_count // all halide buffers are passed as storage buffers - }}; - - VkDescriptorPoolCreateInfo descriptor_pool_info = - { - VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, // struct type - nullptr, // point to struct extending this - 0, // flags - 1, // this pool will only be used for creating one descriptor set! - 2, // pool size count - descriptor_pool_sizes // ptr to descriptr pool sizes }; + pool_sizes.append(user_context, &storage_buffer_size); + } + + VkDescriptorPoolCreateInfo descriptor_pool_info = { + VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, // struct type + nullptr, // point to struct extending this + 0, // flags + 1, // this pool will only be used for creating one descriptor set! 
+ (uint32_t)pool_sizes.size(), // pool size count + (const VkDescriptorPoolSize *)pool_sizes.data() // ptr to descriptr pool sizes + }; VkResult result = vkCreateDescriptorPool(allocator->current_device(), &descriptor_pool_info, allocator->callbacks(), descriptor_pool); if (result != VK_SUCCESS) { @@ -347,7 +357,7 @@ VkResult vk_create_descriptor_set(void *user_context, VkResult vk_update_descriptor_set(void *user_context, VulkanMemoryAllocator *allocator, - VkBuffer scalar_args_buffer, + VkBuffer *scalar_args_buffer, size_t storage_buffer_count, size_t arg_sizes[], void *args[], @@ -374,28 +384,31 @@ VkResult vk_update_descriptor_set(void *user_context, wds_config.entry_size = sizeof(VkWriteDescriptorSet); BlockStorage write_descriptor_set(user_context, wds_config); - // First binding will be the scalar params buffer - VkDescriptorBufferInfo scalar_args_descriptor_buffer_info = { - scalar_args_buffer, // the buffer - 0, // offset - VK_WHOLE_SIZE // range - }; - descriptor_buffer_info.append(user_context, &scalar_args_descriptor_buffer_info); - VkDescriptorBufferInfo *scalar_args_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); - - VkWriteDescriptorSet scalar_args_write_descriptor_set = { - VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type - nullptr, // pointer to struct extending this - descriptor_set, // descriptor set to update - 0, // binding slot - 0, // array elem - 1, // num to update - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type - nullptr, // for images - scalar_args_entry, // info for buffer - nullptr // for texel buffers - }; - write_descriptor_set.append(user_context, &scalar_args_write_descriptor_set); + // First binding will be the scalar args buffer (if needed) + VkDescriptorBufferInfo *scalar_args_entry = nullptr; + if (scalar_args_buffer != nullptr) { + VkDescriptorBufferInfo scalar_args_descriptor_buffer_info = { + *scalar_args_buffer, // the buffer + 0, // offset + VK_WHOLE_SIZE // range + }; + descriptor_buffer_info.append(user_context, &scalar_args_descriptor_buffer_info); + scalar_args_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); + + VkWriteDescriptorSet scalar_args_write_descriptor_set = { + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type + nullptr, // pointer to struct extending this + descriptor_set, // descriptor set to update + 0, // binding slot + 0, // array elem + 1, // num to update + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type + nullptr, // for images + scalar_args_entry, // info for buffer + nullptr // for texel buffers + }; + write_descriptor_set.append(user_context, &scalar_args_write_descriptor_set); + } // Add all the other device buffers for (size_t i = 0; arg_sizes[i] > 0; i++) { @@ -459,12 +472,7 @@ size_t vk_estimate_scalar_uniform_buffer_size(void *user_context, MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[]) { - - size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, - arg_sizes, args, arg_is_buffer); + size_t scalar_buffer_size) { #ifdef DEBUG_RUNTIME debug(user_context) From 18cc5871a0b20a4dc4ddfbba75cc6e363a5cacfc Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 6 Oct 2022 10:49:50 -0700 Subject: [PATCH 036/166] Add src/mini_vulkan.h Apache 2.0 license requirements to License file --- LICENSE.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/LICENSE.txt b/LICENSE.txt index 13146db88f3b..9faf448676b1 100644 
--- a/LICENSE.txt +++ b/LICENSE.txt @@ -195,6 +195,23 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + +---- + +src/mini_vulkan.h is Copyright (c) 2014-2017 The Khronos Group Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + ---- apps/linear_algebra/include/cblas.h is licensed under the BLAS license. From 7546625943a8d4e5d4e610bada9d80244e518c70 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 10 Oct 2022 20:30:53 -0700 Subject: [PATCH 037/166] Add descriptor set binding info as pre-amble to SPIR-V code module Fix shared memory allocation to use global variables in workgroup storage space Add extern calls for spirv and glsl builtins Add memory fence call to gpu thread barrier Add missing visitors to Vulkan CodeGen Add scalar index & vector index methods for load/store --- .../spirv/include/spirv/1.0/GLSL.std.450.h | 131 +++ src/CodeGen_Vulkan_Dev.cpp | 894 ++++++++++++++---- src/SpirvIR.cpp | 334 ++++++- src/SpirvIR.h | 60 +- src/runtime/mini_vulkan.h | 9 + src/runtime/vulkan.cpp | 97 +- src/runtime/vulkan_context.h | 38 +- src/runtime/vulkan_internal.h | 22 +- src/runtime/vulkan_resources.h | 319 +++++-- 9 files changed, 1588 insertions(+), 316 deletions(-) create mode 100644 dependencies/spirv/include/spirv/1.0/GLSL.std.450.h diff --git a/dependencies/spirv/include/spirv/1.0/GLSL.std.450.h b/dependencies/spirv/include/spirv/1.0/GLSL.std.450.h new file mode 100644 index 000000000000..54cc00e9a888 --- /dev/null +++ b/dependencies/spirv/include/spirv/1.0/GLSL.std.450.h @@ -0,0 +1,131 @@ +/* +** Copyright (c) 2014-2016 The Khronos Group Inc. +** +** Permission is hereby granted, free of charge, to any person obtaining a copy +** of this software and/or associated documentation files (the "Materials"), +** to deal in the Materials without restriction, including without limitation +** the rights to use, copy, modify, merge, publish, distribute, sublicense, +** and/or sell copies of the Materials, and to permit persons to whom the +** Materials are furnished to do so, subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in +** all copies or substantial portions of the Materials. +** +** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS +** STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND +** HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +** THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +** FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS +** IN THE MATERIALS. 
+*/ + +#ifndef GLSLstd450_H +#define GLSLstd450_H + +static const int GLSLstd450Version = 100; +static const int GLSLstd450Revision = 3; + +enum GLSLstd450 { + GLSLstd450Bad = 0, // Don't use + + GLSLstd450Round = 1, + GLSLstd450RoundEven = 2, + GLSLstd450Trunc = 3, + GLSLstd450FAbs = 4, + GLSLstd450SAbs = 5, + GLSLstd450FSign = 6, + GLSLstd450SSign = 7, + GLSLstd450Floor = 8, + GLSLstd450Ceil = 9, + GLSLstd450Fract = 10, + + GLSLstd450Radians = 11, + GLSLstd450Degrees = 12, + GLSLstd450Sin = 13, + GLSLstd450Cos = 14, + GLSLstd450Tan = 15, + GLSLstd450Asin = 16, + GLSLstd450Acos = 17, + GLSLstd450Atan = 18, + GLSLstd450Sinh = 19, + GLSLstd450Cosh = 20, + GLSLstd450Tanh = 21, + GLSLstd450Asinh = 22, + GLSLstd450Acosh = 23, + GLSLstd450Atanh = 24, + GLSLstd450Atan2 = 25, + + GLSLstd450Pow = 26, + GLSLstd450Exp = 27, + GLSLstd450Log = 28, + GLSLstd450Exp2 = 29, + GLSLstd450Log2 = 30, + GLSLstd450Sqrt = 31, + GLSLstd450InverseSqrt = 32, + + GLSLstd450Determinant = 33, + GLSLstd450MatrixInverse = 34, + + GLSLstd450Modf = 35, // second operand needs an OpVariable to write to + GLSLstd450ModfStruct = 36, // no OpVariable operand + GLSLstd450FMin = 37, + GLSLstd450UMin = 38, + GLSLstd450SMin = 39, + GLSLstd450FMax = 40, + GLSLstd450UMax = 41, + GLSLstd450SMax = 42, + GLSLstd450FClamp = 43, + GLSLstd450UClamp = 44, + GLSLstd450SClamp = 45, + GLSLstd450FMix = 46, + GLSLstd450IMix = 47, // Reserved + GLSLstd450Step = 48, + GLSLstd450SmoothStep = 49, + + GLSLstd450Fma = 50, + GLSLstd450Frexp = 51, // second operand needs an OpVariable to write to + GLSLstd450FrexpStruct = 52, // no OpVariable operand + GLSLstd450Ldexp = 53, + + GLSLstd450PackSnorm4x8 = 54, + GLSLstd450PackUnorm4x8 = 55, + GLSLstd450PackSnorm2x16 = 56, + GLSLstd450PackUnorm2x16 = 57, + GLSLstd450PackHalf2x16 = 58, + GLSLstd450PackDouble2x32 = 59, + GLSLstd450UnpackSnorm2x16 = 60, + GLSLstd450UnpackUnorm2x16 = 61, + GLSLstd450UnpackHalf2x16 = 62, + GLSLstd450UnpackSnorm4x8 = 63, + GLSLstd450UnpackUnorm4x8 = 64, + GLSLstd450UnpackDouble2x32 = 65, + + GLSLstd450Length = 66, + GLSLstd450Distance = 67, + GLSLstd450Cross = 68, + GLSLstd450Normalize = 69, + GLSLstd450FaceForward = 70, + GLSLstd450Reflect = 71, + GLSLstd450Refract = 72, + + GLSLstd450FindILsb = 73, + GLSLstd450FindSMsb = 74, + GLSLstd450FindUMsb = 75, + + GLSLstd450InterpolateAtCentroid = 76, + GLSLstd450InterpolateAtSample = 77, + GLSLstd450InterpolateAtOffset = 78, + + GLSLstd450NMin = 79, + GLSLstd450NMax = 80, + GLSLstd450NClamp = 81, + + GLSLstd450Count +}; + +#endif // #ifndef GLSLstd450_H diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 187e4e6ef118..37e3de9fcdf8 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -6,6 +6,7 @@ #include "CodeGen_Vulkan_Dev.h" #include "Debug.h" #include "Deinterleave.h" +#include "FindIntrinsics.h" #include "IROperator.h" #include "IRPrinter.h" #include "Scope.h" @@ -68,19 +69,20 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { using IRVisitor::visit; - void visit(const Variable *) override; void visit(const IntImm *) override; void visit(const UIntImm *) override; - void visit(const StringImm *) override; void visit(const FloatImm *) override; + void visit(const StringImm *) override; void visit(const Cast *) override; + void visit(const Reinterpret *) override; + void visit(const Variable *) override; void visit(const Add *) override; void visit(const Sub *) override; void visit(const Mul *) override; void visit(const Div *) override; void visit(const Mod *) override; - void 
visit(const Max *) override; void visit(const Min *) override; + void visit(const Max *) override; void visit(const EQ *) override; void visit(const NE *) override; void visit(const LT *) override; @@ -90,17 +92,17 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const And *) override; void visit(const Or *) override; void visit(const Not *) override; - void visit(const Call *) override; void visit(const Select *) override; void visit(const Load *) override; - void visit(const Store *) override; + void visit(const Ramp *) override; + void visit(const Broadcast *) override; + void visit(const Call *) override; void visit(const Let *) override; void visit(const LetStmt *) override; void visit(const AssertStmt *) override; - void visit(const ProducerConsumer *) override; +// void visit(const ProducerConsumer *) override; void visit(const For *) override; - void visit(const Ramp *) override; - void visit(const Broadcast *) override; + void visit(const Store *) override; void visit(const Provide *) override; void visit(const Allocate *) override; void visit(const Free *) override; @@ -108,12 +110,77 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Fork *) override; void visit(const Acquire *) override; + void visit(const Atomic *) override; + void visit_unaryop(Type t, const Expr &a, SpvOp op_code); void visit_binop(Type t, const Expr &a, const Expr &b, SpvOp op_code); + void visit_glsl_unaryop(Type t, const Expr &a, SpvId glsl_op_code); + void visit_glsl_binop(Type t, const Expr &a, const Expr &b, SpvId glsl_op_code); + + void load_from_scalar_index( SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class ); + void load_from_vector_index( const Load *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class ); + + void store_at_scalar_index( SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id ); + void store_at_vector_index( const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id ); + + using BuiltinMap = std::unordered_map; + const BuiltinMap spirv_builtin = { + {"is_nan_f16", SpvOpIsNan}, + {"is_nan_f32", SpvOpIsNan}, + {"is_nan_f64", SpvOpIsNan}, + {"is_inf_f16", SpvOpIsInf}, + {"is_inf_f32", SpvOpIsInf}, + {"is_inf_f64", SpvOpIsInf}, + }; + + const BuiltinMap glsl_builtin = { + {"acos_f16", GLSLstd450Acos}, + {"acos_f32", GLSLstd450Acos}, + {"acosh_f16", GLSLstd450Acosh}, + {"acosh_f32", GLSLstd450Acosh}, + {"asin_f16", GLSLstd450Asin}, + {"asin_f32", GLSLstd450Asin}, + {"asinh_f16", GLSLstd450Asinh}, + {"asinh_f32", GLSLstd450Asinh}, + {"atan2_f16", GLSLstd450Atan}, // also called atan in GLSL + {"atan2_f32", GLSLstd450Atan}, + {"atan_f16", GLSLstd450Atan}, + {"atan_f32", GLSLstd450Atan}, + {"atanh_f16", GLSLstd450Atanh}, + {"atanh_f32", GLSLstd450Atanh}, + {"ceil_f16", GLSLstd450Ceil}, + {"ceil_f32", GLSLstd450Ceil}, + {"cos_f16", GLSLstd450Cos}, + {"cos_f32", GLSLstd450Cos}, + {"cosh_f16", GLSLstd450Cosh}, + {"cosh_f32", GLSLstd450Cosh}, + {"exp_f16", GLSLstd450Exp}, + {"exp_f32", GLSLstd450Exp}, + {"fast_inverse_sqrt_f16", GLSLstd450InverseSqrt}, + {"fast_inverse_sqrt_f32", GLSLstd450InverseSqrt}, + {"floor_f16", GLSLstd450Floor}, + {"floor_f32", GLSLstd450Floor}, + 
{"log_f16", GLSLstd450Log}, + {"log_f32", GLSLstd450Log}, + {"sin_f16", GLSLstd450Sin}, + {"sin_f32", GLSLstd450Sin}, + {"sinh_f16", GLSLstd450Sinh}, + {"sinh_f32", GLSLstd450Sinh}, + {"sqrt_f16", GLSLstd450Sqrt}, + {"sqrt_f32", GLSLstd450Sqrt}, + {"tan_f16", GLSLstd450Tan}, + {"tan_f32", GLSLstd450Tan}, + {"tanh_f16", GLSLstd450Tanh}, + {"tanh_f32", GLSLstd450Tanh}, + {"trunc_f16", GLSLstd450Trunc}, + {"trunc_f32", GLSLstd450Trunc}, + }; + // The SPIRV-IR builder SpvBuilder builder; @@ -132,6 +199,20 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { using ScopedSymbolBinding = ScopedBinding; SymbolScope symbol_table; + // Keep track of the descriptor sets so we can add a sidecar to module + // indicating which descriptor set to use for each entry point + struct DescriptorSet { + std::string entry_point_name; + uint32_t uniform_buffer_count = 0; + uint32_t storage_buffer_count = 0; + }; + using DescriptorSetTable = std::vector; + DescriptorSetTable descriptor_set_table; + + // Encode the descriptor sets into a sidecar which will be added + // as a header to the module prior to the actual SPIR-V binary + void encode_header(SpvBinary& spirv_header); + // The workgroup size. Must be the same for all kernels. uint32_t workgroup_size[3]; @@ -164,9 +245,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(const Expr &e) { SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::map_type_to_pair(const Type &t) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::map_type_to_pair(): " << t << "\n"; SpvId base_type_id = builder.declare_type(t); - const std::string &type_name = type_to_c_type(t, false, false) + std::string("_pair"); SpvBuilder::StructMemberTypes member_type_ids = {base_type_id, base_type_id}; - SpvId struct_type_id = builder.declare_struct(type_name, member_type_ids); + const std::string struct_name = std::string("_struct_") + type_to_c_type(t, false, false) + std::string("_pair"); + SpvId struct_type_id = builder.declare_struct(struct_name, member_type_ids); return struct_type_id; } @@ -243,46 +324,72 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): " << op->value.type() << " to " << op->type << "\n"; + Type value_type = op->value.type(); + Type target_type = op->type; + SpvId target_type_id = builder.declare_type(target_type); + op->value.accept(this); + SpvId src_id = builder.current_id(); + SpvOp op_code = SpvOpNop; - if (op->value.type().is_float()) { - if (op->type.is_float()) { + if (value_type.is_float()) { + if (target_type.is_float()) { op_code = SpvOpFConvert; - } else if (op->type.is_uint()) { + } else if (target_type.is_uint()) { op_code = SpvOpConvertFToU; - } else if (op->type.is_int()) { + } else if (target_type.is_int()) { op_code = SpvOpConvertFToS; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; } - } else if (op->value.type().is_uint()) { - if (op->type.is_float()) { + } else if (value_type.is_uint()) { + if (target_type.is_float()) { op_code = SpvOpConvertUToF; - } else if (op->type.is_uint()) { + } else if (target_type.is_uint()) { op_code = SpvOpUConvert; - } else if (op->type.is_int()) { - op_code = SpvOpSatConvertUToS; + } else if (target_type.is_int()) { + 
if(builder.is_capability_required(SpvCapabilityKernel)) { + op_code = SpvOpSatConvertUToS; + } else { + op_code = SpvOpBitcast; + } } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; } - } else if (op->value.type().is_int()) { - if (op->type.is_float()) { + } else if (value_type.is_int()) { + if (target_type.is_float()) { op_code = SpvOpConvertSToF; - } else if (op->type.is_uint()) { - op_code = SpvOpSatConvertSToU; - } else if (op->type.is_int()) { + } else if (target_type.is_uint()) { + if(builder.is_capability_required(SpvCapabilityKernel)) { + op_code = SpvOpSatConvertSToU; + } else { + op_code = SpvOpBitcast; + } + } else if (target_type.is_int()) { op_code = SpvOpSConvert; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; } } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; } + + SpvId result_id = builder.reserve_id(SpvResultId); + if(op_code == SpvOpBitcast) { + builder.append(SpvFactory::bitcast(target_type_id, result_id, src_id)); + } else { + builder.append(SpvFactory::convert(op_code, target_type_id, result_id, src_id)); + } + builder.update_id(result_id); +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Reinterpret *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Reinterpret): " << op->value.type() << " to " << op->type << "\n"; SpvId type_id = builder.declare_type(op->type); op->value.accept(this); SpvId src_id = builder.current_id(); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::convert(op_code, type_id, result_id, src_id)); + builder.append(SpvFactory::bitcast(type_id, result_id, src_id)); builder.update_id(result_id); } @@ -442,12 +549,85 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Call): " << op->type << " " << op->name << " args=" << (uint32_t)op->args.size() << "\n"; if (op->is_intrinsic(Call::gpu_thread_barrier)) { - // TODO: Check the scopes here and figure out if this is the - // right memory barrier. Might be able to use - // SpvMemorySemanticsMaskNone instead. 
- SpvId current_function_id = builder.current_function().id(); - builder.append(SpvFactory::control_barrier(current_function_id, current_function_id, - SpvMemorySemanticsAcquireReleaseMask)); + internal_assert(op->args.size() == 1) << "gpu_thread_barrier() intrinsic must specify memory fence type.\n"; + + const auto *fence_type_ptr = as_const_int(op->args[0]); + internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n"; + auto fence_type = *fence_type_ptr; + + if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device && + !(fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared)) { + + uint32_t execution_scope = SpvDeviceScope; + uint32_t memory_scope = SpvDeviceScope; + uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsWorkgroupMemoryMask); + SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); + SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); + SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); + builder.append(SpvFactory::control_barrier(exec_scope_id, memory_scope_id, control_mask_id)); + + } else if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device) { + uint32_t execution_scope = SpvDeviceScope; + uint32_t memory_scope = SpvDeviceScope; + uint32_t control_mask = (SpvMemorySemanticsSequentiallyConsistentMask | + SpvMemorySemanticsUniformMemoryMask | + SpvMemorySemanticsSubgroupMemoryMask | + SpvMemorySemanticsWorkgroupMemoryMask | + SpvMemorySemanticsCrossWorkgroupMemoryMask | + SpvMemorySemanticsAtomicCounterMemoryMask | + SpvMemorySemanticsImageMemoryMask); + SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); + SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); + SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); + builder.append(SpvFactory::control_barrier(exec_scope_id, memory_scope_id, control_mask_id)); + + } else if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared) { + uint32_t memory_scope = SpvWorkgroupScope; + uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsWorkgroupMemoryMask); + SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); + SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); + builder.append(SpvFactory::memory_barrier(memory_scope_id, control_mask_id)); + + } else { + + uint32_t execution_scope = SpvDeviceScope; + uint32_t memory_scope = SpvDeviceScope; + uint32_t control_mask = SpvMemorySemanticsMaskNone; + SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); + SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); + SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); + builder.append(SpvFactory::control_barrier(exec_scope_id, memory_scope_id, control_mask_id)); + } + } else if (op->is_intrinsic(Call::abs)) { + internal_assert(op->args.size() == 1); + + SpvId op_code = SpvInvalidId; + if (op->type.is_float()) { + op_code = GLSLstd450FAbs; + } else { + op_code = GLSLstd450SAbs; + } + visit_glsl_unaryop(op->type, op->args[0], op_code); + + } else if (op->is_intrinsic(Call::IntrinsicOp::round)) { + internal_assert(op->args.size() == 1); + visit_glsl_unaryop(op->type, op->args[0], GLSLstd450RoundEven); + + } else if (op->is_intrinsic(Call::absd)) { + internal_assert(op->args.size() == 2); + Expr a = op->args[0]; + Expr b = op->args[1]; + Expr e = cast(op->type, select(a < b, b - a, a - b)); + 
e->accept(this); + } else if (op->is_intrinsic(Call::return_second)) { + internal_assert(op->args.size() == 2); + // Simply discard the first argument, which is generally a call to + // 'halide_printf'. + if(op->args[1].defined()) { + op->args[1]->accept(this); + } } else if (op->is_intrinsic(Call::bitwise_and)) { internal_assert(op->args.size() == 2); visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseAnd); @@ -463,7 +643,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { op->args[0]->accept(this); SpvId arg_id = builder.current_id(); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::logical_not(type_id, result_id, arg_id)); + builder.append(SpvFactory::bitwise_not(type_id, result_id, arg_id)); builder.update_id(result_id); } else if (op->is_intrinsic(Call::if_then_else)) { if (op->type.is_vector()) { @@ -480,65 +660,95 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (op->is_intrinsic(Call::IntrinsicOp::div_round_to_zero)) { internal_assert(op->args.size() == 2); SpvOp op_code = SpvOpNop; - if (op->type.is_int()) { + if (op->type.is_float()) { + op_code = SpvOpFDiv; + } else if (op->type.is_int()) { op_code = SpvOpSDiv; } else if (op->type.is_uint()) { op_code = SpvOpUDiv; } else { - internal_error << "div_round_to_zero of non-integer type.\n"; + internal_error << "div_round_to_zero of unhandled type.\n"; } visit_binop(op->type, op->args[0], op->args[1], op_code); } else if (op->is_intrinsic(Call::IntrinsicOp::mod_round_to_zero)) { internal_assert(op->args.size() == 2); SpvOp op_code = SpvOpNop; - if (op->type.is_int()) { + if (op->type.is_float()) { + op_code = SpvOpFMod; + } else if (op->type.is_int()) { op_code = SpvOpSMod; } else if (op->type.is_uint()) { op_code = SpvOpUMod; } else { - internal_error << "mod_round_to_zero of non-integer type.\n"; + internal_error << "mod_round_to_zero of unhandled type.\n"; } visit_binop(op->type, op->args[0], op->args[1], op_code); - } else if (op->is_intrinsic(Call::IntrinsicOp::mul_shift_right)) { - internal_assert(op->args.size() == 3); - uint32_t type_id = builder.declare_type(op->type); - op->args[0].accept(this); - SpvId src_a_id = builder.current_id(); - op->args[1].accept(this); - SpvId src_b_id = builder.current_id(); - - SpvId pair_type_id = map_type_to_pair(op->type); - - // Double width multiply - SpvId product_pair_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::multiply_extended(pair_type_id, product_pair_id, src_a_id, src_b_id, op->type.is_uint() ? false : true)); - - SpvFactory::Indices indices = {1}; - uint32_t high_item_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_extract(type_id, high_item_id, product_pair_id, indices)); - - const UIntImm *shift = op->args[2].as(); - internal_assert(shift != nullptr) << "Third argument to mul_shift_right intrinsic must be an unsigned integer immediate.\n"; - - SpvId result_id = high_item_id; - if (shift->value != 0) { - // TODO: This code depends on compilation happening on a little-endian host. 
- SpvId shift_amount_id = builder.declare_constant(shift->type, &shift->value); - result_id = builder.reserve_id(SpvResultId); - if (op->type.is_uint()) { - builder.append(SpvFactory::shift_right_logical(type_id, result_id, high_item_id, shift_amount_id)); - } else { - builder.append(SpvFactory::shift_right_arithmetic(type_id, result_id, high_item_id, shift_amount_id)); - } + } else if (op->is_intrinsic(Call::shift_right)) { + if (op->type.is_uint()) { + visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftRightLogical); + } else { + visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftRightArithmetic); } - builder.update_id(result_id); + } else if (op->is_intrinsic(Call::shift_left)) { + visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftLeftLogical); + } else if (op->is_intrinsic(Call::strict_float)) { + // TODO: Enable/Disable RelaxedPrecision flags? + internal_assert(op->args.size() == 1); + op->args[0].accept(this); } else if (op->is_intrinsic(Call::IntrinsicOp::sorted_avg)) { internal_assert(op->args.size() == 2); // b > a, so the following works without widening: // a + (b - a)/2 Expr e = op->args[0] + (op->args[1] - op->args[0]) / 2; e.accept(this); + + } else if (op->is_intrinsic(Call::widen_right_add) || + op->is_intrinsic(Call::widen_right_mul) || + op->is_intrinsic(Call::widen_right_sub) || + op->is_intrinsic(Call::widening_add) || + op->is_intrinsic(Call::widening_mul) || + op->is_intrinsic(Call::widening_sub) || + op->is_intrinsic(Call::widening_shift_left) || + op->is_intrinsic(Call::widening_shift_right) || + op->is_intrinsic(Call::rounding_shift_left) || + op->is_intrinsic(Call::rounding_shift_right) || + op->is_intrinsic(Call::saturating_cast) || + op->is_intrinsic(Call::saturating_add) || + op->is_intrinsic(Call::saturating_sub) || + op->is_intrinsic(Call::saturating_cast) || + op->is_intrinsic(Call::halving_add) || + op->is_intrinsic(Call::halving_sub) || + op->is_intrinsic(Call::rounding_halving_add) || + op->is_intrinsic(Call::mul_shift_right) || + op->is_intrinsic(Call::rounding_mul_shift_right)) { + Expr e = lower_intrinsic(op); + e.accept(this); + return; + } else { + BuiltinMap::const_iterator spirv_it = spirv_builtin.find(op->name); + if(spirv_it != spirv_builtin.end()) { + SpvId spirv_op = spirv_it->second; + if((spirv_op == SpvOpIsInf) || (spirv_op == SpvOpIsNan)) { + internal_assert(op->args.size() == 1); + visit_unaryop(op->type, op->args[0], (SpvOp)spirv_op); + } else { + user_error << "Vulkan: unhandled SPIR-V builtin function '" << op->name << "' encountered.\n"; + } + } + BuiltinMap::const_iterator glsl_it = glsl_builtin.find(op->name); + if (glsl_it == glsl_builtin.end()) { + user_error << "Vulkan: unknown function '" << op->name << "' encountered.\n"; + } + + SpvId glsl_op_code = glsl_it->second; + if(is_glsl_unary_op(glsl_op_code)) { + internal_assert(op->args.size() == 1); + visit_glsl_unaryop(op->type, op->args[0], glsl_op_code); + } else if(is_glsl_binary_op(glsl_op_code)) { + internal_assert(op->args.size() == 2); + visit_glsl_binop(op->type, op->args[0], op->args[1], glsl_op_code); + } } } @@ -556,15 +766,167 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Select *op) { builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): " << (Expr)op << "\n"; - user_assert(is_const_one(op->predicate)) << "Predicated loads not supported by SPIR-V codegen\n"; +void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index( SpvId 
index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class ) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(): " + << "index_id=" << index_id << " " + << "base_id=" << base_id << " " + << "type_id=" << type_id << " " + << "ptr_type_id=" << ptr_type_id << " " + << "storage_class=" << storage_class << "\n"; + + uint32_t zero = 0; + SpvId src_id = SpvInvalidId; + SpvId src_index_id = index_id; + if(storage_class == SpvStorageClassUniform) { + SpvId zero_id = builder.declare_constant(UInt(32), &zero); + SpvFactory::Indices access_indices = {zero_id, src_index_id}; + src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + } else if(storage_class == SpvStorageClassWorkgroup) { + SpvFactory::Indices access_indices = {src_index_id}; + src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + } else if(storage_class == SpvStorageClassFunction) { + src_id = base_id; + } else { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): unhandled storage class encountered on op: " << storage_class << "\n"; + } + internal_assert(src_id != SpvInvalidId); + + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(type_id, result_id, src_id)); + builder.update_id(result_id); +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index( const Load *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class ) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(): " + << "base_id=" << base_id << " " + << "type_id=" << type_id << " " + << "ptr_type_id=" << ptr_type_id << " " + << "storage_class=" << storage_class << "\n"; + + internal_assert(op->index.type().is_vector()); + + // determine the base type id for the source value + SpvId base_type_id = builder.type_of(base_id); + if(builder.is_pointer_type(base_type_id)) { + base_type_id = builder.lookup_base_type(base_type_id); + } + + // If this is a dense vector load and the buffer has a vector base type, + // then index the buffer using the base of the ramp divided by the number + // of lanes. + SpvFactory::Indices index_components; + if(builder.is_vector_type(base_type_id)) { + Expr ramp_base = strided_ramp_base(op->index); + if (ramp_base.defined()) { + Expr ramp_index = (ramp_base / op->type.lanes()); + ramp_index.accept(this); + SpvId index_id = builder.current_id(); + load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); + return; + } + } + + op->index.accept(this); + SpvId index_id = builder.current_id(); + + // Gather vector elements. 
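+    // Since SPIR-V has no gather operation for arbitrary index vectors, each
+    // lane's index is extracted with OpCompositeExtract, that element is loaded
+    // through a scalar access chain, and the per-lane results are recombined
+    // into a vector with OpCompositeConstruct. (Dense ramps over vector-typed
+    // buffers were already handled by the scalar fast path above.)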
+ SpvFactory::Components loaded_values; + SpvId value_type_id = builder.declare_type(op->type.with_lanes(1)); + SpvId index_type_id = builder.declare_type(op->index.type().with_lanes(1)); + for (uint32_t i = 0; i < (uint32_t)op->index.type().lanes(); i++) { + SpvFactory::Indices extract_indices = { i }; + SpvId index_component_id = builder.declare_variable(unique_name("_idx_ex"), index_type_id, SpvStorageClassFunction); + builder.append(SpvFactory::composite_extract(index_type_id, index_component_id, index_id, extract_indices)); + load_from_scalar_index(index_component_id, base_id, type_id, ptr_type_id, storage_class); + SpvId value_id = builder.current_id(); + SpvId value_component_id = builder.declare_variable(unique_name("_val_ex"), value_type_id, SpvStorageClassFunction); + builder.append(SpvFactory::composite_extract(value_type_id, value_component_id, value_id, extract_indices)); + loaded_values.push_back(value_component_id); + } + + // Create a composite vector from the individual loads + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_construct(type_id, result_id, loaded_values)); + builder.update_id(result_id); +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index( SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id ) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(): " + << "index_id=" << index_id << " " + << "base_id=" << base_id << " " + << "type_id=" << type_id << " " + << "ptr_type_id=" << ptr_type_id << " " + << "storage_class=" << storage_class << " " + << "value_id=" << value_id << "\n"; + + uint32_t zero = 0; + SpvId dst_id = SpvInvalidId; + SpvId dst_index_id = index_id; + + if(storage_class == SpvStorageClassUniform) { + SpvId zero_id = builder.declare_constant(UInt(32), &zero); + SpvFactory::Indices access_indices = {zero_id, dst_index_id}; + dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + } else if(storage_class == SpvStorageClassWorkgroup) { + SpvFactory::Indices access_indices = {dst_index_id}; + dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + } else if(storage_class == SpvStorageClassFunction) { + dst_id = base_id; + } else { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): unhandled storage class encountered on op: " << storage_class << "\n"; + } + internal_assert(dst_id != SpvInvalidId); + builder.append(SpvFactory::store(dst_id, value_id)); +} - // TODO: implement vector loads - // TODO: correct casting to the appropriate memory space +void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index( const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id ) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(): " + << "base_id=" << base_id << " " + << "type_id=" << type_id << " " + << "ptr_type_id=" << ptr_type_id << " " + << "storage_class=" << storage_class << "\n"; - internal_assert(!(op->index.type().is_vector())); - internal_assert(op->param.defined() && op->param.is_buffer()); + internal_assert(op->index.type().is_vector()); + + // determine the base type id for the source value + SpvId base_type_id = builder.type_of(base_id); + if(builder.is_pointer_type(base_type_id)) { + base_type_id = builder.lookup_base_type(base_type_id); + } + + // If this is a dense vector load and the buffer has a vector base type, + // then index the buffer using the base of the ramp 
divided by the number + // of lanes. + if(builder.is_vector_type(base_type_id)) { + Expr ramp_base = strided_ramp_base(op->index); + if (ramp_base.defined()) { + Expr ramp_index = (ramp_base / op->value.type().lanes()); + ramp_index.accept(this); + SpvId index_id = builder.current_id(); + store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); + return; + } + } + + op->index.accept(this); + SpvId index_id = builder.current_id(); + + // Scatter vector elements. + SpvId value_type_id = builder.declare_type(op->value.type().with_lanes(1)); + SpvId index_type_id = builder.declare_type(op->index.type().with_lanes(1)); + for (uint32_t i = 0; i < (uint32_t)op->index.type().lanes(); i++) { + SpvFactory::Indices extract_indices = { i }; + SpvId index_component_id = builder.declare_variable(unique_name("_idx_ex"), index_type_id, SpvStorageClassFunction); + builder.append(SpvFactory::composite_extract(index_type_id, index_component_id, index_id, extract_indices)); + SpvId value_component_id = builder.declare_variable(unique_name("_val_ex"), value_type_id, SpvStorageClassFunction); + builder.append(SpvFactory::composite_extract(value_type_id, value_component_id, value_id, extract_indices)); + store_at_scalar_index(index_component_id, base_id, type_id, ptr_type_id, storage_class, value_component_id); + } +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): " << op->type << " " << op->name << "[" << op->index << "]\n"; + user_assert(is_const_one(op->predicate)) << "Predicated loads not supported by SPIR-V codegen\n"; // Construct the pointer to read from internal_assert(symbol_table.contains(op->name)); @@ -574,33 +936,22 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { internal_assert(base_id != SpvInvalidId); internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); - op->index.accept(this); - SpvId index_id = builder.current_id(); - - uint32_t zero = 0; SpvId type_id = builder.declare_type(op->type); - SpvId zero_id = builder.declare_constant(UInt(32), &zero); SpvId ptr_type_id = builder.declare_pointer_type(type_id, storage_class); - SpvId access_chain_id = builder.reserve_id(SpvResultId); - SpvFactory::Indices indices = {index_id}; - builder.append(SpvFactory::in_bounds_access_chain(ptr_type_id, access_chain_id, base_id, zero_id, indices)); - SpvId result_id = builder.reserve_id(SpvResultId); - SpvId result_type_id = builder.declare_type(op->type); - builder.append(SpvFactory::load(result_type_id, result_id, access_chain_id)); - builder.update_id(result_id); + if(op->index.type().is_scalar()) { + op->index.accept(this); + SpvId index_id = builder.current_id(); + load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); + } else { + load_from_vector_index(op, base_id, type_id, ptr_type_id, storage_class); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): " << op->name << "[" << op->index << "] = (" << op->value << ")\n"; user_assert(is_const_one(op->predicate)) << "Predicated stores not supported by SPIR-V codegen!\n"; - // TODO: implement vector writes - // TODO: correct casting to the appropriate memory space - - internal_assert(!(op->index.type().is_vector())); - internal_assert(op->param.defined() && op->param.is_buffer()); - op->value.accept(this); SpvId value_id = builder.current_id(); @@ -612,17 +963,16 @@ void 
CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { internal_assert(base_id != SpvInvalidId); internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); - op->index.accept(this); - SpvId index_id = builder.current_id(); SpvId type_id = builder.declare_type(op->value.type()); SpvId ptr_type_id = builder.declare_pointer_type(type_id, storage_class); - SpvId access_chain_id = builder.reserve_id(SpvResultId); - SpvId zero = 0; - SpvId zero_id = builder.declare_constant(UInt(32), &zero); - SpvFactory::Indices indices = {index_id}; - builder.append(SpvFactory::in_bounds_access_chain(ptr_type_id, access_chain_id, base_id, zero_id, indices)); - builder.append(SpvFactory::store(access_chain_id, value_id)); + if(op->index.type().is_scalar()) { + op->index.accept(this); + SpvId index_id = builder.current_id(); + store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); + } else { + store_at_vector_index(op, base_id, type_id, ptr_type_id, storage_class, value_id); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Let *let) { @@ -644,12 +994,11 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LetStmt *let) { builder.update_id(SpvInvalidId); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const AssertStmt *) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const AssertStmt *stmt) { // TODO: Fill this in. -} - -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const ProducerConsumer *) { - // I believe these nodes are solely for annotation purposes. + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(AssertStmt): " + << "condition=" << stmt->condition << " " + << "message=" << stmt->message << "\n"; } namespace { @@ -744,7 +1093,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { builder.append(SpvFactory::integer_add(index_type_id, max_id, min_id, extent_id)); // Declare loop var - SpvId loop_var_id = builder.declare_variable(unique_name("loop_index"), index_var_type_id, SpvStorageClassFunction, min_id); + SpvId loop_var_id = builder.declare_variable(unique_name("_loop_idx"), index_var_type_id, SpvStorageClassFunction, min_id); SpvId header_block_id = builder.reserve_id(SpvBlockId); SpvId top_block_id = builder.reserve_id(SpvBlockId); @@ -800,6 +1149,11 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Ramp *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Ramp): " + << "base=" << op->base << " " + << "stride=" << op->stride << " " + << "lanes=" << (uint32_t)op->lanes << "\n"; + // TODO: Is there a way to do this that doesn't require duplicating lane values? SpvId base_type_id = builder.declare_type(op->base.type()); SpvId type_id = builder.declare_type(op->type); @@ -828,6 +1182,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Ramp *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Broadcast *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Broadcast): " + << "type=" << op->type << " " + << "value=" << op->value << "\n"; + // TODO: Is there a way to do this that doesn't require duplicating lane values? 
SpvId type_id = builder.declare_type(op->type); op->value.accept(this); @@ -844,10 +1202,48 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Provide *) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Provide *): Provide encountered during codegen\n"; } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { + + SpvId base_type_id = SpvInvalidId; + SpvId variable_id = SpvInvalidId; + SpvStorageClass storage_class = SpvStorageClassGeneric; + if (op->memory_type == MemoryType::GPUShared) { + // Allocation of shared memory must be declared at global scope + internal_assert(op->extents.size() == 1 && is_const(op->extents[0])); + int32_t size = op->constant_allocation_size(); + base_type_id = builder.declare_type(op->type, size); + storage_class = SpvStorageClassWorkgroup; // shared across workgroup + debug(2) << "Vulkan: Allocate " << op->name << "[" << (uint32_t)size << "] in shared memory on device in global scope\n"; + SpvId ptr_type_id = builder.declare_pointer_type(base_type_id, storage_class); + variable_id = builder.declare_global_variable(op->name, ptr_type_id, storage_class); + + } else { + + // Allocation is not a shared memory allocation, just make a local declaration. + debug(2) << "Vulkan: Allocate " << op->name << " on device in function scope\n"; + int32_t size = op->constant_allocation_size(); + + // It must have a constant size. + user_assert(size > 0) + << "Allocation " << op->name << " has a dynamic size. " + << "Only fixed-size allocations are supported on the gpu. " + << "Try storing into shared memory instead."; + + base_type_id = builder.declare_type(op->type, size); + storage_class = SpvStorageClassFunction; // function scope + SpvId ptr_type_id = builder.declare_pointer_type(base_type_id, storage_class); + variable_id = builder.declare_variable(op->name, ptr_type_id, storage_class); + } + + debug(3) << "Vulkan: Pushing allocation called " << op->name << " onto the symbol table\n"; + symbol_table.push(op->name, {variable_id, storage_class}); + op->body.accept(this); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Free *) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Free *op) { + debug(3) << "Vulkan: Popping allocation called " << op->name << " off the symbol table\n"; + internal_assert(symbol_table.contains(op->name)); + symbol_table.pop(op->name); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Realize *) { @@ -861,26 +1257,30 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, condition.accept(this); SpvId cond_id = builder.current_id(); SpvId merge_block_id = builder.reserve_id(SpvBlockId); - // SpvId if_block_id = builder.reserve_id(SpvBlockId); + SpvId if_block_id = builder.reserve_id(SpvBlockId); SpvId then_block_id = builder.reserve_id(SpvBlockId); SpvId else_block_id = else_case.defined() ? 
builder.reserve_id(SpvBlockId) : merge_block_id; SpvFactory::BlockVariables block_vars; // If Conditional - // SpvBlock if_block = builder.create_block(if_block_id); - // builder.enter_block(if_block); - // { - builder.append(SpvFactory::selection_merge(merge_block_id, SpvSelectionControlMaskNone)); - builder.append(SpvFactory::conditional_branch(cond_id, then_block_id, else_block_id)); - // } - // builder.leave_block(); + SpvBlock if_block = builder.create_block(if_block_id); + builder.enter_block(if_block); + { + debug(2) << "Vulkan: If (" << condition << " )\n"; + + builder.append(SpvFactory::selection_merge(merge_block_id, SpvSelectionControlMaskNone)); + builder.append(SpvFactory::conditional_branch(cond_id, then_block_id, else_block_id)); + } + builder.leave_block(); // Then block SpvBlock then_block = builder.create_block(then_block_id); builder.enter_block(then_block); { then_case.accept(this); + debug(2) << "Vulkan: Then {" << then_case << " }\n"; + SpvId then_id = builder.current_id(); builder.append(SpvFactory::branch(merge_block_id)); block_vars.push_back({then_id, then_block_id}); @@ -893,6 +1293,7 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, builder.enter_block(else_block); { else_case.accept(this); + debug(2) << "Vulkan: Else { " << else_case << " }\n"; SpvId else_id = builder.current_id(); builder.append(SpvFactory::branch(merge_block_id)); block_vars.push_back({else_id, else_block_id}); @@ -930,19 +1331,36 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { builder.update_id(result_id); } +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const VectorReduce *) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const VectorReduce *): VectorReduce not implemented for codegen\n"; +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Prefetch *) { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Prefetch *): Prefetch encountered during codegen\n"; + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Prefetch *): Prefetch not implemented for codegen\n"; } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Fork *) { - internal_error << "void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Fork *) not supported yet."; + internal_error << "void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Fork *): Fork not implemented for codegen"; } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Acquire *) { - internal_error << "void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Acquire *) not supported yet."; + internal_error << "void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Acquire *): Acquire not implemented for codegen"; +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Atomic *) { + internal_error << "void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Atomic *): Atomic not implemented for codegen"; +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_unaryop(Type t, const Expr &a, SpvOp op_code) { + SpvId type_id = builder.declare_type(t); + a.accept(this); + SpvId src_a_id = builder.current_id(); + + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::unary_op(op_code, type_id, result_id, src_a_id)); + builder.update_id(result_id); } -// TODO: fast math decorations. 
void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_binop(Type t, const Expr &a, const Expr &b, SpvOp op_code) { SpvId type_id = builder.declare_type(t); a.accept(this); @@ -955,6 +1373,32 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_binop(Type t, const Expr &a, const builder.update_id(result_id); } +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_glsl_unaryop(Type type, const Expr &a, SpvId glsl_op_code) { + uint32_t type_id = builder.declare_type(type); + a.accept(this); + SpvId src_a_id = builder.current_id(); + + SpvId inst_set_id = builder.import_glsl_intrinsics(); + SpvId result_id = builder.reserve_id(SpvResultId); + SpvFactory::Operands operands = {src_a_id}; + builder.append(SpvFactory::extended(inst_set_id, glsl_op_code, type_id, result_id, operands)); + builder.update_id(result_id); +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_glsl_binop(Type type, const Expr &a, const Expr &b, SpvId glsl_op_code) { + SpvId type_id = builder.declare_type(type); + a.accept(this); + SpvId src_a_id = builder.current_id(); + b.accept(this); + SpvId src_b_id = builder.current_id(); + + SpvId inst_set_id = builder.import_glsl_intrinsics(); + SpvId result_id = builder.reserve_id(SpvResultId); + SpvFactory::Operands operands = {src_a_id, src_b_id}; + builder.append(SpvFactory::extended(inst_set_id, glsl_op_code, type_id, result_id, operands)); + builder.update_id(result_id); +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { builder.reset(); @@ -968,18 +1412,88 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { // TODO: Is there a better memory model to use? builder.set_memory_model(SpvMemoryModelGLSL450); - // Capabilities + // NOTE: Execution model for Vulkan must be GLCompute which requires Shader support builder.require_capability(SpvCapabilityShader); // NOTE: Extensions are handled in finalize } +void CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header(SpvBinary& spirv_header) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header\n"; + + // Encode a sidecar for the module that lists the descriptor sets + // corresponding to each entry point contained in the module + // + // [0] Header word count (total length of header) + // [1] Number of descriptor sets + // ... For each descriptor set ... + // ... [0] Number of uniform buffers for this descriptor set + // ... [1] Number of storage buffers for this descriptor set + // ... [2] Length of entry point name (padded to nearest word size) + // ... [X] Entry point string data + // + + // NOTE: The Vulkan runtime consumes this header prior to compiling. + // + // Both vk_decode_entry_point_data() and vk_compile_shader_module() will + // need to be updated if the header encoding ever changes! 
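+    //
+    // For reference, a consumer could walk this layout roughly as follows
+    // (an illustrative sketch only; the variable names are hypothetical and
+    // this is not the actual runtime decoding code):
+    //
+    //   const uint32_t *word = (const uint32_t *)module.data();
+    //   uint32_t header_word_count = word[0];     // total header words, including this one
+    //   uint32_t descriptor_set_count = word[1];
+    //   const uint32_t *entry = word + 2;
+    //   for (uint32_t n = 0; n < descriptor_set_count; n++) {
+    //       uint32_t uniform_buffer_count = entry[0];
+    //       uint32_t storage_buffer_count = entry[1];
+    //       uint32_t name_length = entry[2];      // in bytes, padded to word size
+    //       const char *entry_point_name = (const char *)(entry + 3);
+    //       entry += 3 + (name_length / 4);
+    //   }
+    //   const uint32_t *spirv_binary = word + header_word_count;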
+ // + uint32_t index = 0; + spirv_header.push_back(descriptor_set_table.size()); + for(const DescriptorSet& ds: descriptor_set_table) { + std::vector padded_name; + uint32_t padded_word_count = (ds.entry_point_name.length() + 3) / 4; + uint32_t padded_str_length = padded_word_count * 4; + padded_name.reserve(padded_str_length); + padded_name.insert(padded_name.begin(), ds.entry_point_name.c_str(), (ds.entry_point_name.c_str() + ds.entry_point_name.length())); + uint32_t padding = (padded_str_length - ds.entry_point_name.length()); + for(uint32_t i = 0; i < padding; ++i) { + padded_name.push_back('\0'); + } + + debug(2) << " [" << index << "] " + << "uniform_buffer_count=" << ds.uniform_buffer_count << " " + << "storage_buffer_count=" << ds.storage_buffer_count << " " + << "entry_point_name_length=" << padded_str_length << " " + << "entry_point_name_size=" << padded_name.size() << " " + << "entry_point_name: " << (const char*)padded_name.data() << "\n"; + + spirv_header.push_back(ds.uniform_buffer_count); + spirv_header.push_back(ds.storage_buffer_count); + spirv_header.push_back(padded_str_length); + internal_assert(padded_name.size() == padded_str_length); + spirv_header.insert(spirv_header.end(), (const uint32_t *)padded_name.data(), (const uint32_t *)(padded_name.data() + padded_name.size())); + ++index; + } + uint32_t header_word_count = spirv_header.size(); + spirv_header.insert(spirv_header.begin(), header_word_count + 1); +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::compile\n"; + + // First encode the descriptor set bindings for each entry point + // as a sidecar which we will add as a preamble header to the actual + // SPIR-V binary so the runtime can know which descriptor set to use + // for each entry point + SpvBinary spirv_header; + encode_header(spirv_header); + + // Finalize and encode the SPIR-V IR into a compliant binary SpvBinary spirv_binary; builder.finalize(); builder.encode(spirv_binary); - module.reserve(spirv_binary.size() * sizeof(uint32_t)); + + size_t header_bytes = spirv_header.size() * sizeof(uint32_t); + size_t binary_bytes = spirv_binary.size() * sizeof(uint32_t); + + debug(2) << " encoding module (" + << "header_size: " << (uint32_t)(header_bytes) << ", " + << "binary_size: " << (uint32_t)(binary_bytes) << ")\n"; + + // Combine the header and binary into the module + module.reserve(header_bytes + binary_bytes); + module.insert(module.end(), (const char *)spirv_header.data(), (const char *)(spirv_header.data() + spirv_header.size())); module.insert(module.end(), (const char *)spirv_binary.data(), (const char *)(spirv_binary.data() + spirv_binary.size())); } @@ -996,6 +1510,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, workgroup_size[1] = 0; workgroup_size[2] = 0; + // Keep track of the descriptor set needed to bind this kernel's inputs / outputs + DescriptorSet descriptor_set; + descriptor_set.entry_point_name = name; + // Declare the kernel function SpvId void_type_id = builder.declare_void_type(); SpvId kernel_func_id = builder.add_function(name, void_type_id); @@ -1024,7 +1542,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, entry_point_variables.push_back(intrinsic_id); } - // Add the entry point and exection mode + // Add the entry point with the appropriate execution model + // NOTE: exec_model must be GLCompute to work with Vulkan ... 
Kernel is only supported in OpenCL + uint32_t current_entry_point = builder.current_module().entry_point_count(); builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); // GLSL-style: each input buffer is a runtime array in a buffer struct @@ -1037,42 +1557,80 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, param_struct_members.push_back(arg_type_id); } } - SpvId param_struct_type_id = builder.declare_struct(unique_name("param_struct"), param_struct_members); - // Add a decoration describing the offset for each parameter struct member - uint32_t param_member_index = 0; - uint32_t param_member_offset = 0; - for (const auto &arg : args) { - if (!arg.is_buffer) { - SpvBuilder::Literals param_offset_literals = {param_member_offset}; - builder.add_struct_annotation(param_struct_type_id, param_member_index, SpvDecorationOffset, param_offset_literals); - param_member_offset += arg.type.bytes(); - param_member_index++; + // Add a binding for a uniform buffer packed with all scalar args + uint32_t binding_counter = 0; + if(param_struct_members.size() > 0) { + const std::string struct_name = std::string("_struct") + name + std::string("_args"); + SpvId param_struct_type_id = builder.declare_struct(struct_name, param_struct_members); + + // Add a decoration describing the offset for each parameter struct member + uint32_t param_member_index = 0; + uint32_t param_member_offset = 0; + for (const auto &arg : args) { + if (!arg.is_buffer) { + SpvBuilder::Literals param_offset_literals = {param_member_offset}; + builder.add_struct_annotation(param_struct_type_id, param_member_index, SpvDecorationOffset, param_offset_literals); + param_member_offset += arg.type.bytes(); + param_member_index++; + } } - } - - // Add a Block decoration for the parameter pack itself - builder.add_annotation(param_struct_type_id, SpvDecorationBlock); - // Add a variable for the parameter pack - SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); - SpvId param_pack_var_id = builder.declare_global_variable(unique_name("kernel_params"), param_pack_ptr_type_id, SpvStorageClassUniform); - - // We always pass in the parameter pack as the first binding - SpvBuilder::Literals zero_literal = {0}; - builder.add_annotation(param_pack_var_id, SpvDecorationDescriptorSet, zero_literal); - builder.add_annotation(param_pack_var_id, SpvDecorationBinding, zero_literal); + // Add a Block decoration for the parameter pack itself + builder.add_annotation(param_struct_type_id, SpvDecorationBlock); + + // Add a variable for the parameter pack + const std::string param_pack_var_name = std::string("_var") + name + std::string("_args"); + SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); + SpvId param_pack_var_id = builder.declare_global_variable(param_pack_var_name, param_pack_ptr_type_id, SpvStorageClassUniform); + + // We always pass in the parameter pack as the first binding + SpvBuilder::Literals binding_index = {0}; + SpvBuilder::Literals dset_index = {current_entry_point}; + builder.add_annotation(param_pack_var_id, SpvDecorationDescriptorSet, dset_index); + builder.add_annotation(param_pack_var_id, SpvDecorationBinding, binding_index); + descriptor_set.uniform_buffer_count++; + binding_counter++; + + // Declare all the args with appropriate offsets into the parameter struct + uint32_t scalar_index = 0; + for (const auto &arg : args) { + if (!arg.is_buffer) { + + SpvId 
arg_type_id = builder.declare_type(arg.type); + SpvId access_index_id = builder.declare_constant(UInt(32), &scalar_index); + SpvId pointer_type_id = builder.declare_pointer_type(arg_type_id, SpvStorageClassUniform); + SpvFactory::Indices access_indices = {access_index_id}; + SpvId access_chain_id = builder.declare_access_chain(pointer_type_id, param_pack_var_id, access_indices); + scalar_index++; + + SpvId param_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(arg_type_id, param_id, access_chain_id)); + symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); + } + } + } - uint32_t binding_counter = 1; - uint32_t scalar_index = 0; + // Add bindings for all device buffers for (const auto &arg : args) { if (arg.is_buffer) { + + // Add required extension support for storage types + if(arg.type.is_int_or_uint()) { + if(arg.type.bits() == 8) { + builder.require_extension("SPV_KHR_8bit_storage"); + } else if(arg.type.bits() == 16) { + builder.require_extension("SPV_KHR_16bit_storage"); + } + } + SpvId element_type_id = builder.declare_type(arg.type); SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); SpvBuilder::StructMemberTypes struct_member_types = {runtime_arr_type_id}; - SpvId struct_type_id = builder.declare_struct(unique_name("param_buffer_" + std::to_string(binding_counter)), struct_member_types); + const std::string struct_name = std::string("_struct") + name + std::string("_b") + std::to_string(binding_counter); + SpvId struct_type_id = builder.declare_struct(struct_name, struct_member_types); SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, SpvStorageClassUniform); - SpvId param_id = builder.declare_global_variable(unique_name("param_" + arg.name), ptr_struct_type_id, SpvStorageClassUniform); + SpvId param_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, SpvStorageClassUniform); // Annotate the struct to indicate it's passed in a GLSL-style buffer block builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); @@ -1086,25 +1644,15 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, builder.add_struct_annotation(struct_type_id, 0, SpvDecorationOffset, zero_literal); // Set DescriptorSet and Binding + SpvBuilder::Literals dset_index = {current_entry_point}; SpvBuilder::Literals binding_index = {uint32_t(binding_counter++)}; - builder.add_annotation(param_id, SpvDecorationDescriptorSet, zero_literal); + builder.add_annotation(param_id, SpvDecorationDescriptorSet, dset_index); builder.add_annotation(param_id, SpvDecorationBinding, binding_index); symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); - - } else { - - SpvId arg_type_id = builder.declare_type(arg.type); - SpvId access_index_id = builder.declare_constant(UInt(32), &scalar_index); - SpvId pointer_type_id = builder.declare_pointer_type(arg_type_id, SpvStorageClassUniform); - SpvId access_chain_id = builder.declare_access_chain(pointer_type_id, param_pack_var_id, access_index_id, {}); - scalar_index++; - - SpvId param_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::load(arg_type_id, param_id, access_chain_id)); - symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); + descriptor_set.storage_buffer_count++; } } - + descriptor_set_table.push_back(descriptor_set); s.accept(this); // Insert return statement end delimiter @@ -1172,10 +1720,18 @@ std::string CodeGen_Vulkan_Dev::print_gpu_name(const std::string &name) { void CodeGen_Vulkan_Dev::dump() { std::vector module = 
compile_to_src(); + + // Skip the header and only output the SPIR-V binary + const uint32_t* decode = (const uint32_t*)(module.data()); + uint32_t header_word_count = decode[0]; + size_t header_size = header_word_count * sizeof(uint32_t); + const uint32_t* binary_ptr = (decode + header_word_count); + size_t binary_size = (module.size() - header_size); + const char *filename = getenv("HL_SPIRV_DUMP_FILE") ? getenv("HL_SPIRV_DUMP_FILE") : "out.spv"; debug(1) << "Vulkan: Dumping SPIRV module to file: '" << filename << "'\n"; std::ofstream f(filename, std::ios::out | std::ios::binary); - f.write((char *)(module.data()), module.size()); + f.write((const char *)(binary_ptr), binary_size); f.close(); } diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 70bb192beacc..11c4e632ba75 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -193,7 +193,7 @@ void SpvBlock::add_instruction(SpvInstruction inst) { void SpvBlock::add_variable(SpvInstruction var) { check_defined(); var.set_block(*this); - contents->instructions.push_back(var); + contents->variables.push_back(var); } void SpvBlock::set_function(SpvFunction func) { @@ -500,7 +500,12 @@ void SpvModule::set_memory_model(SpvMemoryModel val) { contents->memory_model = val; } -SpvId SpvModule::binding_count() const { +uint32_t SpvModule::entry_point_count() const { + check_defined(); + return (uint32_t)contents->entry_points.size(); +} + +uint32_t SpvModule::binding_count() const { check_defined(); return contents->binding_count; } @@ -535,6 +540,13 @@ SpvInstruction SpvModule::entry_point(const std::string &name) const { } } +void SpvModule::import_instruction_set(SpvId id, const std::string &instruction_set) { + check_defined(); + if(contents->imports.find(instruction_set) == contents->imports.end()) { + contents->imports.insert({instruction_set, id}); + } +} + void SpvModule::require_extension(const std::string &extension) { check_defined(); if (contents->extensions.find(extension) == contents->extensions.end()) { @@ -542,6 +554,14 @@ void SpvModule::require_extension(const std::string &extension) { } } +bool SpvModule::is_imported(const std::string &instruction_set) const { + check_defined(); + if (contents->imports.find(instruction_set) != contents->imports.end()) { + return true; + } + return false; +} + bool SpvModule::is_extension_required(const std::string &extension) const { check_defined(); if (contents->extensions.find(extension) != contents->extensions.end()) { @@ -574,6 +594,24 @@ SpvModule::EntryPointNames SpvModule::entry_point_names() const { return entry_point_names; } +SpvModule::ImportNames SpvModule::import_names() const { + check_defined(); + SpvModule::ImportNames results(contents->imports.size()); + for (const SpvModuleContents::Imports::value_type& v : contents->imports) { + results.push_back(v.first); + } + return results; +} + +SpvId SpvModule::lookup_import(const std::string& instruction_set) const { + SpvId result_id = SpvInvalidId; + SpvModuleContents::Imports::const_iterator it = contents->imports.find(instruction_set); + if(it != contents->imports.end()) { + result_id = it->second; + } + return result_id; +} + SpvId SpvModule::id() const { check_defined(); return contents->module_id; @@ -606,8 +644,10 @@ void SpvModule::encode(SpvBinary &binary) const { } // 3. 
Extended Instruction Set Imports - for (const std::string &import : contents->imports) { - SpvInstruction inst = SpvFactory::import(import); + for (const SpvModuleContents::Imports::value_type &import : contents->imports) { + const std::string& import_name = import.first; + SpvId import_id = import.second; + SpvInstruction inst = SpvFactory::import(import_id, import_name); inst.encode(binary); } @@ -677,6 +717,7 @@ void SpvBuilder::reset() { function_map.clear(); id_symbol_map.clear(); symbol_id_map.clear(); + base_type_map.clear(); storage_class_map.clear(); pointer_type_map.clear(); variable_type_map.clear(); @@ -801,17 +842,17 @@ std::string SpvBuilder::kind_name(SpvKind kind) const { SpvKind SpvBuilder::kind_of(SpvId item_id) const { KindMap::const_iterator it = kind_map.find(item_id); if (it != kind_map.end()) { - return SpvInvalidItem; + return it->second; } - return it->second; + return SpvInvalidItem; } SpvId SpvBuilder::type_of(SpvId variable_id) const { VariableTypeMap::const_iterator it = variable_type_map.find(variable_id); if (it != variable_type_map.end()) { - return SpvInvalidId; + return it->second; } - return it->second; + return SpvInvalidId; } void SpvBuilder::finalize() { @@ -924,7 +965,7 @@ bool SpvBuilder::has_variable(const std::string &name, SpvId type_id, SpvStorage } SpvId SpvBuilder::declare_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId init_id) { - SpvId block_id = current_block().id(); + SpvId block_id = current_function().entry_block().id(); SpvId existing_id = lookup_variable(name, type_id, storage_class, block_id); if (existing_id != SpvInvalidId) { return existing_id; @@ -932,9 +973,10 @@ SpvId SpvBuilder::declare_variable(const std::string &name, SpvId type_id, SpvSt SpvId var_id = reserve_id(SpvVariableId); debug(3) << " declare_variable: %" << var_id << " name='" << name << "' type_id=" << type_id << " storage_class=" << (uint32_t)storage_class << " init_id=" << init_id << "\n"; - current_block().add_variable(SpvFactory::variable(var_id, type_id, storage_class, init_id)); + current_function().entry_block().add_variable(SpvFactory::variable(var_id, type_id, storage_class, init_id)); declare_symbol(name, var_id, block_id); storage_class_map[var_id] = storage_class; + variable_type_map[var_id] = type_id; return var_id; } @@ -944,6 +986,7 @@ SpvId SpvBuilder::declare_global_variable(const std::string &name, SpvId type_id module.add_global(SpvFactory::variable(var_id, type_id, storage_class, init_id)); declare_symbol(name, var_id, module.id()); storage_class_map[var_id] = storage_class; + variable_type_map[var_id] = type_id; return var_id; } @@ -1073,6 +1116,10 @@ SpvId SpvBuilder::lookup_scope(SpvId id) const { return result; } +SpvId SpvBuilder::lookup_import(const std::string &instruction_set) const { + return module.lookup_import(instruction_set); +} + void SpvBuilder::enter_function(const SpvFunction &func) { active_function = func; enter_block(active_function.entry_block()); @@ -1124,12 +1171,29 @@ SpvMemoryModel SpvBuilder::memory_model() const { return module.memory_model(); } +SpvId SpvBuilder::import_glsl_intrinsics() { + return import_instruction_set("GLSL.std.450"); +} + +SpvId SpvBuilder::import_instruction_set(const std::string& instruction_set) { + SpvId result_id = module.lookup_import(instruction_set); + if (result_id == SpvInvalidId) { + result_id = make_id(SpvImportId); + module.import_instruction_set(result_id, instruction_set); + } + return result_id; +} + void 
SpvBuilder::require_capability(SpvCapability capability) { if (!module.is_capability_required(capability)) { module.require_capability(capability); } } +bool SpvBuilder::is_imported(const std::string& instruction_set) const { + return module.is_imported(instruction_set); +} + bool SpvBuilder::is_capability_required(SpvCapability capability) const { return module.is_capability_required(capability); } @@ -1170,10 +1234,21 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { } if (array_size > 1) { + + // first declare the array size as a uint32 constant value + Type array_size_type = UInt(32); + ConstantKey constant_key = make_constant_key(array_size_type, &array_size); + SpvId array_size_id = make_id(SpvIntConstantId); + SpvId array_size_type_id = add_type(array_size_type); + SpvInstruction array_size_inst = SpvFactory::constant(array_size_id, array_size_type_id, array_size_type.bytes(), &array_size); + module.add_type(array_size_inst); // needs to be defined in the type section (prior to its use in the array_type inst) + constant_map[constant_key] = array_size_id; + + // declare the array type SpvId array_type_id = make_id(SpvArrayTypeId); SpvId element_type_id = add_type(type, 1); debug(3) << " add_array_type: %" << array_type_id << " element_type_id='" << element_type_id << " array_size='" << array_size << "\n"; - SpvInstruction inst = SpvFactory::array_type(array_type_id, element_type_id, array_size); + SpvInstruction inst = SpvFactory::array_type(array_type_id, element_type_id, array_size_id); module.add_type(inst); type_map[type_key] = array_type_id; return array_type_id; @@ -1208,8 +1283,13 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { module.require_capability(SpvCapabilityFloat64); } } else if (type.is_int_or_uint()) { + SpvId signedness = 0; + bool signedness_support = module.is_capability_required(SpvCapabilityKernel) ? false : true; // kernel execution doesn't track signedness + if(signedness_support) { + signedness = type.is_uint() ? 0 : 1; + } + type_id = make_id(SpvIntTypeId); - SpvId signedness = type.is_uint() ? 0 : 1; debug(3) << " add_integer_type: %" << type_id << " bits=" << type.bits() << " signed=" << (signedness ? 
"true" : "false") << "\n"; SpvInstruction inst = SpvFactory::integer_type(type_id, type.bits(), signedness); module.add_type(inst); @@ -1330,6 +1410,7 @@ SpvId SpvBuilder::add_pointer_type(SpvId base_type_id, SpvStorageClass storage_c module.add_type(inst); pointer_type_map[key] = pointer_type_id; storage_class_map[pointer_type_id] = storage_class; + base_type_map[pointer_type_id] = base_type_id; return pointer_type_id; } @@ -1550,9 +1631,15 @@ SpvId SpvBuilder::add_constant(const Type &type, const void *data) { } } -SpvId SpvBuilder::declare_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId element_id, const Indices &indices) { +SpvId SpvBuilder::declare_access_chain(SpvId ptr_type_id, SpvId base_id, const Indices &indices) { + SpvId access_chain_id = make_id(SpvAccessChainId); + append(SpvFactory::in_bounds_access_chain(ptr_type_id, access_chain_id, base_id, indices)); + return access_chain_id; +} + +SpvId SpvBuilder::declare_pointer_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId element_id, const Indices &indices) { SpvId access_chain_id = make_id(SpvAccessChainId); - append(SpvFactory::in_bounds_access_chain(ptr_type_id, access_chain_id, base_id, element_id, indices)); + append(SpvFactory::pointer_access_chain(ptr_type_id, access_chain_id, base_id, element_id, indices)); return access_chain_id; } @@ -1599,6 +1686,69 @@ SpvId SpvBuilder::add_runtime_array(SpvId base_type_id) { return runtime_array_id; } +bool SpvBuilder::is_pointer_type(SpvId id) const { + BaseTypeMap::const_iterator it = base_type_map.find(id); + if (it != base_type_map.end()) { + return true; + } + return false; +} + +bool SpvBuilder::is_struct_type(SpvId id) const { + SpvKind kind = kind_of(id); + if(kind == SpvStructTypeId) { + return true; + } + return false; +} + +bool SpvBuilder::is_vector_type(SpvId id) const { + SpvKind kind = kind_of(id); + if(kind == SpvVectorTypeId) { + return true; + } + return false; +} + +bool SpvBuilder::is_scalar_type(SpvId id) const { + SpvKind kind = kind_of(id); + if((kind == SpvFloatTypeId) || + (kind == SpvIntTypeId) || + (kind == SpvBoolTypeId)) { + return true; + } + return false; +} + +bool SpvBuilder::is_array_type(SpvId id) const { + SpvKind kind = kind_of(id); + if((kind == SpvArrayTypeId)) { + return true; + } + return false; +} + +bool SpvBuilder::is_constant(SpvId id) const { + SpvKind kind = kind_of(id); + if((kind == SpvConstantId) || + (kind == SpvBoolConstantId) || + (kind == SpvIntConstantId) || + (kind == SpvFloatConstantId) || + (kind == SpvStringConstantId) || + (kind == SpvCompositeConstantId)) { + return true; + } + return false; +} + +SpvId SpvBuilder::lookup_base_type(SpvId pointer_type) const { + BaseTypeMap::const_iterator it = base_type_map.find(pointer_type); + if (it != base_type_map.end()) { + return it->second; + } + return SpvInvalidId; +} + void SpvBuilder::append(SpvInstruction inst) { if (active_block.is_defined()) { active_block.add_instruction(std::move(inst)); @@ -1725,11 +1875,11 @@ SpvInstruction SpvFactory::vector_type(SpvId vector_type_id, SpvId element_type_ return inst; } -SpvInstruction SpvFactory::array_type(SpvId array_type_id, SpvId element_type_id, uint32_t array_size) { +SpvInstruction SpvFactory::array_type(SpvId array_type_id, SpvId element_type_id, SpvId array_size_id) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeArray); inst.set_result_id(array_type_id); inst.add_operand(element_type_id); - inst.add_immediate(array_size); + inst.add_operand(array_size_id); return inst; } @@ -1876,18 +2026,29 @@ SpvInstruction 
SpvFactory::exec_mode_local_size(SpvId function_id, uint32_t wg_s return inst; } -SpvInstruction SpvFactory::control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, uint32_t semantics_mask) { +SpvInstruction SpvFactory::memory_barrier(SpvId memory_scope_id, SpvId semantics_mask_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpMemoryBarrier); + inst.add_operand(memory_scope_id); + inst.add_operand(semantics_mask_id); + return inst; +} + +SpvInstruction SpvFactory::control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, SpvId semantics_mask_id) { SpvInstruction inst = SpvInstruction::make(SpvOpControlBarrier); inst.add_operand(execution_scope_id); inst.add_operand(memory_scope_id); - inst.add_immediate(semantics_mask); + inst.add_operand(semantics_mask_id); return inst; } -SpvInstruction SpvFactory::logical_not(SpvId type_id, SpvId result_id, SpvId src_id) { +SpvInstruction SpvFactory::bitwise_not(SpvId type_id, SpvId result_id, SpvId src_id) { return unary_op(SpvOpNot, type_id, result_id, src_id); } +SpvInstruction SpvFactory::logical_not(SpvId type_id, SpvId result_id, SpvId src_id) { + return unary_op(SpvOpLogicalNot, type_id, result_id, src_id); +} + SpvInstruction SpvFactory::shift_right_logical(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id) { SpvInstruction inst = SpvInstruction::make(SpvOpShiftRightLogical); inst.add_operand(type_id); @@ -1920,11 +2081,22 @@ SpvInstruction SpvFactory::select(SpvId type_id, SpvId result_id, SpvId conditio return inst; } -SpvInstruction SpvFactory::in_bounds_access_chain(SpvId type_id, SpvId result_id, SpvId base_id, SpvId element_id, const SpvFactory::Indices &indices) { +SpvInstruction SpvFactory::in_bounds_access_chain(SpvId type_id, SpvId result_id, SpvId base_id, const SpvFactory::Indices &indices) { SpvInstruction inst = SpvInstruction::make(SpvOpInBoundsAccessChain); inst.set_type_id(type_id); inst.set_result_id(result_id); inst.add_operand(base_id); + for (SpvId i : indices) { + inst.add_operand(i); + } + return inst; +} + +SpvInstruction SpvFactory::pointer_access_chain(SpvId type_id, SpvId result_id, SpvId base_id, SpvId element_id, const SpvFactory::Indices &indices) { + SpvInstruction inst = SpvInstruction::make(SpvOpPtrAccessChain); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operand(base_id); inst.add_operand(element_id); for (SpvId i : indices) { inst.add_operand(i); @@ -1992,6 +2164,22 @@ SpvInstruction SpvFactory::vector_shuffle(SpvId type_id, SpvId result_id, SpvId return inst; } +SpvInstruction SpvFactory::is_inf(SpvId type_id, SpvId result_id, SpvId src_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpIsInf); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operand(src_id); + return inst; +} + +SpvInstruction SpvFactory::is_nan(SpvId type_id, SpvId result_id, SpvId src_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpIsNan); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operand(src_id); + return inst; +} + SpvInstruction SpvFactory::bitcast(SpvId type_id, SpvId result_id, SpvId src_id) { SpvInstruction inst = SpvInstruction::make(SpvOpBitcast); inst.set_type_id(type_id); @@ -2117,12 +2305,114 @@ SpvInstruction SpvFactory::extension(const std::string &extension) { return inst; } -SpvInstruction SpvFactory::import(const std::string &import) { +SpvInstruction SpvFactory::import(SpvId instruction_set_id, const std::string &instruction_set_name) { SpvInstruction inst = 
SpvInstruction::make(SpvOpExtInstImport); - inst.add_string(import); + inst.set_result_id(instruction_set_id); + inst.add_string(instruction_set_name); + return inst; +} + +SpvInstruction SpvFactory::extended(SpvId instruction_set_id, SpvId instruction_number, SpvId type_id, SpvId result_id, const SpvFactory::Operands& operands) { + SpvInstruction inst = SpvInstruction::make(SpvOpExtInst); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operand(instruction_set_id); + inst.add_immediate(instruction_number); + for(SpvId o : operands) { + inst.add_operand(o); + } return inst; } +/** GLSL extended instruction utility methods */ + +bool is_glsl_unary_op(SpvId glsl_op_code) { + switch(glsl_op_code) { + case GLSLstd450Round: + case GLSLstd450RoundEven: + case GLSLstd450Trunc: + case GLSLstd450FAbs: + case GLSLstd450SAbs: + case GLSLstd450FSign: + case GLSLstd450SSign: + case GLSLstd450Floor: + case GLSLstd450Ceil: + case GLSLstd450Fract: + case GLSLstd450Radians: + case GLSLstd450Degrees: + case GLSLstd450Sin: + case GLSLstd450Cos: + case GLSLstd450Tan: + case GLSLstd450Asin: + case GLSLstd450Acos: + case GLSLstd450Atan: + case GLSLstd450Asinh: + case GLSLstd450Acosh: + case GLSLstd450Atanh: + case GLSLstd450Cosh: + case GLSLstd450Exp: + case GLSLstd450Log: + case GLSLstd450Exp2: + case GLSLstd450Log2: + case GLSLstd450Sqrt: + case GLSLstd450InverseSqrt: + case GLSLstd450Determinant: + case GLSLstd450MatrixInverse: + case GLSLstd450ModfStruct: + case GLSLstd450FrexpStruct: + case GLSLstd450PackSnorm4x8: + case GLSLstd450PackUnorm4x8: + case GLSLstd450PackSnorm2x16: + case GLSLstd450PackUnorm2x16: + case GLSLstd450PackHalf2x16: + case GLSLstd450PackDouble2x32: + case GLSLstd450UnpackSnorm4x8: + case GLSLstd450UnpackUnorm4x8: + case GLSLstd450UnpackSnorm2x16: + case GLSLstd450UnpackUnorm2x16: + case GLSLstd450UnpackHalf2x16: + case GLSLstd450UnpackDouble2x32: + case GLSLstd450Length: + case GLSLstd450Normalize: + case GLSLstd450FindILsb: + case GLSLstd450FindSMsb: + case GLSLstd450FindUMsb: + case GLSLstd450InterpolateAtCentroid: + return true; + default: + break; + }; + return false; +} + +bool is_glsl_binary_op(SpvId glsl_op_code) { + switch(glsl_op_code) { + case GLSLstd450Atan2: + case GLSLstd450Pow: + case GLSLstd450Modf: + case GLSLstd450FMin: + case GLSLstd450UMin: + case GLSLstd450SMin: + case GLSLstd450FMax: + case GLSLstd450UMax: + case GLSLstd450SMax: + case GLSLstd450Step: + case GLSLstd450Frexp: + case GLSLstd450Ldexp: + case GLSLstd450Distance: + case GLSLstd450Cross: + case GLSLstd450Reflect: + case GLSLstd450InterpolateAtOffset: + case GLSLstd450InterpolateAtSample: + case GLSLstd450NMax: + case GLSLstd450NMin: + return true; + default: + break; + }; + return false; +} + /** Specializations for reference counted classes */ template<> RefCount &ref_count(const SpvInstructionContents *c) noexcept { diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 86e18f193acb..08c469f79a51 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -23,7 +23,8 @@ #include "IntrusivePtr.h" #include "Type.h" -#include // Use v1.0 spec as the minimal viable version (for maximum compatiblity) +#include // Use v1.0 spec as the minimal viable version (for maximum compatiblity) +#include // GLSL extended instructions for common intrinsics namespace Halide { namespace Internal { @@ -34,6 +35,15 @@ enum SpvPrecision { SpvRelaxedPrecision, }; +/** Scope qualifiers for Execution & Memory operations */ +enum SpvScope { + SpvCrossDeviceScope = 0, + SpvDeviceScope = 1, + SpvWorkgroupScope = 2, + 
SpvSubgroupScope = 3, + SpvInvocationScope = 4 +}; + /** Specific types of predefined constants */ enum SpvPredefinedConstant { SpvNullConstant, @@ -70,6 +80,7 @@ enum SpvKind { SpvBlockId, SpvLabelId, SpvParameterId, + SpvImportId, SpvModuleId, SpvUnknownItem, }; @@ -221,6 +232,7 @@ class SpvFunction { /** General interface for representing a SPIR-V code module */ class SpvModule { public: + using ImportNames = std::vector; using EntryPointNames = std::vector; using Instructions = std::vector; @@ -243,6 +255,7 @@ class SpvModule { void add_instruction(const SpvInstruction &val); void add_entry_point(const std::string &name, SpvInstruction entry_point); + void import_instruction_set(SpvId id, const std::string &instruction_set); void require_capability(SpvCapability val); void require_extension(const std::string &val); @@ -256,10 +269,14 @@ class SpvModule { SpvMemoryModel memory_model() const; SpvInstruction entry_point(const std::string &name) const; EntryPointNames entry_point_names() const; + ImportNames import_names() const; + SpvId lookup_import(const std::string& Instruction_set) const; + uint32_t entry_point_count() const; const Instructions &execution_modes() const; - SpvId binding_count() const; + uint32_t binding_count() const; SpvModule module() const; + bool is_imported(const std::string& instruction_set) const; bool is_capability_required(SpvCapability val) const; bool is_extension_required(const std::string &val) const; bool is_defined() const; @@ -310,7 +327,8 @@ class SpvBuilder { SpvId declare_string_constant(const std::string &str); SpvId declare_scalar_constant(const Type &type, const void *data); SpvId declare_vector_constant(const Type &type, const void *data); - SpvId declare_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId element_id, const Indices &indices); + SpvId declare_access_chain(SpvId ptr_type_id, SpvId base_id, const Indices &indices); + SpvId declare_pointer_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId element_id, const Indices &indices); SpvId declare_function_type(SpvId return_type, const ParamTypes ¶m_types = {}); SpvId declare_function(const std::string &name, SpvId function_type); SpvId declare_struct(const std::string &name, const StructMemberTypes &member_types); @@ -343,9 +361,13 @@ class SpvBuilder { SpvAddressingModel addressing_model() const; SpvMemoryModel memory_model() const; + SpvId import_glsl_intrinsics(); + SpvId import_instruction_set(const std::string &instruction_set); + void require_extension(const std::string &extension); void require_capability(SpvCapability); + bool is_imported(const std::string& instruction_set) const; bool is_extension_required(const std::string &extension) const; bool is_capability_required(SpvCapability) const; @@ -362,9 +384,19 @@ class SpvBuilder { SpvId current_id() const; void update_id(SpvId id); + bool is_pointer_type(SpvId id) const; + bool is_struct_type(SpvId id) const; + bool is_vector_type(SpvId id) const; + bool is_scalar_type(SpvId id) const; + bool is_array_type(SpvId id) const; + bool is_constant(SpvId id) const; + + SpvId lookup_base_type(SpvId pointer_type) const; SpvStorageClass lookup_storage_class(SpvId id) const; SpvId lookup_id(const std::string &symbol) const; SpvId lookup_scope(SpvId id) const; + SpvId lookup_import(const std::string &instruction_set) const; + std::string lookup_symbol(SpvId id) const; SpvId declare_symbol(const std::string &symbol, SpvId id, SpvId scope_id); void add_symbol(const std::string &symbol, SpvId id, SpvId scope_id); @@ -382,6 +414,7 @@ class 
SpvBuilder { using KindMap = std::unordered_map; using PointerTypeKey = std::pair; using PointerTypeMap = std::map; + using BaseTypeMap = std::unordered_map; using VariableTypeMap = std::unordered_map; using StorageClassMap = std::unordered_map; using ConstantKey = uint64_t; @@ -436,6 +469,7 @@ class SpvBuilder { FunctionMap function_map; IdSymbolMap id_symbol_map; SymbolIdMap symbol_id_map; + BaseTypeMap base_type_map; StorageClassMap storage_class_map; PointerTypeMap pointer_type_map; VariableTypeMap variable_type_map; @@ -450,6 +484,7 @@ struct SpvFactory { using Components = std::vector; using ParamTypes = std::vector; using MemberTypeIds = std::vector; + using Operands = std::vector; using Variables = std::vector; using VariableBlockIdPair = std::pair; // (Variable Id, Block Id) using BlockVariables = std::vector; @@ -457,7 +492,7 @@ struct SpvFactory { static SpvInstruction no_op(SpvId result_id); static SpvInstruction capability(const SpvCapability &capability); static SpvInstruction extension(const std::string &extension); - static SpvInstruction import(const std::string &import); + static SpvInstruction import(SpvId instruction_set_id, const std::string &instruction_set_name); static SpvInstruction label(SpvId result_id); static SpvInstruction debug_line(SpvId string_id, uint32_t line, uint32_t column); static SpvInstruction debug_string(SpvId result_id, const std::string &string); @@ -469,7 +504,7 @@ struct SpvFactory { static SpvInstruction integer_type(SpvId int_type_id, uint32_t bits, uint32_t signedness); static SpvInstruction float_type(SpvId float_type_id, uint32_t bits); static SpvInstruction vector_type(SpvId vector_type_id, SpvId element_type_id, uint32_t vector_size); - static SpvInstruction array_type(SpvId array_type_id, SpvId element_type_id, uint32_t array_size); + static SpvInstruction array_type(SpvId array_type_id, SpvId element_type_id, SpvId array_size_id); static SpvInstruction struct_type(SpvId result_id, const MemberTypeIds &member_type_ids); static SpvInstruction runtime_array_type(SpvId result_type_id, SpvId base_type_id); static SpvInstruction pointer_type(SpvId pointer_type_id, SpvStorageClass storage_class, SpvId base_type_id); @@ -487,19 +522,24 @@ struct SpvFactory { static SpvInstruction entry_point(SpvId exec_model, SpvId func_id, const std::string &name, const Variables &variables); static SpvInstruction memory_model(SpvAddressingModel addressing_model, SpvMemoryModel memory_model); static SpvInstruction exec_mode_local_size(SpvId function_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z); - static SpvInstruction control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, uint32_t semantics_mask); + static SpvInstruction memory_barrier(SpvId memory_scope_id, SpvId semantics_mask_id); + static SpvInstruction control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, SpvId semantics_mask_id); + static SpvInstruction bitwise_not(SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction logical_not(SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction shift_right_logical(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id); static SpvInstruction shift_right_arithmetic(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id); static SpvInstruction multiply_extended(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); static SpvInstruction select(SpvId type_id, SpvId result_id, SpvId condition_id, SpvId true_id, SpvId false_id); - static SpvInstruction 
in_bounds_access_chain(SpvId type_id, SpvId result_id, SpvId base_id, SpvId element_id, const Indices &indices); + static SpvInstruction in_bounds_access_chain(SpvId type_id, SpvId result_id, SpvId base_id, const Indices &indices); + static SpvInstruction pointer_access_chain(SpvId type_id, SpvId result_id, SpvId base_id, SpvId element_id, const Indices &indices); static SpvInstruction load(SpvId type_id, SpvId result_id, SpvId ptr_id, uint32_t access_mask = 0x0); static SpvInstruction store(SpvId ptr_id, SpvId obj_id, uint32_t access_mask = 0x0); static SpvInstruction vector_insert_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, uint32_t index); static SpvInstruction vector_shuffle(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, const Indices &indices); static SpvInstruction composite_extract(SpvId type_id, SpvId result_id, SpvId composite_id, const Indices &indices); static SpvInstruction composite_construct(SpvId type_id, SpvId result_id, const Components &constituents); + static SpvInstruction is_inf(SpvId type_id, SpvId result_id, SpvId src_id); + static SpvInstruction is_nan(SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction bitcast(SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction integer_add(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); static SpvInstruction float_add(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); @@ -517,6 +557,7 @@ struct SpvFactory { static SpvInstruction unary_op(SpvOp op_code, SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction binary_op(SpvOp op_code, SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); static SpvInstruction convert(SpvOp op_code, SpvId type_id, SpvId result_id, SpvId src_id); + static SpvInstruction extended(SpvId instruction_set_id, SpvId instruction_number, SpvId type_id, SpvId result_id, const SpvFactory::Operands& operands); }; /** Contents of a SPIR-V Instruction */ @@ -568,7 +609,7 @@ struct SpvFunctionContents { struct SpvModuleContents { using Capabilities = std::set; using Extensions = std::set; - using Imports = std::set; + using Imports = std::unordered_map; using Functions = std::vector; using Instructions = std::vector; using EntryPoints = std::unordered_map; @@ -594,6 +635,9 @@ struct SpvModuleContents { Instructions instructions; }; +bool is_glsl_unary_op(SpvId glsl_op_code); +bool is_glsl_binary_op(SpvId glsl_op_code); + } // namespace Internal } // namespace Halide diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index 4f1c8716558a..4cbd15f0da9f 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -6034,6 +6034,15 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetValidationCacheDataEXT( #define VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_SPEC_VERSION 1 #define VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME "VK_EXT_shader_viewport_index_layer" +// Provided by VK_VERSION_1_2 +typedef struct VkPhysicalDeviceShaderFloat16Int8Features { + uint32_t sType; + void* pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; +} VkPhysicalDeviceShaderFloat16Int8FeaturesKHR; +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES 1000082000 + } // extern "C" #endif // HALIDE_MINI_VULKAN_H diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 5b3083f31103..44431242d820 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -147,10 +147,20 @@ WEAK void halide_vulkan_finalize_kernels(void *user_context, void *state_ptr) { 
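+ // Note: a VulkanContext is acquired below so that release_hold() can be handed the VkDevice associated with the cached shader modules; the halide_current_time_ns() calls are only compiled in when DEBUG_RUNTIME is defined, so release builds pay no cost for the timing report.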
debug(user_context) << "Vulkan: halide_vulkan_finalize_kernels (user_context: " << user_context << ", state_ptr: " << state_ptr << "\n"; + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + VulkanContext ctx(user_context); if (ctx.error == VK_SUCCESS) { compilation_cache.release_hold(user_context, ctx.device, state_ptr); } + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif } // Used to generate correct timings when tracing @@ -629,41 +639,63 @@ WEAK int halide_vulkan_run(void *user_context, return halide_error_code_internal_error; } - //// 1. Create a descriptor set layout - if (cache_entry->descriptor_set_layout == 0) { - cache_entry->bindings_count = vk_count_bindings_for_descriptor_set(user_context, arg_sizes, args, arg_is_buffer); - cache_entry->buffer_count = (cache_entry->bindings_count - 1); // first binding is args packed into a param buffer, all others are halide buffers - VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, arg_sizes, args, arg_is_buffer, &(cache_entry->descriptor_set_layout)); + // 3a. Locate the correct entry point from the cache + bool found_entry_point = false; + uint32_t entry_point_index = 0; + for(uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { + if(strstr(cache_entry->entry_point_data[n].entry_point_name, entry_name) != nullptr) { + entry_point_index = n; + found_entry_point = true; + } + } + if(!found_entry_point) { + error(user_context) << "Vulkan: Failed to locate shader entry point! Unable to proceed!\n"; + return halide_error_code_internal_error; + } + debug(user_context) << " found entry point [" + << entry_point_index << "/" << cache_entry->entry_point_count + << "] '" << entry_name << "'\n"; + + halide_abort_if_false(user_context, cache_entry->descriptor_set_layouts != nullptr); + if (cache_entry->pipeline_layout == 0) { + + //// 1. Create all descriptor set layouts + for(uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { + uint32_t uniform_buffer_count = cache_entry->entry_point_data[n].uniform_buffer_count; + uint32_t storage_buffer_count = cache_entry->entry_point_data[n].storage_buffer_count; + VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(cache_entry->descriptor_set_layouts[n])); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; + return result; + } + } + + ///// 2. Create the pipeline layout + VkResult result = vk_create_pipeline_layout(user_context, ctx.allocator, cache_entry->entry_point_count, cache_entry->descriptor_set_layouts, &(cache_entry->pipeline_layout)); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vk_create_pipeline_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; + return halide_error_code_internal_error; } } + VulkanEntryPointData* entry_point_data = (cache_entry->entry_point_data + entry_point_index); + halide_abort_if_false(user_context, entry_point_data != nullptr); + //// 1a. 
Create a buffer for the scalar parameters - if (cache_entry->args_region == nullptr) { + if ((entry_point_data->args_region == nullptr) && entry_point_data->uniform_buffer_count) { size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, arg_sizes, args, arg_is_buffer); if (scalar_buffer_size > 0) { - cache_entry->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); - if (cache_entry->args_region == nullptr) { + entry_point_data->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); + if (entry_point_data->args_region == nullptr) { error(user_context) << "Vulkan: vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; return halide_error_code_internal_error; } } } - ///// 2. Create a pipeline layout - if (cache_entry->pipeline_layout == 0) { - VkResult result = vk_create_pipeline_layout(user_context, ctx.allocator, &(cache_entry->descriptor_set_layout), &(cache_entry->pipeline_layout)); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_pipeline_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; - return halide_error_code_internal_error; - } - } - // Construct the pipeline - if (cache_entry->compute_pipeline == 0) { - VkResult result = vk_create_compute_pipeline(user_context, ctx.allocator, entry_name, cache_entry->shader_module, cache_entry->pipeline_layout, &(cache_entry->compute_pipeline)); + if (entry_point_data->compute_pipeline == 0) { + VkResult result = vk_create_compute_pipeline(user_context, ctx.allocator, entry_name, cache_entry->shader_module, cache_entry->pipeline_layout, &(entry_point_data->compute_pipeline)); if (result != VK_SUCCESS) { error(user_context) << "Vulkan: vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return halide_error_code_internal_error; @@ -671,20 +703,23 @@ WEAK int halide_vulkan_run(void *user_context, } //// 4. Create a descriptor set - if (cache_entry->descriptor_set == 0) { + if (entry_point_data->descriptor_set == 0) { // Construct a descriptor pool // // NOTE: while this could be re-used across multiple pipelines, we only know the storage requirements of this kernel's // inputs and outputs ... so create a pool specific to the number of buffers known at this time - VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, cache_entry->buffer_count, &(cache_entry->descriptor_pool)); + + uint32_t uniform_buffer_count = entry_point_data->uniform_buffer_count; // needs_scalar_uniform_buffer ? 1 : 0; + uint32_t storage_buffer_count = entry_point_data->storage_buffer_count; // entry_point_data->buffer_count; + VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(entry_point_data->descriptor_pool)); if (result != VK_SUCCESS) { error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! 
Error: " << vk_get_error_name(result) << "\n"; return result; } // Create the descriptor set - result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layout, cache_entry->descriptor_pool, &(cache_entry->descriptor_set)); + result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layouts[entry_point_index], entry_point_data->descriptor_pool, &(entry_point_data->descriptor_set)); if (result != VK_SUCCESS) { error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; @@ -693,21 +728,21 @@ WEAK int halide_vulkan_run(void *user_context, //// 5. Update uniform args and bindings for buffers in the descriptor set VkBuffer *args_buffer = nullptr; - if (cache_entry->args_region != nullptr) { - VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, cache_entry->args_region, arg_sizes, args, arg_is_buffer); + if ((entry_point_data->args_region != nullptr) && entry_point_data->uniform_buffer_count) { + VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, entry_point_data->args_region, arg_sizes, args, arg_is_buffer); if (result != VK_SUCCESS) { debug(user_context) << "Vulkan: vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } - args_buffer = reinterpret_cast(cache_entry->args_region->handle); + args_buffer = reinterpret_cast(entry_point_data->args_region->handle); if (args_buffer == nullptr) { error(user_context) << "Vulkan: Failed to retrieve scalar args buffer for device memory!\n"; return halide_error_code_internal_error; } } - VkResult result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, cache_entry->buffer_count, arg_sizes, args, arg_is_buffer, cache_entry->descriptor_set); + VkResult result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_data->uniform_buffer_count, entry_point_data->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_data->descriptor_set); if (result != VK_SUCCESS) { debug(user_context) << "Vulkan: vk_update_descriptor_set() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; @@ -723,7 +758,11 @@ WEAK int halide_vulkan_run(void *user_context, //// 7. Begin the command buffer result = vk_fill_command_buffer_with_dispatch_call(user_context, - ctx.device, command_buffer, cache_entry->compute_pipeline, cache_entry->pipeline_layout, cache_entry->descriptor_set, + ctx.device, command_buffer, + entry_point_data->compute_pipeline, + cache_entry->pipeline_layout, + entry_point_data->descriptor_set, + entry_point_index, blocksX, blocksY, blocksZ); if (result != VK_SUCCESS) { debug(user_context) << "Vulkan: vk_fill_command_buffer_with_dispatch_call() failed! Unable to proceed! 
Error: " << vk_get_error_name(result) << "\n"; diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 15249d9cdce3..81cfae29f0e6 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -249,18 +249,44 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk // Get the device features so that all supported features are enabled when device is created // - VkPhysicalDeviceFeatures device_features; - vkGetPhysicalDeviceFeatures(*physical_device, &device_features); + VkPhysicalDeviceFeatures device_features = {}; + + // If the instance runtime supports querying extended device features, request them + VkPhysicalDeviceShaderFloat16Int8FeaturesKHR shader_f16_i8_ext = { + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES, + nullptr, VK_FALSE, VK_FALSE + }; + + VkPhysicalDeviceFeatures2KHR device_features_ext = { + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR, + &shader_f16_i8_ext, + device_features + }; + + void* extended_features_ptr = nullptr; + void* standard_features_ptr = nullptr; + PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2KHR"); // v1.0+ + if(!vkGetPhysicalDeviceFeatures2KHR) { vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2"); } // v1.1+ + if(vkGetPhysicalDeviceFeatures2KHR) { + debug(user_context) << "Vulkan: Querying for extended device features...\n"; + vkGetPhysicalDeviceFeatures2KHR(*physical_device, &device_features_ext); + debug(user_context) << "Vulkan: Shader Int8 support: " << (shader_f16_i8_ext.shaderInt8 ? "true" : "false") << "...\n"; + debug(user_context) << "Vulkan: Shader Float16 support: " << (shader_f16_i8_ext.shaderFloat16 ? 
"true" : "false") << "...\n"; + extended_features_ptr = (void*)(&device_features_ext); // pass v1.1 extended features (which also contains the standard features) + } else { + vkGetPhysicalDeviceFeatures(*physical_device, &device_features); + standard_features_ptr = &device_features; // pass v1.0 standard features + } VkDeviceCreateInfo device_create_info = { VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, - nullptr, // Next - 0, // Flags - 1, // Count of queues to create + extended_features_ptr, // Extended struct ptr (used here for requesting chain of extended features) + 0, // Flags + 1, // Count of queues to create &device_queue_create_info, (uint32_t)requested_layers.size(), requested_layers.data(), // Layers (uint32_t)required_device_extensions.size(), required_device_extensions.data(), // Enabled extensions - &device_features, // Requested device features + (VkPhysicalDeviceFeatures*)standard_features_ptr, // Requested device features }; VkResult result = vkCreateDevice(*physical_device, &device_create_info, alloc_callbacks, device); diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 8cb122f918fc..e8df451e655f 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -23,6 +23,7 @@ namespace Vulkan { // Declarations class VulkanMemoryAllocator; +struct VulkanEntryPointData; struct VulkanCompilationCacheEntry; // -------------------------------------------------------------------------- @@ -92,11 +93,17 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, VkPipeline compute_pipeline, VkPipelineLayout pipeline_layout, VkDescriptorSet descriptor_set, + uint32_t descriptor_set_index, int blocksX, int blocksY, int blocksZ); VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer); // -- Scalar Uniform Buffer +bool vk_needs_scalar_uniform_buffer(void *user_context, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); + size_t vk_estimate_scalar_uniform_buffer_size(void *user_context, size_t arg_sizes[], void *args[], @@ -118,6 +125,7 @@ void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator // -- Descriptor Pool VkResult vk_create_descriptor_pool(void *user_context, VulkanMemoryAllocator *allocator, + uint32_t uniform_buffer_count, uint32_t storage_buffer_count, VkDescriptorPool *descriptor_pool); @@ -132,10 +140,9 @@ uint32_t vk_count_bindings_for_descriptor_set(void *user_context, int8_t arg_is_buffer[]); VkResult vk_create_descriptor_set_layout(void *user_context, - VkDevice device, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[], + VulkanMemoryAllocator *allocator, + uint32_t uniform_buffer_count, + uint32_t storage_buffer_count, VkDescriptorSetLayout *layout); VkResult vk_destroy_descriptor_set_layout(void *user_context, @@ -152,6 +159,7 @@ VkResult vk_create_descriptor_set(void *user_context, VkResult vk_update_descriptor_set(void *user_context, VulkanMemoryAllocator *allocator, VkBuffer *scalar_args_buffer, + size_t uniform_buffer_count, size_t storage_buffer_count, size_t arg_sizes[], void *args[], @@ -161,7 +169,8 @@ VkResult vk_update_descriptor_set(void *user_context, // -- Pipeline Layout VkResult vk_create_pipeline_layout(void *user_context, VulkanMemoryAllocator *allocator, - VkDescriptorSetLayout *descriptor_set_layout, + uint32_t descriptor_set_count, + VkDescriptorSetLayout* descriptor_set_layouts, VkPipelineLayout *pipeline_layout); VkResult vk_destroy_pipeline_layout(void *user_context, @@ -180,6 +189,9 @@ 
VkResult vk_destroy_compute_pipeline(void *user_context, VkPipeline compute_pipeline); // -- Shader Module +VulkanEntryPointData* vk_decode_entry_point_data(void* user_context, VulkanMemoryAllocator *allocator, + const uint32_t* module_ptr, uint32_t module_size); + VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, const char *src, int size); diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 6e35101de5d5..764dc6bfbca2 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -12,16 +12,23 @@ namespace Internal { namespace Vulkan { // Compilation cache for compiled shader modules -struct VulkanCompilationCacheEntry { - VkShaderModule shader_module = {0}; +struct VulkanEntryPointData { + const char* entry_point_name = nullptr; VkDescriptorPool descriptor_pool = {0}; VkDescriptorSet descriptor_set = {0}; - VkDescriptorSetLayout descriptor_set_layout = {0}; - VkPipelineLayout pipeline_layout = {0}; VkPipeline compute_pipeline = {0}; - MemoryRegion *args_region = nullptr; + uint32_t uniform_buffer_count = 0; + uint32_t storage_buffer_count = 0; uint32_t bindings_count = 0; - uint32_t buffer_count = 0; + MemoryRegion *args_region = nullptr; +}; + +struct VulkanCompilationCacheEntry { + VkShaderModule shader_module = {0}; + VkDescriptorSetLayout* descriptor_set_layouts = {0}; + VkPipelineLayout pipeline_layout = {0}; + uint32_t entry_point_count = 0; + VulkanEntryPointData* entry_point_data = nullptr; }; WEAK Halide::Internal::GPUCompilationCache compilation_cache; @@ -91,6 +98,7 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, VkPipeline compute_pipeline, VkPipelineLayout pipeline_layout, VkDescriptorSet descriptor_set, + uint32_t descriptor_set_index, int blocksX, int blocksY, int blocksZ) { #ifdef DEBUG_RUNTIME @@ -100,6 +108,7 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, << "command_buffer: " << (void *)command_buffer << ", " << "pipeline_layout: " << (void *)pipeline_layout << ", " << "descriptor_set: " << (void *)descriptor_set << ", " + << "descriptor_set_index: " << descriptor_set_index << ", " << "blocks: " << blocksX << ", " << blocksY << ", " << blocksZ << ")\n"; #endif @@ -118,7 +127,7 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute_pipeline); vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_layout, - 0, 1, &descriptor_set, 0, nullptr); + descriptor_set_index, 1, &descriptor_set, 0, nullptr); vkCmdDispatch(command_buffer, blocksX, blocksY, blocksZ); // TODO: make sure this is right! 
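+ // Note: descriptor_set_index is forwarded to vkCmdBindDescriptorSets above as its firstSet argument, so each kernel binds its descriptor set at the set number matching the DescriptorSet decoration assigned to that entry point during codegen (one descriptor set per entry point).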
result = vkEndCommandBuffer(command_buffer); @@ -161,12 +170,29 @@ VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBu // -- +bool vk_needs_scalar_uniform_buffer(void *user_context, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { + int i = 0; + while (arg_sizes[i] > 0) { + if (!arg_is_buffer[i]) { + return true; + } + i++; + } + return false; +} + uint32_t vk_count_bindings_for_descriptor_set(void *user_context, size_t arg_sizes[], void *args[], int8_t arg_is_buffer[]) { + + // first binding is for passing scalar parameters in a buffer (if necessary) + uint32_t bindings_count = vk_needs_scalar_uniform_buffer(user_context, arg_sizes, args, arg_is_buffer); + int i = 0; - uint32_t bindings_count = 1; // first binding is for passing scalar parameters in a buffer while (arg_sizes[i] > 0) { if (arg_is_buffer[i]) { bindings_count++; @@ -180,6 +206,7 @@ uint32_t vk_count_bindings_for_descriptor_set(void *user_context, VkResult vk_create_descriptor_pool(void *user_context, VulkanMemoryAllocator *allocator, + uint32_t uniform_buffer_count, uint32_t storage_buffer_count, VkDescriptorPool *descriptor_pool) { #ifdef DEBUG_RUNTIME @@ -187,20 +214,23 @@ VkResult vk_create_descriptor_pool(void *user_context, << "Vulkan: vk_create_descriptor_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " + << "uniform_buffer_count: " << (uint32_t)uniform_buffer_count << ", " << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; #endif BlockStorage::Config pool_config; pool_config.entry_size = sizeof(VkDescriptorPoolSize); - pool_config.minimum_capacity = 1 + (storage_buffer_count ? 1 : 0); + pool_config.minimum_capacity = (uniform_buffer_count ? 1 : 0) + (storage_buffer_count ? 
1 : 0); BlockStorage pool_sizes(user_context, pool_config); // First binding is reserved for passing scalar parameters as a uniform buffer - VkDescriptorPoolSize uniform_buffer_size = { - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type - 1 // all kernel args are packed into one uniform buffer - }; - pool_sizes.append(user_context, &uniform_buffer_size); + if(uniform_buffer_count > 0) { + VkDescriptorPoolSize uniform_buffer_size = { + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type + uniform_buffer_count // all kernel args are packed into uniform buffers + }; + pool_sizes.append(user_context, &uniform_buffer_size); + } if (storage_buffer_count > 0) { VkDescriptorPoolSize storage_buffer_size = { @@ -245,9 +275,8 @@ VkResult vk_destroy_descriptor_pool(void *user_context, VkResult vk_create_descriptor_set_layout(void *user_context, VulkanMemoryAllocator *allocator, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[], + uint32_t uniform_buffer_count, + uint32_t storage_buffer_count, VkDescriptorSetLayout *layout) { #ifdef DEBUG_RUNTIME @@ -255,45 +284,53 @@ VkResult vk_create_descriptor_set_layout(void *user_context, << "Vulkan: vk_create_descriptor_set_layout (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " + << "uniform_buffer_count: " << uniform_buffer_count << ", " + << "storage_buffer_count: " << storage_buffer_count << ", " << "layout: " << (void *)layout << ")\n"; #endif - // The first binding is used for scalar parameters - uint32_t num_bindings = vk_count_bindings_for_descriptor_set(user_context, arg_sizes, args, arg_is_buffer); - BlockStorage::Config layout_config; layout_config.entry_size = sizeof(VkDescriptorSetLayoutBinding); - layout_config.minimum_capacity = num_bindings; - + layout_config.minimum_capacity = uniform_buffer_count + storage_buffer_count; BlockStorage layout_bindings(user_context, layout_config); - // First binding is reserved for passing scalar parameters as a uniform buffer - VkDescriptorSetLayoutBinding scalar_uniform_layout = { - 0, // binding index - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type - 1, // descriptor count - VK_SHADER_STAGE_COMPUTE_BIT, // stage flags - nullptr // immutable samplers - }; - layout_bindings.append(user_context, &scalar_uniform_layout); + // add all uniform buffers first + for(uint32_t n = 0; n < uniform_buffer_count; ++n) { + VkDescriptorSetLayoutBinding uniform_buffer_layout = { + (uint32_t)layout_bindings.size(), // binding index + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type + 1, // descriptor count + VK_SHADER_STAGE_COMPUTE_BIT, // stage flags + nullptr // immutable samplers + }; - // Add all other bindings for buffer data - int i = 0; - while (arg_sizes[i] > 0) { - if (arg_is_buffer[i]) { - // TODO: I don't quite understand why STORAGE_BUFFER is valid - // here, but examples all across the docs seem to do this - VkDescriptorSetLayoutBinding storage_buffer_layout = { - (uint32_t)layout_bindings.size(), // binding index - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // descriptor type - 1, // descriptor count - VK_SHADER_STAGE_COMPUTE_BIT, // stage flags - nullptr // immutable samplers - }; - layout_bindings.append(user_context, &storage_buffer_layout); - } - i++; +#ifdef DEBUG_RUNTIME + debug(user_context) + << " [" << (uint32_t)layout_bindings.size() << "] : UNIFORM_BUFFER\n"; +#endif + + layout_bindings.append(user_context, &uniform_buffer_layout); } + + // Add all other storage buffers + 
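+ // Each halide_buffer_t argument is bound as a STORAGE_BUFFER, with binding indices continuing on from the uniform buffer entries appended above (e.g. a kernel with one packed scalar-args buffer and two halide buffers would use binding 0 = UNIFORM_BUFFER and bindings 1,2 = STORAGE_BUFFER).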
for(uint32_t n = 0; n < storage_buffer_count; ++n) { + + // halide buffers will be passed as STORAGE_BUFFERS + VkDescriptorSetLayoutBinding storage_buffer_layout = { + (uint32_t)layout_bindings.size(), // binding index + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // descriptor type + 1, // descriptor count + VK_SHADER_STAGE_COMPUTE_BIT, // stage flags + nullptr // immutable samplers + }; +#ifdef DEBUG_RUNTIME +debug(user_context) + << " [" << (uint32_t)layout_bindings.size() << "] : STORAGE_BUFFER\n"; +#endif + + layout_bindings.append(user_context, &storage_buffer_layout); + } + // Create the LayoutInfo struct VkDescriptorSetLayoutCreateInfo layout_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, // structure type @@ -358,6 +395,7 @@ VkResult vk_create_descriptor_set(void *user_context, VkResult vk_update_descriptor_set(void *user_context, VulkanMemoryAllocator *allocator, VkBuffer *scalar_args_buffer, + size_t uniform_buffer_count, size_t storage_buffer_count, size_t arg_sizes[], void *args[], @@ -368,12 +406,12 @@ VkResult vk_update_descriptor_set(void *user_context, << "Vulkan: vk_update_descriptor_set (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " + << "scalar_args_buffer: " << (void *)scalar_args_buffer << ", " + << "uniform_buffer_count: " << (uint32_t)uniform_buffer_count << ", " << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ", " << "descriptor_set: " << (void *)descriptor_set << ")\n"; #endif - static const int uniform_buffer_count = 1; // scalar args are always packed into one uniform buffer - BlockStorage::Config dbi_config; dbi_config.minimum_capacity = storage_buffer_count + uniform_buffer_count; dbi_config.entry_size = sizeof(VkDescriptorBufferInfo); @@ -384,7 +422,7 @@ VkResult vk_update_descriptor_set(void *user_context, wds_config.entry_size = sizeof(VkWriteDescriptorSet); BlockStorage write_descriptor_set(user_context, wds_config); - // First binding will be the scalar args buffer (if needed) + // First binding will be the scalar args buffer (if needed) passed as a UNIFORM BUFFER VkDescriptorBufferInfo *scalar_args_entry = nullptr; if (scalar_args_buffer != nullptr) { VkDescriptorBufferInfo scalar_args_descriptor_buffer_info = { @@ -395,7 +433,7 @@ VkResult vk_update_descriptor_set(void *user_context, descriptor_buffer_info.append(user_context, &scalar_args_descriptor_buffer_info); scalar_args_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); - VkWriteDescriptorSet scalar_args_write_descriptor_set = { + VkWriteDescriptorSet uniform_buffer_write_descriptor_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type nullptr, // pointer to struct extending this descriptor_set, // descriptor set to update @@ -407,10 +445,10 @@ VkResult vk_update_descriptor_set(void *user_context, scalar_args_entry, // info for buffer nullptr // for texel buffers }; - write_descriptor_set.append(user_context, &scalar_args_write_descriptor_set); + write_descriptor_set.append(user_context, &uniform_buffer_write_descriptor_set); } - // Add all the other device buffers + // Add all the other device buffers as STORAGE BUFFERs for (size_t i = 0; arg_sizes[i] > 0; i++) { if (arg_is_buffer[i]) { @@ -432,7 +470,7 @@ VkResult vk_update_descriptor_set(void *user_context, descriptor_buffer_info.append(user_context, &device_buffer_info); VkDescriptorBufferInfo *device_buffer_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); - VkWriteDescriptorSet 
scalar_args_write_descriptor_set = { + VkWriteDescriptorSet storage_buffer_write_descriptor_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type nullptr, // pointer to struct extending this descriptor_set, // descriptor set to update @@ -444,7 +482,7 @@ VkResult vk_update_descriptor_set(void *user_context, device_buffer_entry, // info for buffer nullptr // for texel buffers }; - write_descriptor_set.append(user_context, &scalar_args_write_descriptor_set); + write_descriptor_set.append(user_context, &storage_buffer_write_descriptor_set); } } @@ -559,7 +597,8 @@ void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator VkResult vk_create_pipeline_layout(void *user_context, VulkanMemoryAllocator *allocator, - VkDescriptorSetLayout *descriptor_set_layout, + uint32_t descriptor_set_count, + VkDescriptorSetLayout* descriptor_set_layouts, VkPipelineLayout *pipeline_layout) { #ifdef DEBUG_RUNTIME @@ -567,7 +606,8 @@ VkResult vk_create_pipeline_layout(void *user_context, << "Vulkan: vk_create_pipeline_layout (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " - << "descriptor_set_layout: " << (void *)descriptor_set_layout << ", " + << "descriptor_set_count: " << descriptor_set_count << ", " + << "descriptor_set_layouts: " << (void *)descriptor_set_layouts << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif @@ -575,8 +615,8 @@ VkResult vk_create_pipeline_layout(void *user_context, VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // structure type nullptr, // pointer to a structure extending this 0, // flags - 1, // number of descriptor sets - descriptor_set_layout, // pointer to the descriptor sets + descriptor_set_count, // number of descriptor sets + descriptor_set_layouts, // pointer to the descriptor sets 0, // number of push constant ranges nullptr // pointer to push constant range structs }; @@ -668,25 +708,120 @@ VkResult vk_destroy_compute_pipeline(void *user_context, // -------------------------------------------------------------------------- +VulkanEntryPointData* vk_decode_entry_point_data(void* user_context, VulkanMemoryAllocator *allocator, const uint32_t* module_ptr, uint32_t module_size) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << "Vulkan: vk_decode_entry_point_data (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "module_ptr: " << (void*)module_ptr << ", " + << "module_size: " << module_size << ")\n"; + + uint64_t t_before = halide_current_time_ns(user_context); +#endif + halide_debug_assert(user_context, module_ptr != nullptr); + halide_debug_assert(user_context, module_size >= (2 * sizeof(uint32_t))); + + // Decode the sidecar for the module that lists the descriptor sets + // corresponding to each entry point contained in the module + // + // [0] Header word count (total length of header) + // [1] Number of descriptor sets + // ... For each descriptor set ... + // ... [0] Number of uniform buffers for this descriptor set + // ... [1] Number of storage buffers for this descriptor set + // ... [2] Length of entry point name (padded to nearest word size) + // ... 
[X] Entry point string data + // + // NOTE: See CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header() for the encoding + // + // + uint32_t module_entries = module_size / sizeof(uint32_t); + uint32_t idx = 1; // skip past the header_word_count + uint32_t entry_point_count = module_ptr[idx++]; + if(entry_point_count < 1) { + return nullptr; // no descriptors + } + + // allocate an array of entry point data + VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; + size_t entry_point_data_size = entry_point_count * sizeof(VulkanEntryPointData); + VulkanEntryPointData *entry_point_data = (VulkanEntryPointData *)vk_host_malloc(user_context, entry_point_data_size, 0, alloc_scope, allocator->callbacks()); + if (entry_point_data == nullptr) { + error(user_context) << "Vulkan: Failed to allocate entry_point_data! Out of memory!\n"; + return nullptr; + } + memset(entry_point_data, 0, entry_point_data_size); + + // decode and fill in each entry point + for(uint32_t n = 0; (n < entry_point_count) && (idx < module_entries); n++) { + halide_debug_assert(user_context,(idx + 4) < module_entries); + uint32_t uniform_buffer_count = module_ptr[idx++]; + uint32_t storage_buffer_count = module_ptr[idx++]; + uint32_t padded_string_length = module_ptr[idx++]; + const char* entry_point_name = (const char*)(module_ptr + idx); + + debug(user_context) << " [" << n << "] " + << "uniform_buffer_count=" << uniform_buffer_count << " " + << "storage_buffer_count=" << storage_buffer_count << " " + << "entry_point_name_length=" << padded_string_length << " " + << "entry_point_name: " << (const char*)entry_point_name << "\n"; + + entry_point_data[n].entry_point_name = entry_point_name; // NOTE: module owns string data + entry_point_data[n].uniform_buffer_count = uniform_buffer_count; + entry_point_data[n].storage_buffer_count = storage_buffer_count; + idx += (padded_string_length / sizeof(uint32_t)); // skip past string data + } + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return entry_point_data; +} + VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, - const char *src, int size) { + const char *ptr, int size) { #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: vk_compile_shader_module (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " - << "source: " << (void *)src << ", " + << "module: " << (void *)ptr << ", " << "size: " << size << ")\n"; uint64_t t_before = halide_current_time_ns(user_context); #endif + const uint32_t* module_ptr = (const uint32_t*)ptr; + const uint32_t module_size = (const uint32_t)size; + + halide_debug_assert(user_context, module_ptr != nullptr); + halide_debug_assert(user_context, module_size >= (2 * sizeof(uint32_t))); + + uint32_t header_word_count = module_ptr[0]; + uint32_t entry_point_count = module_ptr[1]; + uint32_t header_size = header_word_count * sizeof(uint32_t); + + // skip past the preamble header to the start of the SPIR-V binary + const uint32_t* binary_ptr = (module_ptr + header_word_count); + size_t binary_size = (size - header_size); + +#ifdef DEBUG_RUNTIME + debug(user_context) << "Vulkan: Decoding module (" + << "module_ptr: " << (void*)module_ptr << ", " + << "header_word_count: " << header_word_count << ", " + << "header_size: " << header_size << ", " + << 
"binar_ptr: " << (void*)binary_ptr << ", " + << "binary_size: " << (uint32_t)binary_size << ")\n"; +#endif + VkShaderModuleCreateInfo shader_info = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, nullptr, // pointer to structure extending this 0, // flags (curently unused) - (size_t)size, // code size in bytes - (const uint32_t *)src // source + (size_t)binary_size, // code size in bytes + (const uint32_t *)binary_ptr // source }; VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; @@ -697,13 +832,30 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan } memset(cache_entry, 0, sizeof(VulkanCompilationCacheEntry)); + // decode the entry point data and save it in the cache entry + cache_entry->entry_point_data = vk_decode_entry_point_data(user_context, allocator, module_ptr, module_size); + if(cache_entry->entry_point_data != nullptr) { + cache_entry->entry_point_count = entry_point_count; + } + VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &cache_entry->shader_module); if ((result != VK_SUCCESS)) { // } || (cache_entry->shader_module == nullptr)) { error(user_context) << "Vulkan: vkCreateShaderModule Failed! Error returned: " << vk_get_error_name(result) << "\n"; + vk_host_free(user_context, cache_entry->entry_point_data, allocator->callbacks()); vk_host_free(user_context, cache_entry, allocator->callbacks()); return nullptr; } + // allocate an array for storing the descriptor set layouts + if(cache_entry->entry_point_count) { + cache_entry->descriptor_set_layouts = (VkDescriptorSetLayout *)vk_host_malloc(user_context, cache_entry->entry_point_count * sizeof(VkDescriptorSetLayout), 0, alloc_scope, allocator->callbacks()); + if (cache_entry->descriptor_set_layouts == nullptr) { + error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! 
Out of memory!\n"; + return nullptr; + } + memset(cache_entry->descriptor_set_layouts, 0, sizeof(VkDescriptorSetLayout)); + } + #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; @@ -738,26 +890,39 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); cache_entry->shader_module = {0}; } - if (cache_entry->args_region) { - vk_destroy_scalar_uniform_buffer(user_context, allocator, cache_entry->args_region); - cache_entry->args_region = nullptr; - } - if (cache_entry->descriptor_set_layout) { - vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layout); - cache_entry->descriptor_set_layout = {0}; + if(cache_entry->entry_point_data) { + for( uint32_t n = 0; n < cache_entry->entry_point_count; n++) { + if (cache_entry->entry_point_data[n].args_region) { + vk_destroy_scalar_uniform_buffer(user_context, allocator, cache_entry->entry_point_data[n].args_region); + cache_entry->entry_point_data[n].args_region = nullptr; + } + if (cache_entry->entry_point_data[n].descriptor_pool) { + vk_destroy_descriptor_pool(user_context, allocator, cache_entry->entry_point_data[n].descriptor_pool); + cache_entry->entry_point_data[n].descriptor_pool = {0}; + } + if (cache_entry->entry_point_data[n].compute_pipeline) { + vk_destroy_compute_pipeline(user_context, allocator, cache_entry->entry_point_data[n].compute_pipeline); + cache_entry->entry_point_data[n].compute_pipeline = {0}; + } + } + + vk_host_free(user_context, cache_entry->entry_point_data, allocator->callbacks()); + cache_entry->entry_point_data = nullptr; + cache_entry->entry_point_count = 0; } - if (cache_entry->descriptor_pool) { - vk_destroy_descriptor_pool(user_context, allocator, cache_entry->descriptor_pool); - cache_entry->descriptor_pool = {0}; + if (cache_entry->descriptor_set_layouts) { + for( uint32_t n = 0; n < cache_entry->entry_point_count; n++) { + vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); + cache_entry->descriptor_set_layouts[n] = {0}; + } + vk_host_free(user_context, cache_entry->descriptor_set_layouts, allocator->callbacks()); + cache_entry->descriptor_set_layouts = nullptr; } if (cache_entry->pipeline_layout) { vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); cache_entry->pipeline_layout = {0}; } - if (cache_entry->compute_pipeline) { - vk_destroy_compute_pipeline(user_context, allocator, cache_entry->compute_pipeline); - cache_entry->compute_pipeline = {0}; - } + vk_host_free(user_context, cache_entry, allocator->callbacks()); } } From fdd281afa23e719fb49e7c88c40faa414bef3e76 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 10 Oct 2022 20:42:27 -0700 Subject: [PATCH 038/166] Clang tidy & format pass --- src/CodeGen_Vulkan_Dev.cpp | 157 +++++++++++++------------ src/SpirvIR.cpp | 204 ++++++++++++++++----------------- src/SpirvIR.h | 16 +-- src/runtime/mini_vulkan.h | 8 +- src/runtime/vulkan.cpp | 30 ++--- src/runtime/vulkan_context.h | 28 +++-- src/runtime/vulkan_internal.h | 6 +- src/runtime/vulkan_resources.h | 84 +++++++------- 8 files changed, 265 insertions(+), 268 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 37e3de9fcdf8..ab27269394cf 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -100,7 
+100,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const Let *) override; void visit(const LetStmt *) override; void visit(const AssertStmt *) override; -// void visit(const ProducerConsumer *) override; + // void visit(const ProducerConsumer *) override; void visit(const For *) override; void visit(const Store *) override; void visit(const Provide *) override; @@ -122,11 +122,11 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit_glsl_unaryop(Type t, const Expr &a, SpvId glsl_op_code); void visit_glsl_binop(Type t, const Expr &a, const Expr &b, SpvId glsl_op_code); - void load_from_scalar_index( SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class ); - void load_from_vector_index( const Load *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class ); + void load_from_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class); + void load_from_vector_index(const Load *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class); - void store_at_scalar_index( SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id ); - void store_at_vector_index( const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id ); + void store_at_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id); + void store_at_vector_index(const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id); using BuiltinMap = std::unordered_map; const BuiltinMap spirv_builtin = { @@ -147,8 +147,8 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { {"asin_f32", GLSLstd450Asin}, {"asinh_f16", GLSLstd450Asinh}, {"asinh_f32", GLSLstd450Asinh}, - {"atan2_f16", GLSLstd450Atan}, // also called atan in GLSL - {"atan2_f32", GLSLstd450Atan}, + {"atan2_f16", GLSLstd450Atan}, // also called atan in GLSL + {"atan2_f32", GLSLstd450Atan}, {"atan_f16", GLSLstd450Atan}, {"atan_f32", GLSLstd450Atan}, {"atanh_f16", GLSLstd450Atanh}, @@ -180,7 +180,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { {"trunc_f16", GLSLstd450Trunc}, {"trunc_f32", GLSLstd450Trunc}, }; - + // The SPIRV-IR builder SpvBuilder builder; @@ -211,7 +211,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // Encode the descriptor sets into a sidecar which will be added // as a header to the module prior to the actual SPIR-V binary - void encode_header(SpvBinary& spirv_header); + void encode_header(SpvBinary &spirv_header); // The workgroup size. Must be the same for all kernels. 
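(Editorial sketch, not part of the patch: the glsl_builtin table above maps Halide intrinsic names to GLSL.std.450 opcode numbers, and a match is emitted as an OpExtInst against the imported "GLSL.std.450" instruction set. A minimal standalone illustration of that lookup follows; the map contents and opcode values are taken from GLSL.std.450.h, while the emission itself happens in visit(const Call *) via visit_glsl_unaryop / visit_glsl_binop.)

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <unordered_map>

    int main() {
        // Tiny slice of the name -> GLSL.std.450 opcode mapping (values from GLSL.std.450.h).
        const std::unordered_map<std::string, uint32_t> glsl_builtin = {
            {"sqrt_f32", 31},   // GLSLstd450Sqrt  (unary)
            {"atan2_f32", 25},  // GLSLstd450Atan2 (binary)
        };
        auto it = glsl_builtin.find("sqrt_f32");
        if (it != glsl_builtin.end()) {
            // A unary match is emitted as: OpExtInst %type %result %glsl_set <opcode> %arg
            printf("emit GLSL.std.450 opcode %u\n", it->second);
        }
        return 0;
    }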
uint32_t workgroup_size[3]; @@ -347,7 +347,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { } else if (target_type.is_uint()) { op_code = SpvOpUConvert; } else if (target_type.is_int()) { - if(builder.is_capability_required(SpvCapabilityKernel)) { + if (builder.is_capability_required(SpvCapabilityKernel)) { op_code = SpvOpSatConvertUToS; } else { op_code = SpvOpBitcast; @@ -359,7 +359,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { if (target_type.is_float()) { op_code = SpvOpConvertSToF; } else if (target_type.is_uint()) { - if(builder.is_capability_required(SpvCapabilityKernel)) { + if (builder.is_capability_required(SpvCapabilityKernel)) { op_code = SpvOpSatConvertSToU; } else { op_code = SpvOpBitcast; @@ -373,9 +373,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; } - SpvId result_id = builder.reserve_id(SpvResultId); - if(op_code == SpvOpBitcast) { + if (op_code == SpvOpBitcast) { builder.append(SpvFactory::bitcast(target_type_id, result_id, src_id)); } else { builder.append(SpvFactory::convert(op_code, target_type_id, result_id, src_id)); @@ -557,10 +556,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device && !(fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared)) { - + uint32_t execution_scope = SpvDeviceScope; uint32_t memory_scope = SpvDeviceScope; - uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | + uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | SpvMemorySemanticsWorkgroupMemoryMask); SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); @@ -584,14 +583,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared) { uint32_t memory_scope = SpvWorkgroupScope; - uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | + uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | SpvMemorySemanticsWorkgroupMemoryMask); SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); builder.append(SpvFactory::memory_barrier(memory_scope_id, control_mask_id)); } else { - + uint32_t execution_scope = SpvDeviceScope; uint32_t memory_scope = SpvDeviceScope; uint32_t control_mask = SpvMemorySemanticsMaskNone; @@ -608,7 +607,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { op_code = GLSLstd450FAbs; } else { op_code = GLSLstd450SAbs; - } + } visit_glsl_unaryop(op->type, op->args[0], op_code); } else if (op->is_intrinsic(Call::IntrinsicOp::round)) { @@ -625,7 +624,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { internal_assert(op->args.size() == 2); // Simply discard the first argument, which is generally a call to // 'halide_printf'. 
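(Editorial sketch, not part of the patch: the memory-fence hunks above pick a scope and a semantics mask for gpu_thread_barrier. A shared fence becomes an OpMemoryBarrier with Workgroup scope, while a device fence uses Device scope for both the execution and memory scope operands, with the same AcquireRelease | WorkgroupMemory semantics. The standalone snippet below just prints those operand words; the numeric values are from the SPIR-V 1.0 specification, whereas the patch itself uses the identifiers from the bundled spirv/spirv.h.)

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Scope and memory-semantics values from the SPIR-V 1.0 specification.
        const uint32_t scope_device = 1;          // Scope: Device
        const uint32_t scope_workgroup = 2;       // Scope: Workgroup
        const uint32_t acquire_release = 0x8;     // MemorySemantics: AcquireRelease
        const uint32_t workgroup_memory = 0x100;  // MemorySemantics: WorkgroupMemory

        uint32_t semantics = acquire_release | workgroup_memory;
        // Shared fence: OpMemoryBarrier(Workgroup, AcquireRelease | WorkgroupMemory)
        printf("shared fence: memory scope=%u semantics=0x%x\n", scope_workgroup, semantics);
        // Device fence: both execution and memory scope are Device, same semantics mask.
        printf("device fence: scope=%u semantics=0x%x\n", scope_device, semantics);
        return 0;
    }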
- if(op->args[1].defined()) { + if (op->args[1].defined()) { op->args[1]->accept(this); } } else if (op->is_intrinsic(Call::bitwise_and)) { @@ -686,9 +685,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (op->is_intrinsic(Call::shift_right)) { if (op->type.is_uint()) { - visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftRightLogical); + visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftRightLogical); } else { - visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftRightArithmetic); + visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftRightArithmetic); } } else if (op->is_intrinsic(Call::shift_left)) { visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftLeftLogical); @@ -727,9 +726,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { return; } else { BuiltinMap::const_iterator spirv_it = spirv_builtin.find(op->name); - if(spirv_it != spirv_builtin.end()) { + if (spirv_it != spirv_builtin.end()) { SpvId spirv_op = spirv_it->second; - if((spirv_op == SpvOpIsInf) || (spirv_op == SpvOpIsNan)) { + if ((spirv_op == SpvOpIsInf) || (spirv_op == SpvOpIsNan)) { internal_assert(op->args.size() == 1); visit_unaryop(op->type, op->args[0], (SpvOp)spirv_op); } else { @@ -742,10 +741,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } SpvId glsl_op_code = glsl_it->second; - if(is_glsl_unary_op(glsl_op_code)) { + if (is_glsl_unary_op(glsl_op_code)) { internal_assert(op->args.size() == 1); visit_glsl_unaryop(op->type, op->args[0], glsl_op_code); - } else if(is_glsl_binary_op(glsl_op_code)) { + } else if (is_glsl_binary_op(glsl_op_code)) { internal_assert(op->args.size() == 2); visit_glsl_binop(op->type, op->args[0], op->args[1], glsl_op_code); } @@ -766,8 +765,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Select *op) { builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index( SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class ) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(): " +void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(): " << "index_id=" << index_id << " " << "base_id=" << base_id << " " << "type_id=" << type_id << " " @@ -777,14 +776,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index( SpvId index_id, uint32_t zero = 0; SpvId src_id = SpvInvalidId; SpvId src_index_id = index_id; - if(storage_class == SpvStorageClassUniform) { + if (storage_class == SpvStorageClassUniform) { SpvId zero_id = builder.declare_constant(UInt(32), &zero); SpvFactory::Indices access_indices = {zero_id, src_index_id}; src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); - } else if(storage_class == SpvStorageClassWorkgroup) { + } else if (storage_class == SpvStorageClassWorkgroup) { SpvFactory::Indices access_indices = {src_index_id}; src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); - } else if(storage_class == SpvStorageClassFunction) { + } else if (storage_class == SpvStorageClassFunction) { src_id = base_id; } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): unhandled storage class encountered on op: " << storage_class << "\n"; @@ -796,8 +795,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index( SpvId index_id, 
builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index( const Load *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class ) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(): " +void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(): " << "base_id=" << base_id << " " << "type_id=" << type_id << " " << "ptr_type_id=" << ptr_type_id << " " @@ -807,7 +806,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index( const Load *op, // determine the base type id for the source value SpvId base_type_id = builder.type_of(base_id); - if(builder.is_pointer_type(base_type_id)) { + if (builder.is_pointer_type(base_type_id)) { base_type_id = builder.lookup_base_type(base_type_id); } @@ -815,7 +814,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index( const Load *op, // then index the buffer using the base of the ramp divided by the number // of lanes. SpvFactory::Indices index_components; - if(builder.is_vector_type(base_type_id)) { + if (builder.is_vector_type(base_type_id)) { Expr ramp_base = strided_ramp_base(op->index); if (ramp_base.defined()) { Expr ramp_index = (ramp_base / op->type.lanes()); @@ -825,7 +824,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index( const Load *op, return; } } - + op->index.accept(this); SpvId index_id = builder.current_id(); @@ -834,7 +833,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index( const Load *op, SpvId value_type_id = builder.declare_type(op->type.with_lanes(1)); SpvId index_type_id = builder.declare_type(op->index.type().with_lanes(1)); for (uint32_t i = 0; i < (uint32_t)op->index.type().lanes(); i++) { - SpvFactory::Indices extract_indices = { i }; + SpvFactory::Indices extract_indices = {i}; SpvId index_component_id = builder.declare_variable(unique_name("_idx_ex"), index_type_id, SpvStorageClassFunction); builder.append(SpvFactory::composite_extract(index_type_id, index_component_id, index_id, extract_indices)); load_from_scalar_index(index_component_id, base_id, type_id, ptr_type_id, storage_class); @@ -850,8 +849,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index( const Load *op, builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index( SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id ) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(): " +void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(): " << "index_id=" << index_id << " " << "base_id=" << base_id << " " << "type_id=" << type_id << " " @@ -863,14 +862,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index( SpvId index_id, S SpvId dst_id = SpvInvalidId; SpvId dst_index_id = index_id; - if(storage_class == SpvStorageClassUniform) { + if (storage_class == SpvStorageClassUniform) { SpvId zero_id = builder.declare_constant(UInt(32), &zero); SpvFactory::Indices access_indices = {zero_id, dst_index_id}; dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); - } else if(storage_class == 
SpvStorageClassWorkgroup) { + } else if (storage_class == SpvStorageClassWorkgroup) { SpvFactory::Indices access_indices = {dst_index_id}; dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); - } else if(storage_class == SpvStorageClassFunction) { + } else if (storage_class == SpvStorageClassFunction) { dst_id = base_id; } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): unhandled storage class encountered on op: " << storage_class << "\n"; @@ -879,8 +878,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index( SpvId index_id, S builder.append(SpvFactory::store(dst_id, value_id)); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index( const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id ) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(): " +void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(): " << "base_id=" << base_id << " " << "type_id=" << type_id << " " << "ptr_type_id=" << ptr_type_id << " " @@ -890,24 +889,24 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index( const Store *op, // determine the base type id for the source value SpvId base_type_id = builder.type_of(base_id); - if(builder.is_pointer_type(base_type_id)) { + if (builder.is_pointer_type(base_type_id)) { base_type_id = builder.lookup_base_type(base_type_id); } // If this is a dense vector load and the buffer has a vector base type, // then index the buffer using the base of the ramp divided by the number // of lanes. - if(builder.is_vector_type(base_type_id)) { + if (builder.is_vector_type(base_type_id)) { Expr ramp_base = strided_ramp_base(op->index); if (ramp_base.defined()) { - Expr ramp_index = (ramp_base / op->value.type().lanes()); + Expr ramp_index = (ramp_base / op->value.type().lanes()); ramp_index.accept(this); SpvId index_id = builder.current_id(); store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); return; } } - + op->index.accept(this); SpvId index_id = builder.current_id(); @@ -915,7 +914,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index( const Store *op, SpvId value_type_id = builder.declare_type(op->value.type().with_lanes(1)); SpvId index_type_id = builder.declare_type(op->index.type().with_lanes(1)); for (uint32_t i = 0; i < (uint32_t)op->index.type().lanes(); i++) { - SpvFactory::Indices extract_indices = { i }; + SpvFactory::Indices extract_indices = {i}; SpvId index_component_id = builder.declare_variable(unique_name("_idx_ex"), index_type_id, SpvStorageClassFunction); builder.append(SpvFactory::composite_extract(index_type_id, index_component_id, index_id, extract_indices)); SpvId value_component_id = builder.declare_variable(unique_name("_val_ex"), value_type_id, SpvStorageClassFunction); @@ -939,10 +938,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { SpvId type_id = builder.declare_type(op->type); SpvId ptr_type_id = builder.declare_pointer_type(type_id, storage_class); - if(op->index.type().is_scalar()) { + if (op->index.type().is_scalar()) { op->index.accept(this); SpvId index_id = builder.current_id(); - load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); + load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, 
storage_class); } else { load_from_vector_index(op, base_id, type_id, ptr_type_id, storage_class); } @@ -966,10 +965,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { SpvId type_id = builder.declare_type(op->value.type()); SpvId ptr_type_id = builder.declare_pointer_type(type_id, storage_class); - if(op->index.type().is_scalar()) { + if (op->index.type().is_scalar()) { op->index.accept(this); SpvId index_id = builder.current_id(); - store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); + store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); } else { store_at_vector_index(op, base_id, type_id, ptr_type_id, storage_class, value_id); } @@ -1212,7 +1211,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { internal_assert(op->extents.size() == 1 && is_const(op->extents[0])); int32_t size = op->constant_allocation_size(); base_type_id = builder.declare_type(op->type, size); - storage_class = SpvStorageClassWorkgroup; // shared across workgroup + storage_class = SpvStorageClassWorkgroup; // shared across workgroup debug(2) << "Vulkan: Allocate " << op->name << "[" << (uint32_t)size << "] in shared memory on device in global scope\n"; SpvId ptr_type_id = builder.declare_pointer_type(base_type_id, storage_class); variable_id = builder.declare_global_variable(op->name, ptr_type_id, storage_class); @@ -1230,14 +1229,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { << "Try storing into shared memory instead."; base_type_id = builder.declare_type(op->type, size); - storage_class = SpvStorageClassFunction; // function scope + storage_class = SpvStorageClassFunction; // function scope SpvId ptr_type_id = builder.declare_pointer_type(base_type_id, storage_class); variable_id = builder.declare_variable(op->name, ptr_type_id, storage_class); } debug(3) << "Vulkan: Pushing allocation called " << op->name << " onto the symbol table\n"; symbol_table.push(op->name, {variable_id, storage_class}); - op->body.accept(this); + op->body.accept(this); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Free *op) { @@ -1418,45 +1417,45 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { // NOTE: Extensions are handled in finalize } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header(SpvBinary& spirv_header) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header(SpvBinary &spirv_header) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header\n"; - // Encode a sidecar for the module that lists the descriptor sets + // Encode a sidecar for the module that lists the descriptor sets // corresponding to each entry point contained in the module - // + // // [0] Header word count (total length of header) // [1] Number of descriptor sets // ... For each descriptor set ... // ... [0] Number of uniform buffers for this descriptor set // ... [1] Number of storage buffers for this descriptor set // ... [2] Length of entry point name (padded to nearest word size) - // ... [X] Entry point string data + // ... [X] Entry point string data // // NOTE: The Vulkan runtime consumes this header prior to compiling. - // + // // Both vk_decode_entry_point_data() and vk_compile_shader_module() will // need to be updated if the header encoding ever changes! 
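(Editorial sketch, not part of the patch: since the code generator and the runtime must agree on the sidecar layout documented above, a minimal standalone walk of the header may help. The helper name dump_sidecar is invented for illustration; the real decoder is vk_decode_entry_point_data() in vulkan_resources.h.)

    #include <cstdint>
    #include <cstdio>

    // Walk the sidecar header described above: words[0] is the total header length,
    // words[1] the number of descriptor sets, followed by one record per entry point.
    void dump_sidecar(const uint32_t *words) {
        uint32_t header_word_count = words[0];
        uint32_t entry_point_count = words[1];
        uint32_t idx = 2;
        for (uint32_t n = 0; n < entry_point_count; n++) {
            uint32_t uniform_buffer_count = words[idx++];
            uint32_t storage_buffer_count = words[idx++];
            uint32_t padded_name_bytes = words[idx++];            // padded to a word boundary
            const char *entry_point_name = (const char *)(words + idx);
            idx += padded_name_bytes / sizeof(uint32_t);          // skip the string data
            printf("[%u] %s: uniform=%u storage=%u\n",
                   n, entry_point_name, uniform_buffer_count, storage_buffer_count);
        }
        // The SPIR-V binary itself starts immediately after the header.
        const uint32_t *spirv_binary = words + header_word_count;
        (void)spirv_binary;
    }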
// uint32_t index = 0; spirv_header.push_back(descriptor_set_table.size()); - for(const DescriptorSet& ds: descriptor_set_table) { - std::vector padded_name; + for (const DescriptorSet &ds : descriptor_set_table) { + std::vector padded_name; uint32_t padded_word_count = (ds.entry_point_name.length() + 3) / 4; uint32_t padded_str_length = padded_word_count * 4; padded_name.reserve(padded_str_length); padded_name.insert(padded_name.begin(), ds.entry_point_name.c_str(), (ds.entry_point_name.c_str() + ds.entry_point_name.length())); uint32_t padding = (padded_str_length - ds.entry_point_name.length()); - for(uint32_t i = 0; i < padding; ++i) { + for (uint32_t i = 0; i < padding; ++i) { padded_name.push_back('\0'); } debug(2) << " [" << index << "] " << "uniform_buffer_count=" << ds.uniform_buffer_count << " " << "storage_buffer_count=" << ds.storage_buffer_count << " " - << "entry_point_name_length=" << padded_str_length << " " - << "entry_point_name_size=" << padded_name.size() << " " - << "entry_point_name: " << (const char*)padded_name.data() << "\n"; + << "entry_point_name_length=" << padded_str_length << " " + << "entry_point_name_size=" << padded_name.size() << " " + << "entry_point_name: " << (const char *)padded_name.data() << "\n"; spirv_header.push_back(ds.uniform_buffer_count); spirv_header.push_back(ds.storage_buffer_count); @@ -1472,7 +1471,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header(SpvBinary& spirv_header) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::compile\n"; - // First encode the descriptor set bindings for each entry point + // First encode the descriptor set bindings for each entry point // as a sidecar which we will add as a preamble header to the actual // SPIR-V binary so the runtime can know which descriptor set to use // for each entry point @@ -1484,12 +1483,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { builder.finalize(); builder.encode(spirv_binary); - size_t header_bytes = spirv_header.size() * sizeof(uint32_t); - size_t binary_bytes = spirv_binary.size() * sizeof(uint32_t); + size_t header_bytes = spirv_header.size() * sizeof(uint32_t); + size_t binary_bytes = spirv_binary.size() * sizeof(uint32_t); debug(2) << " encoding module (" - << "header_size: " << (uint32_t)(header_bytes) << ", " - << "binary_size: " << (uint32_t)(binary_bytes) << ")\n"; + << "header_size: " << (uint32_t)(header_bytes) << ", " + << "binary_size: " << (uint32_t)(binary_bytes) << ")\n"; // Combine the header and binary into the module module.reserve(header_bytes + binary_bytes); @@ -1543,7 +1542,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, } // Add the entry point with the appropriate execution model - // NOTE: exec_model must be GLCompute to work with Vulkan ... Kernel is only supported in OpenCL + // NOTE: exec_model must be GLCompute to work with Vulkan ... 
Kernel is only supported in OpenCL uint32_t current_entry_point = builder.current_module().entry_point_count(); builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); @@ -1560,7 +1559,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // Add a binding for a uniform buffer packed with all scalar args uint32_t binding_counter = 0; - if(param_struct_members.size() > 0) { + if (!param_struct_members.empty()) { const std::string struct_name = std::string("_struct") + name + std::string("_args"); SpvId param_struct_type_id = builder.declare_struct(struct_name, param_struct_members); @@ -1607,7 +1606,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, SpvId param_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::load(arg_type_id, param_id, access_chain_id)); symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); - } + } } } @@ -1616,10 +1615,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, if (arg.is_buffer) { // Add required extension support for storage types - if(arg.type.is_int_or_uint()) { - if(arg.type.bits() == 8) { + if (arg.type.is_int_or_uint()) { + if (arg.type.bits() == 8) { builder.require_extension("SPV_KHR_8bit_storage"); - } else if(arg.type.bits() == 16) { + } else if (arg.type.bits() == 16) { builder.require_extension("SPV_KHR_16bit_storage"); } } @@ -1722,10 +1721,10 @@ void CodeGen_Vulkan_Dev::dump() { std::vector module = compile_to_src(); // Skip the header and only output the SPIR-V binary - const uint32_t* decode = (const uint32_t*)(module.data()); + const uint32_t *decode = (const uint32_t *)(module.data()); uint32_t header_word_count = decode[0]; size_t header_size = header_word_count * sizeof(uint32_t); - const uint32_t* binary_ptr = (decode + header_word_count); + const uint32_t *binary_ptr = (decode + header_word_count); size_t binary_size = (module.size() - header_size); const char *filename = getenv("HL_SPIRV_DUMP_FILE") ? getenv("HL_SPIRV_DUMP_FILE") : "out.spv"; diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 11c4e632ba75..fc5605cf207d 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -542,7 +542,7 @@ SpvInstruction SpvModule::entry_point(const std::string &name) const { void SpvModule::import_instruction_set(SpvId id, const std::string &instruction_set) { check_defined(); - if(contents->imports.find(instruction_set) == contents->imports.end()) { + if (contents->imports.find(instruction_set) == contents->imports.end()) { contents->imports.insert({instruction_set, id}); } } @@ -597,16 +597,16 @@ SpvModule::EntryPointNames SpvModule::entry_point_names() const { SpvModule::ImportNames SpvModule::import_names() const { check_defined(); SpvModule::ImportNames results(contents->imports.size()); - for (const SpvModuleContents::Imports::value_type& v : contents->imports) { + for (const SpvModuleContents::Imports::value_type &v : contents->imports) { results.push_back(v.first); } return results; } -SpvId SpvModule::lookup_import(const std::string& instruction_set) const { +SpvId SpvModule::lookup_import(const std::string &instruction_set) const { SpvId result_id = SpvInvalidId; SpvModuleContents::Imports::const_iterator it = contents->imports.find(instruction_set); - if(it != contents->imports.end()) { + if (it != contents->imports.end()) { result_id = it->second; } return result_id; @@ -645,7 +645,7 @@ void SpvModule::encode(SpvBinary &binary) const { // 3. 
Extended Instruction Set Imports for (const SpvModuleContents::Imports::value_type &import : contents->imports) { - const std::string& import_name = import.first; + const std::string &import_name = import.first; SpvId import_id = import.second; SpvInstruction inst = SpvFactory::import(import_id, import_name); inst.encode(binary); @@ -1119,7 +1119,7 @@ SpvId SpvBuilder::lookup_scope(SpvId id) const { SpvId SpvBuilder::lookup_import(const std::string &instruction_set) const { return module.lookup_import(instruction_set); } - + void SpvBuilder::enter_function(const SpvFunction &func) { active_function = func; enter_block(active_function.entry_block()); @@ -1175,7 +1175,7 @@ SpvId SpvBuilder::import_glsl_intrinsics() { return import_instruction_set("GLSL.std.450"); } -SpvId SpvBuilder::import_instruction_set(const std::string& instruction_set) { +SpvId SpvBuilder::import_instruction_set(const std::string &instruction_set) { SpvId result_id = module.lookup_import(instruction_set); if (result_id == SpvInvalidId) { result_id = make_id(SpvImportId); @@ -1190,7 +1190,7 @@ void SpvBuilder::require_capability(SpvCapability capability) { } } -bool SpvBuilder::is_imported(const std::string& instruction_set) const { +bool SpvBuilder::is_imported(const std::string &instruction_set) const { return module.is_imported(instruction_set); } @@ -1241,7 +1241,7 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { SpvId array_size_id = make_id(SpvIntConstantId); SpvId array_size_type_id = add_type(array_size_type); SpvInstruction array_size_inst = SpvFactory::constant(array_size_id, array_size_type_id, array_size_type.bytes(), &array_size); - module.add_type(array_size_inst); // needs to be defined in the type section (prior to its use in the array_type inst) + module.add_type(array_size_inst); // needs to be defined in the type section (prior to its use in the array_type inst) constant_map[constant_key] = array_size_id; // declare the array type @@ -1284,8 +1284,8 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { } } else if (type.is_int_or_uint()) { SpvId signedness = 0; - bool signedness_support = module.is_capability_required(SpvCapabilityKernel) ? false : true; // kernel execution doesn't track signedness - if(signedness_support) { + bool signedness_support = module.is_capability_required(SpvCapabilityKernel) ? false : true; // kernel execution doesn't track signedness + if (signedness_support) { signedness = type.is_uint() ? 
0 : 1; } @@ -1696,7 +1696,7 @@ bool SpvBuilder::is_pointer_type(SpvId id) const { bool SpvBuilder::is_struct_type(SpvId id) const { SpvKind kind = kind_of(id); - if(kind == SpvStructTypeId) { + if (kind == SpvStructTypeId) { return true; } return false; @@ -1704,7 +1704,7 @@ bool SpvBuilder::is_struct_type(SpvId id) const { bool SpvBuilder::is_vector_type(SpvId id) const { SpvKind kind = kind_of(id); - if(kind == SpvVectorTypeId) { + if (kind == SpvVectorTypeId) { return true; } return false; @@ -1712,9 +1712,9 @@ bool SpvBuilder::is_vector_type(SpvId id) const { bool SpvBuilder::is_scalar_type(SpvId id) const { SpvKind kind = kind_of(id); - if((kind == SpvFloatTypeId) || - (kind == SpvIntTypeId) || - (kind == SpvBoolTypeId)) { + if ((kind == SpvFloatTypeId) || + (kind == SpvIntTypeId) || + (kind == SpvBoolTypeId)) { return true; } return false; @@ -1722,7 +1722,7 @@ bool SpvBuilder::is_scalar_type(SpvId id) const { bool SpvBuilder::is_array_type(SpvId id) const { SpvKind kind = kind_of(id); - if((kind == SpvArrayTypeId)) { + if ((kind == SpvArrayTypeId)) { return true; } return false; @@ -1730,12 +1730,12 @@ bool SpvBuilder::is_array_type(SpvId id) const { bool SpvBuilder::is_constant(SpvId id) const { SpvKind kind = kind_of(id); - if((kind == SpvConstantId) || - (kind == SpvBoolConstantId) || - (kind == SpvIntConstantId) || - (kind == SpvFloatConstantId) || - (kind == SpvStringConstantId) || - (kind == SpvCompositeConstantId)) { + if ((kind == SpvConstantId) || + (kind == SpvBoolConstantId) || + (kind == SpvIntConstantId) || + (kind == SpvFloatConstantId) || + (kind == SpvStringConstantId) || + (kind == SpvCompositeConstantId)) { return true; } return false; @@ -2312,13 +2312,13 @@ SpvInstruction SpvFactory::import(SpvId instruction_set_id, const std::string &i return inst; } -SpvInstruction SpvFactory::extended(SpvId instruction_set_id, SpvId instruction_number, SpvId type_id, SpvId result_id, const SpvFactory::Operands& operands) { +SpvInstruction SpvFactory::extended(SpvId instruction_set_id, SpvId instruction_number, SpvId type_id, SpvId result_id, const SpvFactory::Operands &operands) { SpvInstruction inst = SpvInstruction::make(SpvOpExtInst); inst.set_type_id(type_id); inst.set_result_id(result_id); inst.add_operand(instruction_set_id); inst.add_immediate(instruction_number); - for(SpvId o : operands) { + for (SpvId o : operands) { inst.add_operand(o); } return inst; @@ -2327,88 +2327,88 @@ SpvInstruction SpvFactory::extended(SpvId instruction_set_id, SpvId instruction_ /** GLSL extended instruction utility methods */ bool is_glsl_unary_op(SpvId glsl_op_code) { - switch(glsl_op_code) { - case GLSLstd450Round: - case GLSLstd450RoundEven: - case GLSLstd450Trunc: - case GLSLstd450FAbs: - case GLSLstd450SAbs: - case GLSLstd450FSign: - case GLSLstd450SSign: - case GLSLstd450Floor: - case GLSLstd450Ceil: - case GLSLstd450Fract: - case GLSLstd450Radians: - case GLSLstd450Degrees: - case GLSLstd450Sin: - case GLSLstd450Cos: - case GLSLstd450Tan: - case GLSLstd450Asin: - case GLSLstd450Acos: - case GLSLstd450Atan: - case GLSLstd450Asinh: - case GLSLstd450Acosh: - case GLSLstd450Atanh: - case GLSLstd450Cosh: - case GLSLstd450Exp: - case GLSLstd450Log: - case GLSLstd450Exp2: - case GLSLstd450Log2: - case GLSLstd450Sqrt: - case GLSLstd450InverseSqrt: - case GLSLstd450Determinant: - case GLSLstd450MatrixInverse: - case GLSLstd450ModfStruct: - case GLSLstd450FrexpStruct: - case GLSLstd450PackSnorm4x8: - case GLSLstd450PackUnorm4x8: - case GLSLstd450PackSnorm2x16: - case GLSLstd450PackUnorm2x16: 
- case GLSLstd450PackHalf2x16: - case GLSLstd450PackDouble2x32: - case GLSLstd450UnpackSnorm4x8: - case GLSLstd450UnpackUnorm4x8: - case GLSLstd450UnpackSnorm2x16: - case GLSLstd450UnpackUnorm2x16: - case GLSLstd450UnpackHalf2x16: - case GLSLstd450UnpackDouble2x32: - case GLSLstd450Length: - case GLSLstd450Normalize: - case GLSLstd450FindILsb: - case GLSLstd450FindSMsb: - case GLSLstd450FindUMsb: - case GLSLstd450InterpolateAtCentroid: - return true; - default: - break; + switch (glsl_op_code) { + case GLSLstd450Round: + case GLSLstd450RoundEven: + case GLSLstd450Trunc: + case GLSLstd450FAbs: + case GLSLstd450SAbs: + case GLSLstd450FSign: + case GLSLstd450SSign: + case GLSLstd450Floor: + case GLSLstd450Ceil: + case GLSLstd450Fract: + case GLSLstd450Radians: + case GLSLstd450Degrees: + case GLSLstd450Sin: + case GLSLstd450Cos: + case GLSLstd450Tan: + case GLSLstd450Asin: + case GLSLstd450Acos: + case GLSLstd450Atan: + case GLSLstd450Asinh: + case GLSLstd450Acosh: + case GLSLstd450Atanh: + case GLSLstd450Cosh: + case GLSLstd450Exp: + case GLSLstd450Log: + case GLSLstd450Exp2: + case GLSLstd450Log2: + case GLSLstd450Sqrt: + case GLSLstd450InverseSqrt: + case GLSLstd450Determinant: + case GLSLstd450MatrixInverse: + case GLSLstd450ModfStruct: + case GLSLstd450FrexpStruct: + case GLSLstd450PackSnorm4x8: + case GLSLstd450PackUnorm4x8: + case GLSLstd450PackSnorm2x16: + case GLSLstd450PackUnorm2x16: + case GLSLstd450PackHalf2x16: + case GLSLstd450PackDouble2x32: + case GLSLstd450UnpackSnorm4x8: + case GLSLstd450UnpackUnorm4x8: + case GLSLstd450UnpackSnorm2x16: + case GLSLstd450UnpackUnorm2x16: + case GLSLstd450UnpackHalf2x16: + case GLSLstd450UnpackDouble2x32: + case GLSLstd450Length: + case GLSLstd450Normalize: + case GLSLstd450FindILsb: + case GLSLstd450FindSMsb: + case GLSLstd450FindUMsb: + case GLSLstd450InterpolateAtCentroid: + return true; + default: + break; }; return false; } bool is_glsl_binary_op(SpvId glsl_op_code) { - switch(glsl_op_code) { - case GLSLstd450Atan2: - case GLSLstd450Pow: - case GLSLstd450Modf: - case GLSLstd450FMin: - case GLSLstd450UMin: - case GLSLstd450SMin: - case GLSLstd450FMax: - case GLSLstd450UMax: - case GLSLstd450SMax: - case GLSLstd450Step: - case GLSLstd450Frexp: - case GLSLstd450Ldexp: - case GLSLstd450Distance: - case GLSLstd450Cross: - case GLSLstd450Reflect: - case GLSLstd450InterpolateAtOffset: - case GLSLstd450InterpolateAtSample: - case GLSLstd450NMax: - case GLSLstd450NMin: - return true; - default: - break; + switch (glsl_op_code) { + case GLSLstd450Atan2: + case GLSLstd450Pow: + case GLSLstd450Modf: + case GLSLstd450FMin: + case GLSLstd450UMin: + case GLSLstd450SMin: + case GLSLstd450FMax: + case GLSLstd450UMax: + case GLSLstd450SMax: + case GLSLstd450Step: + case GLSLstd450Frexp: + case GLSLstd450Ldexp: + case GLSLstd450Distance: + case GLSLstd450Cross: + case GLSLstd450Reflect: + case GLSLstd450InterpolateAtOffset: + case GLSLstd450InterpolateAtSample: + case GLSLstd450NMax: + case GLSLstd450NMin: + return true; + default: + break; }; return false; } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 08c469f79a51..03a4abc0dded 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -23,8 +23,8 @@ #include "IntrusivePtr.h" #include "Type.h" -#include // Use v1.0 spec as the minimal viable version (for maximum compatiblity) #include // GLSL extended instructions for common intrinsics +#include // Use v1.0 spec as the minimal viable version (for maximum compatiblity) namespace Halide { namespace Internal { @@ -270,13 +270,13 @@ class SpvModule { 
SpvInstruction entry_point(const std::string &name) const; EntryPointNames entry_point_names() const; ImportNames import_names() const; - SpvId lookup_import(const std::string& Instruction_set) const; + SpvId lookup_import(const std::string &Instruction_set) const; uint32_t entry_point_count() const; const Instructions &execution_modes() const; uint32_t binding_count() const; SpvModule module() const; - bool is_imported(const std::string& instruction_set) const; + bool is_imported(const std::string &instruction_set) const; bool is_capability_required(SpvCapability val) const; bool is_extension_required(const std::string &val) const; bool is_defined() const; @@ -363,11 +363,11 @@ class SpvBuilder { SpvId import_glsl_intrinsics(); SpvId import_instruction_set(const std::string &instruction_set); - + void require_extension(const std::string &extension); void require_capability(SpvCapability); - bool is_imported(const std::string& instruction_set) const; + bool is_imported(const std::string &instruction_set) const; bool is_extension_required(const std::string &extension) const; bool is_capability_required(SpvCapability) const; @@ -390,13 +390,13 @@ class SpvBuilder { bool is_scalar_type(SpvId id) const; bool is_array_type(SpvId id) const; bool is_constant(SpvId id) const; - + SpvId lookup_base_type(SpvId pointer_type) const; SpvStorageClass lookup_storage_class(SpvId id) const; SpvId lookup_id(const std::string &symbol) const; SpvId lookup_scope(SpvId id) const; SpvId lookup_import(const std::string &instruction_set) const; - + std::string lookup_symbol(SpvId id) const; SpvId declare_symbol(const std::string &symbol, SpvId id, SpvId scope_id); void add_symbol(const std::string &symbol, SpvId id, SpvId scope_id); @@ -557,7 +557,7 @@ struct SpvFactory { static SpvInstruction unary_op(SpvOp op_code, SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction binary_op(SpvOp op_code, SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); static SpvInstruction convert(SpvOp op_code, SpvId type_id, SpvId result_id, SpvId src_id); - static SpvInstruction extended(SpvId instruction_set_id, SpvId instruction_number, SpvId type_id, SpvId result_id, const SpvFactory::Operands& operands); + static SpvInstruction extended(SpvId instruction_set_id, SpvId instruction_number, SpvId type_id, SpvId result_id, const SpvFactory::Operands &operands); }; /** Contents of a SPIR-V Instruction */ diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index 4cbd15f0da9f..6e94c530abb8 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -6036,10 +6036,10 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetValidationCacheDataEXT( // Provided by VK_VERSION_1_2 typedef struct VkPhysicalDeviceShaderFloat16Int8Features { - uint32_t sType; - void* pNext; - VkBool32 shaderFloat16; - VkBool32 shaderInt8; + uint32_t sType; + void *pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; } VkPhysicalDeviceShaderFloat16Int8FeaturesKHR; #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES 1000082000 diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 44431242d820..6e7bdaa2c9b3 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -642,27 +642,27 @@ WEAK int halide_vulkan_run(void *user_context, // 3a. 
Locate the correct entry point from the cache bool found_entry_point = false; uint32_t entry_point_index = 0; - for(uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { - if(strstr(cache_entry->entry_point_data[n].entry_point_name, entry_name) != nullptr) { + for (uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { + if (strstr(cache_entry->entry_point_data[n].entry_point_name, entry_name) != nullptr) { entry_point_index = n; found_entry_point = true; } } - if(!found_entry_point) { + if (!found_entry_point) { error(user_context) << "Vulkan: Failed to locate shader entry point! Unable to proceed!\n"; return halide_error_code_internal_error; } - debug(user_context) << " found entry point [" - << entry_point_index << "/" << cache_entry->entry_point_count + debug(user_context) << " found entry point [" + << entry_point_index << "/" << cache_entry->entry_point_count << "] '" << entry_name << "'\n"; halide_abort_if_false(user_context, cache_entry->descriptor_set_layouts != nullptr); if (cache_entry->pipeline_layout == 0) { - //// 1. Create all descriptor set layouts - for(uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { - uint32_t uniform_buffer_count = cache_entry->entry_point_data[n].uniform_buffer_count; - uint32_t storage_buffer_count = cache_entry->entry_point_data[n].storage_buffer_count; + //// 1. Create all descriptor set layouts + for (uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { + uint32_t uniform_buffer_count = cache_entry->entry_point_data[n].uniform_buffer_count; + uint32_t storage_buffer_count = cache_entry->entry_point_data[n].storage_buffer_count; VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(cache_entry->descriptor_set_layouts[n])); if (result != VK_SUCCESS) { error(user_context) << "Vulkan: vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; @@ -678,7 +678,7 @@ WEAK int halide_vulkan_run(void *user_context, } } - VulkanEntryPointData* entry_point_data = (cache_entry->entry_point_data + entry_point_index); + VulkanEntryPointData *entry_point_data = (cache_entry->entry_point_data + entry_point_index); halide_abort_if_false(user_context, entry_point_data != nullptr); //// 1a. Create a buffer for the scalar parameters @@ -710,8 +710,8 @@ WEAK int halide_vulkan_run(void *user_context, // NOTE: while this could be re-used across multiple pipelines, we only know the storage requirements of this kernel's // inputs and outputs ... so create a pool specific to the number of buffers known at this time - uint32_t uniform_buffer_count = entry_point_data->uniform_buffer_count; // needs_scalar_uniform_buffer ? 1 : 0; - uint32_t storage_buffer_count = entry_point_data->storage_buffer_count; // entry_point_data->buffer_count; + uint32_t uniform_buffer_count = entry_point_data->uniform_buffer_count; // needs_scalar_uniform_buffer ? 1 : 0; + uint32_t storage_buffer_count = entry_point_data->storage_buffer_count; // entry_point_data->buffer_count; VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(entry_point_data->descriptor_pool)); if (result != VK_SUCCESS) { error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; @@ -758,9 +758,9 @@ WEAK int halide_vulkan_run(void *user_context, //// 7. 
Begin the command buffer result = vk_fill_command_buffer_with_dispatch_call(user_context, - ctx.device, command_buffer, - entry_point_data->compute_pipeline, - cache_entry->pipeline_layout, + ctx.device, command_buffer, + entry_point_data->compute_pipeline, + cache_entry->pipeline_layout, entry_point_data->descriptor_set, entry_point_index, blocksX, blocksY, blocksZ); diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 81cfae29f0e6..01d9002b903d 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -253,29 +253,27 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk // If the instance runtime supports querying extended device features, request them VkPhysicalDeviceShaderFloat16Int8FeaturesKHR shader_f16_i8_ext = { - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES, - nullptr, VK_FALSE, VK_FALSE - }; + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES, + nullptr, VK_FALSE, VK_FALSE}; VkPhysicalDeviceFeatures2KHR device_features_ext = { - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR, &shader_f16_i8_ext, - device_features - }; - - void* extended_features_ptr = nullptr; - void* standard_features_ptr = nullptr; - PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2KHR"); // v1.0+ - if(!vkGetPhysicalDeviceFeatures2KHR) { vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2"); } // v1.1+ - if(vkGetPhysicalDeviceFeatures2KHR) { + device_features}; + + void *extended_features_ptr = nullptr; + void *standard_features_ptr = nullptr; + PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2KHR"); // v1.0+ + if (!vkGetPhysicalDeviceFeatures2KHR) { vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2"); } // v1.1+ + if (vkGetPhysicalDeviceFeatures2KHR) { debug(user_context) << "Vulkan: Querying for extended device features...\n"; vkGetPhysicalDeviceFeatures2KHR(*physical_device, &device_features_ext); debug(user_context) << "Vulkan: Shader Int8 support: " << (shader_f16_i8_ext.shaderInt8 ? "true" : "false") << "...\n"; debug(user_context) << "Vulkan: Shader Float16 support: " << (shader_f16_i8_ext.shaderFloat16 ? 
"true" : "false") << "...\n"; - extended_features_ptr = (void*)(&device_features_ext); // pass v1.1 extended features (which also contains the standard features) + extended_features_ptr = (void *)(&device_features_ext); // pass v1.1 extended features (which also contains the standard features) } else { vkGetPhysicalDeviceFeatures(*physical_device, &device_features); - standard_features_ptr = &device_features; // pass v1.0 standard features + standard_features_ptr = &device_features; // pass v1.0 standard features } VkDeviceCreateInfo device_create_info = { @@ -286,7 +284,7 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk &device_queue_create_info, (uint32_t)requested_layers.size(), requested_layers.data(), // Layers (uint32_t)required_device_extensions.size(), required_device_extensions.data(), // Enabled extensions - (VkPhysicalDeviceFeatures*)standard_features_ptr, // Requested device features + (VkPhysicalDeviceFeatures *)standard_features_ptr, // Requested device features }; VkResult result = vkCreateDevice(*physical_device, &device_create_info, alloc_callbacks, device); diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index e8df451e655f..696cdd9dd871 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -170,7 +170,7 @@ VkResult vk_update_descriptor_set(void *user_context, VkResult vk_create_pipeline_layout(void *user_context, VulkanMemoryAllocator *allocator, uint32_t descriptor_set_count, - VkDescriptorSetLayout* descriptor_set_layouts, + VkDescriptorSetLayout *descriptor_set_layouts, VkPipelineLayout *pipeline_layout); VkResult vk_destroy_pipeline_layout(void *user_context, @@ -189,8 +189,8 @@ VkResult vk_destroy_compute_pipeline(void *user_context, VkPipeline compute_pipeline); // -- Shader Module -VulkanEntryPointData* vk_decode_entry_point_data(void* user_context, VulkanMemoryAllocator *allocator, - const uint32_t* module_ptr, uint32_t module_size); +VulkanEntryPointData *vk_decode_entry_point_data(void *user_context, VulkanMemoryAllocator *allocator, + const uint32_t *module_ptr, uint32_t module_size); VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, const char *src, int size); diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 764dc6bfbca2..0923c216cc66 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -13,7 +13,7 @@ namespace Vulkan { // Compilation cache for compiled shader modules struct VulkanEntryPointData { - const char* entry_point_name = nullptr; + const char *entry_point_name = nullptr; VkDescriptorPool descriptor_pool = {0}; VkDescriptorSet descriptor_set = {0}; VkPipeline compute_pipeline = {0}; @@ -25,10 +25,10 @@ struct VulkanEntryPointData { struct VulkanCompilationCacheEntry { VkShaderModule shader_module = {0}; - VkDescriptorSetLayout* descriptor_set_layouts = {0}; + VkDescriptorSetLayout *descriptor_set_layouts = nullptr; VkPipelineLayout pipeline_layout = {0}; uint32_t entry_point_count = 0; - VulkanEntryPointData* entry_point_data = nullptr; + VulkanEntryPointData *entry_point_data = nullptr; }; WEAK Halide::Internal::GPUCompilationCache compilation_cache; @@ -224,7 +224,7 @@ VkResult vk_create_descriptor_pool(void *user_context, BlockStorage pool_sizes(user_context, pool_config); // First binding is reserved for passing scalar parameters as a uniform buffer - if(uniform_buffer_count > 0) { + if (uniform_buffer_count > 0) { VkDescriptorPoolSize 
uniform_buffer_size = { VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type uniform_buffer_count // all kernel args are packed into uniform buffers @@ -295,7 +295,7 @@ VkResult vk_create_descriptor_set_layout(void *user_context, BlockStorage layout_bindings(user_context, layout_config); // add all uniform buffers first - for(uint32_t n = 0; n < uniform_buffer_count; ++n) { + for (uint32_t n = 0; n < uniform_buffer_count; ++n) { VkDescriptorSetLayoutBinding uniform_buffer_layout = { (uint32_t)layout_bindings.size(), // binding index VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // descriptor type @@ -305,15 +305,15 @@ VkResult vk_create_descriptor_set_layout(void *user_context, }; #ifdef DEBUG_RUNTIME - debug(user_context) - << " [" << (uint32_t)layout_bindings.size() << "] : UNIFORM_BUFFER\n"; + debug(user_context) + << " [" << (uint32_t)layout_bindings.size() << "] : UNIFORM_BUFFER\n"; #endif layout_bindings.append(user_context, &uniform_buffer_layout); } // Add all other storage buffers - for(uint32_t n = 0; n < storage_buffer_count; ++n) { + for (uint32_t n = 0; n < storage_buffer_count; ++n) { // halide buffers will be passed as STORAGE_BUFFERS VkDescriptorSetLayoutBinding storage_buffer_layout = { @@ -324,8 +324,8 @@ VkResult vk_create_descriptor_set_layout(void *user_context, nullptr // immutable samplers }; #ifdef DEBUG_RUNTIME -debug(user_context) - << " [" << (uint32_t)layout_bindings.size() << "] : STORAGE_BUFFER\n"; + debug(user_context) + << " [" << (uint32_t)layout_bindings.size() << "] : STORAGE_BUFFER\n"; #endif layout_bindings.append(user_context, &storage_buffer_layout); @@ -598,7 +598,7 @@ void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator VkResult vk_create_pipeline_layout(void *user_context, VulkanMemoryAllocator *allocator, uint32_t descriptor_set_count, - VkDescriptorSetLayout* descriptor_set_layouts, + VkDescriptorSetLayout *descriptor_set_layouts, VkPipelineLayout *pipeline_layout) { #ifdef DEBUG_RUNTIME @@ -708,12 +708,12 @@ VkResult vk_destroy_compute_pipeline(void *user_context, // -------------------------------------------------------------------------- -VulkanEntryPointData* vk_decode_entry_point_data(void* user_context, VulkanMemoryAllocator *allocator, const uint32_t* module_ptr, uint32_t module_size) { +VulkanEntryPointData *vk_decode_entry_point_data(void *user_context, VulkanMemoryAllocator *allocator, const uint32_t *module_ptr, uint32_t module_size) { #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: vk_decode_entry_point_data (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "module_ptr: " << (void*)module_ptr << ", " + << "module_ptr: " << (void *)module_ptr << ", " << "module_size: " << module_size << ")\n"; uint64_t t_before = halide_current_time_ns(user_context); @@ -721,7 +721,7 @@ VulkanEntryPointData* vk_decode_entry_point_data(void* user_context, VulkanMemor halide_debug_assert(user_context, module_ptr != nullptr); halide_debug_assert(user_context, module_size >= (2 * sizeof(uint32_t))); - // Decode the sidecar for the module that lists the descriptor sets + // Decode the sidecar for the module that lists the descriptor sets // corresponding to each entry point contained in the module // // [0] Header word count (total length of header) @@ -730,16 +730,16 @@ VulkanEntryPointData* vk_decode_entry_point_data(void* user_context, VulkanMemor // ... [0] Number of uniform buffers for this descriptor set // ... [1] Number of storage buffers for this descriptor set // ... 
[2] Length of entry point name (padded to nearest word size) - // ... [X] Entry point string data + // ... [X] Entry point string data // // NOTE: See CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header() for the encoding // // uint32_t module_entries = module_size / sizeof(uint32_t); - uint32_t idx = 1; // skip past the header_word_count + uint32_t idx = 1; // skip past the header_word_count uint32_t entry_point_count = module_ptr[idx++]; - if(entry_point_count < 1) { - return nullptr; // no descriptors + if (entry_point_count < 1) { + return nullptr; // no descriptors } // allocate an array of entry point data @@ -753,29 +753,29 @@ VulkanEntryPointData* vk_decode_entry_point_data(void* user_context, VulkanMemor memset(entry_point_data, 0, entry_point_data_size); // decode and fill in each entry point - for(uint32_t n = 0; (n < entry_point_count) && (idx < module_entries); n++) { - halide_debug_assert(user_context,(idx + 4) < module_entries); + for (uint32_t n = 0; (n < entry_point_count) && (idx < module_entries); n++) { + halide_debug_assert(user_context, (idx + 4) < module_entries); uint32_t uniform_buffer_count = module_ptr[idx++]; uint32_t storage_buffer_count = module_ptr[idx++]; uint32_t padded_string_length = module_ptr[idx++]; - const char* entry_point_name = (const char*)(module_ptr + idx); + const char *entry_point_name = (const char *)(module_ptr + idx); debug(user_context) << " [" << n << "] " - << "uniform_buffer_count=" << uniform_buffer_count << " " - << "storage_buffer_count=" << storage_buffer_count << " " - << "entry_point_name_length=" << padded_string_length << " " - << "entry_point_name: " << (const char*)entry_point_name << "\n"; + << "uniform_buffer_count=" << uniform_buffer_count << " " + << "storage_buffer_count=" << storage_buffer_count << " " + << "entry_point_name_length=" << padded_string_length << " " + << "entry_point_name: " << (const char *)entry_point_name << "\n"; - entry_point_data[n].entry_point_name = entry_point_name; // NOTE: module owns string data + entry_point_data[n].entry_point_name = entry_point_name; // NOTE: module owns string data entry_point_data[n].uniform_buffer_count = uniform_buffer_count; entry_point_data[n].storage_buffer_count = storage_buffer_count; - idx += (padded_string_length / sizeof(uint32_t)); // skip past string data + idx += (padded_string_length / sizeof(uint32_t)); // skip past string data } #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; -#endif +#endif return entry_point_data; } @@ -793,33 +793,33 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan uint64_t t_before = halide_current_time_ns(user_context); #endif - const uint32_t* module_ptr = (const uint32_t*)ptr; + const uint32_t *module_ptr = (const uint32_t *)ptr; const uint32_t module_size = (const uint32_t)size; halide_debug_assert(user_context, module_ptr != nullptr); halide_debug_assert(user_context, module_size >= (2 * sizeof(uint32_t))); - uint32_t header_word_count = module_ptr[0]; + uint32_t header_word_count = module_ptr[0]; uint32_t entry_point_count = module_ptr[1]; uint32_t header_size = header_word_count * sizeof(uint32_t); // skip past the preamble header to the start of the SPIR-V binary - const uint32_t* binary_ptr = (module_ptr + header_word_count); + const uint32_t *binary_ptr = (module_ptr + header_word_count); size_t binary_size = (size - header_size); #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: 
Decoding module (" - << "module_ptr: " << (void*)module_ptr << ", " + << "module_ptr: " << (void *)module_ptr << ", " << "header_word_count: " << header_word_count << ", " << "header_size: " << header_size << ", " - << "binar_ptr: " << (void*)binary_ptr << ", " + << "binar_ptr: " << (void *)binary_ptr << ", " << "binary_size: " << (uint32_t)binary_size << ")\n"; #endif VkShaderModuleCreateInfo shader_info = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, - nullptr, // pointer to structure extending this - 0, // flags (curently unused) + nullptr, // pointer to structure extending this + 0, // flags (curently unused) (size_t)binary_size, // code size in bytes (const uint32_t *)binary_ptr // source }; @@ -834,10 +834,10 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan // decode the entry point data and save it in the cache entry cache_entry->entry_point_data = vk_decode_entry_point_data(user_context, allocator, module_ptr, module_size); - if(cache_entry->entry_point_data != nullptr) { + if (cache_entry->entry_point_data != nullptr) { cache_entry->entry_point_count = entry_point_count; - } - + } + VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &cache_entry->shader_module); if ((result != VK_SUCCESS)) { // } || (cache_entry->shader_module == nullptr)) { error(user_context) << "Vulkan: vkCreateShaderModule Failed! Error returned: " << vk_get_error_name(result) << "\n"; @@ -847,7 +847,7 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan } // allocate an array for storing the descriptor set layouts - if(cache_entry->entry_point_count) { + if (cache_entry->entry_point_count) { cache_entry->descriptor_set_layouts = (VkDescriptorSetLayout *)vk_host_malloc(user_context, cache_entry->entry_point_count * sizeof(VkDescriptorSetLayout), 0, alloc_scope, allocator->callbacks()); if (cache_entry->descriptor_set_layouts == nullptr) { error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!\n"; @@ -890,8 +890,8 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); cache_entry->shader_module = {0}; } - if(cache_entry->entry_point_data) { - for( uint32_t n = 0; n < cache_entry->entry_point_count; n++) { + if (cache_entry->entry_point_data) { + for (uint32_t n = 0; n < cache_entry->entry_point_count; n++) { if (cache_entry->entry_point_data[n].args_region) { vk_destroy_scalar_uniform_buffer(user_context, allocator, cache_entry->entry_point_data[n].args_region); cache_entry->entry_point_data[n].args_region = nullptr; @@ -911,7 +911,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat cache_entry->entry_point_count = 0; } if (cache_entry->descriptor_set_layouts) { - for( uint32_t n = 0; n < cache_entry->entry_point_count; n++) { + for (uint32_t n = 0; n < cache_entry->entry_point_count; n++) { vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); cache_entry->descriptor_set_layouts[n] = {0}; } From bedf1b39c095986d583f496a08fbb3b46d48aa85 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 10 Oct 2022 20:42:44 -0700 Subject: [PATCH 039/166] Update test results for Vulkan docs. 
Passing: 326 Failing: 39 --- README_vulkan.md | 124 +++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index 3fd4eb3a44d4..5385ddb2ff86 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -197,7 +197,7 @@ modifications to run under Vulkan): | Totals | *PASS* 🟢 | *FAIL* 🔴 | | -- | -- | -- | -| | 307 | 58 | +| | 326 | 39 | | Test Name | Status | @@ -213,11 +213,11 @@ modifications to run under Vulkan): | correctness_unsafe_promises | *PASS* 🟢 | | correctness_reorder_rvars | *PASS* 🟢 | | correctness_lossless_cast | *PASS* 🟢 | -| correctness_gpu_reuse_shared_memory | *FAIL* 🔴 | -| correctness_boundary_conditions | *FAIL* 🔴 | +| correctness_gpu_reuse_shared_memory | *FAIL* 🔴 | +| correctness_boundary_conditions | *FAIL* 🔴 | | correctness_min_extent | *PASS* 🟢 | -| correctness_gpu_sum_scan | *FAIL* 🔴 | -| correctness_dynamic_allocation_in_gpu_kernel | *FAIL* 🔴 | +| correctness_gpu_sum_scan | *PASS* 🟢 | +| correctness_dynamic_allocation_in_gpu_kernel | *FAIL* 🔴 | | correctness_image_of_lists | *PASS* 🟢 | | correctness_tracing_broadcast | *PASS* 🟢 | | correctness_scatter | *PASS* 🟢 | @@ -228,13 +228,13 @@ modifications to run under Vulkan): | correctness_compare_vars | *PASS* 🟢 | | correctness_non_vector_aligned_embeded_buffer | *PASS* 🟢 | | correctness_realize_larger_than_two_gigs | *PASS* 🟢 | -| correctness_gpu_transpose | *FAIL* 🔴 | +| correctness_gpu_transpose | *FAIL* 🔴 | | correctness_side_effects | *PASS* 🟢 | -| correctness_logical | *FAIL* 🔴 | +| correctness_logical | *FAIL* 🔴 | | correctness_func_lifetime_2 | *PASS* 🟢 | -| correctness_device_crop | *FAIL* 🔴 | +| correctness_device_crop | *FAIL* 🔴 | | correctness_print_loop_nest | *PASS* 🟢 | -| correctness_bool_compute_root_vectorize | *FAIL* 🔴 | +| correctness_bool_compute_root_vectorize | *FAIL* 🔴 | | correctness_extract_concat_bits | *PASS* 🟢 | | correctness_dead_realization_in_specialization | *PASS* 🟢 | | correctness_undef | *PASS* 🟢 | @@ -242,8 +242,8 @@ modifications to run under Vulkan): | correctness_parallel_scatter | *PASS* 🟢 | | correctness_multi_splits_with_diff_tail_strategies | *PASS* 🟢 | | correctness_gpu_arg_types | *PASS* 🟢 | -| correctness_cascaded_filters | *PASS* 🟢 | -| correctness_trim_no_ops | *FAIL* 🔴 | +| correctness_cascaded_filters | *FAIL* 🔴 | +| correctness_trim_no_ops | *PASS* 🟢 | | correctness_float16_t_comparison | *PASS* 🟢 | | correctness_legal_race_condition | *PASS* 🟢 | | correctness_explicit_inline_reductions | *PASS* 🟢 | @@ -257,7 +257,7 @@ modifications to run under Vulkan): | correctness_gpu_assertion_in_kernel | *PASS* 🟢 | | correctness_low_bit_depth_noise | *PASS* 🟢 | | correctness_fuse | *PASS* 🟢 | -| correctness_vector_cast | *FAIL* 🔴 | +| correctness_vector_cast | *FAIL* 🔴 | | correctness_concat | *PASS* 🟢 | | correctness_mod | *PASS* 🟢 | | correctness_parallel_rvar | *PASS* 🟢 | @@ -271,7 +271,7 @@ modifications to run under Vulkan): | correctness_failed_unroll | *PASS* 🟢 | | correctness_gpu_vectorized_shared_memory | *PASS* 🟢 | | correctness_bounds_inference_complex | *PASS* 🟢 | -| correctness_widening_reduction | *FAIL* 🔴 | +| correctness_widening_reduction | *FAIL* 🔴 | | correctness_extern_partial | *PASS* 🟢 | | correctness_multi_output_pipeline_with_bad_sizes | *PASS* 🟢 | | correctness_hoist_loop_invariant_if_statements | *PASS* 🟢 | @@ -287,25 +287,25 @@ modifications to run under Vulkan): | correctness_random | *PASS* 🟢 | | correctness_partition_loops_bug | *PASS* 🟢 | | 
correctness_stencil_chain_in_update_definitions | *PASS* 🟢 | -| correctness_inverse | *PASS* 🟢 | +| correctness_inverse | *FAIL* 🔴 | | correctness_skip_stages | *PASS* 🟢 | | correctness_cuda_8_bit_dot_product | *PASS* 🟢 | -| correctness_gpu_vectorize | *FAIL* 🔴 | -| correctness_gpu_object_lifetime_3 | *FAIL* 🔴 | +| correctness_gpu_vectorize | *FAIL* 🔴 | +| correctness_gpu_object_lifetime_3 | *PASS* 🟢 | | correctness_histogram | *PASS* 🟢 | | correctness_shared_self_references | *PASS* 🟢 | -| correctness_gpu_mixed_shared_mem_types | *FAIL* 🔴 | +| correctness_gpu_mixed_shared_mem_types | *FAIL* 🔴 | | correctness_custom_cuda_context | *PASS* 🟢 | | correctness_implicit_args_tests | *PASS* 🟢 | | correctness_compile_to_lowered_stmt | *PASS* 🟢 | | correctness_bounds_of_func | *PASS* 🟢 | -| correctness_interleave_rgb | *FAIL* 🔴 | +| correctness_interleave_rgb | *FAIL* 🔴 | | correctness_multi_gpu_gpu_multi_device | *PASS* 🟢 | | correctness_lambda | *PASS* 🟢 | | correctness_interval | *PASS* 🟢 | | correctness_unused_func | *PASS* 🟢 | | correctness_fuzz_float_stores | *PASS* 🟢 | -| correctness_newtons_method | *FAIL* 🔴 | +| correctness_newtons_method | *FAIL* 🔴 | | correctness_compile_to_bitcode | *PASS* 🟢 | | correctness_lazy_convolution | *PASS* 🟢 | | correctness_image_wrapper | *PASS* 🟢 | @@ -314,7 +314,7 @@ modifications to run under Vulkan): | correctness_reorder_storage | *PASS* 🟢 | | correctness_bit_counting | *PASS* 🟢 | | correctness_tiled_matmul | *PASS* 🟢 | -| correctness_async_device_copy | *FAIL* 🔴 | +| correctness_async_device_copy | *FAIL* 🔴 | | correctness_lots_of_dimensions | *PASS* 🟢 | | correctness_interleave | *PASS* 🟢 | | correctness_dynamic_reduction_bounds | *PASS* 🟢 | @@ -323,13 +323,13 @@ modifications to run under Vulkan): | correctness_unroll_dynamic_loop | *PASS* 🟢 | | correctness_buffer_t | *PASS* 🟢 | | correctness_hello_gpu | *PASS* 🟢 | -| correctness_gpu_object_lifetime_2 | *FAIL* 🔴 | +| correctness_gpu_object_lifetime_2 | *PASS* 🟢 | | correctness_update_chunk | *PASS* 🟢 | | correctness_autodiff | *PASS* 🟢 | | correctness_extern_consumer | *PASS* 🟢 | | correctness_func_wrapper | *PASS* 🟢 | | correctness_bounds_of_multiply | *PASS* 🟢 | -| correctness_gpu_store_in_register_with_no_lanes_loop | *FAIL* 🔴 | +| correctness_gpu_store_in_register_with_no_lanes_loop | *FAIL* 🔴 | | correctness_gpu_condition_lifting | *PASS* 🟢 | | correctness_extern_consumer_tiled | *PASS* 🟢 | | correctness_float16_t_neon_op_check | *PASS* 🟢 | @@ -340,15 +340,15 @@ modifications to run under Vulkan): | correctness_process_some_tiles | *PASS* 🟢 | | correctness_parameter_constraints | *PASS* 🟢 | | correctness_callable | *PASS* 🟢 | -| correctness_bounds_inference | *FAIL* 🔴 | +| correctness_bounds_inference | *PASS* 🟢 | | correctness_indexing_access_undef | *PASS* 🟢 | | correctness_partial_realization | *PASS* 🟢 | -| correctness_gpu_mixed_dimensionality | *FAIL* 🔴 | +| correctness_gpu_mixed_dimensionality | *FAIL* 🔴 | | correctness_uninitialized_read | *PASS* 🟢 | | correctness_unsafe_dedup_lets | *PASS* 🟢 | | correctness_output_larger_than_two_gigs | *PASS* 🟢 | | correctness_obscure_image_references | *PASS* 🟢 | -| correctness_chunk | *FAIL* 🔴 | +| correctness_chunk | *PASS* 🟢 | | correctness_vectorized_load_from_vectorized_allocation | *PASS* 🟢 | | correctness_load_library | *PASS* 🟢 | | correctness_compute_inside_guard | *PASS* 🟢 | @@ -358,13 +358,13 @@ modifications to run under Vulkan): | correctness_vectorized_initialization | *PASS* 🟢 | | correctness_loop_level_generator_param | *PASS* 🟢 | | 
correctness_two_vector_args | *PASS* 🟢 | -| correctness_argmax | *FAIL* 🔴 | +| correctness_argmax | *FAIL* 🔴 | | correctness_custom_auto_scheduler | *PASS* 🟢 | | correctness_shadowed_bound | *PASS* 🟢 | | correctness_inlined_generator | *PASS* 🟢 | -| correctness_math | *FAIL* 🔴 | +| correctness_math | *FAIL* 🔴 | | correctness_gpu_different_blocks_threads_dimensions | *PASS* 🟢 | -| correctness_extern_stage_on_device | *FAIL* 🔴 | +| correctness_extern_stage_on_device | *PASS* 🟢 | | correctness_bound | *PASS* 🟢 | | correctness_popc_clz_ctz_bounds | *PASS* 🟢 | | correctness_bounds | *PASS* 🟢 | @@ -383,17 +383,17 @@ modifications to run under Vulkan): | correctness_thread_safety | *PASS* 🟢 | | correctness_fuse_gpu_threads | *PASS* 🟢 | | correctness_split_reuse_inner_name_bug | *PASS* 🟢 | -| correctness_gpu_jit_explicit_copy_to_device | *FAIL* 🔴 | +| correctness_gpu_jit_explicit_copy_to_device | *PASS* 🟢 | | correctness_tuple_select | *PASS* 🟢 | -| correctness_device_buffer_copy | *FAIL* 🔴 | +| correctness_device_buffer_copy | *FAIL* 🔴 | | correctness_pseudostack_shares_slots | *PASS* 🟢 | | correctness_lots_of_loop_invariants | *PASS* 🟢 | | correctness_fuzz_simplify | *PASS* 🟢 | | correctness_div_round_to_zero | *PASS* 🟢 | | correctness_rfactor | *PASS* 🟢 | | correctness_custom_jit_context | *PASS* 🟢 | -| correctness_round | *FAIL* 🔴 | -| correctness_device_slice | *FAIL* 🔴 | +| correctness_round | *FAIL* 🔴 | +| correctness_device_slice | *FAIL* 🔴 | | correctness_iterate_over_circle | *PASS* 🟢 | | correctness_vector_print_bug | *PASS* 🟢 | | correctness_mux | *PASS* 🟢 | @@ -401,18 +401,18 @@ modifications to run under Vulkan): | correctness_parallel_nested_1 | *PASS* 🟢 | | correctness_compile_to_multitarget | *PASS* 🟢 | | correctness_bounds_inference_outer_split | *PASS* 🟢 | -| correctness_leak_device_memory | *FAIL* 🔴 | +| correctness_leak_device_memory | *PASS* 🟢 | | correctness_reduction_schedule | *PASS* 🟢 | | correctness_many_small_extern_stages | *PASS* 🟢 | | correctness_parallel_alloc | *PASS* 🟢 | -| correctness_multiple_outputs | *FAIL* 🔴 | +| correctness_multiple_outputs | *FAIL* 🔴 | | correctness_vectorize_nested | *PASS* 🟢 | | correctness_bad_likely | *PASS* 🟢 | | correctness_sliding_reduction | *PASS* 🟢 | | correctness_bounds_of_split | *PASS* 🟢 | | correctness_erf | *PASS* 🟢 | | correctness_float16_t_image_type | *PASS* 🟢 | -| correctness_gpu_non_monotonic_shared_mem_size | *FAIL* 🔴 | +| correctness_gpu_non_monotonic_shared_mem_size | *FAIL* 🔴 | | correctness_extern_reorder_storage | *PASS* 🟢 | | correctness_gather | *PASS* 🟢 | | correctness_gpu_many_kernels | *PASS* 🟢 | @@ -420,14 +420,14 @@ modifications to run under Vulkan): | correctness_strict_float_bounds | *PASS* 🟢 | | correctness_bounds_of_abs | *PASS* 🟢 | | correctness_tuple_vector_reduce | *PASS* 🟢 | -| correctness_debug_to_file_reorder | *FAIL* 🔴 | +| correctness_debug_to_file_reorder | *PASS* 🟢 | | correctness_vectorized_reduction_bug | *PASS* 🟢 | | correctness_input_larger_than_two_gigs | *PASS* 🟢 | | correctness_computed_index | *PASS* 🟢 | -| correctness_reduction_non_rectangular | *FAIL* 🔴 | +| correctness_reduction_non_rectangular | *PASS* 🟢 | | correctness_left_shift_negative | *PASS* 🟢 | | correctness_set_custom_trace | *PASS* 🟢 | -| correctness_vectorized_gpu_allocation | *FAIL* 🔴 | +| correctness_vectorized_gpu_allocation | *FAIL* 🔴 | | correctness_split_store_compute | *PASS* 🟢 | | correctness_c_function | *PASS* 🟢 | | correctness_specialize | *PASS* 🟢 | @@ -439,12 +439,12 @@ modifications to run under Vulkan): | 
correctness_atomics | *PASS* 🟢 | | correctness_multipass_constraints | *PASS* 🟢 | | correctness_target | *PASS* 🟢 | -| correctness_tuple_reduction | *FAIL* 🔴 | -| correctness_dilate3x3 | *FAIL* 🔴 | +| correctness_tuple_reduction | *FAIL* 🔴 | +| correctness_dilate3x3 | *PASS* 🟢 | | correctness_image_io | *PASS* 🟢 | -| correctness_gpu_param_allocation | *FAIL* 🔴 | +| correctness_gpu_param_allocation | *FAIL* 🔴 | | correctness_reschedule | *PASS* 🟢 | -| correctness_isnan | *FAIL* 🔴 | +| correctness_isnan | *FAIL* 🔴 | | correctness_halide_buffer | *PASS* 🟢 | | correctness_bounds_of_cast | *PASS* 🟢 | | correctness_handle | *PASS* 🟢 | @@ -453,13 +453,13 @@ modifications to run under Vulkan): | correctness_extern_producer | *PASS* 🟢 | | correctness_shift_by_unsigned_negated | *PASS* 🟢 | | correctness_circular_reference_leak | *PASS* 🟢 | -| correctness_specialize_to_gpu | *FAIL* 🔴 | -| correctness_device_copy_at_inner_loop | *FAIL* 🔴 | +| correctness_specialize_to_gpu | *PASS* 🟢 | +| correctness_device_copy_at_inner_loop | *PASS* 🟢 | | correctness_fit_function | *PASS* 🟢 | | correctness_compute_at_reordered_update_stage | *PASS* 🟢 | | correctness_non_nesting_extern_bounds_query | *PASS* 🟢 | | correctness_bitwise_ops | *PASS* 🟢 | -| correctness_gpu_data_flows | *FAIL* 🔴 | +| correctness_gpu_data_flows | *PASS* 🟢 | | correctness_cast | *PASS* 🟢 | | correctness_stack_allocations | *PASS* 🟢 | | correctness_sliding_backwards | *PASS* 🟢 | @@ -474,24 +474,24 @@ modifications to run under Vulkan): | correctness_fuzz_bounds | *PASS* 🟢 | | correctness_strided_load | *PASS* 🟢 | | correctness_bound_storage | *PASS* 🟢 | -| correctness_gpu_cpu_simultaneous_read | *FAIL* 🔴 | +| correctness_gpu_cpu_simultaneous_read | *PASS* 🟢 | | correctness_fast_trigonometric | *PASS* 🟢 | -| correctness_compute_with | *FAIL* 🔴 | -| correctness_gpu_allocation_cache | *FAIL* 🔴 | +| correctness_compute_with | *PASS* 🟢 | +| correctness_gpu_allocation_cache | *FAIL* 🔴 | | correctness_compile_to | *PASS* 🟢 | | correctness_extern_output_expansion | *PASS* 🟢 | | correctness_gpu_texture | *PASS* 🟢 | | correctness_many_updates | *PASS* 🟢 | | correctness_memoize | *PASS* 🟢 | -| correctness_gpu_multi_kernel | *FAIL* 🔴 | +| correctness_gpu_multi_kernel | *PASS* 🟢 | | correctness_extern_error | *PASS* 🟢 | | correctness_partition_max_filter | *PASS* 🟢 | | correctness_bound_small_allocations | *PASS* 🟢 | -| correctness_median3x3 | *FAIL* 🔴 | +| correctness_median3x3 | *PASS* 🟢 | | correctness_reuse_stack_alloc | *PASS* 🟢 | -| correctness_debug_to_file | *FAIL* 🔴 | +| correctness_debug_to_file | *PASS* 🟢 | | correctness_embed_bitcode | *PASS* 🟢 | -| correctness_gpu_large_alloc | *FAIL* 🔴 | +| correctness_gpu_large_alloc | *PASS* 🟢 | | correctness_pytorch | *PASS* 🟢 | | correctness_in_place | *PASS* 🟢 | | correctness_exception | *PASS* 🟢 | @@ -502,9 +502,9 @@ modifications to run under Vulkan): | correctness_loop_invariant_extern_calls | *PASS* 🟢 | | correctness_skip_stages_external_array_functions | *PASS* 🟢 | | correctness_chunk_sharing | *PASS* 🟢 | -| correctness_multi_way_select | *PASS* 🟢 | -| correctness_async_copy_chain | *FAIL* 🔴 | -| correctness_gpu_give_input_buffers_device_allocations | *FAIL* 🔴 | +| correctness_multi_way_select | *FAIL* 🔴 | +| correctness_async_copy_chain | *FAIL* 🔴 | +| correctness_gpu_give_input_buffers_device_allocations | *PASS* 🟢 | | correctness_oddly_sized_output | *PASS* 🟢 | | correctness_fuzz_cse | *PASS* 🟢 | | correctness_half_native_interleave | *PASS* 🟢 | @@ -517,9 +517,9 @@ modifications to run under 
Vulkan): | correctness_unrolled_reduction | *PASS* 🟢 | | correctness_tracing | *PASS* 🟢 | | correctness_simplified_away_embedded_image | *PASS* 🟢 | -| correctness_mul_div_mod | *FAIL* 🔴 | +| correctness_mul_div_mod | *FAIL* 🔴 | | correctness_infer_arguments | *PASS* 🟢 | -| correctness_convolution | *FAIL* 🔴 | +| correctness_convolution | *FAIL* 🔴 | | correctness_truncated_pyramid | *PASS* 🟢 | | correctness_for_each_element | *PASS* 🟢 | | correctness_store_in | *PASS* 🟢 | @@ -546,19 +546,19 @@ modifications to run under Vulkan): | correctness_unroll_huge_mux | *PASS* 🟢 | | correctness_parallel | *PASS* 🟢 | | correctness_code_explosion | *PASS* 🟢 | -| correctness_gpu_dynamic_shared | *FAIL* 🔴 | +| correctness_gpu_dynamic_shared | *FAIL* 🔴 | | correctness_div_by_zero | *PASS* 🟢 | -| correctness_convolution_multiple_kernels | *FAIL* 🔴 | +| correctness_convolution_multiple_kernels | *FAIL* 🔴 | | correctness_deinterleave4 | *PASS* 🟢 | | correctness_align_bounds | *PASS* 🟢 | | correctness_gpu_bounds_inference_failure | *PASS* 🟢 | -| correctness_interpreter | *FAIL* 🔴 | +| correctness_interpreter | *FAIL* 🔴 | | correctness_parallel_gpu_nested | *PASS* 🟢 | -| correctness_gpu_thread_barrier | *FAIL* 🔴 | +| correctness_gpu_thread_barrier | *FAIL* 🔴 | | correctness_debug_to_file_multiple_outputs | *PASS* 🟢 | | correctness_gpu_free_sync | *PASS* 🟢 | | correctness_out_constraint | *PASS* 🟢 | -| correctness_gpu_specialize | *FAIL* 🔴 | +| correctness_gpu_specialize | *FAIL* 🔴 | | correctness_register_shuffle | *PASS* 🟢 | | correctness_constant_expr | *PASS* 🟢 | | correctness_out_of_memory | *PASS* 🟢 | From 095248f7b0deb15a5ffa64748322fb6519fd8414 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 10 Oct 2022 20:48:48 -0700 Subject: [PATCH 040/166] Fix formatting --- src/runtime/vulkan_context.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 01d9002b903d..ad747abe57f1 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -263,8 +263,13 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk void *extended_features_ptr = nullptr; void *standard_features_ptr = nullptr; - PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2KHR"); // v1.0+ - if (!vkGetPhysicalDeviceFeatures2KHR) { vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2"); } // v1.1+ + + // Look for v1.1+ device feature query method + PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2KHR"); // v1.0+ + if (!vkGetPhysicalDeviceFeatures2KHR) { + vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2"); + } + if (vkGetPhysicalDeviceFeatures2KHR) { debug(user_context) << "Vulkan: Querying for extended device features...\n"; vkGetPhysicalDeviceFeatures2KHR(*physical_device, &device_features_ext); From 732fc87f66d9a212528037b6de8ee2bfe8bf491a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 10 Oct 2022 21:01:42 -0700 Subject: [PATCH 041/166] Remove extraneous parentheses for is_array_type() --- src/SpirvIR.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SpirvIR.cpp 
b/src/SpirvIR.cpp index fc5605cf207d..207a3ae12f6b 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -1722,7 +1722,7 @@ bool SpvBuilder::is_scalar_type(SpvId id) const { bool SpvBuilder::is_array_type(SpvId id) const { SpvKind kind = kind_of(id); - if ((kind == SpvArrayTypeId)) { + if (kind == SpvArrayTypeId) { return true; } return false; From aae0ad93dff0456cbaef847a105347a73895fa53 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 12 Oct 2022 15:11:38 -0700 Subject: [PATCH 042/166] Add Vulkan library to linkage fo Halide generator helpers --- cmake/HalideGeneratorHelpers.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake index ca622ca634a8..0558d0454ee0 100644 --- a/cmake/HalideGeneratorHelpers.cmake +++ b/cmake/HalideGeneratorHelpers.cmake @@ -672,6 +672,11 @@ function(_Halide_target_link_gpu_libs TARGET VISIBILITY) endif () endif () + if ("${ARGN}" MATCHES "vulkan") + find_package(Vulkan REQUIRED) + target_link_libraries(${TARGET} ${VISIBILITY} Vulkan::Vulkan) + endif () + if ("${ARGN}" MATCHES "metal") find_library(FOUNDATION_LIBRARY Foundation REQUIRED) find_library(METAL_LIBRARY Metal REQUIRED) From 2a6700fda3c281e25bb14063d6a756bf9b50c8ab Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 12 Oct 2022 15:28:34 -0700 Subject: [PATCH 043/166] Add SPIR-V formatted output (for debugging) --- src/SpirvIR.cpp | 1702 +++++++++++++++++++++++++++++++++++++++++++++-- src/SpirvIR.h | 68 +- 2 files changed, 1718 insertions(+), 52 deletions(-) diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 207a3ae12f6b..1141efb6322c 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -8,6 +8,16 @@ namespace Internal { namespace { +/** Returns the major version of the SPIR-V header version indicator **/ +inline uint32_t spirv_major_version(uint32_t version) { + return ((version >> 16) & 0xff); +} + +/** Returns the minor version of the SPIR-V header version indicator **/ +inline uint32_t spirv_minor_version(uint32_t version) { + return ((version >> 8) & 0xff); +} + template typename std::enable_if::value, T>::type constexpr rotl(const T n, const S i) { const T m = (std::numeric_limits::digits - 1); @@ -64,13 +74,13 @@ void SpvInstruction::set_op_code(SpvOp op_code) { void SpvInstruction::add_operand(SpvId id) { check_defined(); contents->operands.push_back(id); - contents->immediates.push_back(false); + contents->value_types.push_back(SpvOperandId); } -void SpvInstruction::add_immediate(SpvId id) { +void SpvInstruction::add_immediate(SpvId id, SpvValueType value_type) { check_defined(); contents->operands.push_back(id); - contents->immediates.push_back(true); + contents->value_types.push_back(value_type); } SpvId SpvInstruction::result_id() const { @@ -88,11 +98,26 @@ SpvOp SpvInstruction::op_code() const { return contents->op_code; } -SpvId SpvInstruction::operand(uint32_t index) { +const void* SpvInstruction::data(uint32_t index) const { + check_defined(); + return &(contents->operands[index]); +} + +SpvId SpvInstruction::operand(uint32_t index) const { check_defined(); return contents->operands[index]; } +SpvValueType SpvInstruction::value_type(uint32_t index) const { + check_defined(); + return contents->value_types[index]; +} + +const SpvInstruction::Operands& SpvInstruction::operands() const { + check_defined(); + return contents->operands; +} + bool SpvInstruction::has_type() const { if (!is_defined()) { return false; @@ -113,7 +138,7 @@ bool SpvInstruction::is_defined() const { bool 
SpvInstruction::is_immediate(uint32_t index) const { check_defined(); - return contents->immediates[index]; + return (contents->value_types[index] != SpvOperandId); } uint32_t SpvInstruction::length() const { @@ -126,7 +151,7 @@ SpvBlock SpvInstruction::block() const { return contents->block; } -void SpvInstruction::add_data(uint32_t bytes, const void *data) { +void SpvInstruction::add_data(uint32_t bytes, const void *data, SpvValueType value_type) { check_defined(); uint32_t extra_words = (bytes + 3) / 4; const uint8_t *ptr = (const uint8_t *)data; @@ -136,14 +161,14 @@ void SpvInstruction::add_data(uint32_t bytes, const void *data) { SpvId entry = 0; memcpy(&entry, ptr, copy_size); bytes_copied += copy_size; - add_immediate(entry); + add_immediate(entry, value_type); ptr += 4; } } void SpvInstruction::add_string(const std::string &str) { check_defined(); - add_data(str.length() + 1, (const void *)str.c_str()); + add_data(str.length() + 1, (const void *)str.c_str(), SpvStringData); } void SpvInstruction::check_defined() const { @@ -372,6 +397,16 @@ SpvInstruction SpvFunction::declaration() const { return contents->declaration; } +const SpvFunction::Blocks& SpvFunction::blocks() const { + check_defined(); + return contents->blocks; +} + +const SpvFunction::Parameters& SpvFunction::parameters() const { + check_defined(); + return contents->parameters; +} + SpvModule SpvFunction::module() const { check_defined(); return contents->parent; @@ -510,6 +545,11 @@ uint32_t SpvModule::binding_count() const { return contents->binding_count; } +uint32_t SpvModule::version_format() const { + check_defined(); + return contents->version_format; +} + SpvSourceLanguage SpvModule::source_language() const { check_defined(); return contents->source_language; @@ -520,11 +560,77 @@ SpvAddressingModel SpvModule::addressing_model() const { return contents->addressing_model; } +SpvModule::Imports SpvModule::imports() const { + check_defined(); + SpvModule::Imports results; + results.reserve(contents->imports.size()); + for(const SpvModuleContents::Imports::value_type& v : contents->imports) { + SpvModule::ImportDefinition definition = {v.second, v.first}; + results.push_back(definition); + } + return results; +} + +SpvModule::Extensions SpvModule::extensions() const { + check_defined(); + SpvModule::Extensions results; + results.reserve(contents->extensions.size()); + for(const SpvModuleContents::Extensions::value_type& v: contents->extensions) { + results.push_back(v); + } + return results; +} + +SpvModule::Capabilities SpvModule::capabilities() const { + check_defined(); + SpvModule::Capabilities results; + results.reserve(contents->capabilities.size()); + for(const SpvModuleContents::Capabilities::value_type& v: contents->capabilities) { + results.push_back(v); + } + return results; +} + const SpvModule::Instructions &SpvModule::execution_modes() const { check_defined(); return contents->execution_modes; } +const SpvModule::Instructions &SpvModule::debug_source() const { + check_defined(); + return contents->debug_source; +} + +const SpvModule::Instructions &SpvModule::debug_symbols() const { + check_defined(); + return contents->debug_symbols; +} + +const SpvModule::Instructions &SpvModule::annotations() const { + check_defined(); + return contents->annotations; +} + +const SpvModule::Instructions &SpvModule::type_definitions() const { + check_defined(); + return contents->types; +} + +const SpvModule::Instructions &SpvModule::global_constants() const { + check_defined(); + return contents->constants; +} + 
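// A simplified, standalone sketch (not Halide's actual SpvInstruction class) of the
// per-operand value-type tagging this patch introduces: operand words and immediate
// literals share one word list, and a parallel list of value kinds lets a textual
// dumper tell result ids apart from integer literals or string data. All names here
// (Inst, ValueType, dump) are illustrative only; the decoration constant 33 is
// SpvDecorationBinding from the public SPIR-V headers.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

enum ValueType { OperandId, IntegerLiteral, StringData };

struct Inst {
    std::string op_name;
    std::vector<uint32_t> words;   // operand ids and literal values, in order
    std::vector<ValueType> types;  // one kind per word

    void add_operand(uint32_t id) {
        words.push_back(id);
        types.push_back(OperandId);
    }
    void add_immediate(uint32_t value, ValueType type) {
        words.push_back(value);
        types.push_back(type);
    }
};

// Renders "%id" for operand ids and plain numbers for literals, mirroring how a
// disassembler can use the per-operand kinds instead of a single "immediate" flag.
void dump(const Inst &inst) {
    std::cout << inst.op_name;
    for (size_t i = 0; i < inst.words.size(); i++) {
        if (inst.types[i] == OperandId) {
            std::cout << " %" << inst.words[i];
        } else {
            std::cout << " " << inst.words[i];
        }
    }
    std::cout << "\n";
}

int main() {
    Inst inst;
    inst.op_name = "OpDecorate";
    inst.add_operand(3);                     // target id -> "%3"
    inst.add_immediate(33, IntegerLiteral);  // decoration (Binding) -> "33"
    inst.add_immediate(0, IntegerLiteral);   // binding index -> "0"
    dump(inst);                              // prints: OpDecorate %3 33 0
    return 0;
}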
+const SpvModule::Instructions &SpvModule::global_variables() const { + check_defined(); + return contents->globals; +} + +const SpvModule::Functions &SpvModule::function_definitions() const { + check_defined(); + return contents->functions; +} + SpvMemoryModel SpvModule::memory_model() const { check_defined(); return contents->memory_model; @@ -587,16 +693,28 @@ bool SpvModule::is_capability_required(SpvCapability capability) const { SpvModule::EntryPointNames SpvModule::entry_point_names() const { check_defined(); - SpvModule::EntryPointNames entry_point_names(contents->entry_points.size()); + SpvModule::EntryPointNames entry_point_names; + entry_point_names.reserve(contents->entry_points.size()); for (const SpvModuleContents::EntryPoints::value_type &v : contents->entry_points) { entry_point_names.push_back(v.first); } return entry_point_names; } +SpvModule::Instructions SpvModule::entry_points() const { + check_defined(); + SpvModule::Instructions entry_points; + entry_points.reserve(contents->entry_points.size()); + for (const SpvModuleContents::EntryPoints::value_type &v : contents->entry_points) { + entry_points.push_back(v.second); + } + return entry_points; +} + SpvModule::ImportNames SpvModule::import_names() const { check_defined(); - SpvModule::ImportNames results(contents->imports.size()); + SpvModule::ImportNames results; + results.reserve(contents->imports.size()); for (const SpvModuleContents::Imports::value_type &v : contents->imports) { results.push_back(v.first); } @@ -626,7 +744,7 @@ void SpvModule::encode(SpvBinary &binary) const { // 0. Encode the header binary.push_back(SpvMagicNumber); - binary.push_back(SpvVersion); + binary.push_back(contents->version_format); binary.push_back(contents->source_language); binary.push_back(contents->binding_count); // last id bound to this module (aka last id used) binary.push_back(0); // Reserved for schema. 
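// A minimal, self-contained sketch (not Halide code) of the five-word SPIR-V module
// header that the encode() hunk above emits, and of the version-word packing that the
// spirv_major_version()/spirv_minor_version() helpers added earlier in this patch
// decode. The magic number and word layout follow the public SPIR-V specification;
// the names kSpvMagic, make_spirv_version, and make_header are illustrative only.
#include <cassert>
#include <cstdint>
#include <vector>

constexpr uint32_t kSpvMagic = 0x07230203u;

// SPIR-V packs the version as 0x00MMmm00: major in bits 16..23, minor in bits 8..15.
constexpr uint32_t make_spirv_version(uint32_t major, uint32_t minor) {
    return (major << 16) | (minor << 8);
}

std::vector<uint32_t> make_header(uint32_t version, uint32_t generator, uint32_t bound) {
    // Word 0: magic, word 1: version, word 2: generator magic,
    // word 3: id bound (one past the largest id used), word 4: schema (reserved, 0).
    return {kSpvMagic, version, generator, bound, 0u};
}

int main() {
    const uint32_t v13 = make_spirv_version(1, 3);  // 0x00010300
    assert(((v13 >> 16) & 0xff) == 1);              // same math as spirv_major_version()
    assert(((v13 >> 8) & 0xff) == 3);               // same math as spirv_minor_version()

    std::vector<uint32_t> header = make_header(v13, /*generator*/ 0, /*bound*/ 42);
    assert(header.size() == 5 && header[0] == kSpvMagic);
    return 0;
}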
@@ -1240,7 +1358,7 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { ConstantKey constant_key = make_constant_key(array_size_type, &array_size); SpvId array_size_id = make_id(SpvIntConstantId); SpvId array_size_type_id = add_type(array_size_type); - SpvInstruction array_size_inst = SpvFactory::constant(array_size_id, array_size_type_id, array_size_type.bytes(), &array_size); + SpvInstruction array_size_inst = SpvFactory::constant(array_size_id, array_size_type_id, array_size_type.bytes(), &array_size, SpvIntegerData); module.add_type(array_size_inst); // needs to be defined in the type section (prior to its use in the array_type inst) constant_map[constant_key] = array_size_id; @@ -1522,11 +1640,14 @@ SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *d } SpvId result_id = SpvInvalidId; + SpvValueType value_type = SpvInvalidValueType; if (scalar_type.is_float()) { result_id = make_id(SpvFloatConstantId); + value_type = SpvFloatData; debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const float *)(data)) << "\n"; } else if (scalar_type.is_int_or_uint()) { result_id = make_id(SpvIntConstantId); + value_type = SpvIntegerData; debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const int32_t *)(data)) << "\n"; } else { internal_error << "SPIRV: Unsupported type:" << scalar_type << "\n"; @@ -1534,7 +1655,7 @@ SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *d } SpvId type_id = add_type(scalar_type); - SpvInstruction inst = SpvFactory::constant(result_id, type_id, scalar_type.bytes(), data); + SpvInstruction inst = SpvFactory::constant(result_id, type_id, scalar_type.bytes(), data, value_type); module.add_constant(inst); constant_map[constant_key] = result_id; return result_id; @@ -1775,8 +1896,8 @@ SpvInstruction SpvFactory::label(SpvId result_id) { SpvInstruction SpvFactory::debug_line(SpvId string_id, uint32_t line, uint32_t column) { SpvInstruction inst = SpvInstruction::make(SpvOpLine); inst.add_operand(string_id); - inst.add_immediate(line); - inst.add_immediate(column); + inst.add_immediate(line, SpvIntegerLiteral); + inst.add_immediate(column, SpvIntegerLiteral); return inst; } @@ -1797,9 +1918,9 @@ SpvInstruction SpvFactory::debug_symbol(SpvId target_id, const std::string &symb SpvInstruction SpvFactory::decorate(SpvId target_id, SpvDecoration decoration_type, const SpvFactory::Literals &literals) { SpvInstruction inst = SpvInstruction::make(SpvOpDecorate); inst.add_operand(target_id); - inst.add_immediate(decoration_type); + inst.add_immediate(decoration_type, SpvIntegerLiteral); for (uint32_t l : literals) { - inst.add_immediate(l); + inst.add_immediate(l, SpvIntegerLiteral); } return inst; } @@ -1807,10 +1928,10 @@ SpvInstruction SpvFactory::decorate(SpvId target_id, SpvDecoration decoration_ty SpvInstruction SpvFactory::decorate_member(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const SpvFactory::Literals &literals) { SpvInstruction inst = SpvInstruction::make(SpvOpMemberDecorate); inst.add_operand(struct_type_id); - inst.add_immediate(member_index); - inst.add_immediate(decoration_type); + inst.add_immediate(member_index, SpvIntegerLiteral); + inst.add_immediate(decoration_type, SpvIntegerLiteral); for (uint32_t l : literals) { - inst.add_immediate(l); + inst.add_immediate(l, SpvIntegerLiteral); } return inst; } @@ -1855,15 +1976,15 @@ SpvInstruction 
SpvFactory::bool_type(SpvId bool_type_id) { SpvInstruction SpvFactory::integer_type(SpvId int_type_id, uint32_t bits, uint32_t signedness) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeInt); inst.set_result_id(int_type_id); - inst.add_immediate(bits); - inst.add_immediate(signedness); + inst.add_immediate(bits, SpvIntegerLiteral); + inst.add_immediate(signedness, SpvIntegerLiteral); return inst; } SpvInstruction SpvFactory::float_type(SpvId float_type_id, uint32_t bits) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeFloat); inst.set_result_id(float_type_id); - inst.add_immediate(bits); + inst.add_immediate(bits, SpvIntegerLiteral); return inst; } @@ -1871,7 +1992,7 @@ SpvInstruction SpvFactory::vector_type(SpvId vector_type_id, SpvId element_type_ SpvInstruction inst = SpvInstruction::make(SpvOpTypeVector); inst.set_result_id(vector_type_id); inst.add_operand(element_type_id); - inst.add_immediate(vector_size); + inst.add_immediate(vector_size, SpvIntegerLiteral); return inst; } @@ -1902,7 +2023,7 @@ SpvInstruction SpvFactory::runtime_array_type(SpvId result_type_id, SpvId base_t SpvInstruction SpvFactory::pointer_type(SpvId pointer_type_id, SpvStorageClass storage_class, SpvId base_type_id) { SpvInstruction inst = SpvInstruction::make(SpvOpTypePointer); inst.set_result_id(pointer_type_id); - inst.add_immediate(storage_class); + inst.add_immediate(storage_class, SpvIntegerLiteral); inst.add_operand(base_type_id); return inst; } @@ -1917,11 +2038,11 @@ SpvInstruction SpvFactory::function_type(SpvId function_type_id, SpvId return_ty return inst; } -SpvInstruction SpvFactory::constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data) { +SpvInstruction SpvFactory::constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data, SpvValueType value_type) { SpvInstruction inst = SpvInstruction::make(SpvOpConstant); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_data(bytes, data); + inst.add_data(bytes, data, value_type); return inst; } @@ -1961,7 +2082,7 @@ SpvInstruction SpvFactory::variable(SpvId result_id, SpvId result_type_id, uint3 SpvInstruction inst = SpvInstruction::make(SpvOpVariable); inst.set_type_id(result_type_id); inst.set_result_id(result_id); - inst.add_immediate(storage_class); + inst.add_immediate(storage_class, SpvIntegerLiteral); if (initializer_id != SpvInvalidId) { inst.add_operand(initializer_id); } @@ -1972,7 +2093,7 @@ SpvInstruction SpvFactory::function(SpvId return_type_id, SpvId func_id, uint32_ SpvInstruction inst = SpvInstruction::make(SpvOpFunction); inst.set_type_id(return_type_id); inst.set_result_id(func_id); - inst.add_immediate(control_mask); + inst.add_immediate(control_mask, SpvBitMaskLiteral); inst.add_operand(func_type_id); return inst; } @@ -2000,7 +2121,7 @@ SpvInstruction SpvFactory::return_stmt(SpvId return_value_id) { SpvInstruction SpvFactory::entry_point(SpvId exec_model, SpvId func_id, const std::string &name, const SpvFactory::Variables &variables) { SpvInstruction inst = SpvInstruction::make(SpvOpEntryPoint); - inst.add_immediate(exec_model); + inst.add_immediate(exec_model, SpvIntegerLiteral); inst.add_operand(func_id); inst.add_string(name); for (SpvId var : variables) { @@ -2011,18 +2132,18 @@ SpvInstruction SpvFactory::entry_point(SpvId exec_model, SpvId func_id, const st SpvInstruction SpvFactory::memory_model(SpvAddressingModel addressing_model, SpvMemoryModel memory_model) { SpvInstruction inst = SpvInstruction::make(SpvOpMemoryModel); - inst.add_immediate(addressing_model); - 
inst.add_immediate(memory_model); + inst.add_immediate(addressing_model, SpvIntegerLiteral); + inst.add_immediate(memory_model, SpvIntegerLiteral); return inst; } SpvInstruction SpvFactory::exec_mode_local_size(SpvId function_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z) { SpvInstruction inst = SpvInstruction::make(SpvOpExecutionMode); inst.add_operand(function_id); - inst.add_immediate(SpvExecutionModeLocalSize); - inst.add_immediate(wg_size_x); - inst.add_immediate(wg_size_y); - inst.add_immediate(wg_size_z); + inst.add_immediate(SpvExecutionModeLocalSize, SpvIntegerLiteral); + inst.add_immediate(wg_size_x, SpvIntegerLiteral); + inst.add_immediate(wg_size_y, SpvIntegerLiteral); + inst.add_immediate(wg_size_z, SpvIntegerLiteral); return inst; } @@ -2109,7 +2230,7 @@ SpvInstruction SpvFactory::load(SpvId type_id, SpvId result_id, SpvId ptr_id, ui inst.set_type_id(type_id); inst.set_result_id(result_id); inst.add_operand(ptr_id); - inst.add_immediate(access_mask); + inst.add_immediate(access_mask, SpvBitMaskLiteral); return inst; } @@ -2117,7 +2238,7 @@ SpvInstruction SpvFactory::store(SpvId ptr_id, SpvId obj_id, uint32_t access_mas SpvInstruction inst = SpvInstruction::make(SpvOpStore); inst.add_operand(ptr_id); inst.add_operand(obj_id); - inst.add_immediate(access_mask); + inst.add_immediate(access_mask, SpvBitMaskLiteral); return inst; } @@ -2127,7 +2248,7 @@ SpvInstruction SpvFactory::composite_extract(SpvId type_id, SpvId result_id, Spv inst.set_result_id(result_id); inst.add_operand(composite_id); for (SpvId i : indices) { - inst.add_immediate(i); + inst.add_immediate(i, SpvIntegerLiteral); } return inst; } @@ -2148,7 +2269,7 @@ SpvInstruction SpvFactory::vector_insert_dynamic(SpvId type_id, SpvId result_id, inst.set_result_id(result_id); inst.add_operand(vector_id); inst.add_operand(value_id); - inst.add_immediate(index); + inst.add_immediate(index, SpvIntegerLiteral); return inst; } @@ -2159,7 +2280,7 @@ SpvInstruction SpvFactory::vector_shuffle(SpvId type_id, SpvId result_id, SpvId inst.add_operand(src_a_id); inst.add_operand(src_b_id); for (SpvId i : indices) { - inst.add_immediate(i); + inst.add_immediate(i, SpvIntegerLiteral); } return inst; } @@ -2208,7 +2329,7 @@ SpvInstruction SpvFactory::conditional_branch(SpvId condition_label_id, SpvId tr inst.add_operand(true_label_id); inst.add_operand(false_label_id); for (uint32_t w : weights) { - inst.add_immediate(w); + inst.add_immediate(w, SpvIntegerLiteral); } return inst; } @@ -2271,14 +2392,14 @@ SpvInstruction SpvFactory::loop_merge(SpvId merge_label_id, SpvId continue_label SpvInstruction inst = SpvInstruction::make(SpvOpLoopMerge); inst.add_operand(merge_label_id); inst.add_operand(continue_label_id); - inst.add_immediate(loop_control_mask); + inst.add_immediate(loop_control_mask, SpvBitMaskLiteral); return inst; } SpvInstruction SpvFactory::selection_merge(SpvId merge_label_id, uint32_t selection_control_mask) { SpvInstruction inst = SpvInstruction::make(SpvOpSelectionMerge); inst.add_operand(merge_label_id); - inst.add_immediate(selection_control_mask); + inst.add_immediate(selection_control_mask, SpvBitMaskLiteral); return inst; } @@ -2295,7 +2416,7 @@ SpvInstruction SpvFactory::phi(SpvId type_id, SpvId result_id, const SpvFactory: SpvInstruction SpvFactory::capability(const SpvCapability &capability) { SpvInstruction inst = SpvInstruction::make(SpvOpCapability); - inst.add_immediate(capability); + inst.add_immediate(capability, SpvIntegerLiteral); return inst; } @@ -2317,7 +2438,7 @@ SpvInstruction 
SpvFactory::extended(SpvId instruction_set_id, SpvId instruction_ inst.set_type_id(type_id); inst.set_result_id(result_id); inst.add_operand(instruction_set_id); - inst.add_immediate(instruction_number); + inst.add_immediate(instruction_number, SpvIntegerLiteral); for (SpvId o : operands) { inst.add_operand(o); } @@ -2454,6 +2575,1495 @@ void destroy(const SpvModuleContents *c) { delete c; } +// -- + + +std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ + if (!module.is_defined()) { + stream << "(undefined)"; + return stream; + } + + stream << "; SPIR-V\n"; + stream << "; Version: " + << std::to_string(spirv_major_version(module.version_format())) << "." + << std::to_string(spirv_minor_version(module.version_format())) << "\n"; + stream << "; Generator: Khronos; 0\n"; + stream << "; Bound: " << std::to_string(module.binding_count()) << "\n"; + stream << "; Schema: 0\n"; // reserved for future use + + SpvModule::Capabilities capabilities = module.capabilities(); + if(!capabilities.empty()) { + stream << "\n"; + stream << "; Capabilities\n"; + for (const SpvCapability &value : capabilities) { + SpvInstruction inst = SpvFactory::capability(value); + stream << inst; + } + } + + SpvModule::Extensions extensions = module.extensions(); + if(!extensions.empty()) { + stream << "\n"; + stream << "; Extensions\n"; + for (const std::string &value : extensions) { + SpvInstruction inst = SpvFactory::extension(value); + stream << inst; + } + } + + SpvModule::Imports imports = module.imports(); + if(!imports.empty()) { + stream << "\n"; + stream << "; Extended Instruction Set Imports\n"; + for (const SpvModule::Imports::value_type &v : imports) { + SpvInstruction inst = SpvFactory::import(v.first, v.second); + stream << inst; + } + } + + SpvInstruction memory_model = SpvFactory::memory_model(module.addressing_model(), module.memory_model()); + stream << "\n"; + stream << "; Memory Model\n"; + stream << memory_model; + + if(module.entry_point_count() > 0) { + stream << "\n"; + stream << "; Entry Points\n"; + SpvModule::EntryPointNames entry_point_names = module.entry_point_names(); + for( const std::string& name : entry_point_names) { + SpvInstruction inst = module.entry_point(name); + stream << "; " << name << "\n"; + stream << inst; + } + } + + for (const SpvInstruction &inst : module.execution_modes()) { + stream << inst; + } + + if(!module.debug_source().empty() || !module.debug_symbols().empty()) { + stream << "\n"; + stream << "; Debug Information\n"; + } + for (const SpvInstruction &inst : module.debug_source()) { + stream << inst; + } + for (const SpvInstruction &inst : module.debug_symbols()) { + stream << inst; + } + + if(!module.annotations().empty()) { + stream << "\n"; + stream << "; Annotations\n"; + for (const SpvInstruction &inst : module.annotations()) { + stream << inst; + } + } + + if(!module.type_definitions().empty()) { + stream << "\n"; + stream << "; Type Definitions\n"; + for (const SpvInstruction &inst : module.type_definitions()) { + stream << inst; + } + } + + if(!module.global_constants().empty()) { + stream << "\n"; + stream << "; Global Constants\n"; + for (const SpvInstruction &inst : module.global_constants()) { + stream << inst; + } + } + + if(!module.global_variables().empty()) { + stream << "\n"; + stream << "; Global Variables\n"; + for (const SpvInstruction &inst : module.global_variables()) { + stream << inst; + } + } + + if(!module.function_definitions().empty()) { + stream << "\n"; + stream << "; Function Definitions\n"; + for (const SpvFunction 
&func : module.function_definitions()) { + stream << func; + } + } + + return stream; +} + +std::ostream &operator<<(std::ostream &stream, const SpvFunction &func) { + if (!func.is_defined()) { + stream << "(undefined)"; + return stream; + } + stream << func.declaration(); + for (const SpvInstruction ¶m : func.parameters()) { + stream << param; + } + for (const SpvBlock &block : func.blocks()) { + stream << block; + } + SpvInstruction inst = SpvFactory::function_end(); + stream << inst; + return stream; +} + +std::ostream &operator<<(std::ostream &stream, const SpvBlock &block) { + if (!block.is_defined()) { + stream << "(undefined)"; + return stream; + } + + SpvInstruction label = SpvFactory::label(block.id()); + stream << label; + for (const SpvInstruction &variable : block.variables()) { + stream << variable; + } + for (const SpvInstruction &instruction : block.instructions()) { + stream << instruction; + } + + return stream; +} + +std::ostream &operator<<(std::ostream &stream, const SpvInstruction & inst) { + if (!inst.is_defined()) { + stream << "(undefined)"; + return stream; + } + + if (inst.has_result()) { + stream << std::string("%") << std::to_string(inst.result_id()); + stream << " = "; + } + + switch(inst.op_code()) { + case SpvOpNop: { + stream << "OpNop"; + break; + } + case SpvOpUndef: { + stream << "OpUndef"; + break; + } + case SpvOpSourceContinued: { + stream << "OpSourceContinued"; + break; + } + case SpvOpSource: { + stream << "OpSource"; + break; + } + case SpvOpSourceExtension: { + stream << "OpSourceExtension"; + break; + } + case SpvOpName: { + stream << "OpName"; + break; + } + case SpvOpMemberName: { + stream << "OpMemberName"; + break; + } + case SpvOpString: { + stream << "OpString"; + break; + } + case SpvOpLine: { + stream << "OpLine"; + break; + } + case SpvOpExtension: { + stream << "OpExtension"; + break; + } + case SpvOpExtInstImport: { + stream << "OpExtInstImport"; + break; + } + case SpvOpExtInst: { + stream << "OpExtInst"; + break; + } + case SpvOpMemoryModel: { + stream << "OpMemoryModel"; + break; + } + case SpvOpEntryPoint: { + stream << "OpEntryPoint"; + break; + } + case SpvOpExecutionMode: { + stream << "OpExecutionMode"; + break; + } + case SpvOpCapability: { + stream << "OpCapability"; + break; + } + case SpvOpTypeVoid: { + stream << "OpTypeVoid"; + break; + } + case SpvOpTypeBool: { + stream << "OpTypeBool"; + break; + } + case SpvOpTypeInt: { + stream << "OpTypeInt"; + break; + } + case SpvOpTypeFloat: { + stream << "OpTypeFloat"; + break; + } + case SpvOpTypeVector: { + stream << "OpTypeVector"; + break; + } + case SpvOpTypeMatrix: { + stream << "OpTypeMatrix"; + break; + } + case SpvOpTypeImage: { + stream << "OpTypeImage"; + break; + } + case SpvOpTypeSampler: { + stream << "OpTypeSampler"; + break; + } + case SpvOpTypeSampledImage: { + stream << "OpTypeSampledImage"; + break; + } + case SpvOpTypeArray: { + stream << "OpTypeArray"; + break; + } + case SpvOpTypeRuntimeArray: { + stream << "OpTypeRuntimeArray"; + break; + } + case SpvOpTypeStruct: { + stream << "OpTypeStruct"; + break; + } + case SpvOpTypeOpaque: { + stream << "OpTypeOpaque"; + break; + } + case SpvOpTypePointer: { + stream << "OpTypePointer"; + break; + } + case SpvOpTypeFunction: { + stream << "OpTypeFunction"; + break; + } + case SpvOpTypeEvent: { + stream << "OpTypeEvent"; + break; + } + case SpvOpTypeDeviceEvent: { + stream << "OpTypeDeviceEvent"; + break; + } + case SpvOpTypeReserveId: { + stream << "OpTypeReserveId"; + break; + } + case SpvOpTypeQueue: { + stream << 
"OpTypeQueue"; + break; + } + case SpvOpTypePipe: { + stream << "OpTypePipe"; + break; + } + case SpvOpTypeForwardPointer: { + stream << "OpTypeForwardPointer"; + break; + } + case SpvOpConstantTrue: { + stream << "OpConstantTrue"; + break; + } + case SpvOpConstantFalse: { + stream << "OpConstantFalse"; + break; + } + case SpvOpConstant: { + stream << "OpConstant"; + break; + } + case SpvOpConstantComposite: { + stream << "OpConstantComposite"; + break; + } + case SpvOpConstantSampler: { + stream << "OpConstantSampler"; + break; + } + case SpvOpConstantNull: { + stream << "OpConstantNull"; + break; + } + case SpvOpSpecConstantTrue: { + stream << "OpSpecConstantTrue"; + break; + } + case SpvOpSpecConstantFalse: { + stream << "OpSpecConstantFalse"; + break; + } + case SpvOpSpecConstant: { + stream << "OpSpecConstant"; + break; + } + case SpvOpSpecConstantComposite: { + stream << "OpSpecConstantComposite"; + break; + } + case SpvOpSpecConstantOp: { + stream << "OpSpecConstantOp"; + break; + } + case SpvOpFunction: { + stream << "OpFunction"; + break; + } + case SpvOpFunctionParameter: { + stream << "OpFunctionParameter"; + break; + } + case SpvOpFunctionEnd: { + stream << "OpFunctionEnd"; + break; + } + case SpvOpFunctionCall: { + stream << "OpFunctionCall"; + break; + } + case SpvOpVariable: { + stream << "OpVariable"; + break; + } + case SpvOpImageTexelPointer: { + stream << "OpImageTexelPointer"; + break; + } + case SpvOpLoad: { + stream << "OpLoad"; + break; + } + case SpvOpStore: { + stream << "OpStore"; + break; + } + case SpvOpCopyMemory: { + stream << "OpCopyMemory"; + break; + } + case SpvOpCopyMemorySized: { + stream << "OpCopyMemorySized"; + break; + } + case SpvOpAccessChain: { + stream << "OpAccessChain"; + break; + } + case SpvOpInBoundsAccessChain: { + stream << "OpInBoundsAccessChain"; + break; + } + case SpvOpPtrAccessChain: { + stream << "OpPtrAccessChain"; + break; + } + case SpvOpArrayLength: { + stream << "OpArrayLength"; + break; + } + case SpvOpGenericPtrMemSemantics: { + stream << "OpGenericPtrMemSemantics"; + break; + } + case SpvOpInBoundsPtrAccessChain: { + stream << "OpInBoundsPtrAccessChain"; + break; + } + case SpvOpDecorate: { + stream << "OpDecorate"; + break; + } + case SpvOpMemberDecorate: { + stream << "OpMemberDecorate"; + break; + } + case SpvOpDecorationGroup: { + stream << "OpDecorationGroup"; + break; + } + case SpvOpGroupDecorate: { + stream << "OpGroupDecorate"; + break; + } + case SpvOpGroupMemberDecorate: { + stream << "OpGroupMemberDecorate"; + break; + } + case SpvOpVectorExtractDynamic: { + stream << "OpVectorExtractDynamic"; + break; + } + case SpvOpVectorInsertDynamic: { + stream << "OpVectorInsertDynamic"; + break; + } + case SpvOpVectorShuffle: { + stream << "OpVectorShuffle"; + break; + } + case SpvOpCompositeConstruct: { + stream << "OpCompositeConstruct"; + break; + } + case SpvOpCompositeExtract: { + stream << "OpCompositeExtract"; + break; + } + case SpvOpCompositeInsert: { + stream << "OpCompositeInsert"; + break; + } + case SpvOpCopyObject: { + stream << "OpCopyObject"; + break; + } + case SpvOpTranspose: { + stream << "OpTranspose"; + break; + } + case SpvOpSampledImage: { + stream << "OpSampledImage"; + break; + } + case SpvOpImageSampleImplicitLod: { + stream << "OpImageSampleImplicitLod"; + break; + } + case SpvOpImageSampleExplicitLod: { + stream << "OpImageSampleExplicitLod"; + break; + } + case SpvOpImageSampleDrefImplicitLod: { + stream << "OpImageSampleDrefImplicitLod"; + break; + } + case SpvOpImageSampleDrefExplicitLod: { + 
stream << "OpImageSampleDrefExplicitLod"; + break; + } + case SpvOpImageSampleProjImplicitLod: { + stream << "OpImageSampleProjImplicitLod"; + break; + } + case SpvOpImageSampleProjExplicitLod: { + stream << "OpImageSampleProjExplicitLod"; + break; + } + case SpvOpImageSampleProjDrefImplicitLod: { + stream << "OpImageSampleProjDrefImplicitLod"; + break; + } + case SpvOpImageSampleProjDrefExplicitLod: { + stream << "OpImageSampleProjDrefExplicitLod"; + break; + } + case SpvOpImageFetch: { + stream << "OpImageFetch"; + break; + } + case SpvOpImageGather: { + stream << "OpImageGather"; + break; + } + case SpvOpImageDrefGather: { + stream << "OpImageDrefGather"; + break; + } + case SpvOpImageRead: { + stream << "OpImageRead"; + break; + } + case SpvOpImageWrite: { + stream << "OpImageWrite"; + break; + } + case SpvOpImage: { + stream << "OpImage"; + break; + } + case SpvOpImageQueryFormat: { + stream << "OpImageQueryFormat"; + break; + } + case SpvOpImageQueryOrder: { + stream << "OpImageQueryOrder"; + break; + } + case SpvOpImageQuerySizeLod: { + stream << "OpImageQuerySizeLod"; + break; + } + case SpvOpImageQuerySize: { + stream << "OpImageQuerySize"; + break; + } + case SpvOpImageQueryLod: { + stream << "OpImageQueryLod"; + break; + } + case SpvOpImageQueryLevels: { + stream << "OpImageQueryLevels"; + break; + } + case SpvOpImageQuerySamples: { + stream << "OpImageQuerySamples"; + break; + } + case SpvOpConvertFToU: { + stream << "OpConvertFToU"; + break; + } + case SpvOpConvertFToS: { + stream << "OpConvertFToS"; + break; + } + case SpvOpConvertSToF: { + stream << "OpConvertSToF"; + break; + } + case SpvOpConvertUToF: { + stream << "OpConvertUToF"; + break; + } + case SpvOpUConvert: { + stream << "OpUConvert"; + break; + } + case SpvOpSConvert: { + stream << "OpSConvert"; + break; + } + case SpvOpFConvert: { + stream << "OpFConvert"; + break; + } + case SpvOpConvertPtrToU: { + stream << "OpConvertPtrToU"; + break; + } + case SpvOpSatConvertSToU: { + stream << "OpSatConvertSToU"; + break; + } + case SpvOpSatConvertUToS: { + stream << "OpSatConvertUToS"; + break; + } + case SpvOpConvertUToPtr: { + stream << "OpConvertUToPtr"; + break; + } + case SpvOpPtrCastToGeneric: { + stream << "OpPtrCastToGeneric"; + break; + } + case SpvOpGenericCastToPtr: { + stream << "OpGenericCastToPtr"; + break; + } + case SpvOpGenericCastToPtrExplicit: { + stream << "OpGenericCastToPtrExplicit"; + break; + } + case SpvOpBitcast: { + stream << "OpBitcast"; + break; + } + case SpvOpSNegate: { + stream << "OpSNegate"; + break; + } + case SpvOpFNegate: { + stream << "OpFNegate"; + break; + } + case SpvOpIAdd: { + stream << "OpIAdd"; + break; + } + case SpvOpFAdd: { + stream << "OpFAdd"; + break; + } + case SpvOpISub: { + stream << "OpISub"; + break; + } + case SpvOpFSub: { + stream << "OpFSub"; + break; + } + case SpvOpIMul: { + stream << "OpIMul"; + break; + } + case SpvOpFMul: { + stream << "OpFMul"; + break; + } + case SpvOpUDiv: { + stream << "OpUDiv"; + break; + } + case SpvOpSDiv: { + stream << "OpSDiv"; + break; + } + case SpvOpFDiv: { + stream << "OpFDiv"; + break; + } + case SpvOpUMod: { + stream << "OpUMod"; + break; + } + case SpvOpSRem: { + stream << "OpSRem"; + break; + } + case SpvOpSMod: { + stream << "OpSMod"; + break; + } + case SpvOpFRem: { + stream << "OpFRem"; + break; + } + case SpvOpFMod: { + stream << "OpFMod"; + break; + } + case SpvOpVectorTimesScalar: { + stream << "OpVectorTimesScalar"; + break; + } + case SpvOpMatrixTimesScalar: { + stream << "OpMatrixTimesScalar"; + break; + } + case 
SpvOpVectorTimesMatrix: { + stream << "OpVectorTimesMatrix"; + break; + } + case SpvOpMatrixTimesVector: { + stream << "OpMatrixTimesVector"; + break; + } + case SpvOpMatrixTimesMatrix: { + stream << "OpMatrixTimesMatrix"; + break; + } + case SpvOpOuterProduct: { + stream << "OpOuterProduct"; + break; + } + case SpvOpDot: { + stream << "OpDot"; + break; + } + case SpvOpIAddCarry: { + stream << "OpIAddCarry"; + break; + } + case SpvOpISubBorrow: { + stream << "OpISubBorrow"; + break; + } + case SpvOpUMulExtended: { + stream << "OpUMulExtended"; + break; + } + case SpvOpSMulExtended: { + stream << "OpSMulExtended"; + break; + } + case SpvOpAny: { + stream << "OpAny"; + break; + } + case SpvOpAll: { + stream << "OpAll"; + break; + } + case SpvOpIsNan: { + stream << "OpIsNan"; + break; + } + case SpvOpIsInf: { + stream << "OpIsInf"; + break; + } + case SpvOpIsFinite: { + stream << "OpIsFinite"; + break; + } + case SpvOpIsNormal: { + stream << "OpIsNormal"; + break; + } + case SpvOpSignBitSet: { + stream << "OpSignBitSet"; + break; + } + case SpvOpLessOrGreater: { + stream << "OpLessOrGreater"; + break; + } + case SpvOpOrdered: { + stream << "OpOrdered"; + break; + } + case SpvOpUnordered: { + stream << "OpUnordered"; + break; + } + case SpvOpLogicalEqual: { + stream << "OpLogicalEqual"; + break; + } + case SpvOpLogicalNotEqual: { + stream << "OpLogicalNotEqual"; + break; + } + case SpvOpLogicalOr: { + stream << "OpLogicalOr"; + break; + } + case SpvOpLogicalAnd: { + stream << "OpLogicalAnd"; + break; + } + case SpvOpLogicalNot: { + stream << "OpLogicalNot"; + break; + } + case SpvOpSelect: { + stream << "OpSelect"; + break; + } + case SpvOpIEqual: { + stream << "OpIEqual"; + break; + } + case SpvOpINotEqual: { + stream << "OpINotEqual"; + break; + } + case SpvOpUGreaterThan: { + stream << "OpUGreaterThan"; + break; + } + case SpvOpSGreaterThan: { + stream << "OpSGreaterThan"; + break; + } + case SpvOpUGreaterThanEqual: { + stream << "OpUGreaterThanEqual"; + break; + } + case SpvOpSGreaterThanEqual: { + stream << "OpSGreaterThanEqual"; + break; + } + case SpvOpULessThan: { + stream << "OpULessThan"; + break; + } + case SpvOpSLessThan: { + stream << "OpSLessThan"; + break; + } + case SpvOpULessThanEqual: { + stream << "OpULessThanEqual"; + break; + } + case SpvOpSLessThanEqual: { + stream << "OpSLessThanEqual"; + break; + } + case SpvOpFOrdEqual: { + stream << "OpFOrdEqual"; + break; + } + case SpvOpFUnordEqual: { + stream << "OpFUnordEqual"; + break; + } + case SpvOpFOrdNotEqual: { + stream << "OpFOrdNotEqual"; + break; + } + case SpvOpFUnordNotEqual: { + stream << "OpFUnordNotEqual"; + break; + } + case SpvOpFOrdLessThan: { + stream << "OpFOrdLessThan"; + break; + } + case SpvOpFUnordLessThan: { + stream << "OpFUnordLessThan"; + break; + } + case SpvOpFOrdGreaterThan: { + stream << "OpFOrdGreaterThan"; + break; + } + case SpvOpFUnordGreaterThan: { + stream << "OpFUnordGreaterThan"; + break; + } + case SpvOpFOrdLessThanEqual: { + stream << "OpFOrdLessThanEqual"; + break; + } + case SpvOpFUnordLessThanEqual: { + stream << "OpFUnordLessThanEqual"; + break; + } + case SpvOpFOrdGreaterThanEqual: { + stream << "OpFOrdGreaterThanEqual"; + break; + } + case SpvOpFUnordGreaterThanEqual: { + stream << "OpFUnordGreaterThanEqual"; + break; + } + case SpvOpShiftRightLogical: { + stream << "OpShiftRightLogical"; + break; + } + case SpvOpShiftRightArithmetic: { + stream << "OpShiftRightArithmetic"; + break; + } + case SpvOpShiftLeftLogical: { + stream << "OpShiftLeftLogical"; + break; + } + case 
SpvOpBitwiseOr: { + stream << "OpBitwiseOr"; + break; + } + case SpvOpBitwiseXor: { + stream << "OpBitwiseXor"; + break; + } + case SpvOpBitwiseAnd: { + stream << "OpBitwiseAnd"; + break; + } + case SpvOpNot: { + stream << "OpNot"; + break; + } + case SpvOpBitFieldInsert: { + stream << "OpBitFieldInsert"; + break; + } + case SpvOpBitFieldSExtract: { + stream << "OpBitFieldSExtract"; + break; + } + case SpvOpBitFieldUExtract: { + stream << "OpBitFieldUExtract"; + break; + } + case SpvOpBitReverse: { + stream << "OpBitReverse"; + break; + } + case SpvOpBitCount: { + stream << "OpBitCount"; + break; + } + case SpvOpDPdx: { + stream << "OpDPdx"; + break; + } + case SpvOpDPdy: { + stream << "OpDPdy"; + break; + } + case SpvOpFwidth: { + stream << "OpFwidth"; + break; + } + case SpvOpDPdxFine: { + stream << "OpDPdxFine"; + break; + } + case SpvOpDPdyFine: { + stream << "OpDPdyFine"; + break; + } + case SpvOpFwidthFine: { + stream << "OpFwidthFine"; + break; + } + case SpvOpDPdxCoarse: { + stream << "OpDPdxCoarse"; + break; + } + case SpvOpDPdyCoarse: { + stream << "OpDPdyCoarse"; + break; + } + case SpvOpFwidthCoarse: { + stream << "OpFwidthCoarse"; + break; + } + case SpvOpEmitVertex: { + stream << "OpEmitVertex"; + break; + } + case SpvOpEndPrimitive: { + stream << "OpEndPrimitive"; + break; + } + case SpvOpEmitStreamVertex: { + stream << "OpEmitStreamVertex"; + break; + } + case SpvOpEndStreamPrimitive: { + stream << "OpEndStreamPrimitive"; + break; + } + case SpvOpControlBarrier: { + stream << "OpControlBarrier"; + break; + } + case SpvOpMemoryBarrier: { + stream << "OpMemoryBarrier"; + break; + } + case SpvOpAtomicLoad: { + stream << "OpAtomicLoad"; + break; + } + case SpvOpAtomicStore: { + stream << "OpAtomicStore"; + break; + } + case SpvOpAtomicExchange: { + stream << "OpAtomicExchange"; + break; + } + case SpvOpAtomicCompareExchange: { + stream << "OpAtomicCompareExchange"; + break; + } + case SpvOpAtomicCompareExchangeWeak: { + stream << "OpAtomicCompareExchangeWeak"; + break; + } + case SpvOpAtomicIIncrement: { + stream << "OpAtomicIIncrement"; + break; + } + case SpvOpAtomicIDecrement: { + stream << "OpAtomicIDecrement"; + break; + } + case SpvOpAtomicIAdd: { + stream << "OpAtomicIAdd"; + break; + } + case SpvOpAtomicISub: { + stream << "OpAtomicISub"; + break; + } + case SpvOpAtomicSMin: { + stream << "OpAtomicSMin"; + break; + } + case SpvOpAtomicUMin: { + stream << "OpAtomicUMin"; + break; + } + case SpvOpAtomicSMax: { + stream << "OpAtomicSMax"; + break; + } + case SpvOpAtomicUMax: { + stream << "OpAtomicUMax"; + break; + } + case SpvOpAtomicAnd: { + stream << "OpAtomicAnd"; + break; + } + case SpvOpAtomicOr: { + stream << "OpAtomicOr"; + break; + } + case SpvOpAtomicXor: { + stream << "OpAtomicXor"; + break; + } + case SpvOpPhi: { + stream << "OpPhi"; + break; + } + case SpvOpLoopMerge: { + stream << "OpLoopMerge"; + break; + } + case SpvOpSelectionMerge: { + stream << "OpSelectionMerge"; + break; + } + case SpvOpLabel: { + stream << "OpLabel"; + break; + } + case SpvOpBranch: { + stream << "OpBranch"; + break; + } + case SpvOpBranchConditional: { + stream << "OpBranchConditional"; + break; + } + case SpvOpSwitch: { + stream << "OpSwitch"; + break; + } + case SpvOpKill: { + stream << "OpKill"; + break; + } + case SpvOpReturn: { + stream << "OpReturn"; + break; + } + case SpvOpReturnValue: { + stream << "OpReturnValue"; + break; + } + case SpvOpUnreachable: { + stream << "OpUnreachable"; + break; + } + case SpvOpLifetimeStart: { + stream << "OpLifetimeStart"; + break; + } + case 
SpvOpLifetimeStop: { + stream << "OpLifetimeStop"; + break; + } + case SpvOpGroupAsyncCopy: { + stream << "OpGroupAsyncCopy"; + break; + } + case SpvOpGroupWaitEvents: { + stream << "OpGroupWaitEvents"; + break; + } + case SpvOpGroupAll: { + stream << "OpGroupAll"; + break; + } + case SpvOpGroupAny: { + stream << "OpGroupAny"; + break; + } + case SpvOpGroupBroadcast: { + stream << "OpGroupBroadcast"; + break; + } + case SpvOpGroupIAdd: { + stream << "OpGroupIAdd"; + break; + } + case SpvOpGroupFAdd: { + stream << "OpGroupFAdd"; + break; + } + case SpvOpGroupFMin: { + stream << "OpGroupFMin"; + break; + } + case SpvOpGroupUMin: { + stream << "OpGroupUMin"; + break; + } + case SpvOpGroupSMin: { + stream << "OpGroupSMin"; + break; + } + case SpvOpGroupFMax: { + stream << "OpGroupFMax"; + break; + } + case SpvOpGroupUMax: { + stream << "OpGroupUMax"; + break; + } + case SpvOpGroupSMax: { + stream << "OpGroupSMax"; + break; + } + case SpvOpReadPipe: { + stream << "OpReadPipe"; + break; + } + case SpvOpWritePipe: { + stream << "OpWritePipe"; + break; + } + case SpvOpReservedReadPipe: { + stream << "OpReservedReadPipe"; + break; + } + case SpvOpReservedWritePipe: { + stream << "OpReservedWritePipe"; + break; + } + case SpvOpReserveReadPipePackets: { + stream << "OpReserveReadPipePackets"; + break; + } + case SpvOpReserveWritePipePackets: { + stream << "OpReserveWritePipePackets"; + break; + } + case SpvOpCommitReadPipe: { + stream << "OpCommitReadPipe"; + break; + } + case SpvOpCommitWritePipe: { + stream << "OpCommitWritePipe"; + break; + } + case SpvOpIsValidReserveId: { + stream << "OpIsValidReserveId"; + break; + } + case SpvOpGetNumPipePackets: { + stream << "OpGetNumPipePackets"; + break; + } + case SpvOpGetMaxPipePackets: { + stream << "OpGetMaxPipePackets"; + break; + } + case SpvOpGroupReserveReadPipePackets: { + stream << "OpGroupReserveReadPipePackets"; + break; + } + case SpvOpGroupReserveWritePipePackets: { + stream << "OpGroupReserveWritePipePackets"; + break; + } + case SpvOpGroupCommitReadPipe: { + stream << "OpGroupCommitReadPipe"; + break; + } + case SpvOpGroupCommitWritePipe: { + stream << "OpGroupCommitWritePipe"; + break; + } + case SpvOpEnqueueMarker: { + stream << "OpEnqueueMarker"; + break; + } + case SpvOpEnqueueKernel: { + stream << "OpEnqueueKernel"; + break; + } + case SpvOpGetKernelNDrangeSubGroupCount: { + stream << "OpGetKernelNDrangeSubGroupCount"; + break; + } + case SpvOpGetKernelNDrangeMaxSubGroupSize: { + stream << "OpGetKernelNDrangeMaxSubGroupSize"; + break; + } + case SpvOpGetKernelWorkGroupSize: { + stream << "OpGetKernelWorkGroupSize"; + break; + } + case SpvOpGetKernelPreferredWorkGroupSizeMultiple: { + stream << "OpGetKernelPreferredWorkGroupSizeMultiple"; + break; + } + case SpvOpRetainEvent: { + stream << "OpRetainEvent"; + break; + } + case SpvOpReleaseEvent: { + stream << "OpReleaseEvent"; + break; + } + case SpvOpCreateUserEvent: { + stream << "OpCreateUserEvent"; + break; + } + case SpvOpIsValidEvent: { + stream << "OpIsValidEvent"; + break; + } + case SpvOpSetUserEventStatus: { + stream << "OpSetUserEventStatus"; + break; + } + case SpvOpCaptureEventProfilingInfo: { + stream << "OpCaptureEventProfilingInfo"; + break; + } + case SpvOpGetDefaultQueue: { + stream << "OpGetDefaultQueue"; + break; + } + case SpvOpBuildNDRange: { + stream << "OpBuildNDRange"; + break; + } + case SpvOpImageSparseSampleImplicitLod: { + stream << "OpImageSparseSampleImplicitLod"; + break; + } + case SpvOpImageSparseSampleExplicitLod: { + stream << 
"OpImageSparseSampleExplicitLod"; + break; + } + case SpvOpImageSparseSampleDrefImplicitLod: { + stream << "OpImageSparseSampleDrefImplicitLod"; + break; + } + case SpvOpImageSparseSampleDrefExplicitLod: { + stream << "OpImageSparseSampleDrefExplicitLod"; + break; + } + case SpvOpImageSparseSampleProjImplicitLod: { + stream << "OpImageSparseSampleProjImplicitLod"; + break; + } + case SpvOpImageSparseSampleProjExplicitLod: { + stream << "OpImageSparseSampleProjExplicitLod"; + break; + } + case SpvOpImageSparseSampleProjDrefImplicitLod: { + stream << "OpImageSparseSampleProjDrefImplicitLod"; + break; + } + case SpvOpImageSparseSampleProjDrefExplicitLod: { + stream << "OpImageSparseSampleProjDrefExplicitLod"; + break; + } + case SpvOpImageSparseFetch: { + stream << "OpImageSparseFetch"; + break; + } + case SpvOpImageSparseGather: { + stream << "OpImageSparseGather"; + break; + } + case SpvOpImageSparseDrefGather: { + stream << "OpImageSparseDrefGather"; + break; + } + case SpvOpImageSparseTexelsResident: { + stream << "OpImageSparseTexelsResident"; + break; + } + case SpvOpNoLine: { + stream << "OpNoLine"; + break; + } + case SpvOpAtomicFlagTestAndSet: { + stream << "OpAtomicFlagTestAndSet"; + break; + } + case SpvOpAtomicFlagClear: { + stream << "OpAtomicFlagClear"; + break; + } + case SpvOpImageSparseRead: { + stream << "OpImageSparseRead"; + break; + } + case SpvOpDecorateId: { + stream << "OpDecorateId"; + break; + } + case SpvOpSubgroupBallotKHR: { + stream << "OpSubgroupBallotKHR"; + break; + } + case SpvOpSubgroupFirstInvocationKHR: { + stream << "OpSubgroupFirstInvocationKHR"; + break; + } + case SpvOpSubgroupAllKHR: { + stream << "OpSubgroupAllKHR"; + break; + } + case SpvOpSubgroupAnyKHR: { + stream << "OpSubgroupAnyKHR"; + break; + } + case SpvOpSubgroupAllEqualKHR: { + stream << "OpSubgroupAllEqualKHR"; + break; + } + case SpvOpSubgroupReadInvocationKHR: { + stream << "OpSubgroupReadInvocationKHR"; + break; + } + case SpvOpGroupIAddNonUniformAMD: { + stream << "OpGroupIAddNonUniformAMD"; + break; + } + case SpvOpGroupFAddNonUniformAMD: { + stream << "OpGroupFAddNonUniformAMD"; + break; + } + case SpvOpGroupFMinNonUniformAMD: { + stream << "OpGroupFMinNonUniformAMD"; + break; + } + case SpvOpGroupUMinNonUniformAMD: { + stream << "OpGroupUMinNonUniformAMD"; + break; + } + case SpvOpGroupSMinNonUniformAMD: { + stream << "OpGroupSMinNonUniformAMD"; + break; + } + case SpvOpGroupFMaxNonUniformAMD: { + stream << "OpGroupFMaxNonUniformAMD"; + break; + } + case SpvOpGroupUMaxNonUniformAMD: { + stream << "OpGroupUMaxNonUniformAMD"; + break; + } + case SpvOpGroupSMaxNonUniformAMD: { + stream << "OpGroupSMaxNonUniformAMD"; + break; + } + case SpvOpFragmentMaskFetchAMD: { + stream << "OpFragmentMaskFetchAMD"; + break; + } + case SpvOpFragmentFetchAMD: { + stream << "OpFragmentFetchAMD"; + break; + } + case SpvOpSubgroupShuffleINTEL: { + stream << "OpSubgroupShuffleINTEL"; + break; + } + case SpvOpSubgroupShuffleDownINTEL: { + stream << "OpSubgroupShuffleDownINTEL"; + break; + } + case SpvOpSubgroupShuffleUpINTEL: { + stream << "OpSubgroupShuffleUpINTEL"; + break; + } + case SpvOpSubgroupShuffleXorINTEL: { + stream << "OpSubgroupShuffleXorINTEL"; + break; + } + case SpvOpSubgroupBlockReadINTEL: { + stream << "OpSubgroupBlockReadINTEL"; + break; + } + case SpvOpSubgroupBlockWriteINTEL: { + stream << "OpSubgroupBlockWriteINTEL"; + break; + } + case SpvOpSubgroupImageBlockReadINTEL: { + stream << "OpSubgroupImageBlockReadINTEL"; + break; + } + case SpvOpSubgroupImageBlockWriteINTEL: { + stream << 
"OpSubgroupImageBlockWriteINTEL"; + break; + } + case SpvOpDecorateStringGOOGLE: { + stream << "OpDecorateStringGOOGLE"; + break; + } + case SpvOpMemberDecorateStringGOOGLE: { + stream << "OpMemberDecorateStringGOOGLE"; + break; + } + case SpvOpMax: + default: { + stream << "*INVALID*"; + break; + } + }; + + if (inst.has_type()) { + stream << std::string(" %") << std::to_string(inst.type_id()); + } + + for(uint32_t i = 0; i < inst.length(); i++) { + if(inst.is_immediate(i)) { + if(inst.value_type(i) == SpvStringData) { + const char* str = (const char*)inst.data(i); + stream << std::string(" \"") << str << "\""; + break; + } else if(inst.value_type(i) == SpvIntegerData) { + const int* data = (const int*)inst.data(i); + stream << std::string(" ") << std::to_string(*data); + break; + } else if(inst.value_type(i) == SpvFloatData) { + const float* data = (const float*)inst.data(i); + stream << std::string(" ") << std::to_string(*data); + break; + } else if(inst.value_type(i) == SpvBitMaskLiteral) { + stream << std::string(" ") << std::hex << std::showbase << std::uppercase << inst.operand(i); + } else { + stream << std::string(" ") << std::to_string(inst.operand(i)); + } + } else { + stream << std::string(" %") << std::to_string(inst.operand(i)); + } + } + + stream << "\n"; + return stream; +} + +// -- + } // namespace Internal } // namespace Halide diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 03a4abc0dded..1bf876c47a46 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -85,6 +85,18 @@ enum SpvKind { SpvUnknownItem, }; +/** Specific types of SPIR-V operand types */ +enum SpvValueType { + SpvInvalidValueType, + SpvOperandId, + SpvBitMaskLiteral, + SpvIntegerLiteral, + SpvIntegerData, + SpvFloatData, + SpvStringData, + SpvUnknownValueType +}; + /** SPIR-V requires all IDs to be 32-bit unsigned integers */ using SpvId = uint32_t; using SpvBinary = std::vector; @@ -116,6 +128,9 @@ using SpvInstructionContentsPtr = IntrusivePtr; /** General interface for representing a SPIR-V Instruction */ class SpvInstruction { public: + using Operands = std::vector; + using ValueTypes = std::vector; + SpvInstruction() = default; ~SpvInstruction() = default; @@ -129,14 +144,17 @@ class SpvInstruction { void set_type_id(SpvId id); void set_op_code(SpvOp opcode); void add_operand(SpvId id); - void add_immediate(SpvId id); - void add_data(uint32_t bytes, const void *data); + void add_immediate(SpvId id, SpvValueType type); + void add_data(uint32_t bytes, const void *data, SpvValueType type); void add_string(const std::string &str); SpvId result_id() const; SpvId type_id() const; SpvOp op_code() const; - SpvId operand(uint32_t index); + SpvId operand(uint32_t index) const; + const void* data(uint32_t index=0) const; + SpvValueType value_type(uint32_t index) const; + const Operands& operands() const; bool has_type() const; bool has_result() const; @@ -192,6 +210,9 @@ class SpvBlock { /** General interface for representing a SPIR-V Function */ class SpvFunction { public: + using Blocks = std::vector; + using Parameters = std::vector; + SpvFunction() = default; ~SpvFunction() = default; @@ -208,9 +229,11 @@ class SpvFunction { void set_parameter_precision(uint32_t index, SpvPrecision precision); bool is_defined() const; + const Blocks& blocks() const; SpvBlock entry_block() const; SpvBlock tail_block() const; SpvPrecision return_precision() const; + const Parameters& parameters() const; SpvPrecision parameter_precision(uint32_t index) const; uint32_t parameter_count() const; uint32_t control_mask() const; @@ -232,9 
+255,14 @@ class SpvFunction { /** General interface for representing a SPIR-V code module */ class SpvModule { public: + using ImportDefinition = std::pair; using ImportNames = std::vector; using EntryPointNames = std::vector; using Instructions = std::vector; + using Functions = std::vector; + using Capabilities = std::vector; + using Extensions = std::vector; + using Imports = std::vector; SpvModule() = default; ~SpvModule() = default; @@ -264,6 +292,7 @@ class SpvModule { void set_memory_model(SpvMemoryModel val); void set_binding_count(SpvId count); + uint32_t version_format() const; SpvSourceLanguage source_language() const; SpvAddressingModel addressing_model() const; SpvMemoryModel memory_model() const; @@ -272,7 +301,20 @@ class SpvModule { ImportNames import_names() const; SpvId lookup_import(const std::string &Instruction_set) const; uint32_t entry_point_count() const; + + Imports imports() const; + Extensions extensions() const; + Capabilities capabilities() const; + Instructions entry_points() const; const Instructions &execution_modes() const; + const Instructions &debug_source() const; + const Instructions &debug_symbols() const; + const Instructions &annotations() const; + const Instructions &type_definitions() const; + const Instructions &global_constants() const; + const Instructions &global_variables() const; + const Functions &function_definitions() const; + uint32_t binding_count() const; SpvModule module() const; @@ -509,7 +551,7 @@ struct SpvFactory { static SpvInstruction runtime_array_type(SpvId result_type_id, SpvId base_type_id); static SpvInstruction pointer_type(SpvId pointer_type_id, SpvStorageClass storage_class, SpvId base_type_id); static SpvInstruction function_type(SpvId function_type_id, SpvId return_type_id, const ParamTypes ¶m_type_ids); - static SpvInstruction constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data); + static SpvInstruction constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data, SpvValueType value_type); static SpvInstruction null_constant(SpvId result_id, SpvId type_id); static SpvInstruction bool_constant(SpvId result_id, SpvId type_id, bool value); static SpvInstruction string_constant(SpvId result_id, const std::string &value); @@ -563,13 +605,13 @@ struct SpvFactory { /** Contents of a SPIR-V Instruction */ struct SpvInstructionContents { using Operands = std::vector; - using Immediates = std::vector; + using ValueTypes = std::vector; mutable RefCount ref_count; SpvOp op_code = SpvOpNop; SpvId result_id = SpvNoResult; SpvId type_id = SpvNoType; Operands operands; - Immediates immediates; + ValueTypes value_types; SpvBlock block; }; @@ -616,6 +658,7 @@ struct SpvModuleContents { mutable RefCount ref_count; SpvId module_id = SpvInvalidId; + SpvId version_format = SpvVersion; SpvId binding_count = 0; SpvSourceLanguage source_language = SpvSourceLanguageUnknown; SpvAddressingModel addressing_model = SpvAddressingModelLogical; @@ -635,9 +678,22 @@ struct SpvModuleContents { Instructions instructions; }; +/** Helper functions for determining calling convention of GLSL builtins **/ bool is_glsl_unary_op(SpvId glsl_op_code); bool is_glsl_binary_op(SpvId glsl_op_code); +/** Output the contents of a SPIR-V module in human-readable form **/ +std::ostream &operator<<(std::ostream &stream, const SpvModule &); + +/** Output the definition of a SPIR-V function in human-readable form **/ +std::ostream &operator<<(std::ostream &stream, const SpvFunction &); + +/** Output the contents of a SPIR-V block in 
human-readable form **/ +std::ostream &operator<<(std::ostream &stream, const SpvBlock &); + +/** Output a SPIR-V instruction in human-readable form **/ +std::ostream &operator<<(std::ostream &stream, const SpvInstruction &); + } // namespace Internal } // namespace Halide From 2cc00be4d97a517684bc845f595d39ae04ee2e15 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 12 Oct 2022 15:29:41 -0700 Subject: [PATCH 044/166] Only declare SIMT intrinics that are actually used. Cleanup & refactor add_kernel method. --- src/CodeGen_Vulkan_Dev.cpp | 358 ++++++++++++++++++++++++++++++++----- 1 file changed, 315 insertions(+), 43 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index ab27269394cf..458393fa6f61 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1,4 +1,5 @@ #include +#include #include #include "CodeGen_GPU_Dev.h" @@ -188,11 +189,18 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void add_kernel(const Stmt &s, const std::string &name, const std::vector &args); void init_module(); void compile(std::vector &binary); + void dump() const; // Scalarize expressions void scalarize(const Expr &e); SpvId map_type_to_pair(const Type &t); + // Workgroup size + void reset_workgroup_size(); + void declare_workgroup_size(SpvId kernel_func_id); + void declare_entry_point(const Stmt &s, SpvId kernel_func_id); + void declare_device_args(uint32_t entry_point_index, const std::string &kernel_name, const std::vector &args); + // The scope contains both the symbol id and its storage class using SymbolIdStorageClassPair = std::pair; using SymbolScope = Scope; @@ -213,7 +221,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // as a header to the module prior to the actual SPIR-V binary void encode_header(SpvBinary &spirv_header); - // The workgroup size. Must be the same for all kernels. + // The workgroup size. May vary between kernels. uint32_t workgroup_size[3]; // Returns Phi node inputs. 
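The stream operators declared above provide a human-readable listing of the in-memory SPIR-V IR; the emitter's dump() method added later in this patch prints the module under construction through them. A minimal usage sketch, assuming a SpvBuilder instance named "builder" as used throughout the emitter (Halide's debug() stream is part of the existing codebase; "listing" is just a local introduced for the example):

    #include <sstream>

    // Capture a rough textual disassembly of the module being built.
    // operator<<(std::ostream &, const SpvModule &) walks the module's
    // capabilities, extensions, types, constants, globals and functions.
    std::ostringstream listing;
    listing << builder.current_module();
    debug(2) << listing.str();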
@@ -725,6 +733,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { e.accept(this); return; } else { + + // First check for a standard SPIR-V built-in BuiltinMap::const_iterator spirv_it = spirv_builtin.find(op->name); if (spirv_it != spirv_builtin.end()) { SpvId spirv_op = spirv_it->second; @@ -735,6 +745,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { user_error << "Vulkan: unhandled SPIR-V builtin function '" << op->name << "' encountered.\n"; } } + + // If its not a standard SPIR-V built-in, see if there's a GLSL extended builtin BuiltinMap::const_iterator glsl_it = glsl_builtin.find(op->name); if (glsl_it == glsl_builtin.end()) { user_error << "Vulkan: unknown function '" << op->name << "' encountered.\n"; @@ -747,6 +759,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (is_glsl_binary_op(glsl_op_code)) { internal_assert(op->args.size() == 2); visit_glsl_binop(op->type, op->args[0], op->args[1], glsl_op_code); + } else { + user_error << "Vulkan: unhandled SPIR-V GLSL builtin function '" << op->name << "' encountered.\n"; } } } @@ -765,21 +779,32 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Select *op) { builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(SpvId index_id, SpvId base_id, SpvId result_type_id, SpvId ptr_type_id, SpvStorageClass storage_class) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(): " << "index_id=" << index_id << " " << "base_id=" << base_id << " " - << "type_id=" << type_id << " " << "ptr_type_id=" << ptr_type_id << " " + << "result_type_id=" << result_type_id << " " << "storage_class=" << storage_class << "\n"; + // determine the base type id for the source value + SpvId base_type_id = builder.type_of(base_id); + if (builder.is_pointer_type(base_type_id)) { + base_type_id = builder.lookup_base_type(base_type_id); + } + uint32_t zero = 0; SpvId src_id = SpvInvalidId; SpvId src_index_id = index_id; if (storage_class == SpvStorageClassUniform) { - SpvId zero_id = builder.declare_constant(UInt(32), &zero); - SpvFactory::Indices access_indices = {zero_id, src_index_id}; - src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + if(builder.is_struct_type(base_type_id)) { + SpvId zero_id = builder.declare_constant(UInt(32), &zero); + SpvFactory::Indices access_indices = {zero_id, src_index_id}; + src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + } else { + SpvFactory::Indices access_indices = {src_index_id}; + src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + } } else if (storage_class == SpvStorageClassWorkgroup) { SpvFactory::Indices access_indices = {src_index_id}; src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); @@ -791,7 +816,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(SpvId index_id, S internal_assert(src_id != SpvInvalidId); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::load(type_id, result_id, src_id)); + builder.append(SpvFactory::load(result_type_id, result_id, src_id)); builder.update_id(result_id); } @@ -813,7 +838,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, S // If this is a dense vector load and the buffer has a vector base type, // then index the 
buffer using the base of the ramp divided by the number // of lanes. - SpvFactory::Indices index_components; if (builder.is_vector_type(base_type_id)) { Expr ramp_base = strided_ramp_base(op->index); if (ramp_base.defined()) { @@ -830,23 +854,24 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, S // Gather vector elements. SpvFactory::Components loaded_values; - SpvId value_type_id = builder.declare_type(op->type.with_lanes(1)); - SpvId index_type_id = builder.declare_type(op->index.type().with_lanes(1)); + SpvId scalar_value_type_id = builder.declare_type(op->type.with_lanes(1)); + SpvId scalar_ptr_type_id = builder.declare_pointer_type(scalar_value_type_id, storage_class); + SpvId scalar_index_type_id = builder.declare_type(op->index.type().with_lanes(1)); for (uint32_t i = 0; i < (uint32_t)op->index.type().lanes(); i++) { SpvFactory::Indices extract_indices = {i}; - SpvId index_component_id = builder.declare_variable(unique_name("_idx_ex"), index_type_id, SpvStorageClassFunction); - builder.append(SpvFactory::composite_extract(index_type_id, index_component_id, index_id, extract_indices)); - load_from_scalar_index(index_component_id, base_id, type_id, ptr_type_id, storage_class); - SpvId value_id = builder.current_id(); - SpvId value_component_id = builder.declare_variable(unique_name("_val_ex"), value_type_id, SpvStorageClassFunction); - builder.append(SpvFactory::composite_extract(value_type_id, value_component_id, value_id, extract_indices)); + SpvId index_component_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_extract(scalar_index_type_id, index_component_id, index_id, extract_indices)); + load_from_scalar_index(index_component_id, base_id, scalar_value_type_id, scalar_ptr_type_id, storage_class); + SpvId value_component_id = builder.current_id(); loaded_values.push_back(value_component_id); } // Create a composite vector from the individual loads - SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_construct(type_id, result_id, loaded_values)); - builder.update_id(result_id); + if(loaded_values.size() > 1) { + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_construct(type_id, result_id, loaded_values)); + builder.update_id(result_id); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id) { @@ -858,14 +883,25 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(SpvId index_id, Sp << "storage_class=" << storage_class << " " << "value_id=" << value_id << "\n"; + // determine the base type id for the source value + SpvId base_type_id = builder.type_of(base_id); + if (builder.is_pointer_type(base_type_id)) { + base_type_id = builder.lookup_base_type(base_type_id); + } + uint32_t zero = 0; SpvId dst_id = SpvInvalidId; SpvId dst_index_id = index_id; if (storage_class == SpvStorageClassUniform) { - SpvId zero_id = builder.declare_constant(UInt(32), &zero); - SpvFactory::Indices access_indices = {zero_id, dst_index_id}; - dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + if(builder.is_struct_type(base_type_id)) { + SpvId zero_id = builder.declare_constant(UInt(32), &zero); + SpvFactory::Indices access_indices = {zero_id, dst_index_id}; + dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + } else { + SpvFactory::Indices access_indices = {dst_index_id}; + 
dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + } } else if (storage_class == SpvStorageClassWorkgroup) { SpvFactory::Indices access_indices = {dst_index_id}; dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); @@ -911,15 +947,16 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, S SpvId index_id = builder.current_id(); // Scatter vector elements. - SpvId value_type_id = builder.declare_type(op->value.type().with_lanes(1)); - SpvId index_type_id = builder.declare_type(op->index.type().with_lanes(1)); + SpvId scalar_value_type_id = builder.declare_type(op->value.type().with_lanes(1)); + SpvId scalar_ptr_type_id = builder.declare_pointer_type(scalar_value_type_id, storage_class); + SpvId scalar_index_type_id = builder.declare_type(op->index.type().with_lanes(1)); for (uint32_t i = 0; i < (uint32_t)op->index.type().lanes(); i++) { SpvFactory::Indices extract_indices = {i}; - SpvId index_component_id = builder.declare_variable(unique_name("_idx_ex"), index_type_id, SpvStorageClassFunction); - builder.append(SpvFactory::composite_extract(index_type_id, index_component_id, index_id, extract_indices)); - SpvId value_component_id = builder.declare_variable(unique_name("_val_ex"), value_type_id, SpvStorageClassFunction); - builder.append(SpvFactory::composite_extract(value_type_id, value_component_id, value_id, extract_indices)); - store_at_scalar_index(index_component_id, base_id, type_id, ptr_type_id, storage_class, value_component_id); + SpvId index_component_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_extract(scalar_index_type_id, index_component_id, index_id, extract_indices)); + SpvId value_component_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_extract(scalar_value_type_id, value_component_id, value_id, extract_indices)); + store_at_scalar_index(index_component_id, base_id, scalar_value_type_id, scalar_ptr_type_id, storage_class, value_component_id); } } @@ -943,6 +980,19 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { SpvId index_id = builder.current_id(); load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); } else { + + // If this is a dense vector load and the buffer has a vector base type, + // then index the buffer using the base of the ramp divided by the number + // of lanes. 
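As an aside on the indexing rule in the comment above: when the buffer's SPIR-V base type is an N-lane vector and the Halide index is a dense ramp, the whole vector is fetched with a single load addressed by the ramp base divided by the lane count. A small standalone illustration of that arithmetic (dense_vector_load_index is a hypothetical helper written for this note, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // A dense ramp over an N-lane vector element collapses to one element
    // index equal to ramp_base / lanes, e.g. a float32x4 load at
    // ramp(8, 1, 4) becomes a single load of vector element 8 / 4 = 2.
    int64_t dense_vector_load_index(int64_t ramp_base, int lanes) {
        assert(lanes > 0 && ramp_base % lanes == 0);  // dense, aligned access
        return ramp_base / lanes;
    }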
+ if (op->type.is_vector()) { + Expr ramp_base = strided_ramp_base(op->index); + if (ramp_base.defined()) { + Expr ramp_index = (ramp_base / op->type.lanes()); + ramp_index.accept(this); + SpvId index_id = builder.current_id(); + load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); + } + } load_from_vector_index(op, base_id, type_id, ptr_type_id, storage_class); } } @@ -1020,6 +1070,7 @@ std::pair simt_intrinsic(const std::string &name) { internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return {"", -1}; } + int thread_loop_workgroup_index(const std::string &name) { std::string ids[] = {".__thread_id_x", ".__thread_id_y", @@ -1031,6 +1082,7 @@ int thread_loop_workgroup_index(const std::string &name) { } return -1; } + } // anonymous namespace void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { @@ -1468,6 +1520,219 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header(SpvBinary &spirv_header) { spirv_header.insert(spirv_header.begin(), header_word_count + 1); } +void CodeGen_Vulkan_Dev::SPIRV_Emitter::reset_workgroup_size() { + workgroup_size[0] = 0; + workgroup_size[1] = 0; + workgroup_size[2] = 0; +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_workgroup_size(SpvId kernel_func_id) { + workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); + workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); + workgroup_size[2] = std::max(workgroup_size[2], (uint32_t)1); + + // Add workgroup size to execution mode + SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(kernel_func_id, workgroup_size[0], workgroup_size[1], workgroup_size[2]); + builder.current_module().add_execution_mode(exec_mode_inst); +} + +namespace { + +// Locate all the unique GPU variables used as SIMT intrinsics +class FindIntrinsicsUsed : public IRVisitor { + using IRVisitor::visit; + void visit(const For *op) override { + if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + auto intrinsic = simt_intrinsic(op->name); + intrinsics_used.insert(intrinsic.first); + } + op->body.accept(this); + } + void visit(const Variable *op) override { + if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + auto intrinsic = simt_intrinsic(op->name); + intrinsics_used.insert(intrinsic.first); + } + } + +public: + std::unordered_set intrinsics_used; + FindIntrinsicsUsed() = default; +}; + +// Map the SPIR-V builtin intrinsic name to its corresponding enum value +SpvBuiltIn map_simt_builtin(const std::string &intrinsic_name) { + if (starts_with(intrinsic_name, "Workgroup")) { + return SpvBuiltInWorkgroupId; + } else if (starts_with(intrinsic_name, "Local")) { + return SpvBuiltInLocalInvocationId; + } + internal_error << "map_simt_builtin called on bad variable name: " << intrinsic_name << "\n"; + return SpvBuiltInMax; +} + +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_entry_point(const Stmt &s, SpvId kernel_func_id) { + + // Locate all simt intrinsics + FindIntrinsicsUsed find_intrinsics; + s.accept(&find_intrinsics); + + SpvFactory::Variables entry_point_variables; + for (const std::string &intrinsic_name : find_intrinsics.intrinsics_used) { + + // The builtins are pointers to vec3 + SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); + SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, SpvStorageClassInput); + SpvId intrinsic_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, SpvStorageClassInput); + SpvId intrinsic_loaded_id = builder.reserve_id(); + 
builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id)); + symbol_table.push(intrinsic_name, {intrinsic_loaded_id, SpvStorageClassInput}); + + // Annotate that this is the specific builtin + SpvBuiltIn built_in_kind = map_simt_builtin(intrinsic_name); + SpvBuilder::Literals annotation_literals = {(uint32_t)built_in_kind}; + builder.add_annotation(intrinsic_id, SpvDecorationBuiltIn, annotation_literals); + + // Add the builtin to the interface + entry_point_variables.push_back(intrinsic_id); + } + + // Add the entry point with the appropriate execution model + // NOTE: exec_model must be GLCompute to work with Vulkan ... Kernel is only supported in OpenCL + builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point_index, + const std::string &entry_point_name, + const std::vector &args) +{ + + // Keep track of the descriptor set needed to bind this kernel's inputs / outputs + DescriptorSet descriptor_set; + descriptor_set.entry_point_name = entry_point_name; + + // Add required extension support for storage types + for (const auto &arg : args) { + if (arg.is_buffer) { + if (arg.type.is_int_or_uint()) { + if (arg.type.bits() == 8) { + builder.require_extension("SPV_KHR_8bit_storage"); + } else if (arg.type.bits() == 16) { + builder.require_extension("SPV_KHR_16bit_storage"); + } + } + } + } + + // GLSL-style: each input buffer is a runtime array in a buffer struct + // All other params get passed in as a single uniform block + // First, need to count scalar parameters to construct the uniform struct + SpvBuilder::StructMemberTypes param_struct_members; + for (const auto &arg : args) { + if (!arg.is_buffer) { + SpvId arg_type_id = builder.declare_type(arg.type); + param_struct_members.push_back(arg_type_id); + } + } + + // Add a binding for a uniform buffer packed with all scalar args + uint32_t binding_counter = 0; + if (!param_struct_members.empty()) { + const std::string struct_name = std::string("_struct") + entry_point_name + std::string("_args"); + SpvId param_struct_type_id = builder.declare_struct(struct_name, param_struct_members); + + // Add a decoration describing the offset for each parameter struct member + uint32_t param_member_index = 0; + uint32_t param_member_offset = 0; + for (const auto &arg : args) { + if (!arg.is_buffer) { + SpvBuilder::Literals param_offset_literals = {param_member_offset}; + builder.add_struct_annotation(param_struct_type_id, param_member_index, SpvDecorationOffset, param_offset_literals); + param_member_offset += arg.type.bytes(); + param_member_index++; + } + } + + // Add a Block decoration for the parameter pack itself + builder.add_annotation(param_struct_type_id, SpvDecorationBlock); + + // Add a variable for the parameter pack + const std::string param_pack_var_name = std::string("_var") + entry_point_name + std::string("_args"); + SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); + SpvId param_pack_var_id = builder.declare_global_variable(param_pack_var_name, param_pack_ptr_type_id, SpvStorageClassUniform); + + // We always pass in the parameter pack as the first binding + SpvBuilder::Literals binding_index = {0}; + SpvBuilder::Literals dset_index = {entry_point_index}; + builder.add_annotation(param_pack_var_id, SpvDecorationDescriptorSet, dset_index); + builder.add_annotation(param_pack_var_id, SpvDecorationBinding, binding_index); + 
descriptor_set.uniform_buffer_count++; + binding_counter++; + + // Declare all the args with appropriate offsets into the parameter struct + uint32_t scalar_index = 0; + for (const auto &arg : args) { + if (!arg.is_buffer) { + + SpvId arg_type_id = builder.declare_type(arg.type); + SpvId access_index_id = builder.declare_constant(UInt(32), &scalar_index); + SpvId pointer_type_id = builder.declare_pointer_type(arg_type_id, SpvStorageClassUniform); + SpvFactory::Indices access_indices = {access_index_id}; + SpvId access_chain_id = builder.declare_access_chain(pointer_type_id, param_pack_var_id, access_indices); + scalar_index++; + + SpvId param_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(arg_type_id, param_id, access_chain_id)); + symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); + } + } + } + + // Add bindings for all device buffers as uniform buffers + for (const auto &arg : args) { + if (arg.is_buffer) { + + // Declare the runtime array (which maps directly to the Halide device buffer) + SpvId element_type_id = builder.declare_type(arg.type); + SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); + + // Annotate the array with its stride + SpvBuilder::Literals array_stride = {(uint32_t)(arg.type.bytes())}; + builder.add_annotation(runtime_arr_type_id, SpvDecorationArrayStride, array_stride); + + // Wrap the runtime array in a struct (required with SPIR-V buffer block semantics) + SpvBuilder::StructMemberTypes struct_member_types = {runtime_arr_type_id}; + const std::string struct_name = std::string("_struct") + entry_point_name + std::string("_b") + std::to_string(binding_counter); + SpvId struct_type_id = builder.declare_struct(struct_name, struct_member_types); + + // Declare a pointer to the struct as a global variable + SpvStorageClass storage_class = SpvStorageClassUniform; + SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, storage_class); + SpvId param_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, storage_class); + + // Annotate the struct to indicate it's passed in a GLSL-style buffer block + builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); + + // Annotate the offset for the array + SpvBuilder::Literals zero_literal = {uint32_t(0)}; + builder.add_struct_annotation(struct_type_id, 0, SpvDecorationOffset, zero_literal); + + // Set descriptor set and binding indices + SpvBuilder::Literals dset_index = {entry_point_index}; + SpvBuilder::Literals binding_index = {uint32_t(binding_counter++)}; + builder.add_annotation(param_id, SpvDecorationDescriptorSet, dset_index); + builder.add_annotation(param_id, SpvDecorationBinding, binding_index); + symbol_table.push(arg.name, {param_id, storage_class}); + descriptor_set.storage_buffer_count++; + } + } + + // Save the descriptor set (so we can output the binding information as a header to the code module) + descriptor_set_table.push_back(descriptor_set); +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::compile\n"; @@ -1505,20 +1770,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // TODO: can we use one of the function control annotations? 
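For orientation, the bindings emitted by declare_device_args above map onto the Vulkan host side as one descriptor set per entry point (the descriptor set index equals the entry-point index), with the packed scalar-argument uniform buffer bound first and one storage buffer per Halide buffer argument after it. The sketch below is an illustrative host-side layout, not part of this patch: make_kernel_bindings and storage_buffer_count are hypothetical names, and it assumes the kernel has at least one scalar argument (otherwise the buffers start at binding 0):

    #include <vulkan/vulkan.h>

    #include <cstdint>
    #include <vector>

    std::vector<VkDescriptorSetLayoutBinding> make_kernel_bindings(uint32_t storage_buffer_count) {
        std::vector<VkDescriptorSetLayoutBinding> bindings;
        // Binding 0: uniform buffer holding the packed scalar arguments.
        VkDescriptorSetLayoutBinding scalar_args{};
        scalar_args.binding = 0;
        scalar_args.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
        scalar_args.descriptorCount = 1;
        scalar_args.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
        bindings.push_back(scalar_args);
        // Bindings 1..N: one storage buffer per Halide buffer argument,
        // matching the BufferBlock-decorated structs emitted above.
        for (uint32_t i = 0; i < storage_buffer_count; i++) {
            VkDescriptorSetLayoutBinding buffer_binding{};
            buffer_binding.binding = 1 + i;
            buffer_binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
            buffer_binding.descriptorCount = 1;
            buffer_binding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
            bindings.push_back(buffer_binding);
        }
        return bindings;
    }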
// We'll discover the workgroup size as we traverse the kernel - workgroup_size[0] = 0; - workgroup_size[1] = 0; - workgroup_size[2] = 0; - - // Keep track of the descriptor set needed to bind this kernel's inputs / outputs - DescriptorSet descriptor_set; - descriptor_set.entry_point_name = name; + reset_workgroup_size(); // Declare the kernel function SpvId void_type_id = builder.declare_void_type(); SpvId kernel_func_id = builder.add_function(name, void_type_id); SpvFunction kernel_func = builder.lookup_function(kernel_func_id); + uint32_t entry_point_index = builder.current_module().entry_point_count(); builder.enter_function(kernel_func); + // Declare the entry point and input intrinsics for the kernel func + declare_entry_point(s, kernel_func_id); + + // Declare all parameters -- scalar args and device buffers + declare_device_args(entry_point_index, name, args); +/* // TODO: only add the SIMT intrinsics used SpvFactory::Variables entry_point_variables; auto intrinsics = {"WorkgroupId", "LocalInvocationId"}; @@ -1545,7 +1811,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // NOTE: exec_model must be GLCompute to work with Vulkan ... Kernel is only supported in OpenCL uint32_t current_entry_point = builder.current_module().entry_point_count(); builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); +*/ +/* // GLSL-style: each input buffer is a runtime array in a buffer struct // All other params get passed in as a single uniform block // First, need to count scalar parameters to construct the uniform struct @@ -1652,18 +1920,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, } } descriptor_set_table.push_back(descriptor_set); +*/ s.accept(this); // Insert return statement end delimiter kernel_func.tail_block().add_instruction(SpvFactory::return_stmt()); - workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); - workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); - workgroup_size[2] = std::max(workgroup_size[2], (uint32_t)1); - - // Add workgroup size to execution mode - SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(kernel_func_id, workgroup_size[0], workgroup_size[1], workgroup_size[2]); - builder.current_module().add_execution_mode(exec_mode_inst); + // Declare the workgroup size now that we've traversed the kernel + declare_workgroup_size(kernel_func_id); // Pop scope for (const auto &arg : args) { @@ -1673,6 +1937,11 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, builder.leave_function(); } +void CodeGen_Vulkan_Dev::SPIRV_Emitter::dump() const { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::dump()\n"; + std::cerr << builder.current_module(); +} + CodeGen_Vulkan_Dev::CodeGen_Vulkan_Dev(Target t) { } @@ -1720,6 +1989,9 @@ std::string CodeGen_Vulkan_Dev::print_gpu_name(const std::string &name) { void CodeGen_Vulkan_Dev::dump() { std::vector module = compile_to_src(); + // Print the contents of the compiled SPIR-V module + emitter.dump(); + // Skip the header and only output the SPIR-V binary const uint32_t *decode = (const uint32_t *)(module.data()); uint32_t header_word_count = decode[0]; From 541458c835f290e74bdf5ce8dbc13e58490de1e5 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 12 Oct 2022 15:31:23 -0700 Subject: [PATCH 045/166] Add Vulkan handler to test targets --- test/generator/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index 
70d586d2ee69..7a9b6b000072 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -63,6 +63,9 @@ function(halide_define_aot_test NAME) if ("${Halide_TARGET}" MATCHES "cuda") target_compile_definitions("${TARGET}" PRIVATE TEST_CUDA) endif () + if ("${Halide_TARGET}" MATCHES "vulkan") + target_compile_definitions("${TARGET}" PRIVATE TEST_VULKAN) + endif () add_halide_test("${TARGET}" GROUPS generator "${args_GROUPS}") endif () endfunction() From cc569df0997837d1e17a1fd2afcd603c828baae2 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 12 Oct 2022 15:38:34 -0700 Subject: [PATCH 046/166] Clang format/tidy pass --- src/CodeGen_Vulkan_Dev.cpp | 281 ++-- src/SpirvIR.cpp | 2677 ++++++++++++++++++------------------ src/SpirvIR.h | 10 +- 3 files changed, 1483 insertions(+), 1485 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 458393fa6f61..1603a6443c5b 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" @@ -195,7 +195,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void scalarize(const Expr &e); SpvId map_type_to_pair(const Type &t); - // Workgroup size + // Workgroup size void reset_workgroup_size(); void declare_workgroup_size(SpvId kernel_func_id); void declare_entry_point(const Stmt &s, SpvId kernel_func_id); @@ -746,7 +746,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } } - // If its not a standard SPIR-V built-in, see if there's a GLSL extended builtin + // If its not a standard SPIR-V built-in, see if there's a GLSL extended builtin BuiltinMap::const_iterator glsl_it = glsl_builtin.find(op->name); if (glsl_it == glsl_builtin.end()) { user_error << "Vulkan: unknown function '" << op->name << "' encountered.\n"; @@ -797,7 +797,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(SpvId index_id, S SpvId src_id = SpvInvalidId; SpvId src_index_id = index_id; if (storage_class == SpvStorageClassUniform) { - if(builder.is_struct_type(base_type_id)) { + if (builder.is_struct_type(base_type_id)) { SpvId zero_id = builder.declare_constant(UInt(32), &zero); SpvFactory::Indices access_indices = {zero_id, src_index_id}; src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); @@ -867,7 +867,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, S } // Create a composite vector from the individual loads - if(loaded_values.size() > 1) { + if (loaded_values.size() > 1) { SpvId result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::composite_construct(type_id, result_id, loaded_values)); builder.update_id(result_id); @@ -894,7 +894,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(SpvId index_id, Sp SpvId dst_index_id = index_id; if (storage_class == SpvStorageClassUniform) { - if(builder.is_struct_type(base_type_id)) { + if (builder.is_struct_type(base_type_id)) { SpvId zero_id = builder.declare_constant(UInt(32), &zero); SpvFactory::Indices access_indices = {zero_id, dst_index_id}; dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); @@ -992,7 +992,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { SpvId index_id = builder.current_id(); load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); } - } + } load_from_vector_index(op, base_id, type_id, ptr_type_id, storage_class); } } @@ -1546,7 +1546,7 @@ class 
FindIntrinsicsUsed : public IRVisitor { auto intrinsic = simt_intrinsic(op->name); intrinsics_used.insert(intrinsic.first); } - op->body.accept(this); + op->body.accept(this); } void visit(const Variable *op) override { if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { @@ -1555,7 +1555,7 @@ class FindIntrinsicsUsed : public IRVisitor { } } -public: +public: std::unordered_set intrinsics_used; FindIntrinsicsUsed() = default; }; @@ -1566,16 +1566,16 @@ SpvBuiltIn map_simt_builtin(const std::string &intrinsic_name) { return SpvBuiltInWorkgroupId; } else if (starts_with(intrinsic_name, "Local")) { return SpvBuiltInLocalInvocationId; - } + } internal_error << "map_simt_builtin called on bad variable name: " << intrinsic_name << "\n"; return SpvBuiltInMax; } -} +} // namespace void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_entry_point(const Stmt &s, SpvId kernel_func_id) { - // Locate all simt intrinsics + // Locate all simt intrinsics FindIntrinsicsUsed find_intrinsics; s.accept(&find_intrinsics); @@ -1604,10 +1604,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_entry_point(const Stmt &s, SpvId builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point_index, - const std::string &entry_point_name, - const std::vector &args) -{ +void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point_index, + const std::string &entry_point_name, + const std::vector &args) { // Keep track of the descriptor set needed to bind this kernel's inputs / outputs DescriptorSet descriptor_set; @@ -1693,7 +1692,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point // Add bindings for all device buffers as uniform buffers for (const auto &arg : args) { if (arg.is_buffer) { - + // Declare the runtime array (which maps directly to the Halide device buffer) SpvId element_type_id = builder.declare_type(arg.type); SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); @@ -1707,7 +1706,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point const std::string struct_name = std::string("_struct") + entry_point_name + std::string("_b") + std::to_string(binding_counter); SpvId struct_type_id = builder.declare_struct(struct_name, struct_member_types); - // Declare a pointer to the struct as a global variable + // Declare a pointer to the struct as a global variable SpvStorageClass storage_class = SpvStorageClassUniform; SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, storage_class); SpvId param_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, storage_class); @@ -1784,149 +1783,149 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // Declare all parameters -- scalar args and device buffers declare_device_args(entry_point_index, name, args); -/* - // TODO: only add the SIMT intrinsics used - SpvFactory::Variables entry_point_variables; - auto intrinsics = {"WorkgroupId", "LocalInvocationId"}; - for (const std::string &intrinsic_name : intrinsics) { - - // The builtins are pointers to vec3 - SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); - SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, SpvStorageClassInput); - SpvId intrinsic_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, SpvStorageClassInput); - SpvId intrinsic_loaded_id = builder.reserve_id(); - 
builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id)); - symbol_table.push(intrinsic_name, {intrinsic_loaded_id, SpvStorageClassInput}); - - // Annotate that this is the specific builtin - SpvBuiltIn built_in_kind = starts_with(intrinsic_name, "Workgroup") ? SpvBuiltInWorkgroupId : SpvBuiltInLocalInvocationId; - SpvBuilder::Literals annotation_literals = {(uint32_t)built_in_kind}; - builder.add_annotation(intrinsic_id, SpvDecorationBuiltIn, annotation_literals); - - // Add the builtin to the interface - entry_point_variables.push_back(intrinsic_id); - } - - // Add the entry point with the appropriate execution model - // NOTE: exec_model must be GLCompute to work with Vulkan ... Kernel is only supported in OpenCL - uint32_t current_entry_point = builder.current_module().entry_point_count(); - builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); -*/ - -/* - // GLSL-style: each input buffer is a runtime array in a buffer struct - // All other params get passed in as a single uniform block - // First, need to count scalar parameters to construct the uniform struct - SpvBuilder::StructMemberTypes param_struct_members; - for (const auto &arg : args) { - if (!arg.is_buffer) { - SpvId arg_type_id = builder.declare_type(arg.type); - param_struct_members.push_back(arg_type_id); + /* + // TODO: only add the SIMT intrinsics used + SpvFactory::Variables entry_point_variables; + auto intrinsics = {"WorkgroupId", "LocalInvocationId"}; + for (const std::string &intrinsic_name : intrinsics) { + + // The builtins are pointers to vec3 + SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); + SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, SpvStorageClassInput); + SpvId intrinsic_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, SpvStorageClassInput); + SpvId intrinsic_loaded_id = builder.reserve_id(); + builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id)); + symbol_table.push(intrinsic_name, {intrinsic_loaded_id, SpvStorageClassInput}); + + // Annotate that this is the specific builtin + SpvBuiltIn built_in_kind = starts_with(intrinsic_name, "Workgroup") ? SpvBuiltInWorkgroupId : SpvBuiltInLocalInvocationId; + SpvBuilder::Literals annotation_literals = {(uint32_t)built_in_kind}; + builder.add_annotation(intrinsic_id, SpvDecorationBuiltIn, annotation_literals); + + // Add the builtin to the interface + entry_point_variables.push_back(intrinsic_id); } - } - // Add a binding for a uniform buffer packed with all scalar args - uint32_t binding_counter = 0; - if (!param_struct_members.empty()) { - const std::string struct_name = std::string("_struct") + name + std::string("_args"); - SpvId param_struct_type_id = builder.declare_struct(struct_name, param_struct_members); - - // Add a decoration describing the offset for each parameter struct member - uint32_t param_member_index = 0; - uint32_t param_member_offset = 0; + // Add the entry point with the appropriate execution model + // NOTE: exec_model must be GLCompute to work with Vulkan ... 
Kernel is only supported in OpenCL + uint32_t current_entry_point = builder.current_module().entry_point_count(); + builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); + */ + + /* + // GLSL-style: each input buffer is a runtime array in a buffer struct + // All other params get passed in as a single uniform block + // First, need to count scalar parameters to construct the uniform struct + SpvBuilder::StructMemberTypes param_struct_members; for (const auto &arg : args) { if (!arg.is_buffer) { - SpvBuilder::Literals param_offset_literals = {param_member_offset}; - builder.add_struct_annotation(param_struct_type_id, param_member_index, SpvDecorationOffset, param_offset_literals); - param_member_offset += arg.type.bytes(); - param_member_index++; - } - } - - // Add a Block decoration for the parameter pack itself - builder.add_annotation(param_struct_type_id, SpvDecorationBlock); - - // Add a variable for the parameter pack - const std::string param_pack_var_name = std::string("_var") + name + std::string("_args"); - SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); - SpvId param_pack_var_id = builder.declare_global_variable(param_pack_var_name, param_pack_ptr_type_id, SpvStorageClassUniform); - - // We always pass in the parameter pack as the first binding - SpvBuilder::Literals binding_index = {0}; - SpvBuilder::Literals dset_index = {current_entry_point}; - builder.add_annotation(param_pack_var_id, SpvDecorationDescriptorSet, dset_index); - builder.add_annotation(param_pack_var_id, SpvDecorationBinding, binding_index); - descriptor_set.uniform_buffer_count++; - binding_counter++; - - // Declare all the args with appropriate offsets into the parameter struct - uint32_t scalar_index = 0; - for (const auto &arg : args) { - if (!arg.is_buffer) { - SpvId arg_type_id = builder.declare_type(arg.type); - SpvId access_index_id = builder.declare_constant(UInt(32), &scalar_index); - SpvId pointer_type_id = builder.declare_pointer_type(arg_type_id, SpvStorageClassUniform); - SpvFactory::Indices access_indices = {access_index_id}; - SpvId access_chain_id = builder.declare_access_chain(pointer_type_id, param_pack_var_id, access_indices); - scalar_index++; - - SpvId param_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::load(arg_type_id, param_id, access_chain_id)); - symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); + param_struct_members.push_back(arg_type_id); } } - } - // Add bindings for all device buffers - for (const auto &arg : args) { - if (arg.is_buffer) { - - // Add required extension support for storage types - if (arg.type.is_int_or_uint()) { - if (arg.type.bits() == 8) { - builder.require_extension("SPV_KHR_8bit_storage"); - } else if (arg.type.bits() == 16) { - builder.require_extension("SPV_KHR_16bit_storage"); + // Add a binding for a uniform buffer packed with all scalar args + uint32_t binding_counter = 0; + if (!param_struct_members.empty()) { + const std::string struct_name = std::string("_struct") + name + std::string("_args"); + SpvId param_struct_type_id = builder.declare_struct(struct_name, param_struct_members); + + // Add a decoration describing the offset for each parameter struct member + uint32_t param_member_index = 0; + uint32_t param_member_offset = 0; + for (const auto &arg : args) { + if (!arg.is_buffer) { + SpvBuilder::Literals param_offset_literals = {param_member_offset}; + builder.add_struct_annotation(param_struct_type_id, 
param_member_index, SpvDecorationOffset, param_offset_literals); + param_member_offset += arg.type.bytes(); + param_member_index++; } } - SpvId element_type_id = builder.declare_type(arg.type); - SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); - SpvBuilder::StructMemberTypes struct_member_types = {runtime_arr_type_id}; - const std::string struct_name = std::string("_struct") + name + std::string("_b") + std::to_string(binding_counter); - SpvId struct_type_id = builder.declare_struct(struct_name, struct_member_types); - SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, SpvStorageClassUniform); - SpvId param_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, SpvStorageClassUniform); + // Add a Block decoration for the parameter pack itself + builder.add_annotation(param_struct_type_id, SpvDecorationBlock); - // Annotate the struct to indicate it's passed in a GLSL-style buffer block - builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); + // Add a variable for the parameter pack + const std::string param_pack_var_name = std::string("_var") + name + std::string("_args"); + SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); + SpvId param_pack_var_id = builder.declare_global_variable(param_pack_var_name, param_pack_ptr_type_id, SpvStorageClassUniform); - // Annotate the array with its stride - SpvBuilder::Literals array_stride = {(uint32_t)(arg.type.bytes())}; - builder.add_annotation(runtime_arr_type_id, SpvDecorationArrayStride, array_stride); + // We always pass in the parameter pack as the first binding + SpvBuilder::Literals binding_index = {0}; + SpvBuilder::Literals dset_index = {current_entry_point}; + builder.add_annotation(param_pack_var_id, SpvDecorationDescriptorSet, dset_index); + builder.add_annotation(param_pack_var_id, SpvDecorationBinding, binding_index); + descriptor_set.uniform_buffer_count++; + binding_counter++; + + // Declare all the args with appropriate offsets into the parameter struct + uint32_t scalar_index = 0; + for (const auto &arg : args) { + if (!arg.is_buffer) { + + SpvId arg_type_id = builder.declare_type(arg.type); + SpvId access_index_id = builder.declare_constant(UInt(32), &scalar_index); + SpvId pointer_type_id = builder.declare_pointer_type(arg_type_id, SpvStorageClassUniform); + SpvFactory::Indices access_indices = {access_index_id}; + SpvId access_chain_id = builder.declare_access_chain(pointer_type_id, param_pack_var_id, access_indices); + scalar_index++; + + SpvId param_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(arg_type_id, param_id, access_chain_id)); + symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); + } + } + } - // Annotate the offset for the array - SpvBuilder::Literals zero_literal = {uint32_t(0)}; - builder.add_struct_annotation(struct_type_id, 0, SpvDecorationOffset, zero_literal); + // Add bindings for all device buffers + for (const auto &arg : args) { + if (arg.is_buffer) { + + // Add required extension support for storage types + if (arg.type.is_int_or_uint()) { + if (arg.type.bits() == 8) { + builder.require_extension("SPV_KHR_8bit_storage"); + } else if (arg.type.bits() == 16) { + builder.require_extension("SPV_KHR_16bit_storage"); + } + } - // Set DescriptorSet and Binding - SpvBuilder::Literals dset_index = {current_entry_point}; - SpvBuilder::Literals binding_index = {uint32_t(binding_counter++)}; - builder.add_annotation(param_id, SpvDecorationDescriptorSet, 
dset_index); - builder.add_annotation(param_id, SpvDecorationBinding, binding_index); - symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); - descriptor_set.storage_buffer_count++; + SpvId element_type_id = builder.declare_type(arg.type); + SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); + SpvBuilder::StructMemberTypes struct_member_types = {runtime_arr_type_id}; + const std::string struct_name = std::string("_struct") + name + std::string("_b") + std::to_string(binding_counter); + SpvId struct_type_id = builder.declare_struct(struct_name, struct_member_types); + SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, SpvStorageClassUniform); + SpvId param_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, SpvStorageClassUniform); + + // Annotate the struct to indicate it's passed in a GLSL-style buffer block + builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); + + // Annotate the array with its stride + SpvBuilder::Literals array_stride = {(uint32_t)(arg.type.bytes())}; + builder.add_annotation(runtime_arr_type_id, SpvDecorationArrayStride, array_stride); + + // Annotate the offset for the array + SpvBuilder::Literals zero_literal = {uint32_t(0)}; + builder.add_struct_annotation(struct_type_id, 0, SpvDecorationOffset, zero_literal); + + // Set DescriptorSet and Binding + SpvBuilder::Literals dset_index = {current_entry_point}; + SpvBuilder::Literals binding_index = {uint32_t(binding_counter++)}; + builder.add_annotation(param_id, SpvDecorationDescriptorSet, dset_index); + builder.add_annotation(param_id, SpvDecorationBinding, binding_index); + symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); + descriptor_set.storage_buffer_count++; + } } - } - descriptor_set_table.push_back(descriptor_set); -*/ + descriptor_set_table.push_back(descriptor_set); + */ s.accept(this); // Insert return statement end delimiter kernel_func.tail_block().add_instruction(SpvFactory::return_stmt()); - // Declare the workgroup size now that we've traversed the kernel + // Declare the workgroup size now that we've traversed the kernel declare_workgroup_size(kernel_func_id); // Pop scope @@ -1991,7 +1990,7 @@ void CodeGen_Vulkan_Dev::dump() { // Print the contents of the compiled SPIR-V module emitter.dump(); - + // Skip the header and only output the SPIR-V binary const uint32_t *decode = (const uint32_t *)(module.data()); uint32_t header_word_count = decode[0]; diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 1141efb6322c..745163fd1932 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -98,7 +98,7 @@ SpvOp SpvInstruction::op_code() const { return contents->op_code; } -const void* SpvInstruction::data(uint32_t index) const { +const void *SpvInstruction::data(uint32_t index) const { check_defined(); return &(contents->operands[index]); } @@ -113,7 +113,7 @@ SpvValueType SpvInstruction::value_type(uint32_t index) const { return contents->value_types[index]; } -const SpvInstruction::Operands& SpvInstruction::operands() const { +const SpvInstruction::Operands &SpvInstruction::operands() const { check_defined(); return contents->operands; } @@ -397,12 +397,12 @@ SpvInstruction SpvFunction::declaration() const { return contents->declaration; } -const SpvFunction::Blocks& SpvFunction::blocks() const { +const SpvFunction::Blocks &SpvFunction::blocks() const { check_defined(); return contents->blocks; } -const SpvFunction::Parameters& SpvFunction::parameters() const { +const SpvFunction::Parameters 
&SpvFunction::parameters() const { check_defined(); return contents->parameters; } @@ -564,7 +564,7 @@ SpvModule::Imports SpvModule::imports() const { check_defined(); SpvModule::Imports results; results.reserve(contents->imports.size()); - for(const SpvModuleContents::Imports::value_type& v : contents->imports) { + for (const SpvModuleContents::Imports::value_type &v : contents->imports) { SpvModule::ImportDefinition definition = {v.second, v.first}; results.push_back(definition); } @@ -575,7 +575,7 @@ SpvModule::Extensions SpvModule::extensions() const { check_defined(); SpvModule::Extensions results; results.reserve(contents->extensions.size()); - for(const SpvModuleContents::Extensions::value_type& v: contents->extensions) { + for (const SpvModuleContents::Extensions::value_type &v : contents->extensions) { results.push_back(v); } return results; @@ -585,7 +585,7 @@ SpvModule::Capabilities SpvModule::capabilities() const { check_defined(); SpvModule::Capabilities results; results.reserve(contents->capabilities.size()); - for(const SpvModuleContents::Capabilities::value_type& v: contents->capabilities) { + for (const SpvModuleContents::Capabilities::value_type &v : contents->capabilities) { results.push_back(v); } return results; @@ -2577,23 +2577,22 @@ void destroy(const SpvModuleContents *c) { // -- - -std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ +std::ostream &operator<<(std::ostream &stream, const SpvModule &module) { if (!module.is_defined()) { stream << "(undefined)"; return stream; } stream << "; SPIR-V\n"; - stream << "; Version: " - << std::to_string(spirv_major_version(module.version_format())) << "." + stream << "; Version: " + << std::to_string(spirv_major_version(module.version_format())) << "." << std::to_string(spirv_minor_version(module.version_format())) << "\n"; stream << "; Generator: Khronos; 0\n"; stream << "; Bound: " << std::to_string(module.binding_count()) << "\n"; - stream << "; Schema: 0\n"; // reserved for future use + stream << "; Schema: 0\n"; // reserved for future use SpvModule::Capabilities capabilities = module.capabilities(); - if(!capabilities.empty()) { + if (!capabilities.empty()) { stream << "\n"; stream << "; Capabilities\n"; for (const SpvCapability &value : capabilities) { @@ -2603,7 +2602,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ } SpvModule::Extensions extensions = module.extensions(); - if(!extensions.empty()) { + if (!extensions.empty()) { stream << "\n"; stream << "; Extensions\n"; for (const std::string &value : extensions) { @@ -2613,7 +2612,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ } SpvModule::Imports imports = module.imports(); - if(!imports.empty()) { + if (!imports.empty()) { stream << "\n"; stream << "; Extended Instruction Set Imports\n"; for (const SpvModule::Imports::value_type &v : imports) { @@ -2627,11 +2626,11 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ stream << "; Memory Model\n"; stream << memory_model; - if(module.entry_point_count() > 0) { + if (module.entry_point_count() > 0) { stream << "\n"; stream << "; Entry Points\n"; SpvModule::EntryPointNames entry_point_names = module.entry_point_names(); - for( const std::string& name : entry_point_names) { + for (const std::string &name : entry_point_names) { SpvInstruction inst = module.entry_point(name); stream << "; " << name << "\n"; stream << inst; @@ -2642,7 +2641,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ 
stream << inst; } - if(!module.debug_source().empty() || !module.debug_symbols().empty()) { + if (!module.debug_source().empty() || !module.debug_symbols().empty()) { stream << "\n"; stream << "; Debug Information\n"; } @@ -2653,7 +2652,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ stream << inst; } - if(!module.annotations().empty()) { + if (!module.annotations().empty()) { stream << "\n"; stream << "; Annotations\n"; for (const SpvInstruction &inst : module.annotations()) { @@ -2661,7 +2660,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ } } - if(!module.type_definitions().empty()) { + if (!module.type_definitions().empty()) { stream << "\n"; stream << "; Type Definitions\n"; for (const SpvInstruction &inst : module.type_definitions()) { @@ -2669,7 +2668,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ } } - if(!module.global_constants().empty()) { + if (!module.global_constants().empty()) { stream << "\n"; stream << "; Global Constants\n"; for (const SpvInstruction &inst : module.global_constants()) { @@ -2677,7 +2676,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ } } - if(!module.global_variables().empty()) { + if (!module.global_variables().empty()) { stream << "\n"; stream << "; Global Variables\n"; for (const SpvInstruction &inst : module.global_variables()) { @@ -2685,7 +2684,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvModule &module){ } } - if(!module.function_definitions().empty()) { + if (!module.function_definitions().empty()) { stream << "\n"; stream << "; Function Definitions\n"; for (const SpvFunction &func : module.function_definitions()) { @@ -2717,7 +2716,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvBlock &block) { if (!block.is_defined()) { stream << "(undefined)"; return stream; - } + } SpvInstruction label = SpvFactory::label(block.id()); stream << label; @@ -2731,7 +2730,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvBlock &block) { return stream; } -std::ostream &operator<<(std::ostream &stream, const SpvInstruction & inst) { +std::ostream &operator<<(std::ostream &stream, const SpvInstruction &inst) { if (!inst.is_defined()) { stream << "(undefined)"; return stream; @@ -2741,1322 +2740,1322 @@ std::ostream &operator<<(std::ostream &stream, const SpvInstruction & inst) { stream << std::string("%") << std::to_string(inst.result_id()); stream << " = "; } - - switch(inst.op_code()) { - case SpvOpNop: { - stream << "OpNop"; - break; - } - case SpvOpUndef: { - stream << "OpUndef"; - break; - } - case SpvOpSourceContinued: { - stream << "OpSourceContinued"; - break; - } - case SpvOpSource: { - stream << "OpSource"; - break; - } - case SpvOpSourceExtension: { - stream << "OpSourceExtension"; - break; - } - case SpvOpName: { - stream << "OpName"; - break; - } - case SpvOpMemberName: { - stream << "OpMemberName"; - break; - } - case SpvOpString: { - stream << "OpString"; - break; - } - case SpvOpLine: { - stream << "OpLine"; - break; - } - case SpvOpExtension: { - stream << "OpExtension"; - break; - } - case SpvOpExtInstImport: { - stream << "OpExtInstImport"; - break; - } - case SpvOpExtInst: { - stream << "OpExtInst"; - break; - } - case SpvOpMemoryModel: { - stream << "OpMemoryModel"; - break; - } - case SpvOpEntryPoint: { - stream << "OpEntryPoint"; - break; - } - case SpvOpExecutionMode: { - stream << "OpExecutionMode"; - break; - } - case SpvOpCapability: { - stream << "OpCapability"; - break; - } - case 
SpvOpTypeVoid: { - stream << "OpTypeVoid"; - break; - } - case SpvOpTypeBool: { - stream << "OpTypeBool"; - break; - } - case SpvOpTypeInt: { - stream << "OpTypeInt"; - break; - } - case SpvOpTypeFloat: { - stream << "OpTypeFloat"; - break; - } - case SpvOpTypeVector: { - stream << "OpTypeVector"; - break; - } - case SpvOpTypeMatrix: { - stream << "OpTypeMatrix"; - break; - } - case SpvOpTypeImage: { - stream << "OpTypeImage"; - break; - } - case SpvOpTypeSampler: { - stream << "OpTypeSampler"; - break; - } - case SpvOpTypeSampledImage: { - stream << "OpTypeSampledImage"; - break; - } - case SpvOpTypeArray: { - stream << "OpTypeArray"; - break; - } - case SpvOpTypeRuntimeArray: { - stream << "OpTypeRuntimeArray"; - break; - } - case SpvOpTypeStruct: { - stream << "OpTypeStruct"; - break; - } - case SpvOpTypeOpaque: { - stream << "OpTypeOpaque"; - break; - } - case SpvOpTypePointer: { - stream << "OpTypePointer"; - break; - } - case SpvOpTypeFunction: { - stream << "OpTypeFunction"; - break; - } - case SpvOpTypeEvent: { - stream << "OpTypeEvent"; - break; - } - case SpvOpTypeDeviceEvent: { - stream << "OpTypeDeviceEvent"; - break; - } - case SpvOpTypeReserveId: { - stream << "OpTypeReserveId"; - break; - } - case SpvOpTypeQueue: { - stream << "OpTypeQueue"; - break; - } - case SpvOpTypePipe: { - stream << "OpTypePipe"; - break; - } - case SpvOpTypeForwardPointer: { - stream << "OpTypeForwardPointer"; - break; - } - case SpvOpConstantTrue: { - stream << "OpConstantTrue"; - break; - } - case SpvOpConstantFalse: { - stream << "OpConstantFalse"; - break; - } - case SpvOpConstant: { - stream << "OpConstant"; - break; - } - case SpvOpConstantComposite: { - stream << "OpConstantComposite"; - break; - } - case SpvOpConstantSampler: { - stream << "OpConstantSampler"; - break; - } - case SpvOpConstantNull: { - stream << "OpConstantNull"; - break; - } - case SpvOpSpecConstantTrue: { - stream << "OpSpecConstantTrue"; - break; - } - case SpvOpSpecConstantFalse: { - stream << "OpSpecConstantFalse"; - break; - } - case SpvOpSpecConstant: { - stream << "OpSpecConstant"; - break; - } - case SpvOpSpecConstantComposite: { - stream << "OpSpecConstantComposite"; - break; - } - case SpvOpSpecConstantOp: { - stream << "OpSpecConstantOp"; - break; - } - case SpvOpFunction: { - stream << "OpFunction"; - break; - } - case SpvOpFunctionParameter: { - stream << "OpFunctionParameter"; - break; - } - case SpvOpFunctionEnd: { - stream << "OpFunctionEnd"; - break; - } - case SpvOpFunctionCall: { - stream << "OpFunctionCall"; - break; - } - case SpvOpVariable: { - stream << "OpVariable"; - break; - } - case SpvOpImageTexelPointer: { - stream << "OpImageTexelPointer"; - break; - } - case SpvOpLoad: { - stream << "OpLoad"; - break; - } - case SpvOpStore: { - stream << "OpStore"; - break; - } - case SpvOpCopyMemory: { - stream << "OpCopyMemory"; - break; - } - case SpvOpCopyMemorySized: { - stream << "OpCopyMemorySized"; - break; - } - case SpvOpAccessChain: { - stream << "OpAccessChain"; - break; - } - case SpvOpInBoundsAccessChain: { - stream << "OpInBoundsAccessChain"; - break; - } - case SpvOpPtrAccessChain: { - stream << "OpPtrAccessChain"; - break; - } - case SpvOpArrayLength: { - stream << "OpArrayLength"; - break; - } - case SpvOpGenericPtrMemSemantics: { - stream << "OpGenericPtrMemSemantics"; - break; - } - case SpvOpInBoundsPtrAccessChain: { - stream << "OpInBoundsPtrAccessChain"; - break; - } - case SpvOpDecorate: { - stream << "OpDecorate"; - break; - } - case SpvOpMemberDecorate: { - stream << 
"OpMemberDecorate"; - break; - } - case SpvOpDecorationGroup: { - stream << "OpDecorationGroup"; - break; - } - case SpvOpGroupDecorate: { - stream << "OpGroupDecorate"; - break; - } - case SpvOpGroupMemberDecorate: { - stream << "OpGroupMemberDecorate"; - break; - } - case SpvOpVectorExtractDynamic: { - stream << "OpVectorExtractDynamic"; - break; - } - case SpvOpVectorInsertDynamic: { - stream << "OpVectorInsertDynamic"; - break; - } - case SpvOpVectorShuffle: { - stream << "OpVectorShuffle"; - break; - } - case SpvOpCompositeConstruct: { - stream << "OpCompositeConstruct"; - break; - } - case SpvOpCompositeExtract: { - stream << "OpCompositeExtract"; - break; - } - case SpvOpCompositeInsert: { - stream << "OpCompositeInsert"; - break; - } - case SpvOpCopyObject: { - stream << "OpCopyObject"; - break; - } - case SpvOpTranspose: { - stream << "OpTranspose"; - break; - } - case SpvOpSampledImage: { - stream << "OpSampledImage"; - break; - } - case SpvOpImageSampleImplicitLod: { - stream << "OpImageSampleImplicitLod"; - break; - } - case SpvOpImageSampleExplicitLod: { - stream << "OpImageSampleExplicitLod"; - break; - } - case SpvOpImageSampleDrefImplicitLod: { - stream << "OpImageSampleDrefImplicitLod"; - break; - } - case SpvOpImageSampleDrefExplicitLod: { - stream << "OpImageSampleDrefExplicitLod"; - break; - } - case SpvOpImageSampleProjImplicitLod: { - stream << "OpImageSampleProjImplicitLod"; - break; - } - case SpvOpImageSampleProjExplicitLod: { - stream << "OpImageSampleProjExplicitLod"; - break; - } - case SpvOpImageSampleProjDrefImplicitLod: { - stream << "OpImageSampleProjDrefImplicitLod"; - break; - } - case SpvOpImageSampleProjDrefExplicitLod: { - stream << "OpImageSampleProjDrefExplicitLod"; - break; - } - case SpvOpImageFetch: { - stream << "OpImageFetch"; - break; - } - case SpvOpImageGather: { - stream << "OpImageGather"; - break; - } - case SpvOpImageDrefGather: { - stream << "OpImageDrefGather"; - break; - } - case SpvOpImageRead: { - stream << "OpImageRead"; - break; - } - case SpvOpImageWrite: { - stream << "OpImageWrite"; - break; - } - case SpvOpImage: { - stream << "OpImage"; - break; - } - case SpvOpImageQueryFormat: { - stream << "OpImageQueryFormat"; - break; - } - case SpvOpImageQueryOrder: { - stream << "OpImageQueryOrder"; - break; - } - case SpvOpImageQuerySizeLod: { - stream << "OpImageQuerySizeLod"; - break; - } - case SpvOpImageQuerySize: { - stream << "OpImageQuerySize"; - break; - } - case SpvOpImageQueryLod: { - stream << "OpImageQueryLod"; - break; - } - case SpvOpImageQueryLevels: { - stream << "OpImageQueryLevels"; - break; - } - case SpvOpImageQuerySamples: { - stream << "OpImageQuerySamples"; - break; - } - case SpvOpConvertFToU: { - stream << "OpConvertFToU"; - break; - } - case SpvOpConvertFToS: { - stream << "OpConvertFToS"; - break; - } - case SpvOpConvertSToF: { - stream << "OpConvertSToF"; - break; - } - case SpvOpConvertUToF: { - stream << "OpConvertUToF"; - break; - } - case SpvOpUConvert: { - stream << "OpUConvert"; - break; - } - case SpvOpSConvert: { - stream << "OpSConvert"; - break; - } - case SpvOpFConvert: { - stream << "OpFConvert"; - break; - } - case SpvOpConvertPtrToU: { - stream << "OpConvertPtrToU"; - break; - } - case SpvOpSatConvertSToU: { - stream << "OpSatConvertSToU"; - break; - } - case SpvOpSatConvertUToS: { - stream << "OpSatConvertUToS"; - break; - } - case SpvOpConvertUToPtr: { - stream << "OpConvertUToPtr"; - break; - } - case SpvOpPtrCastToGeneric: { - stream << "OpPtrCastToGeneric"; - break; - } - case 
SpvOpGenericCastToPtr: { - stream << "OpGenericCastToPtr"; - break; - } - case SpvOpGenericCastToPtrExplicit: { - stream << "OpGenericCastToPtrExplicit"; - break; - } - case SpvOpBitcast: { - stream << "OpBitcast"; - break; - } - case SpvOpSNegate: { - stream << "OpSNegate"; - break; - } - case SpvOpFNegate: { - stream << "OpFNegate"; - break; - } - case SpvOpIAdd: { - stream << "OpIAdd"; - break; - } - case SpvOpFAdd: { - stream << "OpFAdd"; - break; - } - case SpvOpISub: { - stream << "OpISub"; - break; - } - case SpvOpFSub: { - stream << "OpFSub"; - break; - } - case SpvOpIMul: { - stream << "OpIMul"; - break; - } - case SpvOpFMul: { - stream << "OpFMul"; - break; - } - case SpvOpUDiv: { - stream << "OpUDiv"; - break; - } - case SpvOpSDiv: { - stream << "OpSDiv"; - break; - } - case SpvOpFDiv: { - stream << "OpFDiv"; - break; - } - case SpvOpUMod: { - stream << "OpUMod"; - break; - } - case SpvOpSRem: { - stream << "OpSRem"; - break; - } - case SpvOpSMod: { - stream << "OpSMod"; - break; - } - case SpvOpFRem: { - stream << "OpFRem"; - break; - } - case SpvOpFMod: { - stream << "OpFMod"; - break; - } - case SpvOpVectorTimesScalar: { - stream << "OpVectorTimesScalar"; - break; - } - case SpvOpMatrixTimesScalar: { - stream << "OpMatrixTimesScalar"; - break; - } - case SpvOpVectorTimesMatrix: { - stream << "OpVectorTimesMatrix"; - break; - } - case SpvOpMatrixTimesVector: { - stream << "OpMatrixTimesVector"; - break; - } - case SpvOpMatrixTimesMatrix: { - stream << "OpMatrixTimesMatrix"; - break; - } - case SpvOpOuterProduct: { - stream << "OpOuterProduct"; - break; - } - case SpvOpDot: { - stream << "OpDot"; - break; - } - case SpvOpIAddCarry: { - stream << "OpIAddCarry"; - break; - } - case SpvOpISubBorrow: { - stream << "OpISubBorrow"; - break; - } - case SpvOpUMulExtended: { - stream << "OpUMulExtended"; - break; - } - case SpvOpSMulExtended: { - stream << "OpSMulExtended"; - break; - } - case SpvOpAny: { - stream << "OpAny"; - break; - } - case SpvOpAll: { - stream << "OpAll"; - break; - } - case SpvOpIsNan: { - stream << "OpIsNan"; - break; - } - case SpvOpIsInf: { - stream << "OpIsInf"; - break; - } - case SpvOpIsFinite: { - stream << "OpIsFinite"; - break; - } - case SpvOpIsNormal: { - stream << "OpIsNormal"; - break; - } - case SpvOpSignBitSet: { - stream << "OpSignBitSet"; - break; - } - case SpvOpLessOrGreater: { - stream << "OpLessOrGreater"; - break; - } - case SpvOpOrdered: { - stream << "OpOrdered"; - break; - } - case SpvOpUnordered: { - stream << "OpUnordered"; - break; - } - case SpvOpLogicalEqual: { - stream << "OpLogicalEqual"; - break; - } - case SpvOpLogicalNotEqual: { - stream << "OpLogicalNotEqual"; - break; - } - case SpvOpLogicalOr: { - stream << "OpLogicalOr"; - break; - } - case SpvOpLogicalAnd: { - stream << "OpLogicalAnd"; - break; - } - case SpvOpLogicalNot: { - stream << "OpLogicalNot"; - break; - } - case SpvOpSelect: { - stream << "OpSelect"; - break; - } - case SpvOpIEqual: { - stream << "OpIEqual"; - break; - } - case SpvOpINotEqual: { - stream << "OpINotEqual"; - break; - } - case SpvOpUGreaterThan: { - stream << "OpUGreaterThan"; - break; - } - case SpvOpSGreaterThan: { - stream << "OpSGreaterThan"; - break; - } - case SpvOpUGreaterThanEqual: { - stream << "OpUGreaterThanEqual"; - break; - } - case SpvOpSGreaterThanEqual: { - stream << "OpSGreaterThanEqual"; - break; - } - case SpvOpULessThan: { - stream << "OpULessThan"; - break; - } - case SpvOpSLessThan: { - stream << "OpSLessThan"; - break; - } - case SpvOpULessThanEqual: { - stream << 
"OpULessThanEqual"; - break; - } - case SpvOpSLessThanEqual: { - stream << "OpSLessThanEqual"; - break; - } - case SpvOpFOrdEqual: { - stream << "OpFOrdEqual"; - break; - } - case SpvOpFUnordEqual: { - stream << "OpFUnordEqual"; - break; - } - case SpvOpFOrdNotEqual: { - stream << "OpFOrdNotEqual"; - break; - } - case SpvOpFUnordNotEqual: { - stream << "OpFUnordNotEqual"; - break; - } - case SpvOpFOrdLessThan: { - stream << "OpFOrdLessThan"; - break; - } - case SpvOpFUnordLessThan: { - stream << "OpFUnordLessThan"; - break; - } - case SpvOpFOrdGreaterThan: { - stream << "OpFOrdGreaterThan"; - break; - } - case SpvOpFUnordGreaterThan: { - stream << "OpFUnordGreaterThan"; - break; - } - case SpvOpFOrdLessThanEqual: { - stream << "OpFOrdLessThanEqual"; - break; - } - case SpvOpFUnordLessThanEqual: { - stream << "OpFUnordLessThanEqual"; - break; - } - case SpvOpFOrdGreaterThanEqual: { - stream << "OpFOrdGreaterThanEqual"; - break; - } - case SpvOpFUnordGreaterThanEqual: { - stream << "OpFUnordGreaterThanEqual"; - break; - } - case SpvOpShiftRightLogical: { - stream << "OpShiftRightLogical"; - break; - } - case SpvOpShiftRightArithmetic: { - stream << "OpShiftRightArithmetic"; - break; - } - case SpvOpShiftLeftLogical: { - stream << "OpShiftLeftLogical"; - break; - } - case SpvOpBitwiseOr: { - stream << "OpBitwiseOr"; - break; - } - case SpvOpBitwiseXor: { - stream << "OpBitwiseXor"; - break; - } - case SpvOpBitwiseAnd: { - stream << "OpBitwiseAnd"; - break; - } - case SpvOpNot: { - stream << "OpNot"; - break; - } - case SpvOpBitFieldInsert: { - stream << "OpBitFieldInsert"; - break; - } - case SpvOpBitFieldSExtract: { - stream << "OpBitFieldSExtract"; - break; - } - case SpvOpBitFieldUExtract: { - stream << "OpBitFieldUExtract"; - break; - } - case SpvOpBitReverse: { - stream << "OpBitReverse"; - break; - } - case SpvOpBitCount: { - stream << "OpBitCount"; - break; - } - case SpvOpDPdx: { - stream << "OpDPdx"; - break; - } - case SpvOpDPdy: { - stream << "OpDPdy"; - break; - } - case SpvOpFwidth: { - stream << "OpFwidth"; - break; - } - case SpvOpDPdxFine: { - stream << "OpDPdxFine"; - break; - } - case SpvOpDPdyFine: { - stream << "OpDPdyFine"; - break; - } - case SpvOpFwidthFine: { - stream << "OpFwidthFine"; - break; - } - case SpvOpDPdxCoarse: { - stream << "OpDPdxCoarse"; - break; - } - case SpvOpDPdyCoarse: { - stream << "OpDPdyCoarse"; - break; - } - case SpvOpFwidthCoarse: { - stream << "OpFwidthCoarse"; - break; - } - case SpvOpEmitVertex: { - stream << "OpEmitVertex"; - break; - } - case SpvOpEndPrimitive: { - stream << "OpEndPrimitive"; - break; - } - case SpvOpEmitStreamVertex: { - stream << "OpEmitStreamVertex"; - break; - } - case SpvOpEndStreamPrimitive: { - stream << "OpEndStreamPrimitive"; - break; - } - case SpvOpControlBarrier: { - stream << "OpControlBarrier"; - break; - } - case SpvOpMemoryBarrier: { - stream << "OpMemoryBarrier"; - break; - } - case SpvOpAtomicLoad: { - stream << "OpAtomicLoad"; - break; - } - case SpvOpAtomicStore: { - stream << "OpAtomicStore"; - break; - } - case SpvOpAtomicExchange: { - stream << "OpAtomicExchange"; - break; - } - case SpvOpAtomicCompareExchange: { - stream << "OpAtomicCompareExchange"; - break; - } - case SpvOpAtomicCompareExchangeWeak: { - stream << "OpAtomicCompareExchangeWeak"; - break; - } - case SpvOpAtomicIIncrement: { - stream << "OpAtomicIIncrement"; - break; - } - case SpvOpAtomicIDecrement: { - stream << "OpAtomicIDecrement"; - break; - } - case SpvOpAtomicIAdd: { - stream << "OpAtomicIAdd"; - break; - } - case 
SpvOpAtomicISub: { - stream << "OpAtomicISub"; - break; - } - case SpvOpAtomicSMin: { - stream << "OpAtomicSMin"; - break; - } - case SpvOpAtomicUMin: { - stream << "OpAtomicUMin"; - break; - } - case SpvOpAtomicSMax: { - stream << "OpAtomicSMax"; - break; - } - case SpvOpAtomicUMax: { - stream << "OpAtomicUMax"; - break; - } - case SpvOpAtomicAnd: { - stream << "OpAtomicAnd"; - break; - } - case SpvOpAtomicOr: { - stream << "OpAtomicOr"; - break; - } - case SpvOpAtomicXor: { - stream << "OpAtomicXor"; - break; - } - case SpvOpPhi: { - stream << "OpPhi"; - break; - } - case SpvOpLoopMerge: { - stream << "OpLoopMerge"; - break; - } - case SpvOpSelectionMerge: { - stream << "OpSelectionMerge"; - break; - } - case SpvOpLabel: { - stream << "OpLabel"; - break; - } - case SpvOpBranch: { - stream << "OpBranch"; - break; - } - case SpvOpBranchConditional: { - stream << "OpBranchConditional"; - break; - } - case SpvOpSwitch: { - stream << "OpSwitch"; - break; - } - case SpvOpKill: { - stream << "OpKill"; - break; - } - case SpvOpReturn: { - stream << "OpReturn"; - break; - } - case SpvOpReturnValue: { - stream << "OpReturnValue"; - break; - } - case SpvOpUnreachable: { - stream << "OpUnreachable"; - break; - } - case SpvOpLifetimeStart: { - stream << "OpLifetimeStart"; - break; - } - case SpvOpLifetimeStop: { - stream << "OpLifetimeStop"; - break; - } - case SpvOpGroupAsyncCopy: { - stream << "OpGroupAsyncCopy"; - break; - } - case SpvOpGroupWaitEvents: { - stream << "OpGroupWaitEvents"; - break; - } - case SpvOpGroupAll: { - stream << "OpGroupAll"; - break; - } - case SpvOpGroupAny: { - stream << "OpGroupAny"; - break; - } - case SpvOpGroupBroadcast: { - stream << "OpGroupBroadcast"; - break; - } - case SpvOpGroupIAdd: { - stream << "OpGroupIAdd"; - break; - } - case SpvOpGroupFAdd: { - stream << "OpGroupFAdd"; - break; - } - case SpvOpGroupFMin: { - stream << "OpGroupFMin"; - break; - } - case SpvOpGroupUMin: { - stream << "OpGroupUMin"; - break; - } - case SpvOpGroupSMin: { - stream << "OpGroupSMin"; - break; - } - case SpvOpGroupFMax: { - stream << "OpGroupFMax"; - break; - } - case SpvOpGroupUMax: { - stream << "OpGroupUMax"; - break; - } - case SpvOpGroupSMax: { - stream << "OpGroupSMax"; - break; - } - case SpvOpReadPipe: { - stream << "OpReadPipe"; - break; - } - case SpvOpWritePipe: { - stream << "OpWritePipe"; - break; - } - case SpvOpReservedReadPipe: { - stream << "OpReservedReadPipe"; - break; - } - case SpvOpReservedWritePipe: { - stream << "OpReservedWritePipe"; - break; - } - case SpvOpReserveReadPipePackets: { - stream << "OpReserveReadPipePackets"; - break; - } - case SpvOpReserveWritePipePackets: { - stream << "OpReserveWritePipePackets"; - break; - } - case SpvOpCommitReadPipe: { - stream << "OpCommitReadPipe"; - break; - } - case SpvOpCommitWritePipe: { - stream << "OpCommitWritePipe"; - break; - } - case SpvOpIsValidReserveId: { - stream << "OpIsValidReserveId"; - break; - } - case SpvOpGetNumPipePackets: { - stream << "OpGetNumPipePackets"; - break; - } - case SpvOpGetMaxPipePackets: { - stream << "OpGetMaxPipePackets"; - break; - } - case SpvOpGroupReserveReadPipePackets: { - stream << "OpGroupReserveReadPipePackets"; - break; - } - case SpvOpGroupReserveWritePipePackets: { - stream << "OpGroupReserveWritePipePackets"; - break; - } - case SpvOpGroupCommitReadPipe: { - stream << "OpGroupCommitReadPipe"; - break; - } - case SpvOpGroupCommitWritePipe: { - stream << "OpGroupCommitWritePipe"; - break; - } - case SpvOpEnqueueMarker: { - stream << "OpEnqueueMarker"; - break; - } - 
case SpvOpEnqueueKernel: { - stream << "OpEnqueueKernel"; - break; - } - case SpvOpGetKernelNDrangeSubGroupCount: { - stream << "OpGetKernelNDrangeSubGroupCount"; - break; - } - case SpvOpGetKernelNDrangeMaxSubGroupSize: { - stream << "OpGetKernelNDrangeMaxSubGroupSize"; - break; - } - case SpvOpGetKernelWorkGroupSize: { - stream << "OpGetKernelWorkGroupSize"; - break; - } - case SpvOpGetKernelPreferredWorkGroupSizeMultiple: { - stream << "OpGetKernelPreferredWorkGroupSizeMultiple"; - break; - } - case SpvOpRetainEvent: { - stream << "OpRetainEvent"; - break; - } - case SpvOpReleaseEvent: { - stream << "OpReleaseEvent"; - break; - } - case SpvOpCreateUserEvent: { - stream << "OpCreateUserEvent"; - break; - } - case SpvOpIsValidEvent: { - stream << "OpIsValidEvent"; - break; - } - case SpvOpSetUserEventStatus: { - stream << "OpSetUserEventStatus"; - break; - } - case SpvOpCaptureEventProfilingInfo: { - stream << "OpCaptureEventProfilingInfo"; - break; - } - case SpvOpGetDefaultQueue: { - stream << "OpGetDefaultQueue"; - break; - } - case SpvOpBuildNDRange: { - stream << "OpBuildNDRange"; - break; - } - case SpvOpImageSparseSampleImplicitLod: { - stream << "OpImageSparseSampleImplicitLod"; - break; - } - case SpvOpImageSparseSampleExplicitLod: { - stream << "OpImageSparseSampleExplicitLod"; - break; - } - case SpvOpImageSparseSampleDrefImplicitLod: { - stream << "OpImageSparseSampleDrefImplicitLod"; - break; - } - case SpvOpImageSparseSampleDrefExplicitLod: { - stream << "OpImageSparseSampleDrefExplicitLod"; - break; - } - case SpvOpImageSparseSampleProjImplicitLod: { - stream << "OpImageSparseSampleProjImplicitLod"; - break; - } - case SpvOpImageSparseSampleProjExplicitLod: { - stream << "OpImageSparseSampleProjExplicitLod"; - break; - } - case SpvOpImageSparseSampleProjDrefImplicitLod: { - stream << "OpImageSparseSampleProjDrefImplicitLod"; - break; - } - case SpvOpImageSparseSampleProjDrefExplicitLod: { - stream << "OpImageSparseSampleProjDrefExplicitLod"; - break; - } - case SpvOpImageSparseFetch: { - stream << "OpImageSparseFetch"; - break; - } - case SpvOpImageSparseGather: { - stream << "OpImageSparseGather"; - break; - } - case SpvOpImageSparseDrefGather: { - stream << "OpImageSparseDrefGather"; - break; - } - case SpvOpImageSparseTexelsResident: { - stream << "OpImageSparseTexelsResident"; - break; - } - case SpvOpNoLine: { - stream << "OpNoLine"; - break; - } - case SpvOpAtomicFlagTestAndSet: { - stream << "OpAtomicFlagTestAndSet"; - break; - } - case SpvOpAtomicFlagClear: { - stream << "OpAtomicFlagClear"; - break; - } - case SpvOpImageSparseRead: { - stream << "OpImageSparseRead"; - break; - } - case SpvOpDecorateId: { - stream << "OpDecorateId"; - break; - } - case SpvOpSubgroupBallotKHR: { - stream << "OpSubgroupBallotKHR"; - break; - } - case SpvOpSubgroupFirstInvocationKHR: { - stream << "OpSubgroupFirstInvocationKHR"; - break; - } - case SpvOpSubgroupAllKHR: { - stream << "OpSubgroupAllKHR"; - break; - } - case SpvOpSubgroupAnyKHR: { - stream << "OpSubgroupAnyKHR"; - break; - } - case SpvOpSubgroupAllEqualKHR: { - stream << "OpSubgroupAllEqualKHR"; - break; - } - case SpvOpSubgroupReadInvocationKHR: { - stream << "OpSubgroupReadInvocationKHR"; - break; - } - case SpvOpGroupIAddNonUniformAMD: { - stream << "OpGroupIAddNonUniformAMD"; - break; - } - case SpvOpGroupFAddNonUniformAMD: { - stream << "OpGroupFAddNonUniformAMD"; - break; - } - case SpvOpGroupFMinNonUniformAMD: { - stream << "OpGroupFMinNonUniformAMD"; - break; - } - case SpvOpGroupUMinNonUniformAMD: { - stream << 
"OpGroupUMinNonUniformAMD"; - break; - } - case SpvOpGroupSMinNonUniformAMD: { - stream << "OpGroupSMinNonUniformAMD"; - break; - } - case SpvOpGroupFMaxNonUniformAMD: { - stream << "OpGroupFMaxNonUniformAMD"; - break; - } - case SpvOpGroupUMaxNonUniformAMD: { - stream << "OpGroupUMaxNonUniformAMD"; - break; - } - case SpvOpGroupSMaxNonUniformAMD: { - stream << "OpGroupSMaxNonUniformAMD"; - break; - } - case SpvOpFragmentMaskFetchAMD: { - stream << "OpFragmentMaskFetchAMD"; - break; - } - case SpvOpFragmentFetchAMD: { - stream << "OpFragmentFetchAMD"; - break; - } - case SpvOpSubgroupShuffleINTEL: { - stream << "OpSubgroupShuffleINTEL"; - break; - } - case SpvOpSubgroupShuffleDownINTEL: { - stream << "OpSubgroupShuffleDownINTEL"; - break; - } - case SpvOpSubgroupShuffleUpINTEL: { - stream << "OpSubgroupShuffleUpINTEL"; - break; - } - case SpvOpSubgroupShuffleXorINTEL: { - stream << "OpSubgroupShuffleXorINTEL"; - break; - } - case SpvOpSubgroupBlockReadINTEL: { - stream << "OpSubgroupBlockReadINTEL"; - break; - } - case SpvOpSubgroupBlockWriteINTEL: { - stream << "OpSubgroupBlockWriteINTEL"; - break; - } - case SpvOpSubgroupImageBlockReadINTEL: { - stream << "OpSubgroupImageBlockReadINTEL"; - break; - } - case SpvOpSubgroupImageBlockWriteINTEL: { - stream << "OpSubgroupImageBlockWriteINTEL"; - break; - } - case SpvOpDecorateStringGOOGLE: { - stream << "OpDecorateStringGOOGLE"; - break; - } - case SpvOpMemberDecorateStringGOOGLE: { - stream << "OpMemberDecorateStringGOOGLE"; - break; - } - case SpvOpMax: - default: { - stream << "*INVALID*"; - break; - } - }; - if (inst.has_type()) { - stream << std::string(" %") << std::to_string(inst.type_id()); + switch (inst.op_code()) { + case SpvOpNop: { + stream << "OpNop"; + break; + } + case SpvOpUndef: { + stream << "OpUndef"; + break; + } + case SpvOpSourceContinued: { + stream << "OpSourceContinued"; + break; + } + case SpvOpSource: { + stream << "OpSource"; + break; + } + case SpvOpSourceExtension: { + stream << "OpSourceExtension"; + break; + } + case SpvOpName: { + stream << "OpName"; + break; + } + case SpvOpMemberName: { + stream << "OpMemberName"; + break; + } + case SpvOpString: { + stream << "OpString"; + break; + } + case SpvOpLine: { + stream << "OpLine"; + break; + } + case SpvOpExtension: { + stream << "OpExtension"; + break; + } + case SpvOpExtInstImport: { + stream << "OpExtInstImport"; + break; + } + case SpvOpExtInst: { + stream << "OpExtInst"; + break; + } + case SpvOpMemoryModel: { + stream << "OpMemoryModel"; + break; + } + case SpvOpEntryPoint: { + stream << "OpEntryPoint"; + break; + } + case SpvOpExecutionMode: { + stream << "OpExecutionMode"; + break; + } + case SpvOpCapability: { + stream << "OpCapability"; + break; + } + case SpvOpTypeVoid: { + stream << "OpTypeVoid"; + break; + } + case SpvOpTypeBool: { + stream << "OpTypeBool"; + break; + } + case SpvOpTypeInt: { + stream << "OpTypeInt"; + break; + } + case SpvOpTypeFloat: { + stream << "OpTypeFloat"; + break; + } + case SpvOpTypeVector: { + stream << "OpTypeVector"; + break; + } + case SpvOpTypeMatrix: { + stream << "OpTypeMatrix"; + break; + } + case SpvOpTypeImage: { + stream << "OpTypeImage"; + break; + } + case SpvOpTypeSampler: { + stream << "OpTypeSampler"; + break; + } + case SpvOpTypeSampledImage: { + stream << "OpTypeSampledImage"; + break; + } + case SpvOpTypeArray: { + stream << "OpTypeArray"; + break; + } + case SpvOpTypeRuntimeArray: { + stream << "OpTypeRuntimeArray"; + break; + } + case SpvOpTypeStruct: { + stream << "OpTypeStruct"; + break; + } + case 
SpvOpTypeOpaque: { + stream << "OpTypeOpaque"; + break; + } + case SpvOpTypePointer: { + stream << "OpTypePointer"; + break; + } + case SpvOpTypeFunction: { + stream << "OpTypeFunction"; + break; + } + case SpvOpTypeEvent: { + stream << "OpTypeEvent"; + break; + } + case SpvOpTypeDeviceEvent: { + stream << "OpTypeDeviceEvent"; + break; + } + case SpvOpTypeReserveId: { + stream << "OpTypeReserveId"; + break; + } + case SpvOpTypeQueue: { + stream << "OpTypeQueue"; + break; + } + case SpvOpTypePipe: { + stream << "OpTypePipe"; + break; + } + case SpvOpTypeForwardPointer: { + stream << "OpTypeForwardPointer"; + break; + } + case SpvOpConstantTrue: { + stream << "OpConstantTrue"; + break; + } + case SpvOpConstantFalse: { + stream << "OpConstantFalse"; + break; + } + case SpvOpConstant: { + stream << "OpConstant"; + break; + } + case SpvOpConstantComposite: { + stream << "OpConstantComposite"; + break; + } + case SpvOpConstantSampler: { + stream << "OpConstantSampler"; + break; + } + case SpvOpConstantNull: { + stream << "OpConstantNull"; + break; + } + case SpvOpSpecConstantTrue: { + stream << "OpSpecConstantTrue"; + break; + } + case SpvOpSpecConstantFalse: { + stream << "OpSpecConstantFalse"; + break; + } + case SpvOpSpecConstant: { + stream << "OpSpecConstant"; + break; + } + case SpvOpSpecConstantComposite: { + stream << "OpSpecConstantComposite"; + break; + } + case SpvOpSpecConstantOp: { + stream << "OpSpecConstantOp"; + break; + } + case SpvOpFunction: { + stream << "OpFunction"; + break; + } + case SpvOpFunctionParameter: { + stream << "OpFunctionParameter"; + break; + } + case SpvOpFunctionEnd: { + stream << "OpFunctionEnd"; + break; + } + case SpvOpFunctionCall: { + stream << "OpFunctionCall"; + break; + } + case SpvOpVariable: { + stream << "OpVariable"; + break; + } + case SpvOpImageTexelPointer: { + stream << "OpImageTexelPointer"; + break; + } + case SpvOpLoad: { + stream << "OpLoad"; + break; + } + case SpvOpStore: { + stream << "OpStore"; + break; + } + case SpvOpCopyMemory: { + stream << "OpCopyMemory"; + break; + } + case SpvOpCopyMemorySized: { + stream << "OpCopyMemorySized"; + break; + } + case SpvOpAccessChain: { + stream << "OpAccessChain"; + break; + } + case SpvOpInBoundsAccessChain: { + stream << "OpInBoundsAccessChain"; + break; + } + case SpvOpPtrAccessChain: { + stream << "OpPtrAccessChain"; + break; + } + case SpvOpArrayLength: { + stream << "OpArrayLength"; + break; + } + case SpvOpGenericPtrMemSemantics: { + stream << "OpGenericPtrMemSemantics"; + break; + } + case SpvOpInBoundsPtrAccessChain: { + stream << "OpInBoundsPtrAccessChain"; + break; + } + case SpvOpDecorate: { + stream << "OpDecorate"; + break; + } + case SpvOpMemberDecorate: { + stream << "OpMemberDecorate"; + break; + } + case SpvOpDecorationGroup: { + stream << "OpDecorationGroup"; + break; + } + case SpvOpGroupDecorate: { + stream << "OpGroupDecorate"; + break; + } + case SpvOpGroupMemberDecorate: { + stream << "OpGroupMemberDecorate"; + break; + } + case SpvOpVectorExtractDynamic: { + stream << "OpVectorExtractDynamic"; + break; + } + case SpvOpVectorInsertDynamic: { + stream << "OpVectorInsertDynamic"; + break; + } + case SpvOpVectorShuffle: { + stream << "OpVectorShuffle"; + break; + } + case SpvOpCompositeConstruct: { + stream << "OpCompositeConstruct"; + break; + } + case SpvOpCompositeExtract: { + stream << "OpCompositeExtract"; + break; + } + case SpvOpCompositeInsert: { + stream << "OpCompositeInsert"; + break; + } + case SpvOpCopyObject: { + stream << "OpCopyObject"; + break; + } + case 
SpvOpTranspose: { + stream << "OpTranspose"; + break; + } + case SpvOpSampledImage: { + stream << "OpSampledImage"; + break; + } + case SpvOpImageSampleImplicitLod: { + stream << "OpImageSampleImplicitLod"; + break; + } + case SpvOpImageSampleExplicitLod: { + stream << "OpImageSampleExplicitLod"; + break; + } + case SpvOpImageSampleDrefImplicitLod: { + stream << "OpImageSampleDrefImplicitLod"; + break; + } + case SpvOpImageSampleDrefExplicitLod: { + stream << "OpImageSampleDrefExplicitLod"; + break; + } + case SpvOpImageSampleProjImplicitLod: { + stream << "OpImageSampleProjImplicitLod"; + break; + } + case SpvOpImageSampleProjExplicitLod: { + stream << "OpImageSampleProjExplicitLod"; + break; + } + case SpvOpImageSampleProjDrefImplicitLod: { + stream << "OpImageSampleProjDrefImplicitLod"; + break; + } + case SpvOpImageSampleProjDrefExplicitLod: { + stream << "OpImageSampleProjDrefExplicitLod"; + break; + } + case SpvOpImageFetch: { + stream << "OpImageFetch"; + break; + } + case SpvOpImageGather: { + stream << "OpImageGather"; + break; + } + case SpvOpImageDrefGather: { + stream << "OpImageDrefGather"; + break; + } + case SpvOpImageRead: { + stream << "OpImageRead"; + break; + } + case SpvOpImageWrite: { + stream << "OpImageWrite"; + break; + } + case SpvOpImage: { + stream << "OpImage"; + break; + } + case SpvOpImageQueryFormat: { + stream << "OpImageQueryFormat"; + break; + } + case SpvOpImageQueryOrder: { + stream << "OpImageQueryOrder"; + break; + } + case SpvOpImageQuerySizeLod: { + stream << "OpImageQuerySizeLod"; + break; + } + case SpvOpImageQuerySize: { + stream << "OpImageQuerySize"; + break; + } + case SpvOpImageQueryLod: { + stream << "OpImageQueryLod"; + break; + } + case SpvOpImageQueryLevels: { + stream << "OpImageQueryLevels"; + break; + } + case SpvOpImageQuerySamples: { + stream << "OpImageQuerySamples"; + break; + } + case SpvOpConvertFToU: { + stream << "OpConvertFToU"; + break; + } + case SpvOpConvertFToS: { + stream << "OpConvertFToS"; + break; + } + case SpvOpConvertSToF: { + stream << "OpConvertSToF"; + break; + } + case SpvOpConvertUToF: { + stream << "OpConvertUToF"; + break; + } + case SpvOpUConvert: { + stream << "OpUConvert"; + break; + } + case SpvOpSConvert: { + stream << "OpSConvert"; + break; + } + case SpvOpFConvert: { + stream << "OpFConvert"; + break; + } + case SpvOpConvertPtrToU: { + stream << "OpConvertPtrToU"; + break; + } + case SpvOpSatConvertSToU: { + stream << "OpSatConvertSToU"; + break; + } + case SpvOpSatConvertUToS: { + stream << "OpSatConvertUToS"; + break; + } + case SpvOpConvertUToPtr: { + stream << "OpConvertUToPtr"; + break; + } + case SpvOpPtrCastToGeneric: { + stream << "OpPtrCastToGeneric"; + break; + } + case SpvOpGenericCastToPtr: { + stream << "OpGenericCastToPtr"; + break; + } + case SpvOpGenericCastToPtrExplicit: { + stream << "OpGenericCastToPtrExplicit"; + break; + } + case SpvOpBitcast: { + stream << "OpBitcast"; + break; + } + case SpvOpSNegate: { + stream << "OpSNegate"; + break; + } + case SpvOpFNegate: { + stream << "OpFNegate"; + break; + } + case SpvOpIAdd: { + stream << "OpIAdd"; + break; + } + case SpvOpFAdd: { + stream << "OpFAdd"; + break; + } + case SpvOpISub: { + stream << "OpISub"; + break; + } + case SpvOpFSub: { + stream << "OpFSub"; + break; + } + case SpvOpIMul: { + stream << "OpIMul"; + break; + } + case SpvOpFMul: { + stream << "OpFMul"; + break; + } + case SpvOpUDiv: { + stream << "OpUDiv"; + break; + } + case SpvOpSDiv: { + stream << "OpSDiv"; + break; + } + case SpvOpFDiv: { + stream << "OpFDiv"; + 
break; + } + case SpvOpUMod: { + stream << "OpUMod"; + break; + } + case SpvOpSRem: { + stream << "OpSRem"; + break; + } + case SpvOpSMod: { + stream << "OpSMod"; + break; + } + case SpvOpFRem: { + stream << "OpFRem"; + break; + } + case SpvOpFMod: { + stream << "OpFMod"; + break; + } + case SpvOpVectorTimesScalar: { + stream << "OpVectorTimesScalar"; + break; + } + case SpvOpMatrixTimesScalar: { + stream << "OpMatrixTimesScalar"; + break; + } + case SpvOpVectorTimesMatrix: { + stream << "OpVectorTimesMatrix"; + break; + } + case SpvOpMatrixTimesVector: { + stream << "OpMatrixTimesVector"; + break; + } + case SpvOpMatrixTimesMatrix: { + stream << "OpMatrixTimesMatrix"; + break; + } + case SpvOpOuterProduct: { + stream << "OpOuterProduct"; + break; + } + case SpvOpDot: { + stream << "OpDot"; + break; + } + case SpvOpIAddCarry: { + stream << "OpIAddCarry"; + break; + } + case SpvOpISubBorrow: { + stream << "OpISubBorrow"; + break; + } + case SpvOpUMulExtended: { + stream << "OpUMulExtended"; + break; + } + case SpvOpSMulExtended: { + stream << "OpSMulExtended"; + break; + } + case SpvOpAny: { + stream << "OpAny"; + break; + } + case SpvOpAll: { + stream << "OpAll"; + break; + } + case SpvOpIsNan: { + stream << "OpIsNan"; + break; + } + case SpvOpIsInf: { + stream << "OpIsInf"; + break; + } + case SpvOpIsFinite: { + stream << "OpIsFinite"; + break; + } + case SpvOpIsNormal: { + stream << "OpIsNormal"; + break; + } + case SpvOpSignBitSet: { + stream << "OpSignBitSet"; + break; + } + case SpvOpLessOrGreater: { + stream << "OpLessOrGreater"; + break; + } + case SpvOpOrdered: { + stream << "OpOrdered"; + break; + } + case SpvOpUnordered: { + stream << "OpUnordered"; + break; + } + case SpvOpLogicalEqual: { + stream << "OpLogicalEqual"; + break; + } + case SpvOpLogicalNotEqual: { + stream << "OpLogicalNotEqual"; + break; + } + case SpvOpLogicalOr: { + stream << "OpLogicalOr"; + break; + } + case SpvOpLogicalAnd: { + stream << "OpLogicalAnd"; + break; + } + case SpvOpLogicalNot: { + stream << "OpLogicalNot"; + break; + } + case SpvOpSelect: { + stream << "OpSelect"; + break; + } + case SpvOpIEqual: { + stream << "OpIEqual"; + break; + } + case SpvOpINotEqual: { + stream << "OpINotEqual"; + break; + } + case SpvOpUGreaterThan: { + stream << "OpUGreaterThan"; + break; + } + case SpvOpSGreaterThan: { + stream << "OpSGreaterThan"; + break; + } + case SpvOpUGreaterThanEqual: { + stream << "OpUGreaterThanEqual"; + break; + } + case SpvOpSGreaterThanEqual: { + stream << "OpSGreaterThanEqual"; + break; + } + case SpvOpULessThan: { + stream << "OpULessThan"; + break; + } + case SpvOpSLessThan: { + stream << "OpSLessThan"; + break; + } + case SpvOpULessThanEqual: { + stream << "OpULessThanEqual"; + break; + } + case SpvOpSLessThanEqual: { + stream << "OpSLessThanEqual"; + break; + } + case SpvOpFOrdEqual: { + stream << "OpFOrdEqual"; + break; + } + case SpvOpFUnordEqual: { + stream << "OpFUnordEqual"; + break; + } + case SpvOpFOrdNotEqual: { + stream << "OpFOrdNotEqual"; + break; + } + case SpvOpFUnordNotEqual: { + stream << "OpFUnordNotEqual"; + break; + } + case SpvOpFOrdLessThan: { + stream << "OpFOrdLessThan"; + break; + } + case SpvOpFUnordLessThan: { + stream << "OpFUnordLessThan"; + break; + } + case SpvOpFOrdGreaterThan: { + stream << "OpFOrdGreaterThan"; + break; + } + case SpvOpFUnordGreaterThan: { + stream << "OpFUnordGreaterThan"; + break; + } + case SpvOpFOrdLessThanEqual: { + stream << "OpFOrdLessThanEqual"; + break; + } + case SpvOpFUnordLessThanEqual: { + stream << "OpFUnordLessThanEqual"; + 
break; + } + case SpvOpFOrdGreaterThanEqual: { + stream << "OpFOrdGreaterThanEqual"; + break; + } + case SpvOpFUnordGreaterThanEqual: { + stream << "OpFUnordGreaterThanEqual"; + break; + } + case SpvOpShiftRightLogical: { + stream << "OpShiftRightLogical"; + break; + } + case SpvOpShiftRightArithmetic: { + stream << "OpShiftRightArithmetic"; + break; + } + case SpvOpShiftLeftLogical: { + stream << "OpShiftLeftLogical"; + break; + } + case SpvOpBitwiseOr: { + stream << "OpBitwiseOr"; + break; + } + case SpvOpBitwiseXor: { + stream << "OpBitwiseXor"; + break; + } + case SpvOpBitwiseAnd: { + stream << "OpBitwiseAnd"; + break; + } + case SpvOpNot: { + stream << "OpNot"; + break; + } + case SpvOpBitFieldInsert: { + stream << "OpBitFieldInsert"; + break; + } + case SpvOpBitFieldSExtract: { + stream << "OpBitFieldSExtract"; + break; + } + case SpvOpBitFieldUExtract: { + stream << "OpBitFieldUExtract"; + break; + } + case SpvOpBitReverse: { + stream << "OpBitReverse"; + break; + } + case SpvOpBitCount: { + stream << "OpBitCount"; + break; + } + case SpvOpDPdx: { + stream << "OpDPdx"; + break; + } + case SpvOpDPdy: { + stream << "OpDPdy"; + break; + } + case SpvOpFwidth: { + stream << "OpFwidth"; + break; + } + case SpvOpDPdxFine: { + stream << "OpDPdxFine"; + break; + } + case SpvOpDPdyFine: { + stream << "OpDPdyFine"; + break; + } + case SpvOpFwidthFine: { + stream << "OpFwidthFine"; + break; + } + case SpvOpDPdxCoarse: { + stream << "OpDPdxCoarse"; + break; + } + case SpvOpDPdyCoarse: { + stream << "OpDPdyCoarse"; + break; + } + case SpvOpFwidthCoarse: { + stream << "OpFwidthCoarse"; + break; + } + case SpvOpEmitVertex: { + stream << "OpEmitVertex"; + break; + } + case SpvOpEndPrimitive: { + stream << "OpEndPrimitive"; + break; + } + case SpvOpEmitStreamVertex: { + stream << "OpEmitStreamVertex"; + break; + } + case SpvOpEndStreamPrimitive: { + stream << "OpEndStreamPrimitive"; + break; + } + case SpvOpControlBarrier: { + stream << "OpControlBarrier"; + break; + } + case SpvOpMemoryBarrier: { + stream << "OpMemoryBarrier"; + break; + } + case SpvOpAtomicLoad: { + stream << "OpAtomicLoad"; + break; + } + case SpvOpAtomicStore: { + stream << "OpAtomicStore"; + break; + } + case SpvOpAtomicExchange: { + stream << "OpAtomicExchange"; + break; + } + case SpvOpAtomicCompareExchange: { + stream << "OpAtomicCompareExchange"; + break; + } + case SpvOpAtomicCompareExchangeWeak: { + stream << "OpAtomicCompareExchangeWeak"; + break; + } + case SpvOpAtomicIIncrement: { + stream << "OpAtomicIIncrement"; + break; + } + case SpvOpAtomicIDecrement: { + stream << "OpAtomicIDecrement"; + break; + } + case SpvOpAtomicIAdd: { + stream << "OpAtomicIAdd"; + break; + } + case SpvOpAtomicISub: { + stream << "OpAtomicISub"; + break; + } + case SpvOpAtomicSMin: { + stream << "OpAtomicSMin"; + break; + } + case SpvOpAtomicUMin: { + stream << "OpAtomicUMin"; + break; + } + case SpvOpAtomicSMax: { + stream << "OpAtomicSMax"; + break; + } + case SpvOpAtomicUMax: { + stream << "OpAtomicUMax"; + break; + } + case SpvOpAtomicAnd: { + stream << "OpAtomicAnd"; + break; + } + case SpvOpAtomicOr: { + stream << "OpAtomicOr"; + break; + } + case SpvOpAtomicXor: { + stream << "OpAtomicXor"; + break; + } + case SpvOpPhi: { + stream << "OpPhi"; + break; + } + case SpvOpLoopMerge: { + stream << "OpLoopMerge"; + break; + } + case SpvOpSelectionMerge: { + stream << "OpSelectionMerge"; + break; + } + case SpvOpLabel: { + stream << "OpLabel"; + break; + } + case SpvOpBranch: { + stream << "OpBranch"; + break; + } + case 
SpvOpBranchConditional: { + stream << "OpBranchConditional"; + break; + } + case SpvOpSwitch: { + stream << "OpSwitch"; + break; + } + case SpvOpKill: { + stream << "OpKill"; + break; + } + case SpvOpReturn: { + stream << "OpReturn"; + break; + } + case SpvOpReturnValue: { + stream << "OpReturnValue"; + break; + } + case SpvOpUnreachable: { + stream << "OpUnreachable"; + break; + } + case SpvOpLifetimeStart: { + stream << "OpLifetimeStart"; + break; + } + case SpvOpLifetimeStop: { + stream << "OpLifetimeStop"; + break; + } + case SpvOpGroupAsyncCopy: { + stream << "OpGroupAsyncCopy"; + break; + } + case SpvOpGroupWaitEvents: { + stream << "OpGroupWaitEvents"; + break; + } + case SpvOpGroupAll: { + stream << "OpGroupAll"; + break; + } + case SpvOpGroupAny: { + stream << "OpGroupAny"; + break; + } + case SpvOpGroupBroadcast: { + stream << "OpGroupBroadcast"; + break; + } + case SpvOpGroupIAdd: { + stream << "OpGroupIAdd"; + break; + } + case SpvOpGroupFAdd: { + stream << "OpGroupFAdd"; + break; + } + case SpvOpGroupFMin: { + stream << "OpGroupFMin"; + break; + } + case SpvOpGroupUMin: { + stream << "OpGroupUMin"; + break; + } + case SpvOpGroupSMin: { + stream << "OpGroupSMin"; + break; + } + case SpvOpGroupFMax: { + stream << "OpGroupFMax"; + break; + } + case SpvOpGroupUMax: { + stream << "OpGroupUMax"; + break; + } + case SpvOpGroupSMax: { + stream << "OpGroupSMax"; + break; + } + case SpvOpReadPipe: { + stream << "OpReadPipe"; + break; + } + case SpvOpWritePipe: { + stream << "OpWritePipe"; + break; + } + case SpvOpReservedReadPipe: { + stream << "OpReservedReadPipe"; + break; + } + case SpvOpReservedWritePipe: { + stream << "OpReservedWritePipe"; + break; + } + case SpvOpReserveReadPipePackets: { + stream << "OpReserveReadPipePackets"; + break; + } + case SpvOpReserveWritePipePackets: { + stream << "OpReserveWritePipePackets"; + break; + } + case SpvOpCommitReadPipe: { + stream << "OpCommitReadPipe"; + break; + } + case SpvOpCommitWritePipe: { + stream << "OpCommitWritePipe"; + break; + } + case SpvOpIsValidReserveId: { + stream << "OpIsValidReserveId"; + break; + } + case SpvOpGetNumPipePackets: { + stream << "OpGetNumPipePackets"; + break; + } + case SpvOpGetMaxPipePackets: { + stream << "OpGetMaxPipePackets"; + break; + } + case SpvOpGroupReserveReadPipePackets: { + stream << "OpGroupReserveReadPipePackets"; + break; + } + case SpvOpGroupReserveWritePipePackets: { + stream << "OpGroupReserveWritePipePackets"; + break; + } + case SpvOpGroupCommitReadPipe: { + stream << "OpGroupCommitReadPipe"; + break; + } + case SpvOpGroupCommitWritePipe: { + stream << "OpGroupCommitWritePipe"; + break; + } + case SpvOpEnqueueMarker: { + stream << "OpEnqueueMarker"; + break; + } + case SpvOpEnqueueKernel: { + stream << "OpEnqueueKernel"; + break; + } + case SpvOpGetKernelNDrangeSubGroupCount: { + stream << "OpGetKernelNDrangeSubGroupCount"; + break; + } + case SpvOpGetKernelNDrangeMaxSubGroupSize: { + stream << "OpGetKernelNDrangeMaxSubGroupSize"; + break; + } + case SpvOpGetKernelWorkGroupSize: { + stream << "OpGetKernelWorkGroupSize"; + break; + } + case SpvOpGetKernelPreferredWorkGroupSizeMultiple: { + stream << "OpGetKernelPreferredWorkGroupSizeMultiple"; + break; + } + case SpvOpRetainEvent: { + stream << "OpRetainEvent"; + break; + } + case SpvOpReleaseEvent: { + stream << "OpReleaseEvent"; + break; + } + case SpvOpCreateUserEvent: { + stream << "OpCreateUserEvent"; + break; + } + case SpvOpIsValidEvent: { + stream << "OpIsValidEvent"; + break; + } + case SpvOpSetUserEventStatus: { + stream << 
"OpSetUserEventStatus"; + break; + } + case SpvOpCaptureEventProfilingInfo: { + stream << "OpCaptureEventProfilingInfo"; + break; + } + case SpvOpGetDefaultQueue: { + stream << "OpGetDefaultQueue"; + break; + } + case SpvOpBuildNDRange: { + stream << "OpBuildNDRange"; + break; + } + case SpvOpImageSparseSampleImplicitLod: { + stream << "OpImageSparseSampleImplicitLod"; + break; + } + case SpvOpImageSparseSampleExplicitLod: { + stream << "OpImageSparseSampleExplicitLod"; + break; + } + case SpvOpImageSparseSampleDrefImplicitLod: { + stream << "OpImageSparseSampleDrefImplicitLod"; + break; + } + case SpvOpImageSparseSampleDrefExplicitLod: { + stream << "OpImageSparseSampleDrefExplicitLod"; + break; + } + case SpvOpImageSparseSampleProjImplicitLod: { + stream << "OpImageSparseSampleProjImplicitLod"; + break; + } + case SpvOpImageSparseSampleProjExplicitLod: { + stream << "OpImageSparseSampleProjExplicitLod"; + break; + } + case SpvOpImageSparseSampleProjDrefImplicitLod: { + stream << "OpImageSparseSampleProjDrefImplicitLod"; + break; + } + case SpvOpImageSparseSampleProjDrefExplicitLod: { + stream << "OpImageSparseSampleProjDrefExplicitLod"; + break; + } + case SpvOpImageSparseFetch: { + stream << "OpImageSparseFetch"; + break; + } + case SpvOpImageSparseGather: { + stream << "OpImageSparseGather"; + break; + } + case SpvOpImageSparseDrefGather: { + stream << "OpImageSparseDrefGather"; + break; + } + case SpvOpImageSparseTexelsResident: { + stream << "OpImageSparseTexelsResident"; + break; + } + case SpvOpNoLine: { + stream << "OpNoLine"; + break; + } + case SpvOpAtomicFlagTestAndSet: { + stream << "OpAtomicFlagTestAndSet"; + break; + } + case SpvOpAtomicFlagClear: { + stream << "OpAtomicFlagClear"; + break; + } + case SpvOpImageSparseRead: { + stream << "OpImageSparseRead"; + break; + } + case SpvOpDecorateId: { + stream << "OpDecorateId"; + break; + } + case SpvOpSubgroupBallotKHR: { + stream << "OpSubgroupBallotKHR"; + break; + } + case SpvOpSubgroupFirstInvocationKHR: { + stream << "OpSubgroupFirstInvocationKHR"; + break; + } + case SpvOpSubgroupAllKHR: { + stream << "OpSubgroupAllKHR"; + break; + } + case SpvOpSubgroupAnyKHR: { + stream << "OpSubgroupAnyKHR"; + break; + } + case SpvOpSubgroupAllEqualKHR: { + stream << "OpSubgroupAllEqualKHR"; + break; + } + case SpvOpSubgroupReadInvocationKHR: { + stream << "OpSubgroupReadInvocationKHR"; + break; + } + case SpvOpGroupIAddNonUniformAMD: { + stream << "OpGroupIAddNonUniformAMD"; + break; + } + case SpvOpGroupFAddNonUniformAMD: { + stream << "OpGroupFAddNonUniformAMD"; + break; + } + case SpvOpGroupFMinNonUniformAMD: { + stream << "OpGroupFMinNonUniformAMD"; + break; + } + case SpvOpGroupUMinNonUniformAMD: { + stream << "OpGroupUMinNonUniformAMD"; + break; + } + case SpvOpGroupSMinNonUniformAMD: { + stream << "OpGroupSMinNonUniformAMD"; + break; + } + case SpvOpGroupFMaxNonUniformAMD: { + stream << "OpGroupFMaxNonUniformAMD"; + break; + } + case SpvOpGroupUMaxNonUniformAMD: { + stream << "OpGroupUMaxNonUniformAMD"; + break; + } + case SpvOpGroupSMaxNonUniformAMD: { + stream << "OpGroupSMaxNonUniformAMD"; + break; + } + case SpvOpFragmentMaskFetchAMD: { + stream << "OpFragmentMaskFetchAMD"; + break; + } + case SpvOpFragmentFetchAMD: { + stream << "OpFragmentFetchAMD"; + break; + } + case SpvOpSubgroupShuffleINTEL: { + stream << "OpSubgroupShuffleINTEL"; + break; + } + case SpvOpSubgroupShuffleDownINTEL: { + stream << "OpSubgroupShuffleDownINTEL"; + break; + } + case SpvOpSubgroupShuffleUpINTEL: { + stream << "OpSubgroupShuffleUpINTEL"; + 
break; + } + case SpvOpSubgroupShuffleXorINTEL: { + stream << "OpSubgroupShuffleXorINTEL"; + break; + } + case SpvOpSubgroupBlockReadINTEL: { + stream << "OpSubgroupBlockReadINTEL"; + break; + } + case SpvOpSubgroupBlockWriteINTEL: { + stream << "OpSubgroupBlockWriteINTEL"; + break; + } + case SpvOpSubgroupImageBlockReadINTEL: { + stream << "OpSubgroupImageBlockReadINTEL"; + break; + } + case SpvOpSubgroupImageBlockWriteINTEL: { + stream << "OpSubgroupImageBlockWriteINTEL"; + break; + } + case SpvOpDecorateStringGOOGLE: { + stream << "OpDecorateStringGOOGLE"; + break; + } + case SpvOpMemberDecorateStringGOOGLE: { + stream << "OpMemberDecorateStringGOOGLE"; + break; + } + case SpvOpMax: + default: { + stream << "*INVALID*"; + break; + } + }; + + if (inst.has_type()) { + stream << std::string(" %") << std::to_string(inst.type_id()); + } + + for (uint32_t i = 0; i < inst.length(); i++) { + if (inst.is_immediate(i)) { + if (inst.value_type(i) == SpvStringData) { + const char *str = (const char *)inst.data(i); + stream << std::string(" \"") << str << "\""; + break; + } else if (inst.value_type(i) == SpvIntegerData) { + const int *data = (const int *)inst.data(i); + stream << std::string(" ") << std::to_string(*data); + break; + } else if (inst.value_type(i) == SpvFloatData) { + const float *data = (const float *)inst.data(i); + stream << std::string(" ") << std::to_string(*data); + break; + } else if (inst.value_type(i) == SpvBitMaskLiteral) { + stream << std::string(" ") << std::hex << std::showbase << std::uppercase << inst.operand(i); + } else { + stream << std::string(" ") << std::to_string(inst.operand(i)); + } + } else { + stream << std::string(" %") << std::to_string(inst.operand(i)); + } } - - for(uint32_t i = 0; i < inst.length(); i++) { - if(inst.is_immediate(i)) { - if(inst.value_type(i) == SpvStringData) { - const char* str = (const char*)inst.data(i); - stream << std::string(" \"") << str << "\""; - break; - } else if(inst.value_type(i) == SpvIntegerData) { - const int* data = (const int*)inst.data(i); - stream << std::string(" ") << std::to_string(*data); - break; - } else if(inst.value_type(i) == SpvFloatData) { - const float* data = (const float*)inst.data(i); - stream << std::string(" ") << std::to_string(*data); - break; - } else if(inst.value_type(i) == SpvBitMaskLiteral) { - stream << std::string(" ") << std::hex << std::showbase << std::uppercase << inst.operand(i); - } else { - stream << std::string(" ") << std::to_string(inst.operand(i)); - } - } else { - stream << std::string(" %") << std::to_string(inst.operand(i)); - } - } stream << "\n"; return stream; diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 1bf876c47a46..61e13e2fcc47 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -152,9 +152,9 @@ class SpvInstruction { SpvId type_id() const; SpvOp op_code() const; SpvId operand(uint32_t index) const; - const void* data(uint32_t index=0) const; + const void *data(uint32_t index = 0) const; SpvValueType value_type(uint32_t index) const; - const Operands& operands() const; + const Operands &operands() const; bool has_type() const; bool has_result() const; @@ -229,11 +229,11 @@ class SpvFunction { void set_parameter_precision(uint32_t index, SpvPrecision precision); bool is_defined() const; - const Blocks& blocks() const; + const Blocks &blocks() const; SpvBlock entry_block() const; SpvBlock tail_block() const; SpvPrecision return_precision() const; - const Parameters& parameters() const; + const Parameters ¶meters() const; SpvPrecision parameter_precision(uint32_t index) 
const; uint32_t parameter_count() const; uint32_t control_mask() const; @@ -551,7 +551,7 @@ struct SpvFactory { static SpvInstruction runtime_array_type(SpvId result_type_id, SpvId base_type_id); static SpvInstruction pointer_type(SpvId pointer_type_id, SpvStorageClass storage_class, SpvId base_type_id); static SpvInstruction function_type(SpvId function_type_id, SpvId return_type_id, const ParamTypes &param_type_ids); - static SpvInstruction constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data, SpvValueType value_type); + static SpvInstruction constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data, SpvValueType value_type); static SpvInstruction null_constant(SpvId result_id, SpvId type_id); static SpvInstruction bool_constant(SpvId result_id, SpvId type_id, bool value); static SpvInstruction string_constant(SpvId result_id, const std::string &value); From f6740c9711c84e8026780180e2ea4e32a4e2f8c8 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 13 Oct 2022 12:52:20 -0700 Subject: [PATCH 047/166] Add doc-strings to SPIR-V interface --- src/SpirvIR.h | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 61e13e2fcc47..c52f8cb4e50b 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -353,12 +353,25 @@ class SpvBuilder { SpvBuilder(const SpvBuilder &) = delete; SpvBuilder &operator=(const SpvBuilder &) = delete; + // Reserve a unique ID to use for identifying a specific kind of SPIR-V result SpvId reserve_id(SpvKind = SpvResultId); + // Look up the specific kind of SPIR-V item from its unique ID SpvKind kind_of(SpvId id) const; + + // Get a human readable name for a specific kind of SPIR-V item std::string kind_name(SpvKind kind) const; + + // Look up the ID associated with the type for a given variable ID SpvId type_of(SpvId variable_id) const; + // Top-Level declaration methods ... each of these is a convenience + // function that checks to see if the requested thing has already been + // declared, in which case it returns its existing id, otherwise it + // adds a new declaration, and returns the new id. This avoids all + // the logic checks in the calling code, and also ensures that + // duplicates aren't created. + SpvId declare_void_type(); SpvId declare_type(const Type &type, uint32_t array_size = 1); SpvId declare_pointer_type(const Type &type, SpvStorageClass storage_class); @@ -376,7 +389,10 @@ class SpvBuilder { SpvId declare_struct(const std::string &name, const StructMemberTypes &member_types); SpvId declare_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId initializer_id = SpvInvalidId); SpvId declare_global_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId initializer_id = SpvInvalidId); + SpvId declare_symbol(const std::string &symbol, SpvId id, SpvId scope_id); + // Top level creation methods for adding new items ... 
these have a limited + number of checks and the caller must ensure that duplicates aren't created SpvId add_type(const Type &type, uint32_t array_size = 1); SpvId add_struct(const std::string &name, const StructMemberTypes &member_types); SpvId add_runtime_array(SpvId base_type_id); @@ -389,43 +405,84 @@ class SpvBuilder { void add_annotation(SpvId target_id, SpvDecoration decoration_type, const Literals &literals = {}); void add_struct_annotation(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const Literals &literals = {}); + void add_symbol(const std::string &symbol, SpvId id, SpvId scope_id); void add_entry_point(SpvId func_id, SpvExecutionModel exec_model, const Variables &variables = {}); void add_execution_mode_local_size(SpvId entry_point_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z); + // Assigns a specific source language hint to the module void set_source_language(SpvSourceLanguage val); + + // Sets the addressing model to use for the module void set_addressing_model(SpvAddressingModel val); + + // Sets the memory model to use for the module void set_memory_model(SpvMemoryModel val); + // Returns the source language hint for the module SpvSourceLanguage source_language() const; + + // Returns the addressing model used for the module SpvAddressingModel addressing_model() const; + + // Returns the memory model used for the module SpvMemoryModel memory_model() const; + // Import the GLSL.std.450 external instruction set. Returns its corresponding ID. SpvId import_glsl_intrinsics(); + + // Import an external instruction set by name. Returns its corresponding ID. SpvId import_instruction_set(const std::string &instruction_set); + // Add an extension string to the list of required extensions for the module void require_extension(const std::string &extension); + + // Add a specific capability to the list of requirements for the module void require_capability(SpvCapability); + // Returns true if the given instruction set has been imported bool is_imported(const std::string &instruction_set) const; + + // Returns true if the given extension string is required by the module bool is_extension_required(const std::string &extension) const; + + // Returns true if the given capability is required by the module bool is_capability_required(SpvCapability) const; + // Change the current build location to the given block. All local + // declarations and instructions will be added here. 
void enter_block(const SpvBlock &block); + + // Create a new block with the given ID SpvBlock create_block(SpvId block_id); + + // Returns the current block (the active scope for building) SpvBlock current_block() const; + + // Resets the block build scope, and unassigns the current block SpvBlock leave_block(); + // Change the current build scope to be within the given function void enter_function(const SpvFunction &func); + + // Returns the function object for the given ID (or an invalid function if none is found) SpvFunction lookup_function(SpvId func_id) const; + + // Returns the current function being used as the active build scope SpvFunction current_function() const; + + // Resets the function build scope, and unassigns the current function SpvFunction leave_function(); + // Returns the current id being used for building (ie the last item created) SpvId current_id() const; + + // Updates the current id being used for building void update_id(SpvId id); + // Returns true if the given id is of the corresponding type bool is_pointer_type(SpvId id) const; bool is_struct_type(SpvId id) const; bool is_vector_type(SpvId id) const; @@ -433,21 +490,37 @@ class SpvBuilder { bool is_array_type(SpvId id) const; bool is_constant(SpvId id) const; + // Looks up the given pointer type id and returns a corresponding base type id (or an invalid id if none is found) SpvId lookup_base_type(SpvId pointer_type) const; + + // Returns the storage class for the given variable id (or invalid if none is found) SpvStorageClass lookup_storage_class(SpvId id) const; + + // Returns the item id for the given symbol name (or an invalid id if none is found) SpvId lookup_id(const std::string &symbol) const; + + // Returns the build scope id for the item id (or an invalid id if none is found) SpvId lookup_scope(SpvId id) const; + + // Returns the id for the imported instruction set (or an invalid id if none is found) SpvId lookup_import(const std::string &instruction_set) const; + // Returns the symbol string for the given id (or an empty string if none is found) std::string lookup_symbol(SpvId id) const; - SpvId declare_symbol(const std::string &symbol, SpvId id, SpvId scope_id); - void add_symbol(const std::string &symbol, SpvId id, SpvId scope_id); + // Returns the current module being used for building SpvModule current_module() const; + // Appends the given instruction to the current build location void append(SpvInstruction inst); + + // Finalizes the module and prepares it for encoding (must be called before module can be used) void finalize(); + + // Encodes the current module to the given binary void encode(SpvBinary &binary) const; + + // Resets the builder and all internal state void reset(); protected: @@ -469,6 +542,8 @@ class SpvBuilder { using FunctionTypeMap = std::unordered_map; using FunctionMap = std::unordered_map; + // Internal methods for creating ids, keys, and look ups + SpvId make_id(SpvKind kind); TypeKey make_type_key(const Type &type, uint32_t array_size = 1) const; From c0d94bb751dcb3cf42d9924cabf9f86713c1d182 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 13 Oct 2022 16:53:30 -0700 Subject: [PATCH 048/166] Adjust runtime array to widest vector width based on alignment and dense vector loads/stores Fix scalar and vector load/stores Fix casts for vectors Add missing nan, inf, neg_inf, is_finite builtins --- src/CodeGen_Vulkan_Dev.cpp | 486 +++++++++++++++++++------------------ 1 file changed, 248 insertions(+), 238 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp 
b/src/CodeGen_Vulkan_Dev.cpp index 1603a6443c5b..c0f881cfda2e 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -129,6 +129,10 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void store_at_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id); void store_at_vector_index(const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id); + SpvFactory::Components split_vector(Type type, SpvId value_id ); + SpvId join_vector(Type type, const SpvFactory::Components& value_components ); + SpvId cast_scalar(const Cast* op, Type target_type, Type value_type, SpvId value_id); + using BuiltinMap = std::unordered_map; const BuiltinMap spirv_builtin = { {"is_nan_f16", SpvOpIsNan}, @@ -199,7 +203,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void reset_workgroup_size(); void declare_workgroup_size(SpvId kernel_func_id); void declare_entry_point(const Stmt &s, SpvId kernel_func_id); - void declare_device_args(uint32_t entry_point_index, const std::string &kernel_name, const std::vector &args); + void declare_device_args(const Stmt &s, uint32_t entry_point_index, const std::string &kernel_name, const std::vector &args); // The scope contains both the symbol id and its storage class using SymbolIdStorageClassPair = std::pair; @@ -207,8 +211,12 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { using ScopedSymbolBinding = ScopedBinding; SymbolScope symbol_table; - // Keep track of the descriptor sets so we can add a sidecar to module - // indicating which descriptor set to use for each entry point + // Map from a variable ID for a buffer to its corresponding runtime array type + using RuntimeArrayMap = std::unordered_map; + RuntimeArrayMap runtime_array_map; + + // Keep track of the descriptor sets so we can add a sidecar to the + // module indicating which descriptor set to use for each entry point struct DescriptorSet { std::string entry_point_name; uint32_t uniform_buffer_count = 0; @@ -233,6 +241,99 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { std::string current_kernel_name; }; +// Check if all loads and stores to the member 'buffer' are dense, aligned, and +// have the same number of lanes. If this is indeed the case then the 'lanes' +// member stores the number of lanes in those loads and stores. +// +// FIXME: Refactor this and the version in CodeGen_OpenGLCompute_Dev to a common place! +// +class CheckAlignedDenseVectorLoadStore : public IRVisitor { +public: + // True if all loads and stores from the buffer are dense, aligned, and all + // have the same number of lanes, false otherwise. + bool are_all_dense = true; + + // The number of lanes in the loads and stores. If the number of lanes is + // variable, then are_all_dense is set to false regardless, and this value + // is undefined. Initially set to -1 before any dense operation is + // discovered. + int lanes = -1; + + CheckAlignedDenseVectorLoadStore(std::string name) + : buffer_name(std::move(name)) { + } + +private: + // The name of the buffer to check. 
+ std::string buffer_name; + + using IRVisitor::visit; + + void visit(const Load *op) override { + IRVisitor::visit(op); + + if (op->name != buffer_name) { + return; + } + + if (op->type.is_scalar()) { + are_all_dense = false; + return; + } + + Expr ramp_base = strided_ramp_base(op->index); + if (!ramp_base.defined()) { + are_all_dense = false; + return; + } + + if ((op->alignment.modulus % op->type.lanes() != 0) || + (op->alignment.remainder % op->type.lanes() != 0)) { + are_all_dense = false; + return; + } + + if (lanes != -1 && op->type.lanes() != lanes) { + are_all_dense = false; + return; + } + + lanes = op->type.lanes(); + } + + void visit(const Store *op) override { + IRVisitor::visit(op); + + if (op->name != buffer_name) { + return; + } + + if (op->value.type().is_scalar()) { + are_all_dense = false; + return; + } + + Expr ramp_base = strided_ramp_base(op->index); + if (!ramp_base.defined()) { + are_all_dense = false; + return; + } + + if ((op->alignment.modulus % op->value.type().lanes() != 0) || + (op->alignment.remainder % op->value.type().lanes() != 0)) { + are_all_dense = false; + return; + } + + if (lanes != -1 && op->value.type().lanes() != lanes) { + are_all_dense = false; + return; + } + + lanes = op->value.type().lanes(); + } +}; + void CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(const Expr &e) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(): " << (Expr)e << "\n"; internal_assert(e.type().is_vector()) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize must be called with an expression of vector type.\n"; @@ -329,15 +430,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { } } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): " << op->value.type() << " to " << op->type << "\n"; +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(const Cast* op, Type target_type, Type value_type, SpvId value_id) { - Type value_type = op->value.type(); - Type target_type = op->type; SpvId target_type_id = builder.declare_type(target_type); - op->value.accept(this); - SpvId src_id = builder.current_id(); - SpvOp op_code = SpvOpNop; if (value_type.is_float()) { if (target_type.is_float()) { @@ -347,7 +442,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { } else if (target_type.is_int()) { op_code = SpvOpConvertFToS; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; + user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(): unhandled case " << value_type << " to " << target_type << "\n"; } } else if (value_type.is_uint()) { if (target_type.is_float()) { @@ -355,39 +450,51 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { } else if (target_type.is_uint()) { op_code = SpvOpUConvert; } else if (target_type.is_int()) { - if (builder.is_capability_required(SpvCapabilityKernel)) { - op_code = SpvOpSatConvertUToS; - } else { - op_code = SpvOpBitcast; - } + op_code = SpvOpUConvert; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; + user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(): unhandled case " << value_type << " to " << target_type << "\n"; } } else if (value_type.is_int()) { if (target_type.is_float()) { op_code = SpvOpConvertSToF; } else if (target_type.is_uint()) { - if (builder.is_capability_required(SpvCapabilityKernel)) { - op_code = 
SpvOpSatConvertSToU; - } else { - op_code = SpvOpBitcast; - } + op_code = SpvOpSConvert; } else if (target_type.is_int()) { op_code = SpvOpSConvert; } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; + user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(): unhandled case " << value_type << " to " << target_type << "\n"; } } else { - internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << value_type << " to " << target_type << "\n"; + user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(): unhandled case " << value_type << " to " << target_type << "\n"; } SpvId result_id = builder.reserve_id(SpvResultId); - if (op_code == SpvOpBitcast) { - builder.append(SpvFactory::bitcast(target_type_id, result_id, src_id)); + builder.append(SpvFactory::convert(op_code, target_type_id, result_id, value_id)); + return result_id; +} + +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): " << op->value.type() << " to " << op->type << "\n"; + + Type value_type = op->value.type(); + Type target_type = op->type; + op->value.accept(this); + SpvId value_id = builder.current_id(); + + if((value_type.is_vector() && target_type.is_vector())) { + Type scalar_target_type = target_type.with_lanes(1); + Type scalar_value_type = value_type.with_lanes(1); + SpvFactory::Components value_components = split_vector(value_type, value_id); + SpvFactory::Components target_components; + for(SpvId value_component_id : value_components) { + target_components.push_back(cast_scalar(op, scalar_target_type, scalar_value_type, value_component_id)); + } + SpvId result_id = join_vector(target_type, target_components); + builder.update_id(result_id); } else { - builder.append(SpvFactory::convert(op_code, target_type_id, result_id, src_id)); + SpvId result_id = cast_scalar(op, target_type, value_type, value_id); + builder.update_id(result_id); } - builder.update_id(result_id); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Reinterpret *op) { @@ -620,7 +727,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (op->is_intrinsic(Call::IntrinsicOp::round)) { internal_assert(op->args.size() == 1); - visit_glsl_unaryop(op->type, op->args[0], GLSLstd450RoundEven); + visit_glsl_unaryop(op->type, op->args[0], GLSLstd450Round); } else if (op->is_intrinsic(Call::absd)) { internal_assert(op->args.size() == 2); @@ -732,6 +839,34 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { Expr e = lower_intrinsic(op); e.accept(this); return; + + } else if(op->name == "nan_f32") { + float value = NAN; + SpvId result_id = builder.declare_constant(Float(32), &value); + builder.update_id(result_id); + } else if(op->name == "inf_f32") { + float value = INFINITY; + SpvId result_id = builder.declare_constant(Float(32), &value); + builder.update_id(result_id); + } else if(op->name == "neg_inf_f32") { + float value = -INFINITY; + SpvId result_id = builder.declare_constant(Float(32), &value); + builder.update_id(result_id); + } else if(op->name == "is_finite_f32" || + op->name == "is_finite_f64") { + visit_unaryop(op->type, op->args[0], (SpvOp)SpvOpIsInf); + SpvId is_inf_id = builder.current_id(); + visit_unaryop(op->type, op->args[0], (SpvOp)SpvOpIsNan); + SpvId is_nan_id = builder.current_id(); + + SpvId type_id = builder.declare_type(op->type); + SpvId not_is_nan_id = builder.reserve_id(SpvResultId); + 
builder.append(SpvFactory::logical_not(type_id, not_is_nan_id, is_nan_id)); + SpvId not_is_inf_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::logical_not(type_id, not_is_inf_id, is_inf_id)); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::logical_and(type_id, result_id, not_is_inf_id, not_is_nan_id)); + builder.update_id(result_id); } else { // First check for a standard SPIR-V built-in @@ -829,47 +964,43 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, S internal_assert(op->index.type().is_vector()); - // determine the base type id for the source value - SpvId base_type_id = builder.type_of(base_id); - if (builder.is_pointer_type(base_type_id)) { - base_type_id = builder.lookup_base_type(base_type_id); - } - - // If this is a dense vector load and the buffer has a vector base type, - // then index the buffer using the base of the ramp divided by the number - // of lanes. - if (builder.is_vector_type(base_type_id)) { - Expr ramp_base = strided_ramp_base(op->index); - if (ramp_base.defined()) { - Expr ramp_index = (ramp_base / op->type.lanes()); - ramp_index.accept(this); - SpvId index_id = builder.current_id(); - load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); - return; - } + // If this is a load from a buffer block (mapped to a halide buffer) + // and the runtime array is a vector type, then attempt to do a + // dense vector load by using the base of the ramp divided by + // the number of lanes. + RuntimeArrayMap::const_iterator it = runtime_array_map.find(base_id); + if(it != runtime_array_map.end()) { + SpvId array_element_type_id = it->second; + if(builder.is_vector_type(array_element_type_id)) { + Expr ramp_base = strided_ramp_base(op->index); + if (ramp_base.defined()) { + Expr ramp_index = (ramp_base / op->type.lanes()); + ramp_index.accept(this); + SpvId index_id = builder.current_id(); + load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); + return; + } + } } op->index.accept(this); SpvId index_id = builder.current_id(); + SpvFactory::Components index_components = split_vector(op->index.type(), index_id); + // Gather vector elements. 
SpvFactory::Components loaded_values; SpvId scalar_value_type_id = builder.declare_type(op->type.with_lanes(1)); SpvId scalar_ptr_type_id = builder.declare_pointer_type(scalar_value_type_id, storage_class); - SpvId scalar_index_type_id = builder.declare_type(op->index.type().with_lanes(1)); - for (uint32_t i = 0; i < (uint32_t)op->index.type().lanes(); i++) { - SpvFactory::Indices extract_indices = {i}; - SpvId index_component_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_extract(scalar_index_type_id, index_component_id, index_id, extract_indices)); - load_from_scalar_index(index_component_id, base_id, scalar_value_type_id, scalar_ptr_type_id, storage_class); + for (SpvId scalar_index : index_components) { + load_from_scalar_index(scalar_index, base_id, scalar_value_type_id, scalar_ptr_type_id, storage_class); SpvId value_component_id = builder.current_id(); loaded_values.push_back(value_component_id); } // Create a composite vector from the individual loads if (loaded_values.size() > 1) { - SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_construct(type_id, result_id, loaded_values)); + SpvId result_id = join_vector(op->type, loaded_values); builder.update_id(result_id); } } @@ -914,6 +1045,25 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(SpvId index_id, Sp builder.append(SpvFactory::store(dst_id, value_id)); } +SpvFactory::Components CodeGen_Vulkan_Dev::SPIRV_Emitter::split_vector(Type type, SpvId value_id ) { + SpvFactory::Components value_components; + SpvId scalar_value_type_id = builder.declare_type(type.with_lanes(1)); + for (uint32_t i = 0; i < (uint32_t)type.lanes(); i++) { + SpvFactory::Indices extract_indices = {i}; + SpvId value_component_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_extract(scalar_value_type_id, value_component_id, value_id, extract_indices)); + value_components.push_back(value_component_id); + } + return value_components; +} + +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::join_vector(Type type, const SpvFactory::Components& value_components ) { + SpvId type_id = builder.declare_type(type); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_construct(type_id, result_id, value_components)); + return result_id; +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(): " << "base_id=" << base_id << " " @@ -923,39 +1073,39 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, S internal_assert(op->index.type().is_vector()); - // determine the base type id for the source value - SpvId base_type_id = builder.type_of(base_id); - if (builder.is_pointer_type(base_type_id)) { - base_type_id = builder.lookup_base_type(base_type_id); - } - - // If this is a dense vector load and the buffer has a vector base type, - // then index the buffer using the base of the ramp divided by the number - // of lanes. 
- if (builder.is_vector_type(base_type_id)) { - Expr ramp_base = strided_ramp_base(op->index); - if (ramp_base.defined()) { - Expr ramp_index = (ramp_base / op->value.type().lanes()); - ramp_index.accept(this); - SpvId index_id = builder.current_id(); - store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); - return; - } + // If this is a store to a buffer block (mapped to a halide buffer) + // and the runtime array is a vector type, then attempt to do a + // dense vector store by using the base of the ramp divided by + // the number of lanes. + RuntimeArrayMap::const_iterator it = runtime_array_map.find(base_id); + if(it != runtime_array_map.end()) { + SpvId array_element_type_id = it->second; + if(builder.is_vector_type(array_element_type_id)) { + Expr ramp_base = strided_ramp_base(op->index); + if (ramp_base.defined()) { + Expr ramp_index = (ramp_base / op->value.type().lanes()); + ramp_index.accept(this); + SpvId index_id = builder.current_id(); + store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); + return; + } + } } op->index.accept(this); SpvId index_id = builder.current_id(); + // Split vector value into components + internal_assert(op->index.type().lanes() <= op->value.type().lanes()); + SpvFactory::Components value_components = split_vector(op->value.type(), value_id); + SpvFactory::Components index_components = split_vector(op->index.type(), index_id); + // Scatter vector elements. SpvId scalar_value_type_id = builder.declare_type(op->value.type().with_lanes(1)); SpvId scalar_ptr_type_id = builder.declare_pointer_type(scalar_value_type_id, storage_class); - SpvId scalar_index_type_id = builder.declare_type(op->index.type().with_lanes(1)); - for (uint32_t i = 0; i < (uint32_t)op->index.type().lanes(); i++) { - SpvFactory::Indices extract_indices = {i}; - SpvId index_component_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_extract(scalar_index_type_id, index_component_id, index_id, extract_indices)); - SpvId value_component_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_extract(scalar_value_type_id, value_component_id, value_id, extract_indices)); + for (uint32_t i = 0; i < index_components.size(); i++) { + SpvId index_component_id = index_components[i]; + SpvId value_component_id = value_components[i]; store_at_scalar_index(index_component_id, base_id, scalar_value_type_id, scalar_ptr_type_id, storage_class, value_component_id); } } @@ -980,19 +1130,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { SpvId index_id = builder.current_id(); load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); } else { - - // If this is a dense vector load and the buffer has a vector base type, - // then index the buffer using the base of the ramp divided by the number - // of lanes. 
- if (op->type.is_vector()) { - Expr ramp_base = strided_ramp_base(op->index); - if (ramp_base.defined()) { - Expr ramp_index = (ramp_base / op->type.lanes()); - ramp_index.accept(this); - SpvId index_id = builder.current_id(); - load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); - } - } load_from_vector_index(op, base_id, type_id, ptr_type_id, storage_class); } } @@ -1604,7 +1741,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_entry_point(const Stmt &s, SpvId builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point_index, +void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint32_t entry_point_index, const std::string &entry_point_name, const std::vector &args) { @@ -1689,13 +1826,20 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point } } - // Add bindings for all device buffers as uniform buffers + // Add bindings for all device buffers declared as GLSL-style buffer blocks in uniform storage for (const auto &arg : args) { if (arg.is_buffer) { + // Check for dense loads & stores to determine the widest vector + // width we can safely index + CheckAlignedDenseVectorLoadStore check_dense(arg.name); + s.accept(&check_dense); + int lanes = check_dense.are_all_dense ? check_dense.lanes : 1; + // Declare the runtime array (which maps directly to the Halide device buffer) - SpvId element_type_id = builder.declare_type(arg.type); - SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); + Type array_element_type = arg.type.with_lanes(lanes); + SpvId array_element_type_id = builder.declare_type(array_element_type); + SpvId runtime_arr_type_id = builder.add_runtime_array(array_element_type_id); // Annotate the array with its stride SpvBuilder::Literals array_stride = {(uint32_t)(arg.type.bytes())}; @@ -1709,7 +1853,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point // Declare a pointer to the struct as a global variable SpvStorageClass storage_class = SpvStorageClassUniform; SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, storage_class); - SpvId param_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, storage_class); + SpvId buffer_block_var_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, storage_class); // Annotate the struct to indicate it's passed in a GLSL-style buffer block builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); @@ -1721,9 +1865,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(uint32_t entry_point // Set descriptor set and binding indices SpvBuilder::Literals dset_index = {entry_point_index}; SpvBuilder::Literals binding_index = {uint32_t(binding_counter++)}; - builder.add_annotation(param_id, SpvDecorationDescriptorSet, dset_index); - builder.add_annotation(param_id, SpvDecorationBinding, binding_index); - symbol_table.push(arg.name, {param_id, storage_class}); + builder.add_annotation(buffer_block_var_id, SpvDecorationDescriptorSet, dset_index); + builder.add_annotation(buffer_block_var_id, SpvDecorationBinding, binding_index); + symbol_table.push(arg.name, {buffer_block_var_id, storage_class}); + runtime_array_map[buffer_block_var_id] = array_element_type_id; descriptor_set.storage_buffer_count++; } } @@ -1782,144 +1927,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, declare_entry_point(s, kernel_func_id); // 
Declare all parameters -- scalar args and device buffers - declare_device_args(entry_point_index, name, args); - /* - // TODO: only add the SIMT intrinsics used - SpvFactory::Variables entry_point_variables; - auto intrinsics = {"WorkgroupId", "LocalInvocationId"}; - for (const std::string &intrinsic_name : intrinsics) { - - // The builtins are pointers to vec3 - SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); - SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, SpvStorageClassInput); - SpvId intrinsic_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, SpvStorageClassInput); - SpvId intrinsic_loaded_id = builder.reserve_id(); - builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id)); - symbol_table.push(intrinsic_name, {intrinsic_loaded_id, SpvStorageClassInput}); - - // Annotate that this is the specific builtin - SpvBuiltIn built_in_kind = starts_with(intrinsic_name, "Workgroup") ? SpvBuiltInWorkgroupId : SpvBuiltInLocalInvocationId; - SpvBuilder::Literals annotation_literals = {(uint32_t)built_in_kind}; - builder.add_annotation(intrinsic_id, SpvDecorationBuiltIn, annotation_literals); - - // Add the builtin to the interface - entry_point_variables.push_back(intrinsic_id); - } + declare_device_args(s, entry_point_index, name, args); - // Add the entry point with the appropriate execution model - // NOTE: exec_model must be GLCompute to work with Vulkan ... Kernel is only supported in OpenCL - uint32_t current_entry_point = builder.current_module().entry_point_count(); - builder.add_entry_point(kernel_func_id, SpvExecutionModelGLCompute, entry_point_variables); - */ - - /* - // GLSL-style: each input buffer is a runtime array in a buffer struct - // All other params get passed in as a single uniform block - // First, need to count scalar parameters to construct the uniform struct - SpvBuilder::StructMemberTypes param_struct_members; - for (const auto &arg : args) { - if (!arg.is_buffer) { - SpvId arg_type_id = builder.declare_type(arg.type); - param_struct_members.push_back(arg_type_id); - } - } - - // Add a binding for a uniform buffer packed with all scalar args - uint32_t binding_counter = 0; - if (!param_struct_members.empty()) { - const std::string struct_name = std::string("_struct") + name + std::string("_args"); - SpvId param_struct_type_id = builder.declare_struct(struct_name, param_struct_members); - - // Add a decoration describing the offset for each parameter struct member - uint32_t param_member_index = 0; - uint32_t param_member_offset = 0; - for (const auto &arg : args) { - if (!arg.is_buffer) { - SpvBuilder::Literals param_offset_literals = {param_member_offset}; - builder.add_struct_annotation(param_struct_type_id, param_member_index, SpvDecorationOffset, param_offset_literals); - param_member_offset += arg.type.bytes(); - param_member_index++; - } - } - - // Add a Block decoration for the parameter pack itself - builder.add_annotation(param_struct_type_id, SpvDecorationBlock); - - // Add a variable for the parameter pack - const std::string param_pack_var_name = std::string("_var") + name + std::string("_args"); - SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); - SpvId param_pack_var_id = builder.declare_global_variable(param_pack_var_name, param_pack_ptr_type_id, SpvStorageClassUniform); - - // We always pass in the parameter pack as the first binding - SpvBuilder::Literals binding_index = {0}; - 
SpvBuilder::Literals dset_index = {current_entry_point}; - builder.add_annotation(param_pack_var_id, SpvDecorationDescriptorSet, dset_index); - builder.add_annotation(param_pack_var_id, SpvDecorationBinding, binding_index); - descriptor_set.uniform_buffer_count++; - binding_counter++; - - // Declare all the args with appropriate offsets into the parameter struct - uint32_t scalar_index = 0; - for (const auto &arg : args) { - if (!arg.is_buffer) { - - SpvId arg_type_id = builder.declare_type(arg.type); - SpvId access_index_id = builder.declare_constant(UInt(32), &scalar_index); - SpvId pointer_type_id = builder.declare_pointer_type(arg_type_id, SpvStorageClassUniform); - SpvFactory::Indices access_indices = {access_index_id}; - SpvId access_chain_id = builder.declare_access_chain(pointer_type_id, param_pack_var_id, access_indices); - scalar_index++; - - SpvId param_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::load(arg_type_id, param_id, access_chain_id)); - symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); - } - } - } - - // Add bindings for all device buffers - for (const auto &arg : args) { - if (arg.is_buffer) { - - // Add required extension support for storage types - if (arg.type.is_int_or_uint()) { - if (arg.type.bits() == 8) { - builder.require_extension("SPV_KHR_8bit_storage"); - } else if (arg.type.bits() == 16) { - builder.require_extension("SPV_KHR_16bit_storage"); - } - } - - SpvId element_type_id = builder.declare_type(arg.type); - SpvId runtime_arr_type_id = builder.add_runtime_array(element_type_id); - SpvBuilder::StructMemberTypes struct_member_types = {runtime_arr_type_id}; - const std::string struct_name = std::string("_struct") + name + std::string("_b") + std::to_string(binding_counter); - SpvId struct_type_id = builder.declare_struct(struct_name, struct_member_types); - SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, SpvStorageClassUniform); - SpvId param_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, SpvStorageClassUniform); - - // Annotate the struct to indicate it's passed in a GLSL-style buffer block - builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); - - // Annotate the array with its stride - SpvBuilder::Literals array_stride = {(uint32_t)(arg.type.bytes())}; - builder.add_annotation(runtime_arr_type_id, SpvDecorationArrayStride, array_stride); - - // Annotate the offset for the array - SpvBuilder::Literals zero_literal = {uint32_t(0)}; - builder.add_struct_annotation(struct_type_id, 0, SpvDecorationOffset, zero_literal); - - // Set DescriptorSet and Binding - SpvBuilder::Literals dset_index = {current_entry_point}; - SpvBuilder::Literals binding_index = {uint32_t(binding_counter++)}; - builder.add_annotation(param_id, SpvDecorationDescriptorSet, dset_index); - builder.add_annotation(param_id, SpvDecorationBinding, binding_index); - symbol_table.push(arg.name, {param_id, SpvStorageClassUniform}); - descriptor_set.storage_buffer_count++; - } - } - descriptor_set_table.push_back(descriptor_set); - */ + // Traverse s.accept(this); // Insert return statement end delimiter From 37efad073d040996310eab1401344d426fec5f9d Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 13 Oct 2022 16:56:23 -0700 Subject: [PATCH 049/166] Add missing bitwise and logical and methods. Cleanups. 
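The `logical_and` and `bitwise_and` factory helpers added in this patch round out the operations that the `is_finite_f32` / `is_finite_f64` lowering from the previous patch composes: that lowering emits `OpIsNan` and `OpIsInf`, negates each with `OpLogicalNot`, and joins the results with `OpLogicalAnd`. A minimal host-side C++ sketch of the same predicate (illustrative only, not part of the patch):

```
// Rough equivalent of the emitted SPIR-V sequence for is_finite_f32:
//   !OpIsNan(x) && !OpIsInf(x)
#include <cmath>

bool is_finite_f32(float x) {
    return !std::isnan(x) && !std::isinf(x);
}
```
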
--- src/SpirvIR.cpp | 10 +++++++++- src/SpirvIR.h | 4 +++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 745163fd1932..3516b9778a99 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -2166,10 +2166,18 @@ SpvInstruction SpvFactory::bitwise_not(SpvId type_id, SpvId result_id, SpvId src return unary_op(SpvOpNot, type_id, result_id, src_id); } +SpvInstruction SpvFactory::bitwise_and(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { + return binary_op(SpvOpBitwiseAnd, type_id, result_id, src_a_id, src_b_id); +} + SpvInstruction SpvFactory::logical_not(SpvId type_id, SpvId result_id, SpvId src_id) { return unary_op(SpvOpLogicalNot, type_id, result_id, src_id); } +SpvInstruction SpvFactory::logical_and(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { + return binary_op(SpvOpLogicalAnd, type_id, result_id, src_a_id, src_b_id); +} + SpvInstruction SpvFactory::shift_right_logical(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id) { SpvInstruction inst = SpvInstruction::make(SpvOpShiftRightLogical); inst.add_operand(type_id); @@ -4048,7 +4056,7 @@ std::ostream &operator<<(std::ostream &stream, const SpvInstruction &inst) { stream << std::string(" ") << std::to_string(*data); break; } else if (inst.value_type(i) == SpvBitMaskLiteral) { - stream << std::string(" ") << std::hex << std::showbase << std::uppercase << inst.operand(i); + stream << std::string(" ") << std::hex << std::showbase << std::uppercase << inst.operand(i) << std::dec; } else { stream << std::string(" ") << std::to_string(inst.operand(i)); } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index c52f8cb4e50b..b67f18e2cc18 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -543,7 +543,7 @@ class SpvBuilder { using FunctionMap = std::unordered_map; // Internal methods for creating ids, keys, and look ups - + SpvId make_id(SpvKind kind); TypeKey make_type_key(const Type &type, uint32_t array_size = 1) const; @@ -642,7 +642,9 @@ struct SpvFactory { static SpvInstruction memory_barrier(SpvId memory_scope_id, SpvId semantics_mask_id); static SpvInstruction control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, SpvId semantics_mask_id); static SpvInstruction bitwise_not(SpvId type_id, SpvId result_id, SpvId src_id); + static SpvInstruction bitwise_and(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); static SpvInstruction logical_not(SpvId type_id, SpvId result_id, SpvId src_id); + static SpvInstruction logical_and(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); static SpvInstruction shift_right_logical(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id); static SpvInstruction shift_right_arithmetic(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id); static SpvInstruction multiply_extended(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); From 82b267bd87d32c18a6c89a8dbec922097fc4b0c5 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 13 Oct 2022 16:57:03 -0700 Subject: [PATCH 050/166] Add comments about necessary packages on Ubuntu v22.04 vs earlier versions --- README_vulkan.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index 5385ddb2ff86..07244453f24b 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -53,23 +53,25 @@ https://www.intel.com/content/www/us/en/download-center/home.html On Ubuntu Linux, proprietary drivers can be installed via 'apt' using PPA's for each 
vendor. -For AMD: +For AMD on Ubuntu v22.04: ``` $ sudo add-apt-repository ppa:oibaf/graphics-drivers $ sudo apt update $ sudo apt upgrade -$ sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-utils +$ sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools ``` -For NVIDIA: +For NVIDIA on Ubuntu v22.04: ``` $ sudo add-apt-repository ppa:graphics-drivers/ppa $ sudo apt update $ sudo apt upgrade # - replace ### with latest driver release (eg 515) -$ sudo apt install nvidia-driver-### nvidia-settings vulkan vulkan-tools ``` +For earlier versions of Ubuntu (eg v20.x or v18.x), the contents of the `vulkan-tools` package were distributed as `vulkan-utils`, so use this package instead. + Note that only valid drivers for your system should be installed since there have been reports of the Vulkan loader segfaulting just by having a non-supported driver present. Specifically, the seemingly generic `mesa-vulkan-drivers` actually includes the AMD From a266dc88a7e5aa0df2eff192a7e9053e64243eff Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 13 Oct 2022 16:59:14 -0700 Subject: [PATCH 051/166] Clang tidy & format pass. --- src/CodeGen_Vulkan_Dev.cpp | 70 +++++++++++++++++++------------------- src/SpirvIR.h | 14 ++++---- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index c0f881cfda2e..b3b222aa8ece 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -129,9 +129,9 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void store_at_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id); void store_at_vector_index(const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id); - SpvFactory::Components split_vector(Type type, SpvId value_id ); - SpvId join_vector(Type type, const SpvFactory::Components& value_components ); - SpvId cast_scalar(const Cast* op, Type target_type, Type value_type, SpvId value_id); + SpvFactory::Components split_vector(Type type, SpvId value_id); + SpvId join_vector(Type type, const SpvFactory::Components &value_components); + SpvId cast_scalar(const Cast *op, Type target_type, Type value_type, SpvId value_id); using BuiltinMap = std::unordered_map; const BuiltinMap spirv_builtin = { @@ -212,10 +212,10 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { SymbolScope symbol_table; // Map from a variable ID for a buffer to its corresponding runtime array type - using RuntimeArrayMap = std::unordered_map; + using RuntimeArrayMap = std::unordered_map; RuntimeArrayMap runtime_array_map; - // Keep track of the descriptor sets so we can add a sidecar to the + // Keep track of the descriptor sets so we can add a sidecar to the // module indicating which descriptor set to use for each entry point struct DescriptorSet { std::string entry_point_name; uint32_t uniform_buffer_count = 0; @@ -246,7 +246,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // member stores the number of lanes in those loads and stores. // // FIXME: Refactor this and the version in CodeGen_OpenGLCompute_Dev to a common place! 
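To make the analysis described above concrete: an access is treated as dense and aligned when its index is a stride-1 ramp and the buffer's known alignment (modulus and remainder) is divisible by the vector width. When every access to a buffer qualifies with the same lane count, the buffer's runtime array element type is widened to that many lanes, and a dense access then addresses a single widened element at the ramp base divided by the lane count. A minimal sketch of the check and the index math (illustrative C++ with hypothetical names, not part of the patch):

```
#include <cstdint>

// Mirrors the per-access conditions used by CheckAlignedDenseVectorLoadStore:
// a stride-1 ramp index whose alignment modulus and remainder are both
// multiples of the lane count.
bool is_dense_and_aligned(bool is_stride1_ramp, int64_t modulus,
                          int64_t remainder, int lanes) {
    return is_stride1_ramp && (modulus % lanes == 0) && (remainder % lanes == 0);
}

// Once the runtime array element is widened to 'lanes' lanes, a dense access
// whose ramp starts at 'ramp_base' maps to one element of the widened array.
int64_t widened_element_index(int64_t ramp_base, int lanes) {
    return ramp_base / lanes;  // e.g. a 4-wide load at offset 32 -> element 8
}
```
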
-// +// class CheckAlignedDenseVectorLoadStore : public IRVisitor { public: // True if all loads and stores from the buffer are dense, aligned, and all @@ -430,7 +430,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { } } -SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(const Cast* op, Type target_type, Type value_type, SpvId value_id) { +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(const Cast *op, Type target_type, Type value_type, SpvId value_id) { SpvId target_type_id = builder.declare_type(target_type); SpvOp op_code = SpvOpNop; @@ -481,12 +481,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { op->value.accept(this); SpvId value_id = builder.current_id(); - if((value_type.is_vector() && target_type.is_vector())) { + if ((value_type.is_vector() && target_type.is_vector())) { Type scalar_target_type = target_type.with_lanes(1); Type scalar_value_type = value_type.with_lanes(1); SpvFactory::Components value_components = split_vector(value_type, value_id); SpvFactory::Components target_components; - for(SpvId value_component_id : value_components) { + for (SpvId value_component_id : value_components) { target_components.push_back(cast_scalar(op, scalar_target_type, scalar_value_type, value_component_id)); } SpvId result_id = join_vector(target_type, target_components); @@ -840,20 +840,20 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { e.accept(this); return; - } else if(op->name == "nan_f32") { + } else if (op->name == "nan_f32") { float value = NAN; SpvId result_id = builder.declare_constant(Float(32), &value); builder.update_id(result_id); - } else if(op->name == "inf_f32") { + } else if (op->name == "inf_f32") { float value = INFINITY; SpvId result_id = builder.declare_constant(Float(32), &value); builder.update_id(result_id); - } else if(op->name == "neg_inf_f32") { + } else if (op->name == "neg_inf_f32") { float value = -INFINITY; SpvId result_id = builder.declare_constant(Float(32), &value); builder.update_id(result_id); - } else if(op->name == "is_finite_f32" || - op->name == "is_finite_f64") { + } else if (op->name == "is_finite_f32" || + op->name == "is_finite_f64") { visit_unaryop(op->type, op->args[0], (SpvOp)SpvOpIsInf); SpvId is_inf_id = builder.current_id(); visit_unaryop(op->type, op->args[0], (SpvOp)SpvOpIsNan); @@ -964,14 +964,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, S internal_assert(op->index.type().is_vector()); - // If this is a load from a buffer block (mapped to a halide buffer) - // and the runtime array is a vector type, then attempt to do a - // dense vector load by using the base of the ramp divided by + // If this is a load from a buffer block (mapped to a halide buffer) + // and the runtime array is a vector type, then attempt to do a + // dense vector load by using the base of the ramp divided by // the number of lanes. 
- RuntimeArrayMap::const_iterator it = runtime_array_map.find(base_id); - if(it != runtime_array_map.end()) { + RuntimeArrayMap::const_iterator it = runtime_array_map.find(base_id); + if (it != runtime_array_map.end()) { SpvId array_element_type_id = it->second; - if(builder.is_vector_type(array_element_type_id)) { + if (builder.is_vector_type(array_element_type_id)) { Expr ramp_base = strided_ramp_base(op->index); if (ramp_base.defined()) { Expr ramp_index = (ramp_base / op->type.lanes()); @@ -979,8 +979,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, S SpvId index_id = builder.current_id(); load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); return; - } - } + } + } } op->index.accept(this); @@ -1045,19 +1045,19 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(SpvId index_id, Sp builder.append(SpvFactory::store(dst_id, value_id)); } -SpvFactory::Components CodeGen_Vulkan_Dev::SPIRV_Emitter::split_vector(Type type, SpvId value_id ) { +SpvFactory::Components CodeGen_Vulkan_Dev::SPIRV_Emitter::split_vector(Type type, SpvId value_id) { SpvFactory::Components value_components; SpvId scalar_value_type_id = builder.declare_type(type.with_lanes(1)); for (uint32_t i = 0; i < (uint32_t)type.lanes(); i++) { SpvFactory::Indices extract_indices = {i}; SpvId value_component_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::composite_extract(scalar_value_type_id, value_component_id, value_id, extract_indices)); - value_components.push_back(value_component_id); + value_components.push_back(value_component_id); } return value_components; } -SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::join_vector(Type type, const SpvFactory::Components& value_components ) { +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::join_vector(Type type, const SpvFactory::Components &value_components) { SpvId type_id = builder.declare_type(type); SpvId result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::composite_construct(type_id, result_id, value_components)); @@ -1073,14 +1073,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, S internal_assert(op->index.type().is_vector()); - // If this is a store to a buffer block (mapped to a halide buffer) - // and the runtime array is a vector type, then attempt to do a - // dense vector store by using the base of the ramp divided by + // If this is a store to a buffer block (mapped to a halide buffer) + // and the runtime array is a vector type, then attempt to do a + // dense vector store by using the base of the ramp divided by // the number of lanes. 
- RuntimeArrayMap::const_iterator it = runtime_array_map.find(base_id); - if(it != runtime_array_map.end()) { + RuntimeArrayMap::const_iterator it = runtime_array_map.find(base_id); + if (it != runtime_array_map.end()) { SpvId array_element_type_id = it->second; - if(builder.is_vector_type(array_element_type_id)) { + if (builder.is_vector_type(array_element_type_id)) { Expr ramp_base = strided_ramp_base(op->index); if (ramp_base.defined()) { Expr ramp_index = (ramp_base / op->value.type().lanes()); @@ -1088,8 +1088,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, S SpvId index_id = builder.current_id(); store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); return; - } - } + } + } } op->index.accept(this); @@ -1830,8 +1830,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 for (const auto &arg : args) { if (arg.is_buffer) { - // Check for dense loads & stores to determine the widest vector - // width we can safely index + // Check for dense loads & stores to determine the widest vector + // width we can safely index CheckAlignedDenseVectorLoadStore check_dense(arg.name); s.accept(&check_dense); int lanes = check_dense.are_all_dense ? check_dense.lanes : 1; diff --git a/src/SpirvIR.h b/src/SpirvIR.h index b67f18e2cc18..45ddd126ddb0 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -365,11 +365,11 @@ class SpvBuilder { // Look up the ID associated with the type for a given variable ID SpvId type_of(SpvId variable_id) const; - // Top-Level declaration methods ... each of these is a convenvience + // Top-Level declaration methods ... each of these is a convenvience // function that checks to see if the requested thing has already been // declared, in which case it returns its existing id, otherwise it // adds a new declaration, and returns the new id. This avoids all - // the logic checks in the calling code, and also ensures that + // the logic checks in the calling code, and also ensures that // duplicates aren't created. SpvId declare_void_type(); @@ -392,7 +392,7 @@ class SpvBuilder { SpvId declare_symbol(const std::string &symbol, SpvId id, SpvId scope_id); // Top level creation methods for adding new items ... these have a limited - // number of checks and the caller must insure that duplicates aren't created + // number of checks and the caller must insure that duplicates aren't created SpvId add_type(const Type &type, uint32_t array_size = 1); SpvId add_struct(const std::string &name, const StructMemberTypes &member_types); SpvId add_runtime_array(SpvId base_type_id); @@ -412,7 +412,7 @@ class SpvBuilder { void add_execution_mode_local_size(SpvId entry_point_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z); - // Assigns a specific source language hint to the module + // Assigns a specific source language hint to the module void set_source_language(SpvSourceLanguage val); // Sets the addressing model to use for the module @@ -451,7 +451,7 @@ class SpvBuilder { // Returns true if the given capability is required by the module bool is_capability_required(SpvCapability) const; - // Change the current build location to the given block. All local + // Change the current build location to the given block. All local // declarations and instructions will be added here. 
void enter_block(const SpvBlock &block); @@ -508,7 +508,7 @@ class SpvBuilder { // Returns the symbol string for the given id (or an empty string if none is found) std::string lookup_symbol(SpvId id) const; - // Returns the current module being used for building + // Returns the current module being used for building SpvModule current_module() const; // Appends the given instruction to the current build location @@ -517,7 +517,7 @@ class SpvBuilder { // Finalizes the module and prepares it for encoding (must be called before module can be used) void finalize(); - // Encodes the current module to the given binary + // Encodes the current module to the given binary void encode(SpvBinary &binary) const; // Resets the builder and all internal state From 72b616723c3a1462f804d593b76750bd08545546 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 13 Oct 2022 17:02:46 -0700 Subject: [PATCH 052/166] Update Vulkan test results. Pass: 329 Fail: 36 --- README_vulkan.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index 07244453f24b..4f1c4631d5c6 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -193,14 +193,13 @@ https://github.com/KhronosGroup/SPIRV-Tools # Current Status -The following table outlines the state of the correctness tests (as of Oct-04) when +The following table outlines the state of the correctness tests (as of Oct-13) when run with `HL_JIT_TARGET=host-vulkan` (NOTE: some tests may need additional modifications to run under Vulkan): | Totals | *PASS* 🟢 | *FAIL* 🔴 | | -- | -- | -- | -| | 326 | 39 | - +| | 329 | 36 | | Test Name | Status | | :-- | --: | @@ -216,9 +215,9 @@ modifications to run under Vulkan): | correctness_reorder_rvars | *PASS* 🟢 | | correctness_lossless_cast | *PASS* 🟢 | | correctness_gpu_reuse_shared_memory | *FAIL* 🔴 | -| correctness_boundary_conditions | *FAIL* 🔴 | +| correctness_boundary_conditions | *PASS* 🟢 | | correctness_min_extent | *PASS* 🟢 | -| correctness_gpu_sum_scan | *PASS* 🟢 | +| correctness_gpu_sum_scan | *FAIL* 🔴 | | correctness_dynamic_allocation_in_gpu_kernel | *FAIL* 🔴 | | correctness_image_of_lists | *PASS* 🟢 | | correctness_tracing_broadcast | *PASS* 🟢 | @@ -232,7 +231,7 @@ modifications to run under Vulkan): | correctness_realize_larger_than_two_gigs | *PASS* 🟢 | | correctness_gpu_transpose | *FAIL* 🔴 | | correctness_side_effects | *PASS* 🟢 | -| correctness_logical | *FAIL* 🔴 | +| correctness_logical | *PASS* 🟢 | | correctness_func_lifetime_2 | *PASS* 🟢 | | correctness_device_crop | *FAIL* 🔴 | | correctness_print_loop_nest | *PASS* 🟢 | @@ -259,7 +258,7 @@ modifications to run under Vulkan): | correctness_gpu_assertion_in_kernel | *PASS* 🟢 | | correctness_low_bit_depth_noise | *PASS* 🟢 | | correctness_fuse | *PASS* 🟢 | -| correctness_vector_cast | *FAIL* 🔴 | +| correctness_vector_cast | *PASS* 🟢 | | correctness_concat | *PASS* 🟢 | | correctness_mod | *PASS* 🟢 | | correctness_parallel_rvar | *PASS* 🟢 | @@ -407,7 +406,7 @@ modifications to run under Vulkan): | correctness_reduction_schedule | *PASS* 🟢 | | correctness_many_small_extern_stages | *PASS* 🟢 | | correctness_parallel_alloc | *PASS* 🟢 | -| correctness_multiple_outputs | *FAIL* 🔴 | +| correctness_multiple_outputs | *PASS* 🟢 | | correctness_vectorize_nested | *PASS* 🟢 | | correctness_bad_likely | *PASS* 🟢 | | correctness_sliding_reduction | *PASS* 🟢 | From 4eb501210bb419de401c1bf6eb158696e2ef7bce Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 13 Oct 2022 17:04:16 -0700 Subject: [PATCH 053/166] 
Remove unused Produce/Consume visitor method --- src/CodeGen_Vulkan_Dev.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index b3b222aa8ece..51b38dc25905 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -101,7 +101,6 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const Let *) override; void visit(const LetStmt *) override; void visit(const AssertStmt *) override; - // void visit(const ProducerConsumer *) override; void visit(const For *) override; void visit(const Store *) override; void visit(const Provide *) override; From f885a20eb7b12b1e50823ce68a83997dbe5436ae Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 20 Oct 2022 12:00:50 -0700 Subject: [PATCH 054/166] Fix Molten VK initialization to work with v1.3+ loader Add support for direct casts for same-size types Add missing mux, mix, lerp, sinh, tanh, etc intrinsics Add explicit storage access for variables Add a macro to enable debug messages in Vulkan Memory Allocator --- README_vulkan.md | 10 +- src/BoundSmallAllocations.cpp | 9 + src/CodeGen_Vulkan_Dev.cpp | 577 +++++++++++++++++++-------------- src/SpirvIR.cpp | 53 ++- src/SpirvIR.h | 2 + src/runtime/mini_vulkan.h | 9 + src/runtime/vulkan.cpp | 4 +- src/runtime/vulkan_context.h | 23 +- src/runtime/vulkan_memory.h | 57 +++- src/runtime/vulkan_resources.h | 4 +- 10 files changed, 467 insertions(+), 281 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index 4f1c4631d5c6..0a6b6a608221 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -177,8 +177,14 @@ https://github.com/KhronosGroup/SPIRV-Tools - During CodeGen we enable capabilities in the SPIR-V binary for Int8, Int16, Float16, and Float64 based on the Halide IR, assuming - the device will support these requirements. We may need limit - these if targetting a lower class device. + the device will support these requirements. We may need to limit + these if targetting a lower class device. We may need to add Halide + feature flags to address these issues. + +- When generating vector lengths greater than 8 components, we assume + Vector16 support is available. This may not be the case when running + on lower class devices. Same as above ... we may need a feature flag + to limit vector lengths to a specific class of device. # Known TODO: diff --git a/src/BoundSmallAllocations.cpp b/src/BoundSmallAllocations.cpp index ae492d00d908..227da06c378f 100644 --- a/src/BoundSmallAllocations.cpp +++ b/src/BoundSmallAllocations.cpp @@ -76,6 +76,8 @@ class BoundSmallAllocations : public IRMutator { bool must_be_constant(MemoryType memory_type) const { return (memory_type == MemoryType::Register || (device_api == DeviceAPI::OpenGLCompute && + memory_type == MemoryType::GPUShared) || + (device_api == DeviceAPI::Vulkan && memory_type == MemoryType::GPUShared)); } @@ -132,6 +134,13 @@ class BoundSmallAllocations : public IRMutator { << "Only fixed-size allocations can be stored in shared memory " << "in OpenGL compute shaders. Try storing in MemoryType::Heap " << "instead."; + + user_assert(!(device_api == DeviceAPI::Vulkan && + op->memory_type == MemoryType::GPUShared)) + << "Allocation " << op->name << " has a dynamic size. " + << "Only fixed-size allocations can be stored in shared memory " + << "in Vulkan compute shaders. Try storing in MemoryType::Heap " + << "instead."; } const int64_t *size_ptr = bound.defined() ? 
as_const_int(bound) : nullptr; diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 51b38dc25905..e37c7412c224 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -116,33 +116,25 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const Acquire *) override; void visit(const Atomic *) override; - void visit_unaryop(Type t, const Expr &a, SpvOp op_code); - void visit_binop(Type t, const Expr &a, const Expr &b, SpvOp op_code); + void visit_unary_op(SpvOp op_code, Type t, const Expr &a); + void visit_binary_op(SpvOp op_code, Type t, const Expr &a, const Expr &b); + void visit_glsl_op(SpvId glsl_op_code, Type t, const std::vector &args); - void visit_glsl_unaryop(Type t, const Expr &a, SpvId glsl_op_code); - void visit_glsl_binop(Type t, const Expr &a, const Expr &b, SpvId glsl_op_code); + void load_from_scalar_index(const Load *op, SpvId index_id, SpvId variable_id, Type value_type, Type storage_type, SpvStorageClass storage_class); + void load_from_vector_index(const Load *op, SpvId variable_id, Type value_type, Type storage_type, SpvStorageClass storage_class); - void load_from_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class); - void load_from_vector_index(const Load *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class); - - void store_at_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id); - void store_at_vector_index(const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id); + void store_at_scalar_index(const Store *op, SpvId index_id, SpvId variable_id, Type value_type, Type storage_type, SpvStorageClass storage_class, SpvId value_id); + void store_at_vector_index(const Store *op, SpvId variable_id, Type value_type, Type storage_type, SpvStorageClass storage_class, SpvId value_id); SpvFactory::Components split_vector(Type type, SpvId value_id); SpvId join_vector(Type type, const SpvFactory::Components &value_components); - SpvId cast_scalar(const Cast *op, Type target_type, Type value_type, SpvId value_id); + SpvId cast_type(Type target_type, Type value_type, SpvId value_id); using BuiltinMap = std::unordered_map; - const BuiltinMap spirv_builtin = { - {"is_nan_f16", SpvOpIsNan}, - {"is_nan_f32", SpvOpIsNan}, - {"is_nan_f64", SpvOpIsNan}, - {"is_inf_f16", SpvOpIsInf}, - {"is_inf_f32", SpvOpIsInf}, - {"is_inf_f64", SpvOpIsInf}, - }; const BuiltinMap glsl_builtin = { + {"pow_f16", GLSLstd450Pow}, + {"pow_f32", GLSLstd450Pow}, {"acos_f16", GLSLstd450Acos}, {"acos_f32", GLSLstd450Acos}, {"acosh_f16", GLSLstd450Acosh}, @@ -151,8 +143,8 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { {"asin_f32", GLSLstd450Asin}, {"asinh_f16", GLSLstd450Asinh}, {"asinh_f32", GLSLstd450Asinh}, - {"atan2_f16", GLSLstd450Atan}, // also called atan in GLSL - {"atan2_f32", GLSLstd450Atan}, + {"atan2_f16", GLSLstd450Atan2}, + {"atan2_f32", GLSLstd450Atan2}, {"atan_f16", GLSLstd450Atan}, {"atan_f32", GLSLstd450Atan}, {"atanh_f16", GLSLstd450Atanh}, @@ -183,6 +175,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { {"tanh_f32", GLSLstd450Tanh}, {"trunc_f16", GLSLstd450Trunc}, {"trunc_f32", GLSLstd450Trunc}, + {"mix", GLSLstd450FMix}, }; // The SPIRV-IR builder @@ -210,9 +203,14 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { using ScopedSymbolBinding = ScopedBinding; SymbolScope symbol_table; - // Map from a variable ID for a 
buffer to its corresponding runtime array type - using RuntimeArrayMap = std::unordered_map; - RuntimeArrayMap runtime_array_map; + // Map from a variable ID to its corresponding storage type definition + struct StorageAccess { + SpvStorageClass storage_class = SpvStorageClassMax; + SpvId storage_type_id = SpvInvalidId; + Type storage_type; + }; + using StorageAccessMap = std::unordered_map; + StorageAccessMap storage_access_map; // Keep track of the descriptor sets so we can add a sidecar to the // module indicating which descriptor set to use for each entry point @@ -429,9 +427,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { } } -SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(const Cast *op, Type target_type, Type value_type, SpvId value_id) { +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_type, SpvId value_id) { + + if (value_type == target_type) { + return value_id; + } - SpvId target_type_id = builder.declare_type(target_type); SpvOp op_code = SpvOpNop; if (value_type.is_float()) { if (target_type.is_float()) { @@ -440,35 +441,54 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(const Cast *op, Type target op_code = SpvOpConvertFToU; } else if (target_type.is_int()) { op_code = SpvOpConvertFToS; - } else { - user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(): unhandled case " << value_type << " to " << target_type << "\n"; } } else if (value_type.is_uint()) { if (target_type.is_float()) { op_code = SpvOpConvertUToF; } else if (target_type.is_uint()) { - op_code = SpvOpUConvert; + if (target_type.bits() != value_type.bits()) { + op_code = SpvOpUConvert; // UConvert is only allowed on differing component widths + } } else if (target_type.is_int()) { - op_code = SpvOpUConvert; - } else { - user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(): unhandled case " << value_type << " to " << target_type << "\n"; + if (target_type.bits() != value_type.bits()) { + op_code = SpvOpSConvert; // SConvert is only allowed on differing component widths + } } } else if (value_type.is_int()) { if (target_type.is_float()) { op_code = SpvOpConvertSToF; } else if (target_type.is_uint()) { - op_code = SpvOpSConvert; - } else if (target_type.is_int()) { - op_code = SpvOpSConvert; - } else { - user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(): unhandled case " << value_type << " to " << target_type << "\n"; + if (target_type.bits() != value_type.bits()) { + op_code = SpvOpUConvert; // UConvert is only allowed on differing component widths + } + } else if (target_type.is_int() || target_type.is_uint()) { + if (target_type.bits() != value_type.bits()) { + op_code = SpvOpSConvert; // SConvert is only allowed on differing component widths + } } - } else { - user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_scalar(): unhandled case " << value_type << " to " << target_type << "\n"; } + // If none of the explicit conversions matched, do a direct bitcast if the total + // size of both types is the same + if (op_code == SpvOpNop) { + if (target_type.bytes() == value_type.bytes()) { + op_code = SpvOpBitcast; + } + } + + // Error If we still didn't find a suitable cast ... 
+ if (op_code == SpvOpNop) { + user_error << "Unhandled type cast from value type '" << value_type << "' to target type '" << target_type << "'!"; + return SpvInvalidId; + } + + SpvId target_type_id = builder.declare_type(target_type); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::convert(op_code, target_type_id, result_id, value_id)); + if (op_code == SpvOpBitcast) { + builder.append(SpvFactory::bitcast(target_type_id, result_id, value_id)); + } else { + builder.append(SpvFactory::convert(op_code, target_type_id, result_id, value_id)); + } return result_id; } @@ -477,22 +497,26 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Cast *op) { Type value_type = op->value.type(); Type target_type = op->type; + op->value.accept(this); SpvId value_id = builder.current_id(); if ((value_type.is_vector() && target_type.is_vector())) { - Type scalar_target_type = target_type.with_lanes(1); - Type scalar_value_type = value_type.with_lanes(1); - SpvFactory::Components value_components = split_vector(value_type, value_id); - SpvFactory::Components target_components; - for (SpvId value_component_id : value_components) { - target_components.push_back(cast_scalar(op, scalar_target_type, scalar_value_type, value_component_id)); + if (value_type.lanes() == target_type.lanes()) { + SpvId result_id = cast_type(target_type, value_type, value_id); + builder.update_id(result_id); + } else { + user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << " (incompatible lanes)\n"; } - SpvId result_id = join_vector(target_type, target_components); + } else if (value_type.is_scalar() && target_type.is_scalar()) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): scalar type (cast)\n"; + SpvId result_id = cast_type(target_type, value_type, value_id); builder.update_id(result_id); - } else { - SpvId result_id = cast_scalar(op, target_type, value_type, value_id); + } else if (value_type.bytes() == target_type.bytes()) { + SpvId result_id = cast_type(target_type, value_type, value_id); builder.update_id(result_id); + } else { + user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Cast): unhandled case " << op->value.type() << " to " << op->type << "\n"; } } @@ -508,17 +532,17 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Reinterpret *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Add *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Add): " << op->type << " ((" << op->a << ") + (" << op->b << "))\n"; - visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFAdd : SpvOpIAdd); + visit_binary_op(op->type.is_float() ? SpvOpFAdd : SpvOpIAdd, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Sub *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Sub): " << op->type << " ((" << op->a << ") - (" << op->b << "))\n"; - visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFSub : SpvOpISub); + visit_binary_op(op->type.is_float() ? SpvOpFSub : SpvOpISub, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mul *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mul): " << op->type << " ((" << op->a << ") * (" << op->b << "))\n"; - visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFMul : SpvOpIMul); + visit_binary_op(op->type.is_float() ? 
SpvOpFMul : SpvOpIMul, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { @@ -526,7 +550,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n"; if (op->type.is_float()) { - visit_binop(op->type, op->a, op->b, SpvOpFDiv); + visit_binary_op(SpvOpFDiv, op->type, op->a, op->b); } else { Expr e = lower_int_uint_div(op->a, op->b); e.accept(this); @@ -537,7 +561,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; if (op->type.is_float()) { // Takes sign of result from op->b - visit_binop(op->type, op->a, op->b, SpvOpFMod); + visit_binary_op(SpvOpFMod, op->type, op->a, op->b); } else { Expr e = lower_int_uint_mod(op->a, op->b); e.accept(this); @@ -546,35 +570,44 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Max *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Max): " << op->type << " Max((" << op->a << "), (" << op->b << "))\n"; - - std::string a_name = unique_name('a'); - std::string b_name = unique_name('b'); - Expr a = Variable::make(op->a.type(), a_name); - Expr b = Variable::make(op->b.type(), b_name); - Expr temp = Let::make(a_name, op->a, - Let::make(b_name, op->b, select(a > b, a, b))); - temp.accept(this); + SpvId op_code = SpvOpNop; + if (op->type.is_float()) { + op_code = GLSLstd450FMax; + } else if (op->type.is_int()) { + op_code = GLSLstd450SMax; + } else if (op->type.is_uint()) { + op_code = GLSLstd450UMax; + } else { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Max *op): unhandled type: " << op->type << "\n"; + } + std::vector args = {op->a, op->b}; + visit_glsl_op(op_code, op->type, args); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Min): " << op->type << " Min((" << op->a << "), (" << op->b << "))\n"; - std::string a_name = unique_name('a'); - std::string b_name = unique_name('b'); - Expr a = Variable::make(op->a.type(), a_name); - Expr b = Variable::make(op->b.type(), b_name); - Expr temp = Let::make(a_name, op->a, - Let::make(b_name, op->b, select(a < b, a, b))); - temp.accept(this); + SpvId op_code = SpvOpNop; + if (op->type.is_float()) { + op_code = GLSLstd450FMin; + } else if (op->type.is_int()) { + op_code = GLSLstd450SMin; + } else if (op->type.is_uint()) { + op_code = GLSLstd450UMin; + } else { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op): unhandled type: " << op->type << "\n"; + } + std::vector args = {op->a, op->b}; + visit_glsl_op(op_code, op->type, args); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(EQ): " << op->type << " (" << op->a << ") == (" << op->b << ")\n"; - visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFOrdEqual : SpvOpIEqual); + visit_binary_op(op->type.is_float() ? SpvOpFOrdEqual : SpvOpIEqual, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const NE *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(NE): " << op->type << " (" << op->a << ") != (" << op->b << ")\n"; - visit_binop(op->type, op->a, op->b, op->type.is_float() ? SpvOpFOrdNotEqual : SpvOpINotEqual); + visit_binary_op(op->type.is_float() ? 
SpvOpFOrdNotEqual : SpvOpINotEqual, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { @@ -589,7 +622,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op): unhandled type: " << op->a.type() << "\n"; } - visit_binop(op->type, op->a, op->b, op_code); + visit_binary_op(op_code, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { @@ -604,7 +637,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op): unhandled type: " << op->a.type() << "\n"; } - visit_binop(op->type, op->a, op->b, op_code); + visit_binary_op(op_code, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { @@ -619,7 +652,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op): unhandled type: " << op->a.type() << "\n"; } - visit_binop(op->type, op->a, op->b, op_code); + visit_binary_op(op_code, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { @@ -634,28 +667,22 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op): unhandled type: " << op->a.type() << "\n"; } - visit_binop(op->type, op->a, op->b, op_code); + visit_binary_op(op_code, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const And *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(And): " << op->type << " (" << op->a << ") && (" << op->b << ")\n"; - visit_binop(op->type, op->a, op->b, SpvOpLogicalAnd); + visit_binary_op(SpvOpLogicalAnd, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Or *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Or): " << op->type << " (" << op->a << ") || (" << op->b << ")\n"; - visit_binop(op->type, op->a, op->b, SpvOpLogicalOr); + visit_binary_op(SpvOpLogicalOr, op->type, op->a, op->b); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Not *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Not): " << op->type << " !(" << op->a << ")\n"; - - SpvId type_id = builder.declare_type(op->type); - op->a.accept(this); - SpvId src_id = builder.current_id(); - SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::logical_not(type_id, result_id, src_id)); - builder.update_id(result_id); + visit_unary_op(SpvOpLogicalNot, op->type, op->a); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { @@ -671,7 +698,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device && !(fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared)) { - uint32_t execution_scope = SpvDeviceScope; + // In Vulkan, execution scope is limited to Workgroup or Subgroup + uint32_t execution_scope = SpvWorkgroupScope; uint32_t memory_scope = SpvDeviceScope; uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | SpvMemorySemanticsWorkgroupMemoryMask); @@ -681,7 +709,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { builder.append(SpvFactory::control_barrier(exec_scope_id, memory_scope_id, control_mask_id)); } else if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device) { - uint32_t execution_scope = SpvDeviceScope; + // In Vulkan, execution scope is 
limited to Workgroup or Subgroup + uint32_t execution_scope = SpvWorkgroupScope; uint32_t memory_scope = SpvDeviceScope; uint32_t control_mask = (SpvMemorySemanticsSequentiallyConsistentMask | SpvMemorySemanticsUniformMemoryMask | @@ -704,8 +733,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { builder.append(SpvFactory::memory_barrier(memory_scope_id, control_mask_id)); } else { - - uint32_t execution_scope = SpvDeviceScope; + // In Vulkan, execution scope is limited to Workgroup or Subgroup + uint32_t execution_scope = SpvWorkgroupScope; uint32_t memory_scope = SpvDeviceScope; uint32_t control_mask = SpvMemorySemanticsMaskNone; SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); @@ -722,11 +751,13 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else { op_code = GLSLstd450SAbs; } - visit_glsl_unaryop(op->type, op->args[0], op_code); + visit_glsl_op(op_code, op->type, op->args); } else if (op->is_intrinsic(Call::IntrinsicOp::round)) { internal_assert(op->args.size() == 1); - visit_glsl_unaryop(op->type, op->args[0], GLSLstd450Round); + + // GLSL RoundEven matches Halide's implementation + visit_glsl_op(GLSLstd450RoundEven, op->type, op->args); } else if (op->is_intrinsic(Call::absd)) { internal_assert(op->args.size() == 2); @@ -743,21 +774,16 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } } else if (op->is_intrinsic(Call::bitwise_and)) { internal_assert(op->args.size() == 2); - visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseAnd); + visit_binary_op(SpvOpBitwiseAnd, op->type, op->args[0], op->args[1]); } else if (op->is_intrinsic(Call::bitwise_xor)) { internal_assert(op->args.size() == 2); - visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseXor); + visit_binary_op(SpvOpBitwiseXor, op->type, op->args[0], op->args[1]); } else if (op->is_intrinsic(Call::bitwise_or)) { internal_assert(op->args.size() == 2); - visit_binop(op->type, op->args[0], op->args[1], SpvOpBitwiseOr); + visit_binary_op(SpvOpBitwiseOr, op->type, op->args[0], op->args[1]); } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); - SpvId type_id = builder.declare_type(op->type); - op->args[0]->accept(this); - SpvId arg_id = builder.current_id(); - SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::bitwise_not(type_id, result_id, arg_id)); - builder.update_id(result_id); + visit_unary_op(SpvOpNot, op->type, op->args[0]); } else if (op->is_intrinsic(Call::if_then_else)) { if (op->type.is_vector()) { scalarize(op); @@ -782,7 +808,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else { internal_error << "div_round_to_zero of unhandled type.\n"; } - visit_binop(op->type, op->args[0], op->args[1], op_code); + visit_binary_op(op_code, op->type, op->args[0], op->args[1]); } else if (op->is_intrinsic(Call::IntrinsicOp::mod_round_to_zero)) { internal_assert(op->args.size() == 2); SpvOp op_code = SpvOpNop; @@ -795,16 +821,18 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else { internal_error << "mod_round_to_zero of unhandled type.\n"; } - visit_binop(op->type, op->args[0], op->args[1], op_code); + visit_binary_op(op_code, op->type, op->args[0], op->args[1]); } else if (op->is_intrinsic(Call::shift_right)) { + internal_assert(op->args.size() == 2); if (op->type.is_uint()) { - visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftRightLogical); + visit_binary_op(SpvOpShiftRightLogical, op->type, op->args[0], op->args[1]); } 
else { - visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftRightArithmetic); + visit_binary_op(SpvOpShiftRightArithmetic, op->type, op->args[0], op->args[1]); } } else if (op->is_intrinsic(Call::shift_left)) { - visit_binop(op->type, op->args[0], op->args[1], SpvOpShiftLeftLogical); + internal_assert(op->args.size() == 2); + visit_binary_op(SpvOpShiftLeftLogical, op->type, op->args[0], op->args[1]); } else if (op->is_intrinsic(Call::strict_float)) { // TODO: Enable/Disable RelaxedPrecision flags? internal_assert(op->args.size() == 1); @@ -815,7 +843,39 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { // a + (b - a)/2 Expr e = op->args[0] + (op->args[1] - op->args[0]) / 2; e.accept(this); + } else if (op->is_intrinsic(Call::lerp)) { + + // Implement lerp using GLSL's mix() function, which always uses + // floating point arithmetic. + Expr zero_val = op->args[0]; + Expr one_val = op->args[1]; + Expr weight = op->args[2]; + + internal_assert(weight.type().is_uint() || weight.type().is_float()); + if (weight.type().is_uint()) { + // Normalize integer weights to [0.0f, 1.0f] range. + internal_assert(weight.type().bits() < 32); + weight = Div::make(Cast::make(Float(32), weight), + Cast::make(Float(32), weight.type().max())); + } else if (op->type.is_uint()) { + // Round float weights down to next multiple of (1/op->type.imax()) + // to give same results as lerp based on integer arithmetic. + internal_assert(op->type.bits() < 32); + weight = floor(weight * op->type.max()) / op->type.max(); + } + + Type result_type = Float(32, op->type.lanes()); + Expr e = Call::make(result_type, "mix", {zero_val, one_val, weight}, Call::Extern); + + if (!op->type.is_float()) { + // Mirror rounding implementation of Halide's integer lerp. + e = Cast::make(op->type, floor(e + 0.5f)); + } + e.accept(this); + } else if (op->is_intrinsic(Call::mux)) { + Expr e = lower_mux(op); + e.accept(this); } else if (op->is_intrinsic(Call::widen_right_add) || op->is_intrinsic(Call::widen_right_mul) || op->is_intrinsic(Call::widen_right_sub) || @@ -851,11 +911,18 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { float value = -INFINITY; SpvId result_id = builder.declare_constant(Float(32), &value); builder.update_id(result_id); - } else if (op->name == "is_finite_f32" || - op->name == "is_finite_f64") { - visit_unaryop(op->type, op->args[0], (SpvOp)SpvOpIsInf); + } else if (starts_with(op->name, "is_nan_f")) { + internal_assert(op->args.size() == 1); + visit_unary_op((SpvOp)SpvOpIsNan, op->type, op->args[0]); + } else if (starts_with(op->name, "is_inf_f")) { + internal_assert(op->args.size() == 1); + visit_unary_op((SpvOp)SpvOpIsInf, op->type, op->args[0]); + } else if (starts_with(op->name, "is_finite_f")) { + + internal_assert(op->args.size() == 1); + visit_unary_op((SpvOp)SpvOpIsInf, op->type, op->args[0]); SpvId is_inf_id = builder.current_id(); - visit_unaryop(op->type, op->args[0], (SpvOp)SpvOpIsNan); + visit_unary_op((SpvOp)SpvOpIsNan, op->type, op->args[0]); SpvId is_nan_id = builder.current_id(); SpvId type_id = builder.declare_type(op->type); @@ -866,36 +933,18 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { SpvId result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::logical_and(type_id, result_id, not_is_inf_id, not_is_nan_id)); builder.update_id(result_id); - } else { - // First check for a standard SPIR-V built-in - BuiltinMap::const_iterator spirv_it = spirv_builtin.find(op->name); - if (spirv_it != spirv_builtin.end()) { - SpvId spirv_op 
= spirv_it->second; - if ((spirv_op == SpvOpIsInf) || (spirv_op == SpvOpIsNan)) { - internal_assert(op->args.size() == 1); - visit_unaryop(op->type, op->args[0], (SpvOp)spirv_op); - } else { - user_error << "Vulkan: unhandled SPIR-V builtin function '" << op->name << "' encountered.\n"; - } - } + } else { // If its not a standard SPIR-V built-in, see if there's a GLSL extended builtin BuiltinMap::const_iterator glsl_it = glsl_builtin.find(op->name); if (glsl_it == glsl_builtin.end()) { - user_error << "Vulkan: unknown function '" << op->name << "' encountered.\n"; + user_error << "Vulkan: unhandled SPIR-V GLSL builtin function '" << op->name << "' encountered.\n"; } + // Call the GLSL extended built-in SpvId glsl_op_code = glsl_it->second; - if (is_glsl_unary_op(glsl_op_code)) { - internal_assert(op->args.size() == 1); - visit_glsl_unaryop(op->type, op->args[0], glsl_op_code); - } else if (is_glsl_binary_op(glsl_op_code)) { - internal_assert(op->args.size() == 2); - visit_glsl_binop(op->type, op->args[0], op->args[1], glsl_op_code); - } else { - user_error << "Vulkan: unhandled SPIR-V GLSL builtin function '" << op->name << "' encountered.\n"; - } + visit_glsl_op(glsl_op_code, op->type, op->args); } } @@ -913,20 +962,23 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Select *op) { builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(SpvId index_id, SpvId base_id, SpvId result_type_id, SpvId ptr_type_id, SpvStorageClass storage_class) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(const Load *op, SpvId index_id, SpvId variable_id, Type value_type, Type storage_type, SpvStorageClass storage_class) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(): " << "index_id=" << index_id << " " - << "base_id=" << base_id << " " - << "ptr_type_id=" << ptr_type_id << " " - << "result_type_id=" << result_type_id << " " + << "variable_id=" << variable_id << " " + << "value_type=" << value_type << " " + << "storage_type=" << storage_type << " " << "storage_class=" << storage_class << "\n"; // determine the base type id for the source value - SpvId base_type_id = builder.type_of(base_id); + SpvId base_type_id = builder.type_of(variable_id); if (builder.is_pointer_type(base_type_id)) { base_type_id = builder.lookup_base_type(base_type_id); } + SpvId storage_type_id = builder.declare_type(storage_type); + SpvId ptr_type_id = builder.declare_pointer_type(storage_type, storage_class); + uint32_t zero = 0; SpvId src_id = SpvInvalidId; SpvId src_index_id = index_id; @@ -934,49 +986,57 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_scalar_index(SpvId index_id, S if (builder.is_struct_type(base_type_id)) { SpvId zero_id = builder.declare_constant(UInt(32), &zero); SpvFactory::Indices access_indices = {zero_id, src_index_id}; - src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + src_id = builder.declare_access_chain(ptr_type_id, variable_id, access_indices); } else { SpvFactory::Indices access_indices = {src_index_id}; - src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + src_id = builder.declare_access_chain(ptr_type_id, variable_id, access_indices); + } + } else if ((storage_class == SpvStorageClassWorkgroup) || (storage_class == SpvStorageClassFunction)) { + if (builder.is_array_type(base_type_id)) { + SpvFactory::Indices access_indices = {src_index_id}; + src_id = builder.declare_access_chain(ptr_type_id, variable_id, access_indices); + } else { + src_id = 
variable_id; } - } else if (storage_class == SpvStorageClassWorkgroup) { - SpvFactory::Indices access_indices = {src_index_id}; - src_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); - } else if (storage_class == SpvStorageClassFunction) { - src_id = base_id; } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): unhandled storage class encountered on op: " << storage_class << "\n"; } internal_assert(src_id != SpvInvalidId); - SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::load(result_type_id, result_id, src_id)); + SpvId value_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(storage_type_id, value_id, src_id)); + + // if the value type doesn't match the base for the pointer type, cast it accordingly + SpvId result_id = value_id; + if (storage_type != value_type) { + result_id = cast_type(value_type, storage_type, result_id); + } builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, SpvId variable_id, Type value_type, Type storage_type, SpvStorageClass storage_class) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(): " - << "base_id=" << base_id << " " - << "type_id=" << type_id << " " - << "ptr_type_id=" << ptr_type_id << " " + << "variable_id=" << variable_id << " " + << "value_type=" << value_type << " " + << "storage_type=" << storage_type << " " << "storage_class=" << storage_class << "\n"; internal_assert(op->index.type().is_vector()); - // If this is a load from a buffer block (mapped to a halide buffer) - // and the runtime array is a vector type, then attempt to do a + // If the runtime array is a vector type, then attempt to do a // dense vector load by using the base of the ramp divided by // the number of lanes. - RuntimeArrayMap::const_iterator it = runtime_array_map.find(base_id); - if (it != runtime_array_map.end()) { - SpvId array_element_type_id = it->second; - if (builder.is_vector_type(array_element_type_id)) { + StorageAccessMap::const_iterator it = storage_access_map.find(variable_id); + if (it != storage_access_map.end()) { + storage_type = it->second.storage_type; // use the storage type for the runtime array + SpvId storage_type_id = it->second.storage_type_id; + if (builder.is_vector_type(storage_type_id)) { Expr ramp_base = strided_ramp_base(op->index); if (ramp_base.defined()) { Expr ramp_index = (ramp_base / op->type.lanes()); ramp_index.accept(this); SpvId index_id = builder.current_id(); - load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); + load_from_scalar_index(op, index_id, variable_id, value_type, storage_type, storage_class); return; } } @@ -985,36 +1045,34 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::load_from_vector_index(const Load *op, S op->index.accept(this); SpvId index_id = builder.current_id(); - SpvFactory::Components index_components = split_vector(op->index.type(), index_id); - // Gather vector elements. 
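// (Fallback gather path: when the index is not a dense stride-1 ramp, the
// index vector is split into scalar components, each component is loaded
// individually through its own access chain, and the per-lane results are
// recombined into a single vector via OpCompositeConstruct in join_vector().)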
SpvFactory::Components loaded_values; - SpvId scalar_value_type_id = builder.declare_type(op->type.with_lanes(1)); - SpvId scalar_ptr_type_id = builder.declare_pointer_type(scalar_value_type_id, storage_class); + Type scalar_value_type = value_type.with_lanes(1); + SpvFactory::Components index_components = split_vector(op->index.type(), index_id); for (SpvId scalar_index : index_components) { - load_from_scalar_index(scalar_index, base_id, scalar_value_type_id, scalar_ptr_type_id, storage_class); + load_from_scalar_index(op, scalar_index, variable_id, scalar_value_type, storage_type, storage_class); SpvId value_component_id = builder.current_id(); loaded_values.push_back(value_component_id); } // Create a composite vector from the individual loads if (loaded_values.size() > 1) { - SpvId result_id = join_vector(op->type, loaded_values); + SpvId result_id = join_vector(value_type, loaded_values); builder.update_id(result_id); } } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(SpvId index_id, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(const Store *op, SpvId index_id, SpvId variable_id, Type value_type, Type storage_type, SpvStorageClass storage_class, SpvId value_id) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(): " << "index_id=" << index_id << " " - << "base_id=" << base_id << " " - << "type_id=" << type_id << " " - << "ptr_type_id=" << ptr_type_id << " " + << "variable_id=" << variable_id << " " + << "value_type=" << value_type << " " + << "storage_type=" << storage_type << " " << "storage_class=" << storage_class << " " << "value_id=" << value_id << "\n"; // determine the base type id for the source value - SpvId base_type_id = builder.type_of(base_id); + SpvId base_type_id = builder.type_of(variable_id); if (builder.is_pointer_type(base_type_id)) { base_type_id = builder.lookup_base_type(base_type_id); } @@ -1023,69 +1081,59 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_scalar_index(SpvId index_id, Sp SpvId dst_id = SpvInvalidId; SpvId dst_index_id = index_id; + SpvId ptr_type_id = builder.declare_pointer_type(storage_type, storage_class); if (storage_class == SpvStorageClassUniform) { if (builder.is_struct_type(base_type_id)) { SpvId zero_id = builder.declare_constant(UInt(32), &zero); SpvFactory::Indices access_indices = {zero_id, dst_index_id}; - dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + dst_id = builder.declare_access_chain(ptr_type_id, variable_id, access_indices); } else { SpvFactory::Indices access_indices = {dst_index_id}; - dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); + dst_id = builder.declare_access_chain(ptr_type_id, variable_id, access_indices); + } + } else if ((storage_class == SpvStorageClassWorkgroup) || (storage_class == SpvStorageClassFunction)) { + if (builder.is_array_type(base_type_id)) { + SpvFactory::Indices access_indices = {dst_index_id}; + dst_id = builder.declare_access_chain(ptr_type_id, variable_id, access_indices); + } else { + dst_id = variable_id; } - } else if (storage_class == SpvStorageClassWorkgroup) { - SpvFactory::Indices access_indices = {dst_index_id}; - dst_id = builder.declare_access_chain(ptr_type_id, base_id, access_indices); - } else if (storage_class == SpvStorageClassFunction) { - dst_id = base_id; } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): unhandled storage class encountered 
on op: " << storage_class << "\n"; } internal_assert(dst_id != SpvInvalidId); - builder.append(SpvFactory::store(dst_id, value_id)); -} -SpvFactory::Components CodeGen_Vulkan_Dev::SPIRV_Emitter::split_vector(Type type, SpvId value_id) { - SpvFactory::Components value_components; - SpvId scalar_value_type_id = builder.declare_type(type.with_lanes(1)); - for (uint32_t i = 0; i < (uint32_t)type.lanes(); i++) { - SpvFactory::Indices extract_indices = {i}; - SpvId value_component_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_extract(scalar_value_type_id, value_component_id, value_id, extract_indices)); - value_components.push_back(value_component_id); + // if the value type doesn't match the base for the pointer type, cast it accordingly + if (storage_type != value_type) { + value_id = cast_type(storage_type, value_type, value_id); } - return value_components; -} -SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::join_vector(Type type, const SpvFactory::Components &value_components) { - SpvId type_id = builder.declare_type(type); - SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_construct(type_id, result_id, value_components)); - return result_id; + builder.append(SpvFactory::store(dst_id, value_id)); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, SpvId base_id, SpvId type_id, SpvId ptr_type_id, SpvStorageClass storage_class, SpvId value_id) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, SpvId variable_id, Type value_type, Type storage_type, SpvStorageClass storage_class, SpvId value_id) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(): " - << "base_id=" << base_id << " " - << "type_id=" << type_id << " " - << "ptr_type_id=" << ptr_type_id << " " + << "variable_id=" << variable_id << " " + << "value_type=" << value_type << " " + << "storage_type=" << storage_type << " " << "storage_class=" << storage_class << "\n"; internal_assert(op->index.type().is_vector()); - // If this is a store to a buffer block (mapped to a halide buffer) - // and the runtime array is a vector type, then attempt to do a + // If the runtime array is a vector type, then attempt to do a // dense vector store by using the base of the ramp divided by // the number of lanes. - RuntimeArrayMap::const_iterator it = runtime_array_map.find(base_id); - if (it != runtime_array_map.end()) { - SpvId array_element_type_id = it->second; - if (builder.is_vector_type(array_element_type_id)) { + StorageAccessMap::const_iterator it = storage_access_map.find(variable_id); + if (it != storage_access_map.end()) { + storage_type = it->second.storage_type; + SpvId storage_type_id = it->second.storage_type_id; + if (builder.is_vector_type(storage_type_id)) { Expr ramp_base = strided_ramp_base(op->index); if (ramp_base.defined()) { Expr ramp_index = (ramp_base / op->value.type().lanes()); ramp_index.accept(this); SpvId index_id = builder.current_id(); - store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); + store_at_scalar_index(op, index_id, variable_id, value_type, storage_type, storage_class, value_id); return; } } @@ -1100,12 +1148,11 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::store_at_vector_index(const Store *op, S SpvFactory::Components index_components = split_vector(op->index.type(), index_id); // Scatter vector elements. 
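// (Fallback scatter path, mirroring the gather above: each lane's value
// component is written by its own scalar store at the address selected by
// the corresponding index component.)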
- SpvId scalar_value_type_id = builder.declare_type(op->value.type().with_lanes(1)); - SpvId scalar_ptr_type_id = builder.declare_pointer_type(scalar_value_type_id, storage_class); + Type scalar_value_type = op->value.type().with_lanes(1); for (uint32_t i = 0; i < index_components.size(); i++) { SpvId index_component_id = index_components[i]; SpvId value_component_id = value_components[i]; - store_at_scalar_index(index_component_id, base_id, scalar_value_type_id, scalar_ptr_type_id, storage_class, value_component_id); + store_at_scalar_index(op, index_component_id, variable_id, scalar_value_type, storage_type, storage_class, value_component_id); } } @@ -1116,20 +1163,30 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { // Construct the pointer to read from internal_assert(symbol_table.contains(op->name)); SymbolIdStorageClassPair id_and_storage_class = symbol_table.get(op->name); - SpvId base_id = id_and_storage_class.first; + SpvId variable_id = id_and_storage_class.first; SpvStorageClass storage_class = id_and_storage_class.second; - internal_assert(base_id != SpvInvalidId); + internal_assert(variable_id != SpvInvalidId); internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); - SpvId type_id = builder.declare_type(op->type); - SpvId ptr_type_id = builder.declare_pointer_type(type_id, storage_class); + // If this is a load from a buffer block (mapped to a halide buffer) or + // GPU shared memory, the pointer type must match the declared storage + // type for the runtime array. + Type value_type = op->type; + Type storage_type = value_type; + StorageAccessMap::const_iterator it = storage_access_map.find(variable_id); + if (it != storage_access_map.end()) { + storage_type = it->second.storage_type; + } + + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): value_type=" << op->type << " storage_type=" << storage_type << "\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): index_type=" << op->index.type() << " index=" << op->index << "\n"; if (op->index.type().is_scalar()) { op->index.accept(this); SpvId index_id = builder.current_id(); - load_from_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class); + load_from_scalar_index(op, index_id, variable_id, value_type, storage_type, storage_class); } else { - load_from_vector_index(op, base_id, type_id, ptr_type_id, storage_class); + load_from_vector_index(op, variable_id, value_type, storage_type, storage_class); } } @@ -1137,26 +1194,36 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): " << op->name << "[" << op->index << "] = (" << op->value << ")\n"; user_assert(is_const_one(op->predicate)) << "Predicated stores not supported by SPIR-V codegen!\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): value_type=" << op->value.type() << " value=" << op->value << "\n"; op->value.accept(this); SpvId value_id = builder.current_id(); - // Construct the pointer to write to internal_assert(symbol_table.contains(op->name)); SymbolIdStorageClassPair id_and_storage_class = symbol_table.get(op->name); - SpvId base_id = id_and_storage_class.first; + SpvId variable_id = id_and_storage_class.first; SpvStorageClass storage_class = id_and_storage_class.second; - internal_assert(base_id != SpvInvalidId); + internal_assert(variable_id != SpvInvalidId); internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); - SpvId type_id = builder.declare_type(op->value.type()); - SpvId 
ptr_type_id = builder.declare_pointer_type(type_id, storage_class); + Type value_type = op->value.type(); + Type storage_type = value_type; + + // If this is a store to a buffer block (mapped to a halide buffer) or + // GPU shared memory, the pointer type must match the declared storage + // type for the runtime array + StorageAccessMap::const_iterator it = storage_access_map.find(variable_id); + if (it != storage_access_map.end()) { + storage_type = it->second.storage_type; + } + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): value_type=" << value_type << " storage_type=" << storage_type << "\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): index_type=" << op->index.type() << " index=" << op->index << "\n"; if (op->index.type().is_scalar()) { op->index.accept(this); SpvId index_id = builder.current_id(); - store_at_scalar_index(index_id, base_id, type_id, ptr_type_id, storage_class, value_id); + store_at_scalar_index(op, index_id, variable_id, value_type, storage_type, storage_class, value_id); } else { - store_at_vector_index(op, base_id, type_id, ptr_type_id, storage_class, value_id); + store_at_vector_index(op, variable_id, value_type, storage_type, storage_class, value_id); } } @@ -1391,17 +1458,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Provide *) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { - SpvId base_type_id = SpvInvalidId; + SpvId storage_type_id = builder.declare_type(op->type); + SpvId array_type_id = SpvInvalidId; SpvId variable_id = SpvInvalidId; SpvStorageClass storage_class = SpvStorageClassGeneric; if (op->memory_type == MemoryType::GPUShared) { // Allocation of shared memory must be declared at global scope - internal_assert(op->extents.size() == 1 && is_const(op->extents[0])); + user_assert(op->extents.size() == 1 && is_const(op->extents[0])) + << "Allocation " << op->name << " has a dynamic size. " + << "Only fixed-size allocations are supported with Vulkan."; + int32_t size = op->constant_allocation_size(); - base_type_id = builder.declare_type(op->type, size); + array_type_id = builder.declare_type(op->type, size); storage_class = SpvStorageClassWorkgroup; // shared across workgroup debug(2) << "Vulkan: Allocate " << op->name << "[" << (uint32_t)size << "] in shared memory on device in global scope\n"; - SpvId ptr_type_id = builder.declare_pointer_type(base_type_id, storage_class); + SpvId ptr_type_id = builder.declare_pointer_type(array_type_id, storage_class); variable_id = builder.declare_global_variable(op->name, ptr_type_id, storage_class); } else { @@ -1413,15 +1484,20 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { // It must have a constant size. user_assert(size > 0) << "Allocation " << op->name << " has a dynamic size. " - << "Only fixed-size allocations are supported on the gpu. 
" - << "Try storing into shared memory instead."; + << "Only fixed-size allocations are supported with Vulkan."; - base_type_id = builder.declare_type(op->type, size); + array_type_id = builder.declare_type(op->type, size); storage_class = SpvStorageClassFunction; // function scope - SpvId ptr_type_id = builder.declare_pointer_type(base_type_id, storage_class); + SpvId ptr_type_id = builder.declare_pointer_type(array_type_id, storage_class); variable_id = builder.declare_variable(op->name, ptr_type_id, storage_class); } + StorageAccess access; + access.storage_class = storage_class; + access.storage_type_id = storage_type_id; + access.storage_type = op->type; + storage_access_map[variable_id] = access; + debug(3) << "Vulkan: Pushing allocation called " << op->name << " onto the symbol table\n"; symbol_table.push(op->name, {variable_id, storage_class}); op->body.accept(this); @@ -1430,6 +1506,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Free *op) { debug(3) << "Vulkan: Popping allocation called " << op->name << " off the symbol table\n"; internal_assert(symbol_table.contains(op->name)); + SpvId variable_id = symbol_table.get(op->name).first; + storage_access_map.erase(variable_id); symbol_table.pop(op->name); } @@ -1538,7 +1616,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Atomic *) { internal_error << "void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Atomic *): Atomic not implemented for codegen"; } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_unaryop(Type t, const Expr &a, SpvOp op_code) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_unary_op(SpvOp op_code, Type t, const Expr &a) { SpvId type_id = builder.declare_type(t); a.accept(this); SpvId src_a_id = builder.current_id(); @@ -1548,7 +1626,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_unaryop(Type t, const Expr &a, Spv builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_binop(Type t, const Expr &a, const Expr &b, SpvOp op_code) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_binary_op(SpvOp op_code, Type t, const Expr &a, const Expr &b) { SpvId type_id = builder.declare_type(t); a.accept(this); SpvId src_a_id = builder.current_id(); @@ -1560,30 +1638,42 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_binop(Type t, const Expr &a, const builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_glsl_unaryop(Type type, const Expr &a, SpvId glsl_op_code) { +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_glsl_op(SpvId glsl_op_code, Type type, const std::vector &args) { uint32_t type_id = builder.declare_type(type); - a.accept(this); - SpvId src_a_id = builder.current_id(); + + SpvFactory::Operands operands; + operands.reserve(args.size()); + for (const Expr &e : args) { + e.accept(this); + operands.push_back(builder.current_id()); + } + + // sanity check the expected number of operands + internal_assert(glsl_operand_count(glsl_op_code) == operands.size()); SpvId inst_set_id = builder.import_glsl_intrinsics(); SpvId result_id = builder.reserve_id(SpvResultId); - SpvFactory::Operands operands = {src_a_id}; builder.append(SpvFactory::extended(inst_set_id, glsl_op_code, type_id, result_id, operands)); builder.update_id(result_id); } -void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_glsl_binop(Type type, const Expr &a, const Expr &b, SpvId glsl_op_code) { - SpvId type_id = builder.declare_type(type); - a.accept(this); - SpvId src_a_id = builder.current_id(); - b.accept(this); - SpvId src_b_id = 
builder.current_id(); +SpvFactory::Components CodeGen_Vulkan_Dev::SPIRV_Emitter::split_vector(Type type, SpvId value_id) { + SpvFactory::Components value_components; + SpvId scalar_value_type_id = builder.declare_type(type.with_lanes(1)); + for (uint32_t i = 0; i < (uint32_t)type.lanes(); i++) { + SpvFactory::Indices extract_indices = {i}; + SpvId value_component_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_extract(scalar_value_type_id, value_component_id, value_id, extract_indices)); + value_components.push_back(value_component_id); + } + return value_components; +} - SpvId inst_set_id = builder.import_glsl_intrinsics(); +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::join_vector(Type type, const SpvFactory::Components &value_components) { + SpvId type_id = builder.declare_type(type); SpvId result_id = builder.reserve_id(SpvResultId); - SpvFactory::Operands operands = {src_a_id, src_b_id}; - builder.append(SpvFactory::extended(inst_set_id, glsl_op_code, type_id, result_id, operands)); - builder.update_id(result_id); + builder.append(SpvFactory::composite_construct(type_id, result_id, value_components)); + return result_id; } void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { @@ -1867,7 +1957,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 builder.add_annotation(buffer_block_var_id, SpvDecorationDescriptorSet, dset_index); builder.add_annotation(buffer_block_var_id, SpvDecorationBinding, binding_index); symbol_table.push(arg.name, {buffer_block_var_id, storage_class}); - runtime_array_map[buffer_block_var_id] = array_element_type_id; + + StorageAccess access; + access.storage_type_id = array_element_type_id; + access.storage_type = array_element_type; + access.storage_class = storage_class; + storage_access_map[buffer_block_var_id] = access; descriptor_set.storage_buffer_count++; } } diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 3516b9778a99..f80626c7d856 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -1002,7 +1002,7 @@ SpvId SpvBuilder::declare_type(const Type &type, uint32_t array_size) { SpvId SpvBuilder::declare_pointer_type(const Type &type, SpvStorageClass storage_class) { SpvId ptr_type_id = lookup_pointer_type(type, storage_class); if (ptr_type_id == SpvInvalidId) { - ptr_type_id = add_pointer_type(ptr_type_id, storage_class); + ptr_type_id = add_pointer_type(type, storage_class); } return ptr_type_id; } @@ -1407,7 +1407,7 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { signedness = type.is_uint() ? 0 : 1; } - type_id = make_id(SpvIntTypeId); + type_id = make_id(signedness ? SpvIntTypeId : SpvUIntTypeId); debug(3) << " add_integer_type: %" << type_id << " bits=" << type.bits() << " signed=" << (signedness ? "true" : "false") << "\n"; SpvInstruction inst = SpvFactory::integer_type(type_id, type.bits(), signedness); module.add_type(inst); @@ -1512,10 +1512,17 @@ SpvId SpvBuilder::lookup_pointer_type(SpvId base_type_id, SpvStorageClass storag SpvId SpvBuilder::add_pointer_type(const Type &type, SpvStorageClass storage_class) { SpvId base_type_id = declare_type(type); debug(3) << " add_pointer_type: base_type=" << type << " base_type_id=" << base_type_id << " storage_class=" << (uint32_t)(storage_class) << "\n"; + if (base_type_id == SpvInvalidId) { + internal_error << "SPIRV: Attempted to create pointer type for undeclared base type! 
" << type << "\n"; + } return add_pointer_type(base_type_id, storage_class); } SpvId SpvBuilder::add_pointer_type(SpvId base_type_id, SpvStorageClass storage_class) { + if (base_type_id == SpvInvalidId) { + internal_error << "SPIRV: Attempted to create pointer type for undeclared base type!\n"; + } + PointerTypeKey key = make_pointer_type_key(base_type_id, storage_class); PointerTypeMap::const_iterator it = pointer_type_map.find(key); if (it != pointer_type_map.end()) { @@ -2456,6 +2463,14 @@ SpvInstruction SpvFactory::extended(SpvId instruction_set_id, SpvId instruction_ /** GLSL extended instruction utility methods */ bool is_glsl_unary_op(SpvId glsl_op_code) { + return (glsl_operand_count(glsl_op_code) == 1); +} + +bool is_glsl_binary_op(SpvId glsl_op_code) { + return (glsl_operand_count(glsl_op_code) == 2); +} + +uint32_t glsl_operand_count(SpvId glsl_op_code) { switch (glsl_op_code) { case GLSLstd450Round: case GLSLstd450RoundEven: @@ -2479,6 +2494,8 @@ bool is_glsl_unary_op(SpvId glsl_op_code) { case GLSLstd450Acosh: case GLSLstd450Atanh: case GLSLstd450Cosh: + case GLSLstd450Sinh: + case GLSLstd450Tanh: case GLSLstd450Exp: case GLSLstd450Log: case GLSLstd450Exp2: @@ -2506,16 +2523,9 @@ bool is_glsl_unary_op(SpvId glsl_op_code) { case GLSLstd450FindILsb: case GLSLstd450FindSMsb: case GLSLstd450FindUMsb: - case GLSLstd450InterpolateAtCentroid: - return true; - default: - break; - }; - return false; -} - -bool is_glsl_binary_op(SpvId glsl_op_code) { - switch (glsl_op_code) { + case GLSLstd450InterpolateAtCentroid: { + return 1; // unary op + } case GLSLstd450Atan2: case GLSLstd450Pow: case GLSLstd450Modf: @@ -2534,12 +2544,25 @@ bool is_glsl_binary_op(SpvId glsl_op_code) { case GLSLstd450InterpolateAtOffset: case GLSLstd450InterpolateAtSample: case GLSLstd450NMax: - case GLSLstd450NMin: - return true; + case GLSLstd450NMin: { + return 2; // binary op + } + case GLSLstd450FMix: + case GLSLstd450IMix: + case GLSLstd450SmoothStep: + case GLSLstd450Fma: + case GLSLstd450FClamp: + case GLSLstd450UClamp: + case GLSLstd450SClamp: + case GLSLstd450NClamp: { + return 3; // trinary op + } + case GLSLstd450Bad: + case GLSLstd450Count: default: break; }; - return false; + return SpvInvalidId; } /** Specializations for reference counted classes */ diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 45ddd126ddb0..69c9a8b98974 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -58,6 +58,7 @@ enum SpvKind { SpvVoidTypeId, SpvBoolTypeId, SpvIntTypeId, + SpvUIntTypeId, SpvFloatTypeId, SpvVectorTypeId, SpvArrayTypeId, @@ -758,6 +759,7 @@ struct SpvModuleContents { /** Helper functions for determining calling convention of GLSL builtins **/ bool is_glsl_unary_op(SpvId glsl_op_code); bool is_glsl_binary_op(SpvId glsl_op_code); +uint32_t glsl_operand_count(SpvId glsl_op_code); /** Output the contents of a SPIR-V module in human-readable form **/ std::ostream &operator<<(std::ostream &stream, const SpvModule &); diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index 6e94c530abb8..5aa0d38e67d0 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -1029,6 +1029,10 @@ typedef enum VkObjectType { VK_OBJECT_TYPE_MAX_ENUM = 0x7FFFFFFF } VkObjectType; +typedef enum VkInstanceCreateFlagBits { + VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001, + VK_INSTANCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkInstanceCreateFlagBits; typedef VkFlags VkInstanceCreateFlags; typedef enum VkFormatFeatureFlagBits { @@ -6043,6 +6047,11 @@ typedef struct 
VkPhysicalDeviceShaderFloat16Int8Features { } VkPhysicalDeviceShaderFloat16Int8FeaturesKHR; #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES 1000082000 +// Provided by VK_KHR_portability_enumeration +#define VK_KHR_portability_enumeration 1 +#define VK_KHR_PORTABILITY_ENUMERATION_SPEC_VERSION 1 +#define VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME "VK_KHR_portability_enumeration" + } // extern "C" #endif // HALIDE_MINI_VULKAN_H diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 6e7bdaa2c9b3..a83e2c9fff4c 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -710,8 +710,8 @@ WEAK int halide_vulkan_run(void *user_context, // NOTE: while this could be re-used across multiple pipelines, we only know the storage requirements of this kernel's // inputs and outputs ... so create a pool specific to the number of buffers known at this time - uint32_t uniform_buffer_count = entry_point_data->uniform_buffer_count; // needs_scalar_uniform_buffer ? 1 : 0; - uint32_t storage_buffer_count = entry_point_data->storage_buffer_count; // entry_point_data->buffer_count; + uint32_t uniform_buffer_count = entry_point_data->uniform_buffer_count; + uint32_t storage_buffer_count = entry_point_data->storage_buffer_count; VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(entry_point_data->descriptor_pool)); if (result != VK_SUCCESS) { error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index ad747abe57f1..1e9985561c83 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -102,6 +102,15 @@ int vk_create_instance(void *user_context, const StringTable &requested_layers, debug(user_context) << " extension: " << required_instance_extensions[n] << "\n"; } + // If we're running under Molten VK, we must enable the portability extension and create flags + // to allow non-physical devices that are emulated to appear in the device list. 
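+ // (MoltenVK layers Vulkan on top of Metal, so its devices are reported through
+ // the portability subset; without the VK_KHR_portability_enumeration extension
+ // and the ENUMERATE_PORTABILITY create flag they may be skipped by
+ // vkEnumeratePhysicalDevices.)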
+ uint32_t create_flags = 0; + if (supported_instance_extensions.contains("VK_KHR_portability_enumeration") && + supported_instance_extensions.contains("VK_MVK_macos_surface")) { + create_flags = VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; + required_instance_extensions.append(user_context, "VK_KHR_portability_enumeration"); + } + VkApplicationInfo app_info = { VK_STRUCTURE_TYPE_APPLICATION_INFO, // struct type nullptr, // Next @@ -114,7 +123,7 @@ int vk_create_instance(void *user_context, const StringTable &requested_layers, VkInstanceCreateInfo create_info = { VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, nullptr, // Next - 0, // Flags + create_flags, // Flags &app_info, // ApplicationInfo (uint32_t)requested_layers.size(), requested_layers.data(), // Layers (uint32_t)required_instance_extensions.size(), required_instance_extensions.data() // Extensions @@ -250,6 +259,14 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk // Get the device features so that all supported features are enabled when device is created // VkPhysicalDeviceFeatures device_features = {}; + void *extended_features_ptr = nullptr; + void *standard_features_ptr = nullptr; + + debug(user_context) << "Vulkan: Querying for device features...\n"; + vkGetPhysicalDeviceFeatures(*physical_device, &device_features); + debug(user_context) << "Vulkan: Shader Float64 support: " << (device_features.shaderFloat64 ? "true" : "false") << "...\n"; + debug(user_context) << "Vulkan: Shader Int64 support: " << (device_features.shaderInt64 ? "true" : "false") << "...\n"; + debug(user_context) << "Vulkan: Shader Int16 support: " << (device_features.shaderInt16 ? "true" : "false") << "...\n"; // If the instance runtime supports querying extended device features, request them VkPhysicalDeviceShaderFloat16Int8FeaturesKHR shader_f16_i8_ext = { @@ -261,9 +278,6 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk &shader_f16_i8_ext, device_features}; - void *extended_features_ptr = nullptr; - void *standard_features_ptr = nullptr; - // Look for v1.1+ device feature query method PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2KHR"); // v1.0+ if (!vkGetPhysicalDeviceFeatures2KHR) { @@ -277,7 +291,6 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk debug(user_context) << "Vulkan: Shader Float16 support: " << (shader_f16_i8_ext.shaderFloat16 ? 
"true" : "false") << "...\n"; extended_features_ptr = (void *)(&device_features_ext); // pass v1.1 extended features (which also contains the standard features) } else { - vkGetPhysicalDeviceFeatures(*physical_device, &device_features); standard_features_ptr = &device_features; // pass v1.0 standard features } diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 7e458d84fba8..45001a33a46c 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -4,6 +4,9 @@ #include "internal/block_allocator.h" #include "vulkan_internal.h" +// Uncomment to enable verbose memory allocation debugging +// #define HL_VK_DEBUG_MEM 1 + namespace Halide { namespace Runtime { namespace Internal { @@ -162,13 +165,14 @@ void VulkanMemoryAllocator::initialize(void *user_context, } MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" << "user_context=" << user_context << " " << "block_allocator=" << (void *)(block_allocator) << " " << "request_size=" << (uint32_t)(request.size) << " " << "device=" << (void *)(device) << " " << "physical_device=" << (void *)(physical_device) << ") ...\n"; - +#endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); halide_abort_if_false(user_context, block_allocator != nullptr); @@ -176,12 +180,13 @@ MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest & } void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Mapping region (" << "user_context=" << user_context << " " << "region=" << (void *)(region) << " " << "device=" << (void *)(device) << " " << "physical_device=" << (void *)(physical_device) << ") ...\n"; - +#endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); halide_abort_if_false(user_context, block_allocator != nullptr); @@ -215,12 +220,13 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { } void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Unmapping region (" << "user_context=" << user_context << " " << "region=" << (void *)(region) << " " << "device=" << (void *)(device) << " " << "physical_device=" << (void *)(physical_device) << ") ...\n"; - +#endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -246,10 +252,11 @@ void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { } void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reclaiming region (" << "user_context=" << user_context << " " << "region=" << (void *)(region) << ") ... \n"; - +#endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -257,9 +264,10 @@ void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { } bool VulkanMemoryAllocator::collect(void *user_context) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Collecting unused memory (" << "user_context=" << user_context << ") ... 
\n"; - +#endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -267,9 +275,10 @@ bool VulkanMemoryAllocator::collect(void *user_context) { } void VulkanMemoryAllocator::release(void *user_context) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Releasing (" << "user_context=" << user_context << ") ... \n"; - +#endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -277,9 +286,10 @@ void VulkanMemoryAllocator::release(void *user_context) { } void VulkanMemoryAllocator::destroy(void *user_context) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Destroying allocator (" << "user_context=" << user_context << ") ... \n"; - +#endif block_allocator->destroy(this); } @@ -292,22 +302,25 @@ VulkanMemoryAllocator::default_config() { // -- void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *block) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" << "user_context=" << user_context << " " << "block=" << (void *)(block) << ") ... \n"; - +#endif VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); halide_abort_if_false(user_context, instance->device != nullptr); halide_abort_if_false(user_context, instance->physical_device != nullptr); halide_abort_if_false(user_context, block != nullptr); +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" << "size=" << (uint32_t)block->size << ", " << "dedicated=" << (block->dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(block->properties.usage) << " " << "caching=" << halide_memory_caching_name(block->properties.caching) << " " << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; +#endif // Find an appropriate memory type given the flags uint32_t memory_type = instance->select_memory_type(user_context, instance->physical_device, block->properties, 0); @@ -359,10 +372,13 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc VkMemoryRequirements memory_requirements = {0}; vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements); vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); + +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" << "size=" << (uint32_t)block->size << ", " << "alignment=" << (uint32_t)memory_requirements.alignment << ", " << "dedicated=" << (block->dedicated ? "true" : "false") << ")\n"; +#endif block->properties.alignment = memory_requirements.alignment; block->handle = (void *)device_memory; @@ -371,9 +387,11 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc } void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *block) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Deallocating block (" << "user_context=" << user_context << " " << "block=" << (void *)(block) << ") ... 
\n"; +#endif VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); @@ -381,21 +399,23 @@ void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *bl halide_abort_if_false(user_context, instance->physical_device != nullptr); halide_abort_if_false(user_context, block != nullptr); +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanBlockAllocator: deallocating block (" << "size=" << (uint32_t)block->size << ", " << "dedicated=" << (block->dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(block->properties.usage) << " " << "caching=" << halide_memory_caching_name(block->properties.caching) << " " << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; +#endif if (block->handle == nullptr) { - debug(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid handle!\n"; + error(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid handle!\n"; return; } VkDeviceMemory *device_memory = reinterpret_cast(block->handle); if (device_memory == nullptr) { - debug(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid device memory handle!\n"; + error(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid device memory handle!\n"; return; } @@ -439,7 +459,7 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, case MemoryVisibility::DefaultVisibility: case MemoryVisibility::InvalidVisibility: default: - debug(nullptr) << "VulkanMemoryAllocator: Unable to convert type! Invalid memory visibility request!\n\t" + error(nullptr) << "VulkanMemoryAllocator: Unable to convert type! Invalid memory visibility request!\n\t" << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; return invalid_memory_type; }; @@ -465,7 +485,7 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, break; case MemoryCaching::InvalidCaching: default: - debug(nullptr) << "VulkanMemoryAllocator: Unable to convert type! Invalid memory caching request!\n\t" + error(nullptr) << "VulkanMemoryAllocator: Unable to convert type! Invalid memory caching request!\n\t" << "caching=" << halide_memory_caching_name(properties.caching) << "\n"; return invalid_memory_type; }; @@ -501,7 +521,7 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, } if (result == invalid_memory_type) { - debug(nullptr) << "VulkanBlockAllocator: Failed to find appropriate memory type for given properties:\n\t" + error(nullptr) << "VulkanBlockAllocator: Failed to find appropriate memory type for given properties:\n\t" << "usage=" << halide_memory_usage_name(properties.usage) << " " << "caching=" << halide_memory_caching_name(properties.caching) << " " << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; @@ -514,9 +534,11 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, // -- void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *region) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Allocating region (" << "user_context=" << user_context << " " << "region=" << (void *)(region) << ") ... 
\n"; +#endif VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); @@ -524,6 +546,7 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re halide_abort_if_false(user_context, instance->physical_device != nullptr); halide_abort_if_false(user_context, region != nullptr); +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanRegionAllocator: Allocating region (" << "size=" << (uint32_t)region->size << ", " << "offset=" << (uint32_t)region->offset << ", " @@ -531,6 +554,7 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re << "usage=" << halide_memory_usage_name(region->properties.usage) << " " << "caching=" << halide_memory_caching_name(region->properties.caching) << " " << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; +#endif uint32_t usage_flags = instance->select_memory_usage(user_context, region->properties); @@ -583,15 +607,19 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re } void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion *region) { +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Deallocating region (" << "user_context=" << user_context << " " << "region=" << (void *)(region) << ") ... \n"; +#endif VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); halide_abort_if_false(user_context, instance->device != nullptr); halide_abort_if_false(user_context, instance->physical_device != nullptr); halide_abort_if_false(user_context, region != nullptr); + +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanRegionAllocator: Deallocating region (" << "size=" << (uint32_t)region->size << ", " << "offset=" << (uint32_t)region->offset << ", " @@ -599,9 +627,10 @@ void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion * << "usage=" << halide_memory_usage_name(region->properties.usage) << " " << "caching=" << halide_memory_caching_name(region->properties.caching) << " " << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; +#endif if (region->handle == nullptr) { - debug(nullptr) << "VulkanRegionAllocator: Unable to deallocate region! Invalid handle!\n"; + error(nullptr) << "VulkanRegionAllocator: Unable to deallocate region! Invalid handle!\n"; return; } diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 0923c216cc66..3b7b331a795b 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -839,7 +839,7 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan } VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &cache_entry->shader_module); - if ((result != VK_SUCCESS)) { // } || (cache_entry->shader_module == nullptr)) { + if ((result != VK_SUCCESS)) { error(user_context) << "Vulkan: vkCreateShaderModule Failed! Error returned: " << vk_get_error_name(result) << "\n"; vk_host_free(user_context, cache_entry->entry_point_data, allocator->callbacks()); vk_host_free(user_context, cache_entry, allocator->callbacks()); @@ -853,7 +853,7 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! 
Out of memory!\n"; return nullptr; } - memset(cache_entry->descriptor_set_layouts, 0, sizeof(VkDescriptorSetLayout)); + memset(cache_entry->descriptor_set_layouts, 0, cache_entry->entry_point_count * sizeof(VkDescriptorSetLayout)); } #ifdef DEBUG_RUNTIME From fe3e4181a2a38889244a1d08b2f7d69c1983437c Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 12:33:59 -0700 Subject: [PATCH 055/166] Disable dynamic shared memory portion of test for Vulkan (since its not supported yet) --- test/correctness/gpu_reuse_shared_memory.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/correctness/gpu_reuse_shared_memory.cpp b/test/correctness/gpu_reuse_shared_memory.cpp index 899b882d25d4..2cdb7a53b3f2 100644 --- a/test/correctness/gpu_reuse_shared_memory.cpp +++ b/test/correctness/gpu_reuse_shared_memory.cpp @@ -191,6 +191,8 @@ int main(int argc, char **argv) { printf("Running dynamic shared test\n"); if (t.has_feature(Target::OpenGLCompute) && memory_type == MemoryType::GPUShared) { printf("Skipping test because GL doesn't support dynamic sizes for shared memory\n"); + } else if(t.has_feature(Target::Vulkan) && memory_type == MemoryType::GPUShared) { + printf("Skipping test because Vulkan doesn't support dynamic sizes for shared memory\n"); } else { if (dynamic_shared_test(memory_type) != 0) { return -1; From 6afaca218875fc0e1c38d02ba8258c5156fee2dd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 12:34:40 -0700 Subject: [PATCH 056/166] Disable uncached portion of test for Vulkan (since it may OOM) --- test/correctness/gpu_allocation_cache.cpp | 36 +++++++++++++---------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/test/correctness/gpu_allocation_cache.cpp b/test/correctness/gpu_allocation_cache.cpp index 97215b95aea5..68ae18e3bae9 100644 --- a/test/correctness/gpu_allocation_cache.cpp +++ b/test/correctness/gpu_allocation_cache.cpp @@ -149,22 +149,26 @@ int main(int argc, char **argv) { } } - // Now benchmark with and without, (just informational, as this isn't a performance test) - double t1 = Tools::benchmark([&]() { - test1(true, false); - test2(true, false); - test3(true, false); - }); - - double t2 = Tools::benchmark([&]() { - test1(false, false); - test2(false, false); - test3(false, false); - }); - - printf("Runtime with cache: %f\n" - "Without cache: %f\n", - t1, t2); + // Vulkan will OOM unless allocation cache is used ... 
skip this since we just ran the same tests above concurrently + if(!target.has_feature(Target::Vulkan)) { + + // Now benchmark with and without, (just informational, as this isn't a performance test) + double t1 = Tools::benchmark([&]() { + test1(true, false); + test2(true, false); + test3(true, false); + }); + + double t2 = Tools::benchmark([&]() { + test1(false, false); + test2(false, false); + test3(false, false); + }); + + printf("Runtime with cache: %f\n" + "Without cache: %f\n", + t1, t2); + } printf("Success!\n"); return 0; From 0cf364e8bca1e5f7053aa37444572817e5e45e6d Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 12:35:35 -0700 Subject: [PATCH 057/166] Disable float64 support in Type::supports_type() for Vulkan target since it's not widely supported --- src/Target.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Target.cpp b/src/Target.cpp index caeb8abd2c97..2e430a75f396 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -976,11 +976,13 @@ bool Target::supports_type(const Type &t) const { if (t.bits() == 64) { if (t.is_float()) { return !has_feature(Metal) && + !has_feature(Vulkan) && !has_feature(OpenGLCompute) && !has_feature(D3D12Compute) && (!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles)); } else { return (!has_feature(Metal) && + !has_feature(Vulkan) && !has_feature(OpenGLCompute) && !has_feature(D3D12Compute)); } From 3155f2f34efec363dd33a6f394052a4b95c74dea Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 12:36:32 -0700 Subject: [PATCH 058/166] Fix Shuffle to handle all known cases Hookup VulkanMemoryAllocator to gpu allocation cache. Fix if_then_else to allow calls and statements to be used Fix loop counter comparison, and don't allow dynamic loops to be unrolled. Fix scalarize to use CompositeInsert instead of VectorInsertDynamic Fix FMod to use FRem (cause SPIR-V's FMod doesn't do what you'd expect ... but FRem does?!) Use exact same sematics for barriers as GLSL Compute ... 
still not passing everything Fix SPIR-V block termination checks, keys for null constants, and other cleanups --- src/CodeGen_Vulkan_Dev.cpp | 308 ++++++++++++++++++------ src/SpirvIR.cpp | 40 ++- src/SpirvIR.h | 4 +- src/runtime/internal/block_allocator.h | 54 +++-- src/runtime/internal/region_allocator.h | 134 +++++++++-- src/runtime/vulkan.cpp | 53 +++- src/runtime/vulkan_memory.h | 53 +++- 7 files changed, 516 insertions(+), 130 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index e37c7412c224..58bfd1a1c50c 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -16,6 +16,7 @@ // Temporary: #include +#include #ifdef WITH_SPIRV @@ -107,6 +108,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const Allocate *) override; void visit(const Free *) override; void visit(const Realize *) override; + void visit(const ProducerConsumer *op) override; void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; @@ -340,10 +342,11 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(const Expr &e) { SpvId result_id = value_id; for (int i = 0; i < e.type().lanes(); i++) { extract_lane(e, i).accept(this); - SpvId vector_id = builder.current_id(); - SpvId composite_vector_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::vector_insert_dynamic(type_id, composite_vector_id, vector_id, value_id, i)); - result_id = composite_vector_id; + SpvId extracted_id = builder.current_id(); + SpvId composite_id = builder.reserve_id(SpvResultId); + SpvFactory::Indices indices = { (uint32_t)i }; + builder.append(SpvFactory::composite_insert(type_id, composite_id, extracted_id, value_id, indices)); + result_id = composite_id; } builder.update_id(result_id); } @@ -560,8 +563,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; if (op->type.is_float()) { - // Takes sign of result from op->b - visit_binary_op(SpvOpFMod, op->type, op->a, op->b); + // SPIR-V FMod is strangely not what we want .. FRem does what we need + visit_binary_op(SpvOpFRem, op->type, op->a, op->b); } else { Expr e = lower_int_uint_mod(op->a, op->b); e.accept(this); @@ -684,6 +687,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Not *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Not): " << op->type << " !(" << op->a << ")\n"; visit_unary_op(SpvOpLogicalNot, op->type, op->a); } +void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const ProducerConsumer *op) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(ProducerConsumer): name=" << op->name << " is_producer=" << (op->is_producer ? 
"true" : "false") << "\n"; + op->body.accept(this); +} void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Call): " << op->type << " " << op->name << " args=" << (uint32_t)op->args.size() << "\n"; @@ -695,53 +702,33 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n"; auto fence_type = *fence_type_ptr; - if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device && - !(fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared)) { - - // In Vulkan, execution scope is limited to Workgroup or Subgroup - uint32_t execution_scope = SpvWorkgroupScope; - uint32_t memory_scope = SpvDeviceScope; - uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | - SpvMemorySemanticsWorkgroupMemoryMask); - SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); - SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); - SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); - builder.append(SpvFactory::control_barrier(exec_scope_id, memory_scope_id, control_mask_id)); - - } else if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device) { - // In Vulkan, execution scope is limited to Workgroup or Subgroup - uint32_t execution_scope = SpvWorkgroupScope; - uint32_t memory_scope = SpvDeviceScope; - uint32_t control_mask = (SpvMemorySemanticsSequentiallyConsistentMask | + // Follow GLSL semantics for GLCompute ... + // + // barrier() -> control_barrier(Workgroup, Workgroup, AcquireRelease | WorkgroupMemory) + // + uint32_t execution_scope = SpvWorkgroupScope; + uint32_t memory_scope = SpvWorkgroupScope; + uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | SpvMemorySemanticsWorkgroupMemoryMask); + SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); + SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); + SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); + builder.append(SpvFactory::control_barrier(exec_scope_id, memory_scope_id, control_mask_id)); + + if ((fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device) || + (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared)) { + + // groupMemoryBarrier() -> memory_barrier(Workgroup, AcquireRelease | UniformMemory | WorkgroupMemory | ImageMemory) + // + uint32_t memory_mask = (SpvMemorySemanticsAcquireReleaseMask | SpvMemorySemanticsUniformMemoryMask | - SpvMemorySemanticsSubgroupMemoryMask | SpvMemorySemanticsWorkgroupMemoryMask | - SpvMemorySemanticsCrossWorkgroupMemoryMask | - SpvMemorySemanticsAtomicCounterMemoryMask | SpvMemorySemanticsImageMemoryMask); - SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); - SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); - SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); - builder.append(SpvFactory::control_barrier(exec_scope_id, memory_scope_id, control_mask_id)); - - } else if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared) { - uint32_t memory_scope = SpvWorkgroupScope; - uint32_t control_mask = (SpvMemorySemanticsAcquireReleaseMask | - SpvMemorySemanticsWorkgroupMemoryMask); - SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); - SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); - builder.append(SpvFactory::memory_barrier(memory_scope_id, control_mask_id)); - - } else { - // In 
Vulkan, execution scope is limited to Workgroup or Subgroup - uint32_t execution_scope = SpvWorkgroupScope; - uint32_t memory_scope = SpvDeviceScope; - uint32_t control_mask = SpvMemorySemanticsMaskNone; - SpvId exec_scope_id = builder.declare_constant(UInt(32), &execution_scope); - SpvId memory_scope_id = builder.declare_constant(UInt(32), &memory_scope); - SpvId control_mask_id = builder.declare_constant(UInt(32), &control_mask); - builder.append(SpvFactory::control_barrier(exec_scope_id, memory_scope_id, control_mask_id)); - } + SpvId memory_mask_id = builder.declare_constant(UInt(32), &memory_mask); + builder.append(SpvFactory::memory_barrier(memory_scope_id, memory_mask_id)); + } + SpvId result_id = builder.declare_null_constant(op->type); + builder.update_id(result_id); + } else if (op->is_intrinsic(Call::abs)) { internal_assert(op->args.size() == 1); @@ -785,12 +772,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { internal_assert(op->args.size() == 1); visit_unary_op(SpvOpNot, op->type, op->args[0]); } else if (op->is_intrinsic(Call::if_then_else)) { - if (op->type.is_vector()) { + Expr cond = op->args[0]; + if (const Broadcast *b = cond.as()) { + cond = b->value; + } + if (cond.type().is_vector()) { scalarize(op); + } else { // Generate Phi node if used as an expression. - internal_assert(op->args.size() == 3); - SpvFactory::BlockVariables block_vars = emit_if_then_else(op->args[0], op->args[1], op->args[2]); + internal_assert(op->args.size() == 2 || op->args.size() == 3); + Expr else_expr; + if( op->args.size() == 3 ) { + else_expr = op->args[2]; + } + SpvFactory::BlockVariables block_vars = emit_if_then_else(op->args[0], op->args[1], else_expr); SpvId type_id = builder.declare_type(op->type); SpvId result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::phi(type_id, result_id, block_vars)); @@ -1241,9 +1237,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LetStmt *let) { SpvId current_id = builder.current_id(); ScopedSymbolBinding binding(symbol_table, let->name, {current_id, SpvStorageClassFunction}); let->body.accept(this); - - // TODO: Figure out undef here? - builder.update_id(SpvInvalidId); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const AssertStmt *stmt) { @@ -1331,11 +1324,11 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { } else { - internal_assert(op->for_type == ForType::Serial) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit unhandled For type: " << op->for_type << "\n"; + debug(2) << " (serial for loop): min=" << op->min << " extent=" << op->extent << "\n"; - // TODO: Loop vars are alway int32_t right? - SpvId index_type_id = builder.declare_type(Int(32)); - SpvId index_var_type_id = builder.declare_pointer_type(index_type_id, SpvStorageClassFunction); + internal_assert(op->for_type == ForType::Serial) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit unhandled For type: " << op->for_type << "\n"; + user_assert(op->min.type() == op->extent.type()); + user_assert(op->min.type().is_int() || op->min.type().is_uint()); op->min.accept(this); SpvId min_id = builder.current_id(); @@ -1343,11 +1336,17 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { SpvId extent_id = builder.current_id(); // Compute max. 
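+ // (max = min + extent; the loop test below compares the index against this
+ // bound with a strict less-than, so the body runs for indices in [min, min + extent).)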
+ Type index_type = Int(32); + SpvId index_type_id = builder.declare_type(index_type); + SpvStorageClass storage_class = SpvStorageClassFunction; + SpvId index_var_type_id = builder.declare_pointer_type(index_type_id, storage_class); SpvId max_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::integer_add(index_type_id, max_id, min_id, extent_id)); // Declare loop var - SpvId loop_var_id = builder.declare_variable(unique_name("_loop_idx"), index_var_type_id, SpvStorageClassFunction, min_id); + const std::string loop_var_name = unique_name("_loop_idx"); + SpvId loop_var_id = builder.declare_variable(loop_var_name, index_var_type_id, storage_class, min_id); + symbol_table.push(loop_var_name, { loop_var_id, storage_class}); SpvId header_block_id = builder.reserve_id(SpvBlockId); SpvId top_block_id = builder.reserve_id(SpvBlockId); @@ -1358,7 +1357,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { SpvBlock header_block = builder.create_block(header_block_id); builder.enter_block(header_block); { - builder.append(SpvFactory::loop_merge(merge_block_id, continue_block_id, SpvLoopControlMaskNone)); + builder.append(SpvFactory::loop_merge(merge_block_id, continue_block_id, SpvLoopControlDontUnrollMask)); builder.append(SpvFactory::branch(top_block_id)); } builder.leave_block(); @@ -1370,7 +1369,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { SpvId loop_test_type_id = builder.declare_type(Bool()); SpvId loop_test_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::load(index_type_id, current_index_id, loop_var_id)); - builder.append(SpvFactory::less_than_equal(loop_test_type_id, loop_test_id, current_index_id, max_id, true)); + builder.append(SpvFactory::less_than(loop_test_type_id, loop_test_id, current_index_id, max_id, true)); builder.append(SpvFactory::conditional_branch(loop_test_id, body_block_id, merge_block_id)); } builder.leave_block(); @@ -1378,7 +1377,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { SpvBlock body_block = builder.create_block(body_block_id); builder.enter_block(body_block); { - ScopedSymbolBinding binding(symbol_table, op->name, {current_index_id, SpvStorageClassFunction}); + ScopedSymbolBinding binding(symbol_table, op->name, {current_index_id, storage_class}); op->body.accept(this); builder.append(SpvFactory::branch(continue_block_id)); } @@ -1396,6 +1395,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { builder.append(SpvFactory::branch(header_block_id)); } builder.leave_block(); + symbol_table.pop(loop_var_name); SpvBlock merge_block = builder.create_block(merge_block_id); builder.enter_block(merge_block); @@ -1573,6 +1573,9 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IfThenElse *op) { + if(!builder.current_function().is_defined()) { + user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IfThenElse *op): No active function for building!!\n"; + } emit_if_then_else(op->condition, op->then_case, op->else_case); } @@ -1581,19 +1584,172 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Evaluate *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { - internal_assert(op->vectors.size() == 2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op): SPIR-V codegen currently only supports shuffles of vector pairs.\n"; - SpvId type_id = builder.declare_type(op->type); - op->vectors[0].accept(this); - SpvId vector0_id = builder.current_id(); - 
op->vectors[1].accept(this); - SpvId vector1_id = builder.current_id(); + std::cout << " CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Shuffle): " + << "type=" << op->type << " " + << "vectors=" << (uint32_t)op->vectors.size() << " " + << "is_interleave=" << (op->is_interleave() ? "true" : "false") << " " + << "is_extract_element=" << (op->is_extract_element() ? "true" : "false") << "\n"; + + // Traverse all the arg vectors + uint32_t arg_idx = 0; + SpvFactory::Operands arg_ids; + arg_ids.reserve(op->vectors.size()); + for(const Expr& e : op->vectors) { + debug(2) << " CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Shuffle): Arg[" << arg_idx++ << "] => " << e << "\n"; + e.accept(this); + arg_ids.push_back(builder.current_id()); + } - SpvFactory::Indices indices; - indices.insert(indices.end(), op->indices.begin(), op->indices.end()); + if (op->is_interleave()) { + int op_lanes = op->type.lanes(); + internal_assert(!arg_ids.empty()); + int arg_lanes = op->vectors[0].type().lanes(); - SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::vector_shuffle(type_id, result_id, vector0_id, vector1_id, indices)); - builder.update_id(result_id); + std::cout << " vector interleave x" << (uint32_t)op->vectors.size() << " : "; + for(int idx : op->indices) { + std::cout << idx << " "; + } + std::cout << "\n"; + + if (arg_ids.size() == 1) { + + // 1 argument, just do a simple assignment via a cast + SpvId result_id = cast_type(op->type, op->vectors[0].type(), arg_ids[0]); + builder.update_id(result_id); + + } else if (arg_ids.size() == 2) { + + // 2 arguments, use a composite insert to update even and odd indices + uint32_t even_idx = 0; + uint32_t odd_idx = 1; + SpvFactory::Indices even_indices; + SpvFactory::Indices odd_indices; + for(int i = 0; i < op_lanes; ++i) { + even_indices.push_back(even_idx); + odd_indices.push_back(odd_idx); + even_idx += 2; + odd_idx += 2; + } + + SpvId type_id = builder.declare_type(op->type); + SpvId value_id = builder.declare_null_constant(op->type); + SpvId partial_id = builder.reserve_id(SpvResultId); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_insert(type_id, partial_id, arg_ids[0], value_id, even_indices)); + builder.append(SpvFactory::composite_insert(type_id, result_id, arg_ids[1], partial_id, odd_indices)); + builder.update_id(result_id); + + } else { + // 3+ arguments, shuffle via a vector literal + // selecting the appropriate elements of the vectors + int num_vectors = (int)op->vectors.size(); + std::vector vector_component_ids(num_vectors); + for(uint32_t i = 0; i < (uint32_t)arg_ids.size(); ++i) { + if(op->vectors[i].type().is_vector()) { + vector_component_ids[i] = split_vector(op->vectors[i].type(), arg_ids[i]); + } else { + vector_component_ids[i] = { arg_ids[i] }; + } + } + + SpvFactory::Components result_component_ids(op_lanes); + for (int i = 0; i < op_lanes; i++) { + int arg = i % num_vectors; + int arg_idx = i / num_vectors; + internal_assert(arg_idx <= arg_lanes); + result_component_ids[i] = vector_component_ids[arg][arg_idx]; + } + + SpvId result_id = join_vector(op->type, result_component_ids); + builder.update_id(result_id); + } + } else if (op->is_extract_element()) { + int idx = op->indices[0]; + internal_assert(idx >= 0); + internal_assert(idx <= op->vectors[0].type().lanes()); + if(op->vectors[0].type().is_vector()) { + SpvFactory::Indices indices = { (uint32_t)idx }; + SpvId type_id = builder.declare_type(op->type); + SpvId result_id = builder.reserve_id(SpvResultId); + 
builder.append(SpvFactory::composite_extract(type_id, result_id, arg_ids[0], indices)); + builder.update_id(result_id); + } else { + SpvId result_id = cast_type(op->type, op->vectors[0].type(), arg_ids[0]); + builder.update_id(result_id); + } + } else if (op->type.is_scalar()) { + // Deduce which vector we need. Apparently it's not required + // that all vectors have identical lanes, so a loop is required. + // Since idx of -1 means "don't care", we'll treat it as 0 to simplify. + SpvId result_id = SpvInvalidId; + int idx = std::max(0, op->indices[0]); + for (size_t vec_idx = 0; vec_idx < op->vectors.size(); vec_idx++) { + const int vec_lanes = op->vectors[vec_idx].type().lanes(); + if (idx < vec_lanes) { + if (op->vectors[vec_idx].type().is_vector()) { + SpvFactory::Indices indices = { (uint32_t)idx }; + SpvId type_id = builder.declare_type(op->type); + result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::composite_extract(type_id, result_id, arg_ids[vec_idx], indices)); + } else { + result_id = arg_ids[vec_idx]; + } + break; + } + idx -= vec_lanes; + } + + } else { + + // vector shuffle ... not interleaving + int op_lanes = op->type.lanes(); + int num_vectors = (int)op->vectors.size(); + + std::cout << " vector shuffle x" << num_vectors << " : "; + for(int idx : op->indices) { + std::cout << idx << " "; + } + std::cout << "\n"; + + if(num_vectors == 1) { + // 1 argument, just do a simple assignment via a cast + SpvId result_id = cast_type(op->type, op->vectors[0].type(), arg_ids[0]); + builder.update_id(result_id); + + } else if(num_vectors == 2) { + + // 2 arguments, use the builtin vector shuffle that takes a pair of vectors + SpvFactory::Indices indices; + indices.reserve(op->indices.size()); + indices.insert(indices.end(), op->indices.begin(), op->indices.end()); + SpvId type_id = builder.declare_type(op->type); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::vector_shuffle(type_id, result_id, arg_ids[0], arg_ids[1], indices)); + builder.update_id(result_id); + } else { + std::vector vector_component_ids(num_vectors); + for(uint32_t i = 0; i < (uint32_t)arg_ids.size(); ++i) { + if(op->vectors[i].type().is_vector()) { + vector_component_ids[i] = split_vector(op->vectors[i].type(), arg_ids[i]); + } else { + vector_component_ids[i] = { arg_ids[i] }; + } + } + + SpvFactory::Components result_component_ids(op_lanes); + for (int i = 0; i < op_lanes && i < (int)op->indices.size(); i++) { + int idx = op->indices[i]; + int arg = idx % num_vectors; + int arg_idx = idx / num_vectors; + internal_assert(arg_idx <= (int)vector_component_ids[arg].size()); + result_component_ids[i] = vector_component_ids[arg][arg_idx]; + } + + SpvId result_id = join_vector(op->type, result_component_ids); + builder.update_id(result_id); + + } + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const VectorReduce *) { @@ -1686,7 +1842,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { // TODO: Should we autodetect and/or force 32bit or 64bit? builder.set_addressing_model(SpvAddressingModelLogical); - // TODO: Is there a better memory model to use? + // TODO: Should we autodetect the VulkanMemoryModel extension and use that instead? 
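+ // (GLSL450 is the baseline memory model for GLCompute shaders; the Vulkan
+ // memory model is an optional SPIR-V capability that would need to be
+ // detected before it could be enabled here.)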
builder.set_memory_model(SpvMemoryModelGLSL450); // NOTE: Execution model for Vulkan must be GLCompute which requires Shader support diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index f80626c7d856..7eaccae18a14 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -1,5 +1,6 @@ #include "SpirvIR.h" #include +#include #ifdef WITH_SPIRV @@ -252,6 +253,9 @@ bool SpvBlock::is_defined() const { bool SpvBlock::is_terminated() const { check_defined(); + if(contents->instructions.empty()) { + return false; + } switch (contents->instructions.back().op_code()) { case SpvOpBranch: case SpvOpBranchConditional: @@ -313,7 +317,7 @@ SpvBlock SpvFunction::create_block(SpvId block_id) { check_defined(); if (!contents->blocks.empty()) { SpvBlock last_block = tail_block(); - if (!last_block.is_terminated()) { + if (last_block.is_defined() && !last_block.is_terminated()) { last_block.add_instruction(SpvFactory::branch(block_id)); } } @@ -1545,9 +1549,11 @@ SpvBuilder::ConstantKey SpvBuilder::make_constant_key(uint8_t code, uint8_t bits key = hash_combine(key, lanes); key = hash_combine(key, bytes); - const int8_t *ptr = reinterpret_bits(data); - for (size_t i = 0; i < bytes; ++i) { - key = hash_combine(key, uint64_t(ptr[i])); + if(data != nullptr) { + const int8_t *ptr = reinterpret_bits(data); + for (size_t i = 0; i < bytes; ++i) { + key = hash_combine(key, uint64_t(ptr[i])); + } } return key; } @@ -2257,6 +2263,18 @@ SpvInstruction SpvFactory::store(SpvId ptr_id, SpvId obj_id, uint32_t access_mas return inst; } +SpvInstruction SpvFactory::composite_insert(SpvId type_id, SpvId result_id, SpvId object_id, SpvId composite_id, const SpvFactory::Indices &indices) { + SpvInstruction inst = SpvInstruction::make(SpvOpCompositeInsert); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operand(object_id); + inst.add_operand(composite_id); + for (SpvId i : indices) { + inst.add_immediate(i, SpvIntegerLiteral); + } + return inst; +} + SpvInstruction SpvFactory::composite_extract(SpvId type_id, SpvId result_id, SpvId composite_id, const SpvFactory::Indices &indices) { SpvInstruction inst = SpvInstruction::make(SpvOpCompositeExtract); inst.set_type_id(type_id); @@ -2278,13 +2296,23 @@ SpvInstruction SpvFactory::composite_construct(SpvId type_id, SpvId result_id, c return inst; } -SpvInstruction SpvFactory::vector_insert_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, uint32_t index) { +SpvInstruction SpvFactory::vector_insert_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, SpvId index_id) { SpvInstruction inst = SpvInstruction::make(SpvOpVectorInsertDynamic); inst.set_type_id(type_id); inst.set_result_id(result_id); inst.add_operand(vector_id); inst.add_operand(value_id); - inst.add_immediate(index, SpvIntegerLiteral); + inst.add_operand(index_id); + return inst; +} + +SpvInstruction SpvFactory::vector_extract_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, SpvId index_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpVectorExtractDynamic); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operand(vector_id); + inst.add_operand(value_id); + inst.add_operand(index_id); return inst; } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 69c9a8b98974..037510a886cd 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -654,8 +654,10 @@ struct SpvFactory { static SpvInstruction pointer_access_chain(SpvId type_id, SpvId result_id, SpvId base_id, SpvId element_id, const Indices &indices); static SpvInstruction 
load(SpvId type_id, SpvId result_id, SpvId ptr_id, uint32_t access_mask = 0x0); static SpvInstruction store(SpvId ptr_id, SpvId obj_id, uint32_t access_mask = 0x0); - static SpvInstruction vector_insert_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, uint32_t index); + static SpvInstruction vector_insert_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, SpvId index_id); + static SpvInstruction vector_extract_dynamic(SpvId type_id, SpvId result_id, SpvId vector_id, SpvId value_id, SpvId index_id); static SpvInstruction vector_shuffle(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, const Indices &indices); + static SpvInstruction composite_insert(SpvId type_id, SpvId result_id, SpvId object_id, SpvId composite_id, const SpvFactory::Indices &indices); static SpvInstruction composite_extract(SpvId type_id, SpvId result_id, SpvId composite_id, const Indices &indices); static SpvInstruction composite_construct(SpvId type_id, SpvId result_id, const Components &constituents); static SpvInstruction is_inf(SpvId type_id, SpvId result_id, SpvId src_id); diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index fbdf6f52b395..3c027c00775b 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -5,6 +5,8 @@ #include "memory_resources.h" #include "region_allocator.h" +// #define DEBUG_INTERNAL + namespace Halide { namespace Runtime { namespace Internal { @@ -51,7 +53,8 @@ class BlockAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - void reclaim(void *user_context, MemoryRegion *region); + void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); @@ -143,7 +146,7 @@ void BlockAllocator::initialize(void *user_context, const Config &cfg, const Mem MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Reserve (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Reserve (" << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)request.offset << " " << "size=" << (uint32_t)request.size << " " @@ -154,7 +157,7 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r #endif BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); if (block_entry == nullptr) { - debug(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Failed to allocate new empty block of requested size (" << (int32_t)(request.size) << " bytes)!\n"; return nullptr; } @@ -170,7 +173,7 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r size_t actual_size = constrain_requested_size(request.size); block_entry = create_block_entry(user_context, request.properties, actual_size, request.dedicated); if (block_entry == nullptr) { - debug(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Out of memory! 
Failed to allocate empty block of size (" << (int32_t)(actual_size) << " bytes)!\n"; return nullptr; } @@ -185,6 +188,15 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r return result; } +void BlockAllocator::release(void *user_context, MemoryRegion *memory_region) { + halide_abort_if_false(user_context, memory_region != nullptr); + RegionAllocator *allocator = RegionAllocator::find_allocator(user_context, memory_region); + if (allocator == nullptr) { + return; + } + allocator->release(user_context, memory_region); +} + void BlockAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { halide_abort_if_false(user_context, memory_region != nullptr); RegionAllocator *allocator = RegionAllocator::find_allocator(user_context, memory_region); @@ -199,14 +211,26 @@ bool BlockAllocator::collect(void *user_context) { BlockEntry *block_entry = block_list.back(); while (block_entry != nullptr) { BlockEntry *prev_entry = block_entry->prev_ptr; - const BlockResource *block = static_cast(block_entry->value); if (block->allocator == nullptr) { block_entry = prev_entry; continue; } - block->allocator->collect(user_context); +#ifdef DEBUG_INTERNAL + uint64_t reserved = block->reserved; +#endif + + bool collected = block->allocator->collect(user_context); + if(collected) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "Collected block (" + << "block=" << (void*)block << " " + << "reserved=" << (uint32_t)block->reserved << " " + << "recovered=" << (uint32_t)(reserved - block->reserved) << " " + << ")\n"; +#endif + } if (block->reserved == 0) { destroy_block_entry(user_context, block_entry); result = true; @@ -240,7 +264,7 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl MemoryRegion *result = allocator->reserve(user_context, request); if (result == nullptr) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Failed to allocate region of size (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Failed to allocate region of size (" << (int32_t)(request.size) << " bytes)!\n"; #endif // allocator has enough free space, but not enough contiguous space @@ -275,7 +299,7 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro size_t available = (block->memory.size - block->reserved); if (available >= size) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: find_block_entry (FOUND) (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: find_block_entry (FOUND) (" << "user_context=" << (void *)(user_context) << " " << "block_entry=" << (void *)(block_entry) << " " << "size=" << (uint32_t)size << " " @@ -311,7 +335,7 @@ BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties & RegionAllocator * BlockAllocator::create_region_allocator(void *user_context, BlockResource *block) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Creating region allocator (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Creating region allocator (" << "user_context=" << (void *)(user_context) << " " << "block_resource=" << (void *)(block) << ")...\n"; #endif @@ -329,7 +353,7 @@ BlockAllocator::create_region_allocator(void *user_context, BlockResource *block void BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator *region_allocator) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Destroying region allocator (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Destroying region allocator (" << 
"user_context=" << (void *)(user_context) << " " << "region_allocator=" << (void *)(region_allocator) << ")...\n"; #endif @@ -354,7 +378,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p } #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Creating block entry (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Creating block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " << "allocator=" << (void *)(allocators.block.allocate) << ")...\n"; @@ -372,7 +396,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Releasing block entry (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Releasing block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << ")...\n"; #endif @@ -384,7 +408,7 @@ void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::Blo void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Destroying block entry (" + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Destroying block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " << "deallocator=" << (void *)(allocators.block.deallocate) << ")...\n"; @@ -400,7 +424,7 @@ void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Blo void BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; #endif halide_abort_if_false(user_context, allocators.block.allocate != nullptr); MemoryBlock *memory_block = &(block->memory); @@ -410,7 +434,7 @@ void BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block void BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_INTERNAL - debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; + StackBasicPrinter<256>(nullptr) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; #endif halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); MemoryBlock *memory_block = &(block->memory); diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index d0ad2c3a24f8..169be2ce6bd4 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -8,6 +8,9 @@ namespace Halide { namespace Runtime { namespace Internal { +// Uncomment this #define to enable verbose debug messages for all allocations +// #define DEBUG_INTERNAL + // -- /** Allocator class interface for sub-allocating a contiguous @@ -44,7 +47,8 @@ class RegionAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - void reclaim(void 
*user_context, MemoryRegion *memory_region); + void release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse + void reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); @@ -134,7 +138,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & size_t remaining = block->memory.size - block->reserved; if (remaining < request.size) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Unable to reserve more memory from block " << "-- requested size (" << (int32_t)(request.size) << " bytes) " << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; #endif @@ -144,7 +148,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & BlockRegion *block_region = find_block_region(user_context, request); if (block_region == nullptr) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Failed to locate region for requested size (" << (int32_t)(request.size) << " bytes)!\n"; #endif return nullptr; @@ -152,7 +156,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & if (can_split(block_region, request.size)) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; #endif split_block_region(user_context, block_region, request.size, request.alignment); @@ -162,6 +166,13 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & return reinterpret_cast(block_region); } +void RegionAllocator::release(void *user_context, MemoryRegion *memory_region) { + BlockRegion *block_region = reinterpret_cast(memory_region); + halide_abort_if_false(user_context, block_region != nullptr); + halide_abort_if_false(user_context, block_region->block_ptr == block); + release_block_region(user_context, block_region); +} + void RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); @@ -232,7 +243,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe BlockRegion *prev_region = block_region->prev_ptr; #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Coalescing " + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Coalescing " << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; #endif @@ -250,7 +261,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe BlockRegion *next_region = block_region->next_ptr; #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Coalescing " + StackBasicPrinter<256>(nullptr) << 
"RegionAllocator: Coalescing " << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; #endif @@ -274,7 +285,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion alignment = conform_alignment(alignment, block->memory.properties.alignment); #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Conforming alignment (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Conforming alignment (" << "requested=" << (uint32_t)alignment << " " << "required=" << (uint32_t)block->memory.properties.alignment << " " << "actual=" << (uint32_t)alignment << ")\n"; @@ -285,9 +296,9 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion size_t empty_size = block_region->memory.size - adjusted_size; #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Splitting " + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)empty_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; + << "to create empty region (offset=" << (int32_t)adjusted_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; #endif BlockRegion *next_region = block_region->next_ptr; @@ -308,7 +319,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Creating block region (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Creating block region (" << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)offset << " " << "size=" << (uint32_t)size << " " @@ -327,7 +338,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo } #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Added block region (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Added block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif @@ -338,21 +349,45 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo block_region->memory.dedicated = dedicated; block_region->status = AllocationStatus::Available; block_region->block_ptr = block; + +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "Creating region (" + << "block_ptr=" << (void*)block_region->block_ptr << " " + << "block_region=" << (void*)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << ")\n"; +#endif + return block_region; } void RegionAllocator::release_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Releasing block region (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Releasing block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif + + if ((block_region->status == AllocationStatus::InUse) || + (block_region->status == AllocationStatus::Dedicated)) { + +#ifdef 
DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "Releasing region (" + << "block_ptr=" << (void*)block_region->block_ptr << " " + << "block_region=" << (void*)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)(block->reserved - block_region->memory.size) << " " + << ")\n"; +#endif + + block->reserved -= block_region->memory.size; + } block_region->status = AllocationStatus::Available; } void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Destroying block region (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Destroying block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif @@ -363,39 +398,69 @@ void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *bloc void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Allocating region (size=" << (int32_t)(block_region->memory.size) << ", offset=" << (int32_t)block_region->memory.offset << ")!\n"; + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Allocating region (user_context=" << (void *)(user_context) << " size=" << (int32_t)(block_region->memory.size) << " offset=" << (int32_t)block_region->memory.offset << ")!\n"; #endif halide_abort_if_false(user_context, allocators.region.allocate != nullptr); halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); MemoryRegion *memory_region = &(block_region->memory); - allocators.region.allocate(user_context, memory_region); + if(memory_region->handle == nullptr) { + allocators.region.allocate(user_context, memory_region); + +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "Allocating region (" + << "block_ptr=" << (void*)block_region->block_ptr << " " + << "block_region=" << (void*)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; +#endif + + } else { + +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "Re-using region (" + << "block_ptr=" << (void*)block_region->block_ptr << " " + << "block_region=" << (void*)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; +#endif + + } block_region->status = block_region->memory.dedicated ? 
AllocationStatus::Dedicated : AllocationStatus::InUse; block->reserved += block_region->memory.size; } void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Freeing block region (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Freeing block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif if ((block_region->status == AllocationStatus::InUse) || (block_region->status == AllocationStatus::Dedicated)) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Deallocating region (size=" << (int32_t)(block_region->memory.size) << ", offset=" << (int32_t)block_region->memory.offset << ")!\n"; + StackBasicPrinter<256>(nullptr) << "Freeing region (" + << "block_ptr=" << (void*)block_region->block_ptr << " " + << "block_region=" << (void*)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; #endif halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); allocators.region.deallocate(user_context, memory_region); block->reserved -= block_region->memory.size; block_region->memory.size = 0; + block_region->memory.offset = 0; + block_region->memory.handle = nullptr; } block_region->status = AllocationStatus::Available; } void RegionAllocator::release(void *user_context) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Releasing all regions (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Releasing all regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; #endif for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { @@ -405,24 +470,53 @@ void RegionAllocator::release(void *user_context) { bool RegionAllocator::collect(void *user_context) { #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Collecting free block regions (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Collecting free block regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; + + uint64_t reserved = block->reserved; + StackBasicPrinter<256>(nullptr) << " collecting unused regions (" + << "block_ptr=" << (void*)block << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; #endif + bool result = false; + uint32_t count = 0; for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { if (block_region->status == AllocationStatus::Available) { if (can_coalesce(block_region)) { + +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << " collecting region (" + << "block_ptr=" << (void*)block_region->block_ptr << " " + << "block_region=" << (void*)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; +#endif block_region = coalesce_block_regions(user_context, block_region); result = true; + count++; } } } + + if(result) { + +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << " collected unused regions (" + << "block_ptr=" << (void*)block << " " + << "region_count=" << (uint32_t)count << " " + << "collected=" << (uint32_t)(reserved - block->reserved) << " " + << ")\n"; +#endif + } return result; } void RegionAllocator::destroy(void *user_context) 
{ #ifdef DEBUG_INTERNAL - debug(user_context) << "RegionAllocator: Destroying all block regions (" + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Destroying all block regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; #endif for (BlockRegion *block_region = block->regions; block_region != nullptr;) { diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index a83e2c9fff4c..fae2b9a3ee88 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -97,7 +97,11 @@ WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_b // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); if (ctx.allocator && device_region && device_region->handle) { - ctx.allocator->reclaim(user_context, device_region); + if(halide_can_reuse_device_allocations(user_context)) { + ctx.allocator->release(user_context, device_region); + } else { + ctx.allocator->reclaim(user_context, device_region); + } } halide_buffer->device = 0; halide_buffer->device_interface->impl->release_module(); @@ -417,7 +421,11 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid } //// 15. Reclaim the staging buffer - ctx.allocator->reclaim(user_context, staging_region); + if(halide_can_reuse_device_allocations(user_context)) { + ctx.allocator->release(user_context, staging_region); + } else { + ctx.allocator->reclaim(user_context, staging_region); + } #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -563,7 +571,11 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ // unmap the pointer and reclaim the staging region ctx.allocator->unmap(user_context, staging_region); - ctx.allocator->reclaim(user_context, staging_region); + if(halide_can_reuse_device_allocations(user_context)) { + ctx.allocator->release(user_context, staging_region); + } else { + ctx.allocator->reclaim(user_context, staging_region); + } #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -653,7 +665,7 @@ WEAK int halide_vulkan_run(void *user_context, return halide_error_code_internal_error; } debug(user_context) << " found entry point [" - << entry_point_index << "/" << cache_entry->entry_point_count + << (entry_point_index + 1) << " of " << cache_entry->entry_point_count << "] '" << entry_name << "'\n"; halide_abort_if_false(user_context, cache_entry->descriptor_set_layouts != nullptr); @@ -787,6 +799,13 @@ WEAK int halide_vulkan_run(void *user_context, vkResetCommandPool(ctx.device, ctx.command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); #ifdef DEBUG_RUNTIME + debug(user_context) << "Vulkan: blocks_allocated=" + << (uint32_t)ctx.allocator->blocks_allocated() << " " + << "bytes_allocated_for_blocks=" << (uint32_t)ctx.allocator->bytes_allocated_for_blocks() << " " + << "regions_allocated=" << (uint32_t)ctx.allocator->regions_allocated() << " " + << "bytes_allocated_for_regions=" << (uint32_t)ctx.allocator->bytes_allocated_for_regions() << " " + << "\n"; + uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif @@ -836,11 +855,31 @@ WEAK const struct halide_device_interface_t *halide_vulkan_device_interface() { return &vulkan_device_interface; } +WEAK halide_device_allocation_pool vulkan_allocation_pool; + +WEAK int halide_vulkan_release_unused_device_allocations(void *user_context) { + debug(user_context) + << 
"halide_vulkan_release_unused_device_allocations (user_context: " << user_context + << ")\n"; + + VulkanContext ctx(user_context); + if (ctx.error != VK_SUCCESS) { + return -1; + } + + // collect all unused allocations + ctx.allocator->collect(user_context); + return 0; +} + namespace { -__attribute__((destructor)) -WEAK void -halide_vulkan_cleanup() { +WEAK __attribute__((constructor)) void register_vulkan_allocation_pool() { + vulkan_allocation_pool.release_unused = &halide_vulkan_release_unused_device_allocations; + halide_register_device_allocation_pool(&vulkan_allocation_pool); +} + +WEAK __attribute__((destructor)) void halide_vulkan_cleanup() { halide_vulkan_device_release(nullptr); } diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 45001a33a46c..e0e5ad54f1e0 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -56,7 +56,8 @@ class VulkanMemoryAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, MemoryRequest &request); - void reclaim(void *user_context, MemoryRegion *region); + void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); @@ -152,6 +153,10 @@ void VulkanMemoryAllocator::initialize(void *user_context, device = dev; physical_device = phys_dev; alloc_callbacks = callbacks; + region_count = 0; + region_byte_count = 0; + block_count = 0; + block_byte_count = 0; BlockAllocator::MemoryAllocators allocators; allocators.system = system_allocator; allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block}; @@ -251,6 +256,18 @@ void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { vkUnmapMemory(device, *device_memory); } +void VulkanMemoryAllocator::release(void *user_context, MemoryRegion *region) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Releasing region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; +#endif + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + + return block_allocator->release(this, region); +} + void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reclaiming region (" @@ -291,6 +308,10 @@ void VulkanMemoryAllocator::destroy(void *user_context) { << "user_context=" << user_context << ") ... \n"; #endif block_allocator->destroy(this); + region_count = 0; + region_byte_count = 0; + block_count = 0; + block_byte_count = 0; } const VulkanMemoryConfig & @@ -420,8 +441,20 @@ void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *bl } vkFreeMemory(instance->device, *device_memory, instance->alloc_callbacks); - instance->block_byte_count -= block->size; - instance->block_count--; + + if(instance->block_count > 0) { + instance->block_count--; + } else { + error(nullptr) << "VulkanRegionAllocator: Block counter invalid ... 
reseting to zero!\n"; + instance->block_count = 0; + } + + if( int64_t(instance->block_byte_count) - int64_t(block->size) >= 0 ) { + instance->block_byte_count -= block->size; + } else { + error(nullptr) << "VulkanRegionAllocator: Block byte counter invalid ... reseting to zero!\n"; + instance->block_byte_count = 0; + } vk_host_free(nullptr, device_memory, instance->alloc_callbacks); device_memory = nullptr; @@ -642,9 +675,19 @@ void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion * vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); region->handle = nullptr; - instance->region_byte_count -= region->size; - instance->region_count--; + if(instance->region_count > 0) { + instance->region_count--; + } else { + error(nullptr) << "VulkanRegionAllocator: Region counter invalid ... reseting to zero!\n"; + instance->region_count = 0; + } + if( int64_t(instance->region_byte_count) - int64_t(region->size) >= 0 ) { + instance->region_byte_count -= region->size; + } else { + error(nullptr) << "VulkanRegionAllocator: Region byte counter invalid ... reseting to zero!\n"; + instance->region_byte_count = 0; + } vk_host_free(nullptr, buffer, instance->alloc_callbacks); buffer = nullptr; } From 78f6cb57413b57f7ec9af3e9580f1883393ca2c3 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 12:40:01 -0700 Subject: [PATCH 059/166] Clang tidy & format pass --- src/CodeGen_C.cpp | 2 +- src/CodeGen_Vulkan_Dev.cpp | 84 +++++------ src/SpirvIR.cpp | 5 +- src/runtime/internal/block_allocator.h | 74 +++++----- src/runtime/internal/region_allocator.h | 147 +++++++++---------- src/runtime/vulkan.cpp | 20 +-- src/runtime/vulkan_memory.h | 14 +- test/correctness/gpu_allocation_cache.cpp | 6 +- test/correctness/gpu_reuse_shared_memory.cpp | 2 +- 9 files changed, 174 insertions(+), 180 deletions(-) diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index f5886a735222..1b627c79f9ea 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -61,6 +61,7 @@ const string headers = R"INLINE_CODE( #endif #include +#include #include #include #include @@ -68,7 +69,6 @@ const string headers = R"INLINE_CODE( #include #include #include -#include )INLINE_CODE"; // We now add definitions of things in the runtime which are diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 58bfd1a1c50c..cbcfc2ab7102 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1,4 +1,5 @@ #include +#include // for dump to file #include #include @@ -14,10 +15,6 @@ #include "SpirvIR.h" #include "Target.h" -// Temporary: -#include -#include - #ifdef WITH_SPIRV namespace Halide { @@ -344,7 +341,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(const Expr &e) { extract_lane(e, i).accept(this); SpvId extracted_id = builder.current_id(); SpvId composite_id = builder.reserve_id(SpvResultId); - SpvFactory::Indices indices = { (uint32_t)i }; + SpvFactory::Indices indices = {(uint32_t)i}; builder.append(SpvFactory::composite_insert(type_id, composite_id, extracted_id, value_id, indices)); result_id = composite_id; } @@ -702,7 +699,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n"; auto fence_type = *fence_type_ptr; - // Follow GLSL semantics for GLCompute ... + // Follow GLSL semantics for GLCompute ... 
// // barrier() -> control_barrier(Workgroup, Workgroup, AcquireRelease | WorkgroupMemory) // @@ -720,15 +717,15 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { // groupMemoryBarrier() -> memory_barrier(Workgroup, AcquireRelease | UniformMemory | WorkgroupMemory | ImageMemory) // uint32_t memory_mask = (SpvMemorySemanticsAcquireReleaseMask | - SpvMemorySemanticsUniformMemoryMask | - SpvMemorySemanticsWorkgroupMemoryMask | - SpvMemorySemanticsImageMemoryMask); + SpvMemorySemanticsUniformMemoryMask | + SpvMemorySemanticsWorkgroupMemoryMask | + SpvMemorySemanticsImageMemoryMask); SpvId memory_mask_id = builder.declare_constant(UInt(32), &memory_mask); builder.append(SpvFactory::memory_barrier(memory_scope_id, memory_mask_id)); - } + } SpvId result_id = builder.declare_null_constant(op->type); - builder.update_id(result_id); - + builder.update_id(result_id); + } else if (op->is_intrinsic(Call::abs)) { internal_assert(op->args.size() == 1); @@ -783,7 +780,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { // Generate Phi node if used as an expression. internal_assert(op->args.size() == 2 || op->args.size() == 3); Expr else_expr; - if( op->args.size() == 3 ) { + if (op->args.size() == 3) { else_expr = op->args[2]; } SpvFactory::BlockVariables block_vars = emit_if_then_else(op->args[0], op->args[1], else_expr); @@ -1346,7 +1343,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { // Declare loop var const std::string loop_var_name = unique_name("_loop_idx"); SpvId loop_var_id = builder.declare_variable(loop_var_name, index_var_type_id, storage_class, min_id); - symbol_table.push(loop_var_name, { loop_var_id, storage_class}); + symbol_table.push(loop_var_name, {loop_var_id, storage_class}); SpvId header_block_id = builder.reserve_id(SpvBlockId); SpvId top_block_id = builder.reserve_id(SpvBlockId); @@ -1573,7 +1570,7 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IfThenElse *op) { - if(!builder.current_function().is_defined()) { + if (!builder.current_function().is_defined()) { user_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IfThenElse *op): No active function for building!!\n"; } emit_if_then_else(op->condition, op->then_case, op->else_case); @@ -1584,18 +1581,18 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Evaluate *op) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { - std::cout << " CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Shuffle): " - << "type=" << op->type << " " - << "vectors=" << (uint32_t)op->vectors.size() << " " - << "is_interleave=" << (op->is_interleave() ? "true" : "false") << " " - << "is_extract_element=" << (op->is_extract_element() ? "true" : "false") << "\n"; + std::cout << " CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Shuffle): " + << "type=" << op->type << " " + << "vectors=" << (uint32_t)op->vectors.size() << " " + << "is_interleave=" << (op->is_interleave() ? "true" : "false") << " " + << "is_extract_element=" << (op->is_extract_element() ? 
"true" : "false") << "\n"; // Traverse all the arg vectors uint32_t arg_idx = 0; SpvFactory::Operands arg_ids; arg_ids.reserve(op->vectors.size()); - for(const Expr& e : op->vectors) { - debug(2) << " CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Shuffle): Arg[" << arg_idx++ << "] => " << e << "\n"; + for (const Expr &e : op->vectors) { + debug(2) << " CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Shuffle): Arg[" << arg_idx++ << "] => " << e << "\n"; e.accept(this); arg_ids.push_back(builder.current_id()); } @@ -1606,7 +1603,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { int arg_lanes = op->vectors[0].type().lanes(); std::cout << " vector interleave x" << (uint32_t)op->vectors.size() << " : "; - for(int idx : op->indices) { + for (int idx : op->indices) { std::cout << idx << " "; } std::cout << "\n"; @@ -1619,12 +1616,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { } else if (arg_ids.size() == 2) { - // 2 arguments, use a composite insert to update even and odd indices + // 2 arguments, use a composite insert to update even and odd indices uint32_t even_idx = 0; uint32_t odd_idx = 1; SpvFactory::Indices even_indices; SpvFactory::Indices odd_indices; - for(int i = 0; i < op_lanes; ++i) { + for (int i = 0; i < op_lanes; ++i) { even_indices.push_back(even_idx); odd_indices.push_back(odd_idx); even_idx += 2; @@ -1642,16 +1639,16 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { } else { // 3+ arguments, shuffle via a vector literal // selecting the appropriate elements of the vectors - int num_vectors = (int)op->vectors.size(); + int num_vectors = (int)op->vectors.size(); std::vector vector_component_ids(num_vectors); - for(uint32_t i = 0; i < (uint32_t)arg_ids.size(); ++i) { - if(op->vectors[i].type().is_vector()) { + for (uint32_t i = 0; i < (uint32_t)arg_ids.size(); ++i) { + if (op->vectors[i].type().is_vector()) { vector_component_ids[i] = split_vector(op->vectors[i].type(), arg_ids[i]); } else { - vector_component_ids[i] = { arg_ids[i] }; + vector_component_ids[i] = {arg_ids[i]}; } } - + SpvFactory::Components result_component_ids(op_lanes); for (int i = 0; i < op_lanes; i++) { int arg = i % num_vectors; @@ -1667,8 +1664,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { int idx = op->indices[0]; internal_assert(idx >= 0); internal_assert(idx <= op->vectors[0].type().lanes()); - if(op->vectors[0].type().is_vector()) { - SpvFactory::Indices indices = { (uint32_t)idx }; + if (op->vectors[0].type().is_vector()) { + SpvFactory::Indices indices = {(uint32_t)idx}; SpvId type_id = builder.declare_type(op->type); SpvId result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::composite_extract(type_id, result_id, arg_ids[0], indices)); @@ -1687,7 +1684,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { const int vec_lanes = op->vectors[vec_idx].type().lanes(); if (idx < vec_lanes) { if (op->vectors[vec_idx].type().is_vector()) { - SpvFactory::Indices indices = { (uint32_t)idx }; + SpvFactory::Indices indices = {(uint32_t)idx}; SpvId type_id = builder.declare_type(op->type); result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::composite_extract(type_id, result_id, arg_ids[vec_idx], indices)); @@ -1698,7 +1695,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { } idx -= vec_lanes; } - + } else { // vector shuffle ... 
not interleaving @@ -1706,17 +1703,17 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { int num_vectors = (int)op->vectors.size(); std::cout << " vector shuffle x" << num_vectors << " : "; - for(int idx : op->indices) { + for (int idx : op->indices) { std::cout << idx << " "; } std::cout << "\n"; - if(num_vectors == 1) { + if (num_vectors == 1) { // 1 argument, just do a simple assignment via a cast SpvId result_id = cast_type(op->type, op->vectors[0].type(), arg_ids[0]); builder.update_id(result_id); - - } else if(num_vectors == 2) { + + } else if (num_vectors == 2) { // 2 arguments, use the builtin vector shuffle that takes a pair of vectors SpvFactory::Indices indices; @@ -1728,14 +1725,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { builder.update_id(result_id); } else { std::vector vector_component_ids(num_vectors); - for(uint32_t i = 0; i < (uint32_t)arg_ids.size(); ++i) { - if(op->vectors[i].type().is_vector()) { + for (uint32_t i = 0; i < (uint32_t)arg_ids.size(); ++i) { + if (op->vectors[i].type().is_vector()) { vector_component_ids[i] = split_vector(op->vectors[i].type(), arg_ids[i]); } else { - vector_component_ids[i] = { arg_ids[i] }; + vector_component_ids[i] = {arg_ids[i]}; } } - + SpvFactory::Components result_component_ids(op_lanes); for (int i = 0; i < op_lanes && i < (int)op->indices.size(); i++) { int idx = op->indices[i]; @@ -1746,8 +1743,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { } SpvId result_id = join_vector(op->type, result_component_ids); - builder.update_id(result_id); - + builder.update_id(result_id); } } } diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 7eaccae18a14..b63931100b34 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -1,6 +1,5 @@ #include "SpirvIR.h" #include -#include #ifdef WITH_SPIRV @@ -253,7 +252,7 @@ bool SpvBlock::is_defined() const { bool SpvBlock::is_terminated() const { check_defined(); - if(contents->instructions.empty()) { + if (contents->instructions.empty()) { return false; } switch (contents->instructions.back().op_code()) { @@ -1549,7 +1548,7 @@ SpvBuilder::ConstantKey SpvBuilder::make_constant_key(uint8_t code, uint8_t bits key = hash_combine(key, lanes); key = hash_combine(key, bytes); - if(data != nullptr) { + if (data != nullptr) { const int8_t *ptr = reinterpret_bits(data); for (size_t i = 0; i < bytes; ++i) { key = hash_combine(key, uint64_t(ptr[i])); diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 3c027c00775b..9bf7a4efb172 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -53,9 +53,9 @@ class BlockAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - bool collect(void *user_context); //< returns true if any blocks were removed + void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); @@ -147,18 +147,18 @@ void BlockAllocator::initialize(void *user_context, const Config &cfg, const Mem MemoryRegion 
*BlockAllocator::reserve(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Reserve (" - << "user_context=" << (void *)(user_context) << " " - << "offset=" << (uint32_t)request.offset << " " - << "size=" << (uint32_t)request.size << " " - << "dedicated=" << (request.dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(request.properties.usage) << " " - << "caching=" << halide_memory_caching_name(request.properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ...\n"; + << "user_context=" << (void *)(user_context) << " " + << "offset=" << (uint32_t)request.offset << " " + << "size=" << (uint32_t)request.size << " " + << "dedicated=" << (request.dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ...\n"; #endif BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); if (block_entry == nullptr) { StackBasicPrinter<256>(nullptr) << "BlockAllocator: Failed to allocate new empty block of requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)!\n"; return nullptr; } @@ -174,7 +174,7 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r block_entry = create_block_entry(user_context, request.properties, actual_size, request.dedicated); if (block_entry == nullptr) { StackBasicPrinter<256>(nullptr) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" - << (int32_t)(actual_size) << " bytes)!\n"; + << (int32_t)(actual_size) << " bytes)!\n"; return nullptr; } @@ -222,13 +222,13 @@ bool BlockAllocator::collect(void *user_context) { #endif bool collected = block->allocator->collect(user_context); - if(collected) { + if (collected) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Collected block (" - << "block=" << (void*)block << " " - << "reserved=" << (uint32_t)block->reserved << " " - << "recovered=" << (uint32_t)(reserved - block->reserved) << " " - << ")\n"; + << "block=" << (void *)block << " " + << "reserved=" << (uint32_t)block->reserved << " " + << "recovered=" << (uint32_t)(reserved - block->reserved) << " " + << ")\n"; #endif } if (block->reserved == 0) { @@ -265,7 +265,7 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl if (result == nullptr) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Failed to allocate region of size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)!\n"; #endif // allocator has enough free space, but not enough contiguous space // -- collect and try to reallocate @@ -300,13 +300,13 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro if (available >= size) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: find_block_entry (FOUND) (" - << "user_context=" << (void *)(user_context) << " " - << "block_entry=" << (void *)(block_entry) << " " - << "size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? 
"true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; + << "user_context=" << (void *)(user_context) << " " + << "block_entry=" << (void *)(block_entry) << " " + << "size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; #endif break; } @@ -336,8 +336,8 @@ RegionAllocator * BlockAllocator::create_region_allocator(void *user_context, BlockResource *block) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Creating region allocator (" - << "user_context=" << (void *)(user_context) << " " - << "block_resource=" << (void *)(block) << ")...\n"; + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)(block) << ")...\n"; #endif halide_abort_if_false(user_context, block != nullptr); RegionAllocator *region_allocator = RegionAllocator::create( @@ -354,8 +354,8 @@ BlockAllocator::create_region_allocator(void *user_context, BlockResource *block void BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator *region_allocator) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Destroying region allocator (" - << "user_context=" << (void *)(user_context) << " " - << "region_allocator=" << (void *)(region_allocator) << ")...\n"; + << "user_context=" << (void *)(user_context) << " " + << "region_allocator=" << (void *)(region_allocator) << ")...\n"; #endif if (region_allocator == nullptr) { return; @@ -379,9 +379,9 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Creating block entry (" - << "block_entry=" << (void *)(block_entry) << " " - << "block=" << (void *)(block_entry->value) << " " - << "allocator=" << (void *)(allocators.block.allocate) << ")...\n"; + << "block_entry=" << (void *)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << " " + << "allocator=" << (void *)(allocators.block.allocate) << ")...\n"; #endif BlockResource *block = static_cast(block_entry->value); @@ -397,8 +397,8 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Releasing block entry (" - << "block_entry=" << (void *)(block_entry) << " " - << "block=" << (void *)(block_entry->value) << ")...\n"; + << "block_entry=" << (void *)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << ")...\n"; #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { @@ -409,9 +409,9 @@ void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::Blo void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Destroying block entry (" - << "block_entry=" << (void *)(block_entry) << " " - << "block=" << (void *)(block_entry->value) << " " - << "deallocator=" << (void 
*)(allocators.block.deallocate) << ")...\n"; + << "block_entry=" << (void *)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << " " + << "deallocator=" << (void *)(allocators.block.deallocate) << ")...\n"; #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 169be2ce6bd4..b19f68a040e4 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -47,9 +47,9 @@ class RegionAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - void release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse - void reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate - bool collect(void *user_context); //< returns true if any blocks were removed + void release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse + void reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate + bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); @@ -139,8 +139,8 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & if (remaining < request.size) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Unable to reserve more memory from block " - << "-- requested size (" << (int32_t)(request.size) << " bytes) " - << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; + << "-- requested size (" << (int32_t)(request.size) << " bytes) " + << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; #endif return nullptr; } @@ -149,7 +149,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & if (block_region == nullptr) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Failed to locate region for requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)!\n"; #endif return nullptr; } @@ -157,7 +157,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & if (can_split(block_region, request.size)) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " - << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; + << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; #endif split_block_region(user_context, block_region, request.size, request.alignment); } @@ -244,8 +244,8 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Coalescing " - << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " - << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; + << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " 
bytes)\n!"; #endif prev_region->next_ptr = block_region->next_ptr; @@ -262,8 +262,8 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Coalescing " - << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " - << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; + << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; #endif if (next_region->next_ptr) { @@ -286,9 +286,9 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Conforming alignment (" - << "requested=" << (uint32_t)alignment << " " - << "required=" << (uint32_t)block->memory.properties.alignment << " " - << "actual=" << (uint32_t)alignment << ")\n"; + << "requested=" << (uint32_t)alignment << " " + << "required=" << (uint32_t)block->memory.properties.alignment << " " + << "actual=" << (uint32_t)alignment << ")\n"; #endif size_t adjusted_size = aligned_size(block_region->memory.offset, size, alignment); @@ -297,8 +297,8 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Splitting " - << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)adjusted_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; + << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " + << "to create empty region (offset=" << (int32_t)adjusted_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; #endif BlockRegion *next_region = block_region->next_ptr; @@ -320,14 +320,14 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Creating block region (" - << "user_context=" << (void *)(user_context) << " " - << "offset=" << (uint32_t)offset << " " - << "size=" << (uint32_t)size << " " - << "alignment=" << (uint32_t)properties.alignment << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; + << "user_context=" << (void *)(user_context) << " " + << "offset=" << (uint32_t)offset << " " + << "size=" << (uint32_t)size << " " + << "alignment=" << (uint32_t)properties.alignment << " " + << "dedicated=" << (dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; #endif BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); @@ -339,8 +339,8 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Added block region (" - << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; #endif block_region->memory.offset = offset; @@ -352,10 +352,10 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Creating region (" - << "block_ptr=" << (void*)block_region->block_ptr << " " - << "block_region=" << (void*)block_region << " " - << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << ")\n"; + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << ")\n"; #endif return block_region; @@ -364,8 +364,8 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo void RegionAllocator::release_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Releasing block region (" - << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; #endif if ((block_region->status == AllocationStatus::InUse) || @@ -373,11 +373,11 @@ void RegionAllocator::release_block_region(void *user_context, BlockRegion *bloc #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Releasing region (" - << "block_ptr=" << (void*)block_region->block_ptr << " " - << "block_region=" << (void*)block_region << " " - << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << "block_reserved=" << (uint32_t)(block->reserved - block_region->memory.size) << " " - << ")\n"; + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)(block->reserved - block_region->memory.size) << " " + << ")\n"; #endif block->reserved -= block_region->memory.size; @@ -388,8 +388,8 @@ void RegionAllocator::release_block_region(void *user_context, BlockRegion *bloc void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Destroying block region (" - << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; #endif free_block_region(user_context, block_region); @@ -403,29 +403,28 @@ void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_ halide_abort_if_false(user_context, allocators.region.allocate != nullptr); 
halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); MemoryRegion *memory_region = &(block_region->memory); - if(memory_region->handle == nullptr) { + if (memory_region->handle == nullptr) { allocators.region.allocate(user_context, memory_region); #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Allocating region (" - << "block_ptr=" << (void*)block_region->block_ptr << " " - << "block_region=" << (void*)block_region << " " - << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; #endif } else { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Re-using region (" - << "block_ptr=" << (void*)block_region->block_ptr << " " - << "block_region=" << (void*)block_region << " " - << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; #endif - } block_region->status = block_region->memory.dedicated ? AllocationStatus::Dedicated : AllocationStatus::InUse; block->reserved += block_region->memory.size; @@ -434,18 +433,18 @@ void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_ void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Freeing block region (" - << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; #endif if ((block_region->status == AllocationStatus::InUse) || (block_region->status == AllocationStatus::Dedicated)) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Freeing region (" - << "block_ptr=" << (void*)block_region->block_ptr << " " - << "block_region=" << (void*)block_region << " " - << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; #endif halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); @@ -461,7 +460,7 @@ void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_r void RegionAllocator::release(void *user_context) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Releasing all regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ...\n"; #endif for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { release_block_region(user_context, block_region); @@ -471,13 +470,13 @@ 
void RegionAllocator::release(void *user_context) { bool RegionAllocator::collect(void *user_context) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Collecting free block regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ...\n"; uint64_t reserved = block->reserved; StackBasicPrinter<256>(nullptr) << " collecting unused regions (" - << "block_ptr=" << (void*)block << " " - << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << "block_ptr=" << (void *)block << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; #endif bool result = false; @@ -488,11 +487,11 @@ bool RegionAllocator::collect(void *user_context) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << " collecting region (" - << "block_ptr=" << (void*)block_region->block_ptr << " " - << "block_region=" << (void*)block_region << " " - << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; #endif block_region = coalesce_block_regions(user_context, block_region); result = true; @@ -501,14 +500,14 @@ bool RegionAllocator::collect(void *user_context) { } } - if(result) { + if (result) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << " collected unused regions (" - << "block_ptr=" << (void*)block << " " - << "region_count=" << (uint32_t)count << " " - << "collected=" << (uint32_t)(reserved - block->reserved) << " " - << ")\n"; + << "block_ptr=" << (void *)block << " " + << "region_count=" << (uint32_t)count << " " + << "collected=" << (uint32_t)(reserved - block->reserved) << " " + << ")\n"; #endif } return result; @@ -517,7 +516,7 @@ bool RegionAllocator::collect(void *user_context) { void RegionAllocator::destroy(void *user_context) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Destroying all block regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ...\n"; #endif for (BlockRegion *block_region = block->regions; block_region != nullptr;) { diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index fae2b9a3ee88..7250421c0f1d 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -97,7 +97,7 @@ WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_b // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); if (ctx.allocator && device_region && device_region->handle) { - if(halide_can_reuse_device_allocations(user_context)) { + if (halide_can_reuse_device_allocations(user_context)) { ctx.allocator->release(user_context, device_region); } else { ctx.allocator->reclaim(user_context, device_region); @@ -421,7 +421,7 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid } //// 15. 
Reclaim the staging buffer - if(halide_can_reuse_device_allocations(user_context)) { + if (halide_can_reuse_device_allocations(user_context)) { ctx.allocator->release(user_context, staging_region); } else { ctx.allocator->reclaim(user_context, staging_region); @@ -571,7 +571,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ // unmap the pointer and reclaim the staging region ctx.allocator->unmap(user_context, staging_region); - if(halide_can_reuse_device_allocations(user_context)) { + if (halide_can_reuse_device_allocations(user_context)) { ctx.allocator->release(user_context, staging_region); } else { ctx.allocator->reclaim(user_context, staging_region); @@ -800,12 +800,12 @@ WEAK int halide_vulkan_run(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) << "Vulkan: blocks_allocated=" - << (uint32_t)ctx.allocator->blocks_allocated() << " " - << "bytes_allocated_for_blocks=" << (uint32_t)ctx.allocator->bytes_allocated_for_blocks() << " " - << "regions_allocated=" << (uint32_t)ctx.allocator->regions_allocated() << " " - << "bytes_allocated_for_regions=" << (uint32_t)ctx.allocator->bytes_allocated_for_regions() << " " - << "\n"; - + << (uint32_t)ctx.allocator->blocks_allocated() << " " + << "bytes_allocated_for_blocks=" << (uint32_t)ctx.allocator->bytes_allocated_for_blocks() << " " + << "regions_allocated=" << (uint32_t)ctx.allocator->regions_allocated() << " " + << "bytes_allocated_for_regions=" << (uint32_t)ctx.allocator->bytes_allocated_for_regions() << " " + << "\n"; + uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif @@ -866,7 +866,7 @@ WEAK int halide_vulkan_release_unused_device_allocations(void *user_context) { if (ctx.error != VK_SUCCESS) { return -1; } - + // collect all unused allocations ctx.allocator->collect(user_context); return 0; diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index e0e5ad54f1e0..e6907a0eb0ce 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -56,9 +56,9 @@ class VulkanMemoryAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, MemoryRequest &request); - void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - bool collect(void *user_context); //< returns true if any blocks were removed + void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); @@ -442,14 +442,14 @@ void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *bl vkFreeMemory(instance->device, *device_memory, instance->alloc_callbacks); - if(instance->block_count > 0) { + if (instance->block_count > 0) { instance->block_count--; } else { error(nullptr) << "VulkanRegionAllocator: Block counter invalid ... reseting to zero!\n"; instance->block_count = 0; } - if( int64_t(instance->block_byte_count) - int64_t(block->size) >= 0 ) { + if (int64_t(instance->block_byte_count) - int64_t(block->size) >= 0) { instance->block_byte_count -= block->size; } else { error(nullptr) << "VulkanRegionAllocator: Block byte counter invalid ... 
reseting to zero!\n"; @@ -675,14 +675,14 @@ void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion * vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); region->handle = nullptr; - if(instance->region_count > 0) { + if (instance->region_count > 0) { instance->region_count--; } else { error(nullptr) << "VulkanRegionAllocator: Region counter invalid ... reseting to zero!\n"; instance->region_count = 0; } - if( int64_t(instance->region_byte_count) - int64_t(region->size) >= 0 ) { + if (int64_t(instance->region_byte_count) - int64_t(region->size) >= 0) { instance->region_byte_count -= region->size; } else { error(nullptr) << "VulkanRegionAllocator: Region byte counter invalid ... reseting to zero!\n"; diff --git a/test/correctness/gpu_allocation_cache.cpp b/test/correctness/gpu_allocation_cache.cpp index 68ae18e3bae9..4d1f63049d61 100644 --- a/test/correctness/gpu_allocation_cache.cpp +++ b/test/correctness/gpu_allocation_cache.cpp @@ -150,7 +150,7 @@ int main(int argc, char **argv) { } // Vulkan will OOM unless allocation cache is used ... skip this since we just ran the same tests above concurrently - if(!target.has_feature(Target::Vulkan)) { + if (!target.has_feature(Target::Vulkan)) { // Now benchmark with and without, (just informational, as this isn't a performance test) double t1 = Tools::benchmark([&]() { @@ -166,8 +166,8 @@ int main(int argc, char **argv) { }); printf("Runtime with cache: %f\n" - "Without cache: %f\n", - t1, t2); + "Without cache: %f\n", + t1, t2); } printf("Success!\n"); diff --git a/test/correctness/gpu_reuse_shared_memory.cpp b/test/correctness/gpu_reuse_shared_memory.cpp index 2cdb7a53b3f2..2eda324a4d2b 100644 --- a/test/correctness/gpu_reuse_shared_memory.cpp +++ b/test/correctness/gpu_reuse_shared_memory.cpp @@ -191,7 +191,7 @@ int main(int argc, char **argv) { printf("Running dynamic shared test\n"); if (t.has_feature(Target::OpenGLCompute) && memory_type == MemoryType::GPUShared) { printf("Skipping test because GL doesn't support dynamic sizes for shared memory\n"); - } else if(t.has_feature(Target::Vulkan) && memory_type == MemoryType::GPUShared) { + } else if (t.has_feature(Target::Vulkan) && memory_type == MemoryType::GPUShared) { printf("Skipping test because Vulkan doesn't support dynamic sizes for shared memory\n"); } else { if (dynamic_shared_test(memory_type) != 0) { From 74015cf04a1f9a307101569378827c6b57430fd6 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 14:41:46 -0700 Subject: [PATCH 060/166] Update correctness test results. 
PASS: 338, FAIL: 27 --- README_vulkan.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index 0a6b6a608221..f4b854898d39 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -199,13 +199,13 @@ https://github.com/KhronosGroup/SPIRV-Tools # Current Status -The following table outlines the state of the correctness tests (as of Oct-13) when +The following table outlines the state of the correctness tests (as of Oct-23) when run with `HL_JIT_TARGET=host-vulkan` (NOTE: some tests may need additional modifications to run under Vulkan): | Totals | *PASS* 🟢 | *FAIL* 🔴 | | -- | -- | -- | -| | 329 | 36 | +| | 338 | 27 | | Test Name | Status | | :-- | --: | @@ -223,7 +223,7 @@ modifications to run under Vulkan): | correctness_gpu_reuse_shared_memory | *FAIL* 🔴 | | correctness_boundary_conditions | *PASS* 🟢 | | correctness_min_extent | *PASS* 🟢 | -| correctness_gpu_sum_scan | *FAIL* 🔴 | +| correctness_gpu_sum_scan | *PASS* 🟢 | | correctness_dynamic_allocation_in_gpu_kernel | *FAIL* 🔴 | | correctness_image_of_lists | *PASS* 🟢 | | correctness_tracing_broadcast | *PASS* 🟢 | @@ -235,9 +235,9 @@ modifications to run under Vulkan): | correctness_compare_vars | *PASS* 🟢 | | correctness_non_vector_aligned_embeded_buffer | *PASS* 🟢 | | correctness_realize_larger_than_two_gigs | *PASS* 🟢 | -| correctness_gpu_transpose | *FAIL* 🔴 | +| correctness_gpu_transpose | *PASS* 🟢 | | correctness_side_effects | *PASS* 🟢 | -| correctness_logical | *PASS* 🟢 | +| correctness_logical | *FAIL* 🔴 | | correctness_func_lifetime_2 | *PASS* 🟢 | | correctness_device_crop | *FAIL* 🔴 | | correctness_print_loop_nest | *PASS* 🟢 | @@ -249,7 +249,7 @@ modifications to run under Vulkan): | correctness_parallel_scatter | *PASS* 🟢 | | correctness_multi_splits_with_diff_tail_strategies | *PASS* 🟢 | | correctness_gpu_arg_types | *PASS* 🟢 | -| correctness_cascaded_filters | *FAIL* 🔴 | +| correctness_cascaded_filters | *PASS* 🟢 | | correctness_trim_no_ops | *PASS* 🟢 | | correctness_float16_t_comparison | *PASS* 🟢 | | correctness_legal_race_condition | *PASS* 🟢 | @@ -297,7 +297,7 @@ modifications to run under Vulkan): | correctness_inverse | *FAIL* 🔴 | | correctness_skip_stages | *PASS* 🟢 | | correctness_cuda_8_bit_dot_product | *PASS* 🟢 | -| correctness_gpu_vectorize | *FAIL* 🔴 | +| correctness_gpu_vectorize | *PASS* 🟢 | | correctness_gpu_object_lifetime_3 | *PASS* 🟢 | | correctness_histogram | *PASS* 🟢 | | correctness_shared_self_references | *PASS* 🟢 | @@ -336,7 +336,7 @@ modifications to run under Vulkan): | correctness_extern_consumer | *PASS* 🟢 | | correctness_func_wrapper | *PASS* 🟢 | | correctness_bounds_of_multiply | *PASS* 🟢 | -| correctness_gpu_store_in_register_with_no_lanes_loop | *FAIL* 🔴 | +| correctness_gpu_store_in_register_with_no_lanes_loop | *PASS* 🟢 | | correctness_gpu_condition_lifting | *PASS* 🟢 | | correctness_extern_consumer_tiled | *PASS* 🟢 | | correctness_float16_t_neon_op_check | *PASS* 🟢 | @@ -434,7 +434,7 @@ modifications to run under Vulkan): | correctness_reduction_non_rectangular | *PASS* 🟢 | | correctness_left_shift_negative | *PASS* 🟢 | | correctness_set_custom_trace | *PASS* 🟢 | -| correctness_vectorized_gpu_allocation | *FAIL* 🔴 | +| correctness_vectorized_gpu_allocation | *PASS* 🟢 | | correctness_split_store_compute | *PASS* 🟢 | | correctness_c_function | *PASS* 🟢 | | correctness_specialize | *PASS* 🟢 | @@ -446,12 +446,12 @@ modifications to run under Vulkan): | correctness_atomics | *PASS* 🟢 | | 
correctness_multipass_constraints | *PASS* 🟢 | | correctness_target | *PASS* 🟢 | -| correctness_tuple_reduction | *FAIL* 🔴 | +| correctness_tuple_reduction | *PASS* 🟢 | | correctness_dilate3x3 | *PASS* 🟢 | | correctness_image_io | *PASS* 🟢 | | correctness_gpu_param_allocation | *FAIL* 🔴 | | correctness_reschedule | *PASS* 🟢 | -| correctness_isnan | *FAIL* 🔴 | +| correctness_isnan | *PASS* 🟢 | | correctness_halide_buffer | *PASS* 🟢 | | correctness_bounds_of_cast | *PASS* 🟢 | | correctness_handle | *PASS* 🟢 | @@ -484,7 +484,7 @@ modifications to run under Vulkan): | correctness_gpu_cpu_simultaneous_read | *PASS* 🟢 | | correctness_fast_trigonometric | *PASS* 🟢 | | correctness_compute_with | *PASS* 🟢 | -| correctness_gpu_allocation_cache | *FAIL* 🔴 | +| correctness_gpu_allocation_cache | *PASS* 🟢 | | correctness_compile_to | *PASS* 🟢 | | correctness_extern_output_expansion | *PASS* 🟢 | | correctness_gpu_texture | *PASS* 🟢 | @@ -509,7 +509,7 @@ modifications to run under Vulkan): | correctness_loop_invariant_extern_calls | *PASS* 🟢 | | correctness_skip_stages_external_array_functions | *PASS* 🟢 | | correctness_chunk_sharing | *PASS* 🟢 | -| correctness_multi_way_select | *FAIL* 🔴 | +| correctness_multi_way_select | *PASS* 🟢 | | correctness_async_copy_chain | *FAIL* 🔴 | | correctness_gpu_give_input_buffers_device_allocations | *PASS* 🟢 | | correctness_oddly_sized_output | *PASS* 🟢 | From 5be5760397e3916a654c0d8b01c6c54078e6c680 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 16:19:53 -0700 Subject: [PATCH 061/166] Move counter inside debug #define to fix build --- src/runtime/internal/region_allocator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index b19f68a040e4..980b6c77256c 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -472,6 +472,7 @@ bool RegionAllocator::collect(void *user_context) { StackBasicPrinter<256>(nullptr) << "RegionAllocator: Collecting free block regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; + uint32_t count = 0; uint64_t reserved = block->reserved; StackBasicPrinter<256>(nullptr) << " collecting unused regions (" << "block_ptr=" << (void *)block << " " @@ -480,12 +481,12 @@ bool RegionAllocator::collect(void *user_context) { #endif bool result = false; - uint32_t count = 0; for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { if (block_region->status == AllocationStatus::Available) { if (can_coalesce(block_region)) { #ifdef DEBUG_INTERNAL + count++; StackBasicPrinter<256>(nullptr) << " collecting region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -495,7 +496,6 @@ bool RegionAllocator::collect(void *user_context) { #endif block_region = coalesce_block_regions(user_context, block_region); result = true; - count++; } } } From ea1832aaf2f8168b3f0558b08d45d04a9d03076f Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 16:20:26 -0700 Subject: [PATCH 062/166] Relax tolerance for newton's method to match other GPU APIs Skip gpu dynamic shared testfor Vulkan (since dynamic shared allocations aren't supported yet) Update correctness test status. 
PASS: 340, FAIL: 25 --- README_vulkan.md | 6 +++--- test/correctness/gpu_dynamic_shared.cpp | 5 +++++ test/correctness/newtons_method.cpp | 5 +++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index f4b854898d39..d41647f37695 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -205,7 +205,7 @@ modifications to run under Vulkan): | Totals | *PASS* 🟢 | *FAIL* 🔴 | | -- | -- | -- | -| | 338 | 27 | +| | 340 | 25 | | Test Name | Status | | :-- | --: | @@ -312,7 +312,7 @@ modifications to run under Vulkan): | correctness_interval | *PASS* 🟢 | | correctness_unused_func | *PASS* 🟢 | | correctness_fuzz_float_stores | *PASS* 🟢 | -| correctness_newtons_method | *FAIL* 🔴 | +| correctness_newtons_method | *PASS* 🟢 | | correctness_compile_to_bitcode | *PASS* 🟢 | | correctness_lazy_convolution | *PASS* 🟢 | | correctness_image_wrapper | *PASS* 🟢 | @@ -553,7 +553,7 @@ modifications to run under Vulkan): | correctness_unroll_huge_mux | *PASS* 🟢 | | correctness_parallel | *PASS* 🟢 | | correctness_code_explosion | *PASS* 🟢 | -| correctness_gpu_dynamic_shared | *FAIL* 🔴 | +| correctness_gpu_dynamic_shared | *PASS* 🟢 | | correctness_div_by_zero | *PASS* 🟢 | | correctness_convolution_multiple_kernels | *FAIL* 🔴 | | correctness_deinterleave4 | *PASS* 🟢 | diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index d43386f05980..091f8315e617 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -10,6 +10,11 @@ int main(int argc, char **argv) { return 0; } + if (t.has_feature(Target::Vulkan)) { + printf("[SKIP] Skipping test for Vulkan, as it does not support dynamically-sized shared memory\n"); + return 0; + } + if (t.has_feature(Target::OpenGLCompute)) { printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n"); return 0; diff --git a/test/correctness/newtons_method.cpp b/test/correctness/newtons_method.cpp index 25442209ff7e..d5200d551c5a 100644 --- a/test/correctness/newtons_method.cpp +++ b/test/correctness/newtons_method.cpp @@ -53,8 +53,9 @@ int find_pi() { T secant_result = evaluate_may_gpu(g()[0]); - // Trig in openglcompute/d3d12 is approximate - float tolerance = target.has_feature(Target::OpenGLCompute) || + // Trig in vulkan/openglcompute/d3d12 is approximate + float tolerance = target.has_feature(Target::Vulkan) || + target.has_feature(Target::OpenGLCompute) || target.has_feature(Target::D3D12Compute) ? 1e-5f : 1e-20f; From 5d77860d19629516910a057ac85296cacae7eb86 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sun, 23 Oct 2022 16:26:14 -0700 Subject: [PATCH 063/166] Clang format/tidy pass --- test/correctness/newtons_method.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/correctness/newtons_method.cpp b/test/correctness/newtons_method.cpp index d5200d551c5a..eeef000f1d20 100644 --- a/test/correctness/newtons_method.cpp +++ b/test/correctness/newtons_method.cpp @@ -55,7 +55,7 @@ int find_pi() { // Trig in vulkan/openglcompute/d3d12 is approximate float tolerance = target.has_feature(Target::Vulkan) || - target.has_feature(Target::OpenGLCompute) || + target.has_feature(Target::OpenGLCompute) || target.has_feature(Target::D3D12Compute) ? 
1e-5f : 1e-20f; From d17d3f380d280c063280074ebfb9c6528c5b3605 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 28 Oct 2022 12:44:58 -0700 Subject: [PATCH 064/166] Skip Vulkan for float64 for correctness test round (since f64 is optional) --- test/correctness/round.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/correctness/round.cpp b/test/correctness/round.cpp index ffce93bccd28..ef5dd4ea087d 100644 --- a/test/correctness/round.cpp +++ b/test/correctness/round.cpp @@ -13,6 +13,7 @@ bool test(Expr e, const char *funcname, int vector_width, int N, Buffer &inpu if (t.has_gpu_feature()) { if (e.type() == Float(64) && ((t.has_feature(Target::OpenCL) && !t.has_feature(Target::CLDoubles)) || + t.has_feature(Target::Vulkan) || t.has_feature(Target::Metal) || t.has_feature(Target::D3D12Compute))) { return true; From a9baf9b7dfb54968b7fcd950a935c6e3a512afdc Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 28 Oct 2022 12:48:58 -0700 Subject: [PATCH 065/166] Skip Vulkan for tests that rely upon device crop, and slice. --- test/correctness/async_device_copy.cpp | 6 ++++++ test/correctness/device_buffer_copy.cpp | 5 +++++ test/correctness/device_crop.cpp | 5 +++++ test/correctness/device_slice.cpp | 5 +++++ 4 files changed, 21 insertions(+) diff --git a/test/correctness/async_device_copy.cpp b/test/correctness/async_device_copy.cpp index 490dc75b8dc1..7095f5e130b8 100644 --- a/test/correctness/async_device_copy.cpp +++ b/test/correctness/async_device_copy.cpp @@ -27,6 +27,12 @@ int main(int argc, char **argv) { " (halide_buffer_copy is unimplemented in that backend).\n"); return 0; } + + if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { + printf("Skipping test for Vulkan as it does not support copy_to_host/device() yet" + " (halide_buffer_copy is unimplemented in that backend).\n"); + return 0; + } // Compute frames on GPU/CPU, and then sum then on // CPU/GPU. 
async() lets us overlap the CPU computation with the diff --git a/test/correctness/device_buffer_copy.cpp b/test/correctness/device_buffer_copy.cpp index 9179ac83cd24..c19efd15e6c1 100644 --- a/test/correctness/device_buffer_copy.cpp +++ b/test/correctness/device_buffer_copy.cpp @@ -37,6 +37,11 @@ int main(int argc, char **argv) { return 0; } + if (target.has_feature(Target::Vulkan)) { + printf("Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); + return 0; + } + printf("Test copy to device.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/device_crop.cpp b/test/correctness/device_crop.cpp index ee4b900bc1f3..ce02bd6ef995 100644 --- a/test/correctness/device_crop.cpp +++ b/test/correctness/device_crop.cpp @@ -35,6 +35,11 @@ int main(int argc, char **argv) { return 0; } + if (target.has_feature(Target::Vulkan)) { + printf("Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); + return 0; + } + printf("Test in-place cropping.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/device_slice.cpp b/test/correctness/device_slice.cpp index 0b9e3ca5bbcb..a77386543e09 100644 --- a/test/correctness/device_slice.cpp +++ b/test/correctness/device_slice.cpp @@ -37,6 +37,11 @@ int main(int argc, char **argv) { return 0; } + if (target.has_feature(Target::Vulkan)) { + printf("Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); + return 0; + } + printf("Test in-place slicing.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); From 99cff5281f879f82f3a57e1b26dcfba24893775e Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 28 Oct 2022 12:58:09 -0700 Subject: [PATCH 066/166] Only test small vector widths for Vulkan (since widths >=8 are optional) --- test/correctness/boundary_conditions.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/correctness/boundary_conditions.cpp b/test/correctness/boundary_conditions.cpp index 697678cf28e8..8649fe173013 100644 --- a/test/correctness/boundary_conditions.cpp +++ b/test/correctness/boundary_conditions.cpp @@ -385,6 +385,7 @@ int main(int argc, char **argv) { int vector_width_max = 32; if (target.has_feature(Target::Metal) || + target.has_feature(Target::Vulkan) || target.has_feature(Target::OpenGLCompute) || target.has_feature(Target::D3D12Compute)) { // https://github.com/halide/Halide/issues/2148 From 15b06f281b827fe4bf84587e3f409aacf52ef96a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 28 Oct 2022 12:59:18 -0700 Subject: [PATCH 067/166] Canonicalize gpu vars for Vulkan --- src/Lower.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Lower.cpp b/src/Lower.cpp index 38ad867686e6..42bcf54bb4da 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -252,6 +252,7 @@ void lower_impl(const vector &output_funcs, // OpenGL relies on GPU var canonicalization occurring before // storage flattening.
if (t.has_gpu_feature() || + t.has_feature(Target::Vulkan) || t.has_feature(Target::OpenGLCompute)) { debug(1) << "Canonicalizing GPU var names...\n"; s = canonicalize_gpu_vars(s); From d8619d000fb1343dbdee6edba26d4259032e247d Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 28 Oct 2022 13:08:48 -0700 Subject: [PATCH 068/166] Fix loop initialization, and increments Add all explicit types, and fix constant declarations Add missing fast intrinsics Convert results of logical ops into expected types (instead of bools) --- src/CodeGen_Vulkan_Dev.cpp | 332 ++++++++++++++++++++++++++++--------- 1 file changed, 257 insertions(+), 75 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index cbcfc2ab7102..81b2eb7e086d 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -128,6 +128,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { SpvFactory::Components split_vector(Type type, SpvId value_id); SpvId join_vector(Type type, const SpvFactory::Components &value_components); SpvId cast_type(Type target_type, Type value_type, SpvId value_id); + SpvId convert_to_bool(Type target_type, Type value_type, SpvId value_id); using BuiltinMap = std::unordered_map; @@ -158,6 +159,12 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { {"exp_f32", GLSLstd450Exp}, {"fast_inverse_sqrt_f16", GLSLstd450InverseSqrt}, {"fast_inverse_sqrt_f32", GLSLstd450InverseSqrt}, + {"fast_log_f16", GLSLstd450Log}, + {"fast_log_f32", GLSLstd450Log}, + {"fast_exp_f16", GLSLstd450Exp}, + {"fast_exp_f32", GLSLstd450Exp}, + {"fast_pow_f16", GLSLstd450Pow}, + {"fast_pow_f32", GLSLstd450Pow}, {"floor_f16", GLSLstd450Floor}, {"floor_f32", GLSLstd450Floor}, {"log_f16", GLSLstd450Log}, @@ -414,12 +421,22 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const StringImm *imm) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { - if (imm->type.bits() == 32) { - const float value = (float)(imm->value); + if (imm->type.bits() == 16) { + if(imm->type.is_bfloat()) { + const bfloat16_t value = bfloat16_t(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } else { + const float16_t value = float16_t(imm->value); + SpvId constant_id = builder.declare_constant(imm->type, &value); + builder.update_id(constant_id); + } + } else if (imm->type.bits() == 32) { + const float value = float(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); } else if (imm->type.bits() == 64) { - const double value = (double)(imm->value); + const double value = double(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); } else { @@ -427,7 +444,50 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { } } +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type value_type, SpvId value_id) { + if(!value_type.is_bool()) { + value_id = cast_type(Bool(), value_type, value_id); + } + uint8_t true_data[ target_type.bytes() ]; + uint8_t false_data[ target_type.bytes() ]; + for(int i = 0; i < target_type.lanes(); ++i) { + if(target_type.is_int_or_uint() && target_type.bits() == 8) { + reinterpret_cast(true_data)[i] = int8_t(1); + reinterpret_cast(false_data)[i] = int8_t(0); + } else if(target_type.is_int_or_uint() && target_type.bits() == 16) { + reinterpret_cast(true_data)[i] = int16_t(1); + reinterpret_cast(false_data)[i] = int16_t(0); + } else if(target_type.is_int_or_uint() && 
target_type.bits() == 32) { + reinterpret_cast(true_data)[i] = int32_t(1); + reinterpret_cast(false_data)[i] = int32_t(0); + } else if(target_type.is_int_or_uint() && target_type.bits() == 64) { + reinterpret_cast(true_data)[i] = int64_t(1); + reinterpret_cast(false_data)[i] = int64_t(0); + } else if(target_type.is_float() && target_type.bits() == 16) { + reinterpret_cast(true_data)[i] = uint16_t(1); + reinterpret_cast(false_data)[i] = uint16_t(0); + } else if(target_type.is_float() && target_type.bits() == 32) { + reinterpret_cast(true_data)[i] = 1.0f; + reinterpret_cast(false_data)[i] = 0.0f; + } else if(target_type.is_float() && target_type.bits() == 64) { + reinterpret_cast(true_data)[i] = 1.0; + reinterpret_cast(false_data)[i] = 0.0; + } else { + user_error << "Unhandled type cast from value type '" << value_type << "' to target type '" << target_type << "'!"; + } + } + + SpvId result_id = builder.reserve_id(SpvResultId); + SpvId target_type_id = builder.declare_type(target_type); + SpvId true_value_id = builder.declare_constant(target_type, &true_data); + SpvId false_value_id = builder.declare_constant(target_type, &false_data); + builder.append(SpvFactory::select(target_type_id, result_id, value_id, true_value_id, false_value_id)); + return result_id; +} + SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_type, SpvId value_id) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(): casting from value type '" + << value_type << "' to target type '" << target_type << "'!\n"; if (value_type == target_type) { return value_id; @@ -437,31 +497,32 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ if (value_type.is_float()) { if (target_type.is_float()) { op_code = SpvOpFConvert; + } else if (target_type.is_bool()) { + op_code = SpvOpSelect; } else if (target_type.is_uint()) { op_code = SpvOpConvertFToU; } else if (target_type.is_int()) { op_code = SpvOpConvertFToS; } + } else if (value_type.is_bool()) { + op_code = SpvOpSelect; + } else if (value_type.is_uint()) { if (target_type.is_float()) { op_code = SpvOpConvertUToF; - } else if (target_type.is_uint()) { + } else if (target_type.is_bool()) { + op_code = SpvOpSelect; + } else if (target_type.is_int_or_uint()) { if (target_type.bits() != value_type.bits()) { op_code = SpvOpUConvert; // UConvert is only allowed on differing component widths } - } else if (target_type.is_int()) { - if (target_type.bits() != value_type.bits()) { - op_code = SpvOpSConvert; // SConvert is only allowed on differing component widths - } } } else if (value_type.is_int()) { if (target_type.is_float()) { op_code = SpvOpConvertSToF; - } else if (target_type.is_uint()) { - if (target_type.bits() != value_type.bits()) { - op_code = SpvOpUConvert; // UConvert is only allowed on differing component widths - } - } else if (target_type.is_int() || target_type.is_uint()) { + } else if (target_type.is_bool()) { + op_code = SpvOpSelect; + } else if (target_type.is_int_or_uint()) { if (target_type.bits() != value_type.bits()) { op_code = SpvOpSConvert; // SConvert is only allowed on differing component widths } @@ -482,11 +543,25 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ return SpvInvalidId; } + SpvId result_id = SpvInvalidId; SpvId target_type_id = builder.declare_type(target_type); - SpvId result_id = builder.reserve_id(SpvResultId); if (op_code == SpvOpBitcast) { + result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::bitcast(target_type_id, 
result_id, value_id)); + } else if (op_code == SpvOpSelect) { + result_id = convert_to_bool(target_type, value_type, value_id); + } else if(op_code == SpvOpUConvert && target_type.is_int()) { + // Vulkan requires both value and target types to be unsigned for UConvert + // so do the conversion to an equivalent unsigned type then bitcast this + // result into the target type + Type unsigned_type = target_type.with_code(halide_type_uint).narrow(); + SpvId unsigned_type_id = builder.declare_type(unsigned_type); + SpvId unsigned_value_id = builder.reserve_id(SpvResultId); + result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::convert(op_code, unsigned_type_id, unsigned_value_id, value_id)); + builder.append(SpvFactory::bitcast(target_type_id, result_id, unsigned_value_id)); } else { + result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::convert(op_code, target_type_id, result_id, value_id)); } return result_id; @@ -580,7 +655,26 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Max *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Max *op): unhandled type: " << op->type << "\n"; } - std::vector args = {op->a, op->b}; + + std::vector args; + args.reserve(2); + if(op->type.is_vector()) { + if(op->a.type().is_scalar()) { + Expr a_vector = Broadcast::make(op->a, op->type.lanes()); + args.push_back(a_vector); + } else { + args.push_back(op->a); + } + if(op->b.type().is_scalar()) { + Expr b_vector = Broadcast::make(op->b, op->type.lanes()); + args.push_back(b_vector); + } else { + args.push_back(op->b); + } + } else { + args.push_back(op->a); + args.push_back(op->b); + } visit_glsl_op(op_code, op->type, args); } @@ -596,18 +690,49 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op): unhandled type: " << op->type << "\n"; } - std::vector args = {op->a, op->b}; + + std::vector args; + args.reserve(2); + if(op->type.is_vector()) { + if(op->a.type().is_scalar()) { + Expr a_vector = Broadcast::make(op->a, op->type.lanes()); + args.push_back(a_vector); + } else { + args.push_back(op->a); + } + if(op->b.type().is_scalar()) { + Expr b_vector = Broadcast::make(op->b, op->type.lanes()); + args.push_back(b_vector); + } else { + args.push_back(op->b); + } + } else { + args.push_back(op->a); + args.push_back(op->b); + } visit_glsl_op(op_code, op->type, args); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(EQ): " << op->type << " (" << op->a << ") == (" << op->b << ")\n"; visit_binary_op(op->type.is_float() ? SpvOpFOrdEqual : SpvOpIEqual, op->type, op->a, op->b); + if(!op->type.is_bool()) { + Type bool_type = UInt(1, op->type.lanes()); + SpvId current_id = builder.current_id(); + SpvId result_id = cast_type(op->type, bool_type, current_id); + builder.update_id(result_id); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const NE *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(NE): " << op->type << " (" << op->a << ") != (" << op->b << ")\n"; visit_binary_op(op->type.is_float() ? 
SpvOpFOrdNotEqual : SpvOpINotEqual, op->type, op->a, op->b); + if(!op->type.is_bool()) { + Type bool_type = UInt(1, op->type.lanes()); + SpvId current_id = builder.current_id(); + SpvId result_id = cast_type(op->type, bool_type, current_id); + builder.update_id(result_id); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { @@ -623,6 +748,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op): unhandled type: " << op->a.type() << "\n"; } visit_binary_op(op_code, op->type, op->a, op->b); + if(!op->type.is_bool()) { + Type bool_type = UInt(1, op->type.lanes()); + SpvId current_id = builder.current_id(); + SpvId result_id = cast_type(op->type, bool_type, current_id); + builder.update_id(result_id); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { @@ -638,6 +769,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op): unhandled type: " << op->a.type() << "\n"; } visit_binary_op(op_code, op->type, op->a, op->b); + if(!op->type.is_bool()) { + Type bool_type = UInt(1, op->type.lanes()); + SpvId current_id = builder.current_id(); + SpvId result_id = cast_type(op->type, bool_type, current_id); + builder.update_id(result_id); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { @@ -653,6 +790,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op): unhandled type: " << op->a.type() << "\n"; } visit_binary_op(op_code, op->type, op->a, op->b); + if(!op->type.is_bool()) { + Type bool_type = UInt(1, op->type.lanes()); + SpvId current_id = builder.current_id(); + SpvId result_id = cast_type(op->type, bool_type, current_id); + builder.update_id(result_id); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { @@ -668,6 +811,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op): unhandled type: " << op->a.type() << "\n"; } visit_binary_op(op_code, op->type, op->a, op->b); + if(!op->type.is_bool()) { + Type bool_type = UInt(1, op->type.lanes()); + SpvId current_id = builder.current_id(); + SpvId result_id = cast_type(op->type, bool_type, current_id); + builder.update_id(result_id); + } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const And *op) { @@ -749,6 +898,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { Expr b = op->args[1]; Expr e = cast(op->type, select(a < b, b - a, a - b)); e->accept(this); + } else if (op->is_intrinsic(Call::return_second)) { internal_assert(op->args.size() == 2); // Simply discard the first argument, which is generally a call to @@ -825,7 +975,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } } else if (op->is_intrinsic(Call::shift_left)) { internal_assert(op->args.size() == 2); - visit_binary_op(SpvOpShiftLeftLogical, op->type, op->args[0], op->args[1]); + if (op->type.is_uint()) { + visit_binary_op(SpvOpShiftLeftLogical, op->type, op->args[0], op->args[1]); + } else { + Expr e = lower_signed_shift_left(op->args[0], op->args[1]); + e.accept(this); + } } else if (op->is_intrinsic(Call::strict_float)) { // TODO: Enable/Disable RelaxedPrecision flags? 
internal_assert(op->args.size() == 1); @@ -869,29 +1024,51 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (op->is_intrinsic(Call::mux)) { Expr e = lower_mux(op); e.accept(this); - } else if (op->is_intrinsic(Call::widen_right_add) || - op->is_intrinsic(Call::widen_right_mul) || - op->is_intrinsic(Call::widen_right_sub) || - op->is_intrinsic(Call::widening_add) || - op->is_intrinsic(Call::widening_mul) || - op->is_intrinsic(Call::widening_sub) || - op->is_intrinsic(Call::widening_shift_left) || - op->is_intrinsic(Call::widening_shift_right) || - op->is_intrinsic(Call::rounding_shift_left) || - op->is_intrinsic(Call::rounding_shift_right) || - op->is_intrinsic(Call::saturating_cast) || - op->is_intrinsic(Call::saturating_add) || - op->is_intrinsic(Call::saturating_sub) || - op->is_intrinsic(Call::saturating_cast) || - op->is_intrinsic(Call::halving_add) || - op->is_intrinsic(Call::halving_sub) || - op->is_intrinsic(Call::rounding_halving_add) || - op->is_intrinsic(Call::mul_shift_right) || - op->is_intrinsic(Call::rounding_mul_shift_right)) { + } else if (op->is_intrinsic(Call::saturating_cast)) { Expr e = lower_intrinsic(op); e.accept(this); - return; + } else if (op->is_intrinsic()) { + Expr lowered = lower_intrinsic(op); + if (lowered.defined()) { + Expr e = lower_intrinsic(op); + e.accept(this); + } else { + internal_error << "Unhandled intrinsic in Vulkan backend: " << op->name << "\n"; + } + } else if (starts_with(op->name, "fast_inverse_f")) { + internal_assert(op->args.size() == 1); + + if(op->type.lanes() > 1) { + user_error << "Vulkan: Expected scalar value for fast_inverse!\n"; + } + + op->args[0].accept(this); + SpvId arg_value_id = builder.current_id(); + + SpvId one_constant_id = SpvInvalidId; + SpvId type_id = builder.declare_type(op->type); + if(op->type.is_float() && op->type.bits() == 16) { + if(op->type.is_bfloat()) { + bfloat16_t one_value = bfloat16_t(1.0f); + one_constant_id = builder.declare_constant(op->type, &one_value); + } else { + float16_t one_value = float16_t(1.0f); + one_constant_id = builder.declare_constant(op->type, &one_value); + } + } else if(op->type.is_float() && op->type.bits() == 32) { + float one_value = float(1.0f); + one_constant_id = builder.declare_constant(op->type, &one_value); + } else if(op->type.is_float() && op->type.bits() == 64) { + double one_value = double(1.0); + one_constant_id = builder.declare_constant(op->type, &one_value); + } else { + internal_error << "Vulkan: Unhandled float type in fast_inverse intrinsic!\n"; + } + internal_assert(one_constant_id != SpvInvalidId); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(SpvOpFDiv, type_id, result_id, one_constant_id, arg_value_id)); + builder.update_id(result_id); } else if (op->name == "nan_f32") { float value = NAN; SpvId result_id = builder.declare_constant(Float(32), &value); @@ -1171,8 +1348,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { storage_type = it->second.storage_type; } - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): value_type=" << op->type << " storage_type=" << storage_type << "\n"; - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Load): index_type=" << op->index.type() << " index=" << op->index << "\n"; + debug(2) << " value_type=" << op->type << " storage_type=" << storage_type << "\n"; + debug(2) << " index_type=" << op->index.type() << " index=" << op->index << "\n"; if (op->index.type().is_scalar()) { op->index.accept(this); @@ -1187,7 +1364,7 @@ 
void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): " << op->name << "[" << op->index << "] = (" << op->value << ")\n"; user_assert(is_const_one(op->predicate)) << "Predicated stores not supported by SPIR-V codegen!\n"; - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): value_type=" << op->value.type() << " value=" << op->value << "\n"; + debug(2) << " value_type=" << op->value.type() << " value=" << op->value << "\n"; op->value.accept(this); SpvId value_id = builder.current_id(); @@ -1209,8 +1386,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { storage_type = it->second.storage_type; } - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): value_type=" << value_type << " storage_type=" << storage_type << "\n"; - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Store): index_type=" << op->index.type() << " index=" << op->index << "\n"; + debug(2) << " value_type=" << value_type << " storage_type=" << storage_type << "\n"; + debug(2) << " index_type=" << op->index.type() << " index=" << op->index << "\n"; if (op->index.type().is_scalar()) { op->index.accept(this); SpvId index_id = builder.current_id(); @@ -1279,7 +1456,7 @@ int thread_loop_workgroup_index(const std::string &name) { } // anonymous namespace void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(For): " << op->name << "\n"; + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(For): name=" << op->name << " min=" << op->min << " extent=" << op->extent << "\n"; if (is_gpu_var(op->name)) { internal_assert((op->for_type == ForType::GPUBlock) || @@ -1305,20 +1482,19 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { // Intrinsics are inserted when adding the kernel internal_assert(symbol_table.contains(intrinsic.first)); SpvId intrinsic_id = symbol_table.get(intrinsic.first).first; + SpvStorageClass storage_class = symbol_table.get(intrinsic.first).second; - // extract and cast to int (which is what's expected by Halide's for loops) - SpvId unsigned_type_id = builder.declare_type(UInt(32)); - SpvId unsigned_gpu_var_id = builder.reserve_id(SpvResultId); - SpvId signed_type_id = builder.declare_type(Int(32)); - SpvId signed_gpu_var_id = builder.reserve_id(SpvResultId); + // extract and cast to the extent type (which is what's expected by Halide's for loops) + Type unsigned_type = UInt(32); + SpvId unsigned_type_id = builder.declare_type(unsigned_type); + SpvId unsigned_value_id = builder.reserve_id(SpvResultId); SpvFactory::Indices indices = {intrinsic.second}; - builder.append(SpvFactory::composite_extract(unsigned_type_id, unsigned_gpu_var_id, intrinsic_id, indices)); - builder.append(SpvFactory::bitcast(signed_type_id, signed_gpu_var_id, unsigned_gpu_var_id)); + builder.append(SpvFactory::composite_extract(unsigned_type_id, unsigned_value_id, intrinsic_id, indices)); + SpvId intrinsic_value_id = cast_type(op->min.type(), unsigned_type, unsigned_value_id); { - ScopedSymbolBinding binding(symbol_table, op->name, {signed_gpu_var_id, SpvStorageClassUniform}); + ScopedSymbolBinding binding(symbol_table, op->name, {intrinsic_value_id, storage_class}); op->body.accept(this); } - } else { debug(2) << " (serial for loop): min=" << op->min << " extent=" << op->extent << "\n"; @@ -1333,7 +1509,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { SpvId extent_id = builder.current_id(); // Compute max. 
- Type index_type = Int(32); + Type index_type = op->min.type(); SpvId index_type_id = builder.declare_type(index_type); SpvStorageClass storage_class = SpvStorageClassFunction; SpvId index_var_type_id = builder.declare_pointer_type(index_type_id, storage_class); @@ -1342,7 +1518,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { // Declare loop var const std::string loop_var_name = unique_name("_loop_idx"); - SpvId loop_var_id = builder.declare_variable(loop_var_name, index_var_type_id, storage_class, min_id); + debug(2) << " loop_index=" << loop_var_name << " type=" << index_type << "\n"; + SpvId loop_var_id = builder.declare_variable(loop_var_name, index_var_type_id, storage_class); symbol_table.push(loop_var_name, {loop_var_id, storage_class}); SpvId header_block_id = builder.reserve_id(SpvBlockId); @@ -1351,22 +1528,23 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { SpvId continue_block_id = builder.reserve_id(SpvBlockId); SpvId merge_block_id = builder.reserve_id(SpvBlockId); + builder.append(SpvFactory::store(loop_var_id, min_id)); SpvBlock header_block = builder.create_block(header_block_id); builder.enter_block(header_block); - { + { builder.append(SpvFactory::loop_merge(merge_block_id, continue_block_id, SpvLoopControlDontUnrollMask)); builder.append(SpvFactory::branch(top_block_id)); } builder.leave_block(); - SpvId current_index_id = builder.reserve_id(SpvResultId); + SpvId loop_index_id = builder.reserve_id(SpvResultId); SpvBlock top_block = builder.create_block(top_block_id); builder.enter_block(top_block); { SpvId loop_test_type_id = builder.declare_type(Bool()); SpvId loop_test_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::load(index_type_id, current_index_id, loop_var_id)); - builder.append(SpvFactory::less_than(loop_test_type_id, loop_test_id, current_index_id, max_id, true)); + builder.append(SpvFactory::load(index_type_id, loop_index_id, loop_var_id)); + builder.append(SpvFactory::integer_less_than(loop_test_type_id, loop_test_id, loop_index_id, max_id, index_type.is_uint())); builder.append(SpvFactory::conditional_branch(loop_test_id, body_block_id, merge_block_id)); } builder.leave_block(); @@ -1374,7 +1552,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { SpvBlock body_block = builder.create_block(body_block_id); builder.enter_block(body_block); { - ScopedSymbolBinding binding(symbol_table, op->name, {current_index_id, storage_class}); + ScopedSymbolBinding binding(symbol_table, op->name, {loop_index_id, storage_class}); op->body.accept(this); builder.append(SpvFactory::branch(continue_block_id)); } @@ -1386,7 +1564,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { // Update loop variable int32_t one = 1; SpvId next_index_id = builder.reserve_id(SpvResultId); - SpvId constant_one_id = builder.declare_constant(Int(32), &one); + SpvId constant_one_id = builder.declare_constant(index_type, &one); + SpvId current_index_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::load(index_type_id, current_index_id, loop_var_id)); builder.append(SpvFactory::integer_add(index_type_id, next_index_id, current_index_id, constant_one_id)); builder.append(SpvFactory::store(loop_var_id, next_index_id)); builder.append(SpvFactory::branch(header_block_id)); @@ -1420,8 +1600,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Ramp *op) { SpvId this_id = builder.reserve_id(SpvResultId); if (op->base.type().is_float()) { builder.append(SpvFactory::float_add(base_type_id, this_id, 
prev_id, stride_id)); - } else { + } else if(op->base.type().is_int_or_uint()) { builder.append(SpvFactory::integer_add(base_type_id, this_id, prev_id, stride_id)); + } else { + internal_error << "SPIRV: Unhandled base type encountered in ramp!\n"; } constituents.push_back(this_id); prev_id = this_id; @@ -1516,8 +1698,7 @@ template SpvFactory::BlockVariables CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case) { - condition.accept(this); - SpvId cond_id = builder.current_id(); + SpvId merge_block_id = builder.reserve_id(SpvBlockId); SpvId if_block_id = builder.reserve_id(SpvBlockId); SpvId then_block_id = builder.reserve_id(SpvBlockId); @@ -1525,24 +1706,24 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, SpvFactory::BlockVariables block_vars; - // If Conditional + // If block + debug(2) << "Vulkan: If => (" << condition << " )\n"; SpvBlock if_block = builder.create_block(if_block_id); builder.enter_block(if_block); { - debug(2) << "Vulkan: If (" << condition << " )\n"; - + condition.accept(this); + SpvId cond_id = builder.current_id(); builder.append(SpvFactory::selection_merge(merge_block_id, SpvSelectionControlMaskNone)); builder.append(SpvFactory::conditional_branch(cond_id, then_block_id, else_block_id)); } builder.leave_block(); // Then block + debug(2) << "Vulkan: Then =>\n" << then_case << "\n"; SpvBlock then_block = builder.create_block(then_block_id); builder.enter_block(then_block); { then_case.accept(this); - debug(2) << "Vulkan: Then {" << then_case << " }\n"; - SpvId then_id = builder.current_id(); builder.append(SpvFactory::branch(merge_block_id)); block_vars.push_back({then_id, then_block_id}); @@ -1551,11 +1732,11 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, // Else block (optional) if (else_case.defined()) { + debug(2) << "Vulkan: Else =>\n" << else_case << "\n"; SpvBlock else_block = builder.create_block(else_block_id); builder.enter_block(else_block); { else_case.accept(this); - debug(2) << "Vulkan: Else { " << else_case << " }\n"; SpvId else_id = builder.current_id(); builder.append(SpvFactory::branch(merge_block_id)); block_vars.push_back({else_id, else_block_id}); @@ -1961,20 +2142,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_entry_point(const Stmt &s, SpvId for (const std::string &intrinsic_name : find_intrinsics.intrinsics_used) { // The builtins are pointers to vec3 + SpvStorageClass storage_class = SpvStorageClassInput; SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); - SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, SpvStorageClassInput); - SpvId intrinsic_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, SpvStorageClassInput); + SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, storage_class); + SpvId intrinsic_var_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, storage_class); SpvId intrinsic_loaded_id = builder.reserve_id(); - builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_id)); - symbol_table.push(intrinsic_name, {intrinsic_loaded_id, SpvStorageClassInput}); + builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_var_id)); + symbol_table.push(intrinsic_name, {intrinsic_loaded_id, storage_class}); // Annotate that this is the specific builtin SpvBuiltIn built_in_kind = map_simt_builtin(intrinsic_name); SpvBuilder::Literals 
annotation_literals = {(uint32_t)built_in_kind}; - builder.add_annotation(intrinsic_id, SpvDecorationBuiltIn, annotation_literals); + builder.add_annotation(intrinsic_var_id, SpvDecorationBuiltIn, annotation_literals); // Add the builtin to the interface - entry_point_variables.push_back(intrinsic_id); + entry_point_variables.push_back(intrinsic_var_id); } // Add the entry point with the appropriate execution model From fee92c683c3f73cafb33ceb09b111c1cb6cf6144 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 28 Oct 2022 13:12:28 -0700 Subject: [PATCH 069/166] Add SpvInstruction::add_operands(), add_immediates() and template based append() Make integer logical operations explicit. Better handling of constant data. --- src/SpirvIR.cpp | 488 ++++++++++++++++++++++++++++++++---------------- src/SpirvIR.h | 29 ++- 2 files changed, 350 insertions(+), 167 deletions(-) diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index b63931100b34..abf3b24a1e82 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -8,6 +8,101 @@ namespace Internal { namespace { +template +T saturate_value(T val, T min = std::numeric_limits::min(), T max = std::numeric_limits::max()) { + return std::min(std::max(val, min), max); +} + +template +void assign_constant(void* dst, const void* src) { + reinterpret_cast(dst)[0] = saturate_value(reinterpret_cast(src)[0]); +} + +template<> +void assign_constant(void* dst, const void* src) { + reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; +} + +template<> +void assign_constant(void* dst, const void* src) { + reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; +} + +template<> +void assign_constant(void* dst, const void* src) { + reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; +} + +template<> +void assign_constant(void* dst, const void* src) { + reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; +} + +template +std::string stringify_constant(const T& value) { + return std::string(); +} + +template<> +std::string stringify_constant(const int8_t& value) { + return std::to_string(int8_t(value)); +} + +template<> +std::string stringify_constant(const int16_t& value) { + return std::to_string(int16_t(value)); +} + +template<> +std::string stringify_constant(const int32_t& value) { + return std::to_string(int32_t(value)); +} + +template<> +std::string stringify_constant(const int64_t& value) { + return std::to_string(int64_t(value)); +} + +template<> +std::string stringify_constant(const uint8_t& value) { + return std::to_string(uint8_t(value)); +} + +template<> +std::string stringify_constant(const uint16_t& value) { + return std::to_string(uint16_t(value)); +} + +template<> +std::string stringify_constant(const uint32_t& value) { + return std::to_string(uint32_t(value)); +} + +template<> +std::string stringify_constant(const uint64_t& value) { + return std::to_string(uint64_t(value)); +} + +template<> +std::string stringify_constant(const bfloat16_t& value) { + return std::to_string(float(value)); +} + +template<> +std::string stringify_constant(const float16_t& value) { + return std::to_string(float(value)); +} + +template<> +std::string stringify_constant(const float& value) { + return std::to_string(float(value)); +} + +template<> +std::string stringify_constant(const double& value) { + return std::to_string(double(value)); +} + /** Returns the major version of the SPIR-V header version indicator **/ inline uint32_t spirv_major_version(uint32_t version) { return ((version >> 16) & 0xff); @@ -77,12 +172,47 @@ void SpvInstruction::add_operand(SpvId id) { 
contents->value_types.push_back(SpvOperandId); } +void SpvInstruction::add_operands(const SpvInstruction::Operands& operands) { + check_defined(); + SpvInstructionContents::ValueTypes value_types(operands.size(), SpvOperandId); + contents->operands.insert(contents->operands.end(), operands.begin(), operands.end()); + contents->value_types.insert(contents->value_types.end(), value_types.begin(), value_types.end()); +} + void SpvInstruction::add_immediate(SpvId id, SpvValueType value_type) { check_defined(); contents->operands.push_back(id); contents->value_types.push_back(value_type); } +void SpvInstruction::add_immediates(const SpvInstruction::Immediates& literals) { + check_defined(); + for(const SpvInstruction::LiteralValue& v : literals) { + contents->operands.push_back(v.first); // SpvId + contents->value_types.push_back(v.second); // SpvValueType + } +} + +template<> +void SpvInstruction::append(const SpvInstruction::Operands& operands) { + add_operands(operands); +} + +template<> +void SpvInstruction::append(const SpvInstruction::Immediates& immediates) { + add_immediates(immediates); +} + +template<> +void SpvInstruction::append(const std::string& str) { + add_string(str); +} + +template +void SpvInstruction::append(const T&) { + internal_error << "SPIRV: Unhandled type encountered when appending to instruction!\n"; +} + SpvId SpvInstruction::result_id() const { check_defined(); return contents->result_id; @@ -153,21 +283,33 @@ SpvBlock SpvInstruction::block() const { void SpvInstruction::add_data(uint32_t bytes, const void *data, SpvValueType value_type) { check_defined(); - uint32_t extra_words = (bytes + 3) / 4; + + + uint32_t total_entries = (bytes + 3) / 4; + debug(3) << " add_data bytes=" << bytes << " total_entries=" << total_entries << "\n"; + + if(bytes == sizeof(uint32_t)) { + uint32_t entry = 0; + memcpy(&entry, data, sizeof(uint32_t)); + add_immediate(entry, value_type); + return; + } + const size_t entry_size = sizeof(uint32_t); const uint8_t *ptr = (const uint8_t *)data; size_t bytes_copied = 0; - for (uint32_t i = 0; i < extra_words; i++) { - size_t copy_size = std::min(bytes - bytes_copied, (size_t)4); + for (uint32_t i = 0; i < total_entries; i++) { + size_t copy_size = std::min(bytes - bytes_copied, entry_size); SpvId entry = 0; memcpy(&entry, ptr, copy_size); bytes_copied += copy_size; add_immediate(entry, value_type); - ptr += 4; + ptr += entry_size; } } void SpvInstruction::add_string(const std::string &str) { check_defined(); + debug(3) << " add_string str=" << str << " length=" << (uint32_t)str.length() << "\n"; add_data(str.length() + 1, (const void *)str.c_str(), SpvStringData); } @@ -1634,11 +1776,8 @@ SpvId SpvBuilder::declare_string_constant(const std::string &value) { return result_id; } -SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *data) { - if (scalar_type.lanes() != 1) { - internal_error << "SPIRV: Invalid type provided for scalar constant!" 
<< scalar_type << "\n"; - return SpvInvalidId; - } +template +SpvId SpvBuilder::declare_scalar_constant_of_type(const Type& scalar_type, const T* data) { ConstantKey constant_key = make_constant_key(scalar_type, data); ConstantMap::const_iterator it = constant_map.find(constant_key); @@ -1646,33 +1785,106 @@ SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *d return it->second; } - if (scalar_type.is_bool() && data) { - bool value = *reinterpret_cast(data); - return declare_bool_constant(value); - } - SpvId result_id = SpvInvalidId; SpvValueType value_type = SpvInvalidValueType; - if (scalar_type.is_float()) { + if (scalar_type.is_bool()) { + const bool value = (reinterpret_cast(data)[0]); + return declare_bool_constant(value); + } else if (scalar_type.is_float()) { result_id = make_id(SpvFloatConstantId); value_type = SpvFloatData; - debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const float *)(data)) << "\n"; } else if (scalar_type.is_int_or_uint()) { result_id = make_id(SpvIntConstantId); value_type = SpvIntegerData; - debug(3) << " declare_scalar_constant: %" << result_id << " type=" << scalar_type << " data=" << (*(const int32_t *)(data)) << "\n"; } else { internal_error << "SPIRV: Unsupported type:" << scalar_type << "\n"; return SpvInvalidId; } + T value = T(0); + assign_constant(&value, data); SpvId type_id = add_type(scalar_type); - SpvInstruction inst = SpvFactory::constant(result_id, type_id, scalar_type.bytes(), data, value_type); + + debug(3) << " declare_scalar_constant_of_type: " + << "%" << result_id << " " + << "type=" << scalar_type << " " + << "data=" << stringify_constant(value) << "\n"; + + SpvInstruction inst = SpvFactory::constant(result_id, type_id, scalar_type.bytes(), &value, value_type); module.add_constant(inst); constant_map[constant_key] = result_id; return result_id; } +SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *data) { + if (scalar_type.lanes() != 1) { + internal_error << "SPIRV: Invalid type provided for scalar constant!" 
<< scalar_type << "\n"; + return SpvInvalidId; + } + + ConstantKey constant_key = make_constant_key(scalar_type, data); + ConstantMap::const_iterator it = constant_map.find(constant_key); + if (it != constant_map.end()) { + return it->second; + } + + SpvId result_id = SpvInvalidId; + if (scalar_type.is_bool() && data) { + bool value = *reinterpret_cast(data); + return declare_bool_constant(value); + } else if(scalar_type.is_int() && scalar_type.bits() == 8) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_int() && scalar_type.bits() == 16) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_int() && scalar_type.bits() == 32) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_int() && scalar_type.bits() == 64) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_uint() && scalar_type.bits() == 8) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_uint() && scalar_type.bits() == 16) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_uint() && scalar_type.bits() == 32) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_uint() && scalar_type.bits() == 64) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_float() && scalar_type.bits() == 16) { + if(scalar_type.is_bfloat()) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } + } else if(scalar_type.is_float() && scalar_type.bits() == 32) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if(scalar_type.is_float() && scalar_type.bits() == 64) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else { + user_error << "Unhandled constant data conversion from value type '" << scalar_type << "'!\n"; + } + internal_assert(result_id != SpvInvalidId) << "Failed to declare scalar constant of type '" << scalar_type << "'!\n"; + return result_id; +} + +template +SpvBuilder::Components SpvBuilder::declare_constants_for_each_lane(Type type, const void* data) { + SpvBuilder::Components components; + components.reserve(type.lanes()); + + if (type.lanes() == 1) { + internal_error << "SPIRV: Invalid type provided for vector constant!" << type << "\n"; + return components; + } + + Type scalar_type = type.with_lanes(1); + const T *values = reinterpret_cast(data); + for (int c = 0; c < type.lanes(); c++) { + const T *entry = &(values[c]); + SpvId scalar_id = declare_scalar_constant(scalar_type, (const void *)entry); + components.push_back(scalar_id); + } + return components; +} + SpvId SpvBuilder::declare_vector_constant(const Type &type, const void *data) { if (type.lanes() == 1) { internal_error << "SPIRV: Invalid type provided for vector constant!" 
<< type << "\n"; @@ -1685,54 +1897,39 @@ SpvId SpvBuilder::declare_vector_constant(const Type &type, const void *data) { return it->second; } - Type scalar_type = type.with_lanes(1); - std::vector components(type.lanes()); - if (scalar_type.is_float()) { - if (type.bits() == 64) { - const double *values = (const double *)data; - for (int c = 0; c < type.lanes(); c++) { - const double *entry = &(values[c]); - SpvId scalar_id = declare_scalar_constant(scalar_type, (const void *)entry); - components.push_back(scalar_id); - } - } else { - const float *values = (const float *)data; - for (int c = 0; c < type.lanes(); c++) { - const float *entry = &(values[c]); - SpvId scalar_id = declare_scalar_constant(scalar_type, (const void *)entry); - components.push_back(scalar_id); - } - } - } else if (scalar_type.is_bool()) { - const bool *values = (const bool *)data; - for (int c = 0; c < type.lanes(); c++) { - const bool *entry = &(values[c]); - SpvId scalar_id = declare_scalar_constant(scalar_type, (const void *)entry); - components.push_back(scalar_id); - } - } else if (scalar_type.is_int_or_uint()) { - if (type.bits() == 64) { - const uint64_t *values = (const uint64_t *)data; - for (int c = 0; c < type.lanes(); c++) { - const uint64_t *entry = &(values[c]); - SpvId scalar_id = declare_scalar_constant(scalar_type, (const void *)entry); - components.push_back(scalar_id); - } + SpvBuilder::Components components; + if(type.is_int() && type.bits() == 8) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_int() && type.bits() == 16) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_int() && type.bits() == 32) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_int() && type.bits() == 64) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_uint() && type.bits() == 8) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_uint() && type.bits() == 16) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_uint() && type.bits() == 32) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_uint() && type.bits() == 64) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_float() && type.bits() == 16) { + if(type.is_bfloat()) { + components = declare_constants_for_each_lane(type, data); } else { - const uint32_t *values = (const uint32_t *)data; - for (int c = 0; c < type.lanes(); c++) { - const uint32_t *entry = &(values[c]); - SpvId scalar_id = declare_scalar_constant(scalar_type, (const void *)entry); - components.push_back(scalar_id); - } + components = declare_constants_for_each_lane(type, data); } + } else if(type.is_float() && type.bits() == 32) { + components = declare_constants_for_each_lane(type, data); + } else if(type.is_float() && type.bits() == 64) { + components = declare_constants_for_each_lane(type, data); } else { - internal_error << "SPIRV: Unsupported type:" << type << "\n"; - return SpvInvalidId; + user_error << "Unhandled constant data conversion from value type '" << type << "'!"; } - SpvId result_id = make_id(SpvCompositeConstantId); SpvId type_id = add_type(type); + SpvId result_id = make_id(SpvCompositeConstantId); debug(3) << " declare_vector_constant: %" << result_id << " key=" << key << " type=" << type << " data=" << data << "\n"; SpvInstruction inst = SpvFactory::composite_constant(result_id, type_id, components); module.add_constant(inst); @@ -1908,8 
+2105,10 @@ SpvInstruction SpvFactory::label(SpvId result_id) { SpvInstruction SpvFactory::debug_line(SpvId string_id, uint32_t line, uint32_t column) { SpvInstruction inst = SpvInstruction::make(SpvOpLine); inst.add_operand(string_id); - inst.add_immediate(line, SpvIntegerLiteral); - inst.add_immediate(column, SpvIntegerLiteral); + inst.add_immediates({ + {line, SpvIntegerLiteral}, + {column, SpvIntegerLiteral}, + }); return inst; } @@ -1940,8 +2139,10 @@ SpvInstruction SpvFactory::decorate(SpvId target_id, SpvDecoration decoration_ty SpvInstruction SpvFactory::decorate_member(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const SpvFactory::Literals &literals) { SpvInstruction inst = SpvInstruction::make(SpvOpMemberDecorate); inst.add_operand(struct_type_id); - inst.add_immediate(member_index, SpvIntegerLiteral); - inst.add_immediate(decoration_type, SpvIntegerLiteral); + inst.add_immediates({ + {member_index, SpvIntegerLiteral}, + {decoration_type, SpvIntegerLiteral} + }); for (uint32_t l : literals) { inst.add_immediate(l, SpvIntegerLiteral); } @@ -1960,8 +2161,7 @@ SpvInstruction SpvFactory::binary_op(SpvOp op_code, SpvId type_id, SpvId result_ SpvInstruction inst = SpvInstruction::make(op_code); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operand(src_a_id); - inst.add_operand(src_b_id); + inst.add_operands({src_a_id, src_b_id}); return inst; } @@ -1988,8 +2188,10 @@ SpvInstruction SpvFactory::bool_type(SpvId bool_type_id) { SpvInstruction SpvFactory::integer_type(SpvId int_type_id, uint32_t bits, uint32_t signedness) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeInt); inst.set_result_id(int_type_id); - inst.add_immediate(bits, SpvIntegerLiteral); - inst.add_immediate(signedness, SpvIntegerLiteral); + inst.add_immediates({ + {bits, SpvIntegerLiteral}, + {signedness, SpvIntegerLiteral} + }); return inst; } @@ -2011,17 +2213,14 @@ SpvInstruction SpvFactory::vector_type(SpvId vector_type_id, SpvId element_type_ SpvInstruction SpvFactory::array_type(SpvId array_type_id, SpvId element_type_id, SpvId array_size_id) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeArray); inst.set_result_id(array_type_id); - inst.add_operand(element_type_id); - inst.add_operand(array_size_id); + inst.add_operands({ element_type_id, array_size_id }); return inst; } SpvInstruction SpvFactory::struct_type(SpvId result_id, const SpvFactory::MemberTypeIds &member_type_ids) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeStruct); inst.set_result_id(result_id); - for (const SpvId member_type : member_type_ids) { - inst.add_operand(member_type); - } + inst.add_operands(member_type_ids); return inst; } @@ -2044,9 +2243,7 @@ SpvInstruction SpvFactory::function_type(SpvId function_type_id, SpvId return_ty SpvInstruction inst = SpvInstruction::make(SpvOpTypeFunction); inst.set_result_id(function_type_id); inst.add_operand(return_type_id); - for (SpvId type_id : param_type_ids) { - inst.add_operand(type_id); - } + inst.add_operands(param_type_ids); return inst; } @@ -2084,9 +2281,7 @@ SpvInstruction SpvFactory::composite_constant(SpvId result_id, SpvId type_id, co SpvInstruction inst = SpvInstruction::make(SpvOpConstantComposite); inst.set_type_id(type_id); inst.set_result_id(result_id); - for (SpvId scalar_id : components) { - inst.add_operand(scalar_id); - } + inst.add_operands(components); return inst; } @@ -2136,41 +2331,40 @@ SpvInstruction SpvFactory::entry_point(SpvId exec_model, SpvId func_id, const st inst.add_immediate(exec_model, 
SpvIntegerLiteral); inst.add_operand(func_id); inst.add_string(name); - for (SpvId var : variables) { - inst.add_operand(var); - } + inst.add_operands(variables); return inst; } SpvInstruction SpvFactory::memory_model(SpvAddressingModel addressing_model, SpvMemoryModel memory_model) { SpvInstruction inst = SpvInstruction::make(SpvOpMemoryModel); - inst.add_immediate(addressing_model, SpvIntegerLiteral); - inst.add_immediate(memory_model, SpvIntegerLiteral); + inst.add_immediates({ + {addressing_model, SpvIntegerLiteral}, + {memory_model, SpvIntegerLiteral} + }); return inst; } SpvInstruction SpvFactory::exec_mode_local_size(SpvId function_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z) { SpvInstruction inst = SpvInstruction::make(SpvOpExecutionMode); inst.add_operand(function_id); - inst.add_immediate(SpvExecutionModeLocalSize, SpvIntegerLiteral); - inst.add_immediate(wg_size_x, SpvIntegerLiteral); - inst.add_immediate(wg_size_y, SpvIntegerLiteral); - inst.add_immediate(wg_size_z, SpvIntegerLiteral); + inst.add_immediates({ + {SpvExecutionModeLocalSize, SpvIntegerLiteral}, + {wg_size_x, SpvIntegerLiteral}, + {wg_size_y, SpvIntegerLiteral}, + {wg_size_z, SpvIntegerLiteral}, + }); return inst; } SpvInstruction SpvFactory::memory_barrier(SpvId memory_scope_id, SpvId semantics_mask_id) { SpvInstruction inst = SpvInstruction::make(SpvOpMemoryBarrier); - inst.add_operand(memory_scope_id); - inst.add_operand(semantics_mask_id); + inst.add_operands({memory_scope_id, semantics_mask_id}); return inst; } SpvInstruction SpvFactory::control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, SpvId semantics_mask_id) { SpvInstruction inst = SpvInstruction::make(SpvOpControlBarrier); - inst.add_operand(execution_scope_id); - inst.add_operand(memory_scope_id); - inst.add_operand(semantics_mask_id); + inst.add_operands({execution_scope_id, memory_scope_id, semantics_mask_id}); return inst; } @@ -2192,19 +2386,17 @@ SpvInstruction SpvFactory::logical_and(SpvId type_id, SpvId result_id, SpvId src SpvInstruction SpvFactory::shift_right_logical(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id) { SpvInstruction inst = SpvInstruction::make(SpvOpShiftRightLogical); - inst.add_operand(type_id); - inst.add_operand(result_id); - inst.add_operand(src_id); - inst.add_operand(shift_id); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operands({ src_id, shift_id }); return inst; } SpvInstruction SpvFactory::shift_right_arithmetic(SpvId type_id, SpvId result_id, SpvId src_id, SpvId shift_id) { SpvInstruction inst = SpvInstruction::make(SpvOpShiftRightArithmetic); - inst.add_operand(type_id); - inst.add_operand(result_id); - inst.add_operand(src_id); - inst.add_operand(shift_id); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operands({ src_id, shift_id }); return inst; } @@ -2216,9 +2408,7 @@ SpvInstruction SpvFactory::select(SpvId type_id, SpvId result_id, SpvId conditio SpvInstruction inst = SpvInstruction::make(SpvOpSelect); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operand(condition_id); - inst.add_operand(true_id); - inst.add_operand(false_id); + inst.add_operands({condition_id, true_id, false_id}); return inst; } @@ -2227,9 +2417,7 @@ SpvInstruction SpvFactory::in_bounds_access_chain(SpvId type_id, SpvId result_id inst.set_type_id(type_id); inst.set_result_id(result_id); inst.add_operand(base_id); - for (SpvId i : indices) { - inst.add_operand(i); - } + inst.add_operands(indices); return inst; } @@ 
-2237,11 +2425,8 @@ SpvInstruction SpvFactory::pointer_access_chain(SpvId type_id, SpvId result_id, SpvInstruction inst = SpvInstruction::make(SpvOpPtrAccessChain); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operand(base_id); - inst.add_operand(element_id); - for (SpvId i : indices) { - inst.add_operand(i); - } + inst.add_operands({base_id, element_id}); + inst.add_operands(indices); return inst; } @@ -2256,8 +2441,7 @@ SpvInstruction SpvFactory::load(SpvId type_id, SpvId result_id, SpvId ptr_id, ui SpvInstruction SpvFactory::store(SpvId ptr_id, SpvId obj_id, uint32_t access_mask) { SpvInstruction inst = SpvInstruction::make(SpvOpStore); - inst.add_operand(ptr_id); - inst.add_operand(obj_id); + inst.add_operands({ ptr_id, obj_id }); inst.add_immediate(access_mask, SpvBitMaskLiteral); return inst; } @@ -2266,8 +2450,7 @@ SpvInstruction SpvFactory::composite_insert(SpvId type_id, SpvId result_id, SpvI SpvInstruction inst = SpvInstruction::make(SpvOpCompositeInsert); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operand(object_id); - inst.add_operand(composite_id); + inst.add_operands({ object_id, composite_id }); for (SpvId i : indices) { inst.add_immediate(i, SpvIntegerLiteral); } @@ -2299,9 +2482,7 @@ SpvInstruction SpvFactory::vector_insert_dynamic(SpvId type_id, SpvId result_id, SpvInstruction inst = SpvInstruction::make(SpvOpVectorInsertDynamic); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operand(vector_id); - inst.add_operand(value_id); - inst.add_operand(index_id); + inst.add_operands({vector_id, value_id, index_id}); return inst; } @@ -2309,9 +2490,7 @@ SpvInstruction SpvFactory::vector_extract_dynamic(SpvId type_id, SpvId result_id SpvInstruction inst = SpvInstruction::make(SpvOpVectorExtractDynamic); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operand(vector_id); - inst.add_operand(value_id); - inst.add_operand(index_id); + inst.add_operands({vector_id, value_id, index_id}); return inst; } @@ -2367,66 +2546,58 @@ SpvInstruction SpvFactory::branch(SpvId target_label_id) { SpvInstruction SpvFactory::conditional_branch(SpvId condition_label_id, SpvId true_label_id, SpvId false_label_id, const SpvFactory::BranchWeights &weights) { SpvInstruction inst = SpvInstruction::make(SpvOpBranchConditional); - inst.add_operand(condition_label_id); - inst.add_operand(true_label_id); - inst.add_operand(false_label_id); + inst.add_operands({condition_label_id, true_label_id, false_label_id}); for (uint32_t w : weights) { inst.add_immediate(w, SpvIntegerLiteral); } return inst; } -SpvInstruction SpvFactory::equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { +SpvInstruction SpvFactory::integer_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { SpvInstruction inst = SpvInstruction::make(SpvOpIEqual); - inst.add_operand(type_id); - inst.add_operand(result_id); - inst.add_operand(src_a_id); - inst.add_operand(src_b_id); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operands({src_a_id, src_b_id}); return inst; } -SpvInstruction SpvFactory::not_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { +SpvInstruction SpvFactory::integer_not_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id) { SpvInstruction inst = SpvInstruction::make(SpvOpINotEqual); - inst.add_operand(type_id); - inst.add_operand(result_id); - inst.add_operand(src_a_id); - inst.add_operand(src_b_id); + inst.set_type_id(type_id); + 
inst.set_result_id(result_id); + inst.add_operands({src_a_id, src_b_id}); return inst; } -SpvInstruction SpvFactory::less_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { +SpvInstruction SpvFactory::integer_less_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { SpvInstruction inst = SpvInstruction::make(is_signed ? SpvOpSLessThan : SpvOpULessThan); - inst.add_operand(type_id); - inst.add_operand(result_id); - inst.add_operand(src_a_id); - inst.add_operand(src_b_id); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operands({src_a_id, src_b_id}); return inst; } -SpvInstruction SpvFactory::less_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { +SpvInstruction SpvFactory::integer_less_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { SpvInstruction inst = SpvInstruction::make(is_signed ? SpvOpSLessThanEqual : SpvOpULessThanEqual); - inst.add_operand(type_id); - inst.add_operand(result_id); - inst.add_operand(src_a_id); - inst.add_operand(src_b_id); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operands({src_a_id, src_b_id}); return inst; } -SpvInstruction SpvFactory::greater_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { +SpvInstruction SpvFactory::integer_greater_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { SpvInstruction inst = SpvInstruction::make(is_signed ? SpvOpSGreaterThan : SpvOpUGreaterThan); - inst.add_operand(type_id); - inst.add_operand(result_id); - inst.add_operand(src_a_id); - inst.add_operand(src_b_id); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operands({src_a_id, src_b_id}); return inst; } -SpvInstruction SpvFactory::greater_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { +SpvInstruction SpvFactory::integer_greater_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed) { SpvInstruction inst = SpvInstruction::make(is_signed ? 
SpvOpSGreaterThanEqual : SpvOpUGreaterThanEqual); - inst.add_operand(type_id); - inst.add_operand(result_id); - inst.add_operand(src_a_id); - inst.add_operand(src_b_id); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_operands({src_a_id, src_b_id}); return inst; } @@ -2450,8 +2621,7 @@ SpvInstruction SpvFactory::phi(SpvId type_id, SpvId result_id, const SpvFactory: inst.set_type_id(type_id); inst.set_result_id(result_id); for (const SpvFactory::VariableBlockIdPair &vb : block_vars) { - inst.add_operand(vb.first); // variable id - inst.add_operand(vb.second); // block id + inst.add_operands({vb.first, vb.second}); // variable id, block id } return inst; } @@ -2481,9 +2651,7 @@ SpvInstruction SpvFactory::extended(SpvId instruction_set_id, SpvId instruction_ inst.set_result_id(result_id); inst.add_operand(instruction_set_id); inst.add_immediate(instruction_number, SpvIntegerLiteral); - for (SpvId o : operands) { - inst.add_operand(o); - } + inst.add_operands(operands); return inst; } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 037510a886cd..f17db1471cff 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -113,6 +113,7 @@ class SpvFunction; class SpvBlock; class SpvInstruction; class SpvBuilder; +struct SpvFactory; /** Pre-declarations for SPIR-V IR data structures */ struct SpvModuleContents; @@ -129,6 +130,8 @@ using SpvInstructionContentsPtr = IntrusivePtr; /** General interface for representing a SPIR-V Instruction */ class SpvInstruction { public: + using LiteralValue = std::pair; + using Immediates = std::vector; using Operands = std::vector; using ValueTypes = std::vector; @@ -145,10 +148,15 @@ class SpvInstruction { void set_type_id(SpvId id); void set_op_code(SpvOp opcode); void add_operand(SpvId id); + void add_operands(const Operands& operands); void add_immediate(SpvId id, SpvValueType type); + void add_immediates(const Immediates& Immediates); void add_data(uint32_t bytes, const void *data, SpvValueType type); void add_string(const std::string &str); + template + void append(const T& operands_or_immediates_or_strings); + SpvId result_id() const; SpvId type_id() const; SpvOp op_code() const; @@ -343,6 +351,7 @@ class SpvModule { class SpvBuilder { public: using ParamTypes = std::vector; + using Components = std::vector; using StructMemberTypes = std::vector; using Variables = std::vector; using Indices = std::vector; @@ -559,6 +568,12 @@ class SpvBuilder { PointerTypeKey make_pointer_type_key(SpvId base_type_id, SpvStorageClass storage_class) const; SpvId lookup_pointer_type(SpvId base_type_id, SpvStorageClass storage_class) const; + template + SpvId declare_scalar_constant_of_type(const Type& scalar_type, const T* data); + + template + SpvBuilder::Components declare_constants_for_each_lane(Type type, const void* data); + ConstantKey make_bool_constant_key(bool value) const; ConstantKey make_string_constant_key(const std::string &value) const; ConstantKey make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data) const; @@ -663,15 +678,15 @@ struct SpvFactory { static SpvInstruction is_inf(SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction is_nan(SpvId type_id, SpvId result_id, SpvId src_id); static SpvInstruction bitcast(SpvId type_id, SpvId result_id, SpvId src_id); - static SpvInstruction integer_add(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); static SpvInstruction float_add(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); + static SpvInstruction integer_add(SpvId 
type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); + static SpvInstruction integer_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); + static SpvInstruction integer_not_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); + static SpvInstruction integer_less_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); + static SpvInstruction integer_less_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); + static SpvInstruction integer_greater_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); + static SpvInstruction integer_greater_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); static SpvInstruction branch(SpvId target_label_id); - static SpvInstruction equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); - static SpvInstruction not_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id); - static SpvInstruction less_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); - static SpvInstruction less_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); - static SpvInstruction greater_than(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); - static SpvInstruction greater_than_equal(SpvId type_id, SpvId result_id, SpvId src_a_id, SpvId src_b_id, bool is_signed); static SpvInstruction conditional_branch(SpvId condition_label_id, SpvId true_label_id, SpvId false_label_id, const BranchWeights &weights = {}); static SpvInstruction loop_merge(SpvId merge_label_id, SpvId continue_label_id, uint32_t loop_control_mask = SpvLoopControlMaskNone); static SpvInstruction selection_merge(SpvId merge_label_id, uint32_t selection_control_mask = SpvSelectionControlMaskNone); From 12048f348a18e8d136072a1c73b8019364138e81 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 28 Oct 2022 13:22:19 -0700 Subject: [PATCH 070/166] Clang format & tidy pass --- src/CodeGen_Vulkan_Dev.cpp | 102 +++++++------- src/SpirvIR.cpp | 177 ++++++++++++------------- src/SpirvIR.h | 12 +- test/correctness/async_device_copy.cpp | 2 +- 4 files changed, 144 insertions(+), 149 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 81b2eb7e086d..7de9d1ce1455 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -422,7 +422,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const StringImm *imm) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { if (imm->type.bits() == 16) { - if(imm->type.is_bfloat()) { + if (imm->type.is_bfloat()) { const bfloat16_t value = bfloat16_t(imm->value); SpvId constant_id = builder.declare_constant(imm->type, &value); builder.update_id(constant_id); @@ -445,33 +445,33 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { } SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type value_type, SpvId value_id) { - if(!value_type.is_bool()) { + if (!value_type.is_bool()) { value_id = cast_type(Bool(), value_type, value_id); } - uint8_t true_data[ target_type.bytes() ]; - uint8_t false_data[ target_type.bytes() ]; - for(int i = 0; i < target_type.lanes(); ++i) { - if(target_type.is_int_or_uint() && target_type.bits() == 8) { - reinterpret_cast(true_data)[i] = int8_t(1); - reinterpret_cast(false_data)[i] = int8_t(0); - } else if(target_type.is_int_or_uint() && 
target_type.bits() == 16) { - reinterpret_cast(true_data)[i] = int16_t(1); - reinterpret_cast(false_data)[i] = int16_t(0); - } else if(target_type.is_int_or_uint() && target_type.bits() == 32) { - reinterpret_cast(true_data)[i] = int32_t(1); - reinterpret_cast(false_data)[i] = int32_t(0); - } else if(target_type.is_int_or_uint() && target_type.bits() == 64) { - reinterpret_cast(true_data)[i] = int64_t(1); - reinterpret_cast(false_data)[i] = int64_t(0); - } else if(target_type.is_float() && target_type.bits() == 16) { - reinterpret_cast(true_data)[i] = uint16_t(1); - reinterpret_cast(false_data)[i] = uint16_t(0); - } else if(target_type.is_float() && target_type.bits() == 32) { - reinterpret_cast(true_data)[i] = 1.0f; - reinterpret_cast(false_data)[i] = 0.0f; - } else if(target_type.is_float() && target_type.bits() == 64) { - reinterpret_cast(true_data)[i] = 1.0; - reinterpret_cast(false_data)[i] = 0.0; + uint8_t true_data[target_type.bytes()]; + uint8_t false_data[target_type.bytes()]; + for (int i = 0; i < target_type.lanes(); ++i) { + if (target_type.is_int_or_uint() && target_type.bits() == 8) { + reinterpret_cast(true_data)[i] = int8_t(1); + reinterpret_cast(false_data)[i] = int8_t(0); + } else if (target_type.is_int_or_uint() && target_type.bits() == 16) { + reinterpret_cast(true_data)[i] = int16_t(1); + reinterpret_cast(false_data)[i] = int16_t(0); + } else if (target_type.is_int_or_uint() && target_type.bits() == 32) { + reinterpret_cast(true_data)[i] = int32_t(1); + reinterpret_cast(false_data)[i] = int32_t(0); + } else if (target_type.is_int_or_uint() && target_type.bits() == 64) { + reinterpret_cast(true_data)[i] = int64_t(1); + reinterpret_cast(false_data)[i] = int64_t(0); + } else if (target_type.is_float() && target_type.bits() == 16) { + reinterpret_cast(true_data)[i] = uint16_t(1); + reinterpret_cast(false_data)[i] = uint16_t(0); + } else if (target_type.is_float() && target_type.bits() == 32) { + reinterpret_cast(true_data)[i] = 1.0f; + reinterpret_cast(false_data)[i] = 0.0f; + } else if (target_type.is_float() && target_type.bits() == 64) { + reinterpret_cast(true_data)[i] = 1.0; + reinterpret_cast(false_data)[i] = 0.0; } else { user_error << "Unhandled type cast from value type '" << value_type << "' to target type '" << target_type << "'!"; } @@ -486,7 +486,7 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type } SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_type, SpvId value_id) { - debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(): casting from value type '" + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(): casting from value type '" << value_type << "' to target type '" << target_type << "'!\n"; if (value_type == target_type) { @@ -550,9 +550,9 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ builder.append(SpvFactory::bitcast(target_type_id, result_id, value_id)); } else if (op_code == SpvOpSelect) { result_id = convert_to_bool(target_type, value_type, value_id); - } else if(op_code == SpvOpUConvert && target_type.is_int()) { + } else if (op_code == SpvOpUConvert && target_type.is_int()) { // Vulkan requires both value and target types to be unsigned for UConvert - // so do the conversion to an equivalent unsigned type then bitcast this + // so do the conversion to an equivalent unsigned type then bitcast this // result into the target type Type unsigned_type = target_type.with_code(halide_type_uint).narrow(); SpvId unsigned_type_id = 
builder.declare_type(unsigned_type); @@ -658,14 +658,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Max *op) { std::vector args; args.reserve(2); - if(op->type.is_vector()) { - if(op->a.type().is_scalar()) { + if (op->type.is_vector()) { + if (op->a.type().is_scalar()) { Expr a_vector = Broadcast::make(op->a, op->type.lanes()); args.push_back(a_vector); } else { args.push_back(op->a); } - if(op->b.type().is_scalar()) { + if (op->b.type().is_scalar()) { Expr b_vector = Broadcast::make(op->b, op->type.lanes()); args.push_back(b_vector); } else { @@ -693,14 +693,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op) { std::vector args; args.reserve(2); - if(op->type.is_vector()) { - if(op->a.type().is_scalar()) { + if (op->type.is_vector()) { + if (op->a.type().is_scalar()) { Expr a_vector = Broadcast::make(op->a, op->type.lanes()); args.push_back(a_vector); } else { args.push_back(op->a); } - if(op->b.type().is_scalar()) { + if (op->b.type().is_scalar()) { Expr b_vector = Broadcast::make(op->b, op->type.lanes()); args.push_back(b_vector); } else { @@ -716,7 +716,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(EQ): " << op->type << " (" << op->a << ") == (" << op->b << ")\n"; visit_binary_op(op->type.is_float() ? SpvOpFOrdEqual : SpvOpIEqual, op->type, op->a, op->b); - if(!op->type.is_bool()) { + if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); SpvId result_id = cast_type(op->type, bool_type, current_id); @@ -727,7 +727,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const NE *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(NE): " << op->type << " (" << op->a << ") != (" << op->b << ")\n"; visit_binary_op(op->type.is_float() ? 
SpvOpFOrdNotEqual : SpvOpINotEqual, op->type, op->a, op->b); - if(!op->type.is_bool()) { + if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); SpvId result_id = cast_type(op->type, bool_type, current_id); @@ -748,7 +748,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op): unhandled type: " << op->a.type() << "\n"; } visit_binary_op(op_code, op->type, op->a, op->b); - if(!op->type.is_bool()) { + if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); SpvId result_id = cast_type(op->type, bool_type, current_id); @@ -769,7 +769,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op): unhandled type: " << op->a.type() << "\n"; } visit_binary_op(op_code, op->type, op->a, op->b); - if(!op->type.is_bool()) { + if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); SpvId result_id = cast_type(op->type, bool_type, current_id); @@ -790,7 +790,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op): unhandled type: " << op->a.type() << "\n"; } visit_binary_op(op_code, op->type, op->a, op->b); - if(!op->type.is_bool()) { + if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); SpvId result_id = cast_type(op->type, bool_type, current_id); @@ -811,7 +811,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op): unhandled type: " << op->a.type() << "\n"; } visit_binary_op(op_code, op->type, op->a, op->b); - if(!op->type.is_bool()) { + if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); SpvId result_id = cast_type(op->type, bool_type, current_id); @@ -1039,7 +1039,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (starts_with(op->name, "fast_inverse_f")) { internal_assert(op->args.size() == 1); - if(op->type.lanes() > 1) { + if (op->type.lanes() > 1) { user_error << "Vulkan: Expected scalar value for fast_inverse!\n"; } @@ -1048,18 +1048,18 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { SpvId one_constant_id = SpvInvalidId; SpvId type_id = builder.declare_type(op->type); - if(op->type.is_float() && op->type.bits() == 16) { - if(op->type.is_bfloat()) { + if (op->type.is_float() && op->type.bits() == 16) { + if (op->type.is_bfloat()) { bfloat16_t one_value = bfloat16_t(1.0f); one_constant_id = builder.declare_constant(op->type, &one_value); } else { float16_t one_value = float16_t(1.0f); one_constant_id = builder.declare_constant(op->type, &one_value); } - } else if(op->type.is_float() && op->type.bits() == 32) { + } else if (op->type.is_float() && op->type.bits() == 32) { float one_value = float(1.0f); one_constant_id = builder.declare_constant(op->type, &one_value); - } else if(op->type.is_float() && op->type.bits() == 64) { + } else if (op->type.is_float() && op->type.bits() == 64) { double one_value = double(1.0); one_constant_id = builder.declare_constant(op->type, &one_value); } else { @@ -1531,7 +1531,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { builder.append(SpvFactory::store(loop_var_id, min_id)); SpvBlock 
header_block = builder.create_block(header_block_id); builder.enter_block(header_block); - { + { builder.append(SpvFactory::loop_merge(merge_block_id, continue_block_id, SpvLoopControlDontUnrollMask)); builder.append(SpvFactory::branch(top_block_id)); } @@ -1600,7 +1600,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Ramp *op) { SpvId this_id = builder.reserve_id(SpvResultId); if (op->base.type().is_float()) { builder.append(SpvFactory::float_add(base_type_id, this_id, prev_id, stride_id)); - } else if(op->base.type().is_int_or_uint()) { + } else if (op->base.type().is_int_or_uint()) { builder.append(SpvFactory::integer_add(base_type_id, this_id, prev_id, stride_id)); } else { internal_error << "SPIRV: Unhandled base type encountered in ramp!\n"; @@ -1719,7 +1719,8 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, builder.leave_block(); // Then block - debug(2) << "Vulkan: Then =>\n" << then_case << "\n"; + debug(2) << "Vulkan: Then =>\n" + << then_case << "\n"; SpvBlock then_block = builder.create_block(then_block_id); builder.enter_block(then_block); { @@ -1732,7 +1733,8 @@ CodeGen_Vulkan_Dev::SPIRV_Emitter::emit_if_then_else(const Expr &condition, // Else block (optional) if (else_case.defined()) { - debug(2) << "Vulkan: Else =>\n" << else_case << "\n"; + debug(2) << "Vulkan: Else =>\n" + << else_case << "\n"; SpvBlock else_block = builder.create_block(else_block_id); builder.enter_block(else_block); { diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index abf3b24a1e82..8ea92f490571 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -14,92 +14,92 @@ T saturate_value(T val, T min = std::numeric_limits::min(), T max = std::nume } template -void assign_constant(void* dst, const void* src) { - reinterpret_cast(dst)[0] = saturate_value(reinterpret_cast(src)[0]); +void assign_constant(void *dst, const void *src) { + reinterpret_cast(dst)[0] = saturate_value(reinterpret_cast(src)[0]); } template<> -void assign_constant(void* dst, const void* src) { - reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; +void assign_constant(void *dst, const void *src) { + reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; } template<> -void assign_constant(void* dst, const void* src) { - reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; +void assign_constant(void *dst, const void *src) { + reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; } template<> -void assign_constant(void* dst, const void* src) { - reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; +void assign_constant(void *dst, const void *src) { + reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; } template<> -void assign_constant(void* dst, const void* src) { - reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; +void assign_constant(void *dst, const void *src) { + reinterpret_cast(dst)[0] = reinterpret_cast(src)[0]; } template -std::string stringify_constant(const T& value) { +std::string stringify_constant(const T &value) { return std::string(); } template<> -std::string stringify_constant(const int8_t& value) { +std::string stringify_constant(const int8_t &value) { return std::to_string(int8_t(value)); } template<> -std::string stringify_constant(const int16_t& value) { +std::string stringify_constant(const int16_t &value) { return std::to_string(int16_t(value)); } template<> -std::string stringify_constant(const int32_t& value) { +std::string stringify_constant(const int32_t &value) { return std::to_string(int32_t(value)); } template<> -std::string stringify_constant(const int64_t& value) { 
+std::string stringify_constant(const int64_t &value) { return std::to_string(int64_t(value)); } template<> -std::string stringify_constant(const uint8_t& value) { +std::string stringify_constant(const uint8_t &value) { return std::to_string(uint8_t(value)); } template<> -std::string stringify_constant(const uint16_t& value) { +std::string stringify_constant(const uint16_t &value) { return std::to_string(uint16_t(value)); } template<> -std::string stringify_constant(const uint32_t& value) { +std::string stringify_constant(const uint32_t &value) { return std::to_string(uint32_t(value)); } template<> -std::string stringify_constant(const uint64_t& value) { +std::string stringify_constant(const uint64_t &value) { return std::to_string(uint64_t(value)); } template<> -std::string stringify_constant(const bfloat16_t& value) { +std::string stringify_constant(const bfloat16_t &value) { return std::to_string(float(value)); } template<> -std::string stringify_constant(const float16_t& value) { +std::string stringify_constant(const float16_t &value) { return std::to_string(float(value)); } template<> -std::string stringify_constant(const float& value) { +std::string stringify_constant(const float &value) { return std::to_string(float(value)); } template<> -std::string stringify_constant(const double& value) { +std::string stringify_constant(const double &value) { return std::to_string(double(value)); } @@ -172,7 +172,7 @@ void SpvInstruction::add_operand(SpvId id) { contents->value_types.push_back(SpvOperandId); } -void SpvInstruction::add_operands(const SpvInstruction::Operands& operands) { +void SpvInstruction::add_operands(const SpvInstruction::Operands &operands) { check_defined(); SpvInstructionContents::ValueTypes value_types(operands.size(), SpvOperandId); contents->operands.insert(contents->operands.end(), operands.begin(), operands.end()); @@ -185,31 +185,31 @@ void SpvInstruction::add_immediate(SpvId id, SpvValueType value_type) { contents->value_types.push_back(value_type); } -void SpvInstruction::add_immediates(const SpvInstruction::Immediates& literals) { +void SpvInstruction::add_immediates(const SpvInstruction::Immediates &literals) { check_defined(); - for(const SpvInstruction::LiteralValue& v : literals) { - contents->operands.push_back(v.first); // SpvId - contents->value_types.push_back(v.second); // SpvValueType + for (const SpvInstruction::LiteralValue &v : literals) { + contents->operands.push_back(v.first); // SpvId + contents->value_types.push_back(v.second); // SpvValueType } } template<> -void SpvInstruction::append(const SpvInstruction::Operands& operands) { +void SpvInstruction::append(const SpvInstruction::Operands &operands) { add_operands(operands); } template<> -void SpvInstruction::append(const SpvInstruction::Immediates& immediates) { +void SpvInstruction::append(const SpvInstruction::Immediates &immediates) { add_immediates(immediates); } template<> -void SpvInstruction::append(const std::string& str) { +void SpvInstruction::append(const std::string &str) { add_string(str); } template -void SpvInstruction::append(const T&) { +void SpvInstruction::append(const T &) { internal_error << "SPIRV: Unhandled type encountered when appending to instruction!\n"; } @@ -283,12 +283,11 @@ SpvBlock SpvInstruction::block() const { void SpvInstruction::add_data(uint32_t bytes, const void *data, SpvValueType value_type) { check_defined(); - - + uint32_t total_entries = (bytes + 3) / 4; debug(3) << " add_data bytes=" << bytes << " total_entries=" << total_entries << "\n"; - if(bytes 
== sizeof(uint32_t)) { + if (bytes == sizeof(uint32_t)) { uint32_t entry = 0; memcpy(&entry, data, sizeof(uint32_t)); add_immediate(entry, value_type); @@ -1777,7 +1776,7 @@ SpvId SpvBuilder::declare_string_constant(const std::string &value) { } template -SpvId SpvBuilder::declare_scalar_constant_of_type(const Type& scalar_type, const T* data) { +SpvId SpvBuilder::declare_scalar_constant_of_type(const Type &scalar_type, const T *data) { ConstantKey constant_key = make_constant_key(scalar_type, data); ConstantMap::const_iterator it = constant_map.find(constant_key); @@ -1805,7 +1804,7 @@ SpvId SpvBuilder::declare_scalar_constant_of_type(const Type& scalar_type, const assign_constant(&value, data); SpvId type_id = add_type(scalar_type); - debug(3) << " declare_scalar_constant_of_type: " + debug(3) << " declare_scalar_constant_of_type: " << "%" << result_id << " " << "type=" << scalar_type << " " << "data=" << stringify_constant(value) << "\n"; @@ -1832,32 +1831,32 @@ SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *d if (scalar_type.is_bool() && data) { bool value = *reinterpret_cast(data); return declare_bool_constant(value); - } else if(scalar_type.is_int() && scalar_type.bits() == 8) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_int() && scalar_type.bits() == 16) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_int() && scalar_type.bits() == 32) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_int() && scalar_type.bits() == 64) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_uint() && scalar_type.bits() == 8) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_uint() && scalar_type.bits() == 16) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_uint() && scalar_type.bits() == 32) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_uint() && scalar_type.bits() == 64) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_float() && scalar_type.bits() == 16) { - if(scalar_type.is_bfloat()) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_int() && scalar_type.bits() == 8) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_int() && scalar_type.bits() == 16) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_int() && scalar_type.bits() == 32) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_int() && scalar_type.bits() == 64) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_uint() && scalar_type.bits() == 8) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_uint() && scalar_type.bits() == 16) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_uint() && scalar_type.bits() == 32) { + result_id = 
declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_uint() && scalar_type.bits() == 64) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_float() && scalar_type.bits() == 16) { + if (scalar_type.is_bfloat()) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); } else { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); } - } else if(scalar_type.is_float() && scalar_type.bits() == 32) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); - } else if(scalar_type.is_float() && scalar_type.bits() == 64) { - result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_float() && scalar_type.bits() == 32) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_float() && scalar_type.bits() == 64) { + result_id = declare_scalar_constant_of_type(scalar_type, reinterpret_cast(data)); } else { user_error << "Unhandled constant data conversion from value type '" << scalar_type << "'!\n"; } @@ -1866,7 +1865,7 @@ SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *d } template -SpvBuilder::Components SpvBuilder::declare_constants_for_each_lane(Type type, const void* data) { +SpvBuilder::Components SpvBuilder::declare_constants_for_each_lane(Type type, const void *data) { SpvBuilder::Components components; components.reserve(type.lanes()); @@ -1898,31 +1897,31 @@ SpvId SpvBuilder::declare_vector_constant(const Type &type, const void *data) { } SpvBuilder::Components components; - if(type.is_int() && type.bits() == 8) { + if (type.is_int() && type.bits() == 8) { components = declare_constants_for_each_lane(type, data); - } else if(type.is_int() && type.bits() == 16) { + } else if (type.is_int() && type.bits() == 16) { components = declare_constants_for_each_lane(type, data); - } else if(type.is_int() && type.bits() == 32) { + } else if (type.is_int() && type.bits() == 32) { components = declare_constants_for_each_lane(type, data); - } else if(type.is_int() && type.bits() == 64) { + } else if (type.is_int() && type.bits() == 64) { components = declare_constants_for_each_lane(type, data); - } else if(type.is_uint() && type.bits() == 8) { + } else if (type.is_uint() && type.bits() == 8) { components = declare_constants_for_each_lane(type, data); - } else if(type.is_uint() && type.bits() == 16) { + } else if (type.is_uint() && type.bits() == 16) { components = declare_constants_for_each_lane(type, data); - } else if(type.is_uint() && type.bits() == 32) { + } else if (type.is_uint() && type.bits() == 32) { components = declare_constants_for_each_lane(type, data); - } else if(type.is_uint() && type.bits() == 64) { + } else if (type.is_uint() && type.bits() == 64) { components = declare_constants_for_each_lane(type, data); - } else if(type.is_float() && type.bits() == 16) { - if(type.is_bfloat()) { - components = declare_constants_for_each_lane(type, data); + } else if (type.is_float() && type.bits() == 16) { + if (type.is_bfloat()) { + components = declare_constants_for_each_lane(type, data); } else { components = declare_constants_for_each_lane(type, data); } - } else if(type.is_float() && type.bits() == 32) { + } else if (type.is_float() && type.bits() == 32) { components = 
declare_constants_for_each_lane(type, data); - } else if(type.is_float() && type.bits() == 64) { + } else if (type.is_float() && type.bits() == 64) { components = declare_constants_for_each_lane(type, data); } else { user_error << "Unhandled constant data conversion from value type '" << type << "'!"; @@ -2139,10 +2138,8 @@ SpvInstruction SpvFactory::decorate(SpvId target_id, SpvDecoration decoration_ty SpvInstruction SpvFactory::decorate_member(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const SpvFactory::Literals &literals) { SpvInstruction inst = SpvInstruction::make(SpvOpMemberDecorate); inst.add_operand(struct_type_id); - inst.add_immediates({ - {member_index, SpvIntegerLiteral}, - {decoration_type, SpvIntegerLiteral} - }); + inst.add_immediates({{member_index, SpvIntegerLiteral}, + {decoration_type, SpvIntegerLiteral}}); for (uint32_t l : literals) { inst.add_immediate(l, SpvIntegerLiteral); } @@ -2188,10 +2185,8 @@ SpvInstruction SpvFactory::bool_type(SpvId bool_type_id) { SpvInstruction SpvFactory::integer_type(SpvId int_type_id, uint32_t bits, uint32_t signedness) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeInt); inst.set_result_id(int_type_id); - inst.add_immediates({ - {bits, SpvIntegerLiteral}, - {signedness, SpvIntegerLiteral} - }); + inst.add_immediates({{bits, SpvIntegerLiteral}, + {signedness, SpvIntegerLiteral}}); return inst; } @@ -2213,7 +2208,7 @@ SpvInstruction SpvFactory::vector_type(SpvId vector_type_id, SpvId element_type_ SpvInstruction SpvFactory::array_type(SpvId array_type_id, SpvId element_type_id, SpvId array_size_id) { SpvInstruction inst = SpvInstruction::make(SpvOpTypeArray); inst.set_result_id(array_type_id); - inst.add_operands({ element_type_id, array_size_id }); + inst.add_operands({element_type_id, array_size_id}); return inst; } @@ -2337,10 +2332,8 @@ SpvInstruction SpvFactory::entry_point(SpvId exec_model, SpvId func_id, const st SpvInstruction SpvFactory::memory_model(SpvAddressingModel addressing_model, SpvMemoryModel memory_model) { SpvInstruction inst = SpvInstruction::make(SpvOpMemoryModel); - inst.add_immediates({ - {addressing_model, SpvIntegerLiteral}, - {memory_model, SpvIntegerLiteral} - }); + inst.add_immediates({{addressing_model, SpvIntegerLiteral}, + {memory_model, SpvIntegerLiteral}}); return inst; } @@ -2388,7 +2381,7 @@ SpvInstruction SpvFactory::shift_right_logical(SpvId type_id, SpvId result_id, S SpvInstruction inst = SpvInstruction::make(SpvOpShiftRightLogical); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operands({ src_id, shift_id }); + inst.add_operands({src_id, shift_id}); return inst; } @@ -2396,7 +2389,7 @@ SpvInstruction SpvFactory::shift_right_arithmetic(SpvId type_id, SpvId result_id SpvInstruction inst = SpvInstruction::make(SpvOpShiftRightArithmetic); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operands({ src_id, shift_id }); + inst.add_operands({src_id, shift_id}); return inst; } @@ -2441,7 +2434,7 @@ SpvInstruction SpvFactory::load(SpvId type_id, SpvId result_id, SpvId ptr_id, ui SpvInstruction SpvFactory::store(SpvId ptr_id, SpvId obj_id, uint32_t access_mask) { SpvInstruction inst = SpvInstruction::make(SpvOpStore); - inst.add_operands({ ptr_id, obj_id }); + inst.add_operands({ptr_id, obj_id}); inst.add_immediate(access_mask, SpvBitMaskLiteral); return inst; } @@ -2450,7 +2443,7 @@ SpvInstruction SpvFactory::composite_insert(SpvId type_id, SpvId result_id, SpvI SpvInstruction inst = 
SpvInstruction::make(SpvOpCompositeInsert); inst.set_type_id(type_id); inst.set_result_id(result_id); - inst.add_operands({ object_id, composite_id }); + inst.add_operands({object_id, composite_id}); for (SpvId i : indices) { inst.add_immediate(i, SpvIntegerLiteral); } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index f17db1471cff..86355d2e54a1 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -148,14 +148,14 @@ class SpvInstruction { void set_type_id(SpvId id); void set_op_code(SpvOp opcode); void add_operand(SpvId id); - void add_operands(const Operands& operands); + void add_operands(const Operands &operands); void add_immediate(SpvId id, SpvValueType type); - void add_immediates(const Immediates& Immediates); + void add_immediates(const Immediates &Immediates); void add_data(uint32_t bytes, const void *data, SpvValueType type); void add_string(const std::string &str); template - void append(const T& operands_or_immediates_or_strings); + void append(const T &operands_or_immediates_or_strings); SpvId result_id() const; SpvId type_id() const; @@ -569,11 +569,11 @@ class SpvBuilder { SpvId lookup_pointer_type(SpvId base_type_id, SpvStorageClass storage_class) const; template - SpvId declare_scalar_constant_of_type(const Type& scalar_type, const T* data); + SpvId declare_scalar_constant_of_type(const Type &scalar_type, const T *data); template - SpvBuilder::Components declare_constants_for_each_lane(Type type, const void* data); - + SpvBuilder::Components declare_constants_for_each_lane(Type type, const void *data); + ConstantKey make_bool_constant_key(bool value) const; ConstantKey make_string_constant_key(const std::string &value) const; ConstantKey make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data) const; diff --git a/test/correctness/async_device_copy.cpp b/test/correctness/async_device_copy.cpp index 7095f5e130b8..ec169a6d0ddf 100644 --- a/test/correctness/async_device_copy.cpp +++ b/test/correctness/async_device_copy.cpp @@ -27,7 +27,7 @@ int main(int argc, char **argv) { " (halide_buffer_copy is unimplemented in that backend).\n"); return 0; } - + if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { printf("Skipping test for Vulkan as it does not support copy_to_host/device() yet" " (halide_buffer_copy is unimplemented in that backend).\n"); From 3f0b71545baa2b48da44d450dddbe5ee4e975559 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 28 Oct 2022 13:46:41 -0700 Subject: [PATCH 071/166] Fix windows build ... 
refactor convert_to_bool to use std::vectors rather than variable-length stack arrays --- src/CodeGen_Vulkan_Dev.cpp | 53 ++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 7de9d1ce1455..17095cae56c0 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -448,30 +448,51 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type if (!value_type.is_bool()) { value_id = cast_type(Bool(), value_type, value_id); } - uint8_t true_data[target_type.bytes()]; - uint8_t false_data[target_type.bytes()]; + std::vector true_data(target_type.bytes(), uint8_t(0)); + std::vector false_data(target_type.bytes(), uint8_t(0)); for (int i = 0; i < target_type.lanes(); ++i) { if (target_type.is_int_or_uint() && target_type.bits() == 8) { - reinterpret_cast(true_data)[i] = int8_t(1); - reinterpret_cast(false_data)[i] = int8_t(0); + int8_t *td = reinterpret_cast(&true_data[0]); + int8_t *fd = reinterpret_cast(&false_data[0]); + td[i] = int8_t(1); + fd[i] = int8_t(0); } else if (target_type.is_int_or_uint() && target_type.bits() == 16) { - reinterpret_cast(true_data)[i] = int16_t(1); - reinterpret_cast(false_data)[i] = int16_t(0); + int16_t *td = reinterpret_cast(&true_data[0]); + int16_t *fd = reinterpret_cast(&false_data[0]); + td[i] = int16_t(1); + fd[i] = int16_t(0); } else if (target_type.is_int_or_uint() && target_type.bits() == 32) { - reinterpret_cast(true_data)[i] = int32_t(1); - reinterpret_cast(false_data)[i] = int32_t(0); + int32_t *td = reinterpret_cast(&true_data[0]); + int32_t *fd = reinterpret_cast(&false_data[0]); + td[i] = int32_t(1); + fd[i] = int32_t(0); } else if (target_type.is_int_or_uint() && target_type.bits() == 64) { - reinterpret_cast(true_data)[i] = int64_t(1); - reinterpret_cast(false_data)[i] = int64_t(0); + int64_t *td = reinterpret_cast(&true_data[0]); + int64_t *fd = reinterpret_cast(&false_data[0]); + td[i] = int64_t(1); + fd[i] = int64_t(0); } else if (target_type.is_float() && target_type.bits() == 16) { - reinterpret_cast(true_data)[i] = uint16_t(1); - reinterpret_cast(false_data)[i] = uint16_t(0); + if (target_type.is_bfloat()) { + bfloat16_t *td = reinterpret_cast(&true_data[0]); + bfloat16_t *fd = reinterpret_cast(&false_data[0]); + td[i] = bfloat16_t(1.0f); + fd[i] = bfloat16_t(0.0f); + } else { + float16_t *td = reinterpret_cast(&true_data[0]); + float16_t *fd = reinterpret_cast(&false_data[0]); + td[i] = float16_t(1.0f); + fd[i] = float16_t(0.0f); + } } else if (target_type.is_float() && target_type.bits() == 32) { - reinterpret_cast(true_data)[i] = 1.0f; - reinterpret_cast(false_data)[i] = 0.0f; + float *td = reinterpret_cast(&true_data[0]); + float *fd = reinterpret_cast(&false_data[0]); + td[i] = float(1.0f); + fd[i] = float(0.0f); } else if (target_type.is_float() && target_type.bits() == 64) { - reinterpret_cast(true_data)[i] = 1.0; - reinterpret_cast(false_data)[i] = 0.0; + double *td = reinterpret_cast(&true_data[0]); + double *fd = reinterpret_cast(&false_data[0]); + td[i] = double(1.0); + fd[i] = double(0.0); } else { user_error << "Unhandled type cast from value type '" << value_type << "' to target type '" << target_type << "'!"; } From f308841bfc29301dbf2bd772368bd3658bd0ae49 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 1 Nov 2022 11:03:50 -0700 Subject: [PATCH 072/166] Skip async_device_copy, device_buffer_copy, device_crop, and device_slice tests for Vulkan (for now).
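
For reference, a minimal standalone sketch of the guard pattern these test changes add (illustrative only, not part of the patch; it uses the existing Halide APIs Target::Vulkan and get_jit_target_from_environment() that the tests below rely on):

    #include "Halide.h"
    #include <cstdio>
    using namespace Halide;

    int main(int argc, char **argv) {
        // Guard: halide_buffer_copy (and thus copy_to_host/device on crops,
        // slices, and async copies) is not implemented for Vulkan yet.
        if (get_jit_target_from_environment().has_feature(Target::Vulkan)) {
            printf("[SKIP] Vulkan does not support this test yet.\n");
            return 0;
        }
        // ... the body of the test would run here ...
        printf("Success!\n");
        return 0;
    }
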
--- test/correctness/async_device_copy.cpp | 2 +- test/correctness/device_buffer_copy.cpp | 2 +- test/correctness/device_crop.cpp | 2 +- test/correctness/device_slice.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/correctness/async_device_copy.cpp b/test/correctness/async_device_copy.cpp index ec169a6d0ddf..2443a8fc01b9 100644 --- a/test/correctness/async_device_copy.cpp +++ b/test/correctness/async_device_copy.cpp @@ -29,7 +29,7 @@ int main(int argc, char **argv) { } if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { - printf("Skipping test for Vulkan as it does not support copy_to_host/device() yet" + printf("[SKIP] Skipping test for Vulkan as it does not support copy_to_host/device() yet" " (halide_buffer_copy is unimplemented in that backend).\n"); return 0; } diff --git a/test/correctness/device_buffer_copy.cpp b/test/correctness/device_buffer_copy.cpp index c19efd15e6c1..596590486d04 100644 --- a/test/correctness/device_buffer_copy.cpp +++ b/test/correctness/device_buffer_copy.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { } if (target.has_feature(Target::Vulkan)) { - printf("Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); + printf("[SKIP] Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); return 0; } diff --git a/test/correctness/device_crop.cpp b/test/correctness/device_crop.cpp index ce02bd6ef995..ddb8e92917a5 100644 --- a/test/correctness/device_crop.cpp +++ b/test/correctness/device_crop.cpp @@ -36,7 +36,7 @@ int main(int argc, char **argv) { } if (target.has_feature(Target::Vulkan)) { - printf("Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); + printf("[SKIP] Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); return 0; } diff --git a/test/correctness/device_slice.cpp b/test/correctness/device_slice.cpp index a77386543e09..924d34939cb3 100644 --- a/test/correctness/device_slice.cpp +++ b/test/correctness/device_slice.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { } if (target.has_feature(Target::Vulkan)) { - printf("Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); + printf("[SKIP] Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); return 0; } From 3b8929700b91e7cbfc0351c5f794bd1630f9117e Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 1 Nov 2022 11:04:37 -0700 Subject: [PATCH 073/166] Don't test large vector widths for Vulkan (since they are optionally supported) --- test/correctness/mul_div_mod.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/correctness/mul_div_mod.cpp b/test/correctness/mul_div_mod.cpp index af82c8bf26be..c42aa25cc3bc 100644 --- a/test/correctness/mul_div_mod.cpp +++ b/test/correctness/mul_div_mod.cpp @@ -550,6 +550,7 @@ int main(int argc, char **argv) { // Test multiplication and division std::vector vector_widths = {1}; if (target.has_feature(Target::Metal) || + target.has_feature(Target::Vulkan) || target.has_feature(Target::D3D12Compute)) { for (int i = 2; i <= 4; i *= 2) { vector_widths.push_back(i); From 5b62620a8cb53788fddc26f5dad36f6e001fcd5b Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 1 Nov 2022 11:05:14 -0700 Subject: [PATCH 074/166] Clear Vulkan buffer allocations prior to use (tbd if this is necessary) --- src/CodeGen_Vulkan_Dev.cpp | 1 + src/runtime/vulkan.cpp | 69 ++++++++++++++++++++++++++++++++++ src/runtime/vulkan_functions.h | 1 
+ 3 files changed, 71 insertions(+) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 17095cae56c0..9eb0aa41b58c 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2393,6 +2393,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, for (const auto &arg : args) { symbol_table.pop(arg.name); } + storage_access_map.clear(); builder.leave_block(); builder.leave_function(); } diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 7250421c0f1d..4cb9a1d4f4e6 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -277,6 +277,75 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { << " Allocated device buffer " << (void *)buf->device << " for buffer " << buf << "\n"; + // retrieve the buffer from the region + VkBuffer *device_buffer = reinterpret_cast(device_region->handle); + if (device_buffer == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve device buffer for device memory!\n"; + return halide_error_code_internal_error; + } + + // create a command buffer + VkCommandBuffer command_buffer; + VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); + if (result != VK_SUCCESS) { + debug(user_context) << "Vulkan: vkCreateCommandBuffer returned: " << vk_get_error_name(result) << "\n"; + return result; + } + + // begin the command buffer + VkCommandBufferBeginInfo command_buffer_begin_info = + { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags + nullptr // pointer to parent command buffer + }; + + result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + if (result != VK_SUCCESS) { + debug(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + // fill buffer with zero values + vkCmdFillBuffer(command_buffer, *device_buffer, 0, device_region->size, 0); + debug(user_context) << " zeroing device_buffer=" << (void*)device_buffer + << " size=" << (uint32_t)device_region->size << "\n"; + + // end the command buffer + result = vkEndCommandBuffer(command_buffer); + if (result != VK_SUCCESS) { + debug(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + // submit the command buffer + VkSubmitInfo submit_info = + { + VK_STRUCTURE_TYPE_SUBMIT_INFO, // struct type + nullptr, // pointer to struct extending this + 0, // wait semaphore count + nullptr, // semaphores + nullptr, // pipeline stages where semaphore waits occur + 1, // how many command buffers to execute + &command_buffer, // the command buffers + 0, // number of semaphores to signal + nullptr // the semaphores to signal + }; + + result = vkQueueSubmit(ctx.queue, 1, &submit_info, 0); + if (result != VK_SUCCESS) { + debug(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return result; + } + + // wait for memset to finish + result = vkQueueWaitIdle(ctx.queue); + if (result != VK_SUCCESS) { + debug(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return result; + } + #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; diff --git a/src/runtime/vulkan_functions.h b/src/runtime/vulkan_functions.h index 6536811e71ee..dcf6701dbd54 100644 --- 
a/src/runtime/vulkan_functions.h +++ b/src/runtime/vulkan_functions.h @@ -40,4 +40,5 @@ VULKAN_FN(vkCmdDispatch) VULKAN_FN(vkQueueSubmit) VULKAN_FN(vkQueueWaitIdle) VULKAN_FN(vkEndCommandBuffer) +VULKAN_FN(vkCmdFillBuffer) VULKAN_FN(vkCmdCopyBuffer) \ No newline at end of file From e8f5355501de8b40f32e4a62965a4ef413470f7d Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 2 Nov 2022 09:26:13 -0700 Subject: [PATCH 075/166] Skip Vulkan for async copy chain test --- test/correctness/async_copy_chain.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/correctness/async_copy_chain.cpp b/test/correctness/async_copy_chain.cpp index 45b014c4bd8b..b573780b9a75 100644 --- a/test/correctness/async_copy_chain.cpp +++ b/test/correctness/async_copy_chain.cpp @@ -31,6 +31,12 @@ int main(int argc, char **argv) { return 0; } + if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { + printf("Skipping test for Vulkan as it does not support copy_to_host/device() yet" + " (halide_buffer_copy is unimplemented in that backend).\n"); + return 0; + } + // Make a list of extern pipeline stages (just copies) all async // and connected by double buffers, then try various nestings of // them. This is a stress test of the async extern storage folding From 06739d38453251474d953ff03d2a1cba78bbfce0 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 2 Nov 2022 09:26:43 -0700 Subject: [PATCH 076/166] Skip Vulkan for interpreter test --- test/correctness/interpreter.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/correctness/interpreter.cpp b/test/correctness/interpreter.cpp index d230616b87e0..11751fb70ba2 100644 --- a/test/correctness/interpreter.cpp +++ b/test/correctness/interpreter.cpp @@ -8,6 +8,7 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.os == Target::Windows && (target.has_feature(Target::OpenCL) || + target.has_feature(Target::Vulkan) || target.has_feature(Target::D3D12Compute))) { printf("[SKIP] workaround for issue #5738\n"); return 0; From bb7a8bb5fb989ff705ac699dbfd86a23d090c325 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 2 Nov 2022 09:30:32 -0700 Subject: [PATCH 077/166] Clang tidy/format pass --- src/runtime/vulkan.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 4cb9a1d4f4e6..75f9f81ef26a 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -294,12 +294,12 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { // begin the command buffer VkCommandBufferBeginInfo command_buffer_begin_info = - { - VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type - nullptr, // pointer to struct extending this - VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags - nullptr // pointer to parent command buffer - }; + { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags + nullptr // pointer to parent command buffer + }; result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { @@ -309,7 +309,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { // fill buffer with zero values vkCmdFillBuffer(command_buffer, *device_buffer, 0, device_region->size, 0); - debug(user_context) << " zeroing device_buffer=" << (void*)device_buffer + debug(user_context) << " zeroing device_buffer=" << 
(void *)device_buffer << " size=" << (uint32_t)device_region->size << "\n"; // end the command buffer From f8bd24338afc40bf70f532001226db3c025f8214 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 2 Nov 2022 09:31:19 -0700 Subject: [PATCH 078/166] Fix formatting --- test/correctness/mul_div_mod.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/correctness/mul_div_mod.cpp b/test/correctness/mul_div_mod.cpp index c42aa25cc3bc..020e45ad75b8 100644 --- a/test/correctness/mul_div_mod.cpp +++ b/test/correctness/mul_div_mod.cpp @@ -550,7 +550,7 @@ int main(int argc, char **argv) { // Test multiplication and division std::vector vector_widths = {1}; if (target.has_feature(Target::Metal) || - target.has_feature(Target::Vulkan) || + target.has_feature(Target::Vulkan) || target.has_feature(Target::D3D12Compute)) { for (int i = 2; i <= 4; i *= 2) { vector_widths.push_back(i); From 4040f2abe2141c21c45c4fe4e00beefd1574c14d Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 2 Nov 2022 11:17:16 -0700 Subject: [PATCH 079/166] Fix build ... use error messages for errors --- src/runtime/internal/block_allocator.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 9bf7a4efb172..2d3790c7752b 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -157,8 +157,8 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r #endif BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); if (block_entry == nullptr) { - StackBasicPrinter<256>(nullptr) << "BlockAllocator: Failed to allocate new empty block of requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + error(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" + << (int32_t)(request.size) << " bytes)!\n"; return nullptr; } @@ -173,8 +173,8 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r size_t actual_size = constrain_requested_size(request.size); block_entry = create_block_entry(user_context, request.properties, actual_size, request.dedicated); if (block_entry == nullptr) { - StackBasicPrinter<256>(nullptr) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" - << (int32_t)(actual_size) << " bytes)!\n"; + error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" + << (int32_t)(actual_size) << " bytes)!\n"; return nullptr; } @@ -366,7 +366,7 @@ void BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocato BlockAllocator::BlockEntry * BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { if (config.maximum_block_count && (block_count() >= config.maximum_block_count)) { - debug(user_context) << "BlockAllocator: No free blocks found! Maximum block count reached (" + error(user_context) << "BlockAllocator: No free blocks found! Maximum block count reached (" << (int32_t)(config.maximum_block_count) << ")!\n"; return nullptr; } From 5bdb204b474b11ae1d4c48c9512de5c083295b4a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:08:22 -1000 Subject: [PATCH 080/166] Separate shared memory resources by element type for Vulkan. 
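
For context, an illustrative schedule (not taken from this patch; the Func names, tile sizes, and the "host-vulkan" target string are assumptions) that produces two GPU shared-memory allocations with different element types. With this change the Vulkan backend keeps them as separate workgroup allocations rather than folding them into one combined byte buffer, matching the OpenGLCompute and D3D12Compute behaviour:

    #include "Halide.h"
    using namespace Halide;

    int main(int argc, char **argv) {
        Var x("x"), y("y"), xo("xo"), yo("yo"), xi("xi"), yi("yi");
        Func stage8("stage8"), stage32("stage32"), out("out");

        stage8(x, y) = cast<uint8_t>(x + y);
        stage32(x, y) = cast<float>(x * y);
        out(x, y) = cast<float>(stage8(x, y)) + stage32(x, y);

        out.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
        // Staging both intermediates in shared memory yields one uint8 and one
        // float allocation per workgroup; these are no longer merged on Vulkan.
        stage8.compute_at(out, xo).store_in(MemoryType::GPUShared).gpu_threads(x, y);
        stage32.compute_at(out, xo).store_in(MemoryType::GPUShared).gpu_threads(x, y);

        // Requires a Vulkan-capable host; shown only to indicate the target.
        out.compile_jit(Target("host-vulkan"));
        return 0;
    }
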
--- src/FuseGPUThreadLoops.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index 013024510cd5..e50d9da1c7c6 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -759,7 +759,7 @@ class ExtractSharedAndHeapAllocations : public IRMutator { // lifetimes, and then cluster the groups according to which // ones can share a single allocation. For cuda, opencl, and // similar we get one big combined allocation per memory - // type. For openglcompute and direct3d, we also separate by + // type. For vulkan, openglcompute and direct3d, we also separate by // element type. map, vector> clustered_allocs; @@ -1027,7 +1027,8 @@ class ExtractSharedAndHeapAllocations : public IRMutator { device_api(d), thread_id_var_name(unique_name('t')), num_threads_var_name(unique_name('t')), - may_merge_allocs_of_different_type(device_api != DeviceAPI::OpenGLCompute && + may_merge_allocs_of_different_type(device_api != DeviceAPI::Vulkan && + device_api != DeviceAPI::OpenGLCompute && device_api != DeviceAPI::D3D12Compute) { } }; // namespace Internal From 67c0881d7d81ae377155bfaff6255c31c8b68ffd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:09:33 -1000 Subject: [PATCH 081/166] Add Vulkan to conditional for fusing gpu loops --- src/Lower.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Lower.cpp b/src/Lower.cpp index 42bcf54bb4da..7676fdd8f7c0 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -321,6 +321,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after vectorizing:", s); if (t.has_gpu_feature() || + t.has_feature(Target::Vulkan) || t.has_feature(Target::OpenGLCompute)) { debug(1) << "Injecting per-block gpu synchronization...\n"; s = fuse_gpu_thread_loops(s); From 517c987fd6bdbcc47ab4ec37f5c6e4f645c21303 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:11:17 -1000 Subject: [PATCH 082/166] Reorder reset method to match declaration ordering. 
--- src/SpirvIR.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 8ea92f490571..bfd1f029e279 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -970,6 +970,10 @@ SpvBuilder::SpvBuilder() { void SpvBuilder::reset() { + active_id = SpvInvalidId; + active_function = SpvFunction(); + active_block = SpvBlock(); + kind_map.clear(); type_map.clear(); struct_map.clear(); @@ -985,10 +989,6 @@ void SpvBuilder::reset() { variable_type_map.clear(); function_type_map.clear(); - active_id = SpvInvalidId; - active_block = SpvBlock(); - active_function = SpvFunction(); - SpvId module_id = make_id(SpvModuleId); module = SpvModule::make(module_id); } From 81cefde1bf7a4a364f115963bc3d50eb7d6cdccd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:14:19 -1000 Subject: [PATCH 083/166] Cleanup debug log messages for Vulkan resources --- src/runtime/vulkan_resources.h | 70 ++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 3b7b331a795b..ef9529481972 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -42,7 +42,7 @@ namespace { // internalize VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_create_command_pool (user_context: " << user_context << ", " + << " vk_create_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "queue_index: " << queue_index << ")\n"; @@ -50,24 +50,23 @@ VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *alloc VkCommandPoolCreateInfo command_pool_info = { - VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type - nullptr, // pointer to struct extending this - 0, // flags. may consider VK_COMMAND_POOL_CREATE_TRANSIENT_BIT - queue_index // queue family index corresponding to the compute command queue + VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, // flags. 
Assume transient short-lived single-use command buffers + queue_index // queue family index corresponding to the compute command queue }; return vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); } -VkResult vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool) { +void vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_destroy_command_pool (user_context: " << user_context << ", " + << " vk_destroy_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "command_pool: " << (void *)command_pool << ")\n"; #endif vkDestroyCommandPool(allocator->current_device(), command_pool, allocator->callbacks()); - return VK_SUCCESS; } // -- @@ -75,7 +74,7 @@ VkResult vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allo VkResult vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer *command_buffer) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_create_command_buffer (user_context: " << user_context << ", " + << " vk_create_command_buffer (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "command_pool: " << (void *)command_pool << ")\n"; @@ -92,6 +91,18 @@ VkResult vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *all return vkAllocateCommandBuffers(allocator->current_device(), &command_buffer_info, command_buffer); } +void vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer command_buffer) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << " vk_destroy_command_buffer (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "command_pool: " << (void *)command_pool << ", " + << "command_buffer: " << (void *)command_buffer << ")\n"; +#endif + vkFreeCommandBuffers(allocator->current_device(), command_pool, 1, &command_buffer); +} + VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, VkDevice device, VkCommandBuffer command_buffer, @@ -103,7 +114,7 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_fill_command_buffer_with_dispatch_call (user_context: " << user_context << ", " + << " vk_fill_command_buffer_with_dispatch_call (user_context: " << user_context << ", " << "device: " << (void *)device << ", " << "command_buffer: " << (void *)command_buffer << ", " << "pipeline_layout: " << (void *)pipeline_layout << ", " @@ -142,7 +153,7 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_submit_command_buffer (user_context: " << user_context << ", " + << " vk_submit_command_buffer (user_context: " << user_context << ", " << "queue: " << (void *)queue << ", " << "command_buffer: " << (void *)command_buffer << ")\n"; #endif @@ -211,7 +222,7 @@ VkResult vk_create_descriptor_pool(void *user_context, VkDescriptorPool 
*descriptor_pool) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_create_descriptor_pool (user_context: " << user_context << ", " + << " vk_create_descriptor_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "uniform_buffer_count: " << (uint32_t)uniform_buffer_count << ", " @@ -262,7 +273,7 @@ VkResult vk_destroy_descriptor_pool(void *user_context, VkDescriptorPool descriptor_pool) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_destroy_descriptor_pool (user_context: " << user_context << ", " + << " vk_destroy_descriptor_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; @@ -281,7 +292,7 @@ VkResult vk_create_descriptor_set_layout(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_create_descriptor_set_layout (user_context: " << user_context << ", " + << " vk_create_descriptor_set_layout (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "uniform_buffer_count: " << uniform_buffer_count << ", " @@ -306,7 +317,7 @@ VkResult vk_create_descriptor_set_layout(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) - << " [" << (uint32_t)layout_bindings.size() << "] : UNIFORM_BUFFER\n"; + << " [" << (uint32_t)layout_bindings.size() << "] : UNIFORM_BUFFER\n"; #endif layout_bindings.append(user_context, &uniform_buffer_layout); @@ -325,7 +336,7 @@ VkResult vk_create_descriptor_set_layout(void *user_context, }; #ifdef DEBUG_RUNTIME debug(user_context) - << " [" << (uint32_t)layout_bindings.size() << "] : STORAGE_BUFFER\n"; + << " [" << (uint32_t)layout_bindings.size() << "] : STORAGE_BUFFER\n"; #endif layout_bindings.append(user_context, &storage_buffer_layout); @@ -367,7 +378,7 @@ VkResult vk_create_descriptor_set(void *user_context, VkDescriptorSet *descriptor_set) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_create_descriptor_set (user_context: " << user_context << ", " + << " vk_create_descriptor_set (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "descriptor_set_layout: " << (void *)descriptor_set_layout << ", " @@ -403,7 +414,7 @@ VkResult vk_update_descriptor_set(void *user_context, VkDescriptorSet descriptor_set) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_update_descriptor_set (user_context: " << user_context << ", " + << " vk_update_descriptor_set (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "scalar_args_buffer: " << (void *)scalar_args_buffer << ", " @@ -514,7 +525,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_create_scalar_uniform_buffer (user_context: " << user_context << ", " + << " vk_create_scalar_uniform_buffer (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "scalar_buffer_size: " << (uint32_t)scalar_buffer_size << ")\n"; @@ -546,7 +557,7 @@ VkResult vk_update_scalar_uniform_buffer(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) - << 
"Vulkan: vk_update_scalar_uniform_buffer (user_context: " << user_context << ", " + << " vk_update_scalar_uniform_buffer (user_context: " << user_context << ", " << "region: " << (void *)region << ")\n"; #endif @@ -581,7 +592,7 @@ void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_destroy_scalar_uniform_buffer (user_context: " << user_context << ", " + << " vk_destroy_scalar_uniform_buffer (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "scalar_args_region: " << (void *)scalar_args_region << ")\n"; @@ -603,7 +614,7 @@ VkResult vk_create_pipeline_layout(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_create_pipeline_layout (user_context: " << user_context << ", " + << " vk_create_pipeline_layout (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "descriptor_set_count: " << descriptor_set_count << ", " @@ -635,7 +646,7 @@ VkResult vk_destroy_pipeline_layout(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_destroy_pipeline_layout (user_context: " << user_context << ", " + << " vk_destroy_pipeline_layout (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; @@ -656,7 +667,7 @@ VkResult vk_create_compute_pipeline(void *user_context, #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_create_compute_pipeline (user_context: " << user_context << ", " + << " vk_create_compute_pipeline (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "shader_module: " << (void *)shader_module << ", " @@ -697,7 +708,7 @@ VkResult vk_destroy_compute_pipeline(void *user_context, VkPipeline compute_pipeline) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_destroy_compute_pipeline (user_context: " << user_context << ", " + << " vk_destroy_compute_pipeline (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "compute_pipeline: " << (void *)compute_pipeline << ")\n"; @@ -711,7 +722,7 @@ VkResult vk_destroy_compute_pipeline(void *user_context, VulkanEntryPointData *vk_decode_entry_point_data(void *user_context, VulkanMemoryAllocator *allocator, const uint32_t *module_ptr, uint32_t module_size) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_decode_entry_point_data (user_context: " << user_context << ", " + << " vk_decode_entry_point_data (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "module_ptr: " << (void *)module_ptr << ", " << "module_size: " << module_size << ")\n"; @@ -784,7 +795,7 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan const char *ptr, int size) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_compile_shader_module (user_context: " << user_context << ", " + << " vk_compile_shader_module (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ", " << "module: " << (void *)ptr << ", " @@ -868,7 +879,7 @@ int 
vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: vk_destroy_shader_modules (user_context: " << user_context << ", " + << " vk_destroy_shader_modules (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "device: " << (void *)allocator->current_device() << ")\n"; @@ -887,6 +898,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat void operator()(VulkanCompilationCacheEntry *cache_entry) { if (cache_entry != nullptr) { if (cache_entry->shader_module) { + debug(user_context) << " destroying shader module " << (void*)cache_entry->shader_module << "\n"; vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); cache_entry->shader_module = {0}; } @@ -912,6 +924,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat } if (cache_entry->descriptor_set_layouts) { for (uint32_t n = 0; n < cache_entry->entry_point_count; n++) { + debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->entry_point_data[n].entry_point_name << "\n"; vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); cache_entry->descriptor_set_layouts[n] = {0}; } @@ -919,6 +932,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat cache_entry->descriptor_set_layouts = nullptr; } if (cache_entry->pipeline_layout) { + debug(user_context) << " destroying pipeline layout " << (void*)cache_entry->pipeline_layout << "\n"; vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); cache_entry->pipeline_layout = {0}; } From f42f063222845d6567873fa03488919ae77d4989 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:15:22 -1000 Subject: [PATCH 084/166] Assert alignment is power of two --- src/runtime/internal/memory_resources.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index f06f850cc28b..5e5c949c761d 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -106,9 +106,15 @@ struct BlockRegion { BlockResource *block_ptr = nullptr; //< pointer to parent block resource }; +// Returns true if given byte alignment is a power of two +ALWAYS_INLINE bool is_power_of_two_alignment(size_t x) { + return (x & (x - 1)) == 0; +} + // Returns an aligned byte offset to adjust the given offset based on alignment constraints // -- Alignment must be power of two! ALWAYS_INLINE size_t aligned_offset(size_t offset, size_t alignment) { + halide_abort_if_false(nullptr, is_power_of_two_alignment(alignment)); return (offset + (alignment - 1)) & ~(alignment - 1); } From fdcb345dc1a39530a71500ed76f2778347ff96e2 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:16:21 -1000 Subject: [PATCH 085/166] Only split regions that have already been freed. 
Add more debug messages to log --- src/runtime/internal/region_allocator.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 980b6c77256c..3e561b34c5a8 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -211,6 +211,19 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); size_t actual_size = aligned_size(block_region->memory.offset, request.size, actual_alignment); +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "RegionAllocator: conform_alignment (" + << " request=" << (uint32_t)request.alignment + << " required=" << (uint32_t)block->memory.properties.alignment << ") =>" + << " actual_alignment=" << (uint32_t)actual_alignment << "\n"; + + StackBasicPrinter<256>(nullptr) << "RegionAllocator: aligned_size (" + << " offset=" << (uint32_t)block_region->memory.offset + << " request_size=" << (uint32_t)request.size << " " + << " actual_alignment=" << (uint32_t)actual_alignment << ") =>" + << " actual_size=" << (uint32_t)actual_size << "\n"; +#endif + // is the adjusted size larger than the current region? if (actual_size > block_region->memory.size) { continue; } @@ -278,7 +291,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe } bool RegionAllocator::can_split(BlockRegion *block_region, size_t size) { - return (block_region && (block_region->memory.size > size)); + return (block_region && (block_region->memory.size > size) && (block_region->memory.handle == nullptr)); } BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { @@ -343,6 +356,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo << "block_region=" << (void *)(block_region) << ") ...\n"; #endif + block_region->memory.handle = nullptr; block_region->memory.offset = offset; block_region->memory.size = size; block_region->memory.properties = properties; @@ -410,6 +424,7 @@ void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_ StackBasicPrinter<256>(nullptr) << "Allocating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " + << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " << ")\n"; @@ -421,6 +436,7 @@ void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_ StackBasicPrinter<256>(nullptr) << "Re-using region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " + << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " << ")\n"; From bbf89119350425cbb086aa78025bfda73a774ccd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:19:19 -1000 Subject: [PATCH 086/166] Explicitly clean up Vulkan command buffers after they are used. Avoid recreating descriptor sets. Tidy up Vulkan debug messages. --- src/runtime/vulkan.cpp | 73 ++++++++++++++++++++++------------ src/runtime/vulkan_context.h | 36 ++++++++++---------
src/runtime/vulkan_functions.h | 1 + src/runtime/vulkan_internal.h | 6 ++- src/runtime/vulkan_memory.h | 24 +++++++---- 5 files changed, 87 insertions(+), 53 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 75f9f81ef26a..ad1530b7846d 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -81,6 +81,10 @@ WEAK int halide_vulkan_release_context(void *user_context, VkInstance instance, } WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_buffer) { + debug(user_context) + << "halide_vulkan_device_free (user_context: " << user_context + << ", halide_buffer: " << halide_buffer << ")\n"; + // halide_vulkan_device_free, at present, can be exposed to clients and they // should be allowed to call halide_vulkan_device_free on any halide_buffer_t // including ones that have never been used with a GPU. @@ -117,7 +121,7 @@ WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_b WEAK int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, const char *src, int size) { debug(user_context) - << "Vulkan: halide_vulkan_init_kernels (user_context: " << user_context + << "halide_vulkan_init_kernels (user_context: " << user_context << ", state_ptr: " << state_ptr << ", program: " << (void *)src << ", size: " << size << "\n"; @@ -149,7 +153,7 @@ WEAK int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, WEAK void halide_vulkan_finalize_kernels(void *user_context, void *state_ptr) { debug(user_context) - << "Vulkan: halide_vulkan_finalize_kernels (user_context: " << user_context + << "halide_vulkan_finalize_kernels (user_context: " << user_context << ", state_ptr: " << state_ptr << "\n"; #ifdef DEBUG_RUNTIME @@ -169,7 +173,7 @@ WEAK void halide_vulkan_finalize_kernels(void *user_context, void *state_ptr) { // Used to generate correct timings when tracing WEAK int halide_vulkan_device_sync(void *user_context, halide_buffer_t *) { - debug(user_context) << "Vulkan: halide_vulkan_device_sync (user_context: " << user_context << ")\n"; + debug(user_context) << "halide_vulkan_device_sync (user_context: " << user_context << ")\n"; VulkanContext ctx(user_context); halide_debug_assert(user_context, ctx.error == VK_SUCCESS); @@ -190,7 +194,7 @@ WEAK int halide_vulkan_device_sync(void *user_context, halide_buffer_t *) { WEAK int halide_vulkan_device_release(void *user_context) { debug(user_context) - << "Vulkan: halide_vulkan_device_release (user_context: " << user_context << ")\n"; + << "halide_vulkan_device_release (user_context: " << user_context << ")\n"; VulkanMemoryAllocator *allocator; VkInstance instance; @@ -208,6 +212,13 @@ WEAK int halide_vulkan_device_release(void *user_context) { if (instance != nullptr) { vkQueueWaitIdle(queue); + if(command_pool == cached_command_pool) { + cached_command_pool = 0; + } + if(reinterpret_cast(allocator) == cached_allocator) { + cached_allocator = nullptr; + } + vk_destroy_command_pool(user_context, allocator, command_pool); vk_destroy_shader_modules(user_context, allocator); vk_destroy_memory_allocator(user_context, allocator); @@ -216,6 +227,7 @@ WEAK int halide_vulkan_device_release(void *user_context) { cached_device = nullptr; cached_physical_device = nullptr; cached_queue = nullptr; + cached_queue_family_index = 0; } vkDestroyDevice(device, nullptr); @@ -346,6 +358,8 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { return result; } + vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, 
command_buffer); + #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; @@ -361,7 +375,7 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid } debug(user_context) - << "Vulkan: halide_vulkan_copy_to_device (user_context: " << user_context + << "halide_vulkan_copy_to_device (user_context: " << user_context << ", halide_buffer: " << halide_buffer << ")\n"; // Acquire the context so we can use the command queue. @@ -496,6 +510,8 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid ctx.allocator->reclaim(user_context, staging_region); } + vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; @@ -508,7 +524,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: halide_copy_to_host (user_context: " << user_context + << "halide_copy_to_host (user_context: " << user_context << ", halide_buffer: " << halide_buffer << ")\n"; #endif @@ -565,7 +581,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ VkCommandBuffer command_buffer; VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_command_buffer returned: " << vk_get_error_name(result) << "\n"; + error(user_context) << "vk_create_command_buffer returned: " << vk_get_error_name(result) << "\n"; return -1; } @@ -645,6 +661,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ } else { ctx.allocator->reclaim(user_context, staging_region); } + vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -665,7 +682,7 @@ WEAK int halide_vulkan_run(void *user_context, int8_t arg_is_buffer[]) { #ifdef DEBUG_RUNTIME debug(user_context) - << "Vulkan: halide_vulkan_run (user_context: " << user_context << ", " + << "halide_vulkan_run (user_context: " << user_context << ", " << "entry: " << entry_name << ", " << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", " << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", " @@ -733,7 +750,7 @@ WEAK int halide_vulkan_run(void *user_context, error(user_context) << "Vulkan: Failed to locate shader entry point! Unable to proceed!\n"; return halide_error_code_internal_error; } - debug(user_context) << " found entry point [" + debug(user_context) << " found entry point [" << (entry_point_index + 1) << " of " << cache_entry->entry_point_count << "] '" << entry_name << "'\n"; @@ -742,19 +759,22 @@ WEAK int halide_vulkan_run(void *user_context, //// 1. 
Create all descriptor set layouts for (uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { - uint32_t uniform_buffer_count = cache_entry->entry_point_data[n].uniform_buffer_count; - uint32_t storage_buffer_count = cache_entry->entry_point_data[n].storage_buffer_count; - VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(cache_entry->descriptor_set_layouts[n])); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; - return result; + if(((void*)cache_entry->descriptor_set_layouts[n]) == nullptr) { + uint32_t uniform_buffer_count = cache_entry->entry_point_data[n].uniform_buffer_count; + uint32_t storage_buffer_count = cache_entry->entry_point_data[n].storage_buffer_count; + debug(user_context) << " creating descriptor set layout [" << n << "] " << cache_entry->entry_point_data[n].entry_point_name << "\n"; + VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(cache_entry->descriptor_set_layouts[n])); + if (result != VK_SUCCESS) { + error(user_context) << "vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; + return result; + } } } ///// 2. Create the pipeline layout VkResult result = vk_create_pipeline_layout(user_context, ctx.allocator, cache_entry->entry_point_count, cache_entry->descriptor_set_layouts, &(cache_entry->pipeline_layout)); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_pipeline_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; + error(user_context) << "vk_create_pipeline_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; return halide_error_code_internal_error; } } @@ -768,7 +788,7 @@ WEAK int halide_vulkan_run(void *user_context, if (scalar_buffer_size > 0) { entry_point_data->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); if (entry_point_data->args_region == nullptr) { - error(user_context) << "Vulkan: vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; + error(user_context) << "vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; return halide_error_code_internal_error; } } @@ -778,7 +798,7 @@ WEAK int halide_vulkan_run(void *user_context, if (entry_point_data->compute_pipeline == 0) { VkResult result = vk_create_compute_pipeline(user_context, ctx.allocator, entry_name, cache_entry->shader_module, cache_entry->pipeline_layout, &(entry_point_data->compute_pipeline)); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + error(user_context) << "vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return halide_error_code_internal_error; } } @@ -795,14 +815,14 @@ WEAK int halide_vulkan_run(void *user_context, uint32_t storage_buffer_count = entry_point_data->storage_buffer_count; VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(entry_point_data->descriptor_pool)); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! 
Error: " << vk_get_error_name(result) << "\n"; + error(user_context) << "vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } // Create the descriptor set result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layouts[entry_point_index], entry_point_data->descriptor_pool, &(entry_point_data->descriptor_set)); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + error(user_context) << "vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } } @@ -812,7 +832,7 @@ WEAK int halide_vulkan_run(void *user_context, if ((entry_point_data->args_region != nullptr) && entry_point_data->uniform_buffer_count) { VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, entry_point_data->args_region, arg_sizes, args, arg_is_buffer); if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + debug(user_context) << "vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } @@ -825,7 +845,7 @@ WEAK int halide_vulkan_run(void *user_context, VkResult result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_data->uniform_buffer_count, entry_point_data->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_data->descriptor_set); if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: vk_update_descriptor_set() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + debug(user_context) << "vk_update_descriptor_set() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } @@ -833,7 +853,7 @@ WEAK int halide_vulkan_run(void *user_context, VkCommandBuffer command_buffer; result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: vk_create_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + debug(user_context) << "vk_create_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } @@ -846,14 +866,14 @@ WEAK int halide_vulkan_run(void *user_context, entry_point_index, blocksX, blocksY, blocksZ); if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: vk_fill_command_buffer_with_dispatch_call() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + debug(user_context) << "vk_fill_command_buffer_with_dispatch_call() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } //// 8. Submit the command buffer to our command queue result = vk_submit_command_buffer(user_context, ctx.queue, command_buffer); if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: vk_submit_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + debug(user_context) << "vk_submit_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } @@ -865,10 +885,11 @@ WEAK int halide_vulkan_run(void *user_context, } //// 10. 
Cleanup + vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); vkResetCommandPool(ctx.device, ctx.command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); #ifdef DEBUG_RUNTIME - debug(user_context) << "Vulkan: blocks_allocated=" + debug(user_context) << "halide_vulkan_run: blocks_allocated=" << (uint32_t)ctx.allocator->blocks_allocated() << " " << "bytes_allocated_for_blocks=" << (uint32_t)ctx.allocator->bytes_allocated_for_blocks() << " " << "regions_allocated=" << (uint32_t)ctx.allocator->regions_allocated() << " " diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 1e9985561c83..348470438cb5 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -86,7 +86,7 @@ namespace { // Initializes the instance (used by the default vk_create_context) int vk_create_instance(void *user_context, const StringTable &requested_layers, VkInstance *instance, const VkAllocationCallbacks *alloc_callbacks) { - debug(user_context) << " vk_create_instance (user_context: " << user_context << ")\n"; + debug(user_context) << " vk_create_instance (user_context: " << user_context << ")\n"; StringTable required_instance_extensions; vk_get_required_instance_extensions(user_context, required_instance_extensions); @@ -97,9 +97,9 @@ int vk_create_instance(void *user_context, const StringTable &requested_layers, bool valid_instance = vk_validate_required_extension_support(user_context, required_instance_extensions, supported_instance_extensions); halide_abort_if_false(user_context, valid_instance); - debug(user_context) << "Vulkan: Found " << (uint32_t)required_instance_extensions.size() << " required extensions for instance!\n"; + debug(user_context) << " found " << (uint32_t)required_instance_extensions.size() << " required extensions for instance!\n"; for (int n = 0; n < (int)required_instance_extensions.size(); ++n) { - debug(user_context) << " extension: " << required_instance_extensions[n] << "\n"; + debug(user_context) << " extension: " << required_instance_extensions[n] << "\n"; } // If we're running under Molten VK, we must enable the portability extension and create flags @@ -219,7 +219,7 @@ int vk_select_device_for_context(void *user_context, int vk_create_device(void *user_context, const StringTable &requested_layers, VkInstance *instance, VkDevice *device, VkQueue *queue, VkPhysicalDevice *physical_device, uint32_t *queue_family_index, const VkAllocationCallbacks *alloc_callbacks) { - + debug(user_context) << " vk_create_device (user_context=" << user_context << ")\n"; StringTable required_device_extensions; vk_get_required_device_extensions(user_context, required_device_extensions); @@ -232,16 +232,16 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk bool valid_device = vk_validate_required_extension_support(user_context, required_device_extensions, supported_device_extensions); halide_abort_if_false(user_context, valid_device); - debug(user_context) << "Vulkan: Found " << (uint32_t)required_device_extensions.size() << " required extensions for device!\n"; + debug(user_context) << " found " << (uint32_t)required_device_extensions.size() << " required extensions for device!\n"; for (int n = 0; n < (int)required_device_extensions.size(); ++n) { - debug(user_context) << " required extension: " << required_device_extensions[n] << "\n"; + debug(user_context) << " required extension: " << required_device_extensions[n] << "\n"; } // enable all available optional extensions - 
debug(user_context) << "Vulkan: Checking for " << (uint32_t)optional_device_extensions.size() << " optional extensions for device ...\n"; + debug(user_context) << " checking for " << (uint32_t)optional_device_extensions.size() << " optional extensions for device ...\n"; for (int n = 0; n < (int)optional_device_extensions.size(); ++n) { if (supported_device_extensions.contains(optional_device_extensions[n])) { - debug(user_context) << " optional extension: " << optional_device_extensions[n] << "\n"; + debug(user_context) << " optional extension: " << optional_device_extensions[n] << "\n"; required_device_extensions.append(user_context, optional_device_extensions[n]); } } @@ -262,11 +262,11 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk void *extended_features_ptr = nullptr; void *standard_features_ptr = nullptr; - debug(user_context) << "Vulkan: Querying for device features...\n"; + debug(user_context) << " querying for device features...\n"; vkGetPhysicalDeviceFeatures(*physical_device, &device_features); - debug(user_context) << "Vulkan: Shader Float64 support: " << (device_features.shaderFloat64 ? "true" : "false") << "...\n"; - debug(user_context) << "Vulkan: Shader Int64 support: " << (device_features.shaderInt64 ? "true" : "false") << "...\n"; - debug(user_context) << "Vulkan: Shader Int16 support: " << (device_features.shaderInt16 ? "true" : "false") << "...\n"; + debug(user_context) << " shader float64 support: " << (device_features.shaderFloat64 ? "true" : "false") << "...\n"; + debug(user_context) << " shader int64 support: " << (device_features.shaderInt64 ? "true" : "false") << "...\n"; + debug(user_context) << " shader int16 support: " << (device_features.shaderInt16 ? "true" : "false") << "...\n"; // If the instance runtime supports querying extended device features, request them VkPhysicalDeviceShaderFloat16Int8FeaturesKHR shader_f16_i8_ext = { @@ -285,10 +285,10 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk } if (vkGetPhysicalDeviceFeatures2KHR) { - debug(user_context) << "Vulkan: Querying for extended device features...\n"; + debug(user_context) << " qerying for extended device features...\n"; vkGetPhysicalDeviceFeatures2KHR(*physical_device, &device_features_ext); - debug(user_context) << "Vulkan: Shader Int8 support: " << (shader_f16_i8_ext.shaderInt8 ? "true" : "false") << "...\n"; - debug(user_context) << "Vulkan: Shader Float16 support: " << (shader_f16_i8_ext.shaderFloat16 ? "true" : "false") << "...\n"; + debug(user_context) << " shader int8 support: " << (shader_f16_i8_ext.shaderInt8 ? "true" : "false") << "...\n"; + debug(user_context) << " shader float16 support: " << (shader_f16_i8_ext.shaderFloat16 ? 
"true" : "false") << "...\n"; extended_features_ptr = (void *)(&device_features_ext); // pass v1.1 extended features (which also contains the standard features) } else { standard_features_ptr = &device_features; // pass v1.0 standard features @@ -320,13 +320,13 @@ int vk_create_context(void *user_context, VulkanMemoryAllocator **allocator, VkInstance *instance, VkDevice *device, VkPhysicalDevice *physical_device, VkCommandPool *command_pool, VkQueue *queue, uint32_t *queue_family_index) { - debug(user_context) << " vk_create_context (user_context: " << user_context << ")\n"; + debug(user_context) << " vk_create_context (user_context: " << user_context << ")\n"; StringTable requested_layers; uint32_t requested_layer_count = vk_get_requested_layers(user_context, requested_layers); - debug(user_context) << "Vulkan: Requested " << requested_layer_count << " layers for instance!\n"; + debug(user_context) << " requested " << requested_layer_count << " layers for instance!\n"; for (int n = 0; n < (int)requested_layer_count; ++n) { - debug(user_context) << " layer: " << requested_layers[n] << "\n"; + debug(user_context) << " layer: " << requested_layers[n] << "\n"; } const VkAllocationCallbacks *alloc_callbacks = halide_vulkan_get_allocation_callbacks(user_context); diff --git a/src/runtime/vulkan_functions.h b/src/runtime/vulkan_functions.h index dcf6701dbd54..d1c0a8bfd32c 100644 --- a/src/runtime/vulkan_functions.h +++ b/src/runtime/vulkan_functions.h @@ -33,6 +33,7 @@ VULKAN_FN(vkUpdateDescriptorSets) VULKAN_FN(vkCreateCommandPool) VULKAN_FN(vkResetCommandPool) VULKAN_FN(vkAllocateCommandBuffers) +VULKAN_FN(vkFreeCommandBuffers) VULKAN_FN(vkBeginCommandBuffer) VULKAN_FN(vkCmdBindPipeline) VULKAN_FN(vkCmdBindDescriptorSets) diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 696cdd9dd871..7a109c8068e4 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -14,6 +14,9 @@ #include "vulkan_interface.h" +#ifndef DEBUG_RUNTIME +#define DEBUG_RUNTIME +#endif // -- namespace Halide { @@ -82,10 +85,11 @@ bool vk_validate_required_extension_support(void *user_context, // -- Command Pool VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool); -VkResult vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool); +void vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool); // -- Command Buffer VkResult vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool pool, VkCommandBuffer *command_buffer); +void vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer command_buffer); VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, VkDevice device, diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index e6907a0eb0ce..786b10b7c7dd 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -23,7 +23,7 @@ WEAK const VkAllocationCallbacks *custom_allocation_callbacks = nullptr; // nul // Runtime configuration parameters to adjust the behaviour of the block allocator struct VulkanMemoryConfig { size_t minimum_block_size = 32 * 1024 * 1024; // 32MB - size_t maximum_block_size = 0; //< zero means no contraint + size_t maximum_block_size = 0; //< zero means no constraint size_t maximum_block_count = 0; //< zero means no 
constraint }; WEAK VulkanMemoryConfig memory_allocator_config; @@ -188,9 +188,11 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Mapping region (" << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << ") ...\n"; + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -228,9 +230,11 @@ void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Unmapping region (" << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << ") ...\n"; + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -260,7 +264,9 @@ void VulkanMemoryAllocator::release(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Releasing region (" << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -272,7 +278,9 @@ void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reclaiming region (" << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); @@ -293,7 +301,7 @@ bool VulkanMemoryAllocator::collect(void *user_context) { void VulkanMemoryAllocator::release(void *user_context) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Releasing (" + debug(nullptr) << "VulkanMemoryAllocator: Releasing block allocator (" << "user_context=" << user_context << ") ... 
\n"; #endif halide_abort_if_false(user_context, device != nullptr); From 74b917666609691a91d12046a646a12205a2049d Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:21:45 -1000 Subject: [PATCH 087/166] Fix Div, Mod, and div_round_to_zero for integer cases Cleanup reset method --- src/CodeGen_Vulkan_Dev.cpp | 164 +++++++++++++++++++++++++------------ 1 file changed, 112 insertions(+), 52 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 9eb0aa41b58c..cf23b0c34253 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -20,8 +20,6 @@ namespace Halide { namespace Internal { -class CodeGen_LLVM; - namespace { // anonymous // -- @@ -115,6 +113,30 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void visit(const Acquire *) override; void visit(const Atomic *) override; + void reset(); + + + // Top-level function for adding kernels + void add_kernel(const Stmt &s, const std::string &name, const std::vector &args); + void init_module(); + void compile(std::vector &binary); + void dump() const; + + // Encode the descriptor sets into a sidecar which will be added + // as a header to the module prior to the actual SPIR-V binary + void encode_header(SpvBinary &spirv_header); + + // Scalarize expressions + void scalarize(const Expr &e); + SpvId map_type_to_pair(const Type &t); + + // Workgroup size + void reset_workgroup_size(); + void declare_workgroup_size(SpvId kernel_func_id); + void declare_entry_point(const Stmt &s, SpvId kernel_func_id); + void declare_device_args(const Stmt &s, uint32_t entry_point_index, const std::string &kernel_name, const std::vector &args); + + // Common operator visitors void visit_unary_op(SpvOp op_code, Type t, const Expr &a); void visit_binary_op(SpvOp op_code, Type t, const Expr &a, const Expr &b); void visit_glsl_op(SpvId glsl_op_code, Type t, const std::vector &args); @@ -130,8 +152,12 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { SpvId cast_type(Type target_type, Type value_type, SpvId value_id); SpvId convert_to_bool(Type target_type, Type value_type, SpvId value_id); - using BuiltinMap = std::unordered_map; + // Returns Phi node inputs. 
+ template + SpvFactory::BlockVariables emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case); + // Map from Halide built-in names to extended GLSL intrinsics for SPIR-V + using BuiltinMap = std::unordered_map; const BuiltinMap glsl_builtin = { {"pow_f16", GLSLstd450Pow}, {"pow_f32", GLSLstd450Pow}, @@ -187,22 +213,6 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // The SPIRV-IR builder SpvBuilder builder; - // Top-level function for adding kernels - void add_kernel(const Stmt &s, const std::string &name, const std::vector &args); - void init_module(); - void compile(std::vector &binary); - void dump() const; - - // Scalarize expressions - void scalarize(const Expr &e); - SpvId map_type_to_pair(const Type &t); - - // Workgroup size - void reset_workgroup_size(); - void declare_workgroup_size(SpvId kernel_func_id); - void declare_entry_point(const Stmt &s, SpvId kernel_func_id); - void declare_device_args(const Stmt &s, uint32_t entry_point_index, const std::string &kernel_name, const std::vector &args); - // The scope contains both the symbol id and its storage class using SymbolIdStorageClassPair = std::pair; using SymbolScope = Scope; @@ -228,16 +238,11 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { using DescriptorSetTable = std::vector; DescriptorSetTable descriptor_set_table; - // Encode the descriptor sets into a sidecar which will be added - // as a header to the module prior to the actual SPIR-V binary - void encode_header(SpvBinary &spirv_header); - // The workgroup size. May vary between kernels. uint32_t workgroup_size[3]; - // Returns Phi node inputs. - template - SpvFactory::BlockVariables emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case); + // Current index of kernel for module + uint32_t kernel_index = 0; } emitter; @@ -644,23 +649,53 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mul *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Div): " << op->type << " ((" << op->a << ") / (" << op->b << "))\n"; user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n"; - - if (op->type.is_float()) { - visit_binary_op(SpvOpFDiv, op->type, op->a, op->b); - } else { - Expr e = lower_int_uint_div(op->a, op->b); + int bits = 0; + if (is_const_power_of_two_integer(op->b, &bits)) { + SpvId shift_amount_id = builder.declare_constant(Int(32), &bits); + SpvId type_id = builder.declare_type(op->type); + op->a.accept(this); + SpvId src_a_id = builder.current_id(); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(SpvOpShiftRightArithmetic, type_id, result_id, src_a_id, shift_amount_id)); + builder.update_id(result_id); + } else if (op->type.is_int()) { + Expr e = lower_euclidean_div(op->a, op->b); e.accept(this); + } else { + if (op->type.is_float()) { + visit_binary_op(SpvOpFDiv, op->type, op->a, op->b); + } else if(op->type.is_uint()) { + visit_binary_op(SpvOpUDiv, op->type, op->a, op->b); + } else { + internal_error << "Failed to find a suitable Div operator for type: " << op->type << "\n"; + } } } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; - if (op->type.is_float()) { - // SPIR-V FMod is strangely not what we want .. 
FRem does what we need - visit_binary_op(SpvOpFRem, op->type, op->a, op->b); - } else { - Expr e = lower_int_uint_mod(op->a, op->b); + int bits = 0; + if (is_const_power_of_two_integer(op->b, &bits)) { + int bitwise_value = ((1 << bits) - 1); + SpvId bitwise_value_id = builder.declare_constant(Int(32), &bitwise_value); + SpvId type_id = builder.declare_type(op->type); + op->a.accept(this); + SpvId src_a_id = builder.current_id(); + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(SpvOpBitwiseAnd, type_id, result_id, src_a_id, bitwise_value_id)); + builder.update_id(result_id); + } else if (op->type.is_int()) { + Expr e = lower_euclidean_mod(op->a, op->b); e.accept(this); + } else { + if (op->type.is_float()) { + // SPIR-V FMod is strangely not what we want .. FRem does what we need + visit_binary_op(SpvOpFRem, op->type, op->a, op->b); + } else if(op->type.is_uint()) { + visit_binary_op(SpvOpUMod, op->type, op->a, op->b); + } else { + internal_error << "Failed to find a suitable Mod operator for type: " << op->type << "\n"; + } } } @@ -962,6 +997,13 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } } else if (op->is_intrinsic(Call::IntrinsicOp::div_round_to_zero)) { internal_assert(op->args.size() == 2); + // See if we can rewrite it to something faster (e.g. a shift) + Expr e = lower_int_uint_div(op->args[0], op->args[1], /** round to zero */ true); + if (!e.as()) { + e.accept(this); + return; + } + SpvOp op_code = SpvOpNop; if (op->type.is_float()) { op_code = SpvOpFDiv; @@ -1538,7 +1580,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { builder.append(SpvFactory::integer_add(index_type_id, max_id, min_id, extent_id)); // Declare loop var - const std::string loop_var_name = unique_name("_loop_idx"); + const std::string loop_var_name = unique_name(std::string("k") + std::to_string(kernel_index) + "_loop_idx"); debug(2) << " loop_index=" << loop_var_name << " type=" << index_type << "\n"; SpvId loop_var_id = builder.declare_variable(loop_var_name, index_var_type_id, storage_class); symbol_table.push(loop_var_name, {loop_var_id, storage_class}); @@ -1671,14 +1713,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { int32_t size = op->constant_allocation_size(); array_type_id = builder.declare_type(op->type, size); storage_class = SpvStorageClassWorkgroup; // shared across workgroup - debug(2) << "Vulkan: Allocate " << op->name << "[" << (uint32_t)size << "] in shared memory on device in global scope\n"; + debug(2) << "Vulkan: Allocate " << op->name << " type=" << op->type << " size=" << (uint32_t)size << " in shared memory on device in global scope\n"; + std::string variable_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + op->name; SpvId ptr_type_id = builder.declare_pointer_type(array_type_id, storage_class); - variable_id = builder.declare_global_variable(op->name, ptr_type_id, storage_class); + variable_id = builder.declare_global_variable(variable_name, ptr_type_id, storage_class); } else { // Allocation is not a shared memory allocation, just make a local declaration. - debug(2) << "Vulkan: Allocate " << op->name << " on device in function scope\n"; int32_t size = op->constant_allocation_size(); // It must have a constant size. @@ -1686,10 +1728,13 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { << "Allocation " << op->name << " has a dynamic size. 
" << "Only fixed-size allocations are supported with Vulkan."; + debug(2) << "Vulkan: Allocate " << op->name << " type=" << op->type << " size=" << (uint32_t)size << " on device in function scope\n"; + array_type_id = builder.declare_type(op->type, size); storage_class = SpvStorageClassFunction; // function scope + std::string variable_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + op->name; SpvId ptr_type_id = builder.declare_pointer_type(array_type_id, storage_class); - variable_id = builder.declare_variable(op->name, ptr_type_id, storage_class); + variable_id = builder.declare_variable(variable_name, ptr_type_id, storage_class); } StorageAccess access; @@ -2032,9 +2077,19 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::join_vector(Type type, const SpvFactory return result_id; } +void CodeGen_Vulkan_Dev::SPIRV_Emitter::reset() { + kernel_index = 0; + builder.reset(); + SymbolScope empty; + symbol_table.swap(empty); + storage_access_map.clear(); + descriptor_set_table.clear(); + reset_workgroup_size(); +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { - builder.reset(); + reset(); // NOTE: Source language is irrelevant. We encode the binary directly builder.set_source_language(SpvSourceLanguageUnknown); @@ -2222,7 +2277,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 // Add a binding for a uniform buffer packed with all scalar args uint32_t binding_counter = 0; if (!param_struct_members.empty()) { - const std::string struct_name = std::string("_struct") + entry_point_name + std::string("_args"); + + const std::string struct_name = std::string("k") + std::to_string(kernel_index) + std::string("_args_struct"); SpvId param_struct_type_id = builder.declare_struct(struct_name, param_struct_members); // Add a decoration describing the offset for each parameter struct member @@ -2241,7 +2297,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 builder.add_annotation(param_struct_type_id, SpvDecorationBlock); // Add a variable for the parameter pack - const std::string param_pack_var_name = std::string("_var") + entry_point_name + std::string("_args"); + const std::string param_pack_var_name = std::string("k") + std::to_string(kernel_index) + std::string("_args_var"); SpvId param_pack_ptr_type_id = builder.declare_pointer_type(param_struct_type_id, SpvStorageClassUniform); SpvId param_pack_var_id = builder.declare_global_variable(param_pack_var_name, param_pack_ptr_type_id, SpvStorageClassUniform); @@ -2293,13 +2349,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 // Wrap the runtime array in a struct (required with SPIR-V buffer block semantics) SpvBuilder::StructMemberTypes struct_member_types = {runtime_arr_type_id}; - const std::string struct_name = std::string("_struct") + entry_point_name + std::string("_b") + std::to_string(binding_counter); + const std::string struct_name = std::string("k") + std::to_string(kernel_index) + std::string("_buffer_block") + std::to_string(binding_counter); SpvId struct_type_id = builder.declare_struct(struct_name, struct_member_types); // Declare a pointer to the struct as a global variable SpvStorageClass storage_class = SpvStorageClassUniform; SpvId ptr_struct_type_id = builder.declare_pointer_type(struct_type_id, storage_class); - SpvId buffer_block_var_id = builder.declare_global_variable(arg.name, ptr_struct_type_id, storage_class); + const std::string buffer_block_var_name = std::string("k") + 
std::to_string(kernel_index) + std::string("_") + arg.name; + SpvId buffer_block_var_id = builder.declare_global_variable(buffer_block_var_name, ptr_struct_type_id, storage_class); // Annotate the struct to indicate it's passed in a GLSL-style buffer block builder.add_annotation(struct_type_id, SpvDecorationBufferBlock); @@ -2357,19 +2414,22 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, - const std::string &name, + const std::string &kernel_name, const std::vector &args) { - debug(2) << "Adding Vulkan kernel " << name << "\n"; - + debug(2) << "Adding Vulkan kernel " << kernel_name << "\n"; + // Add function definition // TODO: can we use one of the function control annotations? // We'll discover the workgroup size as we traverse the kernel reset_workgroup_size(); + // Update the kernel index for the module + kernel_index++; + // Declare the kernel function SpvId void_type_id = builder.declare_void_type(); - SpvId kernel_func_id = builder.add_function(name, void_type_id); + SpvId kernel_func_id = builder.add_function(kernel_name, void_type_id); SpvFunction kernel_func = builder.lookup_function(kernel_func_id); uint32_t entry_point_index = builder.current_module().entry_point_count(); builder.enter_function(kernel_func); @@ -2378,7 +2438,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, declare_entry_point(s, kernel_func_id); // Declare all parameters -- scalar args and device buffers - declare_device_args(s, entry_point_index, name, args); + declare_device_args(s, entry_point_index, kernel_name, args); // Traverse s.accept(this); @@ -2393,9 +2453,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, for (const auto &arg : args) { symbol_table.pop(arg.name); } - storage_access_map.clear(); builder.leave_block(); builder.leave_function(); + storage_access_map.clear(); } void CodeGen_Vulkan_Dev::SPIRV_Emitter::dump() const { From 11050ef4411fb128127a69ead926705bb43ac9bd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:22:57 -1000 Subject: [PATCH 088/166] Skip Vulkan for async_copy_chain --- test/correctness/async_copy_chain.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/correctness/async_copy_chain.cpp b/test/correctness/async_copy_chain.cpp index b573780b9a75..dadd3ffd45f3 100644 --- a/test/correctness/async_copy_chain.cpp +++ b/test/correctness/async_copy_chain.cpp @@ -25,14 +25,14 @@ int main(int argc, char **argv) { return 0; } - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute as it does not support copy_to_host/device() yet" + if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { + printf("[SKIP] Skipping test for Vulkan as it does not support copy_to_host/device() yet" " (halide_buffer_copy is unimplemented in that backend).\n"); return 0; } - if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { - printf("Skipping test for Vulkan as it does not support copy_to_host/device() yet" + if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { + printf("Skipping test for OpenGLCompute as it does not support copy_to_host/device() yet" " (halide_buffer_copy is unimplemented in that backend).\n"); return 0; } From fd155a93e6823323843b76be404188c410943830 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:23:34 -1000 Subject: [PATCH 089/166] Skip 64-bit values on Vulkan 
since they are optionally supported --- test/correctness/gpu_mixed_shared_mem_types.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/correctness/gpu_mixed_shared_mem_types.cpp b/test/correctness/gpu_mixed_shared_mem_types.cpp index ff4b343eabf4..9126a31ac266 100644 --- a/test/correctness/gpu_mixed_shared_mem_types.cpp +++ b/test/correctness/gpu_mixed_shared_mem_types.cpp @@ -48,6 +48,7 @@ int main(int argc, char **argv) { // Metal does not support 64-bit integers. // neither does D3D12 under SM 5.1. if ((t.supports_device_api(DeviceAPI::Metal) || + t.supports_device_api(DeviceAPI::Vulkan) || t.supports_device_api(DeviceAPI::OpenGLCompute) || t.supports_device_api(DeviceAPI::D3D12Compute)) && types[i].bits() >= 64) { From a5dea74610c882ffe37cdc4615a4a4e41f629ab1 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:37:26 -1000 Subject: [PATCH 090/166] Skip interleave_rgb for Vulkan (which doesn't support cropping) --- test/correctness/interleave_rgb.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/correctness/interleave_rgb.cpp b/test/correctness/interleave_rgb.cpp index 371b6d0acd2f..adf426ded42c 100644 --- a/test/correctness/interleave_rgb.cpp +++ b/test/correctness/interleave_rgb.cpp @@ -103,6 +103,12 @@ bool test_deinterleave(int x_stride) { } int main(int argc, char **argv) { + + if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { + printf("[SKIP] Skipping test for Vulkan as it does not support cropping yet!\n"); + return 0; + } + for (int x_stride : {3, 4}) { if (!test_interleave(x_stride)) return -1; if (!test_interleave(x_stride)) return -1; From 840036b68a4162abe4166b3f0f94e1207cb14f12 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:37:58 -1000 Subject: [PATCH 091/166] Skip interpreter for Vulkan (which doesn't support dynamic allocation of shared mem). --- test/correctness/interpreter.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/correctness/interpreter.cpp b/test/correctness/interpreter.cpp index 11751fb70ba2..7623b4b70463 100644 --- a/test/correctness/interpreter.cpp +++ b/test/correctness/interpreter.cpp @@ -8,12 +8,16 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.os == Target::Windows && (target.has_feature(Target::OpenCL) || - target.has_feature(Target::Vulkan) || target.has_feature(Target::D3D12Compute))) { printf("[SKIP] workaround for issue #5738\n"); return 0; } + if(target.has_feature(Target::Vulkan)) { + printf("[SKIP] Skipping test for Vulkan (which doesn't support dynamically allocated shared mem)!\n"); + return 0; + } + // This test demonstrates a trick for writing interpreters in // Halide, and as a side-effect tests our ability to correctly // emit switch statements. 
From d486f5a702b66e943f2a91841aab420afe66e3d1 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 08:43:52 -1000 Subject: [PATCH 092/166] Clang Tidy/Format pass --- src/CodeGen_Vulkan_Dev.cpp | 13 ++++++------- src/runtime/internal/region_allocator.h | 14 +++++++------- src/runtime/vulkan.cpp | 10 +++++----- src/runtime/vulkan_resources.h | 12 ++++++------ test/correctness/interpreter.cpp | 2 +- 5 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index cf23b0c34253..9d05c96281f0 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -115,7 +115,6 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { void reset(); - // Top-level function for adding kernels void add_kernel(const Stmt &s, const std::string &name, const std::vector &args); void init_module(); @@ -649,7 +648,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mul *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Div): " << op->type << " ((" << op->a << ") / (" << op->b << "))\n"; user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n"; - int bits = 0; + int bits = 0; if (is_const_power_of_two_integer(op->b, &bits)) { SpvId shift_amount_id = builder.declare_constant(Int(32), &bits); SpvId type_id = builder.declare_type(op->type); @@ -664,7 +663,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { } else { if (op->type.is_float()) { visit_binary_op(SpvOpFDiv, op->type, op->a, op->b); - } else if(op->type.is_uint()) { + } else if (op->type.is_uint()) { visit_binary_op(SpvOpUDiv, op->type, op->a, op->b); } else { internal_error << "Failed to find a suitable Div operator for type: " << op->type << "\n"; @@ -674,7 +673,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; - int bits = 0; + int bits = 0; if (is_const_power_of_two_integer(op->b, &bits)) { int bitwise_value = ((1 << bits) - 1); SpvId bitwise_value_id = builder.declare_constant(Int(32), &bitwise_value); @@ -691,7 +690,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { if (op->type.is_float()) { // SPIR-V FMod is strangely not what we want .. FRem does what we need visit_binary_op(SpvOpFRem, op->type, op->a, op->b); - } else if(op->type.is_uint()) { + } else if (op->type.is_uint()) { visit_binary_op(SpvOpUMod, op->type, op->a, op->b); } else { internal_error << "Failed to find a suitable Mod operator for type: " << op->type << "\n"; @@ -2417,7 +2416,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, const std::string &kernel_name, const std::vector &args) { debug(2) << "Adding Vulkan kernel " << kernel_name << "\n"; - + // Add function definition // TODO: can we use one of the function control annotations? 
@@ -2426,7 +2425,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // Update the kernel index for the module kernel_index++; - + // Declare the kernel function SpvId void_type_id = builder.declare_void_type(); SpvId kernel_func_id = builder.add_function(kernel_name, void_type_id); diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 3e561b34c5a8..c857ce993837 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -213,15 +213,15 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: conform_alignment (" - << " request=" << (uint32_t)request.alignment - << " required=" << (uint32_t)block->memory.properties.alignment << ") =>" - << " actual_alignment=" << (uint32_t)actual_alignment << "\n"; + << " request=" << (uint32_t)request.alignment + << " required=" << (uint32_t)block->memory.properties.alignment << ") =>" + << " actual_alignment=" << (uint32_t)actual_alignment << "\n"; StackBasicPrinter<256>(nullptr) << "RegionAllocator: aligned_size (" - << " offset=" << (uint32_t)block_region->memory.offset - << " request_size=" << (uint32_t)request.size << " " - << " actual_alignment=" << (uint32_t)actual_alignment << ") =>" - << " actual_size=" << (uint32_t)actual_size << "\n"; + << " offset=" << (uint32_t)block_region->memory.offset + << " request_size=" << (uint32_t)request.size << " " + << " actual_alignment=" << (uint32_t)actual_alignment << ") =>" + << " actual_size=" << (uint32_t)actual_size << "\n"; #endif // is the adjusted size larger than the current region? diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index ad1530b7846d..591118b51573 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -212,10 +212,10 @@ WEAK int halide_vulkan_device_release(void *user_context) { if (instance != nullptr) { vkQueueWaitIdle(queue); - if(command_pool == cached_command_pool) { + if (command_pool == cached_command_pool) { cached_command_pool = 0; } - if(reinterpret_cast(allocator) == cached_allocator) { + if (reinterpret_cast(allocator) == cached_allocator) { cached_allocator = nullptr; } @@ -661,7 +661,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ } else { ctx.allocator->reclaim(user_context, staging_region); } - vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -759,7 +759,7 @@ WEAK int halide_vulkan_run(void *user_context, //// 1. Create all descriptor set layouts for (uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { - if(((void*)cache_entry->descriptor_set_layouts[n]) == nullptr) { + if (((void *)cache_entry->descriptor_set_layouts[n]) == nullptr) { uint32_t uniform_buffer_count = cache_entry->entry_point_data[n].uniform_buffer_count; uint32_t storage_buffer_count = cache_entry->entry_point_data[n].storage_buffer_count; debug(user_context) << " creating descriptor set layout [" << n << "] " << cache_entry->entry_point_data[n].entry_point_name << "\n"; @@ -885,7 +885,7 @@ WEAK int halide_vulkan_run(void *user_context, } //// 10. 
Cleanup - vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); vkResetCommandPool(ctx.device, ctx.command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); #ifdef DEBUG_RUNTIME diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index ef9529481972..e770b9b396c4 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -50,10 +50,10 @@ VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *alloc VkCommandPoolCreateInfo command_pool_info = { - VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type - nullptr, // pointer to struct extending this - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, // flags. Assume transient short-lived single-use command buffers - queue_index // queue family index corresponding to the compute command queue + VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, // flags. Assume transient short-lived single-use command buffers + queue_index // queue family index corresponding to the compute command queue }; return vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); } @@ -898,7 +898,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat void operator()(VulkanCompilationCacheEntry *cache_entry) { if (cache_entry != nullptr) { if (cache_entry->shader_module) { - debug(user_context) << " destroying shader module " << (void*)cache_entry->shader_module << "\n"; + debug(user_context) << " destroying shader module " << (void *)cache_entry->shader_module << "\n"; vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); cache_entry->shader_module = {0}; } @@ -932,7 +932,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat cache_entry->descriptor_set_layouts = nullptr; } if (cache_entry->pipeline_layout) { - debug(user_context) << " destroying pipeline layout " << (void*)cache_entry->pipeline_layout << "\n"; + debug(user_context) << " destroying pipeline layout " << (void *)cache_entry->pipeline_layout << "\n"; vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); cache_entry->pipeline_layout = {0}; } diff --git a/test/correctness/interpreter.cpp b/test/correctness/interpreter.cpp index 7623b4b70463..86cc625a151a 100644 --- a/test/correctness/interpreter.cpp +++ b/test/correctness/interpreter.cpp @@ -13,7 +13,7 @@ int main(int argc, char **argv) { return 0; } - if(target.has_feature(Target::Vulkan)) { + if (target.has_feature(Target::Vulkan)) { printf("[SKIP] Skipping test for Vulkan (which doesn't support dynamically allocated shared mem)!\n"); return 0; } From b384b7cf085ab587e0e7a37ab52c9f10fe625d6f Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 14:05:20 -1000 Subject: [PATCH 093/166] Handle calls to pow with negative values for Vulkan Add integer and float constant helpers to SPIRV --- src/CodeGen_Vulkan_Dev.cpp | 91 +++++++++++++++++++++++++++++++++----- src/SpirvIR.cpp | 52 ++++++++++++++++++++++ src/SpirvIR.h | 2 + 3 files changed, 134 insertions(+), 11 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 9d05c96281f0..773ce29d8031 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -12,6 +12,7 @@ #include 
"IROperator.h" #include "IRPrinter.h" #include "Scope.h" +#include "Simplify.h" #include "SpirvIR.h" #include "Target.h" @@ -158,8 +159,6 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // Map from Halide built-in names to extended GLSL intrinsics for SPIR-V using BuiltinMap = std::unordered_map; const BuiltinMap glsl_builtin = { - {"pow_f16", GLSLstd450Pow}, - {"pow_f32", GLSLstd450Pow}, {"acos_f16", GLSLstd450Acos}, {"acos_f32", GLSLstd450Acos}, {"acosh_f16", GLSLstd450Acosh}, @@ -649,17 +648,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Div): " << op->type << " ((" << op->a << ") / (" << op->b << "))\n"; user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n"; int bits = 0; - if (is_const_power_of_two_integer(op->b, &bits)) { - SpvId shift_amount_id = builder.declare_constant(Int(32), &bits); + if (is_const_power_of_two_integer(op->b, &bits) && op->type.is_int_or_uint()) { + SpvId shift_amount_id = builder.declare_integer_constant(op->type.with_lanes(1), (int64_t)bits); SpvId type_id = builder.declare_type(op->type); op->a.accept(this); SpvId src_a_id = builder.current_id(); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::binary_op(SpvOpShiftRightArithmetic, type_id, result_id, src_a_id, shift_amount_id)); + if(op->type.is_uint()) { + builder.append(SpvFactory::binary_op(SpvOpShiftRightLogical, type_id, result_id, src_a_id, shift_amount_id)); + } else { + builder.append(SpvFactory::binary_op(SpvOpShiftRightArithmetic, type_id, result_id, src_a_id, shift_amount_id)); + } builder.update_id(result_id); } else if (op->type.is_int()) { Expr e = lower_euclidean_div(op->a, op->b); - e.accept(this); + e.accept(this); } else { if (op->type.is_float()) { visit_binary_op(SpvOpFDiv, op->type, op->a, op->b); @@ -674,9 +677,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; int bits = 0; - if (is_const_power_of_two_integer(op->b, &bits)) { + if (is_const_power_of_two_integer(op->b, &bits) && op->type.is_int_or_uint()) { int bitwise_value = ((1 << bits) - 1); - SpvId bitwise_value_id = builder.declare_constant(Int(32), &bitwise_value); + SpvId bitwise_value_id = builder.declare_integer_constant(op->type.with_lanes(1), (int64_t)bitwise_value); SpvId type_id = builder.declare_type(op->type); op->a.accept(this); SpvId src_a_id = builder.current_id(); @@ -1018,7 +1021,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { internal_assert(op->args.size() == 2); SpvOp op_code = SpvOpNop; if (op->type.is_float()) { - op_code = SpvOpFMod; + op_code = SpvOpFRem; // NOTE: FRem matches the fmod we expect } else if (op->type.is_int()) { op_code = SpvOpSMod; } else if (op->type.is_uint()) { @@ -1033,7 +1036,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { if (op->type.is_uint()) { visit_binary_op(SpvOpShiftRightLogical, op->type, op->args[0], op->args[1]); } else { - visit_binary_op(SpvOpShiftRightArithmetic, op->type, op->args[0], op->args[1]); + Expr e = lower_signed_shift_right(op->args[0], op->args[1]); + e.accept(this); } } else if (op->is_intrinsic(Call::shift_left)) { internal_assert(op->args.size() == 2); @@ -1098,6 +1102,71 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else { internal_error 
<< "Unhandled intrinsic in Vulkan backend: " << op->name << "\n"; } + + } else if (starts_with(op->name, "pow_f")) { + internal_assert(op->args.size() == 2); + if (can_prove(op->args[0] > 0)) { + visit_glsl_op(GLSLstd450Pow, op->type, op->args); + } else { + visit_glsl_op(GLSLstd450Pow, op->type, op->args); + SpvId type_id = builder.declare_type(op->type); + SpvId inst_set_id = builder.import_glsl_intrinsics(); + + Expr a = op->args[0]; + a->accept(this); + SpvId src_a_id = builder.current_id(); + + Expr b = op->args[1]; + b->accept(this); + SpvId src_b_id = builder.current_id(); + + SpvId abs_a_id = builder.reserve_id(SpvResultId); + SpvFactory::Operands abs_operands = { src_a_id }; + builder.append(SpvFactory::extended(inst_set_id, GLSLstd450FAbs, type_id, abs_a_id, abs_operands)); + + SpvFactory::Operands pow_operands = { abs_a_id, src_b_id }; + SpvId pow_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::extended(inst_set_id, GLSLstd450Pow, type_id, pow_id, pow_operands)); + builder.update_id(pow_id); + + // a > 0 + SpvId zero_id = builder.declare_float_constant(op->type, 0.0); + SpvId a_gt_zero_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(SpvOpFOrdGreaterThan, type_id, a_gt_zero_id, src_a_id, zero_id)); + + // b % 2 + SpvId two_id = builder.declare_float_constant(op->type, 2.0); + SpvId b_mod_two_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(SpvOpFRem, type_id, b_mod_two_id, src_b_id, two_id)); + + // b % 2 == 1 + SpvId one_id = builder.declare_float_constant(op->type, 1.0); + SpvId b_mod_two_is_one_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(SpvOpFOrdEqual, type_id, b_mod_two_is_one_id, b_mod_two_id, one_id)); + + // b % 2 == 0 + SpvId b_mod_two_is_zero_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(SpvOpFOrdEqual, type_id, b_mod_two_is_zero_id, b_mod_two_id, zero_id)); + + // -pow + SpvId neg_pow_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::unary_op(SpvOpFNegate, type_id, neg_pow_id, pow_id)); + + // a_var > 0 || b_var % 2 == 0 + SpvId bool_type_id = builder.declare_type(Bool()); + SpvId a_gt_zero_or_b_mod_two_is_zero_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::binary_op(SpvOpLogicalOr, bool_type_id, a_gt_zero_or_b_mod_two_is_zero_id, a_gt_zero_id, b_mod_two_is_zero_id)); + + // select(b_var % 2 == 1, -c_var, zero) + SpvId nan_id = builder.declare_float_constant(op->type, 0.0); + SpvId neg_pow_or_zero_result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::select(type_id, neg_pow_or_zero_result_id, b_mod_two_is_one_id, neg_pow_id, nan_id)); + + // select(a_var > 0 || b_var % 2 == 0, pow_id, neg_pow_or_zero_result_id) + SpvId result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::select(type_id, result_id, a_gt_zero_or_b_mod_two_is_zero_id, pow_id, neg_pow_or_zero_result_id)); + builder.update_id(result_id); + } } else if (starts_with(op->name, "fast_inverse_f")) { internal_assert(op->args.size() == 1); @@ -2039,7 +2108,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_binary_op(SpvOp op_code, Type t, c } void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_glsl_op(SpvId glsl_op_code, Type type, const std::vector &args) { - uint32_t type_id = builder.declare_type(type); + SpvId type_id = builder.declare_type(type); SpvFactory::Operands operands; operands.reserve(args.size()); diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index bfd1f029e279..fbd702222e2e 100644 --- 
a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -1815,6 +1815,58 @@ SpvId SpvBuilder::declare_scalar_constant_of_type(const Type &scalar_type, const return result_id; } +SpvId SpvBuilder::declare_integer_constant(const Type &type, int64_t value) { + if (!type.is_int() || !type.is_scalar()) { + internal_error << "SPIRV: Invalid type provided for integer constant!" << type << "\n"; + return SpvInvalidId; + } + + SpvId result_id = SpvInvalidId; + if (type.is_int() && type.bits() == 8) { + int8_t data(value); + result_id = declare_scalar_constant_of_type(type, &data); + } else if (type.is_int() && type.bits() == 16) { + int16_t data(value); + result_id = declare_scalar_constant_of_type(type, &data); + } else if (type.is_int() && type.bits() == 32) { + int32_t data(value); + result_id = declare_scalar_constant_of_type(type, &data); + } else if (type.is_int() && type.bits() == 64) { + int64_t data(value); + result_id = declare_scalar_constant_of_type(type, &data); + } else { + user_error << "Unhandled constant integer data conversion from value type '" << type << "'!\n"; + } + return result_id; +} + +SpvId SpvBuilder::declare_float_constant(const Type &type, double value) { + if (!type.is_float() || !type.is_scalar()) { + internal_error << "SPIRV: Invalid type provided for float constant!" << type << "\n"; + return SpvInvalidId; + } + + SpvId result_id = SpvInvalidId; + if (type.is_float() && type.bits() == 16) { + if (type.is_bfloat()) { + bfloat16_t data(value); + result_id = declare_scalar_constant_of_type(type, &data); + } else { + float16_t data(value); + result_id = declare_scalar_constant_of_type(type, &data); + } + } else if (type.is_float() && type.bits() == 32) { + float data(value); + result_id = declare_scalar_constant_of_type(type, &data); + } else if (type.is_float() && type.bits() == 64) { + double data(value); + result_id = declare_scalar_constant_of_type(type, &data); + } else { + user_error << "Unhandled constant float data conversion from value type '" << type << "'!\n"; + } + return result_id; +} + SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *data) { if (scalar_type.lanes() != 1) { internal_error << "SPIRV: Invalid type provided for scalar constant!" 
<< scalar_type << "\n"; diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 86355d2e54a1..80afb78ce3c9 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -390,6 +390,8 @@ class SpvBuilder { SpvId declare_null_constant(const Type &type); SpvId declare_bool_constant(bool value); SpvId declare_string_constant(const std::string &str); + SpvId declare_integer_constant(const Type &type, int64_t value); + SpvId declare_float_constant(const Type &type, double value); SpvId declare_scalar_constant(const Type &type, const void *data); SpvId declare_vector_constant(const Type &type, const void *data); SpvId declare_access_chain(SpvId ptr_type_id, SpvId base_id, const Indices &indices); From dae13835af008e1c4a9225ff799f9cb9a755f19f Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 14:06:09 -1000 Subject: [PATCH 094/166] Only test real numbers for pow with Vulkan --- test/correctness/math.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/correctness/math.cpp b/test/correctness/math.cpp index 1315de6cc9c9..9c54ef859379 100644 --- a/test/correctness/math.cpp +++ b/test/correctness/math.cpp @@ -290,7 +290,8 @@ int main(int argc, char **argv) { call_1_float_types(ceil, 256, -25, 25); call_1_float_types(trunc, 256, -25, 25); - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { + if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute) || + get_jit_target_from_environment().has_feature(Target::Vulkan)) { // GLSL isn't required to support NaN, so keep things real call_2_float_types(pow, 256, 0.0, 10.0, -4.0f, 4.0f); } else { From 74d52f927f5bab96e5d7be8db7d693acfb0fb5d8 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 8 Nov 2022 14:09:13 -1000 Subject: [PATCH 095/166] Clang tidy/format pass --- src/CodeGen_Vulkan_Dev.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 773ce29d8031..16e88ab3ef3d 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -654,7 +654,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { op->a.accept(this); SpvId src_a_id = builder.current_id(); SpvId result_id = builder.reserve_id(SpvResultId); - if(op->type.is_uint()) { + if (op->type.is_uint()) { builder.append(SpvFactory::binary_op(SpvOpShiftRightLogical, type_id, result_id, src_a_id, shift_amount_id)); } else { builder.append(SpvFactory::binary_op(SpvOpShiftRightArithmetic, type_id, result_id, src_a_id, shift_amount_id)); @@ -662,7 +662,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { builder.update_id(result_id); } else if (op->type.is_int()) { Expr e = lower_euclidean_div(op->a, op->b); - e.accept(this); + e.accept(this); } else { if (op->type.is_float()) { visit_binary_op(SpvOpFDiv, op->type, op->a, op->b); @@ -1021,7 +1021,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { internal_assert(op->args.size() == 2); SpvOp op_code = SpvOpNop; if (op->type.is_float()) { - op_code = SpvOpFRem; // NOTE: FRem matches the fmod we expect + op_code = SpvOpFRem; // NOTE: FRem matches the fmod we expect } else if (op->type.is_int()) { op_code = SpvOpSMod; } else if (op->type.is_uint()) { @@ -1121,10 +1121,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { SpvId src_b_id = builder.current_id(); SpvId abs_a_id = builder.reserve_id(SpvResultId); - SpvFactory::Operands abs_operands = { src_a_id }; + SpvFactory::Operands abs_operands = {src_a_id}; 
builder.append(SpvFactory::extended(inst_set_id, GLSLstd450FAbs, type_id, abs_a_id, abs_operands)); - SpvFactory::Operands pow_operands = { abs_a_id, src_b_id }; + SpvFactory::Operands pow_operands = {abs_a_id, src_b_id}; SpvId pow_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::extended(inst_set_id, GLSLstd450Pow, type_id, pow_id, pow_operands)); builder.update_id(pow_id); @@ -1133,7 +1133,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { SpvId zero_id = builder.declare_float_constant(op->type, 0.0); SpvId a_gt_zero_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::binary_op(SpvOpFOrdGreaterThan, type_id, a_gt_zero_id, src_a_id, zero_id)); - + // b % 2 SpvId two_id = builder.declare_float_constant(op->type, 2.0); SpvId b_mod_two_id = builder.reserve_id(SpvResultId); @@ -1148,7 +1148,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { SpvId b_mod_two_is_zero_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::binary_op(SpvOpFOrdEqual, type_id, b_mod_two_is_zero_id, b_mod_two_id, zero_id)); - // -pow + // -pow SpvId neg_pow_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::unary_op(SpvOpFNegate, type_id, neg_pow_id, pow_id)); From 01b74dc261caa689e9631033c5b00ada77aa6ab0 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 10 Nov 2022 12:36:40 -1000 Subject: [PATCH 096/166] Fix logic so a region request of an entire block matches if exactly the same size as an empty block --- src/runtime/internal/block_allocator.h | 2 ++ src/runtime/internal/region_allocator.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 2d3790c7752b..a827cdc73e92 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -386,6 +386,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p BlockResource *block = static_cast(block_entry->value); block->memory.size = size; + block->memory.handle = nullptr; block->memory.properties = properties; block->memory.dedicated = dedicated; block->reserved = 0; @@ -439,6 +440,7 @@ void BlockAllocator::free_memory_block(void *user_context, BlockResource *block) halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); MemoryBlock *memory_block = &(block->memory); allocators.block.deallocate(user_context, memory_block); + memory_block->handle = nullptr; block->reserved = 0; block->memory.size = 0; } diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index c857ce993837..67ea0e9d9533 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -230,7 +230,7 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory } // will the adjusted size fit within the remaining unallocated space? - if ((actual_size + block->reserved) < block->memory.size) { + if ((actual_size + block->reserved) <= block->memory.size) { result = block_region; // best-fit! 
break; } From 190dd35ea61523346c9bc33cb9d29332e4183bee Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 10 Nov 2022 12:38:18 -1000 Subject: [PATCH 097/166] Create a zero size buffer to check for alignment Return null handles after freeing --- src/runtime/vulkan_memory.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 786b10b7c7dd..60d5db6c887d 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -5,7 +5,7 @@ #include "vulkan_internal.h" // Uncomment to enable verbose memory allocation debugging -// #define HL_VK_DEBUG_MEM 1 +#define HL_VK_DEBUG_MEM 1 namespace Halide { namespace Runtime { @@ -331,11 +331,6 @@ VulkanMemoryAllocator::default_config() { // -- void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *block) { -#if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" - << "user_context=" << user_context << " " - << "block=" << (void *)(block) << ") ... \n"; -#endif VulkanMemoryAllocator *instance = reinterpret_cast(user_context); halide_abort_if_false(user_context, instance != nullptr); halide_abort_if_false(user_context, instance->device != nullptr); @@ -344,7 +339,9 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" - << "size=" << (uint32_t)block->size << ", " + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << " " + << "size=" << (uint64_t)block->size << ", " << "dedicated=" << (block->dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(block->properties.usage) << " " << "caching=" << halide_memory_caching_name(block->properties.caching) << " " @@ -384,7 +381,7 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type nullptr, // struct extending this 0, // create flags - block->size, // buffer size (in bytes) + 0, // buffer size (in bytes) usage_flags, // buffer usage flags VK_SHARING_MODE_EXCLUSIVE, // sharing mode 0, nullptr}; @@ -464,6 +461,7 @@ void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *bl instance->block_byte_count = 0; } + block->handle = nullptr; vk_host_free(nullptr, device_memory, instance->alloc_callbacks); device_memory = nullptr; } From 64bf559fd25c1311af69f5a58dc5ab81f0fac5fa Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 10 Nov 2022 12:39:24 -1000 Subject: [PATCH 098/166] Add more verbose debug output for malloc --- src/runtime/vulkan.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 591118b51573..b6074384ff88 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -261,7 +261,16 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { halide_debug_assert(user_context, buf->dim[i].stride >= 0); } - debug(user_context) << " allocating " << *buf << "\n"; + debug(user_context) << " allocating buffer: " + << "extents: " << buf->dim[0].extent << "x" + << buf->dim[1].extent << "x" << buf->dim[2].extent << "x" + << buf->dim[3].extent << " " + << "strides: " << buf->dim[0].stride << "x" + << buf->dim[1].stride << "x" << buf->dim[2].stride << "x" + << buf->dim[3].stride << " " + << "type: " << buf->type << " " + << "size_in_bytes: " << (uint64_t)size << " " + << "(or 
" << (size * 1e-6f) << "MB)\n"; #ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); From 50dd4c64dcee5ba1926ba4700fcf10a21e92b0bd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 10 Nov 2022 12:40:06 -1000 Subject: [PATCH 099/166] Fix UConvert logic to avoid narrowing an integer type less than 8 bits Remove optimization path for division which seems to fail worse than DIV Cleanup DIV and MOD operators --- src/CodeGen_Vulkan_Dev.cpp | 44 ++++++++++++-------------------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 16e88ab3ef3d..98264dd837b6 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -578,7 +578,8 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ // Vulkan requires both value and target types to be unsigned for UConvert // so do the conversion to an equivalent unsigned type then bitcast this // result into the target type - Type unsigned_type = target_type.with_code(halide_type_uint).narrow(); + Type unsigned_type = target_type.with_code(halide_type_uint); + unsigned_type = (unsigned_type.bits() > 8) ? unsigned_type.narrow() : unsigned_type; SpvId unsigned_type_id = builder.declare_type(unsigned_type); SpvId unsigned_value_id = builder.reserve_id(SpvResultId); result_id = builder.reserve_id(SpvResultId); @@ -647,30 +648,15 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mul *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Div *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Div): " << op->type << " ((" << op->a << ") / (" << op->b << "))\n"; user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n"; - int bits = 0; - if (is_const_power_of_two_integer(op->b, &bits) && op->type.is_int_or_uint()) { - SpvId shift_amount_id = builder.declare_integer_constant(op->type.with_lanes(1), (int64_t)bits); - SpvId type_id = builder.declare_type(op->type); - op->a.accept(this); - SpvId src_a_id = builder.current_id(); - SpvId result_id = builder.reserve_id(SpvResultId); - if (op->type.is_uint()) { - builder.append(SpvFactory::binary_op(SpvOpShiftRightLogical, type_id, result_id, src_a_id, shift_amount_id)); - } else { - builder.append(SpvFactory::binary_op(SpvOpShiftRightArithmetic, type_id, result_id, src_a_id, shift_amount_id)); - } - builder.update_id(result_id); - } else if (op->type.is_int()) { + if (op->type.is_int()) { Expr e = lower_euclidean_div(op->a, op->b); e.accept(this); + } else if (op->type.is_uint()) { + visit_binary_op(SpvOpUDiv, op->type, op->a, op->b); + } else if (op->type.is_float()) { + visit_binary_op(SpvOpFDiv, op->type, op->a, op->b); } else { - if (op->type.is_float()) { - visit_binary_op(SpvOpFDiv, op->type, op->a, op->b); - } else if (op->type.is_uint()) { - visit_binary_op(SpvOpUDiv, op->type, op->a, op->b); - } else { - internal_error << "Failed to find a suitable Div operator for type: " << op->type << "\n"; - } + internal_error << "Failed to find a suitable Div operator for type: " << op->type << "\n"; } } @@ -689,15 +675,13 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { } else if (op->type.is_int()) { Expr e = lower_euclidean_mod(op->a, op->b); e.accept(this); + } else if (op->type.is_uint()) { + visit_binary_op(SpvOpUMod, op->type, op->a, op->b); + } else if (op->type.is_float()) { + // SPIR-V FMod is strangely not what we want .. 
FRem does what we need + visit_binary_op(SpvOpFRem, op->type, op->a, op->b); } else { - if (op->type.is_float()) { - // SPIR-V FMod is strangely not what we want .. FRem does what we need - visit_binary_op(SpvOpFRem, op->type, op->a, op->b); - } else if (op->type.is_uint()) { - visit_binary_op(SpvOpUMod, op->type, op->a, op->b); - } else { - internal_error << "Failed to find a suitable Mod operator for type: " << op->type << "\n"; - } + internal_error << "Failed to find a suitable Mod operator for type: " << op->type << "\n"; } } From 0cd89bb80288758380070beed55d1cc5fd67ec78 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 10 Nov 2022 12:46:40 -1000 Subject: [PATCH 100/166] Clang format/tidy pass --- src/runtime/vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index b6074384ff88..1644ee91e7f7 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -268,7 +268,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { << "strides: " << buf->dim[0].stride << "x" << buf->dim[1].stride << "x" << buf->dim[2].stride << "x" << buf->dim[3].stride << " " - << "type: " << buf->type << " " + << "type: " << buf->type << " " << "size_in_bytes: " << (uint64_t)size << " " << "(or " << (size * 1e-6f) << "MB)\n"; From 7b8b728832863a41e9dd0a17769390e9ba96425f Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 12 Nov 2022 09:39:37 -1000 Subject: [PATCH 101/166] Fix SConvert & UConvert ops --- src/CodeGen_Vulkan_Dev.cpp | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 98264dd837b6..0985a5a1540c 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -537,9 +537,7 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ } else if (target_type.is_bool()) { op_code = SpvOpSelect; } else if (target_type.is_int_or_uint()) { - if (target_type.bits() != value_type.bits()) { - op_code = SpvOpUConvert; // UConvert is only allowed on differing component widths - } + op_code = SpvOpUConvert; } } else if (value_type.is_int()) { if (target_type.is_float()) { @@ -547,9 +545,7 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ } else if (target_type.is_bool()) { op_code = SpvOpSelect; } else if (target_type.is_int_or_uint()) { - if (target_type.bits() != value_type.bits()) { - op_code = SpvOpSConvert; // SConvert is only allowed on differing component widths - } + op_code = SpvOpSConvert; } } @@ -575,16 +571,30 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ } else if (op_code == SpvOpSelect) { result_id = convert_to_bool(target_type, value_type, value_id); } else if (op_code == SpvOpUConvert && target_type.is_int()) { - // Vulkan requires both value and target types to be unsigned for UConvert - // so do the conversion to an equivalent unsigned type then bitcast this + // SPIR-V requires both value and target types to be unsigned and of + // different component bit widths in order to be compatible with UConvert + // ... so do the conversion to an equivalent unsigned type then bitcast this // result into the target type Type unsigned_type = target_type.with_code(halide_type_uint); - unsigned_type = (unsigned_type.bits() > 8) ? 
unsigned_type.narrow() : unsigned_type; - SpvId unsigned_type_id = builder.declare_type(unsigned_type); - SpvId unsigned_value_id = builder.reserve_id(SpvResultId); + if(unsigned_type.bytes() != value_type.bytes()) { + SpvId unsigned_type_id = builder.declare_type(unsigned_type); + SpvId unsigned_value_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::convert(op_code, unsigned_type_id, unsigned_value_id, value_id)); + value_id = unsigned_value_id; + } result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::convert(op_code, unsigned_type_id, unsigned_value_id, value_id)); - builder.append(SpvFactory::bitcast(target_type_id, result_id, unsigned_value_id)); + builder.append(SpvFactory::bitcast(target_type_id, result_id, value_id)); + } else if (op_code == SpvOpSConvert && target_type.is_uint()) { + // Same as above but for SConvert + Type signed_type = target_type.with_code(halide_type_int); + if(signed_type.bytes() != value_type.bytes()) { + SpvId signed_type_id = builder.declare_type(signed_type); + SpvId signed_value_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::convert(op_code, signed_type_id, signed_value_id, value_id)); + value_id = signed_value_id; + } + result_id = builder.reserve_id(SpvResultId); + builder.append(SpvFactory::bitcast(target_type_id, result_id, value_id)); } else { result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::convert(op_code, target_type_id, result_id, value_id)); From 15e4c7f4aa4862c7f0f7007368917e9edebad048 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 12 Nov 2022 12:14:01 -1000 Subject: [PATCH 102/166] Add retain semantics to block allocator interface Update test to validate retain/release/reclaim functionality --- src/runtime/internal/block_allocator.h | 10 ++++++++ src/runtime/internal/memory_resources.h | 11 ++++++++- src/runtime/internal/region_allocator.h | 32 ++++++++++++++++++++----- test/runtime/block_allocator.cpp | 10 ++++++++ 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index a827cdc73e92..b3301a12bf96 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -55,6 +55,7 @@ class BlockAllocator { MemoryRegion *reserve(void *user_context, const MemoryRequest &request); void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + void retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); @@ -206,6 +207,15 @@ void BlockAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { allocator->reclaim(user_context, memory_region); } +void BlockAllocator::retain(void *user_context, MemoryRegion *memory_region) { + halide_abort_if_false(user_context, memory_region != nullptr); + RegionAllocator *allocator = RegionAllocator::find_allocator(user_context, memory_region); + if (allocator == nullptr) { + return; + } + allocator->retain(user_context, memory_region); +} + bool BlockAllocator::collect(void *user_context) { bool result = false; BlockEntry *block_entry = block_list.back(); diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index 5e5c949c761d..8b3a9e47c866 
100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -66,12 +66,20 @@ struct MemoryBlock { MemoryProperties properties; //< properties for the allocated block }; +// Client-facing struct for specifying a range of a memory region (eg for crops) +struct MemoryRange { + size_t head_offset = 0; //< byte offset from start of region + size_t tail_offset = 0; //< byte offset from end of region +}; + // Client-facing struct for exchanging memory region allocation requests struct MemoryRegion { - void *handle = nullptr; //< client data storing native handle (managed by alloc_block_region/free_block_region) + void *handle = nullptr; //< client data storing native handle (managed by alloc_block_region/free_block_region) or a pointer to region owning allocation size_t offset = 0; //< offset from base address in block (in bytes) size_t size = 0; //< allocated size (in bytes) + MemoryRange range; //< optional range (e.g. for handling crops, etc) bool dedicated = false; //< flag indicating whether allocation is one dedicated resource (or split/shared into other resources) + bool is_owner = true; //< flag indicating whether allocation is owned by this region, in which case handle is a native handle. Otherwise handle points to owning region of alloction. MemoryProperties properties; //< properties for the allocated region }; @@ -100,6 +108,7 @@ struct BlockResource { // -- Note: first field must MemoryRegion struct BlockRegion { MemoryRegion memory; //< memory info for the allocated region + uint32_t usage_count = 0; //< number of active clients using region AllocationStatus status = AllocationStatus::InvalidStatus; //< allocation status indicator BlockRegion *next_ptr = nullptr; //< pointer to next block region in linked list BlockRegion *prev_ptr = nullptr; //< pointer to prev block region in linked list diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 67ea0e9d9533..d24692383b77 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -49,6 +49,7 @@ class RegionAllocator { MemoryRegion *reserve(void *user_context, const MemoryRequest &request); void release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse void reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate + void retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); @@ -163,6 +164,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & } alloc_block_region(user_context, block_region); + block_region->usage_count = 1; return reinterpret_cast(block_region); } @@ -170,6 +172,9 @@ void RegionAllocator::release(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); halide_abort_if_false(user_context, block_region->block_ptr == block); + if(block_region->usage_count > 0) { + block_region->usage_count--; + } release_block_region(user_context, block_region); } @@ -177,12 +182,22 @@ void RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); 
halide_abort_if_false(user_context, block_region->block_ptr == block); + if(block_region->usage_count > 0) { + block_region->usage_count--; + } free_block_region(user_context, block_region); if (can_coalesce(block_region)) { block_region = coalesce_block_regions(user_context, block_region); } } +void RegionAllocator::retain(void *user_context, MemoryRegion *memory_region) { + BlockRegion *block_region = reinterpret_cast(memory_region); + halide_abort_if_false(user_context, block_region != nullptr); + halide_abort_if_false(user_context, block_region->block_ptr == block); + block_region->usage_count++; +} + RegionAllocator *RegionAllocator::find_allocator(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); @@ -363,6 +378,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo block_region->memory.dedicated = dedicated; block_region->status = AllocationStatus::Available; block_region->block_ptr = block; + block_region->usage_count = 0; #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Creating region (" @@ -381,9 +397,9 @@ void RegionAllocator::release_block_region(void *user_context, BlockRegion *bloc << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif - - if ((block_region->status == AllocationStatus::InUse) || - (block_region->status == AllocationStatus::Dedicated)) { + if ((block_region->usage_count == 0) && + ((block_region->status == AllocationStatus::InUse) || + (block_region->status == AllocationStatus::Dedicated))) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Releasing region (" @@ -406,6 +422,7 @@ void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *bloc << "block_region=" << (void *)(block_region) << ") ...\n"; #endif + block_region->usage_count = 0; free_block_region(user_context, block_region); arena->reclaim(user_context, block_region); } @@ -419,6 +436,7 @@ void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_ MemoryRegion *memory_region = &(block_region->memory); if (memory_region->handle == nullptr) { allocators.region.allocate(user_context, memory_region); + memory_region->is_owner = true; #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Allocating region (" @@ -450,10 +468,11 @@ void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_r #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Freeing block region (" << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "block_region=" << (void *)(block_region) << " " + << "status=" << (uint32_t)block_region->status << " " + << "usage_count=" << (uint32_t)block_region->usage_count << ") ...\n"; #endif - if ((block_region->status == AllocationStatus::InUse) || - (block_region->status == AllocationStatus::Dedicated)) { + if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Freeing region (" << "block_ptr=" << (void *)block_region->block_ptr << " " @@ -470,6 +489,7 @@ void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_r block_region->memory.offset = 0; block_region->memory.handle = nullptr; } + block_region->usage_count = 0; block_region->status = AllocationStatus::Available; } diff --git 
a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index d147f652e80d..a7bb9752c7a4 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -91,6 +91,16 @@ int main(int argc, char **argv) { instance->reclaim(user_context, r1); halide_abort_if_false(user_context, allocated_region_memory == (1 * request.size)); + MemoryRegion *r3 = instance->reserve(user_context, request); + halide_abort_if_false(user_context, r3 != nullptr); + halide_abort_if_false(user_context, allocated_block_memory == config.minimum_block_size); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->retain(user_context, r3); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->release(user_context, r3); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->reclaim(user_context, r3); + instance->destroy(user_context); debug(user_context) << "Test : block_allocator::destroy (" << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " From a65712a3221105fac715b12241e5ec5c3e16fb97 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 12 Nov 2022 12:15:01 -1000 Subject: [PATCH 103/166] Implement device_crop, device_slice and release_crop for Vulkan. Re-enable device_crop, device_slice and interleave_rgb tests. --- src/runtime/vulkan.cpp | 69 +++++++++++++++--- src/runtime/vulkan_internal.h | 2 +- src/runtime/vulkan_memory.h | 105 ++++++++++++++++++++++++++-- src/runtime/vulkan_resources.h | 68 +++++++++++++++--- test/correctness/device_crop.cpp | 5 -- test/correctness/device_slice.cpp | 5 -- test/correctness/interleave_rgb.cpp | 5 -- 7 files changed, 215 insertions(+), 44 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 1644ee91e7f7..3379e6236baf 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -100,11 +100,12 @@ WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_b // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); - if (ctx.allocator && device_region && device_region->handle) { + MemoryRegion *memory_region = ctx.allocator->owner_of(user_context, device_region); + if (ctx.allocator && memory_region && memory_region->handle) { if (halide_can_reuse_device_allocations(user_context)) { - ctx.allocator->release(user_context, device_region); + ctx.allocator->release(user_context, memory_region); } else { - ctx.allocator->reclaim(user_context, device_region); + ctx.allocator->reclaim(user_context, memory_region); } } halide_buffer->device = 0; @@ -440,9 +441,10 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); + MemoryRegion *memory_region = ctx.allocator->owner_of(user_context, device_region); // retrieve the buffer from the region - VkBuffer *device_buffer = reinterpret_cast(device_region->handle); + VkBuffer *device_buffer = reinterpret_cast(memory_region->handle); if (device_buffer == nullptr) { error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; return halide_error_code_internal_error; @@ -475,7 +477,7 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid copy_helper.src = (uint64_t)(staging_buffer); copy_helper.dst = (uint64_t)(device_buffer); uint64_t src_offset = copy_helper.src_begin; 
- uint64_t dst_offset = 0; + uint64_t dst_offset = device_region->range.head_offset; vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, src_offset, dst_offset, halide_buffer->dimensions); // end the command buffer @@ -578,9 +580,10 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); + MemoryRegion *memory_region = ctx.allocator->owner_of(user_context, device_region); // retrieve the buffer from the region - VkBuffer *device_buffer = reinterpret_cast(device_region->handle); + VkBuffer *device_buffer = reinterpret_cast(memory_region->handle); if (device_buffer == nullptr) { error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; return halide_error_code_internal_error; @@ -613,7 +616,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ uint64_t copy_dst = copy_helper.dst; copy_helper.src = (uint64_t)(device_buffer); copy_helper.dst = (uint64_t)(staging_buffer); - uint64_t src_offset = copy_helper.src_begin; + uint64_t src_offset = copy_helper.src_begin + device_region->range.head_offset; uint64_t dst_offset = 0; vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, src_offset, dst_offset, halide_buffer->dimensions); @@ -680,6 +683,52 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ return 0; } + +WEAK int halide_vulkan_device_crop(void *user_context, + const struct halide_buffer_t *src, + struct halide_buffer_t *dst) { + const int64_t offset = calc_device_crop_byte_offset(src, dst); + return vk_device_crop_from_offset(user_context, src, offset, dst); +} + +WEAK int halide_vulkan_device_slice(void *user_context, + const struct halide_buffer_t *src, + int slice_dim, int slice_pos, + struct halide_buffer_t *dst) { + const int64_t offset = calc_device_slice_byte_offset(src, slice_dim, slice_pos); + return vk_device_crop_from_offset(user_context, src, offset, dst); +} + +WEAK int halide_vulkan_device_release_crop(void *user_context, + struct halide_buffer_t *halide_buffer) { + + debug(user_context) + << "Vulkan: halide_vulkan_device_release_crop (user_context: " << user_context + << ", halide_buffer: " << halide_buffer << ")\n"; + + VulkanContext ctx(user_context); + if (ctx.error != VK_SUCCESS) { + return ctx.error; + } + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + halide_abort_if_false(user_context, halide_buffer->device); + + // get the allocated region for the device + MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); + ctx.allocator->destroy_crop(user_context, device_region); + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return 0; +} + WEAK int halide_vulkan_run(void *user_context, void *state_ptr, const char *entry_name, @@ -1011,9 +1060,9 @@ WEAK halide_device_interface_impl_t vulkan_device_interface_impl = { halide_vulkan_device_and_host_malloc, halide_vulkan_device_and_host_free, halide_default_buffer_copy, - halide_default_device_crop, - halide_default_device_slice, - halide_default_device_release_crop, + halide_vulkan_device_crop, + halide_vulkan_device_slice, + halide_vulkan_device_release_crop, halide_vulkan_wrap_vk_buffer, halide_vulkan_detach_vk_buffer, }; diff --git a/src/runtime/vulkan_internal.h 
b/src/runtime/vulkan_internal.h index 7a109c8068e4..6b312ee01d7d 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -38,7 +38,7 @@ namespace { // internalize // -------------------------------------------------------------------------- void *vk_host_malloc(void *user_context, size_t size, size_t alignment, VkSystemAllocationScope scope, const VkAllocationCallbacks *callbacks = nullptr); void vk_host_free(void *user_context, void *ptr, const VkAllocationCallbacks *callbacks = nullptr); - +int vk_device_crop_from_offset(void *user_context, const struct halide_buffer_t *src, int64_t offset, struct halide_buffer_t *dst); VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, VkDevice device, VkPhysicalDevice physical_device, const VkAllocationCallbacks *alloc_callbacks); diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 60d5db6c887d..80b7cc5a0bf9 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -58,12 +58,16 @@ class VulkanMemoryAllocator { MemoryRegion *reserve(void *user_context, MemoryRequest &request); void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + void retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count bool collect(void *user_context); //< returns true if any blocks were removed void release(void *user_context); void destroy(void *user_context); void *map(void *user_context, MemoryRegion *region); void unmap(void *user_context, MemoryRegion *region); + MemoryRegion* create_crop(void *user_context, MemoryRegion *region, uint64_t offset); + void destroy_crop(void *user_context, MemoryRegion *region); + MemoryRegion* owner_of(void *user_context, MemoryRegion *region); VkDevice current_device() const { return this->device; @@ -191,14 +195,16 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { << "device=" << (void *)(device) << " " << "physical_device=" << (void *)(physical_device) << " " << "region=" << (void *)(region) << " " - << "size=" << (uint32_t)region->size << " " - << "offset=" << (uint32_t)region->offset << ") ...\n"; + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; #endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); halide_abort_if_false(user_context, block_allocator != nullptr); - RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, region); + MemoryRegion* owner = owner_of(user_context, region); + RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { error(nullptr) << "VulkanMemoryAllocator: Unable to map region! 
Invalid region allocator handle!\n"; return nullptr; @@ -217,7 +223,10 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { } uint8_t *mapped_ptr = nullptr; - VkResult result = vkMapMemory(device, *device_memory, region->offset, region->size, 0, (void **)(&mapped_ptr)); + VkDeviceSize memory_offset = region->offset + region->range.head_offset; + VkDeviceSize memory_size = region->size - region->range.tail_offset - region->range.head_offset; + halide_abort_if_false(user_context, (region->size - region->range.tail_offset - region->range.head_offset) > 0); + VkResult result = vkMapMemory(device, *device_memory, memory_offset, memory_size, 0, (void **)(&mapped_ptr)); if (result != VK_SUCCESS) { error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n"; return nullptr; @@ -233,13 +242,15 @@ void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { << "device=" << (void *)(device) << " " << "physical_device=" << (void *)(physical_device) << " " << "region=" << (void *)(region) << " " - << "size=" << (uint32_t)region->size << " " - << "offset=" << (uint32_t)region->offset << ") ...\n"; + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; #endif halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); - RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, region); + MemoryRegion* owner = owner_of(user_context, region); + RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid region allocator handle!\n"; return; @@ -260,6 +271,74 @@ void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { vkUnmapMemory(device, *device_memory); } +MemoryRegion* VulkanMemoryAllocator::create_crop(void *user_context, MemoryRegion *region, uint64_t offset) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Cropping region (" + << "user_context=" << user_context << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (int64_t)offset << ") ...\n"; +#endif + halide_abort_if_false(user_context, device != nullptr); + halide_abort_if_false(user_context, physical_device != nullptr); + + MemoryRegion* owner = owner_of(user_context, region); + RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); + if (region_allocator == nullptr) { + error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! 
Invalid region allocator handle!\n"; + return nullptr; + } + + // increment usage count + region_allocator->retain(this, owner); + + // create a new region to return, and copy all the other region's properties + const BlockAllocator::MemoryAllocators &allocators = block_allocator->current_allocators(); + halide_abort_if_false(user_context, allocators.system.allocate != nullptr); + MemoryRegion *result = reinterpret_cast( + allocators.system.allocate(user_context, sizeof(MemoryRegion)) + ); + + halide_abort_if_false(user_context, result != nullptr); + memcpy(result, owner, sizeof(MemoryRegion)); + + // point the handle to the owner of the allocated region, and update the head offset + result->is_owner = false; + result->handle = (void*)owner; + result->range.head_offset = owner->range.head_offset + offset; + return result; +} + +void VulkanMemoryAllocator::destroy_crop(void *user_context, MemoryRegion *region) { + + MemoryRegion* owner = owner_of(user_context, region); + RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); + if (region_allocator == nullptr) { + error(nullptr) << "VulkanMemoryAllocator: Unable to destroy crop region! Invalid region allocator handle!\n"; + return; + } + + // decrement usage count + region_allocator->release(this, owner); + + // discard the copied region struct + const BlockAllocator::MemoryAllocators &allocators = block_allocator->current_allocators(); + halide_abort_if_false(user_context, allocators.system.deallocate != nullptr); + allocators.system.deallocate(user_context, region); +} + +MemoryRegion* VulkanMemoryAllocator::owner_of(void *user_context, MemoryRegion *region) { + if(region->is_owner) { + return region; + } else { + // If this is a cropped region, use the handle to retrieve the owner of the allocation + return reinterpret_cast(region->handle); + } +} + void VulkanMemoryAllocator::release(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Releasing region (" @@ -288,6 +367,17 @@ void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { return block_allocator->reclaim(this, region); } +void VulkanMemoryAllocator::retain(void *user_context, MemoryRegion *region) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Retaining region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; +#endif + return block_allocator->retain(this, region); +} + bool VulkanMemoryAllocator::collect(void *user_context) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Collecting unused memory (" @@ -641,6 +731,7 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re } region->handle = (void *)buffer; + region->is_owner = true; instance->region_byte_count += region->size; instance->region_count++; } diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index e770b9b396c4..e9393df322d2 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -50,10 +50,10 @@ VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *alloc VkCommandPoolCreateInfo command_pool_info = { - VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type - nullptr, // pointer to struct extending this - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, // flags. 
Assume transient short-lived single-use command buffers - queue_index // queue family index corresponding to the compute command queue + VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, // flags. Assume transient short-lived single-use command buffers + queue_index // queue family index corresponding to the compute command queue }; return vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); } @@ -465,18 +465,22 @@ VkResult vk_update_descriptor_set(void *user_context, // get the allocated region for the buffer MemoryRegion *device_region = reinterpret_cast(((halide_buffer_t *)args[i])->device); + MemoryRegion *owner = allocator->owner_of(user_context, device_region); // retrieve the buffer from the region - VkBuffer *device_buffer = reinterpret_cast(device_region->handle); + VkBuffer *device_buffer = reinterpret_cast(owner->handle); if (device_buffer == nullptr) { error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; return VK_ERROR_INITIALIZATION_FAILED; } + VkDeviceSize range_offset = device_region->range.head_offset; + VkDeviceSize range_size = device_region->size - device_region->range.head_offset - device_region->range.tail_offset; + halide_abort_if_false(user_context, (device_region->size - device_region->range.head_offset - device_region->range.tail_offset) > 0); VkDescriptorBufferInfo device_buffer_info = { *device_buffer, // the buffer - 0, // offset - VK_WHOLE_SIZE // range + range_offset, // range offset + range_size // range size }; descriptor_buffer_info.append(user_context, &device_buffer_info); VkDescriptorBufferInfo *device_buffer_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); @@ -771,7 +775,7 @@ VulkanEntryPointData *vk_decode_entry_point_data(void *user_context, VulkanMemor uint32_t padded_string_length = module_ptr[idx++]; const char *entry_point_name = (const char *)(module_ptr + idx); - debug(user_context) << " [" << n << "] " + debug(user_context) << " [" << n << "] " << "uniform_buffer_count=" << uniform_buffer_count << " " << "storage_buffer_count=" << storage_buffer_count << " " << "entry_point_name_length=" << padded_string_length << " " @@ -898,7 +902,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat void operator()(VulkanCompilationCacheEntry *cache_entry) { if (cache_entry != nullptr) { if (cache_entry->shader_module) { - debug(user_context) << " destroying shader module " << (void *)cache_entry->shader_module << "\n"; + debug(user_context) << " . 
destroying shader module " << (void*)cache_entry->shader_module << "\n"; vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); cache_entry->shader_module = {0}; } @@ -924,7 +928,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat } if (cache_entry->descriptor_set_layouts) { for (uint32_t n = 0; n < cache_entry->entry_point_count; n++) { - debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->entry_point_data[n].entry_point_name << "\n"; + debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->entry_point_data[n].entry_point_name << "\n"; vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); cache_entry->descriptor_set_layouts[n] = {0}; } @@ -932,7 +936,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat cache_entry->descriptor_set_layouts = nullptr; } if (cache_entry->pipeline_layout) { - debug(user_context) << " destroying pipeline layout " << (void *)cache_entry->pipeline_layout << "\n"; + debug(user_context) << " destroying pipeline layout " << (void*)cache_entry->pipeline_layout << "\n"; vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); cache_entry->pipeline_layout = {0}; } @@ -989,6 +993,48 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff return 0; } +int vk_device_crop_from_offset(void *user_context, + const struct halide_buffer_t *src, + int64_t offset, + struct halide_buffer_t *dst) { + + VulkanContext ctx(user_context); + if (ctx.error != VK_SUCCESS) { + return ctx.error; + } + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + + halide_abort_if_false(user_context, src->device); + if(offset < 0) { + error(user_context) << "Vulkan: Invalid offset for device crop!\n"; + return halide_error_code_internal_error; + } + + // get the allocated region for the device + MemoryRegion *device_region = reinterpret_cast(src->device); + + // create the croppeg region from the allocated region + MemoryRegion *cropped_region = ctx.allocator->create_crop(user_context, device_region, (uint64_t)offset); + if ((cropped_region == nullptr) || (cropped_region->handle == nullptr)) { + error(user_context) << "Vulkan: Failed to crop region!\n"; + return halide_error_code_internal_error; + } + + // update the destination to the cropped region + dst->device = (uint64_t)cropped_region; + dst->device_interface = src->device_interface; + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + + return 0; +} + // -------------------------------------------------------------------------- } // namespace diff --git a/test/correctness/device_crop.cpp b/test/correctness/device_crop.cpp index ddb8e92917a5..ee4b900bc1f3 100644 --- a/test/correctness/device_crop.cpp +++ b/test/correctness/device_crop.cpp @@ -35,11 +35,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::Vulkan)) { - printf("[SKIP] Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test in-place cropping.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/device_slice.cpp b/test/correctness/device_slice.cpp index 924d34939cb3..0b9e3ca5bbcb 100644 --- 
a/test/correctness/device_slice.cpp +++ b/test/correctness/device_slice.cpp @@ -37,11 +37,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::Vulkan)) { - printf("[SKIP] Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test in-place slicing.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/interleave_rgb.cpp b/test/correctness/interleave_rgb.cpp index adf426ded42c..d7a8f9d7c110 100644 --- a/test/correctness/interleave_rgb.cpp +++ b/test/correctness/interleave_rgb.cpp @@ -104,11 +104,6 @@ bool test_deinterleave(int x_stride) { int main(int argc, char **argv) { - if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { - printf("[SKIP] Skipping test for Vulkan as it does not support cropping yet!\n"); - return 0; - } - for (int x_stride : {3, 4}) { if (!test_interleave(x_stride)) return -1; if (!test_interleave(x_stride)) return -1; From 485ec208054ac023bb227d0ce4ea9fb038f70780 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 12 Nov 2022 12:19:54 -1000 Subject: [PATCH 104/166] Clang format/tidy pass --- src/CodeGen_Vulkan_Dev.cpp | 6 +++--- src/runtime/internal/memory_resources.h | 4 ++-- src/runtime/internal/region_allocator.h | 8 ++++---- src/runtime/vulkan.cpp | 7 +++---- src/runtime/vulkan_memory.h | 27 ++++++++++++------------- src/runtime/vulkan_resources.h | 20 +++++++++--------- 6 files changed, 35 insertions(+), 37 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 0985a5a1540c..14b50b2f8a03 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -571,12 +571,12 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ } else if (op_code == SpvOpSelect) { result_id = convert_to_bool(target_type, value_type, value_id); } else if (op_code == SpvOpUConvert && target_type.is_int()) { - // SPIR-V requires both value and target types to be unsigned and of + // SPIR-V requires both value and target types to be unsigned and of // different component bit widths in order to be compatible with UConvert // ... 
so do the conversion to an equivalent unsigned type then bitcast this // result into the target type Type unsigned_type = target_type.with_code(halide_type_uint); - if(unsigned_type.bytes() != value_type.bytes()) { + if (unsigned_type.bytes() != value_type.bytes()) { SpvId unsigned_type_id = builder.declare_type(unsigned_type); SpvId unsigned_value_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::convert(op_code, unsigned_type_id, unsigned_value_id, value_id)); @@ -587,7 +587,7 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ } else if (op_code == SpvOpSConvert && target_type.is_uint()) { // Same as above but for SConvert Type signed_type = target_type.with_code(halide_type_int); - if(signed_type.bytes() != value_type.bytes()) { + if (signed_type.bytes() != value_type.bytes()) { SpvId signed_type_id = builder.declare_type(signed_type); SpvId signed_value_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::convert(op_code, signed_type_id, signed_value_id, value_id)); diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index 8b3a9e47c866..a1889541435c 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -68,8 +68,8 @@ struct MemoryBlock { // Client-facing struct for specifying a range of a memory region (eg for crops) struct MemoryRange { - size_t head_offset = 0; //< byte offset from start of region - size_t tail_offset = 0; //< byte offset from end of region + size_t head_offset = 0; //< byte offset from start of region + size_t tail_offset = 0; //< byte offset from end of region }; // Client-facing struct for exchanging memory region allocation requests diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index d24692383b77..ff645eeda467 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -172,7 +172,7 @@ void RegionAllocator::release(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); halide_abort_if_false(user_context, block_region->block_ptr == block); - if(block_region->usage_count > 0) { + if (block_region->usage_count > 0) { block_region->usage_count--; } release_block_region(user_context, block_region); @@ -182,7 +182,7 @@ void RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); halide_abort_if_false(user_context, block_region->block_ptr == block); - if(block_region->usage_count > 0) { + if (block_region->usage_count > 0) { block_region->usage_count--; } free_block_region(user_context, block_region); @@ -397,7 +397,7 @@ void RegionAllocator::release_block_region(void *user_context, BlockRegion *bloc << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif - if ((block_region->usage_count == 0) && + if ((block_region->usage_count == 0) && ((block_region->status == AllocationStatus::InUse) || (block_region->status == AllocationStatus::Dedicated))) { @@ -470,7 +470,7 @@ void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_r << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << " " << "status=" << (uint32_t)block_region->status << " " - << 
"usage_count=" << (uint32_t)block_region->usage_count << ") ...\n"; + << "usage_count=" << (uint32_t)block_region->usage_count << ") ...\n"; #endif if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_INTERNAL diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 3379e6236baf..7b07e8714c94 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -683,7 +683,6 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ return 0; } - WEAK int halide_vulkan_device_crop(void *user_context, const struct halide_buffer_t *src, struct halide_buffer_t *dst) { @@ -692,9 +691,9 @@ WEAK int halide_vulkan_device_crop(void *user_context, } WEAK int halide_vulkan_device_slice(void *user_context, - const struct halide_buffer_t *src, - int slice_dim, int slice_pos, - struct halide_buffer_t *dst) { + const struct halide_buffer_t *src, + int slice_dim, int slice_pos, + struct halide_buffer_t *dst) { const int64_t offset = calc_device_slice_byte_offset(src, slice_dim, slice_pos); return vk_device_crop_from_offset(user_context, src, offset, dst); } diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 80b7cc5a0bf9..674eb077af17 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -65,9 +65,9 @@ class VulkanMemoryAllocator { void *map(void *user_context, MemoryRegion *region); void unmap(void *user_context, MemoryRegion *region); - MemoryRegion* create_crop(void *user_context, MemoryRegion *region, uint64_t offset); + MemoryRegion *create_crop(void *user_context, MemoryRegion *region, uint64_t offset); void destroy_crop(void *user_context, MemoryRegion *region); - MemoryRegion* owner_of(void *user_context, MemoryRegion *region); + MemoryRegion *owner_of(void *user_context, MemoryRegion *region); VkDevice current_device() const { return this->device; @@ -203,7 +203,7 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { halide_abort_if_false(user_context, physical_device != nullptr); halide_abort_if_false(user_context, block_allocator != nullptr); - MemoryRegion* owner = owner_of(user_context, region); + MemoryRegion *owner = owner_of(user_context, region); RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { error(nullptr) << "VulkanMemoryAllocator: Unable to map region! Invalid region allocator handle!\n"; @@ -249,7 +249,7 @@ void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); - MemoryRegion* owner = owner_of(user_context, region); + MemoryRegion *owner = owner_of(user_context, region); RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! 
Invalid region allocator handle!\n"; @@ -271,7 +271,7 @@ void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { vkUnmapMemory(device, *device_memory); } -MemoryRegion* VulkanMemoryAllocator::create_crop(void *user_context, MemoryRegion *region, uint64_t offset) { +MemoryRegion *VulkanMemoryAllocator::create_crop(void *user_context, MemoryRegion *region, uint64_t offset) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Cropping region (" << "user_context=" << user_context << " " @@ -285,7 +285,7 @@ MemoryRegion* VulkanMemoryAllocator::create_crop(void *user_context, MemoryRegio halide_abort_if_false(user_context, device != nullptr); halide_abort_if_false(user_context, physical_device != nullptr); - MemoryRegion* owner = owner_of(user_context, region); + MemoryRegion *owner = owner_of(user_context, region); RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid region allocator handle!\n"; @@ -299,22 +299,21 @@ MemoryRegion* VulkanMemoryAllocator::create_crop(void *user_context, MemoryRegio const BlockAllocator::MemoryAllocators &allocators = block_allocator->current_allocators(); halide_abort_if_false(user_context, allocators.system.allocate != nullptr); MemoryRegion *result = reinterpret_cast( - allocators.system.allocate(user_context, sizeof(MemoryRegion)) - ); + allocators.system.allocate(user_context, sizeof(MemoryRegion))); halide_abort_if_false(user_context, result != nullptr); memcpy(result, owner, sizeof(MemoryRegion)); // point the handle to the owner of the allocated region, and update the head offset result->is_owner = false; - result->handle = (void*)owner; + result->handle = (void *)owner; result->range.head_offset = owner->range.head_offset + offset; return result; } void VulkanMemoryAllocator::destroy_crop(void *user_context, MemoryRegion *region) { - - MemoryRegion* owner = owner_of(user_context, region); + + MemoryRegion *owner = owner_of(user_context, region); RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { error(nullptr) << "VulkanMemoryAllocator: Unable to destroy crop region! Invalid region allocator handle!\n"; @@ -330,12 +329,12 @@ void VulkanMemoryAllocator::destroy_crop(void *user_context, MemoryRegion *regio allocators.system.deallocate(user_context, region); } -MemoryRegion* VulkanMemoryAllocator::owner_of(void *user_context, MemoryRegion *region) { - if(region->is_owner) { +MemoryRegion *VulkanMemoryAllocator::owner_of(void *user_context, MemoryRegion *region) { + if (region->is_owner) { return region; } else { // If this is a cropped region, use the handle to retrieve the owner of the allocation - return reinterpret_cast(region->handle); + return reinterpret_cast(region->handle); } } diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index e9393df322d2..50c31d351669 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -50,10 +50,10 @@ VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *alloc VkCommandPoolCreateInfo command_pool_info = { - VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type - nullptr, // pointer to struct extending this - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, // flags. 
Assume transient short-lived single-use command buffers - queue_index // queue family index corresponding to the compute command queue + VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, // flags. Assume transient short-lived single-use command buffers + queue_index // queue family index corresponding to the compute command queue }; return vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); } @@ -902,7 +902,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat void operator()(VulkanCompilationCacheEntry *cache_entry) { if (cache_entry != nullptr) { if (cache_entry->shader_module) { - debug(user_context) << " . destroying shader module " << (void*)cache_entry->shader_module << "\n"; + debug(user_context) << " . destroying shader module " << (void *)cache_entry->shader_module << "\n"; vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); cache_entry->shader_module = {0}; } @@ -936,7 +936,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat cache_entry->descriptor_set_layouts = nullptr; } if (cache_entry->pipeline_layout) { - debug(user_context) << " destroying pipeline layout " << (void*)cache_entry->pipeline_layout << "\n"; + debug(user_context) << " destroying pipeline layout " << (void *)cache_entry->pipeline_layout << "\n"; vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); cache_entry->pipeline_layout = {0}; } @@ -994,9 +994,9 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff } int vk_device_crop_from_offset(void *user_context, - const struct halide_buffer_t *src, - int64_t offset, - struct halide_buffer_t *dst) { + const struct halide_buffer_t *src, + int64_t offset, + struct halide_buffer_t *dst) { VulkanContext ctx(user_context); if (ctx.error != VK_SUCCESS) { @@ -1008,7 +1008,7 @@ int vk_device_crop_from_offset(void *user_context, #endif halide_abort_if_false(user_context, src->device); - if(offset < 0) { + if (offset < 0) { error(user_context) << "Vulkan: Invalid offset for device crop!\n"; return halide_error_code_internal_error; } From 48faeea6575090cf617b2820f27c6504299a3831 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 14 Nov 2022 16:12:15 -0800 Subject: [PATCH 105/166] Implement device copy for Vulkan. Enable device copy test. 
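
Note on the change below: the new halide_vulkan_buffer_copy entry point dispatches on whether the source and destination live on the host or on the device. Host-to-host copies fall back to copy_memory; host-to-device and device-to-host transfers stage through a host-visible region before/after the queued copy; device-to-device copies are recorded via vk_do_multidimensional_copy and apply each region's range.head_offset so cropped regions are copied from the correct place. A minimal standalone sketch of that dispatch follows (illustrative only; the enum and helper are not code from this patch):

    #include <cstdio>

    enum class CopyPath { HostToHost, HostToDevice, DeviceToHost, DeviceToDevice };

    // from_host/to_host mirror the flags computed in halide_vulkan_buffer_copy
    // from the device interfaces and dirty bits of src/dst.
    CopyPath select_copy_path(bool from_host, bool to_host) {
        if (from_host && to_host) return CopyPath::HostToHost;  // plain copy_memory on the CPU
        if (from_host) return CopyPath::HostToDevice;            // stage on host, then queued copy
        if (to_host) return CopyPath::DeviceToHost;              // queued copy, then copy out of staging
        return CopyPath::DeviceToDevice;                         // direct queued buffer-to-buffer copy
    }

    int main() {
        printf("path=%d\n", static_cast<int>(select_copy_path(false, false)));
        return 0;
    }
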
--- src/runtime/vulkan.cpp | 291 +++++++++++++++++++++++- src/runtime/vulkan_internal.h | 3 +- src/runtime/vulkan_resources.h | 42 ++-- test/correctness/device_buffer_copy.cpp | 5 - 4 files changed, 316 insertions(+), 25 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 7b07e8714c94..985ecd6a2107 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -473,12 +473,19 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid return result; } - // enqueue the copy operation, using the allocated buffers + // define the src and dst config + bool from_host = true; + bool to_host = false; copy_helper.src = (uint64_t)(staging_buffer); copy_helper.dst = (uint64_t)(device_buffer); uint64_t src_offset = copy_helper.src_begin; uint64_t dst_offset = device_region->range.head_offset; - vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, src_offset, dst_offset, halide_buffer->dimensions); + + // enqueue the copy operation, using the allocated buffers + vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + halide_buffer->dimensions, + from_host, to_host); // end the command buffer result = vkEndCommandBuffer(command_buffer); @@ -612,13 +619,20 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ return result; } - // enqueue the copy operation, using the allocated buffers + // define the src and dst config + bool from_host = false; + bool to_host = true; uint64_t copy_dst = copy_helper.dst; copy_helper.src = (uint64_t)(device_buffer); copy_helper.dst = (uint64_t)(staging_buffer); uint64_t src_offset = copy_helper.src_begin + device_region->range.head_offset; uint64_t dst_offset = 0; - vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, src_offset, dst_offset, halide_buffer->dimensions); + + // enqueue the copy operation, using the allocated buffers + vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + halide_buffer->dimensions, + from_host, to_host); // end the command buffer result = vkEndCommandBuffer(command_buffer); @@ -683,6 +697,273 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ return 0; } +WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *src, + const struct halide_device_interface_t *dst_device_interface, + struct halide_buffer_t *dst) { + if (dst->dimensions > MAX_COPY_DIMS) { + error(user_context) << "Buffer has too many dimensions to copy to/from GPU\n"; + return halide_error_code_device_buffer_copy_failed; + } + + // We only handle copies to Vulkan buffers or to host + if (dst_device_interface != nullptr && dst_device_interface != &vulkan_device_interface) { + error(user_context) << "halide_vulkan_buffer_copy: only handle copies to metal buffers or to host\n"; + return halide_error_code_device_buffer_copy_failed; + } + + if ((src->device_dirty() || src->host == nullptr) && src->device_interface != &vulkan_device_interface) { + halide_debug_assert(user_context, dst_device_interface == &vulkan_device_interface); + // This is handled at the higher level. 
+ return halide_error_code_incompatible_device_interface; + } + + bool from_host = (src->device_interface != &vulkan_device_interface) || + (src->device == 0) || + (src->host_dirty() && src->host != nullptr); + bool to_host = !dst_device_interface; + + if (!(from_host || src->device)) { + error(user_context) << "halide_vulkan_buffer_copy: invalid copy source\n"; + return halide_error_code_device_buffer_copy_failed; + } + if (!(to_host || dst->device)) { + error(user_context) << "halide_vulkan_buffer_copy: invalid copy destination\n"; + return halide_error_code_device_buffer_copy_failed; + } + + device_copy copy_helper = make_buffer_copy(src, from_host, dst, to_host); + + int err = 0; + { + VulkanContext ctx(user_context); + if (ctx.error != VK_SUCCESS) { + return ctx.error; + } + + debug(user_context) + << "halide_vulkan_buffer_copy (user_context: " << user_context + << ", src: " << src << ", dst: " << dst << ")\n"; + +#ifdef DEBUG_RUNTIME + uint64_t t_before = halide_current_time_ns(user_context); +#endif + MemoryRegion *staging_region = nullptr; + MemoryRegion *src_buffer_region = nullptr; + MemoryRegion *dst_buffer_region = nullptr; + + //// wait until the queue is done with the command buffer + VkResult wait_result = vkQueueWaitIdle(ctx.queue); + if (wait_result != VK_SUCCESS) { + error(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(wait_result) << "\n"; + return wait_result; + } + + if (!from_host && !to_host) { + // Device only case + debug(user_context) << " buffer copy from: device to: device\n"; + + // get the buffer regions for the device + src_buffer_region = reinterpret_cast(src->device); + dst_buffer_region = reinterpret_cast(dst->device); + + } else if (!from_host && to_host) { + // Device to Host + debug(user_context) << " buffer copy from: device to: host\n"; + + // Need to make sure all reads and writes to/from source are complete. + MemoryRequest request = {0}; + request.size = src->size_in_bytes(); + request.properties.usage = MemoryUsage::TransferSrc; + request.properties.caching = MemoryCaching::UncachedCoherent; + request.properties.visibility = MemoryVisibility::DeviceToHost; + + // allocate a new region + staging_region = ctx.allocator->reserve(user_context, request); + if ((staging_region == nullptr) || (staging_region->handle == nullptr)) { + error(user_context) << "Vulkan: Failed to allocate device memory!\n"; + return -1; + } + + // use the staging region and buffer from the copy destination + src_buffer_region = reinterpret_cast(src->device); + dst_buffer_region = staging_region; + + } else if (from_host && !to_host) { + // Host to Device + debug(user_context) << " buffer copy from: host to: device\n"; + + // Need to make sure all reads and writes to/from destination are complete. 
+ MemoryRequest request = {0}; + request.size = src->size_in_bytes(); + request.properties.usage = MemoryUsage::TransferSrc; + request.properties.caching = MemoryCaching::UncachedCoherent; + request.properties.visibility = MemoryVisibility::HostToDevice; + + // allocate a new region + staging_region = ctx.allocator->reserve(user_context, request); + if ((staging_region == nullptr) || (staging_region->handle == nullptr)) { + error(user_context) << "Vulkan: Failed to allocate device memory!\n"; + return -1; + } + + // map the region to a host ptr + uint8_t *stage_host_ptr = (uint8_t *)ctx.allocator->map(user_context, staging_region); + if (stage_host_ptr == nullptr) { + error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; + return halide_error_code_internal_error; + } + + // copy to the (host-visible/coherent) staging buffer, then restore the dst pointer + uint64_t copy_dst_ptr = copy_helper.dst; + copy_helper.dst = (uint64_t)(stage_host_ptr); + copy_memory(copy_helper, user_context); + copy_helper.dst = copy_dst_ptr; + + // unmap the pointer + ctx.allocator->unmap(user_context, staging_region); + + // use the staging region and buffer from the copy source + src_buffer_region = staging_region; + dst_buffer_region = reinterpret_cast(dst->device); + + } else if (from_host && to_host) { + debug(user_context) << " buffer copy from: host to: host\n"; + copy_memory(copy_helper, user_context); + return 0; + } + + if (src_buffer_region == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve source buffer for device memory!\n"; + return halide_error_code_internal_error; + } + + if (dst_buffer_region == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve destination buffer for device memory!\n"; + return halide_error_code_internal_error; + } + + // get the owning memory region (that holds the allocation) + MemoryRegion *src_memory_region = ctx.allocator->owner_of(user_context, src_buffer_region); + MemoryRegion *dst_memory_region = ctx.allocator->owner_of(user_context, dst_buffer_region); + + // retrieve the buffers from the owning allocation region + VkBuffer *src_device_buffer = reinterpret_cast(src_memory_region->handle); + VkBuffer *dst_device_buffer = reinterpret_cast(dst_memory_region->handle); + + // create a command buffer + VkCommandBuffer command_buffer; + VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); + if (result != VK_SUCCESS) { + error(user_context) << "vk_create_command_buffer returned: " << vk_get_error_name(result) << "\n"; + return -1; + } + + // begin the command buffer + VkCommandBufferBeginInfo command_buffer_begin_info = + { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags + nullptr // pointer to parent command buffer + }; + + result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + if (result != VK_SUCCESS) { + error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + // define the src and dst config + uint64_t copy_dst = copy_helper.dst; + copy_helper.src = (uint64_t)(src_device_buffer); + copy_helper.dst = (uint64_t)(dst_device_buffer); + uint64_t src_offset = copy_helper.src_begin + src_buffer_region->range.head_offset; + uint64_t dst_offset = dst_buffer_region->range.head_offset; + if(!from_host && !to_host) { + src_offset = src_buffer_region->range.head_offset; + 
dst_offset = dst_buffer_region->range.head_offset; + } + + debug(user_context) << " src region=" << (void*)src_memory_region << " buffer=" << (void*)src_device_buffer << " crop_offset=" << (uint64_t)src_buffer_region->range.head_offset << " copy_offset=" << src_offset << "\n"; + debug(user_context) << " dst region=" << (void*)dst_memory_region << " buffer=" << (void*)dst_device_buffer << " crop_offset=" << (uint64_t)dst_buffer_region->range.head_offset << " copy_offset=" << dst_offset << "\n"; + + // enqueue the copy operation, using the allocated buffers + vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + src->dimensions, + from_host, to_host); + + // end the command buffer + result = vkEndCommandBuffer(command_buffer); + if (result != VK_SUCCESS) { + error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return result; + } + + //// submit the command buffer to our command queue + VkSubmitInfo submit_info = + { + VK_STRUCTURE_TYPE_SUBMIT_INFO, // struct type + nullptr, // pointer to struct extending this + 0, // wait semaphore count + nullptr, // semaphores + nullptr, // pipeline stages where semaphore waits occur + 1, // how many command buffers to execute + &command_buffer, // the command buffers + 0, // number of semaphores to signal + nullptr // the semaphores to signal + }; + + result = vkQueueSubmit(ctx.queue, 1, &submit_info, 0); + if (result != VK_SUCCESS) { + error(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return result; + } + + //// wait until the queue is done with the command buffer + result = vkQueueWaitIdle(ctx.queue); + if (result != VK_SUCCESS) { + error(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return result; + } + + if (!from_host && to_host) { + // map the staging region to a host ptr + uint8_t *stage_host_ptr = (uint8_t *)ctx.allocator->map(user_context, staging_region); + if (stage_host_ptr == nullptr) { + error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; + return halide_error_code_internal_error; + } + + // copy to the (host-visible/coherent) staging buffer + copy_helper.dst = copy_dst; + copy_helper.src = (uint64_t)(stage_host_ptr); + copy_memory(copy_helper, user_context); + + // unmap the pointer and reclaim the staging region + ctx.allocator->unmap(user_context, staging_region); + + } + + if(staging_region) { + if (halide_can_reuse_device_allocations(user_context)) { + ctx.allocator->release(user_context, staging_region); + } else { + ctx.allocator->reclaim(user_context, staging_region); + } + } + + vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + +#ifdef DEBUG_RUNTIME + uint64_t t_after = halide_current_time_ns(user_context); + debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; +#endif + } + + return err; +} + WEAK int halide_vulkan_device_crop(void *user_context, const struct halide_buffer_t *src, struct halide_buffer_t *dst) { @@ -1058,7 +1339,7 @@ WEAK halide_device_interface_impl_t vulkan_device_interface_impl = { halide_vulkan_copy_to_device, halide_vulkan_device_and_host_malloc, halide_vulkan_device_and_host_free, - halide_default_buffer_copy, + halide_vulkan_buffer_copy, halide_vulkan_device_crop, halide_vulkan_device_slice, halide_vulkan_device_release_crop, diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 6b312ee01d7d..07107869333e 100644 --- 
a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -203,7 +203,8 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat // -- Copy Buffer int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buffer, - const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d); + const device_copy &c, uint64_t src_offset, uint64_t dst_offset, + int d, bool from_host, bool to_host); // -------------------------------------------------------------------------- // Errors diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 50c31d351669..c5f630196c9d 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -959,30 +959,44 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat // -------------------------------------------------------------------------- int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buffer, - const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d) { + const device_copy &c, uint64_t src_offset, uint64_t dst_offset, + int d, bool from_host, bool to_host) { if (d == 0) { - VkBufferCopy buffer_copy = { - c.src_begin + src_offset, // srcOffset - dst_offset, // dstOffset - c.chunk_size // size - }; + if((!from_host && to_host) || + (from_host && !to_host) || + (!from_host && !to_host) ) { - VkBuffer *src_buffer = reinterpret_cast(c.src); - VkBuffer *dst_buffer = reinterpret_cast(c.dst); - if (!src_buffer || !dst_buffer) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; - return -1; - } + VkBufferCopy buffer_copy = { + c.src_begin + src_offset, // srcOffset + dst_offset, // dstOffset + c.chunk_size // size + }; - vkCmdCopyBuffer(command_buffer, *src_buffer, *dst_buffer, 1, &buffer_copy); + VkBuffer *src_buffer = reinterpret_cast(c.src); + VkBuffer *dst_buffer = reinterpret_cast(c.dst); + if (!src_buffer || !dst_buffer) { + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + return -1; + } + vkCmdCopyBuffer(command_buffer, *src_buffer, *dst_buffer, 1, &buffer_copy); + + } else if ((c.dst + dst_offset) != (c.src + src_offset)) { + // Could reach here if a user called directly into the + // Vulkan API for a device->host copy on a source buffer + // with device_dirty = false. + memcpy((void *)(c.dst + dst_offset), (void *)(c.src + src_offset), c.chunk_size); + } } else { // TODO: deal with negative strides. Currently the code in // device_buffer_utils.h does not do so either. 
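        // For d > 0, walk the outermost extent and recurse with the offsets advanced by the per-dimension strides;
        // the d == 0 base case issues vkCmdCopyBuffer (or a plain memcpy when both src and dst are host pointers).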
uint64_t src_off = 0, dst_off = 0; for (uint64_t i = 0; i < c.extent[d - 1]; i++) { - int err = vk_do_multidimensional_copy(user_context, command_buffer, c, src_offset + src_off, dst_offset + dst_off, d - 1); + int err = vk_do_multidimensional_copy(user_context, command_buffer, c, + src_offset + src_off, + dst_offset + dst_off, + d - 1, from_host, to_host); dst_off += c.dst_stride_bytes[d - 1]; src_off += c.src_stride_bytes[d - 1]; if (err) { diff --git a/test/correctness/device_buffer_copy.cpp b/test/correctness/device_buffer_copy.cpp index 596590486d04..9179ac83cd24 100644 --- a/test/correctness/device_buffer_copy.cpp +++ b/test/correctness/device_buffer_copy.cpp @@ -37,11 +37,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::Vulkan)) { - printf("[SKIP] Skipping test for Vulkan, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test copy to device.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); From 17d54768dcce647b63d73ccf19d9921a9d7316d7 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 15 Nov 2022 09:24:37 -0800 Subject: [PATCH 106/166] Clang format/tidy pass --- src/runtime/vulkan.cpp | 43 +++++++++++++++++----------------- src/runtime/vulkan_internal.h | 2 +- src/runtime/vulkan_resources.h | 16 ++++++------- 3 files changed, 30 insertions(+), 31 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 985ecd6a2107..3398cf414505 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -482,9 +482,9 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid uint64_t dst_offset = device_region->range.head_offset; // enqueue the copy operation, using the allocated buffers - vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, - src_offset, dst_offset, - halide_buffer->dimensions, + vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + halide_buffer->dimensions, from_host, to_host); // end the command buffer @@ -629,9 +629,9 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ uint64_t dst_offset = 0; // enqueue the copy operation, using the allocated buffers - vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, - src_offset, dst_offset, - halide_buffer->dimensions, + vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + halide_buffer->dimensions, from_host, to_host); // end the command buffer @@ -698,8 +698,8 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ } WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *src, - const struct halide_device_interface_t *dst_device_interface, - struct halide_buffer_t *dst) { + const struct halide_device_interface_t *dst_device_interface, + struct halide_buffer_t *dst) { if (dst->dimensions > MAX_COPY_DIMS) { error(user_context) << "Buffer has too many dimensions to copy to/from GPU\n"; return halide_error_code_device_buffer_copy_failed; @@ -750,7 +750,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s MemoryRegion *staging_region = nullptr; MemoryRegion *src_buffer_region = nullptr; MemoryRegion *dst_buffer_region = nullptr; - + //// wait until the queue is done with the command buffer VkResult wait_result = vkQueueWaitIdle(ctx.queue); if (wait_result != VK_SUCCESS) { @@ -767,7 +767,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, 
struct halide_buffer_t *s dst_buffer_region = reinterpret_cast(dst->device); } else if (!from_host && to_host) { - // Device to Host + // Device to Host debug(user_context) << " buffer copy from: device to: host\n"; // Need to make sure all reads and writes to/from source are complete. @@ -789,7 +789,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s dst_buffer_region = staging_region; } else if (from_host && !to_host) { - // Host to Device + // Host to Device debug(user_context) << " buffer copy from: host to: device\n"; // Need to make sure all reads and writes to/from destination are complete. @@ -818,7 +818,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s copy_helper.dst = (uint64_t)(stage_host_ptr); copy_memory(copy_helper, user_context); copy_helper.dst = copy_dst_ptr; - + // unmap the pointer ctx.allocator->unmap(user_context, staging_region); @@ -831,7 +831,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s copy_memory(copy_helper, user_context); return 0; } - + if (src_buffer_region == nullptr) { error(user_context) << "Vulkan: Failed to retrieve source buffer for device memory!\n"; return halide_error_code_internal_error; @@ -879,18 +879,18 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s copy_helper.dst = (uint64_t)(dst_device_buffer); uint64_t src_offset = copy_helper.src_begin + src_buffer_region->range.head_offset; uint64_t dst_offset = dst_buffer_region->range.head_offset; - if(!from_host && !to_host) { + if (!from_host && !to_host) { src_offset = src_buffer_region->range.head_offset; dst_offset = dst_buffer_region->range.head_offset; } - debug(user_context) << " src region=" << (void*)src_memory_region << " buffer=" << (void*)src_device_buffer << " crop_offset=" << (uint64_t)src_buffer_region->range.head_offset << " copy_offset=" << src_offset << "\n"; - debug(user_context) << " dst region=" << (void*)dst_memory_region << " buffer=" << (void*)dst_device_buffer << " crop_offset=" << (uint64_t)dst_buffer_region->range.head_offset << " copy_offset=" << dst_offset << "\n"; + debug(user_context) << " src region=" << (void *)src_memory_region << " buffer=" << (void *)src_device_buffer << " crop_offset=" << (uint64_t)src_buffer_region->range.head_offset << " copy_offset=" << src_offset << "\n"; + debug(user_context) << " dst region=" << (void *)dst_memory_region << " buffer=" << (void *)dst_device_buffer << " crop_offset=" << (uint64_t)dst_buffer_region->range.head_offset << " copy_offset=" << dst_offset << "\n"; // enqueue the copy operation, using the allocated buffers - vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, - src_offset, dst_offset, - src->dimensions, + vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + src->dimensions, from_host, to_host); // end the command buffer @@ -942,10 +942,9 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s // unmap the pointer and reclaim the staging region ctx.allocator->unmap(user_context, staging_region); - } - if(staging_region) { + if (staging_region) { if (halide_can_reuse_device_allocations(user_context)) { ctx.allocator->release(user_context, staging_region); } else { @@ -954,7 +953,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s } vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); - + #ifdef DEBUG_RUNTIME uint64_t 
t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 07107869333e..66879d377792 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -203,7 +203,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat // -- Copy Buffer int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buffer, - const device_copy &c, uint64_t src_offset, uint64_t dst_offset, + const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d, bool from_host, bool to_host); // -------------------------------------------------------------------------- diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index c5f630196c9d..57def976ca1d 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -959,13 +959,13 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat // -------------------------------------------------------------------------- int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buffer, - const device_copy &c, uint64_t src_offset, uint64_t dst_offset, + const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d, bool from_host, bool to_host) { if (d == 0) { - if((!from_host && to_host) || - (from_host && !to_host) || - (!from_host && !to_host) ) { + if ((!from_host && to_host) || + (from_host && !to_host) || + (!from_host && !to_host)) { VkBufferCopy buffer_copy = { c.src_begin + src_offset, // srcOffset @@ -981,7 +981,7 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff } vkCmdCopyBuffer(command_buffer, *src_buffer, *dst_buffer, 1, &buffer_copy); - + } else if ((c.dst + dst_offset) != (c.src + src_offset)) { // Could reach here if a user called directly into the // Vulkan API for a device->host copy on a source buffer @@ -993,9 +993,9 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff // device_buffer_utils.h does not do so either. 
uint64_t src_off = 0, dst_off = 0; for (uint64_t i = 0; i < c.extent[d - 1]; i++) { - int err = vk_do_multidimensional_copy(user_context, command_buffer, c, - src_offset + src_off, - dst_offset + dst_off, + int err = vk_do_multidimensional_copy(user_context, command_buffer, c, + src_offset + src_off, + dst_offset + dst_off, d - 1, from_host, to_host); dst_off += c.dst_stride_bytes[d - 1]; src_off += c.src_stride_bytes[d - 1]; From cea209dcfe594d7fb592a2aa53078a581a1ab11e Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 15 Nov 2022 13:02:59 -0800 Subject: [PATCH 107/166] Fix signed mod operator and use euclidean identity (just like glsl) --- src/CodeGen_Vulkan_Dev.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 14b50b2f8a03..a72c0e5110df 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -6,6 +6,7 @@ #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Vulkan_Dev.h" +#include "CSE.h" #include "Debug.h" #include "Deinterleave.h" #include "FindIntrinsics.h" @@ -682,12 +683,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { SpvId result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::binary_op(SpvOpBitwiseAnd, type_id, result_id, src_a_id, bitwise_value_id)); builder.update_id(result_id); - } else if (op->type.is_int()) { - Expr e = lower_euclidean_mod(op->a, op->b); - e.accept(this); - } else if (op->type.is_uint()) { - visit_binary_op(SpvOpUMod, op->type, op->a, op->b); - } else if (op->type.is_float()) { + } else if (op->type.is_int() || op->type.is_uint()) { + // Just exploit the Euclidean identity + Expr zero = make_zero(op->type); + Expr equiv = select(op->a == zero, zero, + op->a - (op->a / op->b) * op->b); + equiv = common_subexpression_elimination(equiv); + equiv.accept(this); + } else if (op->type.is_float()) { // SPIR-V FMod is strangely not what we want .. 
FRem does what we need visit_binary_op(SpvOpFRem, op->type, op->a, op->b); } else { @@ -1027,7 +1030,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (op->is_intrinsic(Call::shift_right)) { internal_assert(op->args.size() == 2); - if (op->type.is_uint()) { + if (op->type.is_uint() || (op->args[1].type().is_uint())) { visit_binary_op(SpvOpShiftRightLogical, op->type, op->args[0], op->args[1]); } else { Expr e = lower_signed_shift_right(op->args[0], op->args[1]); @@ -1035,7 +1038,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } } else if (op->is_intrinsic(Call::shift_left)) { internal_assert(op->args.size() == 2); - if (op->type.is_uint()) { + if (op->type.is_uint() || (op->args[1].type().is_uint())) { visit_binary_op(SpvOpShiftLeftLogical, op->type, op->args[0], op->args[1]); } else { Expr e = lower_signed_shift_left(op->args[0], op->args[1]); From 666f3d7620a55906e0d70398f55994c135732886 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 15 Nov 2022 13:06:48 -0800 Subject: [PATCH 108/166] Clang format/tidy pass --- src/CodeGen_Vulkan_Dev.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index a72c0e5110df..cbab37697793 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -3,10 +3,10 @@ #include #include +#include "CSE.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Vulkan_Dev.h" -#include "CSE.h" #include "Debug.h" #include "Deinterleave.h" #include "FindIntrinsics.h" @@ -683,14 +683,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { SpvId result_id = builder.reserve_id(SpvResultId); builder.append(SpvFactory::binary_op(SpvOpBitwiseAnd, type_id, result_id, src_a_id, bitwise_value_id)); builder.update_id(result_id); - } else if (op->type.is_int() || op->type.is_uint()) { + } else if (op->type.is_int() || op->type.is_uint()) { // Just exploit the Euclidean identity Expr zero = make_zero(op->type); Expr equiv = select(op->a == zero, zero, op->a - (op->a / op->b) * op->b); equiv = common_subexpression_elimination(equiv); equiv.accept(this); - } else if (op->type.is_float()) { + } else if (op->type.is_float()) { // SPIR-V FMod is strangely not what we want .. 
FRem does what we need visit_binary_op(SpvOpFRem, op->type, op->a, op->b); } else { From 029b56bfdac07dd6d5e3a5d458872e4b6f9d79ea Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 15 Nov 2022 16:32:19 -0800 Subject: [PATCH 109/166] Fix to handle Mod on vectors (use vector constant for bitwise and) --- src/CodeGen_Vulkan_Dev.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index cbab37697793..e057cbf41992 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -675,13 +675,17 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Mod *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(Mod): " << op->type << " ((" << op->a << ") % (" << op->b << "))\n"; int bits = 0; if (is_const_power_of_two_integer(op->b, &bits) && op->type.is_int_or_uint()) { - int bitwise_value = ((1 << bits) - 1); - SpvId bitwise_value_id = builder.declare_integer_constant(op->type.with_lanes(1), (int64_t)bitwise_value); - SpvId type_id = builder.declare_type(op->type); op->a.accept(this); SpvId src_a_id = builder.current_id(); + + int bitwise_value = ((1 << bits) - 1); + Expr expr = make_const(op->type, bitwise_value); + expr.accept(this); + SpvId src_b_id = builder.current_id(); + + SpvId type_id = builder.declare_type(op->type); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::binary_op(SpvOpBitwiseAnd, type_id, result_id, src_a_id, bitwise_value_id)); + builder.append(SpvFactory::binary_op(SpvOpBitwiseAnd, type_id, result_id, src_a_id, src_b_id)); builder.update_id(result_id); } else if (op->type.is_int() || op->type.is_uint()) { // Just exploit the Euclidean identity From 2ce84a65d9c0e9035df396cb02df84322e162708 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 16 Nov 2022 13:51:54 -0800 Subject: [PATCH 110/166] Fix pow operator for Vulkan, and re-enable math test to full range. 
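
Instead of emitting a hand-rolled chain of SPIR-V abs/negate/select instructions, the pow(x, y) intrinsic is now lowered to a Halide expression built around exp(log(|x|) * y), and CSE plus the existing visitors generate the SPIR-V. A minimal scalar sketch of the intended semantics (reference only, not part of the patch; the helper name is hypothetical):

```
#include <cmath>
#include <limits>

// Mirrors the select chain used by the new pow lowering.
static float pow_reference(float x, float y) {
    float abs_x_pow_y = std::exp(std::log(std::fabs(x)) * y);
    float iy = std::floor(y);
    if (x > 0.0f) return abs_x_pow_y;                             // strictly positive x
    if (y == 0.0f) return 1.0f;                                   // x^0 == 1
    if (x == 0.0f) return 0.0f;                                   // 0^y == 0
    if (y != iy) return std::numeric_limits<float>::quiet_NaN();  // negative x to a non-integer power
    if (std::fmod(iy, 2.0f) == 0.0f) return abs_x_pow_y;          // negative x to an even power
    return -abs_x_pow_y;                                          // negative x to an odd power
}
```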
--- src/CodeGen_Vulkan_Dev.cpp | 75 ++++++++------------------------------ test/correctness/math.cpp | 3 +- 2 files changed, 17 insertions(+), 61 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index e057cbf41992..7f19a8870044 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1104,69 +1104,26 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { internal_error << "Unhandled intrinsic in Vulkan backend: " << op->name << "\n"; } - } else if (starts_with(op->name, "pow_f")) { + } else if (op->call_type == Call::PureExtern && starts_with(op->name, "pow_f")) { internal_assert(op->args.size() == 2); if (can_prove(op->args[0] > 0)) { visit_glsl_op(GLSLstd450Pow, op->type, op->args); } else { - visit_glsl_op(GLSLstd450Pow, op->type, op->args); - SpvId type_id = builder.declare_type(op->type); - SpvId inst_set_id = builder.import_glsl_intrinsics(); - - Expr a = op->args[0]; - a->accept(this); - SpvId src_a_id = builder.current_id(); - - Expr b = op->args[1]; - b->accept(this); - SpvId src_b_id = builder.current_id(); - - SpvId abs_a_id = builder.reserve_id(SpvResultId); - SpvFactory::Operands abs_operands = {src_a_id}; - builder.append(SpvFactory::extended(inst_set_id, GLSLstd450FAbs, type_id, abs_a_id, abs_operands)); - - SpvFactory::Operands pow_operands = {abs_a_id, src_b_id}; - SpvId pow_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::extended(inst_set_id, GLSLstd450Pow, type_id, pow_id, pow_operands)); - builder.update_id(pow_id); - - // a > 0 - SpvId zero_id = builder.declare_float_constant(op->type, 0.0); - SpvId a_gt_zero_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::binary_op(SpvOpFOrdGreaterThan, type_id, a_gt_zero_id, src_a_id, zero_id)); - - // b % 2 - SpvId two_id = builder.declare_float_constant(op->type, 2.0); - SpvId b_mod_two_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::binary_op(SpvOpFRem, type_id, b_mod_two_id, src_b_id, two_id)); - - // b % 2 == 1 - SpvId one_id = builder.declare_float_constant(op->type, 1.0); - SpvId b_mod_two_is_one_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::binary_op(SpvOpFOrdEqual, type_id, b_mod_two_is_one_id, b_mod_two_id, one_id)); - - // b % 2 == 0 - SpvId b_mod_two_is_zero_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::binary_op(SpvOpFOrdEqual, type_id, b_mod_two_is_zero_id, b_mod_two_id, zero_id)); - - // -pow - SpvId neg_pow_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::unary_op(SpvOpFNegate, type_id, neg_pow_id, pow_id)); - - // a_var > 0 || b_var % 2 == 0 - SpvId bool_type_id = builder.declare_type(Bool()); - SpvId a_gt_zero_or_b_mod_two_is_zero_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::binary_op(SpvOpLogicalOr, bool_type_id, a_gt_zero_or_b_mod_two_is_zero_id, a_gt_zero_id, b_mod_two_is_zero_id)); - - // select(b_var % 2 == 1, -c_var, zero) - SpvId nan_id = builder.declare_float_constant(op->type, 0.0); - SpvId neg_pow_or_zero_result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::select(type_id, neg_pow_or_zero_result_id, b_mod_two_is_one_id, neg_pow_id, nan_id)); - - // select(a_var > 0 || b_var % 2 == 0, pow_id, neg_pow_or_zero_result_id) - SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::select(type_id, result_id, a_gt_zero_or_b_mod_two_is_zero_id, pow_id, neg_pow_or_zero_result_id)); - builder.update_id(result_id); + Expr x = op->args[0]; + Expr y = op->args[1]; + 
Halide::Expr abs_x_pow_y = Internal::halide_exp(Internal::halide_log(abs(x)) * y); + Halide::Expr nan_expr = Call::make(x.type(), "nan_f32", {}, Call::PureExtern); + Expr iy = floor(y); + Expr one = make_one(x.type()); + Expr zero = make_zero(x.type()); + Expr e = select(x > 0, abs_x_pow_y, // Strictly positive x + y == 0.0f, one, // x^0 == 1 + x == 0.0f, zero, // 0^y == 0 + y != iy, nan_expr, // negative x to a non-integer power + iy % 2 == 0, abs_x_pow_y, // negative x to an even power + -abs_x_pow_y); // negative x to an odd power + e = common_subexpression_elimination(e); + e.accept(this); } } else if (starts_with(op->name, "fast_inverse_f")) { internal_assert(op->args.size() == 1); diff --git a/test/correctness/math.cpp b/test/correctness/math.cpp index 9c54ef859379..1315de6cc9c9 100644 --- a/test/correctness/math.cpp +++ b/test/correctness/math.cpp @@ -290,8 +290,7 @@ int main(int argc, char **argv) { call_1_float_types(ceil, 256, -25, 25); call_1_float_types(trunc, 256, -25, 25); - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute) || - get_jit_target_from_environment().has_feature(Target::Vulkan)) { + if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { // GLSL isn't required to support NaN, so keep things real call_2_float_types(pow, 256, 0.0, 10.0, -4.0f, 4.0f); } else { From fd1d4d1a35ecf1c8f11a3a6384f577f5a735ae4f Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 16 Nov 2022 14:45:08 -0800 Subject: [PATCH 111/166] Add error checking for return types for conditionals Use bool types for ops that require them, and adapt to expected return types --- src/CodeGen_Vulkan_Dev.cpp | 49 ++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 7f19a8870044..5aa030a4ab38 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -774,9 +774,18 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Min *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(EQ): " << op->type << " (" << op->a << ") == (" << op->b << ")\n"; - visit_binary_op(op->type.is_float() ? SpvOpFOrdEqual : SpvOpIEqual, op->type, op->a, op->b); + if (op->a.type() != op->b.type()) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op): Mismatched operand types: " << op->a.type() << " != " << op->b.type() << "\n"; + } + SpvOp op_code = SpvOpNop; + if (op->a.type().is_float()) { + op_code = SpvOpFOrdEqual; + } else { + op_code = SpvOpIEqual; + } + Type bool_type = UInt(1, op->type.lanes()); + visit_binary_op(op_code, bool_type, op->a, op->b); if (!op->type.is_bool()) { - Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); SpvId result_id = cast_type(op->type, bool_type, current_id); builder.update_id(result_id); @@ -785,7 +794,17 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const EQ *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const NE *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(NE): " << op->type << " (" << op->a << ") != (" << op->b << ")\n"; - visit_binary_op(op->type.is_float() ? 
SpvOpFOrdNotEqual : SpvOpINotEqual, op->type, op->a, op->b); + if (op->a.type() != op->b.type()) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const NE *op): Mismatched operand types: " << op->a.type() << " != " << op->b.type() << "\n"; + } + SpvOp op_code = SpvOpNop; + if (op->a.type().is_float()) { + op_code = SpvOpFOrdNotEqual; + } else { + op_code = SpvOpINotEqual; + } + Type bool_type = UInt(1, op->type.lanes()); + visit_binary_op(op_code, bool_type, op->a, op->b); if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); @@ -796,6 +815,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const NE *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LT): " << op->type << " (" << op->a << ") < (" << op->b << ")\n"; + if (op->a.type() != op->b.type()) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op): Mismatched operand types: " << op->a.type() << " != " << op->b.type() << "\n"; + } SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { op_code = SpvOpFOrdLessThan; @@ -806,7 +828,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op): unhandled type: " << op->a.type() << "\n"; } - visit_binary_op(op_code, op->type, op->a, op->b); + Type bool_type = UInt(1, op->type.lanes()); + visit_binary_op(op_code, bool_type, op->a, op->b); if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); @@ -817,6 +840,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LT *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(LE): " << op->type << " (" << op->a << ") <= (" << op->b << ")\n"; + if (op->a.type() != op->b.type()) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op): Mismatched operand types: " << op->a.type() << " != " << op->b.type() << "\n"; + } SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { op_code = SpvOpFOrdLessThanEqual; @@ -827,7 +853,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op): unhandled type: " << op->a.type() << "\n"; } - visit_binary_op(op_code, op->type, op->a, op->b); + Type bool_type = UInt(1, op->type.lanes()); + visit_binary_op(op_code, bool_type, op->a, op->b); if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); @@ -838,6 +865,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(GT): " << op->type << " (" << op->a << ") > (" << op->b << ")\n"; + if (op->a.type() != op->b.type()) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op): Mismatched operand types: " << op->a.type() << " != " << op->b.type() << "\n"; + } SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { op_code = SpvOpFOrdGreaterThan; @@ -848,7 +878,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op): unhandled type: " << op->a.type() << "\n"; } - visit_binary_op(op_code, op->type, op->a, op->b); + Type bool_type = UInt(1, op->type.lanes()); + visit_binary_op(op_code, bool_type, 
op->a, op->b); if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); @@ -859,6 +890,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GT *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(GE): " << op->type << " (" << op->a << ") >= (" << op->b << ")\n"; + if (op->a.type() != op->b.type()) { + internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const LE *op): Mismatched operand types: " << op->a.type() << " != " << op->b.type() << "\n"; + } SpvOp op_code = SpvOpNop; if (op->a.type().is_float()) { op_code = SpvOpFOrdGreaterThanEqual; @@ -869,7 +903,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op) { } else { internal_error << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const GE *op): unhandled type: " << op->a.type() << "\n"; } - visit_binary_op(op_code, op->type, op->a, op->b); + Type bool_type = UInt(1, op->type.lanes()); + visit_binary_op(op_code, bool_type, op->a, op->b); if (!op->type.is_bool()) { Type bool_type = UInt(1, op->type.lanes()); SpvId current_id = builder.current_id(); From ddab953dcac09a3d07cc25decb7f405451a28d5b Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 17 Nov 2022 14:34:08 -0800 Subject: [PATCH 112/166] Handle deallocation for existing regions prior to coalescing. Cleanup region allocator logic for availability. Augment block_allocator test to cover allocation reuse. --- src/runtime/internal/region_allocator.h | 77 ++++++++++++++++++------- test/runtime/block_allocator.cpp | 53 ++++++++++++++++- 2 files changed, 109 insertions(+), 21 deletions(-) diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index ff645eeda467..a811dd152001 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -64,6 +64,9 @@ class RegionAllocator { // Search through allocated block regions (Best-Fit) BlockRegion *find_block_region(void *user_context, const MemoryRequest &request); + // Returns true if block region is unused and available + bool is_available(BlockRegion *region); + // Returns true if neighbouring block regions to the given region can be coalesced into one bool can_coalesce(BlockRegion *region); @@ -253,21 +256,51 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory return result; } -bool RegionAllocator::can_coalesce(BlockRegion *block_region) { +bool RegionAllocator::is_available(BlockRegion *block_region) { if (block_region == nullptr) { return false; } - if (block_region->prev_ptr && (block_region->prev_ptr->status == AllocationStatus::Available)) { + if (block_region->usage_count > 0) { + return false; + } + if (block_region->status != AllocationStatus::Available) { + return false; + } + return true; +} + +bool RegionAllocator::can_coalesce(BlockRegion *block_region) { + if (!is_available(block_region)) { + return false; + } + if (is_available(block_region->prev_ptr)) { return true; } - if (block_region->next_ptr && (block_region->next_ptr->status == AllocationStatus::Available)) { + if (is_available(block_region->next_ptr)) { return true; } return false; } BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRegion *block_region) { - if (block_region->prev_ptr && (block_region->prev_ptr->status == AllocationStatus::Available)) { + + if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { +#ifdef DEBUG_INTERNAL + 
StackBasicPrinter<256>(nullptr) << "Freeing region (" + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; +#endif + halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); + MemoryRegion *memory_region = &(block_region->memory); + allocators.region.deallocate(user_context, memory_region); + block->reserved -= block_region->memory.size; + block_region->memory.handle = nullptr; + } + + if (is_available(block_region->prev_ptr)) { BlockRegion *prev_region = block_region->prev_ptr; #ifdef DEBUG_INTERNAL @@ -285,7 +318,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe block_region = prev_region; } - if (block_region->next_ptr && (block_region->next_ptr->status == AllocationStatus::Available)) { + if (is_available(block_region->next_ptr)) { BlockRegion *next_region = block_region->next_ptr; #ifdef DEBUG_INTERNAL @@ -397,9 +430,15 @@ void RegionAllocator::release_block_region(void *user_context, BlockRegion *bloc << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif - if ((block_region->usage_count == 0) && - ((block_region->status == AllocationStatus::InUse) || - (block_region->status == AllocationStatus::Dedicated))) { + if (block_region == nullptr) { + return; + } + + if (block_region->usage_count > 0) { + return; + } + + if (!is_available(block_region)) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Releasing region (" @@ -518,21 +557,19 @@ bool RegionAllocator::collect(void *user_context) { bool result = false; for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { - if (block_region->status == AllocationStatus::Available) { - if (can_coalesce(block_region)) { + if (can_coalesce(block_region)) { #ifdef DEBUG_INTERNAL - count++; - StackBasicPrinter<256>(nullptr) << " collecting region (" - << "block_ptr=" << (void *)block_region->block_ptr << " " - << "block_region=" << (void *)block_region << " " - << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + count++; + StackBasicPrinter<256>(nullptr) << " collecting region (" + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")\n"; #endif - block_region = coalesce_block_regions(user_context, block_region); - result = true; - } + block_region = coalesce_block_regions(user_context, block_region); + result = true; } } diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index a7bb9752c7a4..07ae4cfe1287 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -119,7 +119,7 @@ int main(int argc, char **argv) { halide_abort_if_false(user_context, allocated_system_memory == 0); } - // stress test + // allocation stress test { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; @@ -158,6 +158,57 @@ int main(int argc, char **argv) { halide_abort_if_false(user_context, allocated_system_memory == 0); } + // reuse stress test + { + BlockAllocator::Config config = {0}; + config.minimum_block_size = 1024; + + 
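+        // Reserve a batch of regions, release them without destroying, then reserve again so the allocator must reuse (and coalesce) the released regions.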
BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + size_t total_allocation_size = 0; + static size_t test_allocations = 1000; + PointerTable pointers(user_context, test_allocations, system_allocator); + for (size_t n = 0; n < test_allocations; ++n) { + size_t count = n % 32; + count = count > 1 ? count : 1; + request.size = count * sizeof(int); + total_allocation_size += request.size; + MemoryRegion *region = instance->reserve(user_context, request); + pointers.append(user_context, region); + } + + for (size_t n = 0; n < pointers.size(); ++n) { + MemoryRegion *region = static_cast(pointers[n]); + instance->release(user_context, region); // release but don't destroy + } + pointers.clear(user_context); + halide_abort_if_false(user_context, allocated_region_memory >= total_allocation_size); + + // reallocate and reuse + for (size_t n = 0; n < test_allocations; ++n) { + size_t count = n % 32; + count = count > 1 ? count : 1; + request.size = count * sizeof(int); + MemoryRegion *region = instance->reserve(user_context, request); + pointers.append(user_context, region); + } + + pointers.destroy(user_context); + instance->destroy(user_context); + halide_abort_if_false(user_context, allocated_block_memory == 0); + + BlockAllocator::destroy(user_context, instance); + halide_abort_if_false(user_context, allocated_system_memory == 0); + } + print(user_context) << "Success!\n"; return 0; } From a65e04fdf6a39ae8f84c8e461b6fcae933bd86d5 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 17 Nov 2022 14:43:55 -0800 Subject: [PATCH 113/166] Clang tidy/format pass --- test/runtime/block_allocator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index 07ae4cfe1287..c7659705cce6 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -187,7 +187,7 @@ int main(int argc, char **argv) { for (size_t n = 0; n < pointers.size(); ++n) { MemoryRegion *region = static_cast(pointers[n]); - instance->release(user_context, region); // release but don't destroy + instance->release(user_context, region); // release but don't destroy } pointers.clear(user_context); halide_abort_if_false(user_context, allocated_region_memory >= total_allocation_size); From c1c83e53188d8658d2bf6a326f5cf914a6e6481b Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 17 Nov 2022 16:10:08 -0800 Subject: [PATCH 114/166] Fix reserved accounting for regions --- src/runtime/internal/region_allocator.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index a811dd152001..8dae7b8b9c16 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -188,6 +188,7 @@ void RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { if (block_region->usage_count > 0) { block_region->usage_count--; } + release_block_region(user_context, block_region); free_block_region(user_context, block_region); if (can_coalesce(block_region)) { 
block_region = coalesce_block_regions(user_context, block_region); @@ -212,7 +213,7 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory BlockRegion *result = nullptr; for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { - if (block_region->status != AllocationStatus::Available) { + if (!is_available(block_region)) { continue; } @@ -296,7 +297,6 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); allocators.region.deallocate(user_context, memory_region); - block->reserved -= block_region->memory.size; block_region->memory.handle = nullptr; } @@ -339,10 +339,26 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe } bool RegionAllocator::can_split(BlockRegion *block_region, size_t size) { - return (block_region && (block_region->memory.size > size) && (block_region->memory.handle == nullptr)); + return (block_region && (block_region->memory.size > size) && (block_region->usage_count == 0)); } BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { + + if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Split deallocate region (" + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block_region->block_ptr->reserved << " " + << ")\n"; +#endif + halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); + MemoryRegion *memory_region = &(block_region->memory); + allocators.region.deallocate(user_context, memory_region); + block_region->memory.handle = nullptr; + } + alignment = conform_alignment(alignment, block->memory.properties.alignment); #ifdef DEBUG_INTERNAL @@ -438,7 +454,7 @@ void RegionAllocator::release_block_region(void *user_context, BlockRegion *bloc return; } - if (!is_available(block_region)) { + if (block_region->status != AllocationStatus::Available) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "Releasing region (" @@ -523,7 +539,6 @@ void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_r halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); allocators.region.deallocate(user_context, memory_region); - block->reserved -= block_region->memory.size; block_region->memory.size = 0; block_region->memory.offset = 0; block_region->memory.handle = nullptr; From 46a7dca20a5c46eb888faf3b8f57e5b342672ba3 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 23 Nov 2022 10:09:28 -0800 Subject: [PATCH 115/166] Add more details to Windows specific Vulkan build config --- README_vulkan.md | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index d41647f37695..fa10ba3b1321 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -18,8 +18,16 @@ You'll need to configure Halide and enable the cmake option TARGET_VULKAN. For example, on Linux & OSX: ``` -% cmake -G Ninja -DTARGET_VULKAN=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_DIR=$LLVM_ROOT/lib/cmake/llvm -S . 
-B build -% cmake --build build +% cmake -G Ninja -DTARGET_VULKAN=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_DIR=$LLVM_ROOT/lib/cmake/llvm +% cmake --build build --config Release +``` + +On Windows, you may need to specify the location of the Vulkan SDK if the paths aren't resolved by CMake automatically. For example (assuming the Vulkan SDK is installed in the default path): + +``` +C:\> cmake -G Ninja -DTARGET_VULKAN=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_DIR=$LLVM_ROOT/lib/cmake/llvm -DVulkan_LIBRARY=C:\VulkanSDK\1.3.231.1\Lib\vulkan-1.lib -DVulkan_INCLUDE_DIR=C:\VulkanSDK\1.3.231.1\Include\vulkan -S . -B build +C:\> cmake --build build --config Release + ``` # Vulkan Runtime Environment: @@ -38,6 +46,11 @@ for your platform. Vulkan support should be included. ## Windows +To build Halide AOT generators, you'll need the Vulkan SDK (specifically the Vulkan loader library and headers): +https://sdk.lunarg.com/sdk/download/latest/windows/vulkan-sdk.exe + +For Vulkan device drivers, consult the appropriate hardware vendor for your device. A few common ones are listed below. + AMD: https://www.amd.com/en/technologies/vulkan @@ -50,8 +63,9 @@ https://www.intel.com/content/www/us/en/download-center/home.html ## Linux -On Ubuntu Linux, proprietary drivers can be installed via 'apt' using -PPA's for each vendor. +On Ubuntu Linux v22.04, the vulkan runtime is distributed in the `vulkan-tools` package. For earlier versions of Ubuntu (eg v20.x or v18.x) the contents of the `vulkan-tools` package was distributed as `vulkan-utils` so use this package instead. + +Proprietary drivers can be installed via 'apt' using PPA's for each vendor. Examples for AMD and NVIDIA are provided below. For AMD on Ubuntu v22.04: ``` @@ -70,8 +84,6 @@ $ sudo apt upgrade $ sudo apt install nvidia-driver-### nvidia-settings vulkan vulkan-tools ``` -For earlier versions of Ubuntu (eg v20.x or v18.x) the contents of the `vulkan-tools` package was distributed as `vulkan-utils` so use this package instead. - Note that only valid drivers for your system should be installed since there's been reports of the Vulkan loader segfaulting just by having a non-supported driver present. Specifically, the seemingly generic `mesa-vulkan-drivers` actually includes the AMD From 52982ab78ae225eea34c03933834d4df7bb25b64 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 2 Dec 2022 16:56:50 -0800 Subject: [PATCH 116/166] Update SPIR-V headers to v1.6 --- dependencies/spirv/include/spirv/1.0/spirv.h | 993 ------- .../include/spirv/{1.0 => 1.6}/GLSL.std.450.h | 0 dependencies/spirv/include/spirv/1.6/spirv.h | 2576 +++++++++++++++++ src/SpirvIR.cpp | 163 +- src/SpirvIR.h | 33 +- 5 files changed, 2743 insertions(+), 1022 deletions(-) delete mode 100644 dependencies/spirv/include/spirv/1.0/spirv.h rename dependencies/spirv/include/spirv/{1.0 => 1.6}/GLSL.std.450.h (100%) create mode 100644 dependencies/spirv/include/spirv/1.6/spirv.h diff --git a/dependencies/spirv/include/spirv/1.0/spirv.h b/dependencies/spirv/include/spirv/1.0/spirv.h deleted file mode 100644 index bd5a9b9593aa..000000000000 --- a/dependencies/spirv/include/spirv/1.0/spirv.h +++ /dev/null @@ -1,993 +0,0 @@ -/* -** Copyright (c) 2014-2018 The Khronos Group Inc. 
-** -** Permission is hereby granted, free of charge, to any person obtaining a copy -** of this software and/or associated documentation files (the "Materials"), -** to deal in the Materials without restriction, including without limitation -** the rights to use, copy, modify, merge, publish, distribute, sublicense, -** and/or sell copies of the Materials, and to permit persons to whom the -** Materials are furnished to do so, subject to the following conditions: -** -** The above copyright notice and this permission notice shall be included in -** all copies or substantial portions of the Materials. -** -** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS -** STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND -** HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ -** -** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -** THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -** FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS -** IN THE MATERIALS. -*/ - -/* -** This header is automatically generated by the same tool that creates -** the Binary Section of the SPIR-V specification. -*/ - -/* -** Enumeration tokens for SPIR-V, in various styles: -** C, C++, C++11, JSON, Lua, Python -** -** - C will have tokens with a "Spv" prefix, e.g.: SpvSourceLanguageGLSL -** - C++ will have tokens in the "spv" name space, e.g.: spv::SourceLanguageGLSL -** - C++11 will use enum classes in the spv namespace, e.g.: spv::SourceLanguage::GLSL -** - Lua will use tables, e.g.: spv.SourceLanguage.GLSL -** - Python will use dictionaries, e.g.: spv['SourceLanguage']['GLSL'] -** -** Some tokens act like mask values, which can be OR'd together, -** while others are mutually exclusive. The mask-like ones have -** "Mask" in their name, and a parallel enum that has the shift -** amount (1 << x) for each corresponding enumerant. 
-*/ - -#ifndef spirv_H -#define spirv_H - -typedef unsigned int SpvId; - -#define SPV_VERSION 0x10000 -#define SPV_REVISION 12 - -static const unsigned int SpvMagicNumber = 0x07230203; -static const unsigned int SpvVersion = 0x00010000; -static const unsigned int SpvRevision = 12; -static const unsigned int SpvOpCodeMask = 0xffff; -static const unsigned int SpvWordCountShift = 16; - -typedef enum SpvSourceLanguage_ { - SpvSourceLanguageUnknown = 0, - SpvSourceLanguageESSL = 1, - SpvSourceLanguageGLSL = 2, - SpvSourceLanguageOpenCL_C = 3, - SpvSourceLanguageOpenCL_CPP = 4, - SpvSourceLanguageHLSL = 5, - SpvSourceLanguageMax = 0x7fffffff, -} SpvSourceLanguage; - -typedef enum SpvExecutionModel_ { - SpvExecutionModelVertex = 0, - SpvExecutionModelTessellationControl = 1, - SpvExecutionModelTessellationEvaluation = 2, - SpvExecutionModelGeometry = 3, - SpvExecutionModelFragment = 4, - SpvExecutionModelGLCompute = 5, - SpvExecutionModelKernel = 6, - SpvExecutionModelMax = 0x7fffffff, -} SpvExecutionModel; - -typedef enum SpvAddressingModel_ { - SpvAddressingModelLogical = 0, - SpvAddressingModelPhysical32 = 1, - SpvAddressingModelPhysical64 = 2, - SpvAddressingModelMax = 0x7fffffff, -} SpvAddressingModel; - -typedef enum SpvMemoryModel_ { - SpvMemoryModelSimple = 0, - SpvMemoryModelGLSL450 = 1, - SpvMemoryModelOpenCL = 2, - SpvMemoryModelMax = 0x7fffffff, -} SpvMemoryModel; - -typedef enum SpvExecutionMode_ { - SpvExecutionModeInvocations = 0, - SpvExecutionModeSpacingEqual = 1, - SpvExecutionModeSpacingFractionalEven = 2, - SpvExecutionModeSpacingFractionalOdd = 3, - SpvExecutionModeVertexOrderCw = 4, - SpvExecutionModeVertexOrderCcw = 5, - SpvExecutionModePixelCenterInteger = 6, - SpvExecutionModeOriginUpperLeft = 7, - SpvExecutionModeOriginLowerLeft = 8, - SpvExecutionModeEarlyFragmentTests = 9, - SpvExecutionModePointMode = 10, - SpvExecutionModeXfb = 11, - SpvExecutionModeDepthReplacing = 12, - SpvExecutionModeDepthGreater = 14, - SpvExecutionModeDepthLess = 15, - SpvExecutionModeDepthUnchanged = 16, - SpvExecutionModeLocalSize = 17, - SpvExecutionModeLocalSizeHint = 18, - SpvExecutionModeInputPoints = 19, - SpvExecutionModeInputLines = 20, - SpvExecutionModeInputLinesAdjacency = 21, - SpvExecutionModeTriangles = 22, - SpvExecutionModeInputTrianglesAdjacency = 23, - SpvExecutionModeQuads = 24, - SpvExecutionModeIsolines = 25, - SpvExecutionModeOutputVertices = 26, - SpvExecutionModeOutputPoints = 27, - SpvExecutionModeOutputLineStrip = 28, - SpvExecutionModeOutputTriangleStrip = 29, - SpvExecutionModeVecTypeHint = 30, - SpvExecutionModeContractionOff = 31, - SpvExecutionModePostDepthCoverage = 4446, - SpvExecutionModeStencilRefReplacingEXT = 5027, - SpvExecutionModeMax = 0x7fffffff, -} SpvExecutionMode; - -typedef enum SpvStorageClass_ { - SpvStorageClassUniformConstant = 0, - SpvStorageClassInput = 1, - SpvStorageClassUniform = 2, - SpvStorageClassOutput = 3, - SpvStorageClassWorkgroup = 4, - SpvStorageClassCrossWorkgroup = 5, - SpvStorageClassPrivate = 6, - SpvStorageClassFunction = 7, - SpvStorageClassGeneric = 8, - SpvStorageClassPushConstant = 9, - SpvStorageClassAtomicCounter = 10, - SpvStorageClassImage = 11, - SpvStorageClassStorageBuffer = 12, - SpvStorageClassMax = 0x7fffffff, -} SpvStorageClass; - -typedef enum SpvDim_ { - SpvDim1D = 0, - SpvDim2D = 1, - SpvDim3D = 2, - SpvDimCube = 3, - SpvDimRect = 4, - SpvDimBuffer = 5, - SpvDimSubpassData = 6, - SpvDimMax = 0x7fffffff, -} SpvDim; - -typedef enum SpvSamplerAddressingMode_ { - SpvSamplerAddressingModeNone = 0, - 
SpvSamplerAddressingModeClampToEdge = 1, - SpvSamplerAddressingModeClamp = 2, - SpvSamplerAddressingModeRepeat = 3, - SpvSamplerAddressingModeRepeatMirrored = 4, - SpvSamplerAddressingModeMax = 0x7fffffff, -} SpvSamplerAddressingMode; - -typedef enum SpvSamplerFilterMode_ { - SpvSamplerFilterModeNearest = 0, - SpvSamplerFilterModeLinear = 1, - SpvSamplerFilterModeMax = 0x7fffffff, -} SpvSamplerFilterMode; - -typedef enum SpvImageFormat_ { - SpvImageFormatUnknown = 0, - SpvImageFormatRgba32f = 1, - SpvImageFormatRgba16f = 2, - SpvImageFormatR32f = 3, - SpvImageFormatRgba8 = 4, - SpvImageFormatRgba8Snorm = 5, - SpvImageFormatRg32f = 6, - SpvImageFormatRg16f = 7, - SpvImageFormatR11fG11fB10f = 8, - SpvImageFormatR16f = 9, - SpvImageFormatRgba16 = 10, - SpvImageFormatRgb10A2 = 11, - SpvImageFormatRg16 = 12, - SpvImageFormatRg8 = 13, - SpvImageFormatR16 = 14, - SpvImageFormatR8 = 15, - SpvImageFormatRgba16Snorm = 16, - SpvImageFormatRg16Snorm = 17, - SpvImageFormatRg8Snorm = 18, - SpvImageFormatR16Snorm = 19, - SpvImageFormatR8Snorm = 20, - SpvImageFormatRgba32i = 21, - SpvImageFormatRgba16i = 22, - SpvImageFormatRgba8i = 23, - SpvImageFormatR32i = 24, - SpvImageFormatRg32i = 25, - SpvImageFormatRg16i = 26, - SpvImageFormatRg8i = 27, - SpvImageFormatR16i = 28, - SpvImageFormatR8i = 29, - SpvImageFormatRgba32ui = 30, - SpvImageFormatRgba16ui = 31, - SpvImageFormatRgba8ui = 32, - SpvImageFormatR32ui = 33, - SpvImageFormatRgb10a2ui = 34, - SpvImageFormatRg32ui = 35, - SpvImageFormatRg16ui = 36, - SpvImageFormatRg8ui = 37, - SpvImageFormatR16ui = 38, - SpvImageFormatR8ui = 39, - SpvImageFormatMax = 0x7fffffff, -} SpvImageFormat; - -typedef enum SpvImageChannelOrder_ { - SpvImageChannelOrderR = 0, - SpvImageChannelOrderA = 1, - SpvImageChannelOrderRG = 2, - SpvImageChannelOrderRA = 3, - SpvImageChannelOrderRGB = 4, - SpvImageChannelOrderRGBA = 5, - SpvImageChannelOrderBGRA = 6, - SpvImageChannelOrderARGB = 7, - SpvImageChannelOrderIntensity = 8, - SpvImageChannelOrderLuminance = 9, - SpvImageChannelOrderRx = 10, - SpvImageChannelOrderRGx = 11, - SpvImageChannelOrderRGBx = 12, - SpvImageChannelOrderDepth = 13, - SpvImageChannelOrderDepthStencil = 14, - SpvImageChannelOrdersRGB = 15, - SpvImageChannelOrdersRGBx = 16, - SpvImageChannelOrdersRGBA = 17, - SpvImageChannelOrdersBGRA = 18, - SpvImageChannelOrderABGR = 19, - SpvImageChannelOrderMax = 0x7fffffff, -} SpvImageChannelOrder; - -typedef enum SpvImageChannelDataType_ { - SpvImageChannelDataTypeSnormInt8 = 0, - SpvImageChannelDataTypeSnormInt16 = 1, - SpvImageChannelDataTypeUnormInt8 = 2, - SpvImageChannelDataTypeUnormInt16 = 3, - SpvImageChannelDataTypeUnormShort565 = 4, - SpvImageChannelDataTypeUnormShort555 = 5, - SpvImageChannelDataTypeUnormInt101010 = 6, - SpvImageChannelDataTypeSignedInt8 = 7, - SpvImageChannelDataTypeSignedInt16 = 8, - SpvImageChannelDataTypeSignedInt32 = 9, - SpvImageChannelDataTypeUnsignedInt8 = 10, - SpvImageChannelDataTypeUnsignedInt16 = 11, - SpvImageChannelDataTypeUnsignedInt32 = 12, - SpvImageChannelDataTypeHalfFloat = 13, - SpvImageChannelDataTypeFloat = 14, - SpvImageChannelDataTypeUnormInt24 = 15, - SpvImageChannelDataTypeUnormInt101010_2 = 16, - SpvImageChannelDataTypeMax = 0x7fffffff, -} SpvImageChannelDataType; - -typedef enum SpvImageOperandsShift_ { - SpvImageOperandsBiasShift = 0, - SpvImageOperandsLodShift = 1, - SpvImageOperandsGradShift = 2, - SpvImageOperandsConstOffsetShift = 3, - SpvImageOperandsOffsetShift = 4, - SpvImageOperandsConstOffsetsShift = 5, - SpvImageOperandsSampleShift = 6, - 
SpvImageOperandsMinLodShift = 7, - SpvImageOperandsMax = 0x7fffffff, -} SpvImageOperandsShift; - -typedef enum SpvImageOperandsMask_ { - SpvImageOperandsMaskNone = 0, - SpvImageOperandsBiasMask = 0x00000001, - SpvImageOperandsLodMask = 0x00000002, - SpvImageOperandsGradMask = 0x00000004, - SpvImageOperandsConstOffsetMask = 0x00000008, - SpvImageOperandsOffsetMask = 0x00000010, - SpvImageOperandsConstOffsetsMask = 0x00000020, - SpvImageOperandsSampleMask = 0x00000040, - SpvImageOperandsMinLodMask = 0x00000080, -} SpvImageOperandsMask; - -typedef enum SpvFPFastMathModeShift_ { - SpvFPFastMathModeNotNaNShift = 0, - SpvFPFastMathModeNotInfShift = 1, - SpvFPFastMathModeNSZShift = 2, - SpvFPFastMathModeAllowRecipShift = 3, - SpvFPFastMathModeFastShift = 4, - SpvFPFastMathModeMax = 0x7fffffff, -} SpvFPFastMathModeShift; - -typedef enum SpvFPFastMathModeMask_ { - SpvFPFastMathModeMaskNone = 0, - SpvFPFastMathModeNotNaNMask = 0x00000001, - SpvFPFastMathModeNotInfMask = 0x00000002, - SpvFPFastMathModeNSZMask = 0x00000004, - SpvFPFastMathModeAllowRecipMask = 0x00000008, - SpvFPFastMathModeFastMask = 0x00000010, -} SpvFPFastMathModeMask; - -typedef enum SpvFPRoundingMode_ { - SpvFPRoundingModeRTE = 0, - SpvFPRoundingModeRTZ = 1, - SpvFPRoundingModeRTP = 2, - SpvFPRoundingModeRTN = 3, - SpvFPRoundingModeMax = 0x7fffffff, -} SpvFPRoundingMode; - -typedef enum SpvLinkageType_ { - SpvLinkageTypeExport = 0, - SpvLinkageTypeImport = 1, - SpvLinkageTypeMax = 0x7fffffff, -} SpvLinkageType; - -typedef enum SpvAccessQualifier_ { - SpvAccessQualifierReadOnly = 0, - SpvAccessQualifierWriteOnly = 1, - SpvAccessQualifierReadWrite = 2, - SpvAccessQualifierMax = 0x7fffffff, -} SpvAccessQualifier; - -typedef enum SpvFunctionParameterAttribute_ { - SpvFunctionParameterAttributeZext = 0, - SpvFunctionParameterAttributeSext = 1, - SpvFunctionParameterAttributeByVal = 2, - SpvFunctionParameterAttributeSret = 3, - SpvFunctionParameterAttributeNoAlias = 4, - SpvFunctionParameterAttributeNoCapture = 5, - SpvFunctionParameterAttributeNoWrite = 6, - SpvFunctionParameterAttributeNoReadWrite = 7, - SpvFunctionParameterAttributeMax = 0x7fffffff, -} SpvFunctionParameterAttribute; - -typedef enum SpvDecoration_ { - SpvDecorationRelaxedPrecision = 0, - SpvDecorationSpecId = 1, - SpvDecorationBlock = 2, - SpvDecorationBufferBlock = 3, - SpvDecorationRowMajor = 4, - SpvDecorationColMajor = 5, - SpvDecorationArrayStride = 6, - SpvDecorationMatrixStride = 7, - SpvDecorationGLSLShared = 8, - SpvDecorationGLSLPacked = 9, - SpvDecorationCPacked = 10, - SpvDecorationBuiltIn = 11, - SpvDecorationNoPerspective = 13, - SpvDecorationFlat = 14, - SpvDecorationPatch = 15, - SpvDecorationCentroid = 16, - SpvDecorationSample = 17, - SpvDecorationInvariant = 18, - SpvDecorationRestrict = 19, - SpvDecorationAliased = 20, - SpvDecorationVolatile = 21, - SpvDecorationConstant = 22, - SpvDecorationCoherent = 23, - SpvDecorationNonWritable = 24, - SpvDecorationNonReadable = 25, - SpvDecorationUniform = 26, - SpvDecorationSaturatedConversion = 28, - SpvDecorationStream = 29, - SpvDecorationLocation = 30, - SpvDecorationComponent = 31, - SpvDecorationIndex = 32, - SpvDecorationBinding = 33, - SpvDecorationDescriptorSet = 34, - SpvDecorationOffset = 35, - SpvDecorationXfbBuffer = 36, - SpvDecorationXfbStride = 37, - SpvDecorationFuncParamAttr = 38, - SpvDecorationFPRoundingMode = 39, - SpvDecorationFPFastMathMode = 40, - SpvDecorationLinkageAttributes = 41, - SpvDecorationNoContraction = 42, - SpvDecorationInputAttachmentIndex = 43, - SpvDecorationAlignment 
= 44, - SpvDecorationExplicitInterpAMD = 4999, - SpvDecorationOverrideCoverageNV = 5248, - SpvDecorationPassthroughNV = 5250, - SpvDecorationViewportRelativeNV = 5252, - SpvDecorationSecondaryViewportRelativeNV = 5256, - SpvDecorationHlslCounterBufferGOOGLE = 5634, - SpvDecorationHlslSemanticGOOGLE = 5635, - SpvDecorationMax = 0x7fffffff, -} SpvDecoration; - -typedef enum SpvBuiltIn_ { - SpvBuiltInPosition = 0, - SpvBuiltInPointSize = 1, - SpvBuiltInClipDistance = 3, - SpvBuiltInCullDistance = 4, - SpvBuiltInVertexId = 5, - SpvBuiltInInstanceId = 6, - SpvBuiltInPrimitiveId = 7, - SpvBuiltInInvocationId = 8, - SpvBuiltInLayer = 9, - SpvBuiltInViewportIndex = 10, - SpvBuiltInTessLevelOuter = 11, - SpvBuiltInTessLevelInner = 12, - SpvBuiltInTessCoord = 13, - SpvBuiltInPatchVertices = 14, - SpvBuiltInFragCoord = 15, - SpvBuiltInPointCoord = 16, - SpvBuiltInFrontFacing = 17, - SpvBuiltInSampleId = 18, - SpvBuiltInSamplePosition = 19, - SpvBuiltInSampleMask = 20, - SpvBuiltInFragDepth = 22, - SpvBuiltInHelperInvocation = 23, - SpvBuiltInNumWorkgroups = 24, - SpvBuiltInWorkgroupSize = 25, - SpvBuiltInWorkgroupId = 26, - SpvBuiltInLocalInvocationId = 27, - SpvBuiltInGlobalInvocationId = 28, - SpvBuiltInLocalInvocationIndex = 29, - SpvBuiltInWorkDim = 30, - SpvBuiltInGlobalSize = 31, - SpvBuiltInEnqueuedWorkgroupSize = 32, - SpvBuiltInGlobalOffset = 33, - SpvBuiltInGlobalLinearId = 34, - SpvBuiltInSubgroupSize = 36, - SpvBuiltInSubgroupMaxSize = 37, - SpvBuiltInNumSubgroups = 38, - SpvBuiltInNumEnqueuedSubgroups = 39, - SpvBuiltInSubgroupId = 40, - SpvBuiltInSubgroupLocalInvocationId = 41, - SpvBuiltInVertexIndex = 42, - SpvBuiltInInstanceIndex = 43, - SpvBuiltInSubgroupEqMaskKHR = 4416, - SpvBuiltInSubgroupGeMaskKHR = 4417, - SpvBuiltInSubgroupGtMaskKHR = 4418, - SpvBuiltInSubgroupLeMaskKHR = 4419, - SpvBuiltInSubgroupLtMaskKHR = 4420, - SpvBuiltInBaseVertex = 4424, - SpvBuiltInBaseInstance = 4425, - SpvBuiltInDrawIndex = 4426, - SpvBuiltInDeviceIndex = 4438, - SpvBuiltInViewIndex = 4440, - SpvBuiltInBaryCoordNoPerspAMD = 4992, - SpvBuiltInBaryCoordNoPerspCentroidAMD = 4993, - SpvBuiltInBaryCoordNoPerspSampleAMD = 4994, - SpvBuiltInBaryCoordSmoothAMD = 4995, - SpvBuiltInBaryCoordSmoothCentroidAMD = 4996, - SpvBuiltInBaryCoordSmoothSampleAMD = 4997, - SpvBuiltInBaryCoordPullModelAMD = 4998, - SpvBuiltInFragStencilRefEXT = 5014, - SpvBuiltInViewportMaskNV = 5253, - SpvBuiltInSecondaryPositionNV = 5257, - SpvBuiltInSecondaryViewportMaskNV = 5258, - SpvBuiltInPositionPerViewNV = 5261, - SpvBuiltInViewportMaskPerViewNV = 5262, - SpvBuiltInMax = 0x7fffffff, -} SpvBuiltIn; - -typedef enum SpvSelectionControlShift_ { - SpvSelectionControlFlattenShift = 0, - SpvSelectionControlDontFlattenShift = 1, - SpvSelectionControlMax = 0x7fffffff, -} SpvSelectionControlShift; - -typedef enum SpvSelectionControlMask_ { - SpvSelectionControlMaskNone = 0, - SpvSelectionControlFlattenMask = 0x00000001, - SpvSelectionControlDontFlattenMask = 0x00000002, -} SpvSelectionControlMask; - -typedef enum SpvLoopControlShift_ { - SpvLoopControlUnrollShift = 0, - SpvLoopControlDontUnrollShift = 1, - SpvLoopControlMax = 0x7fffffff, -} SpvLoopControlShift; - -typedef enum SpvLoopControlMask_ { - SpvLoopControlMaskNone = 0, - SpvLoopControlUnrollMask = 0x00000001, - SpvLoopControlDontUnrollMask = 0x00000002, -} SpvLoopControlMask; - -typedef enum SpvFunctionControlShift_ { - SpvFunctionControlInlineShift = 0, - SpvFunctionControlDontInlineShift = 1, - SpvFunctionControlPureShift = 2, - SpvFunctionControlConstShift = 3, - 
SpvFunctionControlMax = 0x7fffffff, -} SpvFunctionControlShift; - -typedef enum SpvFunctionControlMask_ { - SpvFunctionControlMaskNone = 0, - SpvFunctionControlInlineMask = 0x00000001, - SpvFunctionControlDontInlineMask = 0x00000002, - SpvFunctionControlPureMask = 0x00000004, - SpvFunctionControlConstMask = 0x00000008, -} SpvFunctionControlMask; - -typedef enum SpvMemorySemanticsShift_ { - SpvMemorySemanticsAcquireShift = 1, - SpvMemorySemanticsReleaseShift = 2, - SpvMemorySemanticsAcquireReleaseShift = 3, - SpvMemorySemanticsSequentiallyConsistentShift = 4, - SpvMemorySemanticsUniformMemoryShift = 6, - SpvMemorySemanticsSubgroupMemoryShift = 7, - SpvMemorySemanticsWorkgroupMemoryShift = 8, - SpvMemorySemanticsCrossWorkgroupMemoryShift = 9, - SpvMemorySemanticsAtomicCounterMemoryShift = 10, - SpvMemorySemanticsImageMemoryShift = 11, - SpvMemorySemanticsMax = 0x7fffffff, -} SpvMemorySemanticsShift; - -typedef enum SpvMemorySemanticsMask_ { - SpvMemorySemanticsMaskNone = 0, - SpvMemorySemanticsAcquireMask = 0x00000002, - SpvMemorySemanticsReleaseMask = 0x00000004, - SpvMemorySemanticsAcquireReleaseMask = 0x00000008, - SpvMemorySemanticsSequentiallyConsistentMask = 0x00000010, - SpvMemorySemanticsUniformMemoryMask = 0x00000040, - SpvMemorySemanticsSubgroupMemoryMask = 0x00000080, - SpvMemorySemanticsWorkgroupMemoryMask = 0x00000100, - SpvMemorySemanticsCrossWorkgroupMemoryMask = 0x00000200, - SpvMemorySemanticsAtomicCounterMemoryMask = 0x00000400, - SpvMemorySemanticsImageMemoryMask = 0x00000800, -} SpvMemorySemanticsMask; - -typedef enum SpvMemoryAccessShift_ { - SpvMemoryAccessVolatileShift = 0, - SpvMemoryAccessAlignedShift = 1, - SpvMemoryAccessNontemporalShift = 2, - SpvMemoryAccessMax = 0x7fffffff, -} SpvMemoryAccessShift; - -typedef enum SpvMemoryAccessMask_ { - SpvMemoryAccessMaskNone = 0, - SpvMemoryAccessVolatileMask = 0x00000001, - SpvMemoryAccessAlignedMask = 0x00000002, - SpvMemoryAccessNontemporalMask = 0x00000004, -} SpvMemoryAccessMask; - -typedef enum SpvScope_ { - SpvScopeCrossDevice = 0, - SpvScopeDevice = 1, - SpvScopeWorkgroup = 2, - SpvScopeSubgroup = 3, - SpvScopeInvocation = 4, - SpvScopeMax = 0x7fffffff, -} SpvScope; - -typedef enum SpvGroupOperation_ { - SpvGroupOperationReduce = 0, - SpvGroupOperationInclusiveScan = 1, - SpvGroupOperationExclusiveScan = 2, - SpvGroupOperationMax = 0x7fffffff, -} SpvGroupOperation; - -typedef enum SpvKernelEnqueueFlags_ { - SpvKernelEnqueueFlagsNoWait = 0, - SpvKernelEnqueueFlagsWaitKernel = 1, - SpvKernelEnqueueFlagsWaitWorkGroup = 2, - SpvKernelEnqueueFlagsMax = 0x7fffffff, -} SpvKernelEnqueueFlags; - -typedef enum SpvKernelProfilingInfoShift_ { - SpvKernelProfilingInfoCmdExecTimeShift = 0, - SpvKernelProfilingInfoMax = 0x7fffffff, -} SpvKernelProfilingInfoShift; - -typedef enum SpvKernelProfilingInfoMask_ { - SpvKernelProfilingInfoMaskNone = 0, - SpvKernelProfilingInfoCmdExecTimeMask = 0x00000001, -} SpvKernelProfilingInfoMask; - -typedef enum SpvCapability_ { - SpvCapabilityMatrix = 0, - SpvCapabilityShader = 1, - SpvCapabilityGeometry = 2, - SpvCapabilityTessellation = 3, - SpvCapabilityAddresses = 4, - SpvCapabilityLinkage = 5, - SpvCapabilityKernel = 6, - SpvCapabilityVector16 = 7, - SpvCapabilityFloat16Buffer = 8, - SpvCapabilityFloat16 = 9, - SpvCapabilityFloat64 = 10, - SpvCapabilityInt64 = 11, - SpvCapabilityInt64Atomics = 12, - SpvCapabilityImageBasic = 13, - SpvCapabilityImageReadWrite = 14, - SpvCapabilityImageMipmap = 15, - SpvCapabilityPipes = 17, - SpvCapabilityGroups = 18, - SpvCapabilityDeviceEnqueue = 19, - 
SpvCapabilityLiteralSampler = 20, - SpvCapabilityAtomicStorage = 21, - SpvCapabilityInt16 = 22, - SpvCapabilityTessellationPointSize = 23, - SpvCapabilityGeometryPointSize = 24, - SpvCapabilityImageGatherExtended = 25, - SpvCapabilityStorageImageMultisample = 27, - SpvCapabilityUniformBufferArrayDynamicIndexing = 28, - SpvCapabilitySampledImageArrayDynamicIndexing = 29, - SpvCapabilityStorageBufferArrayDynamicIndexing = 30, - SpvCapabilityStorageImageArrayDynamicIndexing = 31, - SpvCapabilityClipDistance = 32, - SpvCapabilityCullDistance = 33, - SpvCapabilityImageCubeArray = 34, - SpvCapabilitySampleRateShading = 35, - SpvCapabilityImageRect = 36, - SpvCapabilitySampledRect = 37, - SpvCapabilityGenericPointer = 38, - SpvCapabilityInt8 = 39, - SpvCapabilityInputAttachment = 40, - SpvCapabilitySparseResidency = 41, - SpvCapabilityMinLod = 42, - SpvCapabilitySampled1D = 43, - SpvCapabilityImage1D = 44, - SpvCapabilitySampledCubeArray = 45, - SpvCapabilitySampledBuffer = 46, - SpvCapabilityImageBuffer = 47, - SpvCapabilityImageMSArray = 48, - SpvCapabilityStorageImageExtendedFormats = 49, - SpvCapabilityImageQuery = 50, - SpvCapabilityDerivativeControl = 51, - SpvCapabilityInterpolationFunction = 52, - SpvCapabilityTransformFeedback = 53, - SpvCapabilityGeometryStreams = 54, - SpvCapabilityStorageImageReadWithoutFormat = 55, - SpvCapabilityStorageImageWriteWithoutFormat = 56, - SpvCapabilityMultiViewport = 57, - SpvCapabilitySubgroupBallotKHR = 4423, - SpvCapabilityDrawParameters = 4427, - SpvCapabilitySubgroupVoteKHR = 4431, - SpvCapabilityStorageBuffer16BitAccess = 4433, - SpvCapabilityStorageUniformBufferBlock16 = 4433, - SpvCapabilityStorageUniform16 = 4434, - SpvCapabilityUniformAndStorageBuffer16BitAccess = 4434, - SpvCapabilityStoragePushConstant16 = 4435, - SpvCapabilityStorageInputOutput16 = 4436, - SpvCapabilityDeviceGroup = 4437, - SpvCapabilityMultiView = 4439, - SpvCapabilityVariablePointersStorageBuffer = 4441, - SpvCapabilityVariablePointers = 4442, - SpvCapabilityAtomicStorageOps = 4445, - SpvCapabilitySampleMaskPostDepthCoverage = 4447, - SpvCapabilityImageGatherBiasLodAMD = 5009, - SpvCapabilityFragmentMaskAMD = 5010, - SpvCapabilityStencilExportEXT = 5013, - SpvCapabilityImageReadWriteLodAMD = 5015, - SpvCapabilitySampleMaskOverrideCoverageNV = 5249, - SpvCapabilityGeometryShaderPassthroughNV = 5251, - SpvCapabilityShaderViewportIndexLayerEXT = 5254, - SpvCapabilityShaderViewportIndexLayerNV = 5254, - SpvCapabilityShaderViewportMaskNV = 5255, - SpvCapabilityShaderStereoViewNV = 5259, - SpvCapabilityPerViewAttributesNV = 5260, - SpvCapabilitySubgroupShuffleINTEL = 5568, - SpvCapabilitySubgroupBufferBlockIOINTEL = 5569, - SpvCapabilitySubgroupImageBlockIOINTEL = 5570, - SpvCapabilityMax = 0x7fffffff, -} SpvCapability; - -typedef enum SpvOp_ { - SpvOpNop = 0, - SpvOpUndef = 1, - SpvOpSourceContinued = 2, - SpvOpSource = 3, - SpvOpSourceExtension = 4, - SpvOpName = 5, - SpvOpMemberName = 6, - SpvOpString = 7, - SpvOpLine = 8, - SpvOpExtension = 10, - SpvOpExtInstImport = 11, - SpvOpExtInst = 12, - SpvOpMemoryModel = 14, - SpvOpEntryPoint = 15, - SpvOpExecutionMode = 16, - SpvOpCapability = 17, - SpvOpTypeVoid = 19, - SpvOpTypeBool = 20, - SpvOpTypeInt = 21, - SpvOpTypeFloat = 22, - SpvOpTypeVector = 23, - SpvOpTypeMatrix = 24, - SpvOpTypeImage = 25, - SpvOpTypeSampler = 26, - SpvOpTypeSampledImage = 27, - SpvOpTypeArray = 28, - SpvOpTypeRuntimeArray = 29, - SpvOpTypeStruct = 30, - SpvOpTypeOpaque = 31, - SpvOpTypePointer = 32, - SpvOpTypeFunction = 33, - SpvOpTypeEvent = 34, - 
SpvOpTypeDeviceEvent = 35, - SpvOpTypeReserveId = 36, - SpvOpTypeQueue = 37, - SpvOpTypePipe = 38, - SpvOpTypeForwardPointer = 39, - SpvOpConstantTrue = 41, - SpvOpConstantFalse = 42, - SpvOpConstant = 43, - SpvOpConstantComposite = 44, - SpvOpConstantSampler = 45, - SpvOpConstantNull = 46, - SpvOpSpecConstantTrue = 48, - SpvOpSpecConstantFalse = 49, - SpvOpSpecConstant = 50, - SpvOpSpecConstantComposite = 51, - SpvOpSpecConstantOp = 52, - SpvOpFunction = 54, - SpvOpFunctionParameter = 55, - SpvOpFunctionEnd = 56, - SpvOpFunctionCall = 57, - SpvOpVariable = 59, - SpvOpImageTexelPointer = 60, - SpvOpLoad = 61, - SpvOpStore = 62, - SpvOpCopyMemory = 63, - SpvOpCopyMemorySized = 64, - SpvOpAccessChain = 65, - SpvOpInBoundsAccessChain = 66, - SpvOpPtrAccessChain = 67, - SpvOpArrayLength = 68, - SpvOpGenericPtrMemSemantics = 69, - SpvOpInBoundsPtrAccessChain = 70, - SpvOpDecorate = 71, - SpvOpMemberDecorate = 72, - SpvOpDecorationGroup = 73, - SpvOpGroupDecorate = 74, - SpvOpGroupMemberDecorate = 75, - SpvOpVectorExtractDynamic = 77, - SpvOpVectorInsertDynamic = 78, - SpvOpVectorShuffle = 79, - SpvOpCompositeConstruct = 80, - SpvOpCompositeExtract = 81, - SpvOpCompositeInsert = 82, - SpvOpCopyObject = 83, - SpvOpTranspose = 84, - SpvOpSampledImage = 86, - SpvOpImageSampleImplicitLod = 87, - SpvOpImageSampleExplicitLod = 88, - SpvOpImageSampleDrefImplicitLod = 89, - SpvOpImageSampleDrefExplicitLod = 90, - SpvOpImageSampleProjImplicitLod = 91, - SpvOpImageSampleProjExplicitLod = 92, - SpvOpImageSampleProjDrefImplicitLod = 93, - SpvOpImageSampleProjDrefExplicitLod = 94, - SpvOpImageFetch = 95, - SpvOpImageGather = 96, - SpvOpImageDrefGather = 97, - SpvOpImageRead = 98, - SpvOpImageWrite = 99, - SpvOpImage = 100, - SpvOpImageQueryFormat = 101, - SpvOpImageQueryOrder = 102, - SpvOpImageQuerySizeLod = 103, - SpvOpImageQuerySize = 104, - SpvOpImageQueryLod = 105, - SpvOpImageQueryLevels = 106, - SpvOpImageQuerySamples = 107, - SpvOpConvertFToU = 109, - SpvOpConvertFToS = 110, - SpvOpConvertSToF = 111, - SpvOpConvertUToF = 112, - SpvOpUConvert = 113, - SpvOpSConvert = 114, - SpvOpFConvert = 115, - SpvOpQuantizeToF16 = 116, - SpvOpConvertPtrToU = 117, - SpvOpSatConvertSToU = 118, - SpvOpSatConvertUToS = 119, - SpvOpConvertUToPtr = 120, - SpvOpPtrCastToGeneric = 121, - SpvOpGenericCastToPtr = 122, - SpvOpGenericCastToPtrExplicit = 123, - SpvOpBitcast = 124, - SpvOpSNegate = 126, - SpvOpFNegate = 127, - SpvOpIAdd = 128, - SpvOpFAdd = 129, - SpvOpISub = 130, - SpvOpFSub = 131, - SpvOpIMul = 132, - SpvOpFMul = 133, - SpvOpUDiv = 134, - SpvOpSDiv = 135, - SpvOpFDiv = 136, - SpvOpUMod = 137, - SpvOpSRem = 138, - SpvOpSMod = 139, - SpvOpFRem = 140, - SpvOpFMod = 141, - SpvOpVectorTimesScalar = 142, - SpvOpMatrixTimesScalar = 143, - SpvOpVectorTimesMatrix = 144, - SpvOpMatrixTimesVector = 145, - SpvOpMatrixTimesMatrix = 146, - SpvOpOuterProduct = 147, - SpvOpDot = 148, - SpvOpIAddCarry = 149, - SpvOpISubBorrow = 150, - SpvOpUMulExtended = 151, - SpvOpSMulExtended = 152, - SpvOpAny = 154, - SpvOpAll = 155, - SpvOpIsNan = 156, - SpvOpIsInf = 157, - SpvOpIsFinite = 158, - SpvOpIsNormal = 159, - SpvOpSignBitSet = 160, - SpvOpLessOrGreater = 161, - SpvOpOrdered = 162, - SpvOpUnordered = 163, - SpvOpLogicalEqual = 164, - SpvOpLogicalNotEqual = 165, - SpvOpLogicalOr = 166, - SpvOpLogicalAnd = 167, - SpvOpLogicalNot = 168, - SpvOpSelect = 169, - SpvOpIEqual = 170, - SpvOpINotEqual = 171, - SpvOpUGreaterThan = 172, - SpvOpSGreaterThan = 173, - SpvOpUGreaterThanEqual = 174, - SpvOpSGreaterThanEqual = 175, - 
SpvOpULessThan = 176, - SpvOpSLessThan = 177, - SpvOpULessThanEqual = 178, - SpvOpSLessThanEqual = 179, - SpvOpFOrdEqual = 180, - SpvOpFUnordEqual = 181, - SpvOpFOrdNotEqual = 182, - SpvOpFUnordNotEqual = 183, - SpvOpFOrdLessThan = 184, - SpvOpFUnordLessThan = 185, - SpvOpFOrdGreaterThan = 186, - SpvOpFUnordGreaterThan = 187, - SpvOpFOrdLessThanEqual = 188, - SpvOpFUnordLessThanEqual = 189, - SpvOpFOrdGreaterThanEqual = 190, - SpvOpFUnordGreaterThanEqual = 191, - SpvOpShiftRightLogical = 194, - SpvOpShiftRightArithmetic = 195, - SpvOpShiftLeftLogical = 196, - SpvOpBitwiseOr = 197, - SpvOpBitwiseXor = 198, - SpvOpBitwiseAnd = 199, - SpvOpNot = 200, - SpvOpBitFieldInsert = 201, - SpvOpBitFieldSExtract = 202, - SpvOpBitFieldUExtract = 203, - SpvOpBitReverse = 204, - SpvOpBitCount = 205, - SpvOpDPdx = 207, - SpvOpDPdy = 208, - SpvOpFwidth = 209, - SpvOpDPdxFine = 210, - SpvOpDPdyFine = 211, - SpvOpFwidthFine = 212, - SpvOpDPdxCoarse = 213, - SpvOpDPdyCoarse = 214, - SpvOpFwidthCoarse = 215, - SpvOpEmitVertex = 218, - SpvOpEndPrimitive = 219, - SpvOpEmitStreamVertex = 220, - SpvOpEndStreamPrimitive = 221, - SpvOpControlBarrier = 224, - SpvOpMemoryBarrier = 225, - SpvOpAtomicLoad = 227, - SpvOpAtomicStore = 228, - SpvOpAtomicExchange = 229, - SpvOpAtomicCompareExchange = 230, - SpvOpAtomicCompareExchangeWeak = 231, - SpvOpAtomicIIncrement = 232, - SpvOpAtomicIDecrement = 233, - SpvOpAtomicIAdd = 234, - SpvOpAtomicISub = 235, - SpvOpAtomicSMin = 236, - SpvOpAtomicUMin = 237, - SpvOpAtomicSMax = 238, - SpvOpAtomicUMax = 239, - SpvOpAtomicAnd = 240, - SpvOpAtomicOr = 241, - SpvOpAtomicXor = 242, - SpvOpPhi = 245, - SpvOpLoopMerge = 246, - SpvOpSelectionMerge = 247, - SpvOpLabel = 248, - SpvOpBranch = 249, - SpvOpBranchConditional = 250, - SpvOpSwitch = 251, - SpvOpKill = 252, - SpvOpReturn = 253, - SpvOpReturnValue = 254, - SpvOpUnreachable = 255, - SpvOpLifetimeStart = 256, - SpvOpLifetimeStop = 257, - SpvOpGroupAsyncCopy = 259, - SpvOpGroupWaitEvents = 260, - SpvOpGroupAll = 261, - SpvOpGroupAny = 262, - SpvOpGroupBroadcast = 263, - SpvOpGroupIAdd = 264, - SpvOpGroupFAdd = 265, - SpvOpGroupFMin = 266, - SpvOpGroupUMin = 267, - SpvOpGroupSMin = 268, - SpvOpGroupFMax = 269, - SpvOpGroupUMax = 270, - SpvOpGroupSMax = 271, - SpvOpReadPipe = 274, - SpvOpWritePipe = 275, - SpvOpReservedReadPipe = 276, - SpvOpReservedWritePipe = 277, - SpvOpReserveReadPipePackets = 278, - SpvOpReserveWritePipePackets = 279, - SpvOpCommitReadPipe = 280, - SpvOpCommitWritePipe = 281, - SpvOpIsValidReserveId = 282, - SpvOpGetNumPipePackets = 283, - SpvOpGetMaxPipePackets = 284, - SpvOpGroupReserveReadPipePackets = 285, - SpvOpGroupReserveWritePipePackets = 286, - SpvOpGroupCommitReadPipe = 287, - SpvOpGroupCommitWritePipe = 288, - SpvOpEnqueueMarker = 291, - SpvOpEnqueueKernel = 292, - SpvOpGetKernelNDrangeSubGroupCount = 293, - SpvOpGetKernelNDrangeMaxSubGroupSize = 294, - SpvOpGetKernelWorkGroupSize = 295, - SpvOpGetKernelPreferredWorkGroupSizeMultiple = 296, - SpvOpRetainEvent = 297, - SpvOpReleaseEvent = 298, - SpvOpCreateUserEvent = 299, - SpvOpIsValidEvent = 300, - SpvOpSetUserEventStatus = 301, - SpvOpCaptureEventProfilingInfo = 302, - SpvOpGetDefaultQueue = 303, - SpvOpBuildNDRange = 304, - SpvOpImageSparseSampleImplicitLod = 305, - SpvOpImageSparseSampleExplicitLod = 306, - SpvOpImageSparseSampleDrefImplicitLod = 307, - SpvOpImageSparseSampleDrefExplicitLod = 308, - SpvOpImageSparseSampleProjImplicitLod = 309, - SpvOpImageSparseSampleProjExplicitLod = 310, - SpvOpImageSparseSampleProjDrefImplicitLod = 311, - 
SpvOpImageSparseSampleProjDrefExplicitLod = 312, - SpvOpImageSparseFetch = 313, - SpvOpImageSparseGather = 314, - SpvOpImageSparseDrefGather = 315, - SpvOpImageSparseTexelsResident = 316, - SpvOpNoLine = 317, - SpvOpAtomicFlagTestAndSet = 318, - SpvOpAtomicFlagClear = 319, - SpvOpImageSparseRead = 320, - SpvOpDecorateId = 332, - SpvOpSubgroupBallotKHR = 4421, - SpvOpSubgroupFirstInvocationKHR = 4422, - SpvOpSubgroupAllKHR = 4428, - SpvOpSubgroupAnyKHR = 4429, - SpvOpSubgroupAllEqualKHR = 4430, - SpvOpSubgroupReadInvocationKHR = 4432, - SpvOpGroupIAddNonUniformAMD = 5000, - SpvOpGroupFAddNonUniformAMD = 5001, - SpvOpGroupFMinNonUniformAMD = 5002, - SpvOpGroupUMinNonUniformAMD = 5003, - SpvOpGroupSMinNonUniformAMD = 5004, - SpvOpGroupFMaxNonUniformAMD = 5005, - SpvOpGroupUMaxNonUniformAMD = 5006, - SpvOpGroupSMaxNonUniformAMD = 5007, - SpvOpFragmentMaskFetchAMD = 5011, - SpvOpFragmentFetchAMD = 5012, - SpvOpSubgroupShuffleINTEL = 5571, - SpvOpSubgroupShuffleDownINTEL = 5572, - SpvOpSubgroupShuffleUpINTEL = 5573, - SpvOpSubgroupShuffleXorINTEL = 5574, - SpvOpSubgroupBlockReadINTEL = 5575, - SpvOpSubgroupBlockWriteINTEL = 5576, - SpvOpSubgroupImageBlockReadINTEL = 5577, - SpvOpSubgroupImageBlockWriteINTEL = 5578, - SpvOpDecorateStringGOOGLE = 5632, - SpvOpMemberDecorateStringGOOGLE = 5633, - SpvOpMax = 0x7fffffff, -} SpvOp; - -#endif // #ifndef spirv_H - diff --git a/dependencies/spirv/include/spirv/1.0/GLSL.std.450.h b/dependencies/spirv/include/spirv/1.6/GLSL.std.450.h similarity index 100% rename from dependencies/spirv/include/spirv/1.0/GLSL.std.450.h rename to dependencies/spirv/include/spirv/1.6/GLSL.std.450.h diff --git a/dependencies/spirv/include/spirv/1.6/spirv.h b/dependencies/spirv/include/spirv/1.6/spirv.h new file mode 100644 index 000000000000..73d6c76614a2 --- /dev/null +++ b/dependencies/spirv/include/spirv/1.6/spirv.h @@ -0,0 +1,2576 @@ +/* +** Copyright (c) 2014-2020 The Khronos Group Inc. +** +** Permission is hereby granted, free of charge, to any person obtaining a copy +** of this software and/or associated documentation files (the "Materials"), +** to deal in the Materials without restriction, including without limitation +** the rights to use, copy, modify, merge, publish, distribute, sublicense, +** and/or sell copies of the Materials, and to permit persons to whom the +** Materials are furnished to do so, subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in +** all copies or substantial portions of the Materials. +** +** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS +** STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND +** HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +** THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +** FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS +** IN THE MATERIALS. +*/ + +/* +** This header is automatically generated by the same tool that creates +** the Binary Section of the SPIR-V specification. 
+*/ + +/* +** Enumeration tokens for SPIR-V, in various styles: +** C, C++, C++11, JSON, Lua, Python, C#, D, Beef +** +** - C will have tokens with a "Spv" prefix, e.g.: SpvSourceLanguageGLSL +** - C++ will have tokens in the "spv" name space, e.g.: spv::SourceLanguageGLSL +** - C++11 will use enum classes in the spv namespace, e.g.: spv::SourceLanguage::GLSL +** - Lua will use tables, e.g.: spv.SourceLanguage.GLSL +** - Python will use dictionaries, e.g.: spv['SourceLanguage']['GLSL'] +** - C# will use enum classes in the Specification class located in the "Spv" namespace, +** e.g.: Spv.Specification.SourceLanguage.GLSL +** - D will have tokens under the "spv" module, e.g: spv.SourceLanguage.GLSL +** - Beef will use enum classes in the Specification class located in the "Spv" namespace, +** e.g.: Spv.Specification.SourceLanguage.GLSL +** +** Some tokens act like mask values, which can be OR'd together, +** while others are mutually exclusive. The mask-like ones have +** "Mask" in their name, and a parallel enum that has the shift +** amount (1 << x) for each corresponding enumerant. +*/ + +#ifndef spirv_H +#define spirv_H + +typedef unsigned int SpvId; + +#define SPV_VERSION 0x10600 +#define SPV_REVISION 1 + +static const unsigned int SpvMagicNumber = 0x07230203; +static const unsigned int SpvVersion = 0x00010600; +static const unsigned int SpvRevision = 1; +static const unsigned int SpvOpCodeMask = 0xffff; +static const unsigned int SpvWordCountShift = 16; + +typedef enum SpvSourceLanguage_ { + SpvSourceLanguageUnknown = 0, + SpvSourceLanguageESSL = 1, + SpvSourceLanguageGLSL = 2, + SpvSourceLanguageOpenCL_C = 3, + SpvSourceLanguageOpenCL_CPP = 4, + SpvSourceLanguageHLSL = 5, + SpvSourceLanguageCPP_for_OpenCL = 6, + SpvSourceLanguageSYCL = 7, + SpvSourceLanguageMax = 0x7fffffff, +} SpvSourceLanguage; + +typedef enum SpvExecutionModel_ { + SpvExecutionModelVertex = 0, + SpvExecutionModelTessellationControl = 1, + SpvExecutionModelTessellationEvaluation = 2, + SpvExecutionModelGeometry = 3, + SpvExecutionModelFragment = 4, + SpvExecutionModelGLCompute = 5, + SpvExecutionModelKernel = 6, + SpvExecutionModelTaskNV = 5267, + SpvExecutionModelMeshNV = 5268, + SpvExecutionModelRayGenerationKHR = 5313, + SpvExecutionModelRayGenerationNV = 5313, + SpvExecutionModelIntersectionKHR = 5314, + SpvExecutionModelIntersectionNV = 5314, + SpvExecutionModelAnyHitKHR = 5315, + SpvExecutionModelAnyHitNV = 5315, + SpvExecutionModelClosestHitKHR = 5316, + SpvExecutionModelClosestHitNV = 5316, + SpvExecutionModelMissKHR = 5317, + SpvExecutionModelMissNV = 5317, + SpvExecutionModelCallableKHR = 5318, + SpvExecutionModelCallableNV = 5318, + SpvExecutionModelTaskEXT = 5364, + SpvExecutionModelMeshEXT = 5365, + SpvExecutionModelMax = 0x7fffffff, +} SpvExecutionModel; + +typedef enum SpvAddressingModel_ { + SpvAddressingModelLogical = 0, + SpvAddressingModelPhysical32 = 1, + SpvAddressingModelPhysical64 = 2, + SpvAddressingModelPhysicalStorageBuffer64 = 5348, + SpvAddressingModelPhysicalStorageBuffer64EXT = 5348, + SpvAddressingModelMax = 0x7fffffff, +} SpvAddressingModel; + +typedef enum SpvMemoryModel_ { + SpvMemoryModelSimple = 0, + SpvMemoryModelGLSL450 = 1, + SpvMemoryModelOpenCL = 2, + SpvMemoryModelVulkan = 3, + SpvMemoryModelVulkanKHR = 3, + SpvMemoryModelMax = 0x7fffffff, +} SpvMemoryModel; + +typedef enum SpvExecutionMode_ { + SpvExecutionModeInvocations = 0, + SpvExecutionModeSpacingEqual = 1, + SpvExecutionModeSpacingFractionalEven = 2, + SpvExecutionModeSpacingFractionalOdd = 3, + 
SpvExecutionModeVertexOrderCw = 4, + SpvExecutionModeVertexOrderCcw = 5, + SpvExecutionModePixelCenterInteger = 6, + SpvExecutionModeOriginUpperLeft = 7, + SpvExecutionModeOriginLowerLeft = 8, + SpvExecutionModeEarlyFragmentTests = 9, + SpvExecutionModePointMode = 10, + SpvExecutionModeXfb = 11, + SpvExecutionModeDepthReplacing = 12, + SpvExecutionModeDepthGreater = 14, + SpvExecutionModeDepthLess = 15, + SpvExecutionModeDepthUnchanged = 16, + SpvExecutionModeLocalSize = 17, + SpvExecutionModeLocalSizeHint = 18, + SpvExecutionModeInputPoints = 19, + SpvExecutionModeInputLines = 20, + SpvExecutionModeInputLinesAdjacency = 21, + SpvExecutionModeTriangles = 22, + SpvExecutionModeInputTrianglesAdjacency = 23, + SpvExecutionModeQuads = 24, + SpvExecutionModeIsolines = 25, + SpvExecutionModeOutputVertices = 26, + SpvExecutionModeOutputPoints = 27, + SpvExecutionModeOutputLineStrip = 28, + SpvExecutionModeOutputTriangleStrip = 29, + SpvExecutionModeVecTypeHint = 30, + SpvExecutionModeContractionOff = 31, + SpvExecutionModeInitializer = 33, + SpvExecutionModeFinalizer = 34, + SpvExecutionModeSubgroupSize = 35, + SpvExecutionModeSubgroupsPerWorkgroup = 36, + SpvExecutionModeSubgroupsPerWorkgroupId = 37, + SpvExecutionModeLocalSizeId = 38, + SpvExecutionModeLocalSizeHintId = 39, + SpvExecutionModeSubgroupUniformControlFlowKHR = 4421, + SpvExecutionModePostDepthCoverage = 4446, + SpvExecutionModeDenormPreserve = 4459, + SpvExecutionModeDenormFlushToZero = 4460, + SpvExecutionModeSignedZeroInfNanPreserve = 4461, + SpvExecutionModeRoundingModeRTE = 4462, + SpvExecutionModeRoundingModeRTZ = 4463, + SpvExecutionModeEarlyAndLateFragmentTestsAMD = 5017, + SpvExecutionModeStencilRefReplacingEXT = 5027, + SpvExecutionModeStencilRefUnchangedFrontAMD = 5079, + SpvExecutionModeStencilRefGreaterFrontAMD = 5080, + SpvExecutionModeStencilRefLessFrontAMD = 5081, + SpvExecutionModeStencilRefUnchangedBackAMD = 5082, + SpvExecutionModeStencilRefGreaterBackAMD = 5083, + SpvExecutionModeStencilRefLessBackAMD = 5084, + SpvExecutionModeOutputLinesEXT = 5269, + SpvExecutionModeOutputLinesNV = 5269, + SpvExecutionModeOutputPrimitivesEXT = 5270, + SpvExecutionModeOutputPrimitivesNV = 5270, + SpvExecutionModeDerivativeGroupQuadsNV = 5289, + SpvExecutionModeDerivativeGroupLinearNV = 5290, + SpvExecutionModeOutputTrianglesEXT = 5298, + SpvExecutionModeOutputTrianglesNV = 5298, + SpvExecutionModePixelInterlockOrderedEXT = 5366, + SpvExecutionModePixelInterlockUnorderedEXT = 5367, + SpvExecutionModeSampleInterlockOrderedEXT = 5368, + SpvExecutionModeSampleInterlockUnorderedEXT = 5369, + SpvExecutionModeShadingRateInterlockOrderedEXT = 5370, + SpvExecutionModeShadingRateInterlockUnorderedEXT = 5371, + SpvExecutionModeSharedLocalMemorySizeINTEL = 5618, + SpvExecutionModeRoundingModeRTPINTEL = 5620, + SpvExecutionModeRoundingModeRTNINTEL = 5621, + SpvExecutionModeFloatingPointModeALTINTEL = 5622, + SpvExecutionModeFloatingPointModeIEEEINTEL = 5623, + SpvExecutionModeMaxWorkgroupSizeINTEL = 5893, + SpvExecutionModeMaxWorkDimINTEL = 5894, + SpvExecutionModeNoGlobalOffsetINTEL = 5895, + SpvExecutionModeNumSIMDWorkitemsINTEL = 5896, + SpvExecutionModeSchedulerTargetFmaxMhzINTEL = 5903, + SpvExecutionModeNamedBarrierCountINTEL = 6417, + SpvExecutionModeMax = 0x7fffffff, +} SpvExecutionMode; + +typedef enum SpvStorageClass_ { + SpvStorageClassUniformConstant = 0, + SpvStorageClassInput = 1, + SpvStorageClassUniform = 2, + SpvStorageClassOutput = 3, + SpvStorageClassWorkgroup = 4, + SpvStorageClassCrossWorkgroup = 5, + 
SpvStorageClassPrivate = 6, + SpvStorageClassFunction = 7, + SpvStorageClassGeneric = 8, + SpvStorageClassPushConstant = 9, + SpvStorageClassAtomicCounter = 10, + SpvStorageClassImage = 11, + SpvStorageClassStorageBuffer = 12, + SpvStorageClassCallableDataKHR = 5328, + SpvStorageClassCallableDataNV = 5328, + SpvStorageClassIncomingCallableDataKHR = 5329, + SpvStorageClassIncomingCallableDataNV = 5329, + SpvStorageClassRayPayloadKHR = 5338, + SpvStorageClassRayPayloadNV = 5338, + SpvStorageClassHitAttributeKHR = 5339, + SpvStorageClassHitAttributeNV = 5339, + SpvStorageClassIncomingRayPayloadKHR = 5342, + SpvStorageClassIncomingRayPayloadNV = 5342, + SpvStorageClassShaderRecordBufferKHR = 5343, + SpvStorageClassShaderRecordBufferNV = 5343, + SpvStorageClassPhysicalStorageBuffer = 5349, + SpvStorageClassPhysicalStorageBufferEXT = 5349, + SpvStorageClassTaskPayloadWorkgroupEXT = 5402, + SpvStorageClassCodeSectionINTEL = 5605, + SpvStorageClassDeviceOnlyINTEL = 5936, + SpvStorageClassHostOnlyINTEL = 5937, + SpvStorageClassMax = 0x7fffffff, +} SpvStorageClass; + +typedef enum SpvDim_ { + SpvDim1D = 0, + SpvDim2D = 1, + SpvDim3D = 2, + SpvDimCube = 3, + SpvDimRect = 4, + SpvDimBuffer = 5, + SpvDimSubpassData = 6, + SpvDimMax = 0x7fffffff, +} SpvDim; + +typedef enum SpvSamplerAddressingMode_ { + SpvSamplerAddressingModeNone = 0, + SpvSamplerAddressingModeClampToEdge = 1, + SpvSamplerAddressingModeClamp = 2, + SpvSamplerAddressingModeRepeat = 3, + SpvSamplerAddressingModeRepeatMirrored = 4, + SpvSamplerAddressingModeMax = 0x7fffffff, +} SpvSamplerAddressingMode; + +typedef enum SpvSamplerFilterMode_ { + SpvSamplerFilterModeNearest = 0, + SpvSamplerFilterModeLinear = 1, + SpvSamplerFilterModeMax = 0x7fffffff, +} SpvSamplerFilterMode; + +typedef enum SpvImageFormat_ { + SpvImageFormatUnknown = 0, + SpvImageFormatRgba32f = 1, + SpvImageFormatRgba16f = 2, + SpvImageFormatR32f = 3, + SpvImageFormatRgba8 = 4, + SpvImageFormatRgba8Snorm = 5, + SpvImageFormatRg32f = 6, + SpvImageFormatRg16f = 7, + SpvImageFormatR11fG11fB10f = 8, + SpvImageFormatR16f = 9, + SpvImageFormatRgba16 = 10, + SpvImageFormatRgb10A2 = 11, + SpvImageFormatRg16 = 12, + SpvImageFormatRg8 = 13, + SpvImageFormatR16 = 14, + SpvImageFormatR8 = 15, + SpvImageFormatRgba16Snorm = 16, + SpvImageFormatRg16Snorm = 17, + SpvImageFormatRg8Snorm = 18, + SpvImageFormatR16Snorm = 19, + SpvImageFormatR8Snorm = 20, + SpvImageFormatRgba32i = 21, + SpvImageFormatRgba16i = 22, + SpvImageFormatRgba8i = 23, + SpvImageFormatR32i = 24, + SpvImageFormatRg32i = 25, + SpvImageFormatRg16i = 26, + SpvImageFormatRg8i = 27, + SpvImageFormatR16i = 28, + SpvImageFormatR8i = 29, + SpvImageFormatRgba32ui = 30, + SpvImageFormatRgba16ui = 31, + SpvImageFormatRgba8ui = 32, + SpvImageFormatR32ui = 33, + SpvImageFormatRgb10a2ui = 34, + SpvImageFormatRg32ui = 35, + SpvImageFormatRg16ui = 36, + SpvImageFormatRg8ui = 37, + SpvImageFormatR16ui = 38, + SpvImageFormatR8ui = 39, + SpvImageFormatR64ui = 40, + SpvImageFormatR64i = 41, + SpvImageFormatMax = 0x7fffffff, +} SpvImageFormat; + +typedef enum SpvImageChannelOrder_ { + SpvImageChannelOrderR = 0, + SpvImageChannelOrderA = 1, + SpvImageChannelOrderRG = 2, + SpvImageChannelOrderRA = 3, + SpvImageChannelOrderRGB = 4, + SpvImageChannelOrderRGBA = 5, + SpvImageChannelOrderBGRA = 6, + SpvImageChannelOrderARGB = 7, + SpvImageChannelOrderIntensity = 8, + SpvImageChannelOrderLuminance = 9, + SpvImageChannelOrderRx = 10, + SpvImageChannelOrderRGx = 11, + SpvImageChannelOrderRGBx = 12, + SpvImageChannelOrderDepth = 13, + 
SpvImageChannelOrderDepthStencil = 14, + SpvImageChannelOrdersRGB = 15, + SpvImageChannelOrdersRGBx = 16, + SpvImageChannelOrdersRGBA = 17, + SpvImageChannelOrdersBGRA = 18, + SpvImageChannelOrderABGR = 19, + SpvImageChannelOrderMax = 0x7fffffff, +} SpvImageChannelOrder; + +typedef enum SpvImageChannelDataType_ { + SpvImageChannelDataTypeSnormInt8 = 0, + SpvImageChannelDataTypeSnormInt16 = 1, + SpvImageChannelDataTypeUnormInt8 = 2, + SpvImageChannelDataTypeUnormInt16 = 3, + SpvImageChannelDataTypeUnormShort565 = 4, + SpvImageChannelDataTypeUnormShort555 = 5, + SpvImageChannelDataTypeUnormInt101010 = 6, + SpvImageChannelDataTypeSignedInt8 = 7, + SpvImageChannelDataTypeSignedInt16 = 8, + SpvImageChannelDataTypeSignedInt32 = 9, + SpvImageChannelDataTypeUnsignedInt8 = 10, + SpvImageChannelDataTypeUnsignedInt16 = 11, + SpvImageChannelDataTypeUnsignedInt32 = 12, + SpvImageChannelDataTypeHalfFloat = 13, + SpvImageChannelDataTypeFloat = 14, + SpvImageChannelDataTypeUnormInt24 = 15, + SpvImageChannelDataTypeUnormInt101010_2 = 16, + SpvImageChannelDataTypeMax = 0x7fffffff, +} SpvImageChannelDataType; + +typedef enum SpvImageOperandsShift_ { + SpvImageOperandsBiasShift = 0, + SpvImageOperandsLodShift = 1, + SpvImageOperandsGradShift = 2, + SpvImageOperandsConstOffsetShift = 3, + SpvImageOperandsOffsetShift = 4, + SpvImageOperandsConstOffsetsShift = 5, + SpvImageOperandsSampleShift = 6, + SpvImageOperandsMinLodShift = 7, + SpvImageOperandsMakeTexelAvailableShift = 8, + SpvImageOperandsMakeTexelAvailableKHRShift = 8, + SpvImageOperandsMakeTexelVisibleShift = 9, + SpvImageOperandsMakeTexelVisibleKHRShift = 9, + SpvImageOperandsNonPrivateTexelShift = 10, + SpvImageOperandsNonPrivateTexelKHRShift = 10, + SpvImageOperandsVolatileTexelShift = 11, + SpvImageOperandsVolatileTexelKHRShift = 11, + SpvImageOperandsSignExtendShift = 12, + SpvImageOperandsZeroExtendShift = 13, + SpvImageOperandsNontemporalShift = 14, + SpvImageOperandsOffsetsShift = 16, + SpvImageOperandsMax = 0x7fffffff, +} SpvImageOperandsShift; + +typedef enum SpvImageOperandsMask_ { + SpvImageOperandsMaskNone = 0, + SpvImageOperandsBiasMask = 0x00000001, + SpvImageOperandsLodMask = 0x00000002, + SpvImageOperandsGradMask = 0x00000004, + SpvImageOperandsConstOffsetMask = 0x00000008, + SpvImageOperandsOffsetMask = 0x00000010, + SpvImageOperandsConstOffsetsMask = 0x00000020, + SpvImageOperandsSampleMask = 0x00000040, + SpvImageOperandsMinLodMask = 0x00000080, + SpvImageOperandsMakeTexelAvailableMask = 0x00000100, + SpvImageOperandsMakeTexelAvailableKHRMask = 0x00000100, + SpvImageOperandsMakeTexelVisibleMask = 0x00000200, + SpvImageOperandsMakeTexelVisibleKHRMask = 0x00000200, + SpvImageOperandsNonPrivateTexelMask = 0x00000400, + SpvImageOperandsNonPrivateTexelKHRMask = 0x00000400, + SpvImageOperandsVolatileTexelMask = 0x00000800, + SpvImageOperandsVolatileTexelKHRMask = 0x00000800, + SpvImageOperandsSignExtendMask = 0x00001000, + SpvImageOperandsZeroExtendMask = 0x00002000, + SpvImageOperandsNontemporalMask = 0x00004000, + SpvImageOperandsOffsetsMask = 0x00010000, +} SpvImageOperandsMask; + +typedef enum SpvFPFastMathModeShift_ { + SpvFPFastMathModeNotNaNShift = 0, + SpvFPFastMathModeNotInfShift = 1, + SpvFPFastMathModeNSZShift = 2, + SpvFPFastMathModeAllowRecipShift = 3, + SpvFPFastMathModeFastShift = 4, + SpvFPFastMathModeAllowContractFastINTELShift = 16, + SpvFPFastMathModeAllowReassocINTELShift = 17, + SpvFPFastMathModeMax = 0x7fffffff, +} SpvFPFastMathModeShift; + +typedef enum SpvFPFastMathModeMask_ { + SpvFPFastMathModeMaskNone = 0, + 
SpvFPFastMathModeNotNaNMask = 0x00000001, + SpvFPFastMathModeNotInfMask = 0x00000002, + SpvFPFastMathModeNSZMask = 0x00000004, + SpvFPFastMathModeAllowRecipMask = 0x00000008, + SpvFPFastMathModeFastMask = 0x00000010, + SpvFPFastMathModeAllowContractFastINTELMask = 0x00010000, + SpvFPFastMathModeAllowReassocINTELMask = 0x00020000, +} SpvFPFastMathModeMask; + +typedef enum SpvFPRoundingMode_ { + SpvFPRoundingModeRTE = 0, + SpvFPRoundingModeRTZ = 1, + SpvFPRoundingModeRTP = 2, + SpvFPRoundingModeRTN = 3, + SpvFPRoundingModeMax = 0x7fffffff, +} SpvFPRoundingMode; + +typedef enum SpvLinkageType_ { + SpvLinkageTypeExport = 0, + SpvLinkageTypeImport = 1, + SpvLinkageTypeLinkOnceODR = 2, + SpvLinkageTypeMax = 0x7fffffff, +} SpvLinkageType; + +typedef enum SpvAccessQualifier_ { + SpvAccessQualifierReadOnly = 0, + SpvAccessQualifierWriteOnly = 1, + SpvAccessQualifierReadWrite = 2, + SpvAccessQualifierMax = 0x7fffffff, +} SpvAccessQualifier; + +typedef enum SpvFunctionParameterAttribute_ { + SpvFunctionParameterAttributeZext = 0, + SpvFunctionParameterAttributeSext = 1, + SpvFunctionParameterAttributeByVal = 2, + SpvFunctionParameterAttributeSret = 3, + SpvFunctionParameterAttributeNoAlias = 4, + SpvFunctionParameterAttributeNoCapture = 5, + SpvFunctionParameterAttributeNoWrite = 6, + SpvFunctionParameterAttributeNoReadWrite = 7, + SpvFunctionParameterAttributeMax = 0x7fffffff, +} SpvFunctionParameterAttribute; + +typedef enum SpvDecoration_ { + SpvDecorationRelaxedPrecision = 0, + SpvDecorationSpecId = 1, + SpvDecorationBlock = 2, + SpvDecorationBufferBlock = 3, + SpvDecorationRowMajor = 4, + SpvDecorationColMajor = 5, + SpvDecorationArrayStride = 6, + SpvDecorationMatrixStride = 7, + SpvDecorationGLSLShared = 8, + SpvDecorationGLSLPacked = 9, + SpvDecorationCPacked = 10, + SpvDecorationBuiltIn = 11, + SpvDecorationNoPerspective = 13, + SpvDecorationFlat = 14, + SpvDecorationPatch = 15, + SpvDecorationCentroid = 16, + SpvDecorationSample = 17, + SpvDecorationInvariant = 18, + SpvDecorationRestrict = 19, + SpvDecorationAliased = 20, + SpvDecorationVolatile = 21, + SpvDecorationConstant = 22, + SpvDecorationCoherent = 23, + SpvDecorationNonWritable = 24, + SpvDecorationNonReadable = 25, + SpvDecorationUniform = 26, + SpvDecorationUniformId = 27, + SpvDecorationSaturatedConversion = 28, + SpvDecorationStream = 29, + SpvDecorationLocation = 30, + SpvDecorationComponent = 31, + SpvDecorationIndex = 32, + SpvDecorationBinding = 33, + SpvDecorationDescriptorSet = 34, + SpvDecorationOffset = 35, + SpvDecorationXfbBuffer = 36, + SpvDecorationXfbStride = 37, + SpvDecorationFuncParamAttr = 38, + SpvDecorationFPRoundingMode = 39, + SpvDecorationFPFastMathMode = 40, + SpvDecorationLinkageAttributes = 41, + SpvDecorationNoContraction = 42, + SpvDecorationInputAttachmentIndex = 43, + SpvDecorationAlignment = 44, + SpvDecorationMaxByteOffset = 45, + SpvDecorationAlignmentId = 46, + SpvDecorationMaxByteOffsetId = 47, + SpvDecorationNoSignedWrap = 4469, + SpvDecorationNoUnsignedWrap = 4470, + SpvDecorationExplicitInterpAMD = 4999, + SpvDecorationOverrideCoverageNV = 5248, + SpvDecorationPassthroughNV = 5250, + SpvDecorationViewportRelativeNV = 5252, + SpvDecorationSecondaryViewportRelativeNV = 5256, + SpvDecorationPerPrimitiveEXT = 5271, + SpvDecorationPerPrimitiveNV = 5271, + SpvDecorationPerViewNV = 5272, + SpvDecorationPerTaskNV = 5273, + SpvDecorationPerVertexKHR = 5285, + SpvDecorationPerVertexNV = 5285, + SpvDecorationNonUniform = 5300, + SpvDecorationNonUniformEXT = 5300, + SpvDecorationRestrictPointer = 5355, 
+ SpvDecorationRestrictPointerEXT = 5355, + SpvDecorationAliasedPointer = 5356, + SpvDecorationAliasedPointerEXT = 5356, + SpvDecorationBindlessSamplerNV = 5398, + SpvDecorationBindlessImageNV = 5399, + SpvDecorationBoundSamplerNV = 5400, + SpvDecorationBoundImageNV = 5401, + SpvDecorationSIMTCallINTEL = 5599, + SpvDecorationReferencedIndirectlyINTEL = 5602, + SpvDecorationClobberINTEL = 5607, + SpvDecorationSideEffectsINTEL = 5608, + SpvDecorationVectorComputeVariableINTEL = 5624, + SpvDecorationFuncParamIOKindINTEL = 5625, + SpvDecorationVectorComputeFunctionINTEL = 5626, + SpvDecorationStackCallINTEL = 5627, + SpvDecorationGlobalVariableOffsetINTEL = 5628, + SpvDecorationCounterBuffer = 5634, + SpvDecorationHlslCounterBufferGOOGLE = 5634, + SpvDecorationHlslSemanticGOOGLE = 5635, + SpvDecorationUserSemantic = 5635, + SpvDecorationUserTypeGOOGLE = 5636, + SpvDecorationFunctionRoundingModeINTEL = 5822, + SpvDecorationFunctionDenormModeINTEL = 5823, + SpvDecorationRegisterINTEL = 5825, + SpvDecorationMemoryINTEL = 5826, + SpvDecorationNumbanksINTEL = 5827, + SpvDecorationBankwidthINTEL = 5828, + SpvDecorationMaxPrivateCopiesINTEL = 5829, + SpvDecorationSinglepumpINTEL = 5830, + SpvDecorationDoublepumpINTEL = 5831, + SpvDecorationMaxReplicatesINTEL = 5832, + SpvDecorationSimpleDualPortINTEL = 5833, + SpvDecorationMergeINTEL = 5834, + SpvDecorationBankBitsINTEL = 5835, + SpvDecorationForcePow2DepthINTEL = 5836, + SpvDecorationBurstCoalesceINTEL = 5899, + SpvDecorationCacheSizeINTEL = 5900, + SpvDecorationDontStaticallyCoalesceINTEL = 5901, + SpvDecorationPrefetchINTEL = 5902, + SpvDecorationStallEnableINTEL = 5905, + SpvDecorationFuseLoopsInFunctionINTEL = 5907, + SpvDecorationAliasScopeINTEL = 5914, + SpvDecorationNoAliasINTEL = 5915, + SpvDecorationBufferLocationINTEL = 5921, + SpvDecorationIOPipeStorageINTEL = 5944, + SpvDecorationFunctionFloatingPointModeINTEL = 6080, + SpvDecorationSingleElementVectorINTEL = 6085, + SpvDecorationVectorComputeCallableFunctionINTEL = 6087, + SpvDecorationMediaBlockIOINTEL = 6140, + SpvDecorationMax = 0x7fffffff, +} SpvDecoration; + +typedef enum SpvBuiltIn_ { + SpvBuiltInPosition = 0, + SpvBuiltInPointSize = 1, + SpvBuiltInClipDistance = 3, + SpvBuiltInCullDistance = 4, + SpvBuiltInVertexId = 5, + SpvBuiltInInstanceId = 6, + SpvBuiltInPrimitiveId = 7, + SpvBuiltInInvocationId = 8, + SpvBuiltInLayer = 9, + SpvBuiltInViewportIndex = 10, + SpvBuiltInTessLevelOuter = 11, + SpvBuiltInTessLevelInner = 12, + SpvBuiltInTessCoord = 13, + SpvBuiltInPatchVertices = 14, + SpvBuiltInFragCoord = 15, + SpvBuiltInPointCoord = 16, + SpvBuiltInFrontFacing = 17, + SpvBuiltInSampleId = 18, + SpvBuiltInSamplePosition = 19, + SpvBuiltInSampleMask = 20, + SpvBuiltInFragDepth = 22, + SpvBuiltInHelperInvocation = 23, + SpvBuiltInNumWorkgroups = 24, + SpvBuiltInWorkgroupSize = 25, + SpvBuiltInWorkgroupId = 26, + SpvBuiltInLocalInvocationId = 27, + SpvBuiltInGlobalInvocationId = 28, + SpvBuiltInLocalInvocationIndex = 29, + SpvBuiltInWorkDim = 30, + SpvBuiltInGlobalSize = 31, + SpvBuiltInEnqueuedWorkgroupSize = 32, + SpvBuiltInGlobalOffset = 33, + SpvBuiltInGlobalLinearId = 34, + SpvBuiltInSubgroupSize = 36, + SpvBuiltInSubgroupMaxSize = 37, + SpvBuiltInNumSubgroups = 38, + SpvBuiltInNumEnqueuedSubgroups = 39, + SpvBuiltInSubgroupId = 40, + SpvBuiltInSubgroupLocalInvocationId = 41, + SpvBuiltInVertexIndex = 42, + SpvBuiltInInstanceIndex = 43, + SpvBuiltInCoreIDARM = 4160, + SpvBuiltInCoreCountARM = 4161, + SpvBuiltInCoreMaxIDARM = 4162, + SpvBuiltInWarpIDARM = 4163, + 
SpvBuiltInWarpMaxIDARM = 4164, + SpvBuiltInSubgroupEqMask = 4416, + SpvBuiltInSubgroupEqMaskKHR = 4416, + SpvBuiltInSubgroupGeMask = 4417, + SpvBuiltInSubgroupGeMaskKHR = 4417, + SpvBuiltInSubgroupGtMask = 4418, + SpvBuiltInSubgroupGtMaskKHR = 4418, + SpvBuiltInSubgroupLeMask = 4419, + SpvBuiltInSubgroupLeMaskKHR = 4419, + SpvBuiltInSubgroupLtMask = 4420, + SpvBuiltInSubgroupLtMaskKHR = 4420, + SpvBuiltInBaseVertex = 4424, + SpvBuiltInBaseInstance = 4425, + SpvBuiltInDrawIndex = 4426, + SpvBuiltInPrimitiveShadingRateKHR = 4432, + SpvBuiltInDeviceIndex = 4438, + SpvBuiltInViewIndex = 4440, + SpvBuiltInShadingRateKHR = 4444, + SpvBuiltInBaryCoordNoPerspAMD = 4992, + SpvBuiltInBaryCoordNoPerspCentroidAMD = 4993, + SpvBuiltInBaryCoordNoPerspSampleAMD = 4994, + SpvBuiltInBaryCoordSmoothAMD = 4995, + SpvBuiltInBaryCoordSmoothCentroidAMD = 4996, + SpvBuiltInBaryCoordSmoothSampleAMD = 4997, + SpvBuiltInBaryCoordPullModelAMD = 4998, + SpvBuiltInFragStencilRefEXT = 5014, + SpvBuiltInViewportMaskNV = 5253, + SpvBuiltInSecondaryPositionNV = 5257, + SpvBuiltInSecondaryViewportMaskNV = 5258, + SpvBuiltInPositionPerViewNV = 5261, + SpvBuiltInViewportMaskPerViewNV = 5262, + SpvBuiltInFullyCoveredEXT = 5264, + SpvBuiltInTaskCountNV = 5274, + SpvBuiltInPrimitiveCountNV = 5275, + SpvBuiltInPrimitiveIndicesNV = 5276, + SpvBuiltInClipDistancePerViewNV = 5277, + SpvBuiltInCullDistancePerViewNV = 5278, + SpvBuiltInLayerPerViewNV = 5279, + SpvBuiltInMeshViewCountNV = 5280, + SpvBuiltInMeshViewIndicesNV = 5281, + SpvBuiltInBaryCoordKHR = 5286, + SpvBuiltInBaryCoordNV = 5286, + SpvBuiltInBaryCoordNoPerspKHR = 5287, + SpvBuiltInBaryCoordNoPerspNV = 5287, + SpvBuiltInFragSizeEXT = 5292, + SpvBuiltInFragmentSizeNV = 5292, + SpvBuiltInFragInvocationCountEXT = 5293, + SpvBuiltInInvocationsPerPixelNV = 5293, + SpvBuiltInPrimitivePointIndicesEXT = 5294, + SpvBuiltInPrimitiveLineIndicesEXT = 5295, + SpvBuiltInPrimitiveTriangleIndicesEXT = 5296, + SpvBuiltInCullPrimitiveEXT = 5299, + SpvBuiltInLaunchIdKHR = 5319, + SpvBuiltInLaunchIdNV = 5319, + SpvBuiltInLaunchSizeKHR = 5320, + SpvBuiltInLaunchSizeNV = 5320, + SpvBuiltInWorldRayOriginKHR = 5321, + SpvBuiltInWorldRayOriginNV = 5321, + SpvBuiltInWorldRayDirectionKHR = 5322, + SpvBuiltInWorldRayDirectionNV = 5322, + SpvBuiltInObjectRayOriginKHR = 5323, + SpvBuiltInObjectRayOriginNV = 5323, + SpvBuiltInObjectRayDirectionKHR = 5324, + SpvBuiltInObjectRayDirectionNV = 5324, + SpvBuiltInRayTminKHR = 5325, + SpvBuiltInRayTminNV = 5325, + SpvBuiltInRayTmaxKHR = 5326, + SpvBuiltInRayTmaxNV = 5326, + SpvBuiltInInstanceCustomIndexKHR = 5327, + SpvBuiltInInstanceCustomIndexNV = 5327, + SpvBuiltInObjectToWorldKHR = 5330, + SpvBuiltInObjectToWorldNV = 5330, + SpvBuiltInWorldToObjectKHR = 5331, + SpvBuiltInWorldToObjectNV = 5331, + SpvBuiltInHitTNV = 5332, + SpvBuiltInHitKindKHR = 5333, + SpvBuiltInHitKindNV = 5333, + SpvBuiltInCurrentRayTimeNV = 5334, + SpvBuiltInIncomingRayFlagsKHR = 5351, + SpvBuiltInIncomingRayFlagsNV = 5351, + SpvBuiltInRayGeometryIndexKHR = 5352, + SpvBuiltInWarpsPerSMNV = 5374, + SpvBuiltInSMCountNV = 5375, + SpvBuiltInWarpIDNV = 5376, + SpvBuiltInSMIDNV = 5377, + SpvBuiltInCullMaskKHR = 6021, + SpvBuiltInMax = 0x7fffffff, +} SpvBuiltIn; + +typedef enum SpvSelectionControlShift_ { + SpvSelectionControlFlattenShift = 0, + SpvSelectionControlDontFlattenShift = 1, + SpvSelectionControlMax = 0x7fffffff, +} SpvSelectionControlShift; + +typedef enum SpvSelectionControlMask_ { + SpvSelectionControlMaskNone = 0, + SpvSelectionControlFlattenMask = 0x00000001, + 
SpvSelectionControlDontFlattenMask = 0x00000002, +} SpvSelectionControlMask; + +typedef enum SpvLoopControlShift_ { + SpvLoopControlUnrollShift = 0, + SpvLoopControlDontUnrollShift = 1, + SpvLoopControlDependencyInfiniteShift = 2, + SpvLoopControlDependencyLengthShift = 3, + SpvLoopControlMinIterationsShift = 4, + SpvLoopControlMaxIterationsShift = 5, + SpvLoopControlIterationMultipleShift = 6, + SpvLoopControlPeelCountShift = 7, + SpvLoopControlPartialCountShift = 8, + SpvLoopControlInitiationIntervalINTELShift = 16, + SpvLoopControlMaxConcurrencyINTELShift = 17, + SpvLoopControlDependencyArrayINTELShift = 18, + SpvLoopControlPipelineEnableINTELShift = 19, + SpvLoopControlLoopCoalesceINTELShift = 20, + SpvLoopControlMaxInterleavingINTELShift = 21, + SpvLoopControlSpeculatedIterationsINTELShift = 22, + SpvLoopControlNoFusionINTELShift = 23, + SpvLoopControlMax = 0x7fffffff, +} SpvLoopControlShift; + +typedef enum SpvLoopControlMask_ { + SpvLoopControlMaskNone = 0, + SpvLoopControlUnrollMask = 0x00000001, + SpvLoopControlDontUnrollMask = 0x00000002, + SpvLoopControlDependencyInfiniteMask = 0x00000004, + SpvLoopControlDependencyLengthMask = 0x00000008, + SpvLoopControlMinIterationsMask = 0x00000010, + SpvLoopControlMaxIterationsMask = 0x00000020, + SpvLoopControlIterationMultipleMask = 0x00000040, + SpvLoopControlPeelCountMask = 0x00000080, + SpvLoopControlPartialCountMask = 0x00000100, + SpvLoopControlInitiationIntervalINTELMask = 0x00010000, + SpvLoopControlMaxConcurrencyINTELMask = 0x00020000, + SpvLoopControlDependencyArrayINTELMask = 0x00040000, + SpvLoopControlPipelineEnableINTELMask = 0x00080000, + SpvLoopControlLoopCoalesceINTELMask = 0x00100000, + SpvLoopControlMaxInterleavingINTELMask = 0x00200000, + SpvLoopControlSpeculatedIterationsINTELMask = 0x00400000, + SpvLoopControlNoFusionINTELMask = 0x00800000, +} SpvLoopControlMask; + +typedef enum SpvFunctionControlShift_ { + SpvFunctionControlInlineShift = 0, + SpvFunctionControlDontInlineShift = 1, + SpvFunctionControlPureShift = 2, + SpvFunctionControlConstShift = 3, + SpvFunctionControlOptNoneINTELShift = 16, + SpvFunctionControlMax = 0x7fffffff, +} SpvFunctionControlShift; + +typedef enum SpvFunctionControlMask_ { + SpvFunctionControlMaskNone = 0, + SpvFunctionControlInlineMask = 0x00000001, + SpvFunctionControlDontInlineMask = 0x00000002, + SpvFunctionControlPureMask = 0x00000004, + SpvFunctionControlConstMask = 0x00000008, + SpvFunctionControlOptNoneINTELMask = 0x00010000, +} SpvFunctionControlMask; + +typedef enum SpvMemorySemanticsShift_ { + SpvMemorySemanticsAcquireShift = 1, + SpvMemorySemanticsReleaseShift = 2, + SpvMemorySemanticsAcquireReleaseShift = 3, + SpvMemorySemanticsSequentiallyConsistentShift = 4, + SpvMemorySemanticsUniformMemoryShift = 6, + SpvMemorySemanticsSubgroupMemoryShift = 7, + SpvMemorySemanticsWorkgroupMemoryShift = 8, + SpvMemorySemanticsCrossWorkgroupMemoryShift = 9, + SpvMemorySemanticsAtomicCounterMemoryShift = 10, + SpvMemorySemanticsImageMemoryShift = 11, + SpvMemorySemanticsOutputMemoryShift = 12, + SpvMemorySemanticsOutputMemoryKHRShift = 12, + SpvMemorySemanticsMakeAvailableShift = 13, + SpvMemorySemanticsMakeAvailableKHRShift = 13, + SpvMemorySemanticsMakeVisibleShift = 14, + SpvMemorySemanticsMakeVisibleKHRShift = 14, + SpvMemorySemanticsVolatileShift = 15, + SpvMemorySemanticsMax = 0x7fffffff, +} SpvMemorySemanticsShift; + +typedef enum SpvMemorySemanticsMask_ { + SpvMemorySemanticsMaskNone = 0, + SpvMemorySemanticsAcquireMask = 0x00000002, + SpvMemorySemanticsReleaseMask = 0x00000004, + 
SpvMemorySemanticsAcquireReleaseMask = 0x00000008, + SpvMemorySemanticsSequentiallyConsistentMask = 0x00000010, + SpvMemorySemanticsUniformMemoryMask = 0x00000040, + SpvMemorySemanticsSubgroupMemoryMask = 0x00000080, + SpvMemorySemanticsWorkgroupMemoryMask = 0x00000100, + SpvMemorySemanticsCrossWorkgroupMemoryMask = 0x00000200, + SpvMemorySemanticsAtomicCounterMemoryMask = 0x00000400, + SpvMemorySemanticsImageMemoryMask = 0x00000800, + SpvMemorySemanticsOutputMemoryMask = 0x00001000, + SpvMemorySemanticsOutputMemoryKHRMask = 0x00001000, + SpvMemorySemanticsMakeAvailableMask = 0x00002000, + SpvMemorySemanticsMakeAvailableKHRMask = 0x00002000, + SpvMemorySemanticsMakeVisibleMask = 0x00004000, + SpvMemorySemanticsMakeVisibleKHRMask = 0x00004000, + SpvMemorySemanticsVolatileMask = 0x00008000, +} SpvMemorySemanticsMask; + +typedef enum SpvMemoryAccessShift_ { + SpvMemoryAccessVolatileShift = 0, + SpvMemoryAccessAlignedShift = 1, + SpvMemoryAccessNontemporalShift = 2, + SpvMemoryAccessMakePointerAvailableShift = 3, + SpvMemoryAccessMakePointerAvailableKHRShift = 3, + SpvMemoryAccessMakePointerVisibleShift = 4, + SpvMemoryAccessMakePointerVisibleKHRShift = 4, + SpvMemoryAccessNonPrivatePointerShift = 5, + SpvMemoryAccessNonPrivatePointerKHRShift = 5, + SpvMemoryAccessAliasScopeINTELMaskShift = 16, + SpvMemoryAccessNoAliasINTELMaskShift = 17, + SpvMemoryAccessMax = 0x7fffffff, +} SpvMemoryAccessShift; + +typedef enum SpvMemoryAccessMask_ { + SpvMemoryAccessMaskNone = 0, + SpvMemoryAccessVolatileMask = 0x00000001, + SpvMemoryAccessAlignedMask = 0x00000002, + SpvMemoryAccessNontemporalMask = 0x00000004, + SpvMemoryAccessMakePointerAvailableMask = 0x00000008, + SpvMemoryAccessMakePointerAvailableKHRMask = 0x00000008, + SpvMemoryAccessMakePointerVisibleMask = 0x00000010, + SpvMemoryAccessMakePointerVisibleKHRMask = 0x00000010, + SpvMemoryAccessNonPrivatePointerMask = 0x00000020, + SpvMemoryAccessNonPrivatePointerKHRMask = 0x00000020, + SpvMemoryAccessAliasScopeINTELMaskMask = 0x00010000, + SpvMemoryAccessNoAliasINTELMaskMask = 0x00020000, +} SpvMemoryAccessMask; + +typedef enum SpvScope_ { + SpvScopeCrossDevice = 0, + SpvScopeDevice = 1, + SpvScopeWorkgroup = 2, + SpvScopeSubgroup = 3, + SpvScopeInvocation = 4, + SpvScopeQueueFamily = 5, + SpvScopeQueueFamilyKHR = 5, + SpvScopeShaderCallKHR = 6, + SpvScopeMax = 0x7fffffff, +} SpvScope; + +typedef enum SpvGroupOperation_ { + SpvGroupOperationReduce = 0, + SpvGroupOperationInclusiveScan = 1, + SpvGroupOperationExclusiveScan = 2, + SpvGroupOperationClusteredReduce = 3, + SpvGroupOperationPartitionedReduceNV = 6, + SpvGroupOperationPartitionedInclusiveScanNV = 7, + SpvGroupOperationPartitionedExclusiveScanNV = 8, + SpvGroupOperationMax = 0x7fffffff, +} SpvGroupOperation; + +typedef enum SpvKernelEnqueueFlags_ { + SpvKernelEnqueueFlagsNoWait = 0, + SpvKernelEnqueueFlagsWaitKernel = 1, + SpvKernelEnqueueFlagsWaitWorkGroup = 2, + SpvKernelEnqueueFlagsMax = 0x7fffffff, +} SpvKernelEnqueueFlags; + +typedef enum SpvKernelProfilingInfoShift_ { + SpvKernelProfilingInfoCmdExecTimeShift = 0, + SpvKernelProfilingInfoMax = 0x7fffffff, +} SpvKernelProfilingInfoShift; + +typedef enum SpvKernelProfilingInfoMask_ { + SpvKernelProfilingInfoMaskNone = 0, + SpvKernelProfilingInfoCmdExecTimeMask = 0x00000001, +} SpvKernelProfilingInfoMask; + +typedef enum SpvCapability_ { + SpvCapabilityMatrix = 0, + SpvCapabilityShader = 1, + SpvCapabilityGeometry = 2, + SpvCapabilityTessellation = 3, + SpvCapabilityAddresses = 4, + SpvCapabilityLinkage = 5, + SpvCapabilityKernel = 6, + 
SpvCapabilityVector16 = 7, + SpvCapabilityFloat16Buffer = 8, + SpvCapabilityFloat16 = 9, + SpvCapabilityFloat64 = 10, + SpvCapabilityInt64 = 11, + SpvCapabilityInt64Atomics = 12, + SpvCapabilityImageBasic = 13, + SpvCapabilityImageReadWrite = 14, + SpvCapabilityImageMipmap = 15, + SpvCapabilityPipes = 17, + SpvCapabilityGroups = 18, + SpvCapabilityDeviceEnqueue = 19, + SpvCapabilityLiteralSampler = 20, + SpvCapabilityAtomicStorage = 21, + SpvCapabilityInt16 = 22, + SpvCapabilityTessellationPointSize = 23, + SpvCapabilityGeometryPointSize = 24, + SpvCapabilityImageGatherExtended = 25, + SpvCapabilityStorageImageMultisample = 27, + SpvCapabilityUniformBufferArrayDynamicIndexing = 28, + SpvCapabilitySampledImageArrayDynamicIndexing = 29, + SpvCapabilityStorageBufferArrayDynamicIndexing = 30, + SpvCapabilityStorageImageArrayDynamicIndexing = 31, + SpvCapabilityClipDistance = 32, + SpvCapabilityCullDistance = 33, + SpvCapabilityImageCubeArray = 34, + SpvCapabilitySampleRateShading = 35, + SpvCapabilityImageRect = 36, + SpvCapabilitySampledRect = 37, + SpvCapabilityGenericPointer = 38, + SpvCapabilityInt8 = 39, + SpvCapabilityInputAttachment = 40, + SpvCapabilitySparseResidency = 41, + SpvCapabilityMinLod = 42, + SpvCapabilitySampled1D = 43, + SpvCapabilityImage1D = 44, + SpvCapabilitySampledCubeArray = 45, + SpvCapabilitySampledBuffer = 46, + SpvCapabilityImageBuffer = 47, + SpvCapabilityImageMSArray = 48, + SpvCapabilityStorageImageExtendedFormats = 49, + SpvCapabilityImageQuery = 50, + SpvCapabilityDerivativeControl = 51, + SpvCapabilityInterpolationFunction = 52, + SpvCapabilityTransformFeedback = 53, + SpvCapabilityGeometryStreams = 54, + SpvCapabilityStorageImageReadWithoutFormat = 55, + SpvCapabilityStorageImageWriteWithoutFormat = 56, + SpvCapabilityMultiViewport = 57, + SpvCapabilitySubgroupDispatch = 58, + SpvCapabilityNamedBarrier = 59, + SpvCapabilityPipeStorage = 60, + SpvCapabilityGroupNonUniform = 61, + SpvCapabilityGroupNonUniformVote = 62, + SpvCapabilityGroupNonUniformArithmetic = 63, + SpvCapabilityGroupNonUniformBallot = 64, + SpvCapabilityGroupNonUniformShuffle = 65, + SpvCapabilityGroupNonUniformShuffleRelative = 66, + SpvCapabilityGroupNonUniformClustered = 67, + SpvCapabilityGroupNonUniformQuad = 68, + SpvCapabilityShaderLayer = 69, + SpvCapabilityShaderViewportIndex = 70, + SpvCapabilityUniformDecoration = 71, + SpvCapabilityCoreBuiltinsARM = 4165, + SpvCapabilityFragmentShadingRateKHR = 4422, + SpvCapabilitySubgroupBallotKHR = 4423, + SpvCapabilityDrawParameters = 4427, + SpvCapabilityWorkgroupMemoryExplicitLayoutKHR = 4428, + SpvCapabilityWorkgroupMemoryExplicitLayout8BitAccessKHR = 4429, + SpvCapabilityWorkgroupMemoryExplicitLayout16BitAccessKHR = 4430, + SpvCapabilitySubgroupVoteKHR = 4431, + SpvCapabilityStorageBuffer16BitAccess = 4433, + SpvCapabilityStorageUniformBufferBlock16 = 4433, + SpvCapabilityStorageUniform16 = 4434, + SpvCapabilityUniformAndStorageBuffer16BitAccess = 4434, + SpvCapabilityStoragePushConstant16 = 4435, + SpvCapabilityStorageInputOutput16 = 4436, + SpvCapabilityDeviceGroup = 4437, + SpvCapabilityMultiView = 4439, + SpvCapabilityVariablePointersStorageBuffer = 4441, + SpvCapabilityVariablePointers = 4442, + SpvCapabilityAtomicStorageOps = 4445, + SpvCapabilitySampleMaskPostDepthCoverage = 4447, + SpvCapabilityStorageBuffer8BitAccess = 4448, + SpvCapabilityUniformAndStorageBuffer8BitAccess = 4449, + SpvCapabilityStoragePushConstant8 = 4450, + SpvCapabilityDenormPreserve = 4464, + SpvCapabilityDenormFlushToZero = 4465, + 
SpvCapabilitySignedZeroInfNanPreserve = 4466, + SpvCapabilityRoundingModeRTE = 4467, + SpvCapabilityRoundingModeRTZ = 4468, + SpvCapabilityRayQueryProvisionalKHR = 4471, + SpvCapabilityRayQueryKHR = 4472, + SpvCapabilityRayTraversalPrimitiveCullingKHR = 4478, + SpvCapabilityRayTracingKHR = 4479, + SpvCapabilityFloat16ImageAMD = 5008, + SpvCapabilityImageGatherBiasLodAMD = 5009, + SpvCapabilityFragmentMaskAMD = 5010, + SpvCapabilityStencilExportEXT = 5013, + SpvCapabilityImageReadWriteLodAMD = 5015, + SpvCapabilityInt64ImageEXT = 5016, + SpvCapabilityShaderClockKHR = 5055, + SpvCapabilitySampleMaskOverrideCoverageNV = 5249, + SpvCapabilityGeometryShaderPassthroughNV = 5251, + SpvCapabilityShaderViewportIndexLayerEXT = 5254, + SpvCapabilityShaderViewportIndexLayerNV = 5254, + SpvCapabilityShaderViewportMaskNV = 5255, + SpvCapabilityShaderStereoViewNV = 5259, + SpvCapabilityPerViewAttributesNV = 5260, + SpvCapabilityFragmentFullyCoveredEXT = 5265, + SpvCapabilityMeshShadingNV = 5266, + SpvCapabilityImageFootprintNV = 5282, + SpvCapabilityMeshShadingEXT = 5283, + SpvCapabilityFragmentBarycentricKHR = 5284, + SpvCapabilityFragmentBarycentricNV = 5284, + SpvCapabilityComputeDerivativeGroupQuadsNV = 5288, + SpvCapabilityFragmentDensityEXT = 5291, + SpvCapabilityShadingRateNV = 5291, + SpvCapabilityGroupNonUniformPartitionedNV = 5297, + SpvCapabilityShaderNonUniform = 5301, + SpvCapabilityShaderNonUniformEXT = 5301, + SpvCapabilityRuntimeDescriptorArray = 5302, + SpvCapabilityRuntimeDescriptorArrayEXT = 5302, + SpvCapabilityInputAttachmentArrayDynamicIndexing = 5303, + SpvCapabilityInputAttachmentArrayDynamicIndexingEXT = 5303, + SpvCapabilityUniformTexelBufferArrayDynamicIndexing = 5304, + SpvCapabilityUniformTexelBufferArrayDynamicIndexingEXT = 5304, + SpvCapabilityStorageTexelBufferArrayDynamicIndexing = 5305, + SpvCapabilityStorageTexelBufferArrayDynamicIndexingEXT = 5305, + SpvCapabilityUniformBufferArrayNonUniformIndexing = 5306, + SpvCapabilityUniformBufferArrayNonUniformIndexingEXT = 5306, + SpvCapabilitySampledImageArrayNonUniformIndexing = 5307, + SpvCapabilitySampledImageArrayNonUniformIndexingEXT = 5307, + SpvCapabilityStorageBufferArrayNonUniformIndexing = 5308, + SpvCapabilityStorageBufferArrayNonUniformIndexingEXT = 5308, + SpvCapabilityStorageImageArrayNonUniformIndexing = 5309, + SpvCapabilityStorageImageArrayNonUniformIndexingEXT = 5309, + SpvCapabilityInputAttachmentArrayNonUniformIndexing = 5310, + SpvCapabilityInputAttachmentArrayNonUniformIndexingEXT = 5310, + SpvCapabilityUniformTexelBufferArrayNonUniformIndexing = 5311, + SpvCapabilityUniformTexelBufferArrayNonUniformIndexingEXT = 5311, + SpvCapabilityStorageTexelBufferArrayNonUniformIndexing = 5312, + SpvCapabilityStorageTexelBufferArrayNonUniformIndexingEXT = 5312, + SpvCapabilityRayTracingNV = 5340, + SpvCapabilityRayTracingMotionBlurNV = 5341, + SpvCapabilityVulkanMemoryModel = 5345, + SpvCapabilityVulkanMemoryModelKHR = 5345, + SpvCapabilityVulkanMemoryModelDeviceScope = 5346, + SpvCapabilityVulkanMemoryModelDeviceScopeKHR = 5346, + SpvCapabilityPhysicalStorageBufferAddresses = 5347, + SpvCapabilityPhysicalStorageBufferAddressesEXT = 5347, + SpvCapabilityComputeDerivativeGroupLinearNV = 5350, + SpvCapabilityRayTracingProvisionalKHR = 5353, + SpvCapabilityCooperativeMatrixNV = 5357, + SpvCapabilityFragmentShaderSampleInterlockEXT = 5363, + SpvCapabilityFragmentShaderShadingRateInterlockEXT = 5372, + SpvCapabilityShaderSMBuiltinsNV = 5373, + SpvCapabilityFragmentShaderPixelInterlockEXT = 5378, + 
SpvCapabilityDemoteToHelperInvocation = 5379, + SpvCapabilityDemoteToHelperInvocationEXT = 5379, + SpvCapabilityRayTracingOpacityMicromapEXT = 5381, + SpvCapabilityBindlessTextureNV = 5390, + SpvCapabilitySubgroupShuffleINTEL = 5568, + SpvCapabilitySubgroupBufferBlockIOINTEL = 5569, + SpvCapabilitySubgroupImageBlockIOINTEL = 5570, + SpvCapabilitySubgroupImageMediaBlockIOINTEL = 5579, + SpvCapabilityRoundToInfinityINTEL = 5582, + SpvCapabilityFloatingPointModeINTEL = 5583, + SpvCapabilityIntegerFunctions2INTEL = 5584, + SpvCapabilityFunctionPointersINTEL = 5603, + SpvCapabilityIndirectReferencesINTEL = 5604, + SpvCapabilityAsmINTEL = 5606, + SpvCapabilityAtomicFloat32MinMaxEXT = 5612, + SpvCapabilityAtomicFloat64MinMaxEXT = 5613, + SpvCapabilityAtomicFloat16MinMaxEXT = 5616, + SpvCapabilityVectorComputeINTEL = 5617, + SpvCapabilityVectorAnyINTEL = 5619, + SpvCapabilityExpectAssumeKHR = 5629, + SpvCapabilitySubgroupAvcMotionEstimationINTEL = 5696, + SpvCapabilitySubgroupAvcMotionEstimationIntraINTEL = 5697, + SpvCapabilitySubgroupAvcMotionEstimationChromaINTEL = 5698, + SpvCapabilityVariableLengthArrayINTEL = 5817, + SpvCapabilityFunctionFloatControlINTEL = 5821, + SpvCapabilityFPGAMemoryAttributesINTEL = 5824, + SpvCapabilityFPFastMathModeINTEL = 5837, + SpvCapabilityArbitraryPrecisionIntegersINTEL = 5844, + SpvCapabilityArbitraryPrecisionFloatingPointINTEL = 5845, + SpvCapabilityUnstructuredLoopControlsINTEL = 5886, + SpvCapabilityFPGALoopControlsINTEL = 5888, + SpvCapabilityKernelAttributesINTEL = 5892, + SpvCapabilityFPGAKernelAttributesINTEL = 5897, + SpvCapabilityFPGAMemoryAccessesINTEL = 5898, + SpvCapabilityFPGAClusterAttributesINTEL = 5904, + SpvCapabilityLoopFuseINTEL = 5906, + SpvCapabilityMemoryAccessAliasingINTEL = 5910, + SpvCapabilityFPGABufferLocationINTEL = 5920, + SpvCapabilityArbitraryPrecisionFixedPointINTEL = 5922, + SpvCapabilityUSMStorageClassesINTEL = 5935, + SpvCapabilityIOPipesINTEL = 5943, + SpvCapabilityBlockingPipesINTEL = 5945, + SpvCapabilityFPGARegINTEL = 5948, + SpvCapabilityDotProductInputAll = 6016, + SpvCapabilityDotProductInputAllKHR = 6016, + SpvCapabilityDotProductInput4x8Bit = 6017, + SpvCapabilityDotProductInput4x8BitKHR = 6017, + SpvCapabilityDotProductInput4x8BitPacked = 6018, + SpvCapabilityDotProductInput4x8BitPackedKHR = 6018, + SpvCapabilityDotProduct = 6019, + SpvCapabilityDotProductKHR = 6019, + SpvCapabilityRayCullMaskKHR = 6020, + SpvCapabilityBitInstructions = 6025, + SpvCapabilityGroupNonUniformRotateKHR = 6026, + SpvCapabilityAtomicFloat32AddEXT = 6033, + SpvCapabilityAtomicFloat64AddEXT = 6034, + SpvCapabilityLongConstantCompositeINTEL = 6089, + SpvCapabilityOptNoneINTEL = 6094, + SpvCapabilityAtomicFloat16AddEXT = 6095, + SpvCapabilityDebugInfoModuleINTEL = 6114, + SpvCapabilitySplitBarrierINTEL = 6141, + SpvCapabilityGroupUniformArithmeticKHR = 6400, + SpvCapabilityMax = 0x7fffffff, +} SpvCapability; + +typedef enum SpvRayFlagsShift_ { + SpvRayFlagsOpaqueKHRShift = 0, + SpvRayFlagsNoOpaqueKHRShift = 1, + SpvRayFlagsTerminateOnFirstHitKHRShift = 2, + SpvRayFlagsSkipClosestHitShaderKHRShift = 3, + SpvRayFlagsCullBackFacingTrianglesKHRShift = 4, + SpvRayFlagsCullFrontFacingTrianglesKHRShift = 5, + SpvRayFlagsCullOpaqueKHRShift = 6, + SpvRayFlagsCullNoOpaqueKHRShift = 7, + SpvRayFlagsSkipTrianglesKHRShift = 8, + SpvRayFlagsSkipAABBsKHRShift = 9, + SpvRayFlagsForceOpacityMicromap2StateEXTShift = 10, + SpvRayFlagsMax = 0x7fffffff, +} SpvRayFlagsShift; + +typedef enum SpvRayFlagsMask_ { + SpvRayFlagsMaskNone = 0, + SpvRayFlagsOpaqueKHRMask = 
0x00000001, + SpvRayFlagsNoOpaqueKHRMask = 0x00000002, + SpvRayFlagsTerminateOnFirstHitKHRMask = 0x00000004, + SpvRayFlagsSkipClosestHitShaderKHRMask = 0x00000008, + SpvRayFlagsCullBackFacingTrianglesKHRMask = 0x00000010, + SpvRayFlagsCullFrontFacingTrianglesKHRMask = 0x00000020, + SpvRayFlagsCullOpaqueKHRMask = 0x00000040, + SpvRayFlagsCullNoOpaqueKHRMask = 0x00000080, + SpvRayFlagsSkipTrianglesKHRMask = 0x00000100, + SpvRayFlagsSkipAABBsKHRMask = 0x00000200, + SpvRayFlagsForceOpacityMicromap2StateEXTMask = 0x00000400, +} SpvRayFlagsMask; + +typedef enum SpvRayQueryIntersection_ { + SpvRayQueryIntersectionRayQueryCandidateIntersectionKHR = 0, + SpvRayQueryIntersectionRayQueryCommittedIntersectionKHR = 1, + SpvRayQueryIntersectionMax = 0x7fffffff, +} SpvRayQueryIntersection; + +typedef enum SpvRayQueryCommittedIntersectionType_ { + SpvRayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionNoneKHR = 0, + SpvRayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionTriangleKHR = 1, + SpvRayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionGeneratedKHR = 2, + SpvRayQueryCommittedIntersectionTypeMax = 0x7fffffff, +} SpvRayQueryCommittedIntersectionType; + +typedef enum SpvRayQueryCandidateIntersectionType_ { + SpvRayQueryCandidateIntersectionTypeRayQueryCandidateIntersectionTriangleKHR = 0, + SpvRayQueryCandidateIntersectionTypeRayQueryCandidateIntersectionAABBKHR = 1, + SpvRayQueryCandidateIntersectionTypeMax = 0x7fffffff, +} SpvRayQueryCandidateIntersectionType; + +typedef enum SpvFragmentShadingRateShift_ { + SpvFragmentShadingRateVertical2PixelsShift = 0, + SpvFragmentShadingRateVertical4PixelsShift = 1, + SpvFragmentShadingRateHorizontal2PixelsShift = 2, + SpvFragmentShadingRateHorizontal4PixelsShift = 3, + SpvFragmentShadingRateMax = 0x7fffffff, +} SpvFragmentShadingRateShift; + +typedef enum SpvFragmentShadingRateMask_ { + SpvFragmentShadingRateMaskNone = 0, + SpvFragmentShadingRateVertical2PixelsMask = 0x00000001, + SpvFragmentShadingRateVertical4PixelsMask = 0x00000002, + SpvFragmentShadingRateHorizontal2PixelsMask = 0x00000004, + SpvFragmentShadingRateHorizontal4PixelsMask = 0x00000008, +} SpvFragmentShadingRateMask; + +typedef enum SpvFPDenormMode_ { + SpvFPDenormModePreserve = 0, + SpvFPDenormModeFlushToZero = 1, + SpvFPDenormModeMax = 0x7fffffff, +} SpvFPDenormMode; + +typedef enum SpvFPOperationMode_ { + SpvFPOperationModeIEEE = 0, + SpvFPOperationModeALT = 1, + SpvFPOperationModeMax = 0x7fffffff, +} SpvFPOperationMode; + +typedef enum SpvQuantizationModes_ { + SpvQuantizationModesTRN = 0, + SpvQuantizationModesTRN_ZERO = 1, + SpvQuantizationModesRND = 2, + SpvQuantizationModesRND_ZERO = 3, + SpvQuantizationModesRND_INF = 4, + SpvQuantizationModesRND_MIN_INF = 5, + SpvQuantizationModesRND_CONV = 6, + SpvQuantizationModesRND_CONV_ODD = 7, + SpvQuantizationModesMax = 0x7fffffff, +} SpvQuantizationModes; + +typedef enum SpvOverflowModes_ { + SpvOverflowModesWRAP = 0, + SpvOverflowModesSAT = 1, + SpvOverflowModesSAT_ZERO = 2, + SpvOverflowModesSAT_SYM = 3, + SpvOverflowModesMax = 0x7fffffff, +} SpvOverflowModes; + +typedef enum SpvPackedVectorFormat_ { + SpvPackedVectorFormatPackedVectorFormat4x8Bit = 0, + SpvPackedVectorFormatPackedVectorFormat4x8BitKHR = 0, + SpvPackedVectorFormatMax = 0x7fffffff, +} SpvPackedVectorFormat; + +typedef enum SpvOp_ { + SpvOpNop = 0, + SpvOpUndef = 1, + SpvOpSourceContinued = 2, + SpvOpSource = 3, + SpvOpSourceExtension = 4, + SpvOpName = 5, + SpvOpMemberName = 6, + SpvOpString = 7, + SpvOpLine = 8, + SpvOpExtension = 10, + 
SpvOpExtInstImport = 11, + SpvOpExtInst = 12, + SpvOpMemoryModel = 14, + SpvOpEntryPoint = 15, + SpvOpExecutionMode = 16, + SpvOpCapability = 17, + SpvOpTypeVoid = 19, + SpvOpTypeBool = 20, + SpvOpTypeInt = 21, + SpvOpTypeFloat = 22, + SpvOpTypeVector = 23, + SpvOpTypeMatrix = 24, + SpvOpTypeImage = 25, + SpvOpTypeSampler = 26, + SpvOpTypeSampledImage = 27, + SpvOpTypeArray = 28, + SpvOpTypeRuntimeArray = 29, + SpvOpTypeStruct = 30, + SpvOpTypeOpaque = 31, + SpvOpTypePointer = 32, + SpvOpTypeFunction = 33, + SpvOpTypeEvent = 34, + SpvOpTypeDeviceEvent = 35, + SpvOpTypeReserveId = 36, + SpvOpTypeQueue = 37, + SpvOpTypePipe = 38, + SpvOpTypeForwardPointer = 39, + SpvOpConstantTrue = 41, + SpvOpConstantFalse = 42, + SpvOpConstant = 43, + SpvOpConstantComposite = 44, + SpvOpConstantSampler = 45, + SpvOpConstantNull = 46, + SpvOpSpecConstantTrue = 48, + SpvOpSpecConstantFalse = 49, + SpvOpSpecConstant = 50, + SpvOpSpecConstantComposite = 51, + SpvOpSpecConstantOp = 52, + SpvOpFunction = 54, + SpvOpFunctionParameter = 55, + SpvOpFunctionEnd = 56, + SpvOpFunctionCall = 57, + SpvOpVariable = 59, + SpvOpImageTexelPointer = 60, + SpvOpLoad = 61, + SpvOpStore = 62, + SpvOpCopyMemory = 63, + SpvOpCopyMemorySized = 64, + SpvOpAccessChain = 65, + SpvOpInBoundsAccessChain = 66, + SpvOpPtrAccessChain = 67, + SpvOpArrayLength = 68, + SpvOpGenericPtrMemSemantics = 69, + SpvOpInBoundsPtrAccessChain = 70, + SpvOpDecorate = 71, + SpvOpMemberDecorate = 72, + SpvOpDecorationGroup = 73, + SpvOpGroupDecorate = 74, + SpvOpGroupMemberDecorate = 75, + SpvOpVectorExtractDynamic = 77, + SpvOpVectorInsertDynamic = 78, + SpvOpVectorShuffle = 79, + SpvOpCompositeConstruct = 80, + SpvOpCompositeExtract = 81, + SpvOpCompositeInsert = 82, + SpvOpCopyObject = 83, + SpvOpTranspose = 84, + SpvOpSampledImage = 86, + SpvOpImageSampleImplicitLod = 87, + SpvOpImageSampleExplicitLod = 88, + SpvOpImageSampleDrefImplicitLod = 89, + SpvOpImageSampleDrefExplicitLod = 90, + SpvOpImageSampleProjImplicitLod = 91, + SpvOpImageSampleProjExplicitLod = 92, + SpvOpImageSampleProjDrefImplicitLod = 93, + SpvOpImageSampleProjDrefExplicitLod = 94, + SpvOpImageFetch = 95, + SpvOpImageGather = 96, + SpvOpImageDrefGather = 97, + SpvOpImageRead = 98, + SpvOpImageWrite = 99, + SpvOpImage = 100, + SpvOpImageQueryFormat = 101, + SpvOpImageQueryOrder = 102, + SpvOpImageQuerySizeLod = 103, + SpvOpImageQuerySize = 104, + SpvOpImageQueryLod = 105, + SpvOpImageQueryLevels = 106, + SpvOpImageQuerySamples = 107, + SpvOpConvertFToU = 109, + SpvOpConvertFToS = 110, + SpvOpConvertSToF = 111, + SpvOpConvertUToF = 112, + SpvOpUConvert = 113, + SpvOpSConvert = 114, + SpvOpFConvert = 115, + SpvOpQuantizeToF16 = 116, + SpvOpConvertPtrToU = 117, + SpvOpSatConvertSToU = 118, + SpvOpSatConvertUToS = 119, + SpvOpConvertUToPtr = 120, + SpvOpPtrCastToGeneric = 121, + SpvOpGenericCastToPtr = 122, + SpvOpGenericCastToPtrExplicit = 123, + SpvOpBitcast = 124, + SpvOpSNegate = 126, + SpvOpFNegate = 127, + SpvOpIAdd = 128, + SpvOpFAdd = 129, + SpvOpISub = 130, + SpvOpFSub = 131, + SpvOpIMul = 132, + SpvOpFMul = 133, + SpvOpUDiv = 134, + SpvOpSDiv = 135, + SpvOpFDiv = 136, + SpvOpUMod = 137, + SpvOpSRem = 138, + SpvOpSMod = 139, + SpvOpFRem = 140, + SpvOpFMod = 141, + SpvOpVectorTimesScalar = 142, + SpvOpMatrixTimesScalar = 143, + SpvOpVectorTimesMatrix = 144, + SpvOpMatrixTimesVector = 145, + SpvOpMatrixTimesMatrix = 146, + SpvOpOuterProduct = 147, + SpvOpDot = 148, + SpvOpIAddCarry = 149, + SpvOpISubBorrow = 150, + SpvOpUMulExtended = 151, + SpvOpSMulExtended = 152, + SpvOpAny = 
154, + SpvOpAll = 155, + SpvOpIsNan = 156, + SpvOpIsInf = 157, + SpvOpIsFinite = 158, + SpvOpIsNormal = 159, + SpvOpSignBitSet = 160, + SpvOpLessOrGreater = 161, + SpvOpOrdered = 162, + SpvOpUnordered = 163, + SpvOpLogicalEqual = 164, + SpvOpLogicalNotEqual = 165, + SpvOpLogicalOr = 166, + SpvOpLogicalAnd = 167, + SpvOpLogicalNot = 168, + SpvOpSelect = 169, + SpvOpIEqual = 170, + SpvOpINotEqual = 171, + SpvOpUGreaterThan = 172, + SpvOpSGreaterThan = 173, + SpvOpUGreaterThanEqual = 174, + SpvOpSGreaterThanEqual = 175, + SpvOpULessThan = 176, + SpvOpSLessThan = 177, + SpvOpULessThanEqual = 178, + SpvOpSLessThanEqual = 179, + SpvOpFOrdEqual = 180, + SpvOpFUnordEqual = 181, + SpvOpFOrdNotEqual = 182, + SpvOpFUnordNotEqual = 183, + SpvOpFOrdLessThan = 184, + SpvOpFUnordLessThan = 185, + SpvOpFOrdGreaterThan = 186, + SpvOpFUnordGreaterThan = 187, + SpvOpFOrdLessThanEqual = 188, + SpvOpFUnordLessThanEqual = 189, + SpvOpFOrdGreaterThanEqual = 190, + SpvOpFUnordGreaterThanEqual = 191, + SpvOpShiftRightLogical = 194, + SpvOpShiftRightArithmetic = 195, + SpvOpShiftLeftLogical = 196, + SpvOpBitwiseOr = 197, + SpvOpBitwiseXor = 198, + SpvOpBitwiseAnd = 199, + SpvOpNot = 200, + SpvOpBitFieldInsert = 201, + SpvOpBitFieldSExtract = 202, + SpvOpBitFieldUExtract = 203, + SpvOpBitReverse = 204, + SpvOpBitCount = 205, + SpvOpDPdx = 207, + SpvOpDPdy = 208, + SpvOpFwidth = 209, + SpvOpDPdxFine = 210, + SpvOpDPdyFine = 211, + SpvOpFwidthFine = 212, + SpvOpDPdxCoarse = 213, + SpvOpDPdyCoarse = 214, + SpvOpFwidthCoarse = 215, + SpvOpEmitVertex = 218, + SpvOpEndPrimitive = 219, + SpvOpEmitStreamVertex = 220, + SpvOpEndStreamPrimitive = 221, + SpvOpControlBarrier = 224, + SpvOpMemoryBarrier = 225, + SpvOpAtomicLoad = 227, + SpvOpAtomicStore = 228, + SpvOpAtomicExchange = 229, + SpvOpAtomicCompareExchange = 230, + SpvOpAtomicCompareExchangeWeak = 231, + SpvOpAtomicIIncrement = 232, + SpvOpAtomicIDecrement = 233, + SpvOpAtomicIAdd = 234, + SpvOpAtomicISub = 235, + SpvOpAtomicSMin = 236, + SpvOpAtomicUMin = 237, + SpvOpAtomicSMax = 238, + SpvOpAtomicUMax = 239, + SpvOpAtomicAnd = 240, + SpvOpAtomicOr = 241, + SpvOpAtomicXor = 242, + SpvOpPhi = 245, + SpvOpLoopMerge = 246, + SpvOpSelectionMerge = 247, + SpvOpLabel = 248, + SpvOpBranch = 249, + SpvOpBranchConditional = 250, + SpvOpSwitch = 251, + SpvOpKill = 252, + SpvOpReturn = 253, + SpvOpReturnValue = 254, + SpvOpUnreachable = 255, + SpvOpLifetimeStart = 256, + SpvOpLifetimeStop = 257, + SpvOpGroupAsyncCopy = 259, + SpvOpGroupWaitEvents = 260, + SpvOpGroupAll = 261, + SpvOpGroupAny = 262, + SpvOpGroupBroadcast = 263, + SpvOpGroupIAdd = 264, + SpvOpGroupFAdd = 265, + SpvOpGroupFMin = 266, + SpvOpGroupUMin = 267, + SpvOpGroupSMin = 268, + SpvOpGroupFMax = 269, + SpvOpGroupUMax = 270, + SpvOpGroupSMax = 271, + SpvOpReadPipe = 274, + SpvOpWritePipe = 275, + SpvOpReservedReadPipe = 276, + SpvOpReservedWritePipe = 277, + SpvOpReserveReadPipePackets = 278, + SpvOpReserveWritePipePackets = 279, + SpvOpCommitReadPipe = 280, + SpvOpCommitWritePipe = 281, + SpvOpIsValidReserveId = 282, + SpvOpGetNumPipePackets = 283, + SpvOpGetMaxPipePackets = 284, + SpvOpGroupReserveReadPipePackets = 285, + SpvOpGroupReserveWritePipePackets = 286, + SpvOpGroupCommitReadPipe = 287, + SpvOpGroupCommitWritePipe = 288, + SpvOpEnqueueMarker = 291, + SpvOpEnqueueKernel = 292, + SpvOpGetKernelNDrangeSubGroupCount = 293, + SpvOpGetKernelNDrangeMaxSubGroupSize = 294, + SpvOpGetKernelWorkGroupSize = 295, + SpvOpGetKernelPreferredWorkGroupSizeMultiple = 296, + SpvOpRetainEvent = 297, + SpvOpReleaseEvent = 
298, + SpvOpCreateUserEvent = 299, + SpvOpIsValidEvent = 300, + SpvOpSetUserEventStatus = 301, + SpvOpCaptureEventProfilingInfo = 302, + SpvOpGetDefaultQueue = 303, + SpvOpBuildNDRange = 304, + SpvOpImageSparseSampleImplicitLod = 305, + SpvOpImageSparseSampleExplicitLod = 306, + SpvOpImageSparseSampleDrefImplicitLod = 307, + SpvOpImageSparseSampleDrefExplicitLod = 308, + SpvOpImageSparseSampleProjImplicitLod = 309, + SpvOpImageSparseSampleProjExplicitLod = 310, + SpvOpImageSparseSampleProjDrefImplicitLod = 311, + SpvOpImageSparseSampleProjDrefExplicitLod = 312, + SpvOpImageSparseFetch = 313, + SpvOpImageSparseGather = 314, + SpvOpImageSparseDrefGather = 315, + SpvOpImageSparseTexelsResident = 316, + SpvOpNoLine = 317, + SpvOpAtomicFlagTestAndSet = 318, + SpvOpAtomicFlagClear = 319, + SpvOpImageSparseRead = 320, + SpvOpSizeOf = 321, + SpvOpTypePipeStorage = 322, + SpvOpConstantPipeStorage = 323, + SpvOpCreatePipeFromPipeStorage = 324, + SpvOpGetKernelLocalSizeForSubgroupCount = 325, + SpvOpGetKernelMaxNumSubgroups = 326, + SpvOpTypeNamedBarrier = 327, + SpvOpNamedBarrierInitialize = 328, + SpvOpMemoryNamedBarrier = 329, + SpvOpModuleProcessed = 330, + SpvOpExecutionModeId = 331, + SpvOpDecorateId = 332, + SpvOpGroupNonUniformElect = 333, + SpvOpGroupNonUniformAll = 334, + SpvOpGroupNonUniformAny = 335, + SpvOpGroupNonUniformAllEqual = 336, + SpvOpGroupNonUniformBroadcast = 337, + SpvOpGroupNonUniformBroadcastFirst = 338, + SpvOpGroupNonUniformBallot = 339, + SpvOpGroupNonUniformInverseBallot = 340, + SpvOpGroupNonUniformBallotBitExtract = 341, + SpvOpGroupNonUniformBallotBitCount = 342, + SpvOpGroupNonUniformBallotFindLSB = 343, + SpvOpGroupNonUniformBallotFindMSB = 344, + SpvOpGroupNonUniformShuffle = 345, + SpvOpGroupNonUniformShuffleXor = 346, + SpvOpGroupNonUniformShuffleUp = 347, + SpvOpGroupNonUniformShuffleDown = 348, + SpvOpGroupNonUniformIAdd = 349, + SpvOpGroupNonUniformFAdd = 350, + SpvOpGroupNonUniformIMul = 351, + SpvOpGroupNonUniformFMul = 352, + SpvOpGroupNonUniformSMin = 353, + SpvOpGroupNonUniformUMin = 354, + SpvOpGroupNonUniformFMin = 355, + SpvOpGroupNonUniformSMax = 356, + SpvOpGroupNonUniformUMax = 357, + SpvOpGroupNonUniformFMax = 358, + SpvOpGroupNonUniformBitwiseAnd = 359, + SpvOpGroupNonUniformBitwiseOr = 360, + SpvOpGroupNonUniformBitwiseXor = 361, + SpvOpGroupNonUniformLogicalAnd = 362, + SpvOpGroupNonUniformLogicalOr = 363, + SpvOpGroupNonUniformLogicalXor = 364, + SpvOpGroupNonUniformQuadBroadcast = 365, + SpvOpGroupNonUniformQuadSwap = 366, + SpvOpCopyLogical = 400, + SpvOpPtrEqual = 401, + SpvOpPtrNotEqual = 402, + SpvOpPtrDiff = 403, + SpvOpTerminateInvocation = 4416, + SpvOpSubgroupBallotKHR = 4421, + SpvOpSubgroupFirstInvocationKHR = 4422, + SpvOpSubgroupAllKHR = 4428, + SpvOpSubgroupAnyKHR = 4429, + SpvOpSubgroupAllEqualKHR = 4430, + SpvOpGroupNonUniformRotateKHR = 4431, + SpvOpSubgroupReadInvocationKHR = 4432, + SpvOpTraceRayKHR = 4445, + SpvOpExecuteCallableKHR = 4446, + SpvOpConvertUToAccelerationStructureKHR = 4447, + SpvOpIgnoreIntersectionKHR = 4448, + SpvOpTerminateRayKHR = 4449, + SpvOpSDot = 4450, + SpvOpSDotKHR = 4450, + SpvOpUDot = 4451, + SpvOpUDotKHR = 4451, + SpvOpSUDot = 4452, + SpvOpSUDotKHR = 4452, + SpvOpSDotAccSat = 4453, + SpvOpSDotAccSatKHR = 4453, + SpvOpUDotAccSat = 4454, + SpvOpUDotAccSatKHR = 4454, + SpvOpSUDotAccSat = 4455, + SpvOpSUDotAccSatKHR = 4455, + SpvOpTypeRayQueryKHR = 4472, + SpvOpRayQueryInitializeKHR = 4473, + SpvOpRayQueryTerminateKHR = 4474, + SpvOpRayQueryGenerateIntersectionKHR = 4475, + 
SpvOpRayQueryConfirmIntersectionKHR = 4476, + SpvOpRayQueryProceedKHR = 4477, + SpvOpRayQueryGetIntersectionTypeKHR = 4479, + SpvOpGroupIAddNonUniformAMD = 5000, + SpvOpGroupFAddNonUniformAMD = 5001, + SpvOpGroupFMinNonUniformAMD = 5002, + SpvOpGroupUMinNonUniformAMD = 5003, + SpvOpGroupSMinNonUniformAMD = 5004, + SpvOpGroupFMaxNonUniformAMD = 5005, + SpvOpGroupUMaxNonUniformAMD = 5006, + SpvOpGroupSMaxNonUniformAMD = 5007, + SpvOpFragmentMaskFetchAMD = 5011, + SpvOpFragmentFetchAMD = 5012, + SpvOpReadClockKHR = 5056, + SpvOpImageSampleFootprintNV = 5283, + SpvOpEmitMeshTasksEXT = 5294, + SpvOpSetMeshOutputsEXT = 5295, + SpvOpGroupNonUniformPartitionNV = 5296, + SpvOpWritePackedPrimitiveIndices4x8NV = 5299, + SpvOpReportIntersectionKHR = 5334, + SpvOpReportIntersectionNV = 5334, + SpvOpIgnoreIntersectionNV = 5335, + SpvOpTerminateRayNV = 5336, + SpvOpTraceNV = 5337, + SpvOpTraceMotionNV = 5338, + SpvOpTraceRayMotionNV = 5339, + SpvOpTypeAccelerationStructureKHR = 5341, + SpvOpTypeAccelerationStructureNV = 5341, + SpvOpExecuteCallableNV = 5344, + SpvOpTypeCooperativeMatrixNV = 5358, + SpvOpCooperativeMatrixLoadNV = 5359, + SpvOpCooperativeMatrixStoreNV = 5360, + SpvOpCooperativeMatrixMulAddNV = 5361, + SpvOpCooperativeMatrixLengthNV = 5362, + SpvOpBeginInvocationInterlockEXT = 5364, + SpvOpEndInvocationInterlockEXT = 5365, + SpvOpDemoteToHelperInvocation = 5380, + SpvOpDemoteToHelperInvocationEXT = 5380, + SpvOpIsHelperInvocationEXT = 5381, + SpvOpConvertUToImageNV = 5391, + SpvOpConvertUToSamplerNV = 5392, + SpvOpConvertImageToUNV = 5393, + SpvOpConvertSamplerToUNV = 5394, + SpvOpConvertUToSampledImageNV = 5395, + SpvOpConvertSampledImageToUNV = 5396, + SpvOpSamplerImageAddressingModeNV = 5397, + SpvOpSubgroupShuffleINTEL = 5571, + SpvOpSubgroupShuffleDownINTEL = 5572, + SpvOpSubgroupShuffleUpINTEL = 5573, + SpvOpSubgroupShuffleXorINTEL = 5574, + SpvOpSubgroupBlockReadINTEL = 5575, + SpvOpSubgroupBlockWriteINTEL = 5576, + SpvOpSubgroupImageBlockReadINTEL = 5577, + SpvOpSubgroupImageBlockWriteINTEL = 5578, + SpvOpSubgroupImageMediaBlockReadINTEL = 5580, + SpvOpSubgroupImageMediaBlockWriteINTEL = 5581, + SpvOpUCountLeadingZerosINTEL = 5585, + SpvOpUCountTrailingZerosINTEL = 5586, + SpvOpAbsISubINTEL = 5587, + SpvOpAbsUSubINTEL = 5588, + SpvOpIAddSatINTEL = 5589, + SpvOpUAddSatINTEL = 5590, + SpvOpIAverageINTEL = 5591, + SpvOpUAverageINTEL = 5592, + SpvOpIAverageRoundedINTEL = 5593, + SpvOpUAverageRoundedINTEL = 5594, + SpvOpISubSatINTEL = 5595, + SpvOpUSubSatINTEL = 5596, + SpvOpIMul32x16INTEL = 5597, + SpvOpUMul32x16INTEL = 5598, + SpvOpConstantFunctionPointerINTEL = 5600, + SpvOpFunctionPointerCallINTEL = 5601, + SpvOpAsmTargetINTEL = 5609, + SpvOpAsmINTEL = 5610, + SpvOpAsmCallINTEL = 5611, + SpvOpAtomicFMinEXT = 5614, + SpvOpAtomicFMaxEXT = 5615, + SpvOpAssumeTrueKHR = 5630, + SpvOpExpectKHR = 5631, + SpvOpDecorateString = 5632, + SpvOpDecorateStringGOOGLE = 5632, + SpvOpMemberDecorateString = 5633, + SpvOpMemberDecorateStringGOOGLE = 5633, + SpvOpVmeImageINTEL = 5699, + SpvOpTypeVmeImageINTEL = 5700, + SpvOpTypeAvcImePayloadINTEL = 5701, + SpvOpTypeAvcRefPayloadINTEL = 5702, + SpvOpTypeAvcSicPayloadINTEL = 5703, + SpvOpTypeAvcMcePayloadINTEL = 5704, + SpvOpTypeAvcMceResultINTEL = 5705, + SpvOpTypeAvcImeResultINTEL = 5706, + SpvOpTypeAvcImeResultSingleReferenceStreamoutINTEL = 5707, + SpvOpTypeAvcImeResultDualReferenceStreamoutINTEL = 5708, + SpvOpTypeAvcImeSingleReferenceStreaminINTEL = 5709, + SpvOpTypeAvcImeDualReferenceStreaminINTEL = 5710, + SpvOpTypeAvcRefResultINTEL = 5711, + 
SpvOpTypeAvcSicResultINTEL = 5712, + SpvOpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL = 5713, + SpvOpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL = 5714, + SpvOpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL = 5715, + SpvOpSubgroupAvcMceSetInterShapePenaltyINTEL = 5716, + SpvOpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL = 5717, + SpvOpSubgroupAvcMceSetInterDirectionPenaltyINTEL = 5718, + SpvOpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL = 5719, + SpvOpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL = 5720, + SpvOpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL = 5721, + SpvOpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL = 5722, + SpvOpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL = 5723, + SpvOpSubgroupAvcMceSetMotionVectorCostFunctionINTEL = 5724, + SpvOpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL = 5725, + SpvOpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL = 5726, + SpvOpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL = 5727, + SpvOpSubgroupAvcMceSetAcOnlyHaarINTEL = 5728, + SpvOpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL = 5729, + SpvOpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL = 5730, + SpvOpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL = 5731, + SpvOpSubgroupAvcMceConvertToImePayloadINTEL = 5732, + SpvOpSubgroupAvcMceConvertToImeResultINTEL = 5733, + SpvOpSubgroupAvcMceConvertToRefPayloadINTEL = 5734, + SpvOpSubgroupAvcMceConvertToRefResultINTEL = 5735, + SpvOpSubgroupAvcMceConvertToSicPayloadINTEL = 5736, + SpvOpSubgroupAvcMceConvertToSicResultINTEL = 5737, + SpvOpSubgroupAvcMceGetMotionVectorsINTEL = 5738, + SpvOpSubgroupAvcMceGetInterDistortionsINTEL = 5739, + SpvOpSubgroupAvcMceGetBestInterDistortionsINTEL = 5740, + SpvOpSubgroupAvcMceGetInterMajorShapeINTEL = 5741, + SpvOpSubgroupAvcMceGetInterMinorShapeINTEL = 5742, + SpvOpSubgroupAvcMceGetInterDirectionsINTEL = 5743, + SpvOpSubgroupAvcMceGetInterMotionVectorCountINTEL = 5744, + SpvOpSubgroupAvcMceGetInterReferenceIdsINTEL = 5745, + SpvOpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL = 5746, + SpvOpSubgroupAvcImeInitializeINTEL = 5747, + SpvOpSubgroupAvcImeSetSingleReferenceINTEL = 5748, + SpvOpSubgroupAvcImeSetDualReferenceINTEL = 5749, + SpvOpSubgroupAvcImeRefWindowSizeINTEL = 5750, + SpvOpSubgroupAvcImeAdjustRefOffsetINTEL = 5751, + SpvOpSubgroupAvcImeConvertToMcePayloadINTEL = 5752, + SpvOpSubgroupAvcImeSetMaxMotionVectorCountINTEL = 5753, + SpvOpSubgroupAvcImeSetUnidirectionalMixDisableINTEL = 5754, + SpvOpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL = 5755, + SpvOpSubgroupAvcImeSetWeightedSadINTEL = 5756, + SpvOpSubgroupAvcImeEvaluateWithSingleReferenceINTEL = 5757, + SpvOpSubgroupAvcImeEvaluateWithDualReferenceINTEL = 5758, + SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL = 5759, + SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL = 5760, + SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL = 5761, + SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL = 5762, + SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL = 5763, + SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL = 5764, + SpvOpSubgroupAvcImeConvertToMceResultINTEL = 5765, + SpvOpSubgroupAvcImeGetSingleReferenceStreaminINTEL = 5766, + SpvOpSubgroupAvcImeGetDualReferenceStreaminINTEL = 5767, + SpvOpSubgroupAvcImeStripSingleReferenceStreamoutINTEL = 5768, + SpvOpSubgroupAvcImeStripDualReferenceStreamoutINTEL = 5769, + 
SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL = 5770, + SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL = 5771, + SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL = 5772, + SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL = 5773, + SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL = 5774, + SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL = 5775, + SpvOpSubgroupAvcImeGetBorderReachedINTEL = 5776, + SpvOpSubgroupAvcImeGetTruncatedSearchIndicationINTEL = 5777, + SpvOpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL = 5778, + SpvOpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL = 5779, + SpvOpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL = 5780, + SpvOpSubgroupAvcFmeInitializeINTEL = 5781, + SpvOpSubgroupAvcBmeInitializeINTEL = 5782, + SpvOpSubgroupAvcRefConvertToMcePayloadINTEL = 5783, + SpvOpSubgroupAvcRefSetBidirectionalMixDisableINTEL = 5784, + SpvOpSubgroupAvcRefSetBilinearFilterEnableINTEL = 5785, + SpvOpSubgroupAvcRefEvaluateWithSingleReferenceINTEL = 5786, + SpvOpSubgroupAvcRefEvaluateWithDualReferenceINTEL = 5787, + SpvOpSubgroupAvcRefEvaluateWithMultiReferenceINTEL = 5788, + SpvOpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL = 5789, + SpvOpSubgroupAvcRefConvertToMceResultINTEL = 5790, + SpvOpSubgroupAvcSicInitializeINTEL = 5791, + SpvOpSubgroupAvcSicConfigureSkcINTEL = 5792, + SpvOpSubgroupAvcSicConfigureIpeLumaINTEL = 5793, + SpvOpSubgroupAvcSicConfigureIpeLumaChromaINTEL = 5794, + SpvOpSubgroupAvcSicGetMotionVectorMaskINTEL = 5795, + SpvOpSubgroupAvcSicConvertToMcePayloadINTEL = 5796, + SpvOpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL = 5797, + SpvOpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL = 5798, + SpvOpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL = 5799, + SpvOpSubgroupAvcSicSetBilinearFilterEnableINTEL = 5800, + SpvOpSubgroupAvcSicSetSkcForwardTransformEnableINTEL = 5801, + SpvOpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL = 5802, + SpvOpSubgroupAvcSicEvaluateIpeINTEL = 5803, + SpvOpSubgroupAvcSicEvaluateWithSingleReferenceINTEL = 5804, + SpvOpSubgroupAvcSicEvaluateWithDualReferenceINTEL = 5805, + SpvOpSubgroupAvcSicEvaluateWithMultiReferenceINTEL = 5806, + SpvOpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL = 5807, + SpvOpSubgroupAvcSicConvertToMceResultINTEL = 5808, + SpvOpSubgroupAvcSicGetIpeLumaShapeINTEL = 5809, + SpvOpSubgroupAvcSicGetBestIpeLumaDistortionINTEL = 5810, + SpvOpSubgroupAvcSicGetBestIpeChromaDistortionINTEL = 5811, + SpvOpSubgroupAvcSicGetPackedIpeLumaModesINTEL = 5812, + SpvOpSubgroupAvcSicGetIpeChromaModeINTEL = 5813, + SpvOpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL = 5814, + SpvOpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL = 5815, + SpvOpSubgroupAvcSicGetInterRawSadsINTEL = 5816, + SpvOpVariableLengthArrayINTEL = 5818, + SpvOpSaveMemoryINTEL = 5819, + SpvOpRestoreMemoryINTEL = 5820, + SpvOpArbitraryFloatSinCosPiINTEL = 5840, + SpvOpArbitraryFloatCastINTEL = 5841, + SpvOpArbitraryFloatCastFromIntINTEL = 5842, + SpvOpArbitraryFloatCastToIntINTEL = 5843, + SpvOpArbitraryFloatAddINTEL = 5846, + SpvOpArbitraryFloatSubINTEL = 5847, + SpvOpArbitraryFloatMulINTEL = 5848, + SpvOpArbitraryFloatDivINTEL = 5849, + SpvOpArbitraryFloatGTINTEL = 5850, + SpvOpArbitraryFloatGEINTEL = 5851, + SpvOpArbitraryFloatLTINTEL = 5852, + SpvOpArbitraryFloatLEINTEL = 5853, + SpvOpArbitraryFloatEQINTEL = 5854, + SpvOpArbitraryFloatRecipINTEL = 5855, + 
SpvOpArbitraryFloatRSqrtINTEL = 5856, + SpvOpArbitraryFloatCbrtINTEL = 5857, + SpvOpArbitraryFloatHypotINTEL = 5858, + SpvOpArbitraryFloatSqrtINTEL = 5859, + SpvOpArbitraryFloatLogINTEL = 5860, + SpvOpArbitraryFloatLog2INTEL = 5861, + SpvOpArbitraryFloatLog10INTEL = 5862, + SpvOpArbitraryFloatLog1pINTEL = 5863, + SpvOpArbitraryFloatExpINTEL = 5864, + SpvOpArbitraryFloatExp2INTEL = 5865, + SpvOpArbitraryFloatExp10INTEL = 5866, + SpvOpArbitraryFloatExpm1INTEL = 5867, + SpvOpArbitraryFloatSinINTEL = 5868, + SpvOpArbitraryFloatCosINTEL = 5869, + SpvOpArbitraryFloatSinCosINTEL = 5870, + SpvOpArbitraryFloatSinPiINTEL = 5871, + SpvOpArbitraryFloatCosPiINTEL = 5872, + SpvOpArbitraryFloatASinINTEL = 5873, + SpvOpArbitraryFloatASinPiINTEL = 5874, + SpvOpArbitraryFloatACosINTEL = 5875, + SpvOpArbitraryFloatACosPiINTEL = 5876, + SpvOpArbitraryFloatATanINTEL = 5877, + SpvOpArbitraryFloatATanPiINTEL = 5878, + SpvOpArbitraryFloatATan2INTEL = 5879, + SpvOpArbitraryFloatPowINTEL = 5880, + SpvOpArbitraryFloatPowRINTEL = 5881, + SpvOpArbitraryFloatPowNINTEL = 5882, + SpvOpLoopControlINTEL = 5887, + SpvOpAliasDomainDeclINTEL = 5911, + SpvOpAliasScopeDeclINTEL = 5912, + SpvOpAliasScopeListDeclINTEL = 5913, + SpvOpFixedSqrtINTEL = 5923, + SpvOpFixedRecipINTEL = 5924, + SpvOpFixedRsqrtINTEL = 5925, + SpvOpFixedSinINTEL = 5926, + SpvOpFixedCosINTEL = 5927, + SpvOpFixedSinCosINTEL = 5928, + SpvOpFixedSinPiINTEL = 5929, + SpvOpFixedCosPiINTEL = 5930, + SpvOpFixedSinCosPiINTEL = 5931, + SpvOpFixedLogINTEL = 5932, + SpvOpFixedExpINTEL = 5933, + SpvOpPtrCastToCrossWorkgroupINTEL = 5934, + SpvOpCrossWorkgroupCastToPtrINTEL = 5938, + SpvOpReadPipeBlockingINTEL = 5946, + SpvOpWritePipeBlockingINTEL = 5947, + SpvOpFPGARegINTEL = 5949, + SpvOpRayQueryGetRayTMinKHR = 6016, + SpvOpRayQueryGetRayFlagsKHR = 6017, + SpvOpRayQueryGetIntersectionTKHR = 6018, + SpvOpRayQueryGetIntersectionInstanceCustomIndexKHR = 6019, + SpvOpRayQueryGetIntersectionInstanceIdKHR = 6020, + SpvOpRayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetKHR = 6021, + SpvOpRayQueryGetIntersectionGeometryIndexKHR = 6022, + SpvOpRayQueryGetIntersectionPrimitiveIndexKHR = 6023, + SpvOpRayQueryGetIntersectionBarycentricsKHR = 6024, + SpvOpRayQueryGetIntersectionFrontFaceKHR = 6025, + SpvOpRayQueryGetIntersectionCandidateAABBOpaqueKHR = 6026, + SpvOpRayQueryGetIntersectionObjectRayDirectionKHR = 6027, + SpvOpRayQueryGetIntersectionObjectRayOriginKHR = 6028, + SpvOpRayQueryGetWorldRayDirectionKHR = 6029, + SpvOpRayQueryGetWorldRayOriginKHR = 6030, + SpvOpRayQueryGetIntersectionObjectToWorldKHR = 6031, + SpvOpRayQueryGetIntersectionWorldToObjectKHR = 6032, + SpvOpAtomicFAddEXT = 6035, + SpvOpTypeBufferSurfaceINTEL = 6086, + SpvOpTypeStructContinuedINTEL = 6090, + SpvOpConstantCompositeContinuedINTEL = 6091, + SpvOpSpecConstantCompositeContinuedINTEL = 6092, + SpvOpControlBarrierArriveINTEL = 6142, + SpvOpControlBarrierWaitINTEL = 6143, + SpvOpGroupIMulKHR = 6401, + SpvOpGroupFMulKHR = 6402, + SpvOpGroupBitwiseAndKHR = 6403, + SpvOpGroupBitwiseOrKHR = 6404, + SpvOpGroupBitwiseXorKHR = 6405, + SpvOpGroupLogicalAndKHR = 6406, + SpvOpGroupLogicalOrKHR = 6407, + SpvOpGroupLogicalXorKHR = 6408, + SpvOpMax = 0x7fffffff, +} SpvOp; + +#ifdef SPV_ENABLE_UTILITY_CODE +#ifndef __cplusplus +#include <stdbool.h> +#endif +inline void SpvHasResultAndType(SpvOp opcode, bool *hasResult, bool *hasResultType) { + *hasResult = *hasResultType = false; + switch (opcode) { + default: /* unknown opcode */ break; + case SpvOpNop: *hasResult = false; *hasResultType = false; break; + case 
SpvOpUndef: *hasResult = true; *hasResultType = true; break; + case SpvOpSourceContinued: *hasResult = false; *hasResultType = false; break; + case SpvOpSource: *hasResult = false; *hasResultType = false; break; + case SpvOpSourceExtension: *hasResult = false; *hasResultType = false; break; + case SpvOpName: *hasResult = false; *hasResultType = false; break; + case SpvOpMemberName: *hasResult = false; *hasResultType = false; break; + case SpvOpString: *hasResult = true; *hasResultType = false; break; + case SpvOpLine: *hasResult = false; *hasResultType = false; break; + case SpvOpExtension: *hasResult = false; *hasResultType = false; break; + case SpvOpExtInstImport: *hasResult = true; *hasResultType = false; break; + case SpvOpExtInst: *hasResult = true; *hasResultType = true; break; + case SpvOpMemoryModel: *hasResult = false; *hasResultType = false; break; + case SpvOpEntryPoint: *hasResult = false; *hasResultType = false; break; + case SpvOpExecutionMode: *hasResult = false; *hasResultType = false; break; + case SpvOpCapability: *hasResult = false; *hasResultType = false; break; + case SpvOpTypeVoid: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeBool: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeInt: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeFloat: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeVector: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeMatrix: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeImage: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeSampler: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeSampledImage: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeArray: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeRuntimeArray: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeStruct: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeOpaque: *hasResult = true; *hasResultType = false; break; + case SpvOpTypePointer: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeFunction: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeEvent: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeDeviceEvent: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeReserveId: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeQueue: *hasResult = true; *hasResultType = false; break; + case SpvOpTypePipe: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeForwardPointer: *hasResult = false; *hasResultType = false; break; + case SpvOpConstantTrue: *hasResult = true; *hasResultType = true; break; + case SpvOpConstantFalse: *hasResult = true; *hasResultType = true; break; + case SpvOpConstant: *hasResult = true; *hasResultType = true; break; + case SpvOpConstantComposite: *hasResult = true; *hasResultType = true; break; + case SpvOpConstantSampler: *hasResult = true; *hasResultType = true; break; + case SpvOpConstantNull: *hasResult = true; *hasResultType = true; break; + case SpvOpSpecConstantTrue: *hasResult = true; *hasResultType = true; break; + case SpvOpSpecConstantFalse: *hasResult = true; *hasResultType = true; break; + case SpvOpSpecConstant: *hasResult = true; *hasResultType = true; break; + case SpvOpSpecConstantComposite: *hasResult = true; *hasResultType = true; break; + case SpvOpSpecConstantOp: *hasResult = true; *hasResultType = true; break; + case SpvOpFunction: 
*hasResult = true; *hasResultType = true; break; + case SpvOpFunctionParameter: *hasResult = true; *hasResultType = true; break; + case SpvOpFunctionEnd: *hasResult = false; *hasResultType = false; break; + case SpvOpFunctionCall: *hasResult = true; *hasResultType = true; break; + case SpvOpVariable: *hasResult = true; *hasResultType = true; break; + case SpvOpImageTexelPointer: *hasResult = true; *hasResultType = true; break; + case SpvOpLoad: *hasResult = true; *hasResultType = true; break; + case SpvOpStore: *hasResult = false; *hasResultType = false; break; + case SpvOpCopyMemory: *hasResult = false; *hasResultType = false; break; + case SpvOpCopyMemorySized: *hasResult = false; *hasResultType = false; break; + case SpvOpAccessChain: *hasResult = true; *hasResultType = true; break; + case SpvOpInBoundsAccessChain: *hasResult = true; *hasResultType = true; break; + case SpvOpPtrAccessChain: *hasResult = true; *hasResultType = true; break; + case SpvOpArrayLength: *hasResult = true; *hasResultType = true; break; + case SpvOpGenericPtrMemSemantics: *hasResult = true; *hasResultType = true; break; + case SpvOpInBoundsPtrAccessChain: *hasResult = true; *hasResultType = true; break; + case SpvOpDecorate: *hasResult = false; *hasResultType = false; break; + case SpvOpMemberDecorate: *hasResult = false; *hasResultType = false; break; + case SpvOpDecorationGroup: *hasResult = true; *hasResultType = false; break; + case SpvOpGroupDecorate: *hasResult = false; *hasResultType = false; break; + case SpvOpGroupMemberDecorate: *hasResult = false; *hasResultType = false; break; + case SpvOpVectorExtractDynamic: *hasResult = true; *hasResultType = true; break; + case SpvOpVectorInsertDynamic: *hasResult = true; *hasResultType = true; break; + case SpvOpVectorShuffle: *hasResult = true; *hasResultType = true; break; + case SpvOpCompositeConstruct: *hasResult = true; *hasResultType = true; break; + case SpvOpCompositeExtract: *hasResult = true; *hasResultType = true; break; + case SpvOpCompositeInsert: *hasResult = true; *hasResultType = true; break; + case SpvOpCopyObject: *hasResult = true; *hasResultType = true; break; + case SpvOpTranspose: *hasResult = true; *hasResultType = true; break; + case SpvOpSampledImage: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleImplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleExplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleDrefImplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleDrefExplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleProjImplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleProjExplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleProjDrefImplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleProjDrefExplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageFetch: *hasResult = true; *hasResultType = true; break; + case SpvOpImageGather: *hasResult = true; *hasResultType = true; break; + case SpvOpImageDrefGather: *hasResult = true; *hasResultType = true; break; + case SpvOpImageRead: *hasResult = true; *hasResultType = true; break; + case SpvOpImageWrite: *hasResult = false; *hasResultType = false; break; + case SpvOpImage: *hasResult = true; *hasResultType = true; break; + case SpvOpImageQueryFormat: *hasResult = true; *hasResultType = true; break; + case SpvOpImageQueryOrder: 
*hasResult = true; *hasResultType = true; break; + case SpvOpImageQuerySizeLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageQuerySize: *hasResult = true; *hasResultType = true; break; + case SpvOpImageQueryLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageQueryLevels: *hasResult = true; *hasResultType = true; break; + case SpvOpImageQuerySamples: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertFToU: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertFToS: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertSToF: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertUToF: *hasResult = true; *hasResultType = true; break; + case SpvOpUConvert: *hasResult = true; *hasResultType = true; break; + case SpvOpSConvert: *hasResult = true; *hasResultType = true; break; + case SpvOpFConvert: *hasResult = true; *hasResultType = true; break; + case SpvOpQuantizeToF16: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertPtrToU: *hasResult = true; *hasResultType = true; break; + case SpvOpSatConvertSToU: *hasResult = true; *hasResultType = true; break; + case SpvOpSatConvertUToS: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertUToPtr: *hasResult = true; *hasResultType = true; break; + case SpvOpPtrCastToGeneric: *hasResult = true; *hasResultType = true; break; + case SpvOpGenericCastToPtr: *hasResult = true; *hasResultType = true; break; + case SpvOpGenericCastToPtrExplicit: *hasResult = true; *hasResultType = true; break; + case SpvOpBitcast: *hasResult = true; *hasResultType = true; break; + case SpvOpSNegate: *hasResult = true; *hasResultType = true; break; + case SpvOpFNegate: *hasResult = true; *hasResultType = true; break; + case SpvOpIAdd: *hasResult = true; *hasResultType = true; break; + case SpvOpFAdd: *hasResult = true; *hasResultType = true; break; + case SpvOpISub: *hasResult = true; *hasResultType = true; break; + case SpvOpFSub: *hasResult = true; *hasResultType = true; break; + case SpvOpIMul: *hasResult = true; *hasResultType = true; break; + case SpvOpFMul: *hasResult = true; *hasResultType = true; break; + case SpvOpUDiv: *hasResult = true; *hasResultType = true; break; + case SpvOpSDiv: *hasResult = true; *hasResultType = true; break; + case SpvOpFDiv: *hasResult = true; *hasResultType = true; break; + case SpvOpUMod: *hasResult = true; *hasResultType = true; break; + case SpvOpSRem: *hasResult = true; *hasResultType = true; break; + case SpvOpSMod: *hasResult = true; *hasResultType = true; break; + case SpvOpFRem: *hasResult = true; *hasResultType = true; break; + case SpvOpFMod: *hasResult = true; *hasResultType = true; break; + case SpvOpVectorTimesScalar: *hasResult = true; *hasResultType = true; break; + case SpvOpMatrixTimesScalar: *hasResult = true; *hasResultType = true; break; + case SpvOpVectorTimesMatrix: *hasResult = true; *hasResultType = true; break; + case SpvOpMatrixTimesVector: *hasResult = true; *hasResultType = true; break; + case SpvOpMatrixTimesMatrix: *hasResult = true; *hasResultType = true; break; + case SpvOpOuterProduct: *hasResult = true; *hasResultType = true; break; + case SpvOpDot: *hasResult = true; *hasResultType = true; break; + case SpvOpIAddCarry: *hasResult = true; *hasResultType = true; break; + case SpvOpISubBorrow: *hasResult = true; *hasResultType = true; break; + case SpvOpUMulExtended: *hasResult = true; *hasResultType = true; break; + case SpvOpSMulExtended: *hasResult = true; *hasResultType = true; 
break; + case SpvOpAny: *hasResult = true; *hasResultType = true; break; + case SpvOpAll: *hasResult = true; *hasResultType = true; break; + case SpvOpIsNan: *hasResult = true; *hasResultType = true; break; + case SpvOpIsInf: *hasResult = true; *hasResultType = true; break; + case SpvOpIsFinite: *hasResult = true; *hasResultType = true; break; + case SpvOpIsNormal: *hasResult = true; *hasResultType = true; break; + case SpvOpSignBitSet: *hasResult = true; *hasResultType = true; break; + case SpvOpLessOrGreater: *hasResult = true; *hasResultType = true; break; + case SpvOpOrdered: *hasResult = true; *hasResultType = true; break; + case SpvOpUnordered: *hasResult = true; *hasResultType = true; break; + case SpvOpLogicalEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpLogicalNotEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpLogicalOr: *hasResult = true; *hasResultType = true; break; + case SpvOpLogicalAnd: *hasResult = true; *hasResultType = true; break; + case SpvOpLogicalNot: *hasResult = true; *hasResultType = true; break; + case SpvOpSelect: *hasResult = true; *hasResultType = true; break; + case SpvOpIEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpINotEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpUGreaterThan: *hasResult = true; *hasResultType = true; break; + case SpvOpSGreaterThan: *hasResult = true; *hasResultType = true; break; + case SpvOpUGreaterThanEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpSGreaterThanEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpULessThan: *hasResult = true; *hasResultType = true; break; + case SpvOpSLessThan: *hasResult = true; *hasResultType = true; break; + case SpvOpULessThanEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpSLessThanEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpFOrdEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpFUnordEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpFOrdNotEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpFUnordNotEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpFOrdLessThan: *hasResult = true; *hasResultType = true; break; + case SpvOpFUnordLessThan: *hasResult = true; *hasResultType = true; break; + case SpvOpFOrdGreaterThan: *hasResult = true; *hasResultType = true; break; + case SpvOpFUnordGreaterThan: *hasResult = true; *hasResultType = true; break; + case SpvOpFOrdLessThanEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpFUnordLessThanEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpFOrdGreaterThanEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpFUnordGreaterThanEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpShiftRightLogical: *hasResult = true; *hasResultType = true; break; + case SpvOpShiftRightArithmetic: *hasResult = true; *hasResultType = true; break; + case SpvOpShiftLeftLogical: *hasResult = true; *hasResultType = true; break; + case SpvOpBitwiseOr: *hasResult = true; *hasResultType = true; break; + case SpvOpBitwiseXor: *hasResult = true; *hasResultType = true; break; + case SpvOpBitwiseAnd: *hasResult = true; *hasResultType = true; break; + case SpvOpNot: *hasResult = true; *hasResultType = true; break; + case SpvOpBitFieldInsert: *hasResult = true; *hasResultType = true; break; + case SpvOpBitFieldSExtract: *hasResult = true; *hasResultType = true; break; + case SpvOpBitFieldUExtract: 
*hasResult = true; *hasResultType = true; break; + case SpvOpBitReverse: *hasResult = true; *hasResultType = true; break; + case SpvOpBitCount: *hasResult = true; *hasResultType = true; break; + case SpvOpDPdx: *hasResult = true; *hasResultType = true; break; + case SpvOpDPdy: *hasResult = true; *hasResultType = true; break; + case SpvOpFwidth: *hasResult = true; *hasResultType = true; break; + case SpvOpDPdxFine: *hasResult = true; *hasResultType = true; break; + case SpvOpDPdyFine: *hasResult = true; *hasResultType = true; break; + case SpvOpFwidthFine: *hasResult = true; *hasResultType = true; break; + case SpvOpDPdxCoarse: *hasResult = true; *hasResultType = true; break; + case SpvOpDPdyCoarse: *hasResult = true; *hasResultType = true; break; + case SpvOpFwidthCoarse: *hasResult = true; *hasResultType = true; break; + case SpvOpEmitVertex: *hasResult = false; *hasResultType = false; break; + case SpvOpEndPrimitive: *hasResult = false; *hasResultType = false; break; + case SpvOpEmitStreamVertex: *hasResult = false; *hasResultType = false; break; + case SpvOpEndStreamPrimitive: *hasResult = false; *hasResultType = false; break; + case SpvOpControlBarrier: *hasResult = false; *hasResultType = false; break; + case SpvOpMemoryBarrier: *hasResult = false; *hasResultType = false; break; + case SpvOpAtomicLoad: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicStore: *hasResult = false; *hasResultType = false; break; + case SpvOpAtomicExchange: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicCompareExchange: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicCompareExchangeWeak: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicIIncrement: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicIDecrement: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicIAdd: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicISub: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicSMin: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicUMin: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicSMax: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicUMax: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicAnd: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicOr: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicXor: *hasResult = true; *hasResultType = true; break; + case SpvOpPhi: *hasResult = true; *hasResultType = true; break; + case SpvOpLoopMerge: *hasResult = false; *hasResultType = false; break; + case SpvOpSelectionMerge: *hasResult = false; *hasResultType = false; break; + case SpvOpLabel: *hasResult = true; *hasResultType = false; break; + case SpvOpBranch: *hasResult = false; *hasResultType = false; break; + case SpvOpBranchConditional: *hasResult = false; *hasResultType = false; break; + case SpvOpSwitch: *hasResult = false; *hasResultType = false; break; + case SpvOpKill: *hasResult = false; *hasResultType = false; break; + case SpvOpReturn: *hasResult = false; *hasResultType = false; break; + case SpvOpReturnValue: *hasResult = false; *hasResultType = false; break; + case SpvOpUnreachable: *hasResult = false; *hasResultType = false; break; + case SpvOpLifetimeStart: *hasResult = false; *hasResultType = false; break; + case SpvOpLifetimeStop: *hasResult = false; *hasResultType = false; break; + case SpvOpGroupAsyncCopy: *hasResult = true; *hasResultType = true; break; + 
case SpvOpGroupWaitEvents: *hasResult = false; *hasResultType = false; break; + case SpvOpGroupAll: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupAny: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupBroadcast: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupIAdd: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupFAdd: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupFMin: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupUMin: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupSMin: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupFMax: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupUMax: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupSMax: *hasResult = true; *hasResultType = true; break; + case SpvOpReadPipe: *hasResult = true; *hasResultType = true; break; + case SpvOpWritePipe: *hasResult = true; *hasResultType = true; break; + case SpvOpReservedReadPipe: *hasResult = true; *hasResultType = true; break; + case SpvOpReservedWritePipe: *hasResult = true; *hasResultType = true; break; + case SpvOpReserveReadPipePackets: *hasResult = true; *hasResultType = true; break; + case SpvOpReserveWritePipePackets: *hasResult = true; *hasResultType = true; break; + case SpvOpCommitReadPipe: *hasResult = false; *hasResultType = false; break; + case SpvOpCommitWritePipe: *hasResult = false; *hasResultType = false; break; + case SpvOpIsValidReserveId: *hasResult = true; *hasResultType = true; break; + case SpvOpGetNumPipePackets: *hasResult = true; *hasResultType = true; break; + case SpvOpGetMaxPipePackets: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupReserveReadPipePackets: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupReserveWritePipePackets: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupCommitReadPipe: *hasResult = false; *hasResultType = false; break; + case SpvOpGroupCommitWritePipe: *hasResult = false; *hasResultType = false; break; + case SpvOpEnqueueMarker: *hasResult = true; *hasResultType = true; break; + case SpvOpEnqueueKernel: *hasResult = true; *hasResultType = true; break; + case SpvOpGetKernelNDrangeSubGroupCount: *hasResult = true; *hasResultType = true; break; + case SpvOpGetKernelNDrangeMaxSubGroupSize: *hasResult = true; *hasResultType = true; break; + case SpvOpGetKernelWorkGroupSize: *hasResult = true; *hasResultType = true; break; + case SpvOpGetKernelPreferredWorkGroupSizeMultiple: *hasResult = true; *hasResultType = true; break; + case SpvOpRetainEvent: *hasResult = false; *hasResultType = false; break; + case SpvOpReleaseEvent: *hasResult = false; *hasResultType = false; break; + case SpvOpCreateUserEvent: *hasResult = true; *hasResultType = true; break; + case SpvOpIsValidEvent: *hasResult = true; *hasResultType = true; break; + case SpvOpSetUserEventStatus: *hasResult = false; *hasResultType = false; break; + case SpvOpCaptureEventProfilingInfo: *hasResult = false; *hasResultType = false; break; + case SpvOpGetDefaultQueue: *hasResult = true; *hasResultType = true; break; + case SpvOpBuildNDRange: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseSampleImplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseSampleExplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseSampleDrefImplicitLod: *hasResult = true; *hasResultType = true; break; + case 
SpvOpImageSparseSampleDrefExplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseSampleProjImplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseSampleProjExplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseSampleProjDrefImplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseSampleProjDrefExplicitLod: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseFetch: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseGather: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseDrefGather: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSparseTexelsResident: *hasResult = true; *hasResultType = true; break; + case SpvOpNoLine: *hasResult = false; *hasResultType = false; break; + case SpvOpAtomicFlagTestAndSet: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicFlagClear: *hasResult = false; *hasResultType = false; break; + case SpvOpImageSparseRead: *hasResult = true; *hasResultType = true; break; + case SpvOpSizeOf: *hasResult = true; *hasResultType = true; break; + case SpvOpTypePipeStorage: *hasResult = true; *hasResultType = false; break; + case SpvOpConstantPipeStorage: *hasResult = true; *hasResultType = true; break; + case SpvOpCreatePipeFromPipeStorage: *hasResult = true; *hasResultType = true; break; + case SpvOpGetKernelLocalSizeForSubgroupCount: *hasResult = true; *hasResultType = true; break; + case SpvOpGetKernelMaxNumSubgroups: *hasResult = true; *hasResultType = true; break; + case SpvOpTypeNamedBarrier: *hasResult = true; *hasResultType = false; break; + case SpvOpNamedBarrierInitialize: *hasResult = true; *hasResultType = true; break; + case SpvOpMemoryNamedBarrier: *hasResult = false; *hasResultType = false; break; + case SpvOpModuleProcessed: *hasResult = false; *hasResultType = false; break; + case SpvOpExecutionModeId: *hasResult = false; *hasResultType = false; break; + case SpvOpDecorateId: *hasResult = false; *hasResultType = false; break; + case SpvOpGroupNonUniformElect: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformAll: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformAny: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformAllEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBroadcast: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBroadcastFirst: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBallot: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformInverseBallot: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBallotBitExtract: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBallotBitCount: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBallotFindLSB: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBallotFindMSB: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformShuffle: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformShuffleXor: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformShuffleUp: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformShuffleDown: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformIAdd: *hasResult = 
true; *hasResultType = true; break; + case SpvOpGroupNonUniformFAdd: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformIMul: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformFMul: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformSMin: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformUMin: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformFMin: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformSMax: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformUMax: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformFMax: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBitwiseAnd: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBitwiseOr: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformBitwiseXor: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformLogicalAnd: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformLogicalOr: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformLogicalXor: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformQuadBroadcast: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformQuadSwap: *hasResult = true; *hasResultType = true; break; + case SpvOpCopyLogical: *hasResult = true; *hasResultType = true; break; + case SpvOpPtrEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpPtrNotEqual: *hasResult = true; *hasResultType = true; break; + case SpvOpPtrDiff: *hasResult = true; *hasResultType = true; break; + case SpvOpTerminateInvocation: *hasResult = false; *hasResultType = false; break; + case SpvOpSubgroupBallotKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupFirstInvocationKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAllKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAnyKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAllEqualKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupNonUniformRotateKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupReadInvocationKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpTraceRayKHR: *hasResult = false; *hasResultType = false; break; + case SpvOpExecuteCallableKHR: *hasResult = false; *hasResultType = false; break; + case SpvOpConvertUToAccelerationStructureKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpIgnoreIntersectionKHR: *hasResult = false; *hasResultType = false; break; + case SpvOpTerminateRayKHR: *hasResult = false; *hasResultType = false; break; + case SpvOpSDot: *hasResult = true; *hasResultType = true; break; + case SpvOpUDot: *hasResult = true; *hasResultType = true; break; + case SpvOpSUDot: *hasResult = true; *hasResultType = true; break; + case SpvOpSDotAccSat: *hasResult = true; *hasResultType = true; break; + case SpvOpUDotAccSat: *hasResult = true; *hasResultType = true; break; + case SpvOpSUDotAccSat: *hasResult = true; *hasResultType = true; break; + case SpvOpTypeRayQueryKHR: *hasResult = true; *hasResultType = false; break; + case SpvOpRayQueryInitializeKHR: *hasResult = false; *hasResultType = false; break; + case SpvOpRayQueryTerminateKHR: *hasResult = false; *hasResultType = false; break; + case 
SpvOpRayQueryGenerateIntersectionKHR: *hasResult = false; *hasResultType = false; break; + case SpvOpRayQueryConfirmIntersectionKHR: *hasResult = false; *hasResultType = false; break; + case SpvOpRayQueryProceedKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionTypeKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupIAddNonUniformAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupFAddNonUniformAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupFMinNonUniformAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupUMinNonUniformAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupSMinNonUniformAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupFMaxNonUniformAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupUMaxNonUniformAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupSMaxNonUniformAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpFragmentMaskFetchAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpFragmentFetchAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpReadClockKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpImageSampleFootprintNV: *hasResult = true; *hasResultType = true; break; + case SpvOpEmitMeshTasksEXT: *hasResult = false; *hasResultType = false; break; + case SpvOpSetMeshOutputsEXT: *hasResult = false; *hasResultType = false; break; + case SpvOpGroupNonUniformPartitionNV: *hasResult = true; *hasResultType = true; break; + case SpvOpWritePackedPrimitiveIndices4x8NV: *hasResult = false; *hasResultType = false; break; + case SpvOpReportIntersectionNV: *hasResult = true; *hasResultType = true; break; + case SpvOpIgnoreIntersectionNV: *hasResult = false; *hasResultType = false; break; + case SpvOpTerminateRayNV: *hasResult = false; *hasResultType = false; break; + case SpvOpTraceNV: *hasResult = false; *hasResultType = false; break; + case SpvOpTraceMotionNV: *hasResult = false; *hasResultType = false; break; + case SpvOpTraceRayMotionNV: *hasResult = false; *hasResultType = false; break; + case SpvOpTypeAccelerationStructureNV: *hasResult = true; *hasResultType = false; break; + case SpvOpExecuteCallableNV: *hasResult = false; *hasResultType = false; break; + case SpvOpTypeCooperativeMatrixNV: *hasResult = true; *hasResultType = false; break; + case SpvOpCooperativeMatrixLoadNV: *hasResult = true; *hasResultType = true; break; + case SpvOpCooperativeMatrixStoreNV: *hasResult = false; *hasResultType = false; break; + case SpvOpCooperativeMatrixMulAddNV: *hasResult = true; *hasResultType = true; break; + case SpvOpCooperativeMatrixLengthNV: *hasResult = true; *hasResultType = true; break; + case SpvOpBeginInvocationInterlockEXT: *hasResult = false; *hasResultType = false; break; + case SpvOpEndInvocationInterlockEXT: *hasResult = false; *hasResultType = false; break; + case SpvOpDemoteToHelperInvocation: *hasResult = false; *hasResultType = false; break; + case SpvOpIsHelperInvocationEXT: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertUToImageNV: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertUToSamplerNV: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertImageToUNV: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertSamplerToUNV: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertUToSampledImageNV: *hasResult = true; *hasResultType = 
true; break; + case SpvOpConvertSampledImageToUNV: *hasResult = true; *hasResultType = true; break; + case SpvOpSamplerImageAddressingModeNV: *hasResult = false; *hasResultType = false; break; + case SpvOpSubgroupShuffleINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupShuffleDownINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupShuffleUpINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupShuffleXorINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupBlockReadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupBlockWriteINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpSubgroupImageBlockReadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupImageBlockWriteINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpSubgroupImageMediaBlockReadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupImageMediaBlockWriteINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpUCountLeadingZerosINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpUCountTrailingZerosINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpAbsISubINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpAbsUSubINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpIAddSatINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpUAddSatINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpIAverageINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpUAverageINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpIAverageRoundedINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpUAverageRoundedINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpISubSatINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpUSubSatINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpIMul32x16INTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpUMul32x16INTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpConstantFunctionPointerINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFunctionPointerCallINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpAsmTargetINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpAsmINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpAsmCallINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicFMinEXT: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicFMaxEXT: *hasResult = true; *hasResultType = true; break; + case SpvOpAssumeTrueKHR: *hasResult = false; *hasResultType = false; break; + case SpvOpExpectKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpDecorateString: *hasResult = false; *hasResultType = false; break; + case SpvOpMemberDecorateString: *hasResult = false; *hasResultType = false; break; + case SpvOpVmeImageINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpTypeVmeImageINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcImePayloadINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcRefPayloadINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcSicPayloadINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcMcePayloadINTEL: *hasResult = true; *hasResultType = false; break; + case 
SpvOpTypeAvcMceResultINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcImeResultINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcImeResultSingleReferenceStreamoutINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcImeResultDualReferenceStreamoutINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcImeSingleReferenceStreaminINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcImeDualReferenceStreaminINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcRefResultINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeAvcSicResultINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceSetInterShapePenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceSetInterDirectionPenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceSetMotionVectorCostFunctionINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceSetAcOnlyHaarINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceConvertToImePayloadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceConvertToImeResultINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceConvertToRefPayloadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceConvertToRefResultINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceConvertToSicPayloadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceConvertToSicResultINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetMotionVectorsINTEL: 
*hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetInterDistortionsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetBestInterDistortionsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetInterMajorShapeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetInterMinorShapeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetInterDirectionsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetInterMotionVectorCountINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetInterReferenceIdsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeInitializeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeSetSingleReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeSetDualReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeRefWindowSizeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeAdjustRefOffsetINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeConvertToMcePayloadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeSetMaxMotionVectorCountINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeSetUnidirectionalMixDisableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeSetWeightedSadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeEvaluateWithSingleReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeEvaluateWithDualReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeConvertToMceResultINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetSingleReferenceStreaminINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetDualReferenceStreaminINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeStripSingleReferenceStreamoutINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeStripDualReferenceStreamoutINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL: *hasResult = true; *hasResultType = true; break; + case 
SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetBorderReachedINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetTruncatedSearchIndicationINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcFmeInitializeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcBmeInitializeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcRefConvertToMcePayloadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcRefSetBidirectionalMixDisableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcRefSetBilinearFilterEnableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcRefEvaluateWithSingleReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcRefEvaluateWithDualReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcRefEvaluateWithMultiReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcRefConvertToMceResultINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicInitializeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicConfigureSkcINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicConfigureIpeLumaINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicConfigureIpeLumaChromaINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetMotionVectorMaskINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicConvertToMcePayloadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicSetBilinearFilterEnableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicSetSkcForwardTransformEnableINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicEvaluateIpeINTEL: *hasResult = true; 
*hasResultType = true; break; + case SpvOpSubgroupAvcSicEvaluateWithSingleReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicEvaluateWithDualReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicEvaluateWithMultiReferenceINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicConvertToMceResultINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetIpeLumaShapeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetBestIpeLumaDistortionINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetBestIpeChromaDistortionINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetPackedIpeLumaModesINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetIpeChromaModeINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSubgroupAvcSicGetInterRawSadsINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpVariableLengthArrayINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpSaveMemoryINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpRestoreMemoryINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpArbitraryFloatSinCosPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatCastINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatCastFromIntINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatCastToIntINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatAddINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatSubINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatMulINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatDivINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatGTINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatGEINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatLTINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatLEINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatEQINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatRecipINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatRSqrtINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatCbrtINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatHypotINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatSqrtINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatLogINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatLog2INTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatLog10INTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatLog1pINTEL: *hasResult = true; *hasResultType = true; break; + 
case SpvOpArbitraryFloatExpINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatExp2INTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatExp10INTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatExpm1INTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatSinINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatCosINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatSinCosINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatSinPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatCosPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatASinINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatASinPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatACosINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatACosPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatATanINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatATanPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatATan2INTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatPowINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatPowRINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpArbitraryFloatPowNINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpLoopControlINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpAliasDomainDeclINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpAliasScopeDeclINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpAliasScopeListDeclINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpFixedSqrtINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedRecipINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedRsqrtINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedSinINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedCosINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedSinCosINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedSinPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedCosPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedSinCosPiINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedLogINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFixedExpINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpPtrCastToCrossWorkgroupINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpCrossWorkgroupCastToPtrINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpReadPipeBlockingINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpWritePipeBlockingINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpFPGARegINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetRayTMinKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetRayFlagsKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionTKHR: *hasResult = true; *hasResultType = true; break; + case 
SpvOpRayQueryGetIntersectionInstanceCustomIndexKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionInstanceIdKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionGeometryIndexKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionPrimitiveIndexKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionBarycentricsKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionFrontFaceKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionCandidateAABBOpaqueKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionObjectRayDirectionKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionObjectRayOriginKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetWorldRayDirectionKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetWorldRayOriginKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionObjectToWorldKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpRayQueryGetIntersectionWorldToObjectKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpAtomicFAddEXT: *hasResult = true; *hasResultType = true; break; + case SpvOpTypeBufferSurfaceINTEL: *hasResult = true; *hasResultType = false; break; + case SpvOpTypeStructContinuedINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpConstantCompositeContinuedINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpSpecConstantCompositeContinuedINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpControlBarrierArriveINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpControlBarrierWaitINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpGroupIMulKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupFMulKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupBitwiseAndKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupBitwiseOrKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupBitwiseXorKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupLogicalAndKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupLogicalOrKHR: *hasResult = true; *hasResultType = true; break; + case SpvOpGroupLogicalXorKHR: *hasResult = true; *hasResultType = true; break; + } +} +#endif /* SPV_ENABLE_UTILITY_CODE */ + +#endif diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index fbd702222e2e..8ae729dd0052 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -664,6 +664,11 @@ void SpvModule::set_binding_count(SpvId val) { contents->binding_count = val; } +void SpvModule::set_version_format(uint32_t val) { + check_defined(); + contents->version_format = val; +} + void SpvModule::set_source_language(SpvSourceLanguage val) { check_defined(); contents->source_language = val; @@ -1180,10 +1185,10 @@ SpvId SpvBuilder::declare_function(const std::string &name, SpvId function_type) return add_function(name, function_type); } -SpvId SpvBuilder::declare_constant(const Type &type, const void *data) { - SpvId result_id = lookup_constant(type, data); +SpvId SpvBuilder::declare_constant(const Type &type, const void *data, bool 
is_specialization) { + SpvId result_id = lookup_constant(type, data, is_specialization); if (result_id == SpvInvalidId) { - result_id = add_constant(type, data); + result_id = add_constant(type, data, is_specialization); } return result_id; } @@ -1306,13 +1311,24 @@ void SpvBuilder::add_struct_annotation(SpvId struct_type_id, uint32_t member_ind } void SpvBuilder::add_execution_mode_local_size(SpvId func_id, - uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z) { + uint32_t local_size_x, + uint32_t local_size_y, + uint32_t local_size_z) { + + local_size_x = std::max(local_size_x, (uint32_t)1); + local_size_y = std::max(local_size_y, (uint32_t)1); + local_size_z = std::max(local_size_z, (uint32_t)1); + + SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(func_id, local_size_x, local_size_y, local_size_z); + module.add_execution_mode(exec_mode_inst); +} - wg_size_x = std::max(wg_size_x, (uint32_t)1); - wg_size_y = std::max(wg_size_y, (uint32_t)1); - wg_size_z = std::max(wg_size_z, (uint32_t)1); +void SpvBuilder::add_execution_mode_local_size_id(SpvId func_id, + SpvId local_size_x_id, + SpvId local_size_y_id, + SpvId local_size_z_id) { - SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(func_id, wg_size_x, wg_size_y, wg_size_z); + SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(func_id, local_size_x_id, local_size_y_id, local_size_z_id); module.add_execution_mode(exec_mode_inst); } @@ -1409,6 +1425,10 @@ SpvModule SpvBuilder::current_module() const { return module; } +void SpvBuilder::set_version_format(uint32_t val) { + module.set_version_format(val); +} + void SpvBuilder::set_source_language(SpvSourceLanguage val) { module.set_source_language(val); } @@ -1683,11 +1703,12 @@ SpvId SpvBuilder::add_pointer_type(SpvId base_type_id, SpvStorageClass storage_c return pointer_type_id; } -SpvBuilder::ConstantKey SpvBuilder::make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data) const { +SpvBuilder::ConstantKey SpvBuilder::make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data, bool is_specialization) const { ConstantKey key = hash_splitmix64(code); key = hash_combine(key, bits); key = hash_combine(key, lanes); key = hash_combine(key, bytes); + key = hash_combine(key, is_specialization ? uint64_t(-1) : uint64_t(1)); if (data != nullptr) { const int8_t *ptr = reinterpret_bits(data); @@ -1698,8 +1719,8 @@ SpvBuilder::ConstantKey SpvBuilder::make_constant_key(uint8_t code, uint8_t bits return key; } -SpvBuilder::ConstantKey SpvBuilder::make_constant_key(const Type &type, const void *data) const { - return make_constant_key(type.code(), type.bits(), type.lanes(), type.bytes(), data); +SpvBuilder::ConstantKey SpvBuilder::make_constant_key(const Type &type, const void *data, bool is_specialization) const { + return make_constant_key(type.code(), type.bits(), type.lanes(), type.bytes(), data, is_specialization); } SpvBuilder::ConstantKey SpvBuilder::make_bool_constant_key(bool value) const { @@ -1815,6 +1836,37 @@ SpvId SpvBuilder::declare_scalar_constant_of_type(const Type &scalar_type, const return result_id; } +template +SpvId SpvBuilder::declare_specialization_constant_of_type(const Type &scalar_type, const T *data) { + + SpvId result_id = SpvInvalidId; + SpvValueType value_type = SpvInvalidValueType; + // TODO: Add bools? 
+ if (scalar_type.is_float()) { + result_id = make_id(SpvFloatConstantId); + value_type = SpvFloatData; + } else if (scalar_type.is_int_or_uint()) { + result_id = make_id(SpvIntConstantId); + value_type = SpvIntegerData; + } else { + internal_error << "SPIRV: Unsupported type for specialization constant: " << scalar_type << "\n"; + return SpvInvalidId; + } + + T value = T(0); + assign_constant(&value, data); + SpvId type_id = add_type(scalar_type); + + debug(3) << " declare_specialization_constant_of_type: " + << "%" << result_id << " " + << "type=" << scalar_type << " " + << "data=" << stringify_constant(value) << "\n"; + + SpvInstruction inst = SpvFactory::specialization_constant(result_id, type_id, scalar_type.bytes(), &value, value_type); + module.add_type(inst); // NOTE: Needs to be declared in the type section in order to be used with other type definitions + return result_id; +} + SpvId SpvBuilder::declare_integer_constant(const Type &type, int64_t value) { if (!type.is_int() || !type.is_scalar()) { internal_error << "SPIRV: Invalid type provided for integer constant!" << type << "\n"; @@ -1988,8 +2040,48 @@ SpvId SpvBuilder::declare_vector_constant(const Type &type, const void *data) { return result_id; } -SpvId SpvBuilder::lookup_constant(const Type &type, const void *data) const { - ConstantKey key = make_constant_key(type, data); +SpvId SpvBuilder::declare_specialization_constant(const Type &scalar_type, const void *data) { + if (scalar_type.lanes() != 1) { + internal_error << "SPIRV: Invalid type provided for scalar constant!" << scalar_type << "\n"; + return SpvInvalidId; + } + + SpvId result_id = SpvInvalidId; + if (scalar_type.is_int() && scalar_type.bits() == 8) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_int() && scalar_type.bits() == 16) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_int() && scalar_type.bits() == 32) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_int() && scalar_type.bits() == 64) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_uint() && scalar_type.bits() == 8) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_uint() && scalar_type.bits() == 16) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_uint() && scalar_type.bits() == 32) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_uint() && scalar_type.bits() == 64) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_float() && scalar_type.bits() == 16) { + if (scalar_type.is_bfloat()) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } + } else if (scalar_type.is_float() && scalar_type.bits() == 32) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else if (scalar_type.is_float() && scalar_type.bits() == 64) { + result_id = declare_specialization_constant_of_type(scalar_type, reinterpret_cast(data)); + } else { + user_error << 
"Unhandled constant data conversion from value type '" << scalar_type << "'!\n"; + } + internal_assert(result_id != SpvInvalidId) << "Failed to declare specialization constant of type '" << scalar_type << "'!\n"; + return result_id; +} + +SpvId SpvBuilder::lookup_constant(const Type &type, const void *data, bool is_specialization) const { + ConstantKey key = make_constant_key(type, data, is_specialization); ConstantMap::const_iterator it = constant_map.find(key); if (it != constant_map.end()) { return it->second; @@ -1997,15 +2089,17 @@ SpvId SpvBuilder::lookup_constant(const Type &type, const void *data) const { return SpvInvalidId; } -SpvId SpvBuilder::add_constant(const Type &type, const void *data) { +SpvId SpvBuilder::add_constant(const Type &type, const void *data, bool is_specialization) { - ConstantKey key = make_constant_key(type, data); + ConstantKey key = make_constant_key(type, data, is_specialization); ConstantMap::const_iterator it = constant_map.find(key); if (it != constant_map.end()) { return it->second; } - if (type.lanes() == 1) { + if (is_specialization) { + return declare_specialization_constant(type, data); + } else if (type.lanes() == 1) { return declare_scalar_constant(type, data); } else { return declare_vector_constant(type, data); @@ -2067,6 +2161,13 @@ SpvId SpvBuilder::add_runtime_array(SpvId base_type_id) { return runtime_array_id; } +SpvId SpvBuilder::add_array_with_default_size(SpvId base_type_id, SpvId array_size_id) { + SpvId array_id = make_id(SpvArrayTypeId); + SpvInstruction inst = SpvFactory::array_type(array_id, base_type_id, array_size_id); + module.add_type(inst); + return array_id; +} + bool SpvBuilder::is_pointer_type(SpvId id) const { BaseTypeMap::const_iterator it = base_type_map.find(id); if (it != base_type_map.end()) { @@ -2332,6 +2433,14 @@ SpvInstruction SpvFactory::composite_constant(SpvId result_id, SpvId type_id, co return inst; } +SpvInstruction SpvFactory::specialization_constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data, SpvValueType value_type) { + SpvInstruction inst = SpvInstruction::make(SpvOpSpecConstant); + inst.set_type_id(type_id); + inst.set_result_id(result_id); + inst.add_data(bytes, data, value_type); + return inst; +} + SpvInstruction SpvFactory::variable(SpvId result_id, SpvId result_type_id, uint32_t storage_class, SpvId initializer_id) { SpvInstruction inst = SpvInstruction::make(SpvOpVariable); inst.set_type_id(result_type_id); @@ -2389,14 +2498,28 @@ SpvInstruction SpvFactory::memory_model(SpvAddressingModel addressing_model, Spv return inst; } -SpvInstruction SpvFactory::exec_mode_local_size(SpvId function_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z) { +SpvInstruction SpvFactory::exec_mode_local_size(SpvId function_id, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) { SpvInstruction inst = SpvInstruction::make(SpvOpExecutionMode); inst.add_operand(function_id); inst.add_immediates({ {SpvExecutionModeLocalSize, SpvIntegerLiteral}, - {wg_size_x, SpvIntegerLiteral}, - {wg_size_y, SpvIntegerLiteral}, - {wg_size_z, SpvIntegerLiteral}, + {local_size_x, SpvIntegerLiteral}, + {local_size_y, SpvIntegerLiteral}, + {local_size_z, SpvIntegerLiteral}, + }); + return inst; +} + +SpvInstruction SpvFactory::exec_mode_local_size_id(SpvId function_id, SpvId local_size_x_id, SpvId local_size_y_id, SpvId local_size_z_id) { + SpvInstruction inst = SpvInstruction::make(SpvOpExecutionModeId); + inst.add_operand(function_id); + inst.add_immediates({ + 
{SpvExecutionModeLocalSizeId, SpvIntegerLiteral}, + }); + inst.add_operands({ + local_size_x_id, + local_size_y_id, + local_size_z_id }); return inst; } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index 80afb78ce3c9..f28cfeb1172d 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -23,8 +23,8 @@ #include "IntrusivePtr.h" #include "Type.h" -#include // GLSL extended instructions for common intrinsics -#include // Use v1.0 spec as the minimal viable version (for maximum compatiblity) +#include // GLSL extended instructions for common intrinsics +#include // Use v1.6 headers but only use the minimal viable format version (for maximum compatiblity) namespace Halide { namespace Internal { @@ -296,6 +296,7 @@ class SpvModule { void require_capability(SpvCapability val); void require_extension(const std::string &val); + void set_version_format(uint32_t version); void set_source_language(SpvSourceLanguage val); void set_addressing_model(SpvAddressingModel val); void set_memory_model(SpvMemoryModel val); @@ -386,7 +387,7 @@ class SpvBuilder { SpvId declare_type(const Type &type, uint32_t array_size = 1); SpvId declare_pointer_type(const Type &type, SpvStorageClass storage_class); SpvId declare_pointer_type(SpvId type_id, SpvStorageClass storage_class); - SpvId declare_constant(const Type &type, const void *data); + SpvId declare_constant(const Type &type, const void *data, bool is_specialization=false); SpvId declare_null_constant(const Type &type); SpvId declare_bool_constant(bool value); SpvId declare_string_constant(const std::string &str); @@ -394,6 +395,7 @@ class SpvBuilder { SpvId declare_float_constant(const Type &type, double value); SpvId declare_scalar_constant(const Type &type, const void *data); SpvId declare_vector_constant(const Type &type, const void *data); + SpvId declare_specialization_constant(const Type &type, const void *data); SpvId declare_access_chain(SpvId ptr_type_id, SpvId base_id, const Indices &indices); SpvId declare_pointer_access_chain(SpvId ptr_type_id, SpvId base_id, SpvId element_id, const Indices &indices); SpvId declare_function_type(SpvId return_type, const ParamTypes ¶m_types = {}); @@ -407,10 +409,11 @@ class SpvBuilder { // number of checks and the caller must insure that duplicates aren't created SpvId add_type(const Type &type, uint32_t array_size = 1); SpvId add_struct(const std::string &name, const StructMemberTypes &member_types); + SpvId add_array_with_default_size(SpvId base_type_id, SpvId array_size_id); SpvId add_runtime_array(SpvId base_type_id); SpvId add_pointer_type(const Type &type, SpvStorageClass storage_class); SpvId add_pointer_type(SpvId base_type_id, SpvStorageClass storage_class); - SpvId add_constant(const Type &type, const void *data); + SpvId add_constant(const Type &type, const void *data, bool is_specialization=false); SpvId add_function_type(SpvId return_type_id, const ParamTypes ¶m_type_ids); SpvId add_function(const std::string &name, SpvId return_type, const ParamTypes ¶m_types = {}); SpvId add_instruction(SpvInstruction val); @@ -422,7 +425,14 @@ class SpvBuilder { void add_entry_point(SpvId func_id, SpvExecutionModel exec_model, const Variables &variables = {}); - void add_execution_mode_local_size(SpvId entry_point_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z); + // Define the execution mode with a fixed local size for the workgroup (using literal values) + void add_execution_mode_local_size(SpvId entry_point_id, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z); + + // Same as above but 
uses id's for the local size (to allow specialization constants to be used) + void add_execution_mode_local_size_id(SpvId entry_point_id, SpvId local_size_x, SpvId local_size_y, SpvId local_size_z); + + // Assigns a specific SPIR-V version format for output (needed for compatibility) + void set_version_format(uint32_t version); // Assigns a specific source language hint to the module void set_source_language(SpvSourceLanguage val); @@ -573,14 +583,17 @@ class SpvBuilder { template SpvId declare_scalar_constant_of_type(const Type &scalar_type, const T *data); + template + SpvId declare_specialization_constant_of_type(const Type &scalar_type, const T *data); + template SpvBuilder::Components declare_constants_for_each_lane(Type type, const void *data); ConstantKey make_bool_constant_key(bool value) const; ConstantKey make_string_constant_key(const std::string &value) const; - ConstantKey make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data) const; - ConstantKey make_constant_key(const Type &type, const void *data) const; - SpvId lookup_constant(const Type &type, const void *data) const; + ConstantKey make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data, bool is_specialization=false) const; + ConstantKey make_constant_key(const Type &type, const void *data, bool is_specialization=false) const; + SpvId lookup_constant(const Type &type, const void *data, bool is_specialization=false) const; ConstantKey make_null_constant_key(const Type &type) const; SpvId lookup_null_constant(const Type &type) const; @@ -649,6 +662,7 @@ struct SpvFactory { static SpvInstruction bool_constant(SpvId result_id, SpvId type_id, bool value); static SpvInstruction string_constant(SpvId result_id, const std::string &value); static SpvInstruction composite_constant(SpvId result_id, SpvId type_id, const Components &components); + static SpvInstruction specialization_constant(SpvId result_id, SpvId type_id, size_t bytes, const void *data, SpvValueType value_type); static SpvInstruction variable(SpvId result_id, SpvId result_type_id, uint32_t storage_class, SpvId initializer_id = SpvInvalidId); static SpvInstruction function(SpvId return_type_id, SpvId func_id, uint32_t control_mask, SpvId func_type_id); static SpvInstruction function_parameter(SpvId param_type_id, SpvId param_id); @@ -656,7 +670,8 @@ struct SpvFactory { static SpvInstruction return_stmt(SpvId return_value_id = SpvInvalidId); static SpvInstruction entry_point(SpvId exec_model, SpvId func_id, const std::string &name, const Variables &variables); static SpvInstruction memory_model(SpvAddressingModel addressing_model, SpvMemoryModel memory_model); - static SpvInstruction exec_mode_local_size(SpvId function_id, uint32_t wg_size_x, uint32_t wg_size_y, uint32_t wg_size_z); + static SpvInstruction exec_mode_local_size(SpvId function_id, uint32_t local_size_size_x, uint32_t local_size_size_y, uint32_t local_size_size_z); + static SpvInstruction exec_mode_local_size_id(SpvId function_id, SpvId local_size_x_id, SpvId local_size_y_id, SpvId local_size_z_id); // only avail in 1.2 static SpvInstruction memory_barrier(SpvId memory_scope_id, SpvId semantics_mask_id); static SpvInstruction control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, SpvId semantics_mask_id); static SpvInstruction bitwise_not(SpvId type_id, SpvId result_id, SpvId src_id); From a6ee0c3fca4826d8224901bc85389fa074784508 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 2 Dec 2022 17:08:05 -0800 Subject: [PATCH 
117/166] Add support for dynamic shared memory allocations for Vulkan Add dynamic workgroup dispatching to Vulkan Add optional feature flags for Vulkan capabilities Add Vulkan API version flags for target features Enable v1.3 path if requested Re-enable tests for added features Update Vulkan docs with status updates and feature flags --- README_vulkan.md | 441 ++--------------- .../src/halide/halide_/PyEnums.cpp | 8 + src/BoundSmallAllocations.cpp | 9 - src/CodeGen_Vulkan_Dev.cpp | 453 +++++++++++++++--- src/SpirvIR.cpp | 18 +- src/SpirvIR.h | 12 +- src/Target.cpp | 101 +++- src/Target.h | 13 + src/runtime/HalideRuntime.h | 8 + src/runtime/mini_vulkan.h | 11 + src/runtime/vulkan.cpp | 138 +++--- src/runtime/vulkan_context.h | 51 +- src/runtime/vulkan_internal.h | 18 +- src/runtime/vulkan_memory.h | 2 +- src/runtime/vulkan_resources.h | 444 ++++++++++++++--- test/correctness/async_copy_chain.cpp | 6 - test/correctness/gpu_dynamic_shared.cpp | 16 +- .../gpu_mixed_shared_mem_types.cpp | 1 - test/correctness/gpu_reuse_shared_memory.cpp | 2 - test/correctness/math.cpp | 104 ++-- test/correctness/newtons_method.cpp | 8 +- 21 files changed, 1169 insertions(+), 695 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index fa10ba3b1321..55bde5e1acc1 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -5,11 +5,11 @@ devices, and compiles directly to a binary SPIR-V representation as part of its code generation before submitting it to the Vulkan API. Both JIT and AOT usage are supported via the `vulkan` target flag (eg `HL_JIT_TARGET=host-vulkan`). -Vulkan support is actively under development, and considered *EXPERIMENTAL* -at this stage. Basic tests are passing, but there's still work to do to -until we have adequate feature parity for production use. +Vulkan support is actively under development, and considered *BETA* quality +at this stage. Tests are passing, but performance tuning and user testing are needed +to identify potential issues before rolling this into production. -See [below](#current-status) for details on specific test cases. +See [below](#current-status) for details. # Compiling Halide w/Vulkan Support @@ -34,7 +34,7 @@ C:\> cmake --build build --config Release Halide has no direct dependency on Vulkan for code-generation, but the runtime requires a working Vulkan environment to run Halide generated code. Any valid -Vulkan v1.0+ device driver should work. +Vulkan v1.0+ device driver should work. Specifically, you'll need: @@ -131,18 +131,32 @@ Make sure everything looks correct before continuing! # Targetting Vulkan -To generate Halide code for Vulkan, simply add the `vulkan` flag to your target. 
+To generate Halide code for Vulkan, simply add the `vulkan` flag to your target as well as any other optional device-specific features you wish to enable for Halide: -For AOT generators add `vulkan` to the target command line option: +| Target Feature | Description | +| -- | -- | +| `vulkan` | Enables the vulkan backend | +| `vk_int8` | Allows 8-bit integer storage types to be used | +| `vk_int16` | Allows 16-bit integer storage types to be used | +| `vk_int64` | Allows 64-bit integer storage types to be used | +| `vk_float16` | Allows 16-bit floating-point values to be used for computation | +| `vk_float64` | Allows 64-bit floating-point values to be used for computation | +| `vk_v10` | Generates code compatible with the Vulkan v1.0+ API | +| `vk_v12` | Generates code compatible with the Vulkan v1.2+ API | +| `vk_v13` | Generates code compatible with the Vulkan v1.3+ API | + +Note that all optional device features are off by default (since they are not required by the Vulkan API, and thus must be explicitly enabled to ensure that the code being generated will be compatible with the device and API version being used for execution). + +For AOT generators add `vulkan` (and any other flags you wish to use) to the target command line option: ``` -$ ./lesson_15_generate -g my_first_generator -o . target=host-vulkan +$ ./lesson_15_generate -g my_first_generator -o . target=host-vulkan-vk_int8-vk_int16 ``` For JIT apps use the `HL_JIT_TARGET` environment variable: ``` -$ HL_JIT_TARGET=host-vulkan ./tutorial/lesson_01_basics +$ HL_JIT_TARGET=host-vulkan-vk_int8-vk_int16 ./tutorial/lesson_01_basics ``` # Useful Environment Variables @@ -185,404 +199,31 @@ validated and disassembled via the SPIR-V tools: https://github.com/KhronosGroup/SPIRV-Tools -# Known Limitations And Caveats +# Current Status + +All correctness tests are now passing on tested configs for Linux & Windows using the target `host-vulkan-vk_int8-vk_int16-vk_int64-vk_float16-vk_float64-vk_v13` on LLVM v14.x. + +MacOS passes most tests but encounters internal MoltenVK code translation issues for wide vectors and ambiguous function calls. -- During CodeGen we enable capabilities in the SPIR-V binary for - Int8, Int16, Float16, and Float64 based on the Halide IR, assuming - the device will support these requirements. We may need to limit - these if targetting a lower class device. We may need to add Halide - feature flags to address these issues. +Most of the Python app and correctness tests are failing, but are being investigated now. -- When generating vector lengths greater than 8 components, we assume - Vector16 support is available. This may not be the case when running - on lower class devices. Same as above ... we may need a feature flag - to limit vector lengths to a specific class of device. +Android platform support is currently being worked on. + +# Caveats: + +- Other than 32-bit floats and integers, every other data type is optional per the Vulkan spec +- Float 64-bit types can be enabled, but there aren't any native math functions available in SPIR-V +- Only one dynamically sized shared memory allocation can be used, but any number of + fixed sized allocations are supported (up to the maximum amount allowed by the device) # Known TODO: -- Fix as many tests as possible -- Shared local memory and barriers need work -- More platform support (e.g.
Windows, Android, etc) +- Performance tuning of CodeGen and Runtime +- More platform support (Android is WIP, RISC-V, etc) +- Adapt unsupported types to supported types (if missing vk_int8 then promote to uint32_t)? - Better debugging utilities using the Vulkan debug hooks. - Allow debug symbols to be stripped from SPIR-V during codegen to reduce memory overhead for large kernels. - Investigate floating point rounding and precision (v1.3 adds more controls) -- Investigate memory model usage (are Halide's assumptions correct?) - -# Current Status +- Investigate memory model usage (can Halide gain anything from these?) -The following table outlines the state of the correctness tests (as of Oct-23) when -run with `HL_JIT_TARGET=host-vulkan` (NOTE: some tests may need additional -modifications to run under Vulkan): - -| Totals | *PASS* 🟢 | *FAIL* 🔴 | -| -- | -- | -- | -| | 340 | 25 | - -| Test Name | Status | -| :-- | --: | -| correctness_cse_nan | *PASS* 🟢 | -| correctness_simd_op_check_hvx | *PASS* 🟢 | -| correctness_compute_with_in | *PASS* 🟢 | -| correctness_extern_stage | *PASS* 🟢 | -| correctness_pipeline_set_jit_externs_func | *PASS* 🟢 | -| correctness_likely | *PASS* 🟢 | -| correctness_vector_bounds_inference | *PASS* 🟢 | -| correctness_inline_reduction | *PASS* 🟢 | -| correctness_unsafe_promises | *PASS* 🟢 | -| correctness_reorder_rvars | *PASS* 🟢 | -| correctness_lossless_cast | *PASS* 🟢 | -| correctness_gpu_reuse_shared_memory | *FAIL* 🔴 | -| correctness_boundary_conditions | *PASS* 🟢 | -| correctness_min_extent | *PASS* 🟢 | -| correctness_gpu_sum_scan | *PASS* 🟢 | -| correctness_dynamic_allocation_in_gpu_kernel | *FAIL* 🔴 | -| correctness_image_of_lists | *PASS* 🟢 | -| correctness_tracing_broadcast | *PASS* 🟢 | -| correctness_scatter | *PASS* 🟢 | -| correctness_stmt_to_html | *PASS* 🟢 | -| correctness_host_alignment | *PASS* 🟢 | -| correctness_custom_allocator | *PASS* 🟢 | -| correctness_issue_3926 | *PASS* 🟢 | -| correctness_compare_vars | *PASS* 🟢 | -| correctness_non_vector_aligned_embeded_buffer | *PASS* 🟢 | -| correctness_realize_larger_than_two_gigs | *PASS* 🟢 | -| correctness_gpu_transpose | *PASS* 🟢 | -| correctness_side_effects | *PASS* 🟢 | -| correctness_logical | *FAIL* 🔴 | -| correctness_func_lifetime_2 | *PASS* 🟢 | -| correctness_device_crop | *FAIL* 🔴 | -| correctness_print_loop_nest | *PASS* 🟢 | -| correctness_bool_compute_root_vectorize | *FAIL* 🔴 | -| correctness_extract_concat_bits | *PASS* 🟢 | -| correctness_dead_realization_in_specialization | *PASS* 🟢 | -| correctness_undef | *PASS* 🟢 | -| correctness_growing_stack | *PASS* 🟢 | -| correctness_parallel_scatter | *PASS* 🟢 | -| correctness_multi_splits_with_diff_tail_strategies | *PASS* 🟢 | -| correctness_gpu_arg_types | *PASS* 🟢 | -| correctness_cascaded_filters | *PASS* 🟢 | -| correctness_trim_no_ops | *PASS* 🟢 | -| correctness_float16_t_comparison | *PASS* 🟢 | -| correctness_legal_race_condition | *PASS* 🟢 | -| correctness_explicit_inline_reductions | *PASS* 🟢 | -| correctness_vector_tile | *PASS* 🟢 | -| correctness_skip_stages_memoize | *PASS* 🟢 | -| correctness_intrinsics | *PASS* 🟢 | -| correctness_strict_float | *PASS* 🟢 | -| correctness_bounds_query | *PASS* 🟢 | -| correctness_vector_reductions | *PASS* 🟢 | -| correctness_custom_lowering_pass | *PASS* 🟢 | -| correctness_gpu_assertion_in_kernel | *PASS* 🟢 | -| correctness_low_bit_depth_noise | *PASS* 🟢 | -| correctness_fuse | *PASS* 🟢 | -| correctness_vector_cast | *PASS* 🟢 | -| correctness_concat | *PASS* 🟢 | -| correctness_mod | *PASS* 🟢 | -| 
correctness_parallel_rvar | *PASS* 🟢 | -| correctness_make_struct | *PASS* 🟢 | -| correctness_reduction_predicate_racing | *PASS* 🟢 | -| correctness_bounds_inference_chunk | *PASS* 🟢 | -| correctness_realize_over_shifted_domain | *PASS* 🟢 | -| correctness_compute_at_split_rvar | *PASS* 🟢 | -| correctness_split_fuse_rvar | *PASS* 🟢 | -| correctness_memoize_cloned | *PASS* 🟢 | -| correctness_failed_unroll | *PASS* 🟢 | -| correctness_gpu_vectorized_shared_memory | *PASS* 🟢 | -| correctness_bounds_inference_complex | *PASS* 🟢 | -| correctness_widening_reduction | *FAIL* 🔴 | -| correctness_extern_partial | *PASS* 🟢 | -| correctness_multi_output_pipeline_with_bad_sizes | *PASS* 🟢 | -| correctness_hoist_loop_invariant_if_statements | *PASS* 🟢 | -| correctness_extern_sort | *PASS* 🟢 | -| correctness_multiple_outputs_extern | *PASS* 🟢 | -| correctness_tracing_bounds | *PASS* 🟢 | -| correctness_gpu_object_lifetime_1 | *PASS* 🟢 | -| correctness_nested_tail_strategies | *PASS* 🟢 | -| correctness_parallel_reductions | *PASS* 🟢 | -| correctness_custom_error_reporter | *PASS* 🟢 | -| correctness_many_dimensions | *PASS* 🟢 | -| correctness_predicated_store_load | *PASS* 🟢 | -| correctness_random | *PASS* 🟢 | -| correctness_partition_loops_bug | *PASS* 🟢 | -| correctness_stencil_chain_in_update_definitions | *PASS* 🟢 | -| correctness_inverse | *FAIL* 🔴 | -| correctness_skip_stages | *PASS* 🟢 | -| correctness_cuda_8_bit_dot_product | *PASS* 🟢 | -| correctness_gpu_vectorize | *PASS* 🟢 | -| correctness_gpu_object_lifetime_3 | *PASS* 🟢 | -| correctness_histogram | *PASS* 🟢 | -| correctness_shared_self_references | *PASS* 🟢 | -| correctness_gpu_mixed_shared_mem_types | *FAIL* 🔴 | -| correctness_custom_cuda_context | *PASS* 🟢 | -| correctness_implicit_args_tests | *PASS* 🟢 | -| correctness_compile_to_lowered_stmt | *PASS* 🟢 | -| correctness_bounds_of_func | *PASS* 🟢 | -| correctness_interleave_rgb | *FAIL* 🔴 | -| correctness_multi_gpu_gpu_multi_device | *PASS* 🟢 | -| correctness_lambda | *PASS* 🟢 | -| correctness_interval | *PASS* 🟢 | -| correctness_unused_func | *PASS* 🟢 | -| correctness_fuzz_float_stores | *PASS* 🟢 | -| correctness_newtons_method | *PASS* 🟢 | -| correctness_compile_to_bitcode | *PASS* 🟢 | -| correctness_lazy_convolution | *PASS* 🟢 | -| correctness_image_wrapper | *PASS* 🟢 | -| correctness_reduction_chain | *PASS* 🟢 | -| correctness_storage_folding | *PASS* 🟢 | -| correctness_reorder_storage | *PASS* 🟢 | -| correctness_bit_counting | *PASS* 🟢 | -| correctness_tiled_matmul | *PASS* 🟢 | -| correctness_async_device_copy | *FAIL* 🔴 | -| correctness_lots_of_dimensions | *PASS* 🟢 | -| correctness_interleave | *PASS* 🟢 | -| correctness_dynamic_reduction_bounds | *PASS* 🟢 | -| correctness_atomic_tuples | *PASS* 🟢 | -| correctness_named_updates | *PASS* 🟢 | -| correctness_unroll_dynamic_loop | *PASS* 🟢 | -| correctness_buffer_t | *PASS* 🟢 | -| correctness_hello_gpu | *PASS* 🟢 | -| correctness_gpu_object_lifetime_2 | *PASS* 🟢 | -| correctness_update_chunk | *PASS* 🟢 | -| correctness_autodiff | *PASS* 🟢 | -| correctness_extern_consumer | *PASS* 🟢 | -| correctness_func_wrapper | *PASS* 🟢 | -| correctness_bounds_of_multiply | *PASS* 🟢 | -| correctness_gpu_store_in_register_with_no_lanes_loop | *PASS* 🟢 | -| correctness_gpu_condition_lifting | *PASS* 🟢 | -| correctness_extern_consumer_tiled | *PASS* 🟢 | -| correctness_float16_t_neon_op_check | *PASS* 🟢 | -| correctness_split_by_non_factor | *PASS* 🟢 | -| correctness_parallel_fork | *PASS* 🟢 | -| correctness_hexagon_scatter | *PASS* 🟢 | -| 
correctness_partition_loops | *PASS* 🟢 | -| correctness_process_some_tiles | *PASS* 🟢 | -| correctness_parameter_constraints | *PASS* 🟢 | -| correctness_callable | *PASS* 🟢 | -| correctness_bounds_inference | *PASS* 🟢 | -| correctness_indexing_access_undef | *PASS* 🟢 | -| correctness_partial_realization | *PASS* 🟢 | -| correctness_gpu_mixed_dimensionality | *FAIL* 🔴 | -| correctness_uninitialized_read | *PASS* 🟢 | -| correctness_unsafe_dedup_lets | *PASS* 🟢 | -| correctness_output_larger_than_two_gigs | *PASS* 🟢 | -| correctness_obscure_image_references | *PASS* 🟢 | -| correctness_chunk | *PASS* 🟢 | -| correctness_vectorized_load_from_vectorized_allocation | *PASS* 🟢 | -| correctness_load_library | *PASS* 🟢 | -| correctness_compute_inside_guard | *PASS* 🟢 | -| correctness_multi_pass_reduction | *PASS* 🟢 | -| correctness_lerp | *PASS* 🟢 | -| correctness_realize_condition_depends_on_tuple | *PASS* 🟢 | -| correctness_vectorized_initialization | *PASS* 🟢 | -| correctness_loop_level_generator_param | *PASS* 🟢 | -| correctness_two_vector_args | *PASS* 🟢 | -| correctness_argmax | *FAIL* 🔴 | -| correctness_custom_auto_scheduler | *PASS* 🟢 | -| correctness_shadowed_bound | *PASS* 🟢 | -| correctness_inlined_generator | *PASS* 🟢 | -| correctness_math | *FAIL* 🔴 | -| correctness_gpu_different_blocks_threads_dimensions | *PASS* 🟢 | -| correctness_extern_stage_on_device | *PASS* 🟢 | -| correctness_bound | *PASS* 🟢 | -| correctness_popc_clz_ctz_bounds | *PASS* 🟢 | -| correctness_bounds | *PASS* 🟢 | -| correctness_prefetch | *PASS* 🟢 | -| correctness_force_onto_stack | *PASS* 🟢 | -| correctness_input_image_bounds_check | *PASS* 🟢 | -| correctness_sort_exprs | *PASS* 🟢 | -| correctness_let_in_rdom_bound | *PASS* 🟢 | -| correctness_func_lifetime | *PASS* 🟢 | -| correctness_compute_outermost | *PASS* 🟢 | -| correctness_histogram_equalize | *PASS* 🟢 | -| correctness_func_clone | *PASS* 🟢 | -| correctness_tracing_stack | *PASS* 🟢 | -| correctness_simplify | *PASS* 🟢 | -| correctness_gameoflife | *PASS* 🟢 | -| correctness_thread_safety | *PASS* 🟢 | -| correctness_fuse_gpu_threads | *PASS* 🟢 | -| correctness_split_reuse_inner_name_bug | *PASS* 🟢 | -| correctness_gpu_jit_explicit_copy_to_device | *PASS* 🟢 | -| correctness_tuple_select | *PASS* 🟢 | -| correctness_device_buffer_copy | *FAIL* 🔴 | -| correctness_pseudostack_shares_slots | *PASS* 🟢 | -| correctness_lots_of_loop_invariants | *PASS* 🟢 | -| correctness_fuzz_simplify | *PASS* 🟢 | -| correctness_div_round_to_zero | *PASS* 🟢 | -| correctness_rfactor | *PASS* 🟢 | -| correctness_custom_jit_context | *PASS* 🟢 | -| correctness_round | *FAIL* 🔴 | -| correctness_device_slice | *FAIL* 🔴 | -| correctness_iterate_over_circle | *PASS* 🟢 | -| correctness_vector_print_bug | *PASS* 🟢 | -| correctness_mux | *PASS* 🟢 | -| correctness_vectorize_varying_allocation_size | *PASS* 🟢 | -| correctness_parallel_nested_1 | *PASS* 🟢 | -| correctness_compile_to_multitarget | *PASS* 🟢 | -| correctness_bounds_inference_outer_split | *PASS* 🟢 | -| correctness_leak_device_memory | *PASS* 🟢 | -| correctness_reduction_schedule | *PASS* 🟢 | -| correctness_many_small_extern_stages | *PASS* 🟢 | -| correctness_parallel_alloc | *PASS* 🟢 | -| correctness_multiple_outputs | *PASS* 🟢 | -| correctness_vectorize_nested | *PASS* 🟢 | -| correctness_bad_likely | *PASS* 🟢 | -| correctness_sliding_reduction | *PASS* 🟢 | -| correctness_bounds_of_split | *PASS* 🟢 | -| correctness_erf | *PASS* 🟢 | -| correctness_float16_t_image_type | *PASS* 🟢 | -| correctness_gpu_non_monotonic_shared_mem_size | *FAIL* 🔴 | 
-| correctness_extern_reorder_storage | *PASS* 🟢 | -| correctness_gather | *PASS* 🟢 | -| correctness_gpu_many_kernels | *PASS* 🟢 | -| correctness_early_out | *PASS* 🟢 | -| correctness_strict_float_bounds | *PASS* 🟢 | -| correctness_bounds_of_abs | *PASS* 🟢 | -| correctness_tuple_vector_reduce | *PASS* 🟢 | -| correctness_debug_to_file_reorder | *PASS* 🟢 | -| correctness_vectorized_reduction_bug | *PASS* 🟢 | -| correctness_input_larger_than_two_gigs | *PASS* 🟢 | -| correctness_computed_index | *PASS* 🟢 | -| correctness_reduction_non_rectangular | *PASS* 🟢 | -| correctness_left_shift_negative | *PASS* 🟢 | -| correctness_set_custom_trace | *PASS* 🟢 | -| correctness_vectorized_gpu_allocation | *PASS* 🟢 | -| correctness_split_store_compute | *PASS* 🟢 | -| correctness_c_function | *PASS* 🟢 | -| correctness_specialize | *PASS* 🟢 | -| correctness_nested_shiftinwards | *PASS* 🟢 | -| correctness_assertion_failure_in_parallel_for | *PASS* 🟢 | -| correctness_plain_c_includes | *PASS* 🟢 | -| correctness_stream_compaction | *PASS* 🟢 | -| correctness_async | *PASS* 🟢 | -| correctness_atomics | *PASS* 🟢 | -| correctness_multipass_constraints | *PASS* 🟢 | -| correctness_target | *PASS* 🟢 | -| correctness_tuple_reduction | *PASS* 🟢 | -| correctness_dilate3x3 | *PASS* 🟢 | -| correctness_image_io | *PASS* 🟢 | -| correctness_gpu_param_allocation | *FAIL* 🔴 | -| correctness_reschedule | *PASS* 🟢 | -| correctness_isnan | *PASS* 🟢 | -| correctness_halide_buffer | *PASS* 🟢 | -| correctness_bounds_of_cast | *PASS* 🟢 | -| correctness_handle | *PASS* 🟢 | -| correctness_param | *PASS* 🟢 | -| correctness_saturating_casts | *PASS* 🟢 | -| correctness_extern_producer | *PASS* 🟢 | -| correctness_shift_by_unsigned_negated | *PASS* 🟢 | -| correctness_circular_reference_leak | *PASS* 🟢 | -| correctness_specialize_to_gpu | *PASS* 🟢 | -| correctness_device_copy_at_inner_loop | *PASS* 🟢 | -| correctness_fit_function | *PASS* 🟢 | -| correctness_compute_at_reordered_update_stage | *PASS* 🟢 | -| correctness_non_nesting_extern_bounds_query | *PASS* 🟢 | -| correctness_bitwise_ops | *PASS* 🟢 | -| correctness_gpu_data_flows | *PASS* 🟢 | -| correctness_cast | *PASS* 🟢 | -| correctness_stack_allocations | *PASS* 🟢 | -| correctness_sliding_backwards | *PASS* 🟢 | -| correctness_float16_t | *PASS* 🟢 | -| correctness_simd_op_check | *PASS* 🟢 | -| correctness_typed_func | *PASS* 🟢 | -| correctness_tuple_partial_update | *PASS* 🟢 | -| correctness_heap_cleanup | *PASS* 🟢 | -| correctness_implicit_args | *PASS* 🟢 | -| correctness_deferred_loop_level | *PASS* 🟢 | -| correctness_interleave_x | *PASS* 🟢 | -| correctness_fuzz_bounds | *PASS* 🟢 | -| correctness_strided_load | *PASS* 🟢 | -| correctness_bound_storage | *PASS* 🟢 | -| correctness_gpu_cpu_simultaneous_read | *PASS* 🟢 | -| correctness_fast_trigonometric | *PASS* 🟢 | -| correctness_compute_with | *PASS* 🟢 | -| correctness_gpu_allocation_cache | *PASS* 🟢 | -| correctness_compile_to | *PASS* 🟢 | -| correctness_extern_output_expansion | *PASS* 🟢 | -| correctness_gpu_texture | *PASS* 🟢 | -| correctness_many_updates | *PASS* 🟢 | -| correctness_memoize | *PASS* 🟢 | -| correctness_gpu_multi_kernel | *PASS* 🟢 | -| correctness_extern_error | *PASS* 🟢 | -| correctness_partition_max_filter | *PASS* 🟢 | -| correctness_bound_small_allocations | *PASS* 🟢 | -| correctness_median3x3 | *PASS* 🟢 | -| correctness_reuse_stack_alloc | *PASS* 🟢 | -| correctness_debug_to_file | *PASS* 🟢 | -| correctness_embed_bitcode | *PASS* 🟢 | -| correctness_gpu_large_alloc | *PASS* 🟢 | -| correctness_pytorch | *PASS* 🟢 | -| 
correctness_in_place | *PASS* 🟢 | -| correctness_exception | *PASS* 🟢 | -| correctness_python_extension_gen | *PASS* 🟢 | -| correctness_cross_compilation | *PASS* 🟢 | -| correctness_extern_bounds_inference | *PASS* 🟢 | -| correctness_bounds_of_monotonic_math | *PASS* 🟢 | -| correctness_loop_invariant_extern_calls | *PASS* 🟢 | -| correctness_skip_stages_external_array_functions | *PASS* 🟢 | -| correctness_chunk_sharing | *PASS* 🟢 | -| correctness_multi_way_select | *PASS* 🟢 | -| correctness_async_copy_chain | *FAIL* 🔴 | -| correctness_gpu_give_input_buffers_device_allocations | *PASS* 🟢 | -| correctness_oddly_sized_output | *PASS* 🟢 | -| correctness_fuzz_cse | *PASS* 🟢 | -| correctness_half_native_interleave | *PASS* 🟢 | -| correctness_introspection | *PASS* 🟢 | -| correctness_callable_generator | *PASS* 🟢 | -| correctness_fused_where_inner_extent_is_zero | *PASS* 🟢 | -| correctness_tuple_update_ops | *PASS* 🟢 | -| correctness_constraints | *PASS* 🟢 | -| correctness_multiple_scatter | *PASS* 🟢 | -| correctness_unrolled_reduction | *PASS* 🟢 | -| correctness_tracing | *PASS* 🟢 | -| correctness_simplified_away_embedded_image | *PASS* 🟢 | -| correctness_mul_div_mod | *FAIL* 🔴 | -| correctness_infer_arguments | *PASS* 🟢 | -| correctness_convolution | *FAIL* 🔴 | -| correctness_truncated_pyramid | *PASS* 🟢 | -| correctness_for_each_element | *PASS* 🟢 | -| correctness_store_in | *PASS* 🟢 | -| correctness_transitive_bounds | *PASS* 🟢 | -| correctness_vectorize_guard_with_if | *PASS* 🟢 | -| correctness_widening_lerp | *PASS* 🟢 | -| correctness_cast_handle | *PASS* 🟢 | -| correctness_tuple_undef | *PASS* 🟢 | -| correctness_partial_application | *PASS* 🟢 | -| correctness_vectorize_mixed_widths | *PASS* 🟢 | -| correctness_print | *PASS* 🟢 | -| correctness_fibonacci | *PASS* 🟢 | -| correctness_parallel_nested | *PASS* 🟢 | -| correctness_sliding_window | *PASS* 🟢 | -| correctness_integer_powers | *PASS* 🟢 | -| correctness_unique_func_image | *PASS* 🟢 | -| correctness_constant_type | *PASS* 🟢 | -| correctness_shifted_image | *PASS* 🟢 | -| correctness_vector_extern | *PASS* 🟢 | -| correctness_compute_with_inlined | *PASS* 🟢 | -| correctness_param_map | *PASS* 🟢 | -| correctness_float16_t_constants | *PASS* 🟢 | -| correctness_callable_typed | *PASS* 🟢 | -| correctness_unroll_huge_mux | *PASS* 🟢 | -| correctness_parallel | *PASS* 🟢 | -| correctness_code_explosion | *PASS* 🟢 | -| correctness_gpu_dynamic_shared | *PASS* 🟢 | -| correctness_div_by_zero | *PASS* 🟢 | -| correctness_convolution_multiple_kernels | *FAIL* 🔴 | -| correctness_deinterleave4 | *PASS* 🟢 | -| correctness_align_bounds | *PASS* 🟢 | -| correctness_gpu_bounds_inference_failure | *PASS* 🟢 | -| correctness_interpreter | *FAIL* 🔴 | -| correctness_parallel_gpu_nested | *PASS* 🟢 | -| correctness_gpu_thread_barrier | *FAIL* 🔴 | -| correctness_debug_to_file_multiple_outputs | *PASS* 🟢 | -| correctness_gpu_free_sync | *PASS* 🟢 | -| correctness_out_constraint | *PASS* 🟢 | -| correctness_gpu_specialize | *FAIL* 🔴 | -| correctness_register_shuffle | *PASS* 🟢 | -| correctness_constant_expr | *PASS* 🟢 | -| correctness_out_of_memory | *PASS* 🟢 | -| correctness_gpu_non_contiguous_copy | *PASS* 🟢 | -| correctness_sliding_over_guard_with_if | *PASS* 🟢 | -| correctness_vector_math | *PASS* 🟢 | -| correctness_require | *PASS* 🟢 | -| correctness_callable_errors | *PASS* 🟢 | diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index 15248c5f6a11..742df23231f9 100644 --- 
a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -186,6 +186,14 @@ void define_enums(py::module &m) { .value("ProfileByTimer", Target::Feature::ProfileByTimer) .value("SPIRV", Target::Feature::SPIRV) .value("Vulkan", Target::Feature::Vulkan) + .value("VulkanInt8", Target::VulkanInt8) + .value("VulkanInt16", Target::VulkanInt16) + .value("VulkanInt64", Target::VulkanInt64) + .value("VulkanFloat16", Target::VulkanFloat16) + .value("VulkanFloat64", Target::VulkanFloat64) + .value("VulkanV10", Target::VulkanV10) + .value("VulkanV12", Target::VulkanV12) + .value("VulkanV13", Target::VulkanV13) .value("FeatureEnd", Target::Feature::FeatureEnd); py::enum_(m, "TypeCode") diff --git a/src/BoundSmallAllocations.cpp b/src/BoundSmallAllocations.cpp index 227da06c378f..ae492d00d908 100644 --- a/src/BoundSmallAllocations.cpp +++ b/src/BoundSmallAllocations.cpp @@ -76,8 +76,6 @@ class BoundSmallAllocations : public IRMutator { bool must_be_constant(MemoryType memory_type) const { return (memory_type == MemoryType::Register || (device_api == DeviceAPI::OpenGLCompute && - memory_type == MemoryType::GPUShared) || - (device_api == DeviceAPI::Vulkan && memory_type == MemoryType::GPUShared)); } @@ -134,13 +132,6 @@ class BoundSmallAllocations : public IRMutator { << "Only fixed-size allocations can be stored in shared memory " << "in OpenGL compute shaders. Try storing in MemoryType::Heap " << "instead."; - - user_assert(!(device_api == DeviceAPI::Vulkan && - op->memory_type == MemoryType::GPUShared)) - << "Allocation " << op->name << " has a dynamic size. " - << "Only fixed-size allocations can be stored in shared memory " - << "in Vulkan compute shaders. Try storing in MemoryType::Heap " - << "instead."; } const int64_t *size_ptr = bound.defined() ? 
as_const_int(bound) : nullptr; diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 5aa030a4ab38..933a9ca74549 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -64,7 +64,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { class SPIRV_Emitter : public IRVisitor { public: - SPIRV_Emitter() = default; + SPIRV_Emitter(Target t); using IRVisitor::visit; @@ -133,6 +133,8 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // Workgroup size void reset_workgroup_size(); + void find_workgroup_size(const Stmt &s); + void declare_workgroup_size(SpvId kernel_func_id); void declare_entry_point(const Stmt &s, SpvId kernel_func_id); void declare_device_args(const Stmt &s, uint32_t entry_point_index, const std::string &kernel_name, const std::vector &args); @@ -221,18 +223,45 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // Map from a variable ID to its corresponding storage type definition struct StorageAccess { SpvStorageClass storage_class = SpvStorageClassMax; + uint32_t storage_array_size = 0; // zero if not an array SpvId storage_type_id = SpvInvalidId; Type storage_type; }; using StorageAccessMap = std::unordered_map; StorageAccessMap storage_access_map; + // Defines the binding information for a specialization constant + // that is exported by the module and can be overriden at runtime + struct SpecializationBinding { + SpvId constant_id = 0; + uint32_t type_size = 0; + std::string constant_name; + }; + using SpecializationConstants = std::vector; + + // Defines a shared memory allocation + struct SharedMemoryAllocation { + SpvId constant_id = 0; // specialization constant to dynamically adjust array size (zero if not used) + uint32_t array_size = 0; + uint32_t type_size = 0; + std::string variable_name; + }; + using SharedMemoryUsage = std::vector; + + // Defines the specialization constants used for dynamically overiding the dispatch size + struct WorkgroupSizeBinding { + SpvId local_size_constant_id[3] = {0, 0, 0}; // zero if unused + }; + // Keep track of the descriptor sets so we can add a sidecar to the // module indicating which descriptor set to use for each entry point struct DescriptorSet { std::string entry_point_name; uint32_t uniform_buffer_count = 0; uint32_t storage_buffer_count = 0; + SpecializationConstants specialization_constants; + SharedMemoryUsage shared_memory_usage; + WorkgroupSizeBinding workgroup_size_binding; }; using DescriptorSetTable = std::vector; DescriptorSetTable descriptor_set_table; @@ -243,6 +272,9 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // Current index of kernel for module uint32_t kernel_index = 0; + // Target for codegen + Target target; + } emitter; std::string current_kernel_name; @@ -341,6 +373,58 @@ class CheckAlignedDenseVectorLoadStore : public IRVisitor { } }; +struct FindWorkGroupSize : public IRVisitor { + using IRVisitor::visit; + void visit(const For *loop) override { + if (!CodeGen_GPU_Dev::is_gpu_var(loop->name)) { + return loop->body.accept(this); + } + + if ((loop->for_type == ForType::GPUBlock) || + (loop->for_type == ForType::GPUThread)) { + + // This should always be true at this point in codegen + internal_assert(is_const_zero(loop->min)); + + // Save & validate the workgroup size + int index = thread_loop_workgroup_index(loop->name); + if (index >= 0) { + const IntImm *literal = loop->extent.as(); + if (literal != nullptr) { + uint32_t new_wg_size = literal->value; + user_assert(workgroup_size[index] == 0 || workgroup_size[index] == new_wg_size) + << "Vulkan 
requires all kernels have the same workgroup size, " + << "but two different sizes were encountered: " + << workgroup_size[index] << " and " + << new_wg_size << " in dimension " << index << "\n"; + workgroup_size[index] = new_wg_size; + } + } + debug(4) << "Thread group size for index " << index << " is " << workgroup_size[index] << "\n"; + } + loop->body.accept(this); + } + + int thread_loop_workgroup_index(const std::string &name) { + std::string ids[] = {".__thread_id_x", + ".__thread_id_y", + ".__thread_id_z"}; + for (size_t i = 0; i < sizeof(ids) / sizeof(std::string); i++) { + if (ends_with(name, ids[i])) { + return i; + } + } + return -1; + } + + uint32_t workgroup_size[3] = {0, 0, 0}; +}; + +CodeGen_Vulkan_Dev::SPIRV_Emitter::SPIRV_Emitter(Target t) + : IRVisitor(), target(t) { + // Empty +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(const Expr &e) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize(): " << (Expr)e << "\n"; internal_assert(e.type().is_vector()) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::scalarize must be called with an expression of vector type.\n"; @@ -1565,18 +1649,6 @@ std::pair simt_intrinsic(const std::string &name) { return {"", -1}; } -int thread_loop_workgroup_index(const std::string &name) { - std::string ids[] = {".__thread_id_x", - ".__thread_id_y", - ".__thread_id_z"}; - for (size_t i = 0; i < sizeof(ids) / sizeof(std::string); i++) { - if (ends_with(name, ids[i])) { - return i; - } - } - return -1; -} - } // anonymous namespace void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { @@ -1586,21 +1658,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { internal_assert((op->for_type == ForType::GPUBlock) || (op->for_type == ForType::GPUThread)) << "kernel loops must be either gpu block or gpu thread\n"; + // This should always be true at this point in codegen internal_assert(is_const_zero(op->min)); - - // Save & validate the workgroup size - int idx = thread_loop_workgroup_index(op->name); - if (idx >= 0) { - const IntImm *wsize = op->extent.as(); - user_assert(wsize != nullptr) << "Vulkan requires statically-known workgroup size.\n"; - uint32_t new_wsize = wsize->value; - user_assert(workgroup_size[idx] == 0 || workgroup_size[idx] == new_wsize) << "Vulkan requires all kernels have the same workgroup size, but two different ones " - "were encountered " - << workgroup_size[idx] << " and " << new_wsize << " in dimension " << idx << "\n"; - workgroup_size[idx] = new_wsize; - } - auto intrinsic = simt_intrinsic(op->name); // Intrinsics are inserted when adding the kernel @@ -1764,34 +1824,69 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { SpvId storage_type_id = builder.declare_type(op->type); SpvId array_type_id = SpvInvalidId; SpvId variable_id = SpvInvalidId; + uint32_t array_size = 0; + SpvStorageClass storage_class = SpvStorageClassGeneric; if (op->memory_type == MemoryType::GPUShared) { - // Allocation of shared memory must be declared at global scope - user_assert(op->extents.size() == 1 && is_const(op->extents[0])) - << "Allocation " << op->name << " has a dynamic size. 
" - << "Only fixed-size allocations are supported with Vulkan."; - int32_t size = op->constant_allocation_size(); - array_type_id = builder.declare_type(op->type, size); + // Allocation of shared memory must be declared at global scope storage_class = SpvStorageClassWorkgroup; // shared across workgroup - debug(2) << "Vulkan: Allocate " << op->name << " type=" << op->type << " size=" << (uint32_t)size << " in shared memory on device in global scope\n"; std::string variable_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + op->name; + uint32_t type_size = op->type.bytes(); + uint32_t constant_id = 0; + + // static fixed size allocation + if (op->extents.size() == 1 && is_const(op->extents[0])) { + array_size = op->constant_allocation_size(); + array_type_id = builder.declare_type(op->type, array_size); + builder.add_symbol(variable_name + "_array_type", array_type_id, builder.current_module().id()); + debug(2) << "Vulkan: Allocate (fixed-size) " << op->name << " type=" << op->type << " array_size=" << (uint32_t)array_size << " in shared memory on device in global scope\n"; + + } else { + // dynamic allocation with unknown size at compile time ... + + // declare the array size as a specialization constant (which will get overridden at runtime) + Type array_size_type = UInt(32); + array_size = std::max(workgroup_size[0], uint32_t(1)); // use one item per workgroup as an initial guess + SpvId array_size_id = builder.declare_specialization_constant(array_size_type, &array_size); + array_type_id = builder.add_array_with_default_size(storage_type_id, array_size_id); + builder.add_symbol(variable_name + "_array_type", array_type_id, builder.current_module().id()); + + debug(2) << "Vulkan: Allocate (dynamic size) " << op->name << " type=" << op->type << " default_size=" << (uint32_t)array_size << " in shared memory on device in global scope\n"; + + // bind the specialization constant to the next slot + std::string constant_name = variable_name + "_array_size"; + constant_id = (uint32_t)(descriptor_set_table.back().specialization_constants.size() + 1); + SpvBuilder::Literals spec_id = {constant_id}; + builder.add_annotation(array_size_id, SpvDecorationSpecId, spec_id); + builder.add_symbol(constant_name, array_size_id, builder.current_module().id()); + + // update the descriptor set with the specialization binding + SpecializationBinding spec_binding = {constant_id, (uint32_t)array_size_type.bytes(), constant_name}; + descriptor_set_table.back().specialization_constants.push_back(spec_binding); + } + + // add the shared memory allocation to the descriptor set + SharedMemoryAllocation shared_mem_allocation = {constant_id, array_size, type_size, variable_name}; + descriptor_set_table.back().shared_memory_usage.push_back(shared_mem_allocation); + + // declare the variable SpvId ptr_type_id = builder.declare_pointer_type(array_type_id, storage_class); variable_id = builder.declare_global_variable(variable_name, ptr_type_id, storage_class); } else { // Allocation is not a shared memory allocation, just make a local declaration. - int32_t size = op->constant_allocation_size(); + array_size = op->constant_allocation_size(); // It must have a constant size. - user_assert(size > 0) + user_assert(array_size > 0) << "Allocation " << op->name << " has a dynamic size. 
" - << "Only fixed-size allocations are supported with Vulkan."; + << "Only fixed-size local allocations are supported with Vulkan."; - debug(2) << "Vulkan: Allocate " << op->name << " type=" << op->type << " size=" << (uint32_t)size << " on device in function scope\n"; + debug(2) << "Vulkan: Allocate " << op->name << " type=" << op->type << " size=" << (uint32_t)array_size << " on device in function scope\n"; - array_type_id = builder.declare_type(op->type, size); + array_type_id = builder.declare_type(op->type, array_size); storage_class = SpvStorageClassFunction; // function scope std::string variable_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + op->name; SpvId ptr_type_id = builder.declare_pointer_type(array_type_id, storage_class); @@ -1800,6 +1895,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { StorageAccess access; access.storage_class = storage_class; + access.storage_array_size = array_size; access.storage_type_id = storage_type_id; access.storage_type = op->type; storage_access_map[variable_id] = access; @@ -2152,6 +2248,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { reset(); + if (target.has_feature(Target::VulkanV13)) { + // Encode to SPIR-V v1.2 to allow dynamic dispatching (if needed) + builder.set_version_format(0x00010200); + } else { + // Encode to SPIR-V v1.0 (which is the only format supported by Vulkan v1.0) + builder.set_version_format(0x00010000); + } + // NOTE: Source language is irrelevant. We encode the binary directly builder.set_source_language(SpvSourceLanguageUnknown); @@ -2167,51 +2271,155 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { // NOTE: Extensions are handled in finalize } +namespace { + +std::vector encode_header_string(const std::string &str) { + uint32_t padded_word_count = (str.length() / 4) + 1; // add an extra entry to ensure strings are terminated + uint32_t padded_str_length = padded_word_count * 4; + std::vector encoded_string(padded_str_length, '\0'); + for (uint32_t c = 0; c < str.length(); c++) { + encoded_string[c] = str[c]; + } + return encoded_string; +} + +} // namespace + void CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header(SpvBinary &spirv_header) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header\n"; // Encode a sidecar for the module that lists the descriptor sets - // corresponding to each entry point contained in the module + // corresponding to each entry point contained in the module. + // + // This metadata will be used at runtime to define the shader bindings + // needed for all buffers, constants, shared memory, and workgroup sizes + // that are required for execution. + // + // Like the SPIR-V code module, each entry is one word (1x uint32_t). + // Variable length sections are prefixed with their length (ie number of entries). // // [0] Header word count (total length of header) // [1] Number of descriptor sets // ... For each descriptor set ... - // ... [0] Number of uniform buffers for this descriptor set - // ... [1] Number of storage buffers for this descriptor set - // ... [2] Length of entry point name (padded to nearest word size) - // ... [X] Entry point string data + // ... [0] Length of entry point name (padded to nearest word size) + // ....... [*] Entry point string data (padded with null chars) + // ... [1] Number of uniform buffers for this descriptor set + // ... [2] Number of storage buffers for this descriptor set + // ... [3] Number of specialization constants for this descriptor set + // ....... 
For each specialization constant ... + // ....... [0] Length of constant name string (padded to nearest word size) + // ........... [*] Constant name string data (padded with null chars) + // ....... [1] Constant id (as used in VkSpecializationMapEntry for binding) + // ....... [2] Size of data type (in bytes) + // ... [4] Number of shared memory allocations for this descriptor set + // ....... For each allocation ... + // ....... [0] Length of variable name string (padded to nearest word size) + // ........... [*] Variable name string data (padded with null chars) + // ....... [1] Constant id to use for overriding array size (zero if it is not bound to a specialization constant) + // ....... [2] Size of data type (in bytes) + // ....... [3] Size of array (ie element count) + // ... [4] Dynamic workgroup dimensions bound to specialization constants + // ....... [0] Constant id to use for local_size_x (zero if it was statically declared and not bound to a specialization constant) + // ....... [1] Constant id to use for local_size_y + // ....... [2] Constant id ot use for local_size_z // - - // NOTE: The Vulkan runtime consumes this header prior to compiling. + // NOTE: Halide's Vulkan runtime consumes this header prior to compiling. // - // Both vk_decode_entry_point_data() and vk_compile_shader_module() will + // Both vk_decode_shader_bindings() and vk_compile_shader_module() will // need to be updated if the header encoding ever changes! // uint32_t index = 0; spirv_header.push_back(descriptor_set_table.size()); for (const DescriptorSet &ds : descriptor_set_table) { - std::vector padded_name; - uint32_t padded_word_count = (ds.entry_point_name.length() + 3) / 4; - uint32_t padded_str_length = padded_word_count * 4; - padded_name.reserve(padded_str_length); - padded_name.insert(padded_name.begin(), ds.entry_point_name.c_str(), (ds.entry_point_name.c_str() + ds.entry_point_name.length())); - uint32_t padding = (padded_str_length - ds.entry_point_name.length()); - for (uint32_t i = 0; i < padding; ++i) { - padded_name.push_back('\0'); - } + + // encode the entry point name into an array of chars (padded to the next word entry) + std::vector entry_point_name = encode_header_string(ds.entry_point_name); + uint32_t entry_point_name_entries = (uint32_t)(entry_point_name.size() / sizeof(uint32_t)); debug(2) << " [" << index << "] " << "uniform_buffer_count=" << ds.uniform_buffer_count << " " << "storage_buffer_count=" << ds.storage_buffer_count << " " - << "entry_point_name_length=" << padded_str_length << " " - << "entry_point_name_size=" << padded_name.size() << " " - << "entry_point_name: " << (const char *)padded_name.data() << "\n"; + << "entry_point_name_size=" << entry_point_name.size() << " " + << "entry_point_name: " << (const char *)entry_point_name.data() << "\n"; + + // [0] Length of entry point name (padded to nearest word size) + spirv_header.push_back(entry_point_name_entries); + // [*] Entry point string data (padded with null chars) + spirv_header.insert(spirv_header.end(), (const uint32_t *)entry_point_name.data(), (const uint32_t *)(entry_point_name.data() + entry_point_name.size())); + + // [1] Number of uniform buffers for this descriptor set spirv_header.push_back(ds.uniform_buffer_count); + + // [2] Number of storage buffers for this descriptor set spirv_header.push_back(ds.storage_buffer_count); - spirv_header.push_back(padded_str_length); - internal_assert(padded_name.size() == padded_str_length); - spirv_header.insert(spirv_header.end(), (const uint32_t 
*)padded_name.data(), (const uint32_t *)(padded_name.data() + padded_name.size())); + + // [3] Number of specialization constants for this descriptor set + spirv_header.push_back((uint32_t)ds.specialization_constants.size()); + debug(2) << " specialization_count=" << (uint32_t)ds.specialization_constants.size() << "\n"; + + // For each specialization constant ... + for (const SpecializationBinding &spec_binding : ds.specialization_constants) { + + // encode the constant name into an array of chars (padded to the next word entry) + std::vector constant_name = encode_header_string(spec_binding.constant_name); + uint32_t constant_name_entries = (uint32_t)(constant_name.size() / sizeof(uint32_t)); + + debug(2) << " [" << spec_binding.constant_id << "] " + << "constant_name=" << (const char *)constant_name.data() << " " + << "type_size=" << spec_binding.type_size << "\n"; + + // [0] Length of constant name string (padded to nearest word size) + spirv_header.push_back(constant_name_entries); + + // [*] Constant name string data (padded with null chars) + spirv_header.insert(spirv_header.end(), (const uint32_t *)constant_name.data(), (const uint32_t *)(constant_name.data() + constant_name.size())); + + // [1] Constant id (as used in VkSpecializationMapEntry for binding) + spirv_header.push_back(spec_binding.constant_id); + + // [2] Size of data type (in bytes) + spirv_header.push_back(spec_binding.type_size); + } + + // [4] Number of shared memory allocations for this descriptor set + spirv_header.push_back((uint32_t)ds.shared_memory_usage.size()); + debug(2) << " shared_memory_allocations=" << (uint32_t)ds.shared_memory_usage.size() << "\n"; + + // For each allocation ... + uint32_t shm_index = 0; + for (const SharedMemoryAllocation &shared_mem_alloc : ds.shared_memory_usage) { + + // encode the variable name into an array of chars (padded to the next word entry) + std::vector variable_name = encode_header_string(shared_mem_alloc.variable_name); + uint32_t variable_name_entries = (uint32_t)(variable_name.size() / sizeof(uint32_t)); + + debug(2) << " [" << shm_index++ << "] " + << "variable_name=" << (const char *)variable_name.data() << " " + << "constant_id=" << shared_mem_alloc.constant_id << " " + << "type_size=" << shared_mem_alloc.type_size << " " + << "array_size=" << shared_mem_alloc.array_size << "\n"; + + // [0] Length of variable name string (padded to nearest word size) + spirv_header.push_back(variable_name_entries); + + // [*] Variable name string data (padded with null chars) + spirv_header.insert(spirv_header.end(), (const uint32_t *)variable_name.data(), (const uint32_t *)(variable_name.data() + variable_name.size())); + + // [1] Constant id to use for overriding array size (zero if it is not bound to a specialization constant) + spirv_header.push_back(shared_mem_alloc.constant_id); + + // [2] Size of data type (in bytes) + spirv_header.push_back(shared_mem_alloc.type_size); + + // [3] Size of array (ie element count) + spirv_header.push_back(shared_mem_alloc.array_size); + } + + // [4] Dynamic workgroup dimensions bound to specialization constants + spirv_header.push_back(ds.workgroup_size_binding.local_size_constant_id[0]); + spirv_header.push_back(ds.workgroup_size_binding.local_size_constant_id[1]); + spirv_header.push_back(ds.workgroup_size_binding.local_size_constant_id[2]); ++index; } uint32_t header_word_count = spirv_header.size(); @@ -2224,14 +2432,78 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::reset_workgroup_size() { workgroup_size[2] = 0; } +void 
CodeGen_Vulkan_Dev::SPIRV_Emitter::find_workgroup_size(const Stmt &s) { + reset_workgroup_size(); + FindWorkGroupSize fwgs; + s.accept(&fwgs); + + workgroup_size[0] = fwgs.workgroup_size[0]; + workgroup_size[1] = fwgs.workgroup_size[1]; + workgroup_size[2] = fwgs.workgroup_size[2]; +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_workgroup_size(SpvId kernel_func_id) { - workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); - workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); - workgroup_size[2] = std::max(workgroup_size[2], (uint32_t)1); - // Add workgroup size to execution mode - SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(kernel_func_id, workgroup_size[0], workgroup_size[1], workgroup_size[2]); - builder.current_module().add_execution_mode(exec_mode_inst); + if (workgroup_size[0] == 0) { + + // workgroup size is dynamic ... + if (!target.has_feature(Target::VulkanV13)) { + user_error << "Vulkan: Dynamic workgroup sizes require Vulkan v1.3+ support! " + << "Either enable the target feature, or adjust the pipeline's schedule " + << "to use static workgroup sizes!"; + } + + // declare the workgroup local size as a specialization constant (which will get overridden at runtime) + Type local_size_type = UInt(32); + + uint32_t local_size_x = std::max(workgroup_size[0], (uint32_t)1); // use a minimum of 1 for the default value + uint32_t local_size_y = std::max(workgroup_size[1], (uint32_t)1); + uint32_t local_size_z = std::max(workgroup_size[2], (uint32_t)1); + + SpvId local_size_x_id = builder.declare_specialization_constant(local_size_type, &local_size_x); + SpvId local_size_y_id = builder.declare_specialization_constant(local_size_type, &local_size_y); + SpvId local_size_z_id = builder.declare_specialization_constant(local_size_type, &local_size_z); + + SpvId local_size_ids[3] = { + local_size_x_id, + local_size_y_id, + local_size_z_id}; + + const char *local_size_names[3] = { + "__thread_id_x", + "__thread_id_y", + "__thread_id_z"}; + + debug(1) << "Vulkan: Using dynamic workgroup local size with default of [" << local_size_x << ", " << local_size_y << ", " << local_size_z << "]...\n"; + + // annotate each local size with a corresponding specialization constant + for (uint32_t dim = 0; dim < 3; dim++) { + SpvId constant_id = (uint32_t)(descriptor_set_table.back().specialization_constants.size() + 1); + SpvBuilder::Literals spec_id = {constant_id}; + builder.add_annotation(local_size_ids[dim], SpvDecorationSpecId, spec_id); + builder.add_symbol(local_size_names[dim], local_size_ids[dim], builder.current_module().id()); + SpecializationBinding spec_binding = {constant_id, (uint32_t)sizeof(uint32_t), local_size_names[dim]}; + descriptor_set_table.back().specialization_constants.push_back(spec_binding); + descriptor_set_table.back().workgroup_size_binding.local_size_constant_id[dim] = constant_id; + } + + // Add workgroup size to execution mode + SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size_id(kernel_func_id, local_size_x_id, local_size_y_id, local_size_z_id); + builder.current_module().add_execution_mode(exec_mode_inst); + + } else { + + // workgroup size is static ... 
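+        // Unlike the dynamic path above, nothing is exported to the descriptor set here:
+        // the literal LocalSize execution mode is emitted directly, the header's
+        // workgroup_size_binding constant ids remain zero, and the runtime dispatches
+        // using this compile-time workgroup size.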
+ workgroup_size[0] = std::max(workgroup_size[0], (uint32_t)1); + workgroup_size[1] = std::max(workgroup_size[1], (uint32_t)1); + workgroup_size[2] = std::max(workgroup_size[2], (uint32_t)1); + + debug(1) << "Vulkan: Using static workgroup local size [" << workgroup_size[0] << ", " << workgroup_size[1] << ", " << workgroup_size[2] << "]...\n"; + + // Add workgroup size to execution mode + SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(kernel_func_id, workgroup_size[0], workgroup_size[1], workgroup_size[2]); + builder.current_module().add_execution_mode(exec_mode_inst); + } } namespace { @@ -2456,9 +2728,42 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector<char> &module) { SpvBinary spirv_header; encode_header(spirv_header); - // Finalize and encode the SPIR-V IR into a compliant binary - SpvBinary spirv_binary; + // Finalize the SPIR-V module builder.finalize(); + + // Validate the SPIR-V for the target + if (builder.is_capability_required(SpvCapabilityInt8) && !target.has_feature(Target::VulkanInt8)) { + user_error << "Vulkan: Code requires 8-bit integer support (which is not enabled in the target features)! " + << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; + } + + if (builder.is_capability_required(SpvCapabilityInt16) && !target.has_feature(Target::VulkanInt16)) { + user_error << "Vulkan: Code requires 16-bit integer support (which is not enabled in the target features)! " + << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; + } + + if (builder.is_capability_required(SpvCapabilityInt64) && !target.has_feature(Target::VulkanInt64)) { + user_error << "Vulkan: Code requires 64-bit integer support (which is not enabled in the target features)! " + << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; + } + + if (builder.is_capability_required(SpvCapabilityFloat16) && !target.has_feature(Target::VulkanFloat16)) { + user_error << "Vulkan: Code requires 16-bit floating-point support (which is not enabled in the target features)! " + << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; + } + + if (builder.is_capability_required(SpvCapabilityFloat64) && !target.has_feature(Target::VulkanFloat64)) { + user_error << "Vulkan: Code requires 64-bit floating-point support (which is not enabled in the target features)! " + << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; + } + + // Encode the SPIR-V into a compliant binary + SpvBinary spirv_binary; builder.encode(spirv_binary); size_t header_bytes = spirv_header.size() * sizeof(uint32_t); @@ -2482,8 +2787,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // Add function definition // TODO: can we use one of the function control annotations?
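For context, the specialization constants exported above (the dynamically sized shared-memory array lengths and the `local_size_x/y/z` bindings) are meant to be overridden by the Vulkan runtime when it builds the compute pipeline. The sketch below shows one way such (constant id, value) pairs could be packed into the standard `VkSpecializationMapEntry`/`VkSpecializationInfo` structures; the `SpecOverride` type and helper function are illustrative assumptions, not the actual implementation in `vulkan_resources.h`.

```cpp
// Sketch: pack (constant_id, uint32 value) pairs, such as those decoded from the
// Halide shader-binding header, into a VkSpecializationInfo for pipeline creation.
#include <cstdint>
#include <vector>
#include <vulkan/vulkan.h>  // the Halide runtime itself uses mini_vulkan.h instead

struct SpecOverride {
    uint32_t constant_id;  // id from the header (zero means "not bound", so skip it)
    uint32_t value;        // e.g. shared memory element count, or local_size_x/y/z
};

VkSpecializationInfo make_specialization_info(const std::vector<SpecOverride> &overrides,
                                              std::vector<VkSpecializationMapEntry> &entries,
                                              std::vector<uint8_t> &data) {
    for (const SpecOverride &o : overrides) {
        if (o.constant_id == 0) {
            continue;  // zero is the "statically declared" sentinel used by the header
        }
        VkSpecializationMapEntry entry{};
        entry.constantID = o.constant_id;
        entry.offset = static_cast<uint32_t>(data.size());
        entry.size = sizeof(uint32_t);
        entries.push_back(entry);
        const uint8_t *bytes = reinterpret_cast<const uint8_t *>(&o.value);
        data.insert(data.end(), bytes, bytes + sizeof(uint32_t));
    }
    VkSpecializationInfo info{};
    info.mapEntryCount = static_cast<uint32_t>(entries.size());
    info.pMapEntries = entries.data();
    info.dataSize = data.size();
    info.pData = data.data();
    // The result would be referenced from
    // VkComputePipelineCreateInfo::stage.pSpecializationInfo when creating the pipeline.
    return info;
}
```

Note that `entries` and `data` must stay alive until `vkCreateComputePipelines()` has been called, since `VkSpecializationInfo` only stores pointers into them.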
- // We'll discover the workgroup size as we traverse the kernel - reset_workgroup_size(); + // Discover the workgroup size + find_workgroup_size(s); // Update the kernel index for the module kernel_index++; @@ -2507,7 +2812,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // Insert return statement end delimiter kernel_func.tail_block().add_instruction(SpvFactory::return_stmt()); - // Declare the workgroup size now that we've traversed the kernel + // Declare the workgroup size for the kernel declare_workgroup_size(kernel_func_id); // Pop scope @@ -2524,7 +2829,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::dump() const { std::cerr << builder.current_module(); } -CodeGen_Vulkan_Dev::CodeGen_Vulkan_Dev(Target t) { +CodeGen_Vulkan_Dev::CodeGen_Vulkan_Dev(Target t) + : emitter(t) { + // Empty } void CodeGen_Vulkan_Dev::init_module() { diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 8ae729dd0052..997cb12ed955 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -1311,8 +1311,8 @@ void SpvBuilder::add_struct_annotation(SpvId struct_type_id, uint32_t member_ind } void SpvBuilder::add_execution_mode_local_size(SpvId func_id, - uint32_t local_size_x, - uint32_t local_size_y, + uint32_t local_size_x, + uint32_t local_size_y, uint32_t local_size_z) { local_size_x = std::max(local_size_x, (uint32_t)1); @@ -1324,8 +1324,8 @@ void SpvBuilder::add_execution_mode_local_size(SpvId func_id, } void SpvBuilder::add_execution_mode_local_size_id(SpvId func_id, - SpvId local_size_x_id, - SpvId local_size_y_id, + SpvId local_size_x_id, + SpvId local_size_y_id, SpvId local_size_z_id) { SpvInstruction exec_mode_inst = SpvFactory::exec_mode_local_size(func_id, local_size_x_id, local_size_y_id, local_size_z_id); @@ -1863,7 +1863,7 @@ SpvId SpvBuilder::declare_specialization_constant_of_type(const Type &scalar_typ << "data=" << stringify_constant(value) << "\n"; SpvInstruction inst = SpvFactory::specialization_constant(result_id, type_id, scalar_type.bytes(), &value, value_type); - module.add_type(inst); // NOTE: Needs to be declared in the type section in order to be used with other type definitions + module.add_type(inst); // NOTE: Needs to be declared in the type section in order to be used with other type definitions return result_id; } @@ -2516,11 +2516,9 @@ SpvInstruction SpvFactory::exec_mode_local_size_id(SpvId function_id, SpvId loca inst.add_immediates({ {SpvExecutionModeLocalSizeId, SpvIntegerLiteral}, }); - inst.add_operands({ - local_size_x_id, - local_size_y_id, - local_size_z_id - }); + inst.add_operands({local_size_x_id, + local_size_y_id, + local_size_z_id}); return inst; } diff --git a/src/SpirvIR.h b/src/SpirvIR.h index f28cfeb1172d..ccb7774d8983 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -387,7 +387,7 @@ class SpvBuilder { SpvId declare_type(const Type &type, uint32_t array_size = 1); SpvId declare_pointer_type(const Type &type, SpvStorageClass storage_class); SpvId declare_pointer_type(SpvId type_id, SpvStorageClass storage_class); - SpvId declare_constant(const Type &type, const void *data, bool is_specialization=false); + SpvId declare_constant(const Type &type, const void *data, bool is_specialization = false); SpvId declare_null_constant(const Type &type); SpvId declare_bool_constant(bool value); SpvId declare_string_constant(const std::string &str); @@ -413,7 +413,7 @@ class SpvBuilder { SpvId add_runtime_array(SpvId base_type_id); SpvId add_pointer_type(const Type &type, SpvStorageClass storage_class); SpvId add_pointer_type(SpvId base_type_id, 
SpvStorageClass storage_class); - SpvId add_constant(const Type &type, const void *data, bool is_specialization=false); + SpvId add_constant(const Type &type, const void *data, bool is_specialization = false); SpvId add_function_type(SpvId return_type_id, const ParamTypes &param_type_ids); SpvId add_function(const std::string &name, SpvId return_type, const ParamTypes &param_types = {}); SpvId add_instruction(SpvInstruction val); @@ -591,9 +591,9 @@ class SpvBuilder { ConstantKey make_bool_constant_key(bool value) const; ConstantKey make_string_constant_key(const std::string &value) const; - ConstantKey make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data, bool is_specialization=false) const; - ConstantKey make_constant_key(const Type &type, const void *data, bool is_specialization=false) const; - SpvId lookup_constant(const Type &type, const void *data, bool is_specialization=false) const; + ConstantKey make_constant_key(uint8_t code, uint8_t bits, int lanes, size_t bytes, const void *data, bool is_specialization = false) const; + ConstantKey make_constant_key(const Type &type, const void *data, bool is_specialization = false) const; + SpvId lookup_constant(const Type &type, const void *data, bool is_specialization = false) const; ConstantKey make_null_constant_key(const Type &type) const; SpvId lookup_null_constant(const Type &type) const; @@ -671,7 +671,7 @@ struct SpvFactory { static SpvInstruction entry_point(SpvId exec_model, SpvId func_id, const std::string &name, const Variables &variables); static SpvInstruction memory_model(SpvAddressingModel addressing_model, SpvMemoryModel memory_model); static SpvInstruction exec_mode_local_size(SpvId function_id, uint32_t local_size_size_x, uint32_t local_size_size_y, uint32_t local_size_size_z); - static SpvInstruction exec_mode_local_size_id(SpvId function_id, SpvId local_size_x_id, SpvId local_size_y_id, SpvId local_size_z_id); // only avail in 1.2 + static SpvInstruction exec_mode_local_size_id(SpvId function_id, SpvId local_size_x_id, SpvId local_size_y_id, SpvId local_size_z_id);  // only avail in 1.2 static SpvInstruction memory_barrier(SpvId memory_scope_id, SpvId semantics_mask_id); static SpvInstruction control_barrier(SpvId execution_scope_id, SpvId memory_scope_id, SpvId semantics_mask_id); static SpvInstruction bitwise_not(SpvId type_id, SpvId result_id, SpvId src_id); diff --git a/src/Target.cpp b/src/Target.cpp index 87122c13cb2e..c71cf9fcb46f 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -374,6 +374,31 @@ Target::Feature get_host_cuda_capability(Target t) { return cap; } +Target::Feature calculate_host_vulkan_capability(Target t) { + const auto *interface = get_device_interface_for_device_api(DeviceAPI::Vulkan, t); + internal_assert(interface->compute_capability); + int major, minor; + int err = interface->compute_capability(nullptr, &major, &minor); + internal_assert(err == 0) << "Failed to query vulkan compute capability\n"; + int ver = major * 10 + minor; + if (ver < 10) { + return Target::FeatureEnd; + } else if (ver < 12) { + return Target::VulkanV10; + } else if (ver < 13) { + return Target::VulkanV12; + } else { + return Target::VulkanV13; + } +} + +Target::Feature get_host_vulkan_capability(Target t) { + static Target::Feature cap = calculate_host_vulkan_capability(t); + return cap; +} + const std::map<std::string, Target::OS> os_name_map = { {"os_unknown", Target::OSUnknown}, {"linux", Target::Linux}, @@ -530,6 +555,14 @@ const std::map<std::string, Target::Feature> feature_name_map = {
{"profile_by_timer", Target::ProfileByTimer}, {"spirv", Target::SPIRV}, {"vulkan", Target::Vulkan}, + {"vk_int8", Target::VulkanInt8}, + {"vk_int16", Target::VulkanInt16}, + {"vk_int64", Target::VulkanInt64}, + {"vk_float16", Target::VulkanFloat16}, + {"vk_float64", Target::VulkanFloat64}, + {"vk_v10", Target::VulkanV10}, + {"vk_v12", Target::VulkanV12}, + {"vk_v13", Target::VulkanV13}, // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well. }; @@ -679,6 +712,15 @@ bool merge_string(Target &t, const std::string &target) { t.set_feature(get_host_cuda_capability(t)); } + if (is_host && + t.has_feature(Target::Vulkan) && + !t.has_feature(Target::VulkanV10) && + !t.has_feature(Target::VulkanV12) && + !t.has_feature(Target::VulkanV13)) { + // Detect host vulkan capability + t.set_feature(get_host_vulkan_capability(t)); + } + if (arch_specified && !bits_specified) { return false; } @@ -978,19 +1020,35 @@ int Target::get_cuda_capability_lower_bound() const { return 20; } +int Target::get_vulkan_capability_lower_bound() const { + if (!has_feature(Target::Vulkan)) { + return -1; + } + if (has_feature(Target::VulkanV10)) { + return 10; + } + if (has_feature(Target::VulkanV12)) { + return 12; + } + if (has_feature(Target::VulkanV13)) { + return 13; + } + return 10; +} + bool Target::supports_type(const Type &t) const { if (t.bits() == 64) { if (t.is_float()) { return !has_feature(Metal) && - !has_feature(Vulkan) && !has_feature(OpenGLCompute) && !has_feature(D3D12Compute) && + (!has_feature(Vulkan) || has_feature(Target::VulkanFloat64)) && (!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles)); } else { return (!has_feature(Metal) && - !has_feature(Vulkan) && !has_feature(OpenGLCompute) && - !has_feature(D3D12Compute)); + !has_feature(D3D12Compute) && + (!has_feature(Vulkan) || has_feature(Target::VulkanInt64))); } } return true; @@ -1020,8 +1078,17 @@ bool Target::supports_type(const Type &t, DeviceAPI device) const { // types are not supported. return t.bits() < 64; } else if (device == DeviceAPI::Vulkan) { - // TODO(shoaibkamil): Is this correct? - return t.bits() < 64; + if (t.is_float() && t.bits() == 64) { + return has_feature(Target::VulkanFloat64); + } else if (t.is_float() && t.bits() == 16) { + return has_feature(Target::VulkanFloat16); + } else if (t.is_int_or_uint() && t.bits() == 64) { + return has_feature(Target::VulkanInt64); + } else if (t.is_int_or_uint() && t.bits() == 16) { + return has_feature(Target::VulkanInt16); + } else if (t.is_int_or_uint() && t.bits() == 8) { + return has_feature(Target::VulkanInt8); + } } else if (device == DeviceAPI::OpenGLCompute) { return t.bits() < 64; } @@ -1172,7 +1239,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) // (c) must match across both targets; it is an error if one target has the feature and the other doesn't // clang-format off - const std::array union_features = {{ + const std::array union_features = {{ // These are true union features. CUDA, D3D12Compute, @@ -1196,6 +1263,9 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) HVX_v62, HVX_v65, HVX_v66, + VulkanV10, + VulkanV12, + VulkanV13, }}; // clang-format on @@ -1306,6 +1376,22 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) output.features.reset(CUDACapability86); } + // Pick tight lower bound for Vulkan capability. 
Use fall-through to clear redundant features + int vulkan_a = get_vulkan_capability_lower_bound(); + int vulkan_b = other.get_vulkan_capability_lower_bound(); + + // Same trick as above for CUDA + int vulkan_capability = std::min((unsigned)vulkan_a, (unsigned)vulkan_b); + if (vulkan_capability < 10) { + output.features.reset(VulkanV10); + } + if (vulkan_capability < 12) { + output.features.reset(VulkanV12); + } + if (vulkan_capability < 13) { + output.features.reset(VulkanV13); + } + // Pick tight lower bound for HVX version. Use fall-through to clear redundant features int hvx_a = get_hvx_lower_bound(*this); int hvx_b = get_hvx_lower_bound(other); @@ -1346,6 +1432,9 @@ void target_test() { {{"x86-64-linux-cuda", "x86-64-linux", "x86-64-linux-cuda"}}, {{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda", "x86-64-linux-cuda"}}, {{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda-cuda_capability_30", "x86-64-linux-cuda-cuda_capability_30"}}, + {{"x86-64-linux-vulkan", "x86-64-linux", "x86-64-linux-vulkan"}}, + {{"x86-64-linux-vulkan-vk_v13", "x86-64-linux-vulkan", "x86-64-linux-vulkan"}}, + {{"x86-64-linux-vulkan-vk_v13", "x86-64-linux-vulkan-vk_v10", "x86-64-linux-vulkan-vk_v10"}}, {{"hexagon-32-qurt-hvx_v65", "hexagon-32-qurt-hvx_v62", "hexagon-32-qurt-hvx_v62"}}, {{"hexagon-32-qurt-hvx_v62", "hexagon-32-qurt", "hexagon-32-qurt"}}, {{"hexagon-32-qurt-hvx_v62-hvx", "hexagon-32-qurt", ""}}, diff --git a/src/Target.h b/src/Target.h index 080e24c641cb..0fb4ebabac49 100644 --- a/src/Target.h +++ b/src/Target.h @@ -163,6 +163,14 @@ struct Target { ProfileByTimer = halide_target_feature_profile_by_timer, SPIRV = halide_target_feature_spirv, Vulkan = halide_target_feature_vulkan, + VulkanInt8 = halide_target_feature_vulkan_int8, + VulkanInt16 = halide_target_feature_vulkan_int16, + VulkanInt64 = halide_target_feature_vulkan_int64, + VulkanFloat16 = halide_target_feature_vulkan_float16, + VulkanFloat64 = halide_target_feature_vulkan_float64, + VulkanV10 = halide_target_feature_vulkan_version10, + VulkanV12 = halide_target_feature_vulkan_version12, + VulkanV13 = halide_target_feature_vulkan_version13, FeatureEnd = halide_target_feature_end }; Target() = default; @@ -325,6 +333,11 @@ struct Target { * features are set. */ int get_cuda_capability_lower_bound() const; + /** Get the minimum Vulkan capability found as an integer. Returns + * 10 (our minimum supported Vulkan compute capability) if no Vulkan + * features are set. */ + int get_vulkan_capability_lower_bound() const; + /** Was libHalide compiled with support for this target? */ bool supported() const; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index baf3bb64b892..80c264706c89 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1365,6 +1365,14 @@ typedef enum halide_target_feature_t { halide_target_feature_profile_by_timer, ///< Alternative to halide_target_feature_profile using timer interrupt for systems without threads or applicartions that need to avoid them. halide_target_feature_spirv, ///< Enable SPIR-V code generation support. halide_target_feature_vulkan, ///< Enable Vulkan runtime support. + halide_target_feature_vulkan_int8, ///< Enable Vulkan 8-bit integer support. + halide_target_feature_vulkan_int16, ///< Enable Vulkan 16-bit integer support. + halide_target_feature_vulkan_int64, ///< Enable Vulkan 64-bit integer support. + halide_target_feature_vulkan_float16, ///< Enable Vulkan 16-bit float support. 
+ halide_target_feature_vulkan_float64, ///< Enable Vulkan 64-bit float support. + halide_target_feature_vulkan_version10, ///< Enable Vulkan v1.0 runtime target support. + halide_target_feature_vulkan_version12, ///< Enable Vulkan v1.2 runtime target support. + halide_target_feature_vulkan_version13, ///< Enable Vulkan v1.3 runtime target support. halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. } halide_target_feature_t; diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index 5aa0d38e67d0..92dc63ea8e31 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -71,6 +71,17 @@ typedef uint32_t VkBool32; typedef uint64_t VkDeviceSize; typedef uint32_t VkSampleMask; +// Provided by VK_VERSION_1_0 +#define VK_API_VERSION_MAJOR(version) (((uint32_t)(version) >> 22) & 0x7FU) +#define VK_API_VERSION_MINOR(version) (((uint32_t)(version) >> 12) & 0x3FFU) +#define VK_API_VERSION_PATCH(version) ((uint32_t)(version)&0xFFFU) +#define VK_MAKE_API_VERSION(variant, major, minor, patch) \ + ((((uint32_t)(variant)) << 29) | (((uint32_t)(major)) << 22) | (((uint32_t)(minor)) << 12) | ((uint32_t)(patch))) +#define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0) +#define VK_API_VERSION_1_1 VK_MAKE_API_VERSION(0, 1, 1, 0) +#define VK_API_VERSION_1_2 VK_MAKE_API_VERSION(0, 1, 2, 0) +#define VK_API_VERSION_1_3 VK_MAKE_API_VERSION(0, 1, 3, 0) + #define VK_DEFINE_HANDLE(object) typedef struct object##_T *(object); #define VK_DEFINE_NON_DISPATCHABLE_HANDLE(object) typedef uint64_t object; diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 3398cf414505..11aa495fc798 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -120,6 +120,11 @@ WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_b return 0; } +WEAK int halide_vulkan_compute_capability(void *user_context, int *major, int *minor) { + debug(user_context) << " halide_vulkan_compute_capability (user_context: " << user_context << ")\n"; + return vk_find_compute_capability(user_context, major, minor); +} + WEAK int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, const char *src, int size) { debug(user_context) << "halide_vulkan_init_kernels (user_context: " << user_context @@ -1041,17 +1046,19 @@ WEAK int halide_vulkan_run(void *user_context, // if used more than once. // // 1. Lookup the shader module cache entry in the compilation cache - // 1a. If shader module doesn't exist yet, then lookup invokes compile + // --- If shader module doesn't exist yet, then lookup invokes compile + // 1a. Locate the correct entry point for the kernel (code modules may contain multiple entry points) // 2. If the rest of the cache entry is uninitialized, then create new objects: // 2a. Create a descriptor set layout - // 2b. Create the buffer for the scalar params - // 2c. Create a pipeline layout - // 2d. Create a compute pipeline - // 2e. Create a descriptor set + // 2b. Create a pipeline layout + // 2c. Create a compute pipeline + // --- Apply specializations to pipeline for shared memory or workgroup sizes + // 2d. Create a descriptor set // --- The above can be cached between invocations --- - // 3. Set bindings for buffers in the descriptor set - // 3a. Copy args into uniform buffer - // 3b. Update buffer bindings for descriptor set + // 3. Set bindings for buffers and args in the descriptor set + // 3a. Create the buffer for the scalar params + // 3b. 
Copy args into uniform buffer + // 3c. Update buffer bindings for descriptor set // 4. Create a command buffer from the command pool // 5. Fill the command buffer with a dispatch call // 7a. Bind the compute pipeline @@ -1059,13 +1066,11 @@ WEAK int halide_vulkan_run(void *user_context, // 7c. Add a dispatch to the command buffer // 7d. End the command buffer // 6. Submit the command buffer to our command queue - // --- The following isn't best practice, but it's in line - // with what we do in Metal etc. --- + // --- The following isn't the most efficient, but it's what we do in Metal etc. --- // 7. Wait until the queue is done with the command buffer // 8. Cleanup all temporary objects - //// 3. Create a compute pipeline - // Get the shader module + // 1. Get the shader module cache entry VulkanCompilationCacheEntry *cache_entry = nullptr; bool found = compilation_cache.lookup(ctx.device, state_ptr, cache_entry); halide_abort_if_false(user_context, found); @@ -1074,11 +1079,11 @@ WEAK int halide_vulkan_run(void *user_context, return halide_error_code_internal_error; } - // 3a. Locate the correct entry point from the cache + // 1a. Locate the correct entry point from the cache bool found_entry_point = false; uint32_t entry_point_index = 0; - for (uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { - if (strstr(cache_entry->entry_point_data[n].entry_point_name, entry_name) != nullptr) { + for (uint32_t n = 0; n < cache_entry->shader_count; ++n) { + if (strstr(cache_entry->shader_bindings[n].entry_point_name, entry_name) != nullptr) { entry_point_index = n; found_entry_point = true; } @@ -1088,18 +1093,19 @@ WEAK int halide_vulkan_run(void *user_context, return halide_error_code_internal_error; } debug(user_context) << " found entry point [" - << (entry_point_index + 1) << " of " << cache_entry->entry_point_count + << (entry_point_index + 1) << " of " << cache_entry->shader_count << "] '" << entry_name << "'\n"; + // 2. Create objects for execution halide_abort_if_false(user_context, cache_entry->descriptor_set_layouts != nullptr); if (cache_entry->pipeline_layout == 0) { - //// 1. Create all descriptor set layouts - for (uint32_t n = 0; n < cache_entry->entry_point_count; ++n) { + // 2a. Create all descriptor set layouts + for (uint32_t n = 0; n < cache_entry->shader_count; ++n) { if (((void *)cache_entry->descriptor_set_layouts[n]) == nullptr) { - uint32_t uniform_buffer_count = cache_entry->entry_point_data[n].uniform_buffer_count; - uint32_t storage_buffer_count = cache_entry->entry_point_data[n].storage_buffer_count; - debug(user_context) << " creating descriptor set layout [" << n << "] " << cache_entry->entry_point_data[n].entry_point_name << "\n"; + uint32_t uniform_buffer_count = cache_entry->shader_bindings[n].uniform_buffer_count; + uint32_t storage_buffer_count = cache_entry->shader_bindings[n].storage_buffer_count; + debug(user_context) << " creating descriptor set layout [" << n << "] " << cache_entry->shader_bindings[n].entry_point_name << "\n"; VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(cache_entry->descriptor_set_layouts[n])); if (result != VK_SUCCESS) { error(user_context) << "vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; @@ -1108,85 +1114,93 @@ WEAK int halide_vulkan_run(void *user_context, } } - ///// 2. 
Create the pipeline layout - VkResult result = vk_create_pipeline_layout(user_context, ctx.allocator, cache_entry->entry_point_count, cache_entry->descriptor_set_layouts, &(cache_entry->pipeline_layout)); + // 2b. Create the pipeline layout + VkResult result = vk_create_pipeline_layout(user_context, ctx.allocator, cache_entry->shader_count, cache_entry->descriptor_set_layouts, &(cache_entry->pipeline_layout)); if (result != VK_SUCCESS) { error(user_context) << "vk_create_pipeline_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; return halide_error_code_internal_error; } } - VulkanEntryPointData *entry_point_data = (cache_entry->entry_point_data + entry_point_index); - halide_abort_if_false(user_context, entry_point_data != nullptr); + VulkanShaderBinding *shader_bindings = (cache_entry->shader_bindings + entry_point_index); + halide_abort_if_false(user_context, shader_bindings != nullptr); - //// 1a. Create a buffer for the scalar parameters - if ((entry_point_data->args_region == nullptr) && entry_point_data->uniform_buffer_count) { - size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, arg_sizes, args, arg_is_buffer); - if (scalar_buffer_size > 0) { - entry_point_data->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); - if (entry_point_data->args_region == nullptr) { - error(user_context) << "vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; - return halide_error_code_internal_error; - } - } - } + VulkanDispatchData dispatch_data = {}; + dispatch_data.shared_mem_bytes = shared_mem_bytes; + dispatch_data.global_size[0] = blocksX; + dispatch_data.global_size[1] = blocksY; + dispatch_data.global_size[2] = blocksZ; + dispatch_data.local_size[0] = threadsX; + dispatch_data.local_size[1] = threadsY; + dispatch_data.local_size[2] = threadsZ; - // Construct the pipeline - if (entry_point_data->compute_pipeline == 0) { - VkResult result = vk_create_compute_pipeline(user_context, ctx.allocator, entry_name, cache_entry->shader_module, cache_entry->pipeline_layout, &(entry_point_data->compute_pipeline)); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return halide_error_code_internal_error; - } + // 2c. Setup the compute pipeline (eg override any specializations for shared mem or workgroup size) + VkResult result = vk_setup_compute_pipeline(user_context, ctx.allocator, shader_bindings, &dispatch_data, cache_entry->shader_module, cache_entry->pipeline_layout, &(shader_bindings->compute_pipeline)); + if (result != VK_SUCCESS) { + error(user_context) << "vk_setup_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + return halide_error_code_internal_error; } - //// 4. Create a descriptor set - if (entry_point_data->descriptor_set == 0) { + // 2d. Create a descriptor set + if (shader_bindings->descriptor_set == 0) { // Construct a descriptor pool // // NOTE: while this could be re-used across multiple pipelines, we only know the storage requirements of this kernel's // inputs and outputs ... 
so create a pool specific to the number of buffers known at this time - uint32_t uniform_buffer_count = entry_point_data->uniform_buffer_count; - uint32_t storage_buffer_count = entry_point_data->storage_buffer_count; - VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(entry_point_data->descriptor_pool)); + uint32_t uniform_buffer_count = shader_bindings->uniform_buffer_count; + uint32_t storage_buffer_count = shader_bindings->storage_buffer_count; + VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(shader_bindings->descriptor_pool)); if (result != VK_SUCCESS) { error(user_context) << "vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } // Create the descriptor set - result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layouts[entry_point_index], entry_point_data->descriptor_pool, &(entry_point_data->descriptor_set)); + result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layouts[entry_point_index], shader_bindings->descriptor_pool, &(shader_bindings->descriptor_set)); if (result != VK_SUCCESS) { error(user_context) << "vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } } - //// 5. Update uniform args and bindings for buffers in the descriptor set + // 3a. Create a buffer for the scalar parameters + if ((shader_bindings->args_region == nullptr) && shader_bindings->uniform_buffer_count) { + size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, arg_sizes, args, arg_is_buffer); + if (scalar_buffer_size > 0) { + shader_bindings->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); + if (shader_bindings->args_region == nullptr) { + error(user_context) << "vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; + return halide_error_code_internal_error; + } + } + } + + // 3b. Update uniform buffer with scalar parameters VkBuffer *args_buffer = nullptr; - if ((entry_point_data->args_region != nullptr) && entry_point_data->uniform_buffer_count) { - VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, entry_point_data->args_region, arg_sizes, args, arg_is_buffer); + if ((shader_bindings->args_region != nullptr) && shader_bindings->uniform_buffer_count) { + VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, shader_bindings->args_region, arg_sizes, args, arg_is_buffer); if (result != VK_SUCCESS) { debug(user_context) << "vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } - args_buffer = reinterpret_cast(entry_point_data->args_region->handle); + args_buffer = reinterpret_cast(shader_bindings->args_region->handle); if (args_buffer == nullptr) { error(user_context) << "Vulkan: Failed to retrieve scalar args buffer for device memory!\n"; return halide_error_code_internal_error; } } - VkResult result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_data->uniform_buffer_count, entry_point_data->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_data->descriptor_set); + // 3c. 
Update buffer bindings for descriptor set + result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, shader_bindings->uniform_buffer_count, shader_bindings->storage_buffer_count, arg_sizes, args, arg_is_buffer, shader_bindings->descriptor_set); if (result != VK_SUCCESS) { debug(user_context) << "vk_update_descriptor_set() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } - //// 6. Create a command buffer from the command pool + // 4. Create a command buffer from the command pool VkCommandBuffer command_buffer; result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); if (result != VK_SUCCESS) { @@ -1194,12 +1208,12 @@ WEAK int halide_vulkan_run(void *user_context, return result; } - //// 7. Begin the command buffer + // 5. Fill the command buffer result = vk_fill_command_buffer_with_dispatch_call(user_context, ctx.device, command_buffer, - entry_point_data->compute_pipeline, + shader_bindings->compute_pipeline, cache_entry->pipeline_layout, - entry_point_data->descriptor_set, + shader_bindings->descriptor_set, entry_point_index, blocksX, blocksY, blocksZ); if (result != VK_SUCCESS) { @@ -1207,21 +1221,21 @@ WEAK int halide_vulkan_run(void *user_context, return result; } - //// 8. Submit the command buffer to our command queue + // 6. Submit the command buffer to our command queue result = vk_submit_command_buffer(user_context, ctx.queue, command_buffer); if (result != VK_SUCCESS) { debug(user_context) << "vk_submit_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } - //// 9. Wait until the queue is done with the command buffer + // 7. Wait until the queue is done with the command buffer result = vkQueueWaitIdle(ctx.queue); if (result != VK_SUCCESS) { debug(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; return result; } - //// 10. Cleanup + // 8. Cleanup vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); vkResetCommandPool(ctx.device, ctx.command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); @@ -1361,7 +1375,7 @@ WEAK halide_device_interface_t vulkan_device_interface = { halide_device_release_crop, halide_device_wrap_native, halide_device_detach_native, - nullptr, // target capabilities. 
+ halide_vulkan_compute_capability, &vulkan_device_interface_impl}; // -------------------------------------------------------------------------- diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 348470438cb5..62830a07a6d7 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -84,6 +84,49 @@ class VulkanContext { namespace { +int vk_find_compute_capability(void *user_context, int *major, int *minor) { + debug(user_context) << " vk_find_compute_capability (user_context: " << user_context << ")\n"; + + VkInstance instance = nullptr; + VkDevice device = nullptr; + VkPhysicalDevice physical_device = nullptr; + uint32_t queue_family_index = 0; + + StringTable requested_layers; + vk_get_requested_layers(user_context, requested_layers); + + const VkAllocationCallbacks *alloc_callbacks = halide_vulkan_get_allocation_callbacks(user_context); + int status = vk_create_instance(user_context, requested_layers, &instance, alloc_callbacks); + if (status != halide_error_code_success) { + debug(user_context) << " no valid vulkan runtime was found ...\n"; + *major = 0; + *minor = 0; + return 0; + } + + if (vkCreateDevice == nullptr) { + vk_load_vulkan_functions(instance); + } + + status = vk_select_device_for_context(user_context, &instance, &device, &physical_device, &queue_family_index); + if (status != halide_error_code_success) { + debug(user_context) << " no valid vulkan device was found ...\n"; + *major = 0; + *minor = 0; + return 0; + } + + VkPhysicalDeviceProperties device_properties = {0}; + debug(user_context) << " querying for device properties ...\n"; + vkGetPhysicalDeviceProperties(physical_device, &device_properties); + *major = VK_API_VERSION_MAJOR(device_properties.apiVersion); + *minor = VK_API_VERSION_MINOR(device_properties.apiVersion); + debug(user_context) << " found device compute capability v" << *major << "." 
<< *minor << " ...\n"; + + vk_destroy_instance(user_context, instance, alloc_callbacks); + return 0; +} + // Initializes the instance (used by the default vk_create_context) int vk_create_instance(void *user_context, const StringTable &requested_layers, VkInstance *instance, const VkAllocationCallbacks *alloc_callbacks) { debug(user_context) << " vk_create_instance (user_context: " << user_context << ")\n"; @@ -118,7 +161,7 @@ int vk_create_instance(void *user_context, const StringTable &requested_layers, VK_MAKE_API_VERSION(0, 1, 0, 0), // app version "Halide", // engine name VK_MAKE_API_VERSION(0, HALIDE_VERSION_MAJOR, HALIDE_VERSION_MINOR, HALIDE_VERSION_PATCH), // engine version - VK_API_VERSION_1_0}; + VK_API_VERSION_1_3}; // FIXME: only use the minimum capability necessary VkInstanceCreateInfo create_info = { VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, @@ -138,6 +181,12 @@ int vk_create_instance(void *user_context, const StringTable &requested_layers, return halide_error_code_success; } +int vk_destroy_instance(void *user_context, VkInstance instance, const VkAllocationCallbacks *alloc_callbacks) { + debug(user_context) << " vk_destroy_instance (user_context: " << user_context << ")\n"; + vkDestroyInstance(instance, alloc_callbacks); + return halide_error_code_success; +} + int vk_select_device_for_context(void *user_context, VkInstance *instance, VkDevice *device, VkPhysicalDevice *physical_device, diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 66879d377792..6f2815b48b85 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -26,7 +26,7 @@ namespace Vulkan { // Declarations class VulkanMemoryAllocator; -struct VulkanEntryPointData; +struct VulkanShaderBinding; struct VulkanCompilationCacheEntry; // -------------------------------------------------------------------------- @@ -47,6 +47,7 @@ int vk_destroy_memory_allocator(void *user_context, VulkanMemoryAllocator *alloc // -------------------------------------------------------------------------- // Context // -------------------------------------------------------------------------- + int vk_create_context( void *user_context, VulkanMemoryAllocator **allocator, @@ -56,7 +57,10 @@ int vk_create_context( VkCommandPool *command_pool, VkQueue *queue, uint32_t *queue_family_index); +int vk_find_compute_capability(void *user_context, int *major, int *minor); + int vk_create_instance(void *user_context, const StringTable &requested_layers, VkInstance *instance, const VkAllocationCallbacks *alloc_callbacks); +int vk_destroy_instance(void *user_context, VkInstance instance, const VkAllocationCallbacks *alloc_callbacks); int vk_select_device_for_context(void *user_context, VkInstance *instance, VkDevice *device, @@ -186,15 +190,23 @@ VkResult vk_create_compute_pipeline(void *user_context, const char *pipeline_name, VkShaderModule shader_module, VkPipelineLayout pipeline_layout, + VkSpecializationInfo *specialization_info, VkPipeline *compute_pipeline); +VkResult vk_setup_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + VulkanShaderBinding *shader_bindings, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + VkPipeline *compute_pipeline); + VkResult vk_destroy_compute_pipeline(void *user_context, VulkanMemoryAllocator *allocator, VkPipeline compute_pipeline); // -- Shader Module -VulkanEntryPointData *vk_decode_entry_point_data(void *user_context, VulkanMemoryAllocator *allocator, - const uint32_t *module_ptr, uint32_t module_size); 
+VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryAllocator *allocator, + const uint32_t *module_ptr, uint32_t module_size); VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, const char *src, int size); diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 674eb077af17..e033fef805fb 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -470,7 +470,7 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type nullptr, // struct extending this 0, // create flags - 0, // buffer size (in bytes) + sizeof(uint32_t), // buffer size (in bytes) usage_flags, // buffer usage flags VK_SHARING_MODE_EXCLUSIVE, // sharing mode 0, nullptr}; diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 57def976ca1d..229b06c926c0 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -11,24 +11,58 @@ namespace Runtime { namespace Internal { namespace Vulkan { -// Compilation cache for compiled shader modules -struct VulkanEntryPointData { +// Defines the specialization constants used for dynamically overiding the dispatch size +struct VulkanWorkgroupSizeBinding { + uint32_t constant_id[3] = {0}; // zero if unused +}; + +// Data used to override specialization constants for dynamic dispatching +struct VulkanDispatchData { + uint32_t global_size[3] = {0}; // aka blocks + uint32_t local_size[3] = {0}; // aka threads + uint32_t shared_mem_bytes = 0; + VulkanWorkgroupSizeBinding local_size_binding = {}; +}; + +// Specialization constant binding information +struct VulkanSpecializationConstant { + uint32_t constant_id = 0; + uint32_t type_size = 0; + const char *constant_name = nullptr; +}; + +// Shared memory allocation variable information +struct VulkanSharedMemoryAllocation { + uint32_t constant_id = 0; // specialization constant to override allocation array size (or zero if unused) + uint32_t type_size = 0; + uint32_t array_size = 0; + const char *variable_name = nullptr; +}; + +// Entry point metadata for shader modules +struct VulkanShaderBinding { const char *entry_point_name = nullptr; + VulkanDispatchData dispatch_data = {}; VkDescriptorPool descriptor_pool = {0}; VkDescriptorSet descriptor_set = {0}; VkPipeline compute_pipeline = {0}; uint32_t uniform_buffer_count = 0; uint32_t storage_buffer_count = 0; + uint32_t specialization_constants_count = 0; + uint32_t shared_memory_allocations_count = 0; + VulkanSpecializationConstant *specialization_constants = nullptr; + VulkanSharedMemoryAllocation *shared_memory_allocations = nullptr; uint32_t bindings_count = 0; MemoryRegion *args_region = nullptr; }; +// Compilation cache for compiled shader modules struct VulkanCompilationCacheEntry { VkShaderModule shader_module = {0}; VkDescriptorSetLayout *descriptor_set_layouts = nullptr; VkPipelineLayout pipeline_layout = {0}; - uint32_t entry_point_count = 0; - VulkanEntryPointData *entry_point_data = nullptr; + uint32_t shader_count = 0; + VulkanShaderBinding *shader_bindings = nullptr; }; WEAK Halide::Internal::GPUCompilationCache compilation_cache; @@ -139,7 +173,7 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute_pipeline); vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_layout, descriptor_set_index, 1, &descriptor_set, 0, 
nullptr); - vkCmdDispatch(command_buffer, blocksX, blocksY, blocksZ); // TODO: make sure this is right! + vkCmdDispatch(command_buffer, blocksX, blocksY, blocksZ); result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { @@ -667,6 +701,7 @@ VkResult vk_create_compute_pipeline(void *user_context, const char *pipeline_name, VkShaderModule shader_module, VkPipelineLayout pipeline_layout, + VkSpecializationInfo *specialization_info, VkPipeline *compute_pipeline) { #ifdef DEBUG_RUNTIME @@ -691,7 +726,7 @@ VkResult vk_create_compute_pipeline(void *user_context, VK_SHADER_STAGE_COMPUTE_BIT, // compute stage shader shader_module, // shader module pipeline_name, // entry point name - nullptr // pointer to VkSpecializationInfo struct + specialization_info, // pointer to VkSpecializationInfo struct }, pipeline_layout, // pipeline layout 0, // base pipeline handle for derived pipeline @@ -707,6 +742,145 @@ VkResult vk_create_compute_pipeline(void *user_context, return VK_SUCCESS; } +VkResult vk_setup_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + VulkanShaderBinding *shader_bindings, + VulkanDispatchData *dispatch_data, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + VkPipeline *compute_pipeline) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << " vk_setup_compute_pipeline (user_context: " << user_context << ", " + << "entry_point_name: '" << shader_bindings->entry_point_name << "', " + << "allocator: " << (void *)allocator << ", " + << "device: " << (void *)allocator->current_device() << ", " + << "shader_bindings: " << (void *)shader_bindings << ", " + << "shader_module: " << (void *)shader_module << ", " + << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; +#endif + halide_abort_if_false(user_context, allocator != nullptr); + halide_abort_if_false(user_context, shader_bindings != nullptr); + halide_abort_if_false(user_context, dispatch_data != nullptr); + + VkResult result = VK_SUCCESS; + const char *entry_point_name = shader_bindings->entry_point_name; + + uint32_t dispatch_constant_index = 0; + uint32_t dispatch_constant_ids[4] = {0, 0, 0, 0}; + uint32_t dispatch_constant_values[4] = {0, 0, 0, 0}; + + // locate the mapping for overriding any dynamic shared memory allocation sizes + if (shader_bindings->shared_memory_allocations_count && dispatch_data->shared_mem_bytes) { + + uint32_t shared_mem_constant_id = 0; + uint32_t static_shared_mem_bytes = 0; + uint32_t shared_mem_type_size = 0; + + for (uint32_t sm = 0; sm < shader_bindings->shared_memory_allocations_count; sm++) { + VulkanSharedMemoryAllocation *allocation = &(shader_bindings->shared_memory_allocations[sm]); + if (allocation->constant_id == 0) { + // static fixed-size allocation + static_shared_mem_bytes += allocation->type_size * allocation->array_size; + } else { + // dynamic allocation + if (shared_mem_constant_id > 0) { + error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! 
Only one is supported!!\n"; + break; + } + shared_mem_constant_id = allocation->constant_id; + shared_mem_type_size = allocation->type_size; + } + } + uint32_t shared_mem_bytes_avail = (dispatch_data->shared_mem_bytes - static_shared_mem_bytes); + debug(user_context) << " pipeline uses " << static_shared_mem_bytes << " bytes of static shared memory\n"; + debug(user_context) << " dispatch requests " << dispatch_data->shared_mem_bytes << " bytes of shared memory\n"; + debug(user_context) << " dynamic shared memory " << shared_mem_bytes_avail << " bytes available\n"; + + // setup the dynamic array size + if ((shared_mem_constant_id > 0) && (shared_mem_bytes_avail > 0)) { + uint32_t dynamic_array_size = (uint32_t)shared_mem_bytes_avail / shared_mem_type_size; + debug(user_context) << " setting shared memory to " << (uint32_t)dynamic_array_size << " elements " + << "(or " << (uint32_t)shared_mem_bytes_avail << " bytes)\n"; + + // save the shared mem specialization constant in the first slot + dispatch_constant_ids[dispatch_constant_index] = shared_mem_constant_id; + dispatch_constant_values[dispatch_constant_index] = dynamic_array_size; + dispatch_constant_index++; + } + } + + // locate the mapping for overriding any dynamic workgroup local sizes + if (shader_bindings->dispatch_data.local_size_binding.constant_id[0] != 0) { + for (uint32_t dim = 0; dim < 3; dim++) { + dispatch_constant_ids[dispatch_constant_index] = shader_bindings->dispatch_data.local_size_binding.constant_id[dim]; + dispatch_constant_values[dispatch_constant_index] = dispatch_data->local_size[dim]; + dispatch_constant_index++; + } + } + + // verify the specialization constants actually exist + for (uint32_t dc = 0; dc < dispatch_constant_index; dc++) { + const uint32_t invalid_index = uint32_t(-1); + uint32_t found_index = invalid_index; + for (uint32_t sc = 0; sc < shader_bindings->specialization_constants_count; sc++) { + if (shader_bindings->specialization_constants[sc].constant_id == dispatch_constant_ids[dc]) { + debug(user_context) << " binding specialization constant [" << dispatch_constant_ids[dc] << "] " + << "'" << shader_bindings->specialization_constants[sc].constant_name << "' " + << " => " << dispatch_constant_values[dc] << "\n"; + found_index = sc; + break; + } + } + halide_abort_if_false(user_context, found_index != invalid_index); + } + + // Prepare specialization mapping for all dispatch constants + uint32_t dispatch_constant_count = 0; + VkSpecializationMapEntry specialization_map_entries[4]; + memset(specialization_map_entries, 0, 4 * sizeof(VkSpecializationMapEntry)); + for (uint32_t dc = 0; dc < dispatch_constant_index && dc < 4; dc++) { + specialization_map_entries[dc].constantID = dispatch_constant_ids[dc]; + specialization_map_entries[dc].size = sizeof(uint32_t); + specialization_map_entries[dc].offset = dc * sizeof(uint32_t); + dispatch_constant_count++; + } + + if (dispatch_constant_count > 0) { + + // Prepare specialization info block for the shader stage + VkSpecializationInfo specialization_info{}; + specialization_info.dataSize = dispatch_constant_count * sizeof(uint32_t); + specialization_info.mapEntryCount = dispatch_constant_count; + specialization_info.pMapEntries = specialization_map_entries; + specialization_info.pData = dispatch_constant_values; + + // Recreate the pipeline with the requested shared memory allocation + if (shader_bindings->compute_pipeline) { + vk_destroy_compute_pipeline(user_context, allocator, shader_bindings->compute_pipeline); + shader_bindings->compute_pipeline =
{0}; + } + + result = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, &specialization_info, &(shader_bindings->compute_pipeline)); + if (result != VK_SUCCESS) { + error(user_context) << "vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + } + + } else { + + // Construct and re-use the fixed pipeline + if (shader_bindings->compute_pipeline == 0) { + result = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, nullptr, &(shader_bindings->compute_pipeline)); + if (result != VK_SUCCESS) { + error(user_context) << "vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + } + } + } + + return result; +} + VkResult vk_destroy_compute_pipeline(void *user_context, VulkanMemoryAllocator *allocator, VkPipeline compute_pipeline) { @@ -723,10 +897,10 @@ VkResult vk_destroy_compute_pipeline(void *user_context, // -------------------------------------------------------------------------- -VulkanEntryPointData *vk_decode_entry_point_data(void *user_context, VulkanMemoryAllocator *allocator, const uint32_t *module_ptr, uint32_t module_size) { +VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryAllocator *allocator, const uint32_t *module_ptr, uint32_t module_size) { #ifdef DEBUG_RUNTIME debug(user_context) - << " vk_decode_entry_point_data (user_context: " << user_context << ", " + << " vk_decode_shader_bindings (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " << "module_ptr: " << (void *)module_ptr << ", " << "module_size: " << module_size << ")\n"; @@ -734,57 +908,197 @@ VulkanEntryPointData *vk_decode_entry_point_data(void *user_context, VulkanMemor uint64_t t_before = halide_current_time_ns(user_context); #endif halide_debug_assert(user_context, module_ptr != nullptr); - halide_debug_assert(user_context, module_size >= (2 * sizeof(uint32_t))); // Decode the sidecar for the module that lists the descriptor sets - // corresponding to each entry point contained in the module + // corresponding to each entry point contained in the module. + // + // Construct a shader binding for each entry point that defines all + // the buffers, constants, shared memory, and workgroup sizes + // that are required for execution. + // + // Like the SPIR-V code module, each entry is one word (1x uint32_t). + // Variable length sections are prefixed with their length (ie number of entries). // // [0] Header word count (total length of header) // [1] Number of descriptor sets // ... For each descriptor set ... - // ... [0] Number of uniform buffers for this descriptor set - // ... [1] Number of storage buffers for this descriptor set - // ... [2] Length of entry point name (padded to nearest word size) - // ... [X] Entry point string data + // ... [0] Length of entry point name (padded to nearest word size) + // ....... [*] Entry point string data (padded with null chars) + // ... [1] Number of uniform buffers for this descriptor set + // ... [2] Number of storage buffers for this descriptor set + // ... [3] Number of specialization constants for this descriptor set + // ....... For each specialization constant ... + // ....... [0] Length of constant name string (padded to nearest word size) + // ........... [*] Constant name string data (padded with null chars) + // ....... [1] Constant id (as used in VkSpecializationMapEntry for binding) + // ....... 
[2] Size of data type (in bytes) + // ... [4] Number of shared memory allocations for this descriptor set + // ....... For each allocation ... + // ....... [0] Length of variable name string (padded to nearest word size) + // ........... [*] Variable name string data (padded with null chars) + // ....... [1] Constant id to use for overriding array size (zero if it is not bound to a specialization constant) + // ....... [2] Size of data type (in bytes) + // ....... [3] Size of array (ie element count) + // ... [4] Dynamic workgroup dimensions bound to specialization constants + // ....... [0] Constant id to use for local_size_x (zero if it was statically declared and not bound to a specialization constant) + // ....... [1] Constant id to use for local_size_y + // ....... [2] Constant id ot use for local_size_z // // NOTE: See CodeGen_Vulkan_Dev::SPIRV_Emitter::encode_header() for the encoding // + // Both vk_decode_shader_bindings() and vk_compile_shader_module() will + // need to be updated if the header encoding ever changes! // + halide_debug_assert(user_context, module_size >= (2 * sizeof(uint32_t))); uint32_t module_entries = module_size / sizeof(uint32_t); uint32_t idx = 1; // skip past the header_word_count - uint32_t entry_point_count = module_ptr[idx++]; - if (entry_point_count < 1) { + uint32_t shader_count = module_ptr[idx++]; + if (shader_count < 1) { return nullptr; // no descriptors } - // allocate an array of entry point data + // allocate an array of shader bindings (one for each entry point in the module) VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; - size_t entry_point_data_size = entry_point_count * sizeof(VulkanEntryPointData); - VulkanEntryPointData *entry_point_data = (VulkanEntryPointData *)vk_host_malloc(user_context, entry_point_data_size, 0, alloc_scope, allocator->callbacks()); - if (entry_point_data == nullptr) { - error(user_context) << "Vulkan: Failed to allocate entry_point_data! Out of memory!\n"; + size_t shader_bindings_size = shader_count * sizeof(VulkanShaderBinding); + VulkanShaderBinding *shader_bindings = (VulkanShaderBinding *)vk_host_malloc(user_context, shader_bindings_size, 0, alloc_scope, allocator->callbacks()); + if (shader_bindings == nullptr) { + error(user_context) << "Vulkan: Failed to allocate shader_bindings! 
Out of memory!\n"; return nullptr; } - memset(entry_point_data, 0, entry_point_data_size); + memset(shader_bindings, 0, shader_bindings_size); + + // decode and fill in the shader binding for each entry point + for (uint32_t n = 0; (n < shader_count) && (idx < module_entries); n++) { + halide_debug_assert(user_context, (idx + 8) < module_entries); // should be at least 8 entries + + // [0] Length of entry point name (padded to nearest word size) + uint32_t entry_point_name_length = module_ptr[idx++]; - // decode and fill in each entry point - for (uint32_t n = 0; (n < entry_point_count) && (idx < module_entries); n++) { - halide_debug_assert(user_context, (idx + 4) < module_entries); + // [*] Entry point string data (padded with null chars) + const char *entry_point_name = (const char *)(module_ptr + idx); // NOTE: module owns string data + idx += entry_point_name_length; // skip past string data + + // [1] Number of uniform buffers for this descriptor set uint32_t uniform_buffer_count = module_ptr[idx++]; + + // [2] Number of storage buffers for this descriptor set uint32_t storage_buffer_count = module_ptr[idx++]; - uint32_t padded_string_length = module_ptr[idx++]; - const char *entry_point_name = (const char *)(module_ptr + idx); + + // [3] Number of specialization constants for this descriptor set + uint32_t specialization_constants_count = module_ptr[idx++]; + + // Decode all specialization constants + VulkanSpecializationConstant *specialization_constants = nullptr; + if (specialization_constants_count > 0) { + + // Allocate an array to store the decoded specialization constant data + size_t specialization_constants_size = specialization_constants_count * sizeof(VulkanSpecializationConstant); + specialization_constants = (VulkanSpecializationConstant *)vk_host_malloc(user_context, specialization_constants_size, 0, alloc_scope, allocator->callbacks()); + if (specialization_constants == nullptr) { + error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!\n"; + return nullptr; + } + memset(specialization_constants, 0, specialization_constants_size); + + // For each specialization constant ... + for (uint32_t sc = 0; sc < specialization_constants_count; sc++) { + halide_debug_assert(user_context, (idx + 4) < module_entries); // should be at least 4 entries + + // [0] Length of constant name string (padded to nearest word size) + uint32_t constant_name_length = module_ptr[idx++]; + + // [*] Constant name string data (padded with null chars) + const char *constant_name = (const char *)(module_ptr + idx); + specialization_constants[sc].constant_name = constant_name; // NOTE: module owns string data + idx += constant_name_length; // skip past string data + + // [1] Constant id (as used in VkSpecializationMapEntry for binding) + specialization_constants[sc].constant_id = module_ptr[idx++]; + + // [2] Size of data type (in bytes) + specialization_constants[sc].type_size = module_ptr[idx++]; + } + } + + // [4] Number of shared memory allocations for this descriptor set + uint32_t shared_memory_allocations_count = module_ptr[idx++]; // [3] + + // Decode all shared memory allocations ... 
+ VulkanSharedMemoryAllocation *shared_memory_allocations = nullptr; + if (shared_memory_allocations_count > 0) { + + // Allocate an array to store the decoded shared memory allocation data + size_t shared_memory_allocations_size = shared_memory_allocations_count * sizeof(VulkanSharedMemoryAllocation); + shared_memory_allocations = (VulkanSharedMemoryAllocation *)vk_host_malloc(user_context, shared_memory_allocations_size, 0, alloc_scope, allocator->callbacks()); + if (shared_memory_allocations == nullptr) { + error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!\n"; + return nullptr; + } + memset(shared_memory_allocations, 0, shared_memory_allocations_size); + + // For each shared memory allocation ... + for (uint32_t sm = 0; sm < shared_memory_allocations_count && (idx < module_entries); sm++) { + halide_debug_assert(user_context, (idx + 4) < module_entries); // should be at least 4 entries + + // [0] Length of variable name string (padded to nearest word size) + uint32_t variable_name_length = module_ptr[idx++]; + + // [*] Variable name string data (padded with null chars) + const char *variable_name = (const char *)(module_ptr + idx); + shared_memory_allocations[sm].variable_name = variable_name; // NOTE: module owns string data + idx += variable_name_length; // skip past string data + + // [1] Constant id to use for overriding array size + shared_memory_allocations[sm].constant_id = module_ptr[idx++]; + + // [2] Size of data type (in bytes) + shared_memory_allocations[sm].type_size = module_ptr[idx++]; + + // [3] Size of array (ie element count) + shared_memory_allocations[sm].array_size = module_ptr[idx++]; + } + } + + // [4] Dynamic workgroup dimensions bound to specialization constants + halide_debug_assert(user_context, (idx + 3) < module_entries); // should be at least 3 entries + for (uint32_t dim = 0; dim < 3 && (idx < module_entries); dim++) { + shader_bindings[n].dispatch_data.local_size_binding.constant_id[dim] = module_ptr[idx++]; + } debug(user_context) << " [" << n << "] " - << "uniform_buffer_count=" << uniform_buffer_count << " " - << "storage_buffer_count=" << storage_buffer_count << " " - << "entry_point_name_length=" << padded_string_length << " " - << "entry_point_name: " << (const char *)entry_point_name << "\n"; - - entry_point_data[n].entry_point_name = entry_point_name; // NOTE: module owns string data - entry_point_data[n].uniform_buffer_count = uniform_buffer_count; - entry_point_data[n].storage_buffer_count = storage_buffer_count; - idx += (padded_string_length / sizeof(uint32_t)); // skip past string data + << "entry_point_name='" << (const char *)entry_point_name << "'\n"; + + debug(user_context) << " uniform_buffer_count=" << uniform_buffer_count << "\n" + << " storage_buffer_count=" << storage_buffer_count << "\n"; + + debug(user_context) << " specialization_constants_count=" << specialization_constants_count << "\n"; + for (uint32_t sc = 0; sc < specialization_constants_count; sc++) { + debug(user_context) << " [" << sc << "] " + << "constant_name='" << (const char *)specialization_constants[sc].constant_name << "' " + << "constant_id=" << specialization_constants[sc].constant_id << " " + << "type_size=" << specialization_constants[sc].type_size << "\n"; + } + + debug(user_context) << " shared_memory_allocations_count=" << shared_memory_allocations_count << "\n"; + for (uint32_t sm = 0; sm < shared_memory_allocations_count; sm++) { + debug(user_context) << " [" << sm << "] " + << "variable_name='" << (const char 
*)shared_memory_allocations[sm].variable_name << "' " + << "constant_id=" << shared_memory_allocations[sm].constant_id << " " + << "type_size=" << shared_memory_allocations[sm].type_size << " " + << "array_size=" << shared_memory_allocations[sm].array_size << "\n"; + } + debug(user_context) << " local_size_binding=["; + for (uint32_t dim = 0; dim < 3 && (idx < module_entries); dim++) { + debug(user_context) << shader_bindings[n].dispatch_data.local_size_binding.constant_id[dim] << " "; + } + debug(user_context) << "]\n"; + shader_bindings[n].entry_point_name = entry_point_name; // NOTE: module owns string data + shader_bindings[n].uniform_buffer_count = uniform_buffer_count; + shader_bindings[n].storage_buffer_count = storage_buffer_count; + shader_bindings[n].specialization_constants_count = specialization_constants_count; + shader_bindings[n].specialization_constants = specialization_constants; + shader_bindings[n].shared_memory_allocations_count = shared_memory_allocations_count; + shader_bindings[n].shared_memory_allocations = shared_memory_allocations; } #ifdef DEBUG_RUNTIME @@ -792,7 +1106,7 @@ VulkanEntryPointData *vk_decode_entry_point_data(void *user_context, VulkanMemor debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return entry_point_data; + return shader_bindings; } VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, VulkanMemoryAllocator *allocator, @@ -815,7 +1129,7 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan halide_debug_assert(user_context, module_size >= (2 * sizeof(uint32_t))); uint32_t header_word_count = module_ptr[0]; - uint32_t entry_point_count = module_ptr[1]; + uint32_t shader_count = module_ptr[1]; uint32_t header_size = header_word_count * sizeof(uint32_t); // skip past the preamble header to the start of the SPIR-V binary @@ -848,27 +1162,27 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan memset(cache_entry, 0, sizeof(VulkanCompilationCacheEntry)); // decode the entry point data and save it in the cache entry - cache_entry->entry_point_data = vk_decode_entry_point_data(user_context, allocator, module_ptr, module_size); - if (cache_entry->entry_point_data != nullptr) { - cache_entry->entry_point_count = entry_point_count; + cache_entry->shader_bindings = vk_decode_shader_bindings(user_context, allocator, module_ptr, module_size); + if (cache_entry->shader_bindings != nullptr) { + cache_entry->shader_count = shader_count; } VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &cache_entry->shader_module); if ((result != VK_SUCCESS)) { error(user_context) << "Vulkan: vkCreateShaderModule Failed! 
Error returned: " << vk_get_error_name(result) << "\n"; - vk_host_free(user_context, cache_entry->entry_point_data, allocator->callbacks()); + vk_host_free(user_context, cache_entry->shader_bindings, allocator->callbacks()); vk_host_free(user_context, cache_entry, allocator->callbacks()); return nullptr; } // allocate an array for storing the descriptor set layouts - if (cache_entry->entry_point_count) { - cache_entry->descriptor_set_layouts = (VkDescriptorSetLayout *)vk_host_malloc(user_context, cache_entry->entry_point_count * sizeof(VkDescriptorSetLayout), 0, alloc_scope, allocator->callbacks()); + if (cache_entry->shader_count) { + cache_entry->descriptor_set_layouts = (VkDescriptorSetLayout *)vk_host_malloc(user_context, cache_entry->shader_count * sizeof(VkDescriptorSetLayout), 0, alloc_scope, allocator->callbacks()); if (cache_entry->descriptor_set_layouts == nullptr) { error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!\n"; return nullptr; } - memset(cache_entry->descriptor_set_layouts, 0, cache_entry->entry_point_count * sizeof(VkDescriptorSetLayout)); + memset(cache_entry->descriptor_set_layouts, 0, cache_entry->shader_count * sizeof(VkDescriptorSetLayout)); } #ifdef DEBUG_RUNTIME @@ -906,29 +1220,37 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); cache_entry->shader_module = {0}; } - if (cache_entry->entry_point_data) { - for (uint32_t n = 0; n < cache_entry->entry_point_count; n++) { - if (cache_entry->entry_point_data[n].args_region) { - vk_destroy_scalar_uniform_buffer(user_context, allocator, cache_entry->entry_point_data[n].args_region); - cache_entry->entry_point_data[n].args_region = nullptr; + if (cache_entry->shader_bindings) { + for (uint32_t n = 0; n < cache_entry->shader_count; n++) { + if (cache_entry->shader_bindings[n].args_region) { + vk_destroy_scalar_uniform_buffer(user_context, allocator, cache_entry->shader_bindings[n].args_region); + cache_entry->shader_bindings[n].args_region = nullptr; + } + if (cache_entry->shader_bindings[n].descriptor_pool) { + vk_destroy_descriptor_pool(user_context, allocator, cache_entry->shader_bindings[n].descriptor_pool); + cache_entry->shader_bindings[n].descriptor_pool = {0}; + } + if (cache_entry->shader_bindings[n].compute_pipeline) { + vk_destroy_compute_pipeline(user_context, allocator, cache_entry->shader_bindings[n].compute_pipeline); + cache_entry->shader_bindings[n].compute_pipeline = {0}; } - if (cache_entry->entry_point_data[n].descriptor_pool) { - vk_destroy_descriptor_pool(user_context, allocator, cache_entry->entry_point_data[n].descriptor_pool); - cache_entry->entry_point_data[n].descriptor_pool = {0}; + if (cache_entry->shader_bindings[n].specialization_constants) { + vk_host_free(user_context, cache_entry->shader_bindings[n].specialization_constants, allocator->callbacks()); + cache_entry->shader_bindings[n].specialization_constants = nullptr; } - if (cache_entry->entry_point_data[n].compute_pipeline) { - vk_destroy_compute_pipeline(user_context, allocator, cache_entry->entry_point_data[n].compute_pipeline); - cache_entry->entry_point_data[n].compute_pipeline = {0}; + if (cache_entry->shader_bindings[n].shared_memory_allocations) { + vk_host_free(user_context, cache_entry->shader_bindings[n].shared_memory_allocations, allocator->callbacks()); + cache_entry->shader_bindings[n].shared_memory_allocations = nullptr; } } - 
vk_host_free(user_context, cache_entry->entry_point_data, allocator->callbacks()); - cache_entry->entry_point_data = nullptr; - cache_entry->entry_point_count = 0; + vk_host_free(user_context, cache_entry->shader_bindings, allocator->callbacks()); + cache_entry->shader_bindings = nullptr; + cache_entry->shader_count = 0; } if (cache_entry->descriptor_set_layouts) { - for (uint32_t n = 0; n < cache_entry->entry_point_count; n++) { - debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->entry_point_data[n].entry_point_name << "\n"; + for (uint32_t n = 0; n < cache_entry->shader_count; n++) { + debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->shader_bindings[n].entry_point_name << "\n"; vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); cache_entry->descriptor_set_layouts[n] = {0}; } diff --git a/test/correctness/async_copy_chain.cpp b/test/correctness/async_copy_chain.cpp index dadd3ffd45f3..45b014c4bd8b 100644 --- a/test/correctness/async_copy_chain.cpp +++ b/test/correctness/async_copy_chain.cpp @@ -25,12 +25,6 @@ int main(int argc, char **argv) { return 0; } - if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { - printf("[SKIP] Skipping test for Vulkan as it does not support copy_to_host/device() yet" - " (halide_buffer_copy is unimplemented in that backend).\n"); - return 0; - } - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { printf("Skipping test for OpenGLCompute as it does not support copy_to_host/device() yet" " (halide_buffer_copy is unimplemented in that backend).\n"); diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index 091f8315e617..7df99b759d4d 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -10,16 +10,22 @@ int main(int argc, char **argv) { return 0; } - if (t.has_feature(Target::Vulkan)) { - printf("[SKIP] Skipping test for Vulkan, as it does not support dynamically-sized shared memory\n"); - return 0; - } - if (t.has_feature(Target::OpenGLCompute)) { printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n"); return 0; } + if (t.has_feature(Target::Vulkan)) { + const auto *interface = get_device_interface_for_device_api(DeviceAPI::Vulkan); + assert(interface->compute_capability != nullptr); + int major, minor; + int err = interface->compute_capability(nullptr, &major, &minor); + if (err != 0 || (major == 1 && minor < 2)) { + printf("[SKIP] Vulkan %d.%d is less than required 1.2.\n", major, minor); + return 0; + } + } + // Check dynamic allocations per-block and per-thread into both // shared and global for (int per_thread = 0; per_thread < 2; per_thread++) { diff --git a/test/correctness/gpu_mixed_shared_mem_types.cpp b/test/correctness/gpu_mixed_shared_mem_types.cpp index 9126a31ac266..ff4b343eabf4 100644 --- a/test/correctness/gpu_mixed_shared_mem_types.cpp +++ b/test/correctness/gpu_mixed_shared_mem_types.cpp @@ -48,7 +48,6 @@ int main(int argc, char **argv) { // Metal does not support 64-bit integers. // neither does D3D12 under SM 5.1. 
if ((t.supports_device_api(DeviceAPI::Metal) || - t.supports_device_api(DeviceAPI::Vulkan) || t.supports_device_api(DeviceAPI::OpenGLCompute) || t.supports_device_api(DeviceAPI::D3D12Compute)) && types[i].bits() >= 64) { diff --git a/test/correctness/gpu_reuse_shared_memory.cpp b/test/correctness/gpu_reuse_shared_memory.cpp index 2eda324a4d2b..899b882d25d4 100644 --- a/test/correctness/gpu_reuse_shared_memory.cpp +++ b/test/correctness/gpu_reuse_shared_memory.cpp @@ -191,8 +191,6 @@ int main(int argc, char **argv) { printf("Running dynamic shared test\n"); if (t.has_feature(Target::OpenGLCompute) && memory_type == MemoryType::GPUShared) { printf("Skipping test because GL doesn't support dynamic sizes for shared memory\n"); - } else if (t.has_feature(Target::Vulkan) && memory_type == MemoryType::GPUShared) { - printf("Skipping test because Vulkan doesn't support dynamic sizes for shared memory\n"); } else { if (dynamic_shared_test(memory_type) != 0) { return -1; diff --git a/test/correctness/math.cpp b/test/correctness/math.cpp index 1315de6cc9c9..d21bad318b42 100644 --- a/test/correctness/math.cpp +++ b/test/correctness/math.cpp @@ -123,57 +123,65 @@ struct TestArgs { // for another day. // Version for a one argument function. -#define fun_1(type_ret, type, name, c_name) \ - void test_##type##_##name(Buffer in) { \ - Target target = get_jit_target_from_environment(); \ - if (!target.supports_type(type_of())) { \ - return; \ - } \ - Func test_##name("test_" #name); \ - Var x("x"), xi("xi"); \ - test_##name(x) = name(in(x)); \ - if (target.has_gpu_feature()) { \ - test_##name.gpu_tile(x, xi, 8); \ - } else if (target.has_feature(Target::HVX)) { \ - test_##name.hexagon(); \ - } \ - Buffer result = test_##name.realize({in.extent(0)}, target); \ - for (int i = 0; i < in.extent(0); i++) { \ - type_ret c_result = c_name(in(i)); \ - if (!relatively_equal(c_result, result(i), target)) { \ - fprintf(stderr, "For " #name "(%.20f) == %.20f from C and %.20f from %s.\n", \ - (double)in(i), (double)c_result, (double)result(i), \ - target.to_string().c_str()); \ - num_errors++; \ - } \ - } \ +#define fun_1(type_ret, type, name, c_name) \ + void test_##type##_##name(Buffer in) { \ + Type type_of_type = type_of(); \ + Target target = get_jit_target_from_environment(); \ + if (!target.supports_type(type_of_type)) { \ + return; \ + } \ + if (target.has_feature(Target::Vulkan) && (type_of_type.is_float() && type_of_type.bits() > 32)) { \ + return; \ + } \ + Func test_##name("test_" #name); \ + Var x("x"), xi("xi"); \ + test_##name(x) = name(in(x)); \ + if (target.has_gpu_feature()) { \ + test_##name.gpu_tile(x, xi, 8); \ + } else if (target.has_feature(Target::HVX)) { \ + test_##name.hexagon(); \ + } \ + Buffer result = test_##name.realize({in.extent(0)}, target); \ + for (int i = 0; i < in.extent(0); i++) { \ + type_ret c_result = c_name(in(i)); \ + if (!relatively_equal(c_result, result(i), target)) { \ + fprintf(stderr, "For " #name "(%.20f) == %.20f from C and %.20f from %s.\n", \ + (double)in(i), (double)c_result, (double)result(i), \ + target.to_string().c_str()); \ + num_errors++; \ + } \ + } \ } // Version for a two argument function -#define fun_2(type_ret, type, name, c_name) \ - void test_##type##_##name(Buffer in) { \ - Target target = get_jit_target_from_environment(); \ - if (!target.supports_type(type_of())) { \ - return; \ - } \ - Func test_##name("test_" #name); \ - Var x("x"), xi("xi"); \ - test_##name(x) = name(in(0, x), in(1, x)); \ - if (target.has_gpu_feature()) { \ - 
test_##name.gpu_tile(x, xi, 8); \ - } else if (target.has_feature(Target::HVX)) { \ - test_##name.hexagon(); \ - } \ - Buffer result = test_##name.realize({in.height()}, target); \ - for (int i = 0; i < in.height(); i++) { \ - type_ret c_result = c_name(in(0, i), in(1, i)); \ - if (!relatively_equal(c_result, result(i), target)) { \ - fprintf(stderr, "For " #name "(%.20f, %.20f) == %.20f from C and %.20f from %s.\n", \ - (double)in(0, i), (double)in(1, i), (double)c_result, (double)result(i), \ - target.to_string().c_str()); \ - num_errors++; \ - } \ - } \ +#define fun_2(type_ret, type, name, c_name) \ + void test_##type##_##name(Buffer in) { \ + Type type_of_type = type_of(); \ + Target target = get_jit_target_from_environment(); \ + if (!target.supports_type(type_of_type)) { \ + return; \ + } \ + if (target.has_feature(Target::Vulkan) && (type_of_type.is_float() && type_of_type.bits() > 32)) { \ + return; \ + } \ + Func test_##name("test_" #name); \ + Var x("x"), xi("xi"); \ + test_##name(x) = name(in(0, x), in(1, x)); \ + if (target.has_gpu_feature()) { \ + test_##name.gpu_tile(x, xi, 8); \ + } else if (target.has_feature(Target::HVX)) { \ + test_##name.hexagon(); \ + } \ + Buffer result = test_##name.realize({in.height()}, target); \ + for (int i = 0; i < in.height(); i++) { \ + type_ret c_result = c_name(in(0, i), in(1, i)); \ + if (!relatively_equal(c_result, result(i), target)) { \ + fprintf(stderr, "For " #name "(%.20f, %.20f) == %.20f from C and %.20f from %s.\n", \ + (double)in(0, i), (double)in(1, i), (double)c_result, (double)result(i), \ + target.to_string().c_str()); \ + num_errors++; \ + } \ + } \ } // clang-format off diff --git a/test/correctness/newtons_method.cpp b/test/correctness/newtons_method.cpp index eeef000f1d20..acc37e79c9bd 100644 --- a/test/correctness/newtons_method.cpp +++ b/test/correctness/newtons_method.cpp @@ -10,8 +10,14 @@ using namespace Halide; template int find_pi() { // Skip test if data type is not supported by the target. + Type type = type_of(); Target target = get_jit_target_from_environment(); - if (!target.supports_type(type_of())) { + if (!target.supports_type(type)) { + return 0; + } + + // Vulkan lacks trig functions for 64-bit floats ... skip + if (target.has_feature(Target::Vulkan) && (type.is_float() && type.bits() > 32)) { return 0; } From f1c004d716c11c39dce75555ad3a431ad1e49522 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 5 Dec 2022 14:22:31 -0800 Subject: [PATCH 118/166] Enable Vulkan asyc_device_copy test. --- test/correctness/async_device_copy.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/correctness/async_device_copy.cpp b/test/correctness/async_device_copy.cpp index 2443a8fc01b9..490dc75b8dc1 100644 --- a/test/correctness/async_device_copy.cpp +++ b/test/correctness/async_device_copy.cpp @@ -28,12 +28,6 @@ int main(int argc, char **argv) { return 0; } - if (get_jit_target_from_environment().has_feature(Target::Vulkan)) { - printf("[SKIP] Skipping test for Vulkan as it does not support copy_to_host/device() yet" - " (halide_buffer_copy is unimplemented in that backend).\n"); - return 0; - } - // Compute frames on GPU/CPU, and then sum then on // CPU/GPU. async() lets us overlap the CPU computation with the // copies. From 9805a291f35608bfa022f8224e3ab53835a0386c Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 5 Dec 2022 14:22:53 -0800 Subject: [PATCH 119/166] Disable Vulkan performance test for async gpu (for now). 
--- test/performance/async_gpu.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/performance/async_gpu.cpp b/test/performance/async_gpu.cpp index 7e8cd6abb966..9033e7848ec8 100644 --- a/test/performance/async_gpu.cpp +++ b/test/performance/async_gpu.cpp @@ -20,6 +20,11 @@ int main(int argc, char **argv) { return 0; } + if (target.has_feature(Target::Vulkan)) { + printf("[SKIP] Skipping test for Vulkan. Async performance needs to be improved before this test will pass.\n"); + return 0; + } + if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; From f0cc13b43372da53b715162a6efcbb4315e407c7 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 5 Dec 2022 14:23:42 -0800 Subject: [PATCH 120/166] Disable Vulkan from python AOT tests and tutorials (since it requires linkage against the vulkan loader system library). --- python_bindings/apps/CMakeLists.txt | 5 +++++ python_bindings/tutorial/CMakeLists.txt | 7 +++++++ test/generator/CMakeLists.txt | 3 ++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python_bindings/apps/CMakeLists.txt b/python_bindings/apps/CMakeLists.txt index fd4298975a4b..1709ea434e9e 100644 --- a/python_bindings/apps/CMakeLists.txt +++ b/python_bindings/apps/CMakeLists.txt @@ -3,6 +3,11 @@ if (TARGET_WEBASSEMBLY AND Halide_TARGET MATCHES "wasm") return() endif () +if (TARGET_VULKAN AND Halide_TARGET MATCHES "vulkan") + message(WARNING "Python apps are skipped under Vulkan.") + return() +endif () + set(TEST_TMPDIR "$") set(TEST_IMAGES_DIR "$") diff --git a/python_bindings/tutorial/CMakeLists.txt b/python_bindings/tutorial/CMakeLists.txt index 3c2bcb674060..b304f6fd8c78 100644 --- a/python_bindings/tutorial/CMakeLists.txt +++ b/python_bindings/tutorial/CMakeLists.txt @@ -24,6 +24,11 @@ foreach (test IN LISTS tests) continue() endif () + if (TARGET_VULKAN AND Halide_TARGET MATCHES "vulkan" AND test MATCHES "lesson_10") + message(WARNING "Not all tutorials build under Vulkan.") + continue() + endif () + cmake_path(GET test STEM test_name) add_python_test( FILE "${test}" @@ -34,6 +39,8 @@ endforeach () if (TARGET_WEBASSEMBLY AND Halide_TARGET MATCHES "wasm") message(WARNING "Not all tutorials build under WASM.") +elseif (TARGET_VULKAN AND Halide_TARGET MATCHES "vulkan") + message(WARNING "Not all tutorials build under Vulkan.") else () ## Add some hacks for getting CMake to delay compiling lesson_10_halide until after the test has run. The "better" way ## of doing this might be to treat lesson 10 like an app and give it its own CMakeLists.txt, but since this is a one-off diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index 7a9b6b000072..a7b57821dbc7 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -245,7 +245,8 @@ halide_define_aot_test(float16_t) # gpu_multi_context_threaded_aottest.cpp # gpu_multi_context_threaded_generator.cpp # (Doesn't build/link properly under wasm, and isn't useful there anyway) -if (NOT Halide_TARGET MATCHES "wasm") +# (Vulkan doesn't build/link properly and adding custom context creation is too much effort) +if ((NOT Halide_TARGET MATCHES "wasm") AND (NOT Halide_TARGET MATCHES "vulkan")) halide_define_aot_test(gpu_multi_context_threaded OMIT_DEFAULT_GENERATOR EXTRA_LIBS From f75273497b3e85c1e0d8555d6a02b3aede3d412a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 5 Dec 2022 14:29:20 -0800 Subject: [PATCH 121/166] Update Vulkan readme with latest status. Everything works! More or less. 
=) --- README_vulkan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_vulkan.md b/README_vulkan.md index 55bde5e1acc1..b179adc61840 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -205,7 +205,7 @@ All correctness tests are now passing on tested configs for Linux & Windows usin MacOS passes most tests but encounters internal MoltenVK code translation issues for wide vectors, and ambiguously function calls. -Most of the Python app and correctness tests are failing, but are being investigated now. +Python apps, tutorials and correctness tests are now passing, but the AOT cases are skipped since the runtime environment needs to be customized to locate the platform specific Vulkan loader library. Android platform support is currently being worked on. From 09908f3c3433ee1180d899719e28559eeec02755 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 5 Dec 2022 14:34:35 -0800 Subject: [PATCH 122/166] Clang format pass --- test/performance/async_gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/performance/async_gpu.cpp b/test/performance/async_gpu.cpp index 9033e7848ec8..4f6ae99635ce 100644 --- a/test/performance/async_gpu.cpp +++ b/test/performance/async_gpu.cpp @@ -24,7 +24,7 @@ int main(int argc, char **argv) { printf("[SKIP] Skipping test for Vulkan. Async performance needs to be improved before this test will pass.\n"); return 0; } - + if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; From 446b34f64b08652768a05ad7582f8693f0b726b0 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:16:39 -0800 Subject: [PATCH 123/166] Cleanup formatting for Halide version info in Makefile --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3b25ded8d981..a8e93b95ac9e 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,11 @@ WITH_LLVM_INSIDE_SHARED_LIBHALIDE ?= not-empty HL_TARGET ?= host HL_JIT_TARGET ?= host -HL_VERSION_FLAGS=-DHALIDE_VERSION="$(HALIDE_VERSION)" -DHALIDE_VERSION_MAJOR=$(HALIDE_VERSION_MAJOR) -DHALIDE_VERSION_MINOR=$(HALIDE_VERSION_MINOR) -DHALIDE_VERSION_PATCH=$(HALIDE_VERSION_PATCH) +HL_VERSION_FLAGS = \ + -DHALIDE_VERSION="$(HALIDE_VERSION)" \ + -DHALIDE_VERSION_MAJOR=$(HALIDE_VERSION_MAJOR) \ + -DHALIDE_VERSION_MINOR=$(HALIDE_VERSION_MINOR) \ + -DHALIDE_VERSION_PATCH=$(HALIDE_VERSION_PATCH) X86_CXX_FLAGS=$(if $(WITH_X86), -DWITH_X86, ) X86_LLVM_CONFIG_LIB=$(if $(WITH_X86), x86, ) From 9a54485235ed6eb99b0511755e8dc55322152b0f Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:17:24 -0800 Subject: [PATCH 124/166] Fix typos and address review comments for Vulkan readme --- README_vulkan.md | 67 +++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index b179adc61840..a70d2b3b40ad 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -3,7 +3,7 @@ Halide supports the Khronos Vulkan framework as a compute API backend for GPU-like devices, and compiles directly to a binary SPIR-V representation as part of its code generation before submitting it to the Vulkan API. Both JIT and AOT usage -are supported via the `vulkan` target flag (eg `HL_JIT_TARGET=host-vulkan`). +are supported via the `vulkan` target flag (e.g. `HL_JIT_TARGET=host-vulkan`). Vulkan support is actively under development, and considered *BETA* quality at this stage. 
Tests are passing, but performance tuning and user testing is needed @@ -63,7 +63,7 @@ https://www.intel.com/content/www/us/en/download-center/home.html ## Linux -On Ubuntu Linux v22.04, the vulkan runtime is distributed in the `vulkan-tools` package. For earlier versions of Ubuntu (eg v20.x or v18.x) the contents of the `vulkan-tools` package was distributed as `vulkan-utils` so use this package instead. +On Ubuntu Linux v22.04, the vulkan runtime is distributed in the `vulkan-tools` package. For earlier versions of Ubuntu (e.g. v20.x or v18.x) the contents of the `vulkan-tools` package was distributed as `vulkan-utils` so use that package instead. Proprietary drivers can be installed via 'apt' using PPA's for each vendor. Examples for AMD and NVIDIA are provided below. @@ -80,14 +80,14 @@ For NVIDIA on Ubuntu v22.04: $ sudo add-apt-repository ppa:graphics-drivers/ppa $ sudo apt update $ sudo apt upgrade -# - replace ### with latest driver release (eg 515) +# - replace ### with latest driver release (e.g. 515) $ sudo apt install nvidia-driver-### nvidia-settings vulkan vulkan-tools ``` -Note that only valid drivers for your system should be installed since there's been +Note that only valid drivers for your system should be installed since there are reports of the Vulkan loader segfaulting just by having a non-supported driver present. Specifically, the seemingly generic `mesa-vulkan-drivers` actually includes the AMD -graphics driver, which can cause problems if installed on an NVIDIA only system. +graphics driver, which can cause problems if installed on an NVIDIA-only system. ## Mac @@ -145,7 +145,7 @@ To generate Halide code for Vulkan, simply add the `vulkan` flag to your target | `vk_v12` | Generates code compatible with the Vulkan v1.2+ API | | `vk_v13` | Generates code compatible with the Vulkan v1.3+ API | -Note that all optional device features are off by default (since they are not required by the Vulkan API, and thus must be explicitly enabled to ensure that the code being generated will be compatible with the device and API version being used for execution). +Note that 32-bit integer and floating-point types are always available. All other optional device features are off by default (since they are not required by the Vulkan API, and thus must be explicitly enabled to ensure that the code being generated will be compatible with the device and API version being used for execution). For AOT generators add `vulkan` (and any other flags you wish to use) to the target command line option: @@ -159,12 +159,18 @@ For JIT apps use the `HL_JIT_TARGET` environment variable: $ HL_JIT_TARGET=host-vulkan-vk_int8-vk_int16 ./tutorial/lesson_01_basics ``` -# Useful Environment Variables +# Useful Runtime Environment Variables + +To modify the default behavior of the runtime, the following environment +variables can be used to adjust the configuration of the Vulkan backend +at execution time: `HL_VK_LAYERS=...` will tell Halide to choose a suitable Vulkan instance that supports the given list of layers. If not set, `VK_INSTANCE_LAYERS=...` will be used instead. If neither are present, Halide will use the first -Vulkan compute device it can find. +Vulkan compute device it can find. Multiple layers can be specified using +the appropriate environment variable list delimiter (`:` on Linux/OSX/Posix, +or `;` on Windows). `HL_VK_DEVICE_TYPE=...` will tell Halide to choose which type of device to select for creating the Vulkan instance. 
Valid options are 'gpu', @@ -172,22 +178,35 @@ to select for creating the Vulkan instance. Valid options are 'gpu', Halide will search for the first 'gpu' like device it can find, or fall back to the first compute device it can find. -`HL_VK_MIN_BLOCK_SIZE=N` will tell Halide to configure the Vulkan memory -allocator to always request a minimum of N megabytes for a resource block, -which will be used as a pool for suballocations. Increasing this value -may improve performance while sacrificing the amount of available device -memory. Default is 32MB. +`HL_VK_ALLOC_CONFIG=...` will tell Halide to configure the Vulkan memory +allocator use the given constraints specified as three integer values +separated by the appropriate environment variable list delimiter +(e.g. `N:N:N` on Linux/OSX/Posix, or `N;N;N` on Windows). These values +correspond to `minimum_block_size`, `maximum_block_size` and +`maximum_block_count`. + +The `minimum_block_size` constraint will tell Halide to configure the +Vulkan memory allocator to always request a minimum of N megabytes for +a resource block, which will be used as a pool for suballocations. +Increasing this value may improve performance while sacrificing the amount +of available device memory. Default is 32MB. + +The `maximum_block_size` constraint will tell Halide to configure the +Vulkan memory allocator to never exceed a maximum of N megabytes for a +resource block. Decreasing this value may free up more memory but may +impact performance, and/or restrict allocations to be unusably small. +Default is 0 ... meaning no limit. + +The `maximum_block_count` constraint will tell Halide to configure the +Vulkan memory allocator to never exceed a total of N block allocations. +Decreasing this value may free up more memory but may impact performance, +and/or restrict allocations. Default is 0 ... meaning no limit. + -`HL_VK_MAX_BLOCK_SIZE=N` will tell Halide to configure the Vulkan memory -allocator to never exceed a maximum of N megabytes for a resource block, -which will be used as a pool for suballocations. Decreasing this value -may free up more memory but may impact performance, and/or restrict -allocations to be unusably small. Default is 0 ... meaning no limit. +# Debug Environment Variables -`HL_VK_MAX_BLOCK_COUNT=N` will tell Halide to configure the Vulkan memory -allocator to never exceed a total of N block allocations. Decreasing this -value may free up more memory but may impact performance, and/or restrict -allocations. Default is 0 ... meaning no limit. +The following environment variables may be useful for tracking down potential +issues related to Vulkan: `HL_DEBUG_CODEGEN=3` will print out debug info that includees the SPIR-V code generator used for Vulkan while it is compiling. @@ -203,7 +222,7 @@ https://github.com/KhronosGroup/SPIRV-Tools All correctness tests are now passing on tested configs for Linux & Windows using the target `host-vulkan-vk_int8-vk_int16-vk_int64-vk_float16-vk_float64-vk_v13` on LLVM v14.x. -MacOS passes most tests but encounters internal MoltenVK code translation issues for wide vectors, and ambiguously function calls. +MacOS passes most tests but encounters internal MoltenVK code translation issues for wide vectors, and ambiguous function calls. Python apps, tutorials and correctness tests are now passing, but the AOT cases are skipped since the runtime environment needs to be customized to locate the platform specific Vulkan loader library. @@ -219,7 +238,7 @@ Android platform support is currently being worked on. 
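
For reference, here is a minimal sketch of how the JIT target flags and runtime
configuration described above fit together from C++. This example is editorial
and illustrative only (it is not part of the Halide sources or of this change):
the chosen feature flags, allocator values, pipeline and sizes are arbitrary,
`setenv` is POSIX-only (a Windows build would use `_putenv_s` instead), and it
assumes the allocator configuration is read before the first Vulkan pipeline
executes.

```
#include "Halide.h"
#include <cstdlib>

using namespace Halide;

int main() {
    // Equivalent to HL_VK_ALLOC_CONFIG=64:0:0 on the command line:
    // request a 64MB minimum block size, with no maximum block size
    // and no maximum block count (0 means "no limit").
    setenv("HL_VK_ALLOC_CONFIG", "64:0:0", /*overwrite*/ 1);

    // Equivalent to HL_JIT_TARGET=host-vulkan-vk_int8-vk_int16: enable the
    // Vulkan backend plus optional 8-bit and 16-bit integer support.
    Target target("host-vulkan-vk_int8-vk_int16");

    // A trivial pipeline, scheduled onto the GPU.
    Func f("f");
    Var x("x"), y("y"), xi("xi"), yi("yi");
    f(x, y) = x + y;
    f.gpu_tile(x, y, xi, yi, 8, 8);

    // JIT-compile for the Vulkan target and run it.
    Buffer<int> out = f.realize({256, 256}, target);
    out.copy_to_host();
    return 0;
}
```

The same pipeline can instead pick up its target from the environment via
`get_jit_target_from_environment()`, which is how the correctness tests in this
tree select the Vulkan backend.
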
# Known TODO: - Performance tuning of CodeGen and Runtime -- More platform support (Android is WIP, RISC-V, etc) +- More platform support (Android is work-in-progress, RISC-V, etc) - Adapt unsupported types to supported types (if missing vk_int8 then promote to uint32_t)? - Better debugging utilities using the Vulkan debug hooks. - Allow debug symbols to be stripped from SPIR-V during codegen to reduce From 3f731c2d9dd2afa1103fae00bd87659688662839 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:18:11 -0800 Subject: [PATCH 125/166] Change value casts to match Halide conventions --- src/CodeGen_Vulkan_Dev.cpp | 39 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 933a9ca74549..0cc276d505a2 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -536,51 +536,51 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type if (!value_type.is_bool()) { value_id = cast_type(Bool(), value_type, value_id); } - std::vector true_data(target_type.bytes(), uint8_t(0)); - std::vector false_data(target_type.bytes(), uint8_t(0)); + std::vector true_data(target_type.bytes(), (uint8_t)0); + std::vector false_data(target_type.bytes(), (uint8_t)0); for (int i = 0; i < target_type.lanes(); ++i) { if (target_type.is_int_or_uint() && target_type.bits() == 8) { int8_t *td = reinterpret_cast(&true_data[0]); int8_t *fd = reinterpret_cast(&false_data[0]); - td[i] = int8_t(1); - fd[i] = int8_t(0); + td[i] = (int8_t)1; + fd[i] = (int8_t)0; } else if (target_type.is_int_or_uint() && target_type.bits() == 16) { int16_t *td = reinterpret_cast(&true_data[0]); int16_t *fd = reinterpret_cast(&false_data[0]); - td[i] = int16_t(1); - fd[i] = int16_t(0); + td[i] = (int16_t)1; + fd[i] = (int16_t)0; } else if (target_type.is_int_or_uint() && target_type.bits() == 32) { int32_t *td = reinterpret_cast(&true_data[0]); int32_t *fd = reinterpret_cast(&false_data[0]); - td[i] = int32_t(1); - fd[i] = int32_t(0); + td[i] = (int32_t)1; + fd[i] = (int32_t)0; } else if (target_type.is_int_or_uint() && target_type.bits() == 64) { int64_t *td = reinterpret_cast(&true_data[0]); int64_t *fd = reinterpret_cast(&false_data[0]); - td[i] = int64_t(1); - fd[i] = int64_t(0); + td[i] = (int64_t)1; + fd[i] = (int64_t)0; } else if (target_type.is_float() && target_type.bits() == 16) { if (target_type.is_bfloat()) { bfloat16_t *td = reinterpret_cast(&true_data[0]); bfloat16_t *fd = reinterpret_cast(&false_data[0]); - td[i] = bfloat16_t(1.0f); - fd[i] = bfloat16_t(0.0f); + td[i] = (bfloat16_t)1.0f; + fd[i] = (bfloat16_t)0.0f; } else { float16_t *td = reinterpret_cast(&true_data[0]); float16_t *fd = reinterpret_cast(&false_data[0]); - td[i] = float16_t(1.0f); - fd[i] = float16_t(0.0f); + td[i] = (float16_t)1.0f; + fd[i] = (float16_t)0.0f; } } else if (target_type.is_float() && target_type.bits() == 32) { float *td = reinterpret_cast(&true_data[0]); float *fd = reinterpret_cast(&false_data[0]); - td[i] = float(1.0f); - fd[i] = float(0.0f); + td[i] = (float)1.0f; + fd[i] = (float)0.0f; } else if (target_type.is_float() && target_type.bits() == 64) { double *td = reinterpret_cast(&true_data[0]); double *fd = reinterpret_cast(&false_data[0]); - td[i] = double(1.0); - fd[i] = double(0.0); + td[i] = (double)1.0; + fd[i] = (double)0.0; } else { user_error << "Unhandled type cast from value type '" << value_type << "' to target type '" << target_type << "'!"; } @@ -615,7 +615,6 @@ 
SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::cast_type(Type target_type, Type value_ } } else if (value_type.is_bool()) { op_code = SpvOpSelect; - } else if (value_type.is_uint()) { if (target_type.is_float()) { op_code = SpvOpConvertUToF; @@ -1103,7 +1102,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } if (cond.type().is_vector()) { scalarize(op); - } else { // Generate Phi node if used as an expression. internal_assert(op->args.size() == 2 || op->args.size() == 3); @@ -2245,7 +2243,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::reset() { } void CodeGen_Vulkan_Dev::SPIRV_Emitter::init_module() { - reset(); if (target.has_feature(Target::VulkanV13)) { From 93c5df53858ac88426a03233da8a638d68958253 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:18:57 -0800 Subject: [PATCH 126/166] Fix typos in comments --- src/SpirvIR.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SpirvIR.h b/src/SpirvIR.h index ccb7774d8983..d92eaa696bd7 100644 --- a/src/SpirvIR.h +++ b/src/SpirvIR.h @@ -367,7 +367,7 @@ class SpvBuilder { // Reserve a unique ID to use for identifying a specifc kind of SPIR-V result **/ SpvId reserve_id(SpvKind = SpvResultId); - // Look up the specific kind of SPIR-V item from it's unique ID + // Look up the specific kind of SPIR-V item from its unique ID SpvKind kind_of(SpvId id) const; // Get a human readable name for a specific kind of SPIR-V item @@ -406,7 +406,7 @@ class SpvBuilder { SpvId declare_symbol(const std::string &symbol, SpvId id, SpvId scope_id); // Top level creation methods for adding new items ... these have a limited - // number of checks and the caller must insure that duplicates aren't created + // number of checks and the caller must ensure that duplicates aren't created SpvId add_type(const Type &type, uint32_t array_size = 1); SpvId add_struct(const std::string &name, const StructMemberTypes &member_types); SpvId add_array_with_default_size(SpvId base_type_id, SpvId array_size_id); From c526891a6494b8f25423e57ff57493701be4f983 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:20:55 -0800 Subject: [PATCH 127/166] Add static_assert to rotl to make compilation errors clearer (instead of using enable_if) Fix debug(3) formatting to avoid super long messages Use lookup table for SPIR-V op code names --- src/SpirvIR.cpp | 2111 +++++++++++++++++------------------------------ 1 file changed, 775 insertions(+), 1336 deletions(-) diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 997cb12ed955..73fdd7f24871 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -113,8 +113,12 @@ inline uint32_t spirv_minor_version(uint32_t version) { return ((version >> 8) & 0xff); } +/** Returns the name string for a given SPIR-V operand **/ +const std::string &spirv_op_name(SpvId op); + template -typename std::enable_if::value, T>::type constexpr rotl(const T n, const S i) { +T constexpr rotl(const T n, const S i) { + static_assert(std::is_unsigned::value, "rotl only works on unsigned types"); const T m = (std::numeric_limits::digits - 1); const T c = i & m; return (n << c) | (n >> ((T(0) - c) & m)); @@ -974,7 +978,6 @@ SpvBuilder::SpvBuilder() { } void SpvBuilder::reset() { - active_id = SpvInvalidId; active_function = SpvFunction(); active_block = SpvBlock(); @@ -1237,9 +1240,12 @@ SpvId SpvBuilder::declare_variable(const std::string &name, SpvId type_id, SpvSt if (existing_id != SpvInvalidId) { return existing_id; } - SpvId var_id = reserve_id(SpvVariableId); - debug(3) << " declare_variable: %" 
<< var_id << " name='" << name << "' type_id=" << type_id << " storage_class=" << (uint32_t)storage_class << " init_id=" << init_id << "\n"; + debug(3) << " declare_variable: %" << var_id << "\n" + << " name='" << name << "'\n" + << " type_id=" << type_id << "\n" + << " storage_class=" << (uint32_t)storage_class << "\n" + << " init_id=" << init_id << "\n"; current_function().entry_block().add_variable(SpvFactory::variable(var_id, type_id, storage_class, init_id)); declare_symbol(name, var_id, block_id); storage_class_map[var_id] = storage_class; @@ -1249,7 +1255,11 @@ SpvId SpvBuilder::declare_variable(const std::string &name, SpvId type_id, SpvSt SpvId SpvBuilder::declare_global_variable(const std::string &name, SpvId type_id, SpvStorageClass storage_class, SpvId init_id) { SpvId var_id = reserve_id(SpvVariableId); - debug(3) << " declare_global_variable: %" << var_id << " name='" << name << "' type_id=" << type_id << " storage_class=" << (uint32_t)storage_class << " init_id=" << init_id << "\n"; + debug(3) << " declare_global_variable: %" << var_id << "\n" + << " name='" << name << "'\n" + << " type_id=" << type_id << "\n" + << " storage_class=" << (uint32_t)storage_class << "\n" + << " init_id=" << init_id << "\n"; module.add_global(SpvFactory::variable(var_id, type_id, storage_class, init_id)); declare_symbol(name, var_id, module.id()); storage_class_map[var_id] = storage_class; @@ -1264,7 +1274,10 @@ void SpvBuilder::add_entry_point(SpvId func_id, SpvExecutionModel exec_model, if (func_name.empty()) { internal_error << "SPIRV: Function missing name definition: " << func_id << "\n"; } else { - debug(3) << " add_entry_point: %" << func_id << " func_name='" << func_name << "' exec_model=" << (uint32_t)exec_model << " variable_count=" << (uint32_t)variables.size() << "\n"; + debug(3) << " add_entry_point: %" << func_id << "\n" + << " func_name='" << func_name << "'\n" + << " exec_model=" << (uint32_t)exec_model << "\n" + << " variable_count=" << (uint32_t)variables.size() << "\n"; SpvInstruction inst = SpvFactory::entry_point(exec_model, func_id, func_name, variables); module.add_entry_point(func_name, inst); } @@ -1273,8 +1286,10 @@ void SpvBuilder::add_entry_point(SpvId func_id, SpvExecutionModel exec_model, SpvId SpvBuilder::add_function(const std::string &name, SpvId return_type_id, const ParamTypes ¶m_types) { SpvId func_id = make_id(SpvFunctionId); SpvId func_type_id = declare_function_type(return_type_id, param_types); - - debug(3) << " add_function: %" << func_id << " func_type_id=" << func_type_id << " return_type_id=" << return_type_id << " parameter_count=" << (uint32_t)param_types.size() << "\n"; + debug(3) << " add_function: %" << func_id << "\n" + << " func_type_id=" << func_type_id << "\n" + << " return_type_id=" << return_type_id << "\n" + << " parameter_count=" << (uint32_t)param_types.size() << "\n"; SpvFunction func = SpvFunction::make(func_type_id, func_id, return_type_id); for (SpvId param_type_id : param_types) { SpvId param_id = make_id(SpvParameterId); @@ -1292,7 +1307,9 @@ SpvId SpvBuilder::add_function(const std::string &name, SpvId return_type_id, co void SpvBuilder::add_annotation(SpvId target_id, SpvDecoration decoration_type, const Literals &literals) { SpvInstruction inst = SpvFactory::decorate(target_id, decoration_type, literals); - debug(3) << " add_annotation: %" << target_id << " decoration_type=" << uint32_t(decoration_type) << " literals=["; + debug(3) << " add_annotation: %" << target_id << "\n" + << " decoration_type=" << uint32_t(decoration_type) 
<< "\n" + << " literals=["; for (uint32_t v : literals) { debug(3) << " " << v; } @@ -1302,7 +1319,10 @@ void SpvBuilder::add_annotation(SpvId target_id, SpvDecoration decoration_type, void SpvBuilder::add_struct_annotation(SpvId struct_type_id, uint32_t member_index, SpvDecoration decoration_type, const Literals &literals) { SpvInstruction inst = SpvFactory::decorate_member(struct_type_id, member_index, decoration_type, literals); - debug(3) << " add_struct_annotation: %" << struct_type_id << " member_index=" << member_index << " decoration_type=" << uint32_t(decoration_type) << " literals=["; + debug(3) << " add_struct_annotation: %" << struct_type_id << "\n" + << " member_index=" << member_index << "\n" + << " decoration_type=" << uint32_t(decoration_type) << "\n" + << " literals=["; for (uint32_t v : literals) { debug(3) << " " << v; } @@ -1381,7 +1401,9 @@ void SpvBuilder::add_symbol(const std::string &symbol, SpvId id, SpvId scope_id) symbol_id_map[symbol] = id; id_symbol_map[id] = symbol; scope_map[id] = scope_id; - debug(3) << " add_symbol: %" << id << " symbol='" << symbol << "' scope_id=" << scope_id << "\n"; + debug(3) << " add_symbol: %" << id << "\n" + << " symbol='" << symbol << "'\n" + << " scope_id=" << scope_id << "\n"; module.add_debug_symbol(id, symbol); } @@ -1516,7 +1538,6 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { } if (array_size > 1) { - // first declare the array size as a uint32 constant value Type array_size_type = UInt(32); ConstantKey constant_key = make_constant_key(array_size_type, &array_size); @@ -1529,7 +1550,9 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { // declare the array type SpvId array_type_id = make_id(SpvArrayTypeId); SpvId element_type_id = add_type(type, 1); - debug(3) << " add_array_type: %" << array_type_id << " element_type_id='" << element_type_id << " array_size='" << array_size << "\n"; + debug(3) << " add_array_type: %" << array_type_id << "\n" + << " element_type_id='" << element_type_id << "\n" + << " array_size='" << array_size << "\n"; SpvInstruction inst = SpvFactory::array_type(array_type_id, element_type_id, array_size_id); module.add_type(inst); type_map[type_key] = array_type_id; @@ -1540,7 +1563,9 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { if (type.is_vector()) { type_id = make_id(SpvVectorTypeId); SpvId element_type_id = add_type(type.with_lanes(1)); - debug(3) << " add_vector_type: %" << type_id << " element_type_id='" << element_type_id << " lanes='" << type.lanes() << "\n"; + debug(3) << " add_vector_type: %" << type_id << "\n" + << " element_type_id='" << element_type_id << "\n" + << " lanes='" << type.lanes() << "\n"; SpvInstruction inst = SpvFactory::vector_type(type_id, element_type_id, type.lanes()); module.add_type(inst); } else { @@ -1556,7 +1581,8 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { module.add_type(inst); } else if (type.is_float()) { type_id = make_id(SpvFloatTypeId); - debug(3) << " add_float_type: %" << type_id << " bits=" << type.bits() << "\n"; + debug(3) << " add_float_type: %" << type_id << "\n" + << " bits=" << type.bits() << "\n"; SpvInstruction inst = SpvFactory::float_type(type_id, type.bits()); module.add_type(inst); if (type.bits() == 16) { @@ -1572,7 +1598,9 @@ SpvId SpvBuilder::add_type(const Type &type, uint32_t array_size) { } type_id = make_id(signedness ? 
SpvIntTypeId : SpvUIntTypeId); - debug(3) << " add_integer_type: %" << type_id << " bits=" << type.bits() << " signed=" << (signedness ? "true" : "false") << "\n"; + debug(3) << " add_integer_type: %" << type_id << "\n" + << " bits=" << type.bits() << "\n" + << " signed=" << (signedness ? "true" : "false") << "\n"; SpvInstruction inst = SpvFactory::integer_type(type_id, type.bits(), signedness); module.add_type(inst); if (type.bits() == 8) { @@ -1624,7 +1652,9 @@ SpvId SpvBuilder::add_struct(const std::string &struct_name, const StructMemberT } SpvId struct_type_id = make_id(SpvStructTypeId); - debug(3) << " add_struct_type: %" << struct_type_id << " name=" << struct_name << " member_type_ids=["; + debug(3) << " add_struct_type: %" << struct_type_id << "\n" + << " name=" << struct_name << "\n" + << " member_type_ids=["; for (SpvId m : member_type_ids) { debug(3) << " " << m; } @@ -1675,7 +1705,9 @@ SpvId SpvBuilder::lookup_pointer_type(SpvId base_type_id, SpvStorageClass storag SpvId SpvBuilder::add_pointer_type(const Type &type, SpvStorageClass storage_class) { SpvId base_type_id = declare_type(type); - debug(3) << " add_pointer_type: base_type=" << type << " base_type_id=" << base_type_id << " storage_class=" << (uint32_t)(storage_class) << "\n"; + debug(3) << " add_pointer_type: " << type << "\n" + << " base_type_id=" << base_type_id << "\n" + << " storage_class=" << (uint32_t)(storage_class) << "\n"; if (base_type_id == SpvInvalidId) { internal_error << "SPIRV: Attempted to create pointer type for undeclared base type! " << type << "\n"; } @@ -1694,7 +1726,9 @@ SpvId SpvBuilder::add_pointer_type(SpvId base_type_id, SpvStorageClass storage_c } SpvId pointer_type_id = make_id(SpvPointerTypeId); - debug(3) << " add_pointer_type: %" << pointer_type_id << " base_type_id=" << base_type_id << " storage_class=" << (uint32_t)(storage_class) << "\n"; + debug(3) << " add_pointer_type: %" << pointer_type_id << "\n" + << " base_type_id=" << base_type_id << "\n" + << " storage_class=" << (uint32_t)(storage_class) << "\n"; SpvInstruction inst = SpvFactory::pointer_type(pointer_type_id, storage_class, base_type_id); module.add_type(inst); pointer_type_map[key] = pointer_type_id; @@ -1931,6 +1965,7 @@ SpvId SpvBuilder::declare_scalar_constant(const Type &scalar_type, const void *d return it->second; } + // TODO: Maybe add a templated Lambda to clean up this data conversion? 
SpvId result_id = SpvInvalidId; if (scalar_type.is_bool() && data) { bool value = *reinterpret_cast(data); @@ -2141,9 +2176,10 @@ SpvId SpvBuilder::add_function_type(SpvId return_type_id, const ParamTypes ¶ if (it != function_type_map.end()) { return it->second; } - SpvId function_type_id = make_id(SpvFunctionTypeId); - debug(3) << " add_function_type: %" << function_type_id << " return_type_id=" << return_type_id << " param_type_ids=["; + debug(3) << " add_function_type: %" << function_type_id << "\n" + << " return_type_id=" << return_type_id << "\n" + << " param_type_ids=["; for (SpvId p : param_type_ids) { debug(3) << " " << p; } @@ -3133,1328 +3169,731 @@ std::ostream &operator<<(std::ostream &stream, const SpvInstruction &inst) { stream << " = "; } - switch (inst.op_code()) { - case SpvOpNop: { - stream << "OpNop"; - break; - } - case SpvOpUndef: { - stream << "OpUndef"; - break; - } - case SpvOpSourceContinued: { - stream << "OpSourceContinued"; - break; - } - case SpvOpSource: { - stream << "OpSource"; - break; - } - case SpvOpSourceExtension: { - stream << "OpSourceExtension"; - break; - } - case SpvOpName: { - stream << "OpName"; - break; - } - case SpvOpMemberName: { - stream << "OpMemberName"; - break; - } - case SpvOpString: { - stream << "OpString"; - break; - } - case SpvOpLine: { - stream << "OpLine"; - break; - } - case SpvOpExtension: { - stream << "OpExtension"; - break; - } - case SpvOpExtInstImport: { - stream << "OpExtInstImport"; - break; - } - case SpvOpExtInst: { - stream << "OpExtInst"; - break; - } - case SpvOpMemoryModel: { - stream << "OpMemoryModel"; - break; - } - case SpvOpEntryPoint: { - stream << "OpEntryPoint"; - break; - } - case SpvOpExecutionMode: { - stream << "OpExecutionMode"; - break; - } - case SpvOpCapability: { - stream << "OpCapability"; - break; - } - case SpvOpTypeVoid: { - stream << "OpTypeVoid"; - break; - } - case SpvOpTypeBool: { - stream << "OpTypeBool"; - break; - } - case SpvOpTypeInt: { - stream << "OpTypeInt"; - break; - } - case SpvOpTypeFloat: { - stream << "OpTypeFloat"; - break; - } - case SpvOpTypeVector: { - stream << "OpTypeVector"; - break; - } - case SpvOpTypeMatrix: { - stream << "OpTypeMatrix"; - break; - } - case SpvOpTypeImage: { - stream << "OpTypeImage"; - break; - } - case SpvOpTypeSampler: { - stream << "OpTypeSampler"; - break; - } - case SpvOpTypeSampledImage: { - stream << "OpTypeSampledImage"; - break; - } - case SpvOpTypeArray: { - stream << "OpTypeArray"; - break; - } - case SpvOpTypeRuntimeArray: { - stream << "OpTypeRuntimeArray"; - break; - } - case SpvOpTypeStruct: { - stream << "OpTypeStruct"; - break; - } - case SpvOpTypeOpaque: { - stream << "OpTypeOpaque"; - break; - } - case SpvOpTypePointer: { - stream << "OpTypePointer"; - break; - } - case SpvOpTypeFunction: { - stream << "OpTypeFunction"; - break; - } - case SpvOpTypeEvent: { - stream << "OpTypeEvent"; - break; - } - case SpvOpTypeDeviceEvent: { - stream << "OpTypeDeviceEvent"; - break; - } - case SpvOpTypeReserveId: { - stream << "OpTypeReserveId"; - break; - } - case SpvOpTypeQueue: { - stream << "OpTypeQueue"; - break; - } - case SpvOpTypePipe: { - stream << "OpTypePipe"; - break; - } - case SpvOpTypeForwardPointer: { - stream << "OpTypeForwardPointer"; - break; - } - case SpvOpConstantTrue: { - stream << "OpConstantTrue"; - break; - } - case SpvOpConstantFalse: { - stream << "OpConstantFalse"; - break; - } - case SpvOpConstant: { - stream << "OpConstant"; - break; - } - case SpvOpConstantComposite: { - stream << "OpConstantComposite"; - break; - 
} - case SpvOpConstantSampler: { - stream << "OpConstantSampler"; - break; - } - case SpvOpConstantNull: { - stream << "OpConstantNull"; - break; - } - case SpvOpSpecConstantTrue: { - stream << "OpSpecConstantTrue"; - break; - } - case SpvOpSpecConstantFalse: { - stream << "OpSpecConstantFalse"; - break; - } - case SpvOpSpecConstant: { - stream << "OpSpecConstant"; - break; - } - case SpvOpSpecConstantComposite: { - stream << "OpSpecConstantComposite"; - break; - } - case SpvOpSpecConstantOp: { - stream << "OpSpecConstantOp"; - break; - } - case SpvOpFunction: { - stream << "OpFunction"; - break; - } - case SpvOpFunctionParameter: { - stream << "OpFunctionParameter"; - break; - } - case SpvOpFunctionEnd: { - stream << "OpFunctionEnd"; - break; - } - case SpvOpFunctionCall: { - stream << "OpFunctionCall"; - break; - } - case SpvOpVariable: { - stream << "OpVariable"; - break; - } - case SpvOpImageTexelPointer: { - stream << "OpImageTexelPointer"; - break; - } - case SpvOpLoad: { - stream << "OpLoad"; - break; - } - case SpvOpStore: { - stream << "OpStore"; - break; - } - case SpvOpCopyMemory: { - stream << "OpCopyMemory"; - break; - } - case SpvOpCopyMemorySized: { - stream << "OpCopyMemorySized"; - break; - } - case SpvOpAccessChain: { - stream << "OpAccessChain"; - break; - } - case SpvOpInBoundsAccessChain: { - stream << "OpInBoundsAccessChain"; - break; - } - case SpvOpPtrAccessChain: { - stream << "OpPtrAccessChain"; - break; - } - case SpvOpArrayLength: { - stream << "OpArrayLength"; - break; - } - case SpvOpGenericPtrMemSemantics: { - stream << "OpGenericPtrMemSemantics"; - break; - } - case SpvOpInBoundsPtrAccessChain: { - stream << "OpInBoundsPtrAccessChain"; - break; - } - case SpvOpDecorate: { - stream << "OpDecorate"; - break; - } - case SpvOpMemberDecorate: { - stream << "OpMemberDecorate"; - break; - } - case SpvOpDecorationGroup: { - stream << "OpDecorationGroup"; - break; - } - case SpvOpGroupDecorate: { - stream << "OpGroupDecorate"; - break; - } - case SpvOpGroupMemberDecorate: { - stream << "OpGroupMemberDecorate"; - break; - } - case SpvOpVectorExtractDynamic: { - stream << "OpVectorExtractDynamic"; - break; - } - case SpvOpVectorInsertDynamic: { - stream << "OpVectorInsertDynamic"; - break; - } - case SpvOpVectorShuffle: { - stream << "OpVectorShuffle"; - break; - } - case SpvOpCompositeConstruct: { - stream << "OpCompositeConstruct"; - break; - } - case SpvOpCompositeExtract: { - stream << "OpCompositeExtract"; - break; - } - case SpvOpCompositeInsert: { - stream << "OpCompositeInsert"; - break; - } - case SpvOpCopyObject: { - stream << "OpCopyObject"; - break; - } - case SpvOpTranspose: { - stream << "OpTranspose"; - break; - } - case SpvOpSampledImage: { - stream << "OpSampledImage"; - break; - } - case SpvOpImageSampleImplicitLod: { - stream << "OpImageSampleImplicitLod"; - break; - } - case SpvOpImageSampleExplicitLod: { - stream << "OpImageSampleExplicitLod"; - break; - } - case SpvOpImageSampleDrefImplicitLod: { - stream << "OpImageSampleDrefImplicitLod"; - break; - } - case SpvOpImageSampleDrefExplicitLod: { - stream << "OpImageSampleDrefExplicitLod"; - break; - } - case SpvOpImageSampleProjImplicitLod: { - stream << "OpImageSampleProjImplicitLod"; - break; - } - case SpvOpImageSampleProjExplicitLod: { - stream << "OpImageSampleProjExplicitLod"; - break; - } - case SpvOpImageSampleProjDrefImplicitLod: { - stream << "OpImageSampleProjDrefImplicitLod"; - break; - } - case SpvOpImageSampleProjDrefExplicitLod: { - stream << "OpImageSampleProjDrefExplicitLod"; - 
break; - } - case SpvOpImageFetch: { - stream << "OpImageFetch"; - break; - } - case SpvOpImageGather: { - stream << "OpImageGather"; - break; - } - case SpvOpImageDrefGather: { - stream << "OpImageDrefGather"; - break; - } - case SpvOpImageRead: { - stream << "OpImageRead"; - break; - } - case SpvOpImageWrite: { - stream << "OpImageWrite"; - break; - } - case SpvOpImage: { - stream << "OpImage"; - break; - } - case SpvOpImageQueryFormat: { - stream << "OpImageQueryFormat"; - break; - } - case SpvOpImageQueryOrder: { - stream << "OpImageQueryOrder"; - break; - } - case SpvOpImageQuerySizeLod: { - stream << "OpImageQuerySizeLod"; - break; - } - case SpvOpImageQuerySize: { - stream << "OpImageQuerySize"; - break; - } - case SpvOpImageQueryLod: { - stream << "OpImageQueryLod"; - break; - } - case SpvOpImageQueryLevels: { - stream << "OpImageQueryLevels"; - break; - } - case SpvOpImageQuerySamples: { - stream << "OpImageQuerySamples"; - break; - } - case SpvOpConvertFToU: { - stream << "OpConvertFToU"; - break; - } - case SpvOpConvertFToS: { - stream << "OpConvertFToS"; - break; - } - case SpvOpConvertSToF: { - stream << "OpConvertSToF"; - break; - } - case SpvOpConvertUToF: { - stream << "OpConvertUToF"; - break; - } - case SpvOpUConvert: { - stream << "OpUConvert"; - break; - } - case SpvOpSConvert: { - stream << "OpSConvert"; - break; - } - case SpvOpFConvert: { - stream << "OpFConvert"; - break; + stream << spirv_op_name(inst.op_code()); + + if (inst.has_type()) { + stream << std::string(" %") << std::to_string(inst.type_id()); } - case SpvOpConvertPtrToU: { - stream << "OpConvertPtrToU"; - break; + + for (uint32_t i = 0; i < inst.length(); i++) { + if (inst.is_immediate(i)) { + if (inst.value_type(i) == SpvStringData) { + const char *str = (const char *)inst.data(i); + stream << std::string(" \"") << str << "\""; + break; + } else if (inst.value_type(i) == SpvIntegerData) { + const int *data = (const int *)inst.data(i); + stream << std::string(" ") << std::to_string(*data); + break; + } else if (inst.value_type(i) == SpvFloatData) { + const float *data = (const float *)inst.data(i); + stream << std::string(" ") << std::to_string(*data); + break; + } else if (inst.value_type(i) == SpvBitMaskLiteral) { + stream << std::string(" ") << std::hex << std::showbase << std::uppercase << inst.operand(i) << std::dec; + } else { + stream << std::string(" ") << std::to_string(inst.operand(i)); + } + } else { + stream << std::string(" %") << std::to_string(inst.operand(i)); + } } - case SpvOpSatConvertSToU: { - stream << "OpSatConvertSToU"; - break; + + stream << "\n"; + return stream; +} + +// -- + +namespace { + +/** Returns the name string for a given SPIR-V operand **/ +const std::string &spirv_op_name(SpvId op) { + using SpvOpNameMap = std::unordered_map; + static const SpvOpNameMap op_names = { + {SpvOpNop, "OpNop"}, + {SpvOpUndef, "OpUndef"}, + {SpvOpSourceContinued, "OpSourceContinued"}, + {SpvOpSource, "OpSource"}, + {SpvOpSourceExtension, "OpSourceExtension"}, + {SpvOpName, "OpName"}, + {SpvOpMemberName, "OpMemberName"}, + {SpvOpString, "OpString"}, + {SpvOpLine, "OpLine"}, + {SpvOpExtension, "OpExtension"}, + {SpvOpExtInstImport, "OpExtInstImport"}, + {SpvOpExtInst, "OpExtInst"}, + {SpvOpMemoryModel, "OpMemoryModel"}, + {SpvOpEntryPoint, "OpEntryPoint"}, + {SpvOpExecutionMode, "OpExecutionMode"}, + {SpvOpCapability, "OpCapability"}, + {SpvOpTypeVoid, "OpTypeVoid"}, + {SpvOpTypeBool, "OpTypeBool"}, + {SpvOpTypeInt, "OpTypeInt"}, + {SpvOpTypeFloat, "OpTypeFloat"}, + {SpvOpTypeVector, 
"OpTypeVector"}, + {SpvOpTypeMatrix, "OpTypeMatrix"}, + {SpvOpTypeImage, "OpTypeImage"}, + {SpvOpTypeSampler, "OpTypeSampler"}, + {SpvOpTypeSampledImage, "OpTypeSampledImage"}, + {SpvOpTypeArray, "OpTypeArray"}, + {SpvOpTypeRuntimeArray, "OpTypeRuntimeArray"}, + {SpvOpTypeStruct, "OpTypeStruct"}, + {SpvOpTypeOpaque, "OpTypeOpaque"}, + {SpvOpTypePointer, "OpTypePointer"}, + {SpvOpTypeFunction, "OpTypeFunction"}, + {SpvOpTypeEvent, "OpTypeEvent"}, + {SpvOpTypeDeviceEvent, "OpTypeDeviceEvent"}, + {SpvOpTypeReserveId, "OpTypeReserveId"}, + {SpvOpTypeQueue, "OpTypeQueue"}, + {SpvOpTypePipe, "OpTypePipe"}, + {SpvOpTypeForwardPointer, "OpTypeForwardPointer"}, + {SpvOpConstantTrue, "OpConstantTrue"}, + {SpvOpConstantFalse, "OpConstantFalse"}, + {SpvOpConstant, "OpConstant"}, + {SpvOpConstantComposite, "OpConstantComposite"}, + {SpvOpConstantSampler, "OpConstantSampler"}, + {SpvOpConstantNull, "OpConstantNull"}, + {SpvOpSpecConstantTrue, "OpSpecConstantTrue"}, + {SpvOpSpecConstantFalse, "OpSpecConstantFalse"}, + {SpvOpSpecConstant, "OpSpecConstant"}, + {SpvOpSpecConstantComposite, "OpSpecConstantComposite"}, + {SpvOpSpecConstantOp, "OpSpecConstantOp"}, + {SpvOpFunction, "OpFunction"}, + {SpvOpFunctionParameter, "OpFunctionParameter"}, + {SpvOpFunctionEnd, "OpFunctionEnd"}, + {SpvOpFunctionCall, "OpFunctionCall"}, + {SpvOpVariable, "OpVariable"}, + {SpvOpImageTexelPointer, "OpImageTexelPointer"}, + {SpvOpLoad, "OpLoad"}, + {SpvOpStore, "OpStore"}, + {SpvOpCopyMemory, "OpCopyMemory"}, + {SpvOpCopyMemorySized, "OpCopyMemorySized"}, + {SpvOpAccessChain, "OpAccessChain"}, + {SpvOpInBoundsAccessChain, "OpInBoundsAccessChain"}, + {SpvOpPtrAccessChain, "OpPtrAccessChain"}, + {SpvOpArrayLength, "OpArrayLength"}, + {SpvOpGenericPtrMemSemantics, "OpGenericPtrMemSemantics"}, + {SpvOpInBoundsPtrAccessChain, "OpInBoundsPtrAccessChain"}, + {SpvOpDecorate, "OpDecorate"}, + {SpvOpMemberDecorate, "OpMemberDecorate"}, + {SpvOpDecorationGroup, "OpDecorationGroup"}, + {SpvOpGroupDecorate, "OpGroupDecorate"}, + {SpvOpGroupMemberDecorate, "OpGroupMemberDecorate"}, + {SpvOpVectorExtractDynamic, "OpVectorExtractDynamic"}, + {SpvOpVectorInsertDynamic, "OpVectorInsertDynamic"}, + {SpvOpVectorShuffle, "OpVectorShuffle"}, + {SpvOpCompositeConstruct, "OpCompositeConstruct"}, + {SpvOpCompositeExtract, "OpCompositeExtract"}, + {SpvOpCompositeInsert, "OpCompositeInsert"}, + {SpvOpCopyObject, "OpCopyObject"}, + {SpvOpTranspose, "OpTranspose"}, + {SpvOpSampledImage, "OpSampledImage"}, + {SpvOpImageSampleImplicitLod, "OpImageSampleImplicitLod"}, + {SpvOpImageSampleExplicitLod, "OpImageSampleExplicitLod"}, + {SpvOpImageSampleDrefImplicitLod, "OpImageSampleDrefImplicitLod"}, + {SpvOpImageSampleDrefExplicitLod, "OpImageSampleDrefExplicitLod"}, + {SpvOpImageSampleProjImplicitLod, "OpImageSampleProjImplicitLod"}, + {SpvOpImageSampleProjExplicitLod, "OpImageSampleProjExplicitLod"}, + {SpvOpImageSampleProjDrefImplicitLod, "OpImageSampleProjDrefImplicitLod"}, + {SpvOpImageSampleProjDrefExplicitLod, "OpImageSampleProjDrefExplicitLod"}, + {SpvOpImageFetch, "OpImageFetch"}, + {SpvOpImageGather, "OpImageGather"}, + {SpvOpImageDrefGather, "OpImageDrefGather"}, + {SpvOpImageRead, "OpImageRead"}, + {SpvOpImageWrite, "OpImageWrite"}, + {SpvOpImage, "OpImage"}, + {SpvOpImageQueryFormat, "OpImageQueryFormat"}, + {SpvOpImageQueryOrder, "OpImageQueryOrder"}, + {SpvOpImageQuerySizeLod, "OpImageQuerySizeLod"}, + {SpvOpImageQuerySize, "OpImageQuerySize"}, + {SpvOpImageQueryLod, "OpImageQueryLod"}, + {SpvOpImageQueryLevels, "OpImageQueryLevels"}, + 
{SpvOpImageQuerySamples, "OpImageQuerySamples"}, + {SpvOpConvertFToU, "OpConvertFToU"}, + {SpvOpConvertFToS, "OpConvertFToS"}, + {SpvOpConvertSToF, "OpConvertSToF"}, + {SpvOpConvertUToF, "OpConvertUToF"}, + {SpvOpUConvert, "OpUConvert"}, + {SpvOpSConvert, "OpSConvert"}, + {SpvOpFConvert, "OpFConvert"}, + {SpvOpQuantizeToF16, "OpQuantizeToF16"}, + {SpvOpConvertPtrToU, "OpConvertPtrToU"}, + {SpvOpSatConvertSToU, "OpSatConvertSToU"}, + {SpvOpSatConvertUToS, "OpSatConvertUToS"}, + {SpvOpConvertUToPtr, "OpConvertUToPtr"}, + {SpvOpPtrCastToGeneric, "OpPtrCastToGeneric"}, + {SpvOpGenericCastToPtr, "OpGenericCastToPtr"}, + {SpvOpGenericCastToPtrExplicit, "OpGenericCastToPtrExplicit"}, + {SpvOpBitcast, "OpBitcast"}, + {SpvOpSNegate, "OpSNegate"}, + {SpvOpFNegate, "OpFNegate"}, + {SpvOpIAdd, "OpIAdd"}, + {SpvOpFAdd, "OpFAdd"}, + {SpvOpISub, "OpISub"}, + {SpvOpFSub, "OpFSub"}, + {SpvOpIMul, "OpIMul"}, + {SpvOpFMul, "OpFMul"}, + {SpvOpUDiv, "OpUDiv"}, + {SpvOpSDiv, "OpSDiv"}, + {SpvOpFDiv, "OpFDiv"}, + {SpvOpUMod, "OpUMod"}, + {SpvOpSRem, "OpSRem"}, + {SpvOpSMod, "OpSMod"}, + {SpvOpFRem, "OpFRem"}, + {SpvOpFMod, "OpFMod"}, + {SpvOpVectorTimesScalar, "OpVectorTimesScalar"}, + {SpvOpMatrixTimesScalar, "OpMatrixTimesScalar"}, + {SpvOpVectorTimesMatrix, "OpVectorTimesMatrix"}, + {SpvOpMatrixTimesVector, "OpMatrixTimesVector"}, + {SpvOpMatrixTimesMatrix, "OpMatrixTimesMatrix"}, + {SpvOpOuterProduct, "OpOuterProduct"}, + {SpvOpDot, "OpDot"}, + {SpvOpIAddCarry, "OpIAddCarry"}, + {SpvOpISubBorrow, "OpISubBorrow"}, + {SpvOpUMulExtended, "OpUMulExtended"}, + {SpvOpSMulExtended, "OpSMulExtended"}, + {SpvOpAny, "OpAny"}, + {SpvOpAll, "OpAll"}, + {SpvOpIsNan, "OpIsNan"}, + {SpvOpIsInf, "OpIsInf"}, + {SpvOpIsFinite, "OpIsFinite"}, + {SpvOpIsNormal, "OpIsNormal"}, + {SpvOpSignBitSet, "OpSignBitSet"}, + {SpvOpLessOrGreater, "OpLessOrGreater"}, + {SpvOpOrdered, "OpOrdered"}, + {SpvOpUnordered, "OpUnordered"}, + {SpvOpLogicalEqual, "OpLogicalEqual"}, + {SpvOpLogicalNotEqual, "OpLogicalNotEqual"}, + {SpvOpLogicalOr, "OpLogicalOr"}, + {SpvOpLogicalAnd, "OpLogicalAnd"}, + {SpvOpLogicalNot, "OpLogicalNot"}, + {SpvOpSelect, "OpSelect"}, + {SpvOpIEqual, "OpIEqual"}, + {SpvOpINotEqual, "OpINotEqual"}, + {SpvOpUGreaterThan, "OpUGreaterThan"}, + {SpvOpSGreaterThan, "OpSGreaterThan"}, + {SpvOpUGreaterThanEqual, "OpUGreaterThanEqual"}, + {SpvOpSGreaterThanEqual, "OpSGreaterThanEqual"}, + {SpvOpULessThan, "OpULessThan"}, + {SpvOpSLessThan, "OpSLessThan"}, + {SpvOpULessThanEqual, "OpULessThanEqual"}, + {SpvOpSLessThanEqual, "OpSLessThanEqual"}, + {SpvOpFOrdEqual, "OpFOrdEqual"}, + {SpvOpFUnordEqual, "OpFUnordEqual"}, + {SpvOpFOrdNotEqual, "OpFOrdNotEqual"}, + {SpvOpFUnordNotEqual, "OpFUnordNotEqual"}, + {SpvOpFOrdLessThan, "OpFOrdLessThan"}, + {SpvOpFUnordLessThan, "OpFUnordLessThan"}, + {SpvOpFOrdGreaterThan, "OpFOrdGreaterThan"}, + {SpvOpFUnordGreaterThan, "OpFUnordGreaterThan"}, + {SpvOpFOrdLessThanEqual, "OpFOrdLessThanEqual"}, + {SpvOpFUnordLessThanEqual, "OpFUnordLessThanEqual"}, + {SpvOpFOrdGreaterThanEqual, "OpFOrdGreaterThanEqual"}, + {SpvOpFUnordGreaterThanEqual, "OpFUnordGreaterThanEqual"}, + {SpvOpShiftRightLogical, "OpShiftRightLogical"}, + {SpvOpShiftRightArithmetic, "OpShiftRightArithmetic"}, + {SpvOpShiftLeftLogical, "OpShiftLeftLogical"}, + {SpvOpBitwiseOr, "OpBitwiseOr"}, + {SpvOpBitwiseXor, "OpBitwiseXor"}, + {SpvOpBitwiseAnd, "OpBitwiseAnd"}, + {SpvOpNot, "OpNot"}, + {SpvOpBitFieldInsert, "OpBitFieldInsert"}, + {SpvOpBitFieldSExtract, "OpBitFieldSExtract"}, + {SpvOpBitFieldUExtract, "OpBitFieldUExtract"}, + 
{SpvOpBitReverse, "OpBitReverse"}, + {SpvOpBitCount, "OpBitCount"}, + {SpvOpDPdx, "OpDPdx"}, + {SpvOpDPdy, "OpDPdy"}, + {SpvOpFwidth, "OpFwidth"}, + {SpvOpDPdxFine, "OpDPdxFine"}, + {SpvOpDPdyFine, "OpDPdyFine"}, + {SpvOpFwidthFine, "OpFwidthFine"}, + {SpvOpDPdxCoarse, "OpDPdxCoarse"}, + {SpvOpDPdyCoarse, "OpDPdyCoarse"}, + {SpvOpFwidthCoarse, "OpFwidthCoarse"}, + {SpvOpEmitVertex, "OpEmitVertex"}, + {SpvOpEndPrimitive, "OpEndPrimitive"}, + {SpvOpEmitStreamVertex, "OpEmitStreamVertex"}, + {SpvOpEndStreamPrimitive, "OpEndStreamPrimitive"}, + {SpvOpControlBarrier, "OpControlBarrier"}, + {SpvOpMemoryBarrier, "OpMemoryBarrier"}, + {SpvOpAtomicLoad, "OpAtomicLoad"}, + {SpvOpAtomicStore, "OpAtomicStore"}, + {SpvOpAtomicExchange, "OpAtomicExchange"}, + {SpvOpAtomicCompareExchange, "OpAtomicCompareExchange"}, + {SpvOpAtomicCompareExchangeWeak, "OpAtomicCompareExchangeWeak"}, + {SpvOpAtomicIIncrement, "OpAtomicIIncrement"}, + {SpvOpAtomicIDecrement, "OpAtomicIDecrement"}, + {SpvOpAtomicIAdd, "OpAtomicIAdd"}, + {SpvOpAtomicISub, "OpAtomicISub"}, + {SpvOpAtomicSMin, "OpAtomicSMin"}, + {SpvOpAtomicUMin, "OpAtomicUMin"}, + {SpvOpAtomicSMax, "OpAtomicSMax"}, + {SpvOpAtomicUMax, "OpAtomicUMax"}, + {SpvOpAtomicAnd, "OpAtomicAnd"}, + {SpvOpAtomicOr, "OpAtomicOr"}, + {SpvOpAtomicXor, "OpAtomicXor"}, + {SpvOpPhi, "OpPhi"}, + {SpvOpLoopMerge, "OpLoopMerge"}, + {SpvOpSelectionMerge, "OpSelectionMerge"}, + {SpvOpLabel, "OpLabel"}, + {SpvOpBranch, "OpBranch"}, + {SpvOpBranchConditional, "OpBranchConditional"}, + {SpvOpSwitch, "OpSwitch"}, + {SpvOpKill, "OpKill"}, + {SpvOpReturn, "OpReturn"}, + {SpvOpReturnValue, "OpReturnValue"}, + {SpvOpUnreachable, "OpUnreachable"}, + {SpvOpLifetimeStart, "OpLifetimeStart"}, + {SpvOpLifetimeStop, "OpLifetimeStop"}, + {SpvOpGroupAsyncCopy, "OpGroupAsyncCopy"}, + {SpvOpGroupWaitEvents, "OpGroupWaitEvents"}, + {SpvOpGroupAll, "OpGroupAll"}, + {SpvOpGroupAny, "OpGroupAny"}, + {SpvOpGroupBroadcast, "OpGroupBroadcast"}, + {SpvOpGroupIAdd, "OpGroupIAdd"}, + {SpvOpGroupFAdd, "OpGroupFAdd"}, + {SpvOpGroupFMin, "OpGroupFMin"}, + {SpvOpGroupUMin, "OpGroupUMin"}, + {SpvOpGroupSMin, "OpGroupSMin"}, + {SpvOpGroupFMax, "OpGroupFMax"}, + {SpvOpGroupUMax, "OpGroupUMax"}, + {SpvOpGroupSMax, "OpGroupSMax"}, + {SpvOpReadPipe, "OpReadPipe"}, + {SpvOpWritePipe, "OpWritePipe"}, + {SpvOpReservedReadPipe, "OpReservedReadPipe"}, + {SpvOpReservedWritePipe, "OpReservedWritePipe"}, + {SpvOpReserveReadPipePackets, "OpReserveReadPipePackets"}, + {SpvOpReserveWritePipePackets, "OpReserveWritePipePackets"}, + {SpvOpCommitReadPipe, "OpCommitReadPipe"}, + {SpvOpCommitWritePipe, "OpCommitWritePipe"}, + {SpvOpIsValidReserveId, "OpIsValidReserveId"}, + {SpvOpGetNumPipePackets, "OpGetNumPipePackets"}, + {SpvOpGetMaxPipePackets, "OpGetMaxPipePackets"}, + {SpvOpGroupReserveReadPipePackets, "OpGroupReserveReadPipePackets"}, + {SpvOpGroupReserveWritePipePackets, "OpGroupReserveWritePipePackets"}, + {SpvOpGroupCommitReadPipe, "OpGroupCommitReadPipe"}, + {SpvOpGroupCommitWritePipe, "OpGroupCommitWritePipe"}, + {SpvOpEnqueueMarker, "OpEnqueueMarker"}, + {SpvOpEnqueueKernel, "OpEnqueueKernel"}, + {SpvOpGetKernelNDrangeSubGroupCount, "OpGetKernelNDrangeSubGroupCount"}, + {SpvOpGetKernelNDrangeMaxSubGroupSize, "OpGetKernelNDrangeMaxSubGroupSize"}, + {SpvOpGetKernelWorkGroupSize, "OpGetKernelWorkGroupSize"}, + {SpvOpGetKernelPreferredWorkGroupSizeMultiple, "OpGetKernelPreferredWorkGroupSizeMultiple"}, + {SpvOpRetainEvent, "OpRetainEvent"}, + {SpvOpReleaseEvent, "OpReleaseEvent"}, + {SpvOpCreateUserEvent, "OpCreateUserEvent"}, 
+ {SpvOpIsValidEvent, "OpIsValidEvent"}, + {SpvOpSetUserEventStatus, "OpSetUserEventStatus"}, + {SpvOpCaptureEventProfilingInfo, "OpCaptureEventProfilingInfo"}, + {SpvOpGetDefaultQueue, "OpGetDefaultQueue"}, + {SpvOpBuildNDRange, "OpBuildNDRange"}, + {SpvOpImageSparseSampleImplicitLod, "OpImageSparseSampleImplicitLod"}, + {SpvOpImageSparseSampleExplicitLod, "OpImageSparseSampleExplicitLod"}, + {SpvOpImageSparseSampleDrefImplicitLod, "OpImageSparseSampleDrefImplicitLod"}, + {SpvOpImageSparseSampleDrefExplicitLod, "OpImageSparseSampleDrefExplicitLod"}, + {SpvOpImageSparseSampleProjImplicitLod, "OpImageSparseSampleProjImplicitLod"}, + {SpvOpImageSparseSampleProjExplicitLod, "OpImageSparseSampleProjExplicitLod"}, + {SpvOpImageSparseSampleProjDrefImplicitLod, "OpImageSparseSampleProjDrefImplicitLod"}, + {SpvOpImageSparseSampleProjDrefExplicitLod, "OpImageSparseSampleProjDrefExplicitLod"}, + {SpvOpImageSparseFetch, "OpImageSparseFetch"}, + {SpvOpImageSparseGather, "OpImageSparseGather"}, + {SpvOpImageSparseDrefGather, "OpImageSparseDrefGather"}, + {SpvOpImageSparseTexelsResident, "OpImageSparseTexelsResident"}, + {SpvOpNoLine, "OpNoLine"}, + {SpvOpAtomicFlagTestAndSet, "OpAtomicFlagTestAndSet"}, + {SpvOpAtomicFlagClear, "OpAtomicFlagClear"}, + {SpvOpImageSparseRead, "OpImageSparseRead"}, + {SpvOpSizeOf, "OpSizeOf"}, + {SpvOpTypePipeStorage, "OpTypePipeStorage"}, + {SpvOpConstantPipeStorage, "OpConstantPipeStorage"}, + {SpvOpCreatePipeFromPipeStorage, "OpCreatePipeFromPipeStorage"}, + {SpvOpGetKernelLocalSizeForSubgroupCount, "OpGetKernelLocalSizeForSubgroupCount"}, + {SpvOpGetKernelMaxNumSubgroups, "OpGetKernelMaxNumSubgroups"}, + {SpvOpTypeNamedBarrier, "OpTypeNamedBarrier"}, + {SpvOpNamedBarrierInitialize, "OpNamedBarrierInitialize"}, + {SpvOpMemoryNamedBarrier, "OpMemoryNamedBarrier"}, + {SpvOpModuleProcessed, "OpModuleProcessed"}, + {SpvOpExecutionModeId, "OpExecutionModeId"}, + {SpvOpDecorateId, "OpDecorateId"}, + {SpvOpGroupNonUniformElect, "OpGroupNonUniformElect"}, + {SpvOpGroupNonUniformAll, "OpGroupNonUniformAll"}, + {SpvOpGroupNonUniformAny, "OpGroupNonUniformAny"}, + {SpvOpGroupNonUniformAllEqual, "OpGroupNonUniformAllEqual"}, + {SpvOpGroupNonUniformBroadcast, "OpGroupNonUniformBroadcast"}, + {SpvOpGroupNonUniformBroadcastFirst, "OpGroupNonUniformBroadcastFirst"}, + {SpvOpGroupNonUniformBallot, "OpGroupNonUniformBallot"}, + {SpvOpGroupNonUniformInverseBallot, "OpGroupNonUniformInverseBallot"}, + {SpvOpGroupNonUniformBallotBitExtract, "OpGroupNonUniformBallotBitExtract"}, + {SpvOpGroupNonUniformBallotBitCount, "OpGroupNonUniformBallotBitCount"}, + {SpvOpGroupNonUniformBallotFindLSB, "OpGroupNonUniformBallotFindLSB"}, + {SpvOpGroupNonUniformBallotFindMSB, "OpGroupNonUniformBallotFindMSB"}, + {SpvOpGroupNonUniformShuffle, "OpGroupNonUniformShuffle"}, + {SpvOpGroupNonUniformShuffleXor, "OpGroupNonUniformShuffleXor"}, + {SpvOpGroupNonUniformShuffleUp, "OpGroupNonUniformShuffleUp"}, + {SpvOpGroupNonUniformShuffleDown, "OpGroupNonUniformShuffleDown"}, + {SpvOpGroupNonUniformIAdd, "OpGroupNonUniformIAdd"}, + {SpvOpGroupNonUniformFAdd, "OpGroupNonUniformFAdd"}, + {SpvOpGroupNonUniformIMul, "OpGroupNonUniformIMul"}, + {SpvOpGroupNonUniformFMul, "OpGroupNonUniformFMul"}, + {SpvOpGroupNonUniformSMin, "OpGroupNonUniformSMin"}, + {SpvOpGroupNonUniformUMin, "OpGroupNonUniformUMin"}, + {SpvOpGroupNonUniformFMin, "OpGroupNonUniformFMin"}, + {SpvOpGroupNonUniformSMax, "OpGroupNonUniformSMax"}, + {SpvOpGroupNonUniformUMax, "OpGroupNonUniformUMax"}, + {SpvOpGroupNonUniformFMax, "OpGroupNonUniformFMax"}, 
+ {SpvOpGroupNonUniformBitwiseAnd, "OpGroupNonUniformBitwiseAnd"}, + {SpvOpGroupNonUniformBitwiseOr, "OpGroupNonUniformBitwiseOr"}, + {SpvOpGroupNonUniformBitwiseXor, "OpGroupNonUniformBitwiseXor"}, + {SpvOpGroupNonUniformLogicalAnd, "OpGroupNonUniformLogicalAnd"}, + {SpvOpGroupNonUniformLogicalOr, "OpGroupNonUniformLogicalOr"}, + {SpvOpGroupNonUniformLogicalXor, "OpGroupNonUniformLogicalXor"}, + {SpvOpGroupNonUniformQuadBroadcast, "OpGroupNonUniformQuadBroadcast"}, + {SpvOpGroupNonUniformQuadSwap, "OpGroupNonUniformQuadSwap"}, + {SpvOpCopyLogical, "OpCopyLogical"}, + {SpvOpPtrEqual, "OpPtrEqual"}, + {SpvOpPtrNotEqual, "OpPtrNotEqual"}, + {SpvOpPtrDiff, "OpPtrDiff"}, + {SpvOpTerminateInvocation, "OpTerminateInvocation"}, + {SpvOpSubgroupBallotKHR, "OpSubgroupBallotKHR"}, + {SpvOpSubgroupFirstInvocationKHR, "OpSubgroupFirstInvocationKHR"}, + {SpvOpSubgroupAllKHR, "OpSubgroupAllKHR"}, + {SpvOpSubgroupAnyKHR, "OpSubgroupAnyKHR"}, + {SpvOpSubgroupAllEqualKHR, "OpSubgroupAllEqualKHR"}, + {SpvOpGroupNonUniformRotateKHR, "OpGroupNonUniformRotateKHR"}, + {SpvOpSubgroupReadInvocationKHR, "OpSubgroupReadInvocationKHR"}, + {SpvOpTraceRayKHR, "OpTraceRayKHR"}, + {SpvOpExecuteCallableKHR, "OpExecuteCallableKHR"}, + {SpvOpConvertUToAccelerationStructureKHR, "OpConvertUToAccelerationStructureKHR"}, + {SpvOpIgnoreIntersectionKHR, "OpIgnoreIntersectionKHR"}, + {SpvOpTerminateRayKHR, "OpTerminateRayKHR"}, + {SpvOpSDot, "OpSDot"}, + {SpvOpSDotKHR, "OpSDotKHR"}, + {SpvOpUDot, "OpUDot"}, + {SpvOpUDotKHR, "OpUDotKHR"}, + {SpvOpSUDot, "OpSUDot"}, + {SpvOpSUDotKHR, "OpSUDotKHR"}, + {SpvOpSDotAccSat, "OpSDotAccSat"}, + {SpvOpSDotAccSatKHR, "OpSDotAccSatKHR"}, + {SpvOpUDotAccSat, "OpUDotAccSat"}, + {SpvOpUDotAccSatKHR, "OpUDotAccSatKHR"}, + {SpvOpSUDotAccSat, "OpSUDotAccSat"}, + {SpvOpSUDotAccSatKHR, "OpSUDotAccSatKHR"}, + {SpvOpTypeRayQueryKHR, "OpTypeRayQueryKHR"}, + {SpvOpRayQueryInitializeKHR, "OpRayQueryInitializeKHR"}, + {SpvOpRayQueryTerminateKHR, "OpRayQueryTerminateKHR"}, + {SpvOpRayQueryGenerateIntersectionKHR, "OpRayQueryGenerateIntersectionKHR"}, + {SpvOpRayQueryConfirmIntersectionKHR, "OpRayQueryConfirmIntersectionKHR"}, + {SpvOpRayQueryProceedKHR, "OpRayQueryProceedKHR"}, + {SpvOpRayQueryGetIntersectionTypeKHR, "OpRayQueryGetIntersectionTypeKHR"}, + {SpvOpGroupIAddNonUniformAMD, "OpGroupIAddNonUniformAMD"}, + {SpvOpGroupFAddNonUniformAMD, "OpGroupFAddNonUniformAMD"}, + {SpvOpGroupFMinNonUniformAMD, "OpGroupFMinNonUniformAMD"}, + {SpvOpGroupUMinNonUniformAMD, "OpGroupUMinNonUniformAMD"}, + {SpvOpGroupSMinNonUniformAMD, "OpGroupSMinNonUniformAMD"}, + {SpvOpGroupFMaxNonUniformAMD, "OpGroupFMaxNonUniformAMD"}, + {SpvOpGroupUMaxNonUniformAMD, "OpGroupUMaxNonUniformAMD"}, + {SpvOpGroupSMaxNonUniformAMD, "OpGroupSMaxNonUniformAMD"}, + {SpvOpFragmentMaskFetchAMD, "OpFragmentMaskFetchAMD"}, + {SpvOpFragmentFetchAMD, "OpFragmentFetchAMD"}, + {SpvOpReadClockKHR, "OpReadClockKHR"}, + {SpvOpImageSampleFootprintNV, "OpImageSampleFootprintNV"}, + {SpvOpEmitMeshTasksEXT, "OpEmitMeshTasksEXT"}, + {SpvOpSetMeshOutputsEXT, "OpSetMeshOutputsEXT"}, + {SpvOpGroupNonUniformPartitionNV, "OpGroupNonUniformPartitionNV"}, + {SpvOpWritePackedPrimitiveIndices4x8NV, "OpWritePackedPrimitiveIndices4x8NV"}, + {SpvOpReportIntersectionKHR, "OpReportIntersectionKHR"}, + {SpvOpReportIntersectionNV, "OpReportIntersectionNV"}, + {SpvOpIgnoreIntersectionNV, "OpIgnoreIntersectionNV"}, + {SpvOpTerminateRayNV, "OpTerminateRayNV"}, + {SpvOpTraceNV, "OpTraceNV"}, + {SpvOpTraceMotionNV, "OpTraceMotionNV"}, + {SpvOpTraceRayMotionNV, 
"OpTraceRayMotionNV"}, + {SpvOpTypeAccelerationStructureKHR, "OpTypeAccelerationStructureKHR"}, + {SpvOpTypeAccelerationStructureNV, "OpTypeAccelerationStructureNV"}, + {SpvOpExecuteCallableNV, "OpExecuteCallableNV"}, + {SpvOpTypeCooperativeMatrixNV, "OpTypeCooperativeMatrixNV"}, + {SpvOpCooperativeMatrixLoadNV, "OpCooperativeMatrixLoadNV"}, + {SpvOpCooperativeMatrixStoreNV, "OpCooperativeMatrixStoreNV"}, + {SpvOpCooperativeMatrixMulAddNV, "OpCooperativeMatrixMulAddNV"}, + {SpvOpCooperativeMatrixLengthNV, "OpCooperativeMatrixLengthNV"}, + {SpvOpBeginInvocationInterlockEXT, "OpBeginInvocationInterlockEXT"}, + {SpvOpEndInvocationInterlockEXT, "OpEndInvocationInterlockEXT"}, + {SpvOpDemoteToHelperInvocation, "OpDemoteToHelperInvocation"}, + {SpvOpDemoteToHelperInvocationEXT, "OpDemoteToHelperInvocationEXT"}, + {SpvOpIsHelperInvocationEXT, "OpIsHelperInvocationEXT"}, + {SpvOpConvertUToImageNV, "OpConvertUToImageNV"}, + {SpvOpConvertUToSamplerNV, "OpConvertUToSamplerNV"}, + {SpvOpConvertImageToUNV, "OpConvertImageToUNV"}, + {SpvOpConvertSamplerToUNV, "OpConvertSamplerToUNV"}, + {SpvOpConvertUToSampledImageNV, "OpConvertUToSampledImageNV"}, + {SpvOpConvertSampledImageToUNV, "OpConvertSampledImageToUNV"}, + {SpvOpSamplerImageAddressingModeNV, "OpSamplerImageAddressingModeNV"}, + {SpvOpSubgroupShuffleINTEL, "OpSubgroupShuffleINTEL"}, + {SpvOpSubgroupShuffleDownINTEL, "OpSubgroupShuffleDownINTEL"}, + {SpvOpSubgroupShuffleUpINTEL, "OpSubgroupShuffleUpINTEL"}, + {SpvOpSubgroupShuffleXorINTEL, "OpSubgroupShuffleXorINTEL"}, + {SpvOpSubgroupBlockReadINTEL, "OpSubgroupBlockReadINTEL"}, + {SpvOpSubgroupBlockWriteINTEL, "OpSubgroupBlockWriteINTEL"}, + {SpvOpSubgroupImageBlockReadINTEL, "OpSubgroupImageBlockReadINTEL"}, + {SpvOpSubgroupImageBlockWriteINTEL, "OpSubgroupImageBlockWriteINTEL"}, + {SpvOpSubgroupImageMediaBlockReadINTEL, "OpSubgroupImageMediaBlockReadINTEL"}, + {SpvOpSubgroupImageMediaBlockWriteINTEL, "OpSubgroupImageMediaBlockWriteINTEL"}, + {SpvOpUCountLeadingZerosINTEL, "OpUCountLeadingZerosINTEL"}, + {SpvOpUCountTrailingZerosINTEL, "OpUCountTrailingZerosINTEL"}, + {SpvOpAbsISubINTEL, "OpAbsISubINTEL"}, + {SpvOpAbsUSubINTEL, "OpAbsUSubINTEL"}, + {SpvOpIAddSatINTEL, "OpIAddSatINTEL"}, + {SpvOpUAddSatINTEL, "OpUAddSatINTEL"}, + {SpvOpIAverageINTEL, "OpIAverageINTEL"}, + {SpvOpUAverageINTEL, "OpUAverageINTEL"}, + {SpvOpIAverageRoundedINTEL, "OpIAverageRoundedINTEL"}, + {SpvOpUAverageRoundedINTEL, "OpUAverageRoundedINTEL"}, + {SpvOpISubSatINTEL, "OpISubSatINTEL"}, + {SpvOpUSubSatINTEL, "OpUSubSatINTEL"}, + {SpvOpIMul32x16INTEL, "OpIMul32x16INTEL"}, + {SpvOpUMul32x16INTEL, "OpUMul32x16INTEL"}, + {SpvOpConstantFunctionPointerINTEL, "OpConstantFunctionPointerINTEL"}, + {SpvOpFunctionPointerCallINTEL, "OpFunctionPointerCallINTEL"}, + {SpvOpAsmTargetINTEL, "OpAsmTargetINTEL"}, + {SpvOpAsmINTEL, "OpAsmINTEL"}, + {SpvOpAsmCallINTEL, "OpAsmCallINTEL"}, + {SpvOpAtomicFMinEXT, "OpAtomicFMinEXT"}, + {SpvOpAtomicFMaxEXT, "OpAtomicFMaxEXT"}, + {SpvOpAssumeTrueKHR, "OpAssumeTrueKHR"}, + {SpvOpExpectKHR, "OpExpectKHR"}, + {SpvOpDecorateString, "OpDecorateString"}, + {SpvOpDecorateStringGOOGLE, "OpDecorateStringGOOGLE"}, + {SpvOpMemberDecorateString, "OpMemberDecorateString"}, + {SpvOpMemberDecorateStringGOOGLE, "OpMemberDecorateStringGOOGLE"}, + {SpvOpVmeImageINTEL, "OpVmeImageINTEL"}, + {SpvOpTypeVmeImageINTEL, "OpTypeVmeImageINTEL"}, + {SpvOpTypeAvcImePayloadINTEL, "OpTypeAvcImePayloadINTEL"}, + {SpvOpTypeAvcRefPayloadINTEL, "OpTypeAvcRefPayloadINTEL"}, + {SpvOpTypeAvcSicPayloadINTEL, 
"OpTypeAvcSicPayloadINTEL"}, + {SpvOpTypeAvcMcePayloadINTEL, "OpTypeAvcMcePayloadINTEL"}, + {SpvOpTypeAvcMceResultINTEL, "OpTypeAvcMceResultINTEL"}, + {SpvOpTypeAvcImeResultINTEL, "OpTypeAvcImeResultINTEL"}, + {SpvOpTypeAvcImeResultSingleReferenceStreamoutINTEL, "OpTypeAvcImeResultSingleReferenceStreamoutINTEL"}, + {SpvOpTypeAvcImeResultDualReferenceStreamoutINTEL, "OpTypeAvcImeResultDualReferenceStreamoutINTEL"}, + {SpvOpTypeAvcImeSingleReferenceStreaminINTEL, "OpTypeAvcImeSingleReferenceStreaminINTEL"}, + {SpvOpTypeAvcImeDualReferenceStreaminINTEL, "OpTypeAvcImeDualReferenceStreaminINTEL"}, + {SpvOpTypeAvcRefResultINTEL, "OpTypeAvcRefResultINTEL"}, + {SpvOpTypeAvcSicResultINTEL, "OpTypeAvcSicResultINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL, "OpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL"}, + {SpvOpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL, "OpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL, "OpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL"}, + {SpvOpSubgroupAvcMceSetInterShapePenaltyINTEL, "OpSubgroupAvcMceSetInterShapePenaltyINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL, "OpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL"}, + {SpvOpSubgroupAvcMceSetInterDirectionPenaltyINTEL, "OpSubgroupAvcMceSetInterDirectionPenaltyINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL, "OpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL, "OpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL, "OpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL, "OpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL, "OpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL"}, + {SpvOpSubgroupAvcMceSetMotionVectorCostFunctionINTEL, "OpSubgroupAvcMceSetMotionVectorCostFunctionINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL, "OpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL, "OpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL"}, + {SpvOpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL, "OpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL"}, + {SpvOpSubgroupAvcMceSetAcOnlyHaarINTEL, "OpSubgroupAvcMceSetAcOnlyHaarINTEL"}, + {SpvOpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL, "OpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL"}, + {SpvOpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL, "OpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL"}, + {SpvOpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL, "OpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL"}, + {SpvOpSubgroupAvcMceConvertToImePayloadINTEL, "OpSubgroupAvcMceConvertToImePayloadINTEL"}, + {SpvOpSubgroupAvcMceConvertToImeResultINTEL, "OpSubgroupAvcMceConvertToImeResultINTEL"}, + {SpvOpSubgroupAvcMceConvertToRefPayloadINTEL, "OpSubgroupAvcMceConvertToRefPayloadINTEL"}, + {SpvOpSubgroupAvcMceConvertToRefResultINTEL, "OpSubgroupAvcMceConvertToRefResultINTEL"}, + {SpvOpSubgroupAvcMceConvertToSicPayloadINTEL, "OpSubgroupAvcMceConvertToSicPayloadINTEL"}, + {SpvOpSubgroupAvcMceConvertToSicResultINTEL, "OpSubgroupAvcMceConvertToSicResultINTEL"}, + {SpvOpSubgroupAvcMceGetMotionVectorsINTEL, 
"OpSubgroupAvcMceGetMotionVectorsINTEL"}, + {SpvOpSubgroupAvcMceGetInterDistortionsINTEL, "OpSubgroupAvcMceGetInterDistortionsINTEL"}, + {SpvOpSubgroupAvcMceGetBestInterDistortionsINTEL, "OpSubgroupAvcMceGetBestInterDistortionsINTEL"}, + {SpvOpSubgroupAvcMceGetInterMajorShapeINTEL, "OpSubgroupAvcMceGetInterMajorShapeINTEL"}, + {SpvOpSubgroupAvcMceGetInterMinorShapeINTEL, "OpSubgroupAvcMceGetInterMinorShapeINTEL"}, + {SpvOpSubgroupAvcMceGetInterDirectionsINTEL, "OpSubgroupAvcMceGetInterDirectionsINTEL"}, + {SpvOpSubgroupAvcMceGetInterMotionVectorCountINTEL, "OpSubgroupAvcMceGetInterMotionVectorCountINTEL"}, + {SpvOpSubgroupAvcMceGetInterReferenceIdsINTEL, "OpSubgroupAvcMceGetInterReferenceIdsINTEL"}, + {SpvOpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL, "OpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL"}, + {SpvOpSubgroupAvcImeInitializeINTEL, "OpSubgroupAvcImeInitializeINTEL"}, + {SpvOpSubgroupAvcImeSetSingleReferenceINTEL, "OpSubgroupAvcImeSetSingleReferenceINTEL"}, + {SpvOpSubgroupAvcImeSetDualReferenceINTEL, "OpSubgroupAvcImeSetDualReferenceINTEL"}, + {SpvOpSubgroupAvcImeRefWindowSizeINTEL, "OpSubgroupAvcImeRefWindowSizeINTEL"}, + {SpvOpSubgroupAvcImeAdjustRefOffsetINTEL, "OpSubgroupAvcImeAdjustRefOffsetINTEL"}, + {SpvOpSubgroupAvcImeConvertToMcePayloadINTEL, "OpSubgroupAvcImeConvertToMcePayloadINTEL"}, + {SpvOpSubgroupAvcImeSetMaxMotionVectorCountINTEL, "OpSubgroupAvcImeSetMaxMotionVectorCountINTEL"}, + {SpvOpSubgroupAvcImeSetUnidirectionalMixDisableINTEL, "OpSubgroupAvcImeSetUnidirectionalMixDisableINTEL"}, + {SpvOpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL, "OpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL"}, + {SpvOpSubgroupAvcImeSetWeightedSadINTEL, "OpSubgroupAvcImeSetWeightedSadINTEL"}, + {SpvOpSubgroupAvcImeEvaluateWithSingleReferenceINTEL, "OpSubgroupAvcImeEvaluateWithSingleReferenceINTEL"}, + {SpvOpSubgroupAvcImeEvaluateWithDualReferenceINTEL, "OpSubgroupAvcImeEvaluateWithDualReferenceINTEL"}, + {SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL, "OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL"}, + {SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL, "OpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL"}, + {SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL, "OpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL"}, + {SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL, "OpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL"}, + {SpvOpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL, "OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL"}, + {SpvOpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL, "OpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL"}, + {SpvOpSubgroupAvcImeConvertToMceResultINTEL, "OpSubgroupAvcImeConvertToMceResultINTEL"}, + {SpvOpSubgroupAvcImeGetSingleReferenceStreaminINTEL, "OpSubgroupAvcImeGetSingleReferenceStreaminINTEL"}, + {SpvOpSubgroupAvcImeGetDualReferenceStreaminINTEL, "OpSubgroupAvcImeGetDualReferenceStreaminINTEL"}, + {SpvOpSubgroupAvcImeStripSingleReferenceStreamoutINTEL, "OpSubgroupAvcImeStripSingleReferenceStreamoutINTEL"}, + {SpvOpSubgroupAvcImeStripDualReferenceStreamoutINTEL, "OpSubgroupAvcImeStripDualReferenceStreamoutINTEL"}, + {SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL, "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL"}, + {SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL, 
"OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL"}, + {SpvOpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL, "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL"}, + {SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL, "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL"}, + {SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL, "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL"}, + {SpvOpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL, "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL"}, + {SpvOpSubgroupAvcImeGetBorderReachedINTEL, "OpSubgroupAvcImeGetBorderReachedINTEL"}, + {SpvOpSubgroupAvcImeGetTruncatedSearchIndicationINTEL, "OpSubgroupAvcImeGetTruncatedSearchIndicationINTEL"}, + {SpvOpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL, "OpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL"}, + {SpvOpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL, "OpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL"}, + {SpvOpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL, "OpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL"}, + {SpvOpSubgroupAvcFmeInitializeINTEL, "OpSubgroupAvcFmeInitializeINTEL"}, + {SpvOpSubgroupAvcBmeInitializeINTEL, "OpSubgroupAvcBmeInitializeINTEL"}, + {SpvOpSubgroupAvcRefConvertToMcePayloadINTEL, "OpSubgroupAvcRefConvertToMcePayloadINTEL"}, + {SpvOpSubgroupAvcRefSetBidirectionalMixDisableINTEL, "OpSubgroupAvcRefSetBidirectionalMixDisableINTEL"}, + {SpvOpSubgroupAvcRefSetBilinearFilterEnableINTEL, "OpSubgroupAvcRefSetBilinearFilterEnableINTEL"}, + {SpvOpSubgroupAvcRefEvaluateWithSingleReferenceINTEL, "OpSubgroupAvcRefEvaluateWithSingleReferenceINTEL"}, + {SpvOpSubgroupAvcRefEvaluateWithDualReferenceINTEL, "OpSubgroupAvcRefEvaluateWithDualReferenceINTEL"}, + {SpvOpSubgroupAvcRefEvaluateWithMultiReferenceINTEL, "OpSubgroupAvcRefEvaluateWithMultiReferenceINTEL"}, + {SpvOpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL, "OpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL"}, + {SpvOpSubgroupAvcRefConvertToMceResultINTEL, "OpSubgroupAvcRefConvertToMceResultINTEL"}, + {SpvOpSubgroupAvcSicInitializeINTEL, "OpSubgroupAvcSicInitializeINTEL"}, + {SpvOpSubgroupAvcSicConfigureSkcINTEL, "OpSubgroupAvcSicConfigureSkcINTEL"}, + {SpvOpSubgroupAvcSicConfigureIpeLumaINTEL, "OpSubgroupAvcSicConfigureIpeLumaINTEL"}, + {SpvOpSubgroupAvcSicConfigureIpeLumaChromaINTEL, "OpSubgroupAvcSicConfigureIpeLumaChromaINTEL"}, + {SpvOpSubgroupAvcSicGetMotionVectorMaskINTEL, "OpSubgroupAvcSicGetMotionVectorMaskINTEL"}, + {SpvOpSubgroupAvcSicConvertToMcePayloadINTEL, "OpSubgroupAvcSicConvertToMcePayloadINTEL"}, + {SpvOpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL, "OpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL"}, + {SpvOpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL, "OpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL"}, + {SpvOpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL, "OpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL"}, + {SpvOpSubgroupAvcSicSetBilinearFilterEnableINTEL, "OpSubgroupAvcSicSetBilinearFilterEnableINTEL"}, + {SpvOpSubgroupAvcSicSetSkcForwardTransformEnableINTEL, "OpSubgroupAvcSicSetSkcForwardTransformEnableINTEL"}, + {SpvOpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL, "OpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL"}, + {SpvOpSubgroupAvcSicEvaluateIpeINTEL, "OpSubgroupAvcSicEvaluateIpeINTEL"}, + {SpvOpSubgroupAvcSicEvaluateWithSingleReferenceINTEL, 
"OpSubgroupAvcSicEvaluateWithSingleReferenceINTEL"}, + {SpvOpSubgroupAvcSicEvaluateWithDualReferenceINTEL, "OpSubgroupAvcSicEvaluateWithDualReferenceINTEL"}, + {SpvOpSubgroupAvcSicEvaluateWithMultiReferenceINTEL, "OpSubgroupAvcSicEvaluateWithMultiReferenceINTEL"}, + {SpvOpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL, "OpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL"}, + {SpvOpSubgroupAvcSicConvertToMceResultINTEL, "OpSubgroupAvcSicConvertToMceResultINTEL"}, + {SpvOpSubgroupAvcSicGetIpeLumaShapeINTEL, "OpSubgroupAvcSicGetIpeLumaShapeINTEL"}, + {SpvOpSubgroupAvcSicGetBestIpeLumaDistortionINTEL, "OpSubgroupAvcSicGetBestIpeLumaDistortionINTEL"}, + {SpvOpSubgroupAvcSicGetBestIpeChromaDistortionINTEL, "OpSubgroupAvcSicGetBestIpeChromaDistortionINTEL"}, + {SpvOpSubgroupAvcSicGetPackedIpeLumaModesINTEL, "OpSubgroupAvcSicGetPackedIpeLumaModesINTEL"}, + {SpvOpSubgroupAvcSicGetIpeChromaModeINTEL, "OpSubgroupAvcSicGetIpeChromaModeINTEL"}, + {SpvOpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL, "OpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL"}, + {SpvOpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL, "OpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL"}, + {SpvOpSubgroupAvcSicGetInterRawSadsINTEL, "OpSubgroupAvcSicGetInterRawSadsINTEL"}, + {SpvOpVariableLengthArrayINTEL, "OpVariableLengthArrayINTEL"}, + {SpvOpSaveMemoryINTEL, "OpSaveMemoryINTEL"}, + {SpvOpRestoreMemoryINTEL, "OpRestoreMemoryINTEL"}, + {SpvOpArbitraryFloatSinCosPiINTEL, "OpArbitraryFloatSinCosPiINTEL"}, + {SpvOpArbitraryFloatCastINTEL, "OpArbitraryFloatCastINTEL"}, + {SpvOpArbitraryFloatCastFromIntINTEL, "OpArbitraryFloatCastFromIntINTEL"}, + {SpvOpArbitraryFloatCastToIntINTEL, "OpArbitraryFloatCastToIntINTEL"}, + {SpvOpArbitraryFloatAddINTEL, "OpArbitraryFloatAddINTEL"}, + {SpvOpArbitraryFloatSubINTEL, "OpArbitraryFloatSubINTEL"}, + {SpvOpArbitraryFloatMulINTEL, "OpArbitraryFloatMulINTEL"}, + {SpvOpArbitraryFloatDivINTEL, "OpArbitraryFloatDivINTEL"}, + {SpvOpArbitraryFloatGTINTEL, "OpArbitraryFloatGTINTEL"}, + {SpvOpArbitraryFloatGEINTEL, "OpArbitraryFloatGEINTEL"}, + {SpvOpArbitraryFloatLTINTEL, "OpArbitraryFloatLTINTEL"}, + {SpvOpArbitraryFloatLEINTEL, "OpArbitraryFloatLEINTEL"}, + {SpvOpArbitraryFloatEQINTEL, "OpArbitraryFloatEQINTEL"}, + {SpvOpArbitraryFloatRecipINTEL, "OpArbitraryFloatRecipINTEL"}, + {SpvOpArbitraryFloatRSqrtINTEL, "OpArbitraryFloatRSqrtINTEL"}, + {SpvOpArbitraryFloatCbrtINTEL, "OpArbitraryFloatCbrtINTEL"}, + {SpvOpArbitraryFloatHypotINTEL, "OpArbitraryFloatHypotINTEL"}, + {SpvOpArbitraryFloatSqrtINTEL, "OpArbitraryFloatSqrtINTEL"}, + {SpvOpArbitraryFloatLogINTEL, "OpArbitraryFloatLogINTEL"}, + {SpvOpArbitraryFloatLog2INTEL, "OpArbitraryFloatLog2INTEL"}, + {SpvOpArbitraryFloatLog10INTEL, "OpArbitraryFloatLog10INTEL"}, + {SpvOpArbitraryFloatLog1pINTEL, "OpArbitraryFloatLog1pINTEL"}, + {SpvOpArbitraryFloatExpINTEL, "OpArbitraryFloatExpINTEL"}, + {SpvOpArbitraryFloatExp2INTEL, "OpArbitraryFloatExp2INTEL"}, + {SpvOpArbitraryFloatExp10INTEL, "OpArbitraryFloatExp10INTEL"}, + {SpvOpArbitraryFloatExpm1INTEL, "OpArbitraryFloatExpm1INTEL"}, + {SpvOpArbitraryFloatSinINTEL, "OpArbitraryFloatSinINTEL"}, + {SpvOpArbitraryFloatCosINTEL, "OpArbitraryFloatCosINTEL"}, + {SpvOpArbitraryFloatSinCosINTEL, "OpArbitraryFloatSinCosINTEL"}, + {SpvOpArbitraryFloatSinPiINTEL, "OpArbitraryFloatSinPiINTEL"}, + {SpvOpArbitraryFloatCosPiINTEL, "OpArbitraryFloatCosPiINTEL"}, + {SpvOpArbitraryFloatASinINTEL, "OpArbitraryFloatASinINTEL"}, + {SpvOpArbitraryFloatASinPiINTEL, "OpArbitraryFloatASinPiINTEL"}, + 
{SpvOpArbitraryFloatACosINTEL, "OpArbitraryFloatACosINTEL"}, + {SpvOpArbitraryFloatACosPiINTEL, "OpArbitraryFloatACosPiINTEL"}, + {SpvOpArbitraryFloatATanINTEL, "OpArbitraryFloatATanINTEL"}, + {SpvOpArbitraryFloatATanPiINTEL, "OpArbitraryFloatATanPiINTEL"}, + {SpvOpArbitraryFloatATan2INTEL, "OpArbitraryFloatATan2INTEL"}, + {SpvOpArbitraryFloatPowINTEL, "OpArbitraryFloatPowINTEL"}, + {SpvOpArbitraryFloatPowRINTEL, "OpArbitraryFloatPowRINTEL"}, + {SpvOpArbitraryFloatPowNINTEL, "OpArbitraryFloatPowNINTEL"}, + {SpvOpLoopControlINTEL, "OpLoopControlINTEL"}, + {SpvOpAliasDomainDeclINTEL, "OpAliasDomainDeclINTEL"}, + {SpvOpAliasScopeDeclINTEL, "OpAliasScopeDeclINTEL"}, + {SpvOpAliasScopeListDeclINTEL, "OpAliasScopeListDeclINTEL"}, + {SpvOpFixedSqrtINTEL, "OpFixedSqrtINTEL"}, + {SpvOpFixedRecipINTEL, "OpFixedRecipINTEL"}, + {SpvOpFixedRsqrtINTEL, "OpFixedRsqrtINTEL"}, + {SpvOpFixedSinINTEL, "OpFixedSinINTEL"}, + {SpvOpFixedCosINTEL, "OpFixedCosINTEL"}, + {SpvOpFixedSinCosINTEL, "OpFixedSinCosINTEL"}, + {SpvOpFixedSinPiINTEL, "OpFixedSinPiINTEL"}, + {SpvOpFixedCosPiINTEL, "OpFixedCosPiINTEL"}, + {SpvOpFixedSinCosPiINTEL, "OpFixedSinCosPiINTEL"}, + {SpvOpFixedLogINTEL, "OpFixedLogINTEL"}, + {SpvOpFixedExpINTEL, "OpFixedExpINTEL"}, + {SpvOpPtrCastToCrossWorkgroupINTEL, "OpPtrCastToCrossWorkgroupINTEL"}, + {SpvOpCrossWorkgroupCastToPtrINTEL, "OpCrossWorkgroupCastToPtrINTEL"}, + {SpvOpReadPipeBlockingINTEL, "OpReadPipeBlockingINTEL"}, + {SpvOpWritePipeBlockingINTEL, "OpWritePipeBlockingINTEL"}, + {SpvOpFPGARegINTEL, "OpFPGARegINTEL"}, + {SpvOpRayQueryGetRayTMinKHR, "OpRayQueryGetRayTMinKHR"}, + {SpvOpRayQueryGetRayFlagsKHR, "OpRayQueryGetRayFlagsKHR"}, + {SpvOpRayQueryGetIntersectionTKHR, "OpRayQueryGetIntersectionTKHR"}, + {SpvOpRayQueryGetIntersectionInstanceCustomIndexKHR, "OpRayQueryGetIntersectionInstanceCustomIndexKHR"}, + {SpvOpRayQueryGetIntersectionInstanceIdKHR, "OpRayQueryGetIntersectionInstanceIdKHR"}, + {SpvOpRayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetKHR, "OpRayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetKHR"}, + {SpvOpRayQueryGetIntersectionGeometryIndexKHR, "OpRayQueryGetIntersectionGeometryIndexKHR"}, + {SpvOpRayQueryGetIntersectionPrimitiveIndexKHR, "OpRayQueryGetIntersectionPrimitiveIndexKHR"}, + {SpvOpRayQueryGetIntersectionBarycentricsKHR, "OpRayQueryGetIntersectionBarycentricsKHR"}, + {SpvOpRayQueryGetIntersectionFrontFaceKHR, "OpRayQueryGetIntersectionFrontFaceKHR"}, + {SpvOpRayQueryGetIntersectionCandidateAABBOpaqueKHR, "OpRayQueryGetIntersectionCandidateAABBOpaqueKHR"}, + {SpvOpRayQueryGetIntersectionObjectRayDirectionKHR, "OpRayQueryGetIntersectionObjectRayDirectionKHR"}, + {SpvOpRayQueryGetIntersectionObjectRayOriginKHR, "OpRayQueryGetIntersectionObjectRayOriginKHR"}, + {SpvOpRayQueryGetWorldRayDirectionKHR, "OpRayQueryGetWorldRayDirectionKHR"}, + {SpvOpRayQueryGetWorldRayOriginKHR, "OpRayQueryGetWorldRayOriginKHR"}, + {SpvOpRayQueryGetIntersectionObjectToWorldKHR, "OpRayQueryGetIntersectionObjectToWorldKHR"}, + {SpvOpRayQueryGetIntersectionWorldToObjectKHR, "OpRayQueryGetIntersectionWorldToObjectKHR"}, + {SpvOpAtomicFAddEXT, "OpAtomicFAddEXT"}, + {SpvOpTypeBufferSurfaceINTEL, "OpTypeBufferSurfaceINTEL"}, + {SpvOpTypeStructContinuedINTEL, "OpTypeStructContinuedINTEL"}, + {SpvOpConstantCompositeContinuedINTEL, "OpConstantCompositeContinuedINTEL"}, + {SpvOpSpecConstantCompositeContinuedINTEL, "OpSpecConstantCompositeContinuedINTEL"}, + {SpvOpControlBarrierArriveINTEL, "OpControlBarrierArriveINTEL"}, + {SpvOpControlBarrierWaitINTEL, 
"OpControlBarrierWaitINTEL"}, + {SpvOpGroupIMulKHR, "OpGroupIMulKHR"}, + {SpvOpGroupFMulKHR, "OpGroupFMulKHR"}, + {SpvOpGroupBitwiseAndKHR, "OpGroupBitwiseAndKHR"}, + {SpvOpGroupBitwiseOrKHR, "OpGroupBitwiseOrKHR"}, + {SpvOpGroupBitwiseXorKHR, "OpGroupBitwiseXorKHR"}, + {SpvOpGroupLogicalAndKHR, "OpGroupLogicalAndKHR"}, + {SpvOpGroupLogicalOrKHR, "OpGroupLogicalOrKHR"}, + {SpvOpGroupLogicalXorKHR, "OpGroupLogicalXorKHR"}, + }; + + SpvOpNameMap::const_iterator entry = op_names.find(op); + if (entry != op_names.end()) { + return entry->second; } - case SpvOpSatConvertUToS: { - stream << "OpSatConvertUToS"; - break; - } - case SpvOpConvertUToPtr: { - stream << "OpConvertUToPtr"; - break; - } - case SpvOpPtrCastToGeneric: { - stream << "OpPtrCastToGeneric"; - break; - } - case SpvOpGenericCastToPtr: { - stream << "OpGenericCastToPtr"; - break; - } - case SpvOpGenericCastToPtrExplicit: { - stream << "OpGenericCastToPtrExplicit"; - break; - } - case SpvOpBitcast: { - stream << "OpBitcast"; - break; - } - case SpvOpSNegate: { - stream << "OpSNegate"; - break; - } - case SpvOpFNegate: { - stream << "OpFNegate"; - break; - } - case SpvOpIAdd: { - stream << "OpIAdd"; - break; - } - case SpvOpFAdd: { - stream << "OpFAdd"; - break; - } - case SpvOpISub: { - stream << "OpISub"; - break; - } - case SpvOpFSub: { - stream << "OpFSub"; - break; - } - case SpvOpIMul: { - stream << "OpIMul"; - break; - } - case SpvOpFMul: { - stream << "OpFMul"; - break; - } - case SpvOpUDiv: { - stream << "OpUDiv"; - break; - } - case SpvOpSDiv: { - stream << "OpSDiv"; - break; - } - case SpvOpFDiv: { - stream << "OpFDiv"; - break; - } - case SpvOpUMod: { - stream << "OpUMod"; - break; - } - case SpvOpSRem: { - stream << "OpSRem"; - break; - } - case SpvOpSMod: { - stream << "OpSMod"; - break; - } - case SpvOpFRem: { - stream << "OpFRem"; - break; - } - case SpvOpFMod: { - stream << "OpFMod"; - break; - } - case SpvOpVectorTimesScalar: { - stream << "OpVectorTimesScalar"; - break; - } - case SpvOpMatrixTimesScalar: { - stream << "OpMatrixTimesScalar"; - break; - } - case SpvOpVectorTimesMatrix: { - stream << "OpVectorTimesMatrix"; - break; - } - case SpvOpMatrixTimesVector: { - stream << "OpMatrixTimesVector"; - break; - } - case SpvOpMatrixTimesMatrix: { - stream << "OpMatrixTimesMatrix"; - break; - } - case SpvOpOuterProduct: { - stream << "OpOuterProduct"; - break; - } - case SpvOpDot: { - stream << "OpDot"; - break; - } - case SpvOpIAddCarry: { - stream << "OpIAddCarry"; - break; - } - case SpvOpISubBorrow: { - stream << "OpISubBorrow"; - break; - } - case SpvOpUMulExtended: { - stream << "OpUMulExtended"; - break; - } - case SpvOpSMulExtended: { - stream << "OpSMulExtended"; - break; - } - case SpvOpAny: { - stream << "OpAny"; - break; - } - case SpvOpAll: { - stream << "OpAll"; - break; - } - case SpvOpIsNan: { - stream << "OpIsNan"; - break; - } - case SpvOpIsInf: { - stream << "OpIsInf"; - break; - } - case SpvOpIsFinite: { - stream << "OpIsFinite"; - break; - } - case SpvOpIsNormal: { - stream << "OpIsNormal"; - break; - } - case SpvOpSignBitSet: { - stream << "OpSignBitSet"; - break; - } - case SpvOpLessOrGreater: { - stream << "OpLessOrGreater"; - break; - } - case SpvOpOrdered: { - stream << "OpOrdered"; - break; - } - case SpvOpUnordered: { - stream << "OpUnordered"; - break; - } - case SpvOpLogicalEqual: { - stream << "OpLogicalEqual"; - break; - } - case SpvOpLogicalNotEqual: { - stream << "OpLogicalNotEqual"; - break; - } - case SpvOpLogicalOr: { - stream << "OpLogicalOr"; - break; - } - case 
SpvOpLogicalAnd: { - stream << "OpLogicalAnd"; - break; - } - case SpvOpLogicalNot: { - stream << "OpLogicalNot"; - break; - } - case SpvOpSelect: { - stream << "OpSelect"; - break; - } - case SpvOpIEqual: { - stream << "OpIEqual"; - break; - } - case SpvOpINotEqual: { - stream << "OpINotEqual"; - break; - } - case SpvOpUGreaterThan: { - stream << "OpUGreaterThan"; - break; - } - case SpvOpSGreaterThan: { - stream << "OpSGreaterThan"; - break; - } - case SpvOpUGreaterThanEqual: { - stream << "OpUGreaterThanEqual"; - break; - } - case SpvOpSGreaterThanEqual: { - stream << "OpSGreaterThanEqual"; - break; - } - case SpvOpULessThan: { - stream << "OpULessThan"; - break; - } - case SpvOpSLessThan: { - stream << "OpSLessThan"; - break; - } - case SpvOpULessThanEqual: { - stream << "OpULessThanEqual"; - break; - } - case SpvOpSLessThanEqual: { - stream << "OpSLessThanEqual"; - break; - } - case SpvOpFOrdEqual: { - stream << "OpFOrdEqual"; - break; - } - case SpvOpFUnordEqual: { - stream << "OpFUnordEqual"; - break; - } - case SpvOpFOrdNotEqual: { - stream << "OpFOrdNotEqual"; - break; - } - case SpvOpFUnordNotEqual: { - stream << "OpFUnordNotEqual"; - break; - } - case SpvOpFOrdLessThan: { - stream << "OpFOrdLessThan"; - break; - } - case SpvOpFUnordLessThan: { - stream << "OpFUnordLessThan"; - break; - } - case SpvOpFOrdGreaterThan: { - stream << "OpFOrdGreaterThan"; - break; - } - case SpvOpFUnordGreaterThan: { - stream << "OpFUnordGreaterThan"; - break; - } - case SpvOpFOrdLessThanEqual: { - stream << "OpFOrdLessThanEqual"; - break; - } - case SpvOpFUnordLessThanEqual: { - stream << "OpFUnordLessThanEqual"; - break; - } - case SpvOpFOrdGreaterThanEqual: { - stream << "OpFOrdGreaterThanEqual"; - break; - } - case SpvOpFUnordGreaterThanEqual: { - stream << "OpFUnordGreaterThanEqual"; - break; - } - case SpvOpShiftRightLogical: { - stream << "OpShiftRightLogical"; - break; - } - case SpvOpShiftRightArithmetic: { - stream << "OpShiftRightArithmetic"; - break; - } - case SpvOpShiftLeftLogical: { - stream << "OpShiftLeftLogical"; - break; - } - case SpvOpBitwiseOr: { - stream << "OpBitwiseOr"; - break; - } - case SpvOpBitwiseXor: { - stream << "OpBitwiseXor"; - break; - } - case SpvOpBitwiseAnd: { - stream << "OpBitwiseAnd"; - break; - } - case SpvOpNot: { - stream << "OpNot"; - break; - } - case SpvOpBitFieldInsert: { - stream << "OpBitFieldInsert"; - break; - } - case SpvOpBitFieldSExtract: { - stream << "OpBitFieldSExtract"; - break; - } - case SpvOpBitFieldUExtract: { - stream << "OpBitFieldUExtract"; - break; - } - case SpvOpBitReverse: { - stream << "OpBitReverse"; - break; - } - case SpvOpBitCount: { - stream << "OpBitCount"; - break; - } - case SpvOpDPdx: { - stream << "OpDPdx"; - break; - } - case SpvOpDPdy: { - stream << "OpDPdy"; - break; - } - case SpvOpFwidth: { - stream << "OpFwidth"; - break; - } - case SpvOpDPdxFine: { - stream << "OpDPdxFine"; - break; - } - case SpvOpDPdyFine: { - stream << "OpDPdyFine"; - break; - } - case SpvOpFwidthFine: { - stream << "OpFwidthFine"; - break; - } - case SpvOpDPdxCoarse: { - stream << "OpDPdxCoarse"; - break; - } - case SpvOpDPdyCoarse: { - stream << "OpDPdyCoarse"; - break; - } - case SpvOpFwidthCoarse: { - stream << "OpFwidthCoarse"; - break; - } - case SpvOpEmitVertex: { - stream << "OpEmitVertex"; - break; - } - case SpvOpEndPrimitive: { - stream << "OpEndPrimitive"; - break; - } - case SpvOpEmitStreamVertex: { - stream << "OpEmitStreamVertex"; - break; - } - case SpvOpEndStreamPrimitive: { - stream << "OpEndStreamPrimitive"; - break; - } - 
case SpvOpControlBarrier: { - stream << "OpControlBarrier"; - break; - } - case SpvOpMemoryBarrier: { - stream << "OpMemoryBarrier"; - break; - } - case SpvOpAtomicLoad: { - stream << "OpAtomicLoad"; - break; - } - case SpvOpAtomicStore: { - stream << "OpAtomicStore"; - break; - } - case SpvOpAtomicExchange: { - stream << "OpAtomicExchange"; - break; - } - case SpvOpAtomicCompareExchange: { - stream << "OpAtomicCompareExchange"; - break; - } - case SpvOpAtomicCompareExchangeWeak: { - stream << "OpAtomicCompareExchangeWeak"; - break; - } - case SpvOpAtomicIIncrement: { - stream << "OpAtomicIIncrement"; - break; - } - case SpvOpAtomicIDecrement: { - stream << "OpAtomicIDecrement"; - break; - } - case SpvOpAtomicIAdd: { - stream << "OpAtomicIAdd"; - break; - } - case SpvOpAtomicISub: { - stream << "OpAtomicISub"; - break; - } - case SpvOpAtomicSMin: { - stream << "OpAtomicSMin"; - break; - } - case SpvOpAtomicUMin: { - stream << "OpAtomicUMin"; - break; - } - case SpvOpAtomicSMax: { - stream << "OpAtomicSMax"; - break; - } - case SpvOpAtomicUMax: { - stream << "OpAtomicUMax"; - break; - } - case SpvOpAtomicAnd: { - stream << "OpAtomicAnd"; - break; - } - case SpvOpAtomicOr: { - stream << "OpAtomicOr"; - break; - } - case SpvOpAtomicXor: { - stream << "OpAtomicXor"; - break; - } - case SpvOpPhi: { - stream << "OpPhi"; - break; - } - case SpvOpLoopMerge: { - stream << "OpLoopMerge"; - break; - } - case SpvOpSelectionMerge: { - stream << "OpSelectionMerge"; - break; - } - case SpvOpLabel: { - stream << "OpLabel"; - break; - } - case SpvOpBranch: { - stream << "OpBranch"; - break; - } - case SpvOpBranchConditional: { - stream << "OpBranchConditional"; - break; - } - case SpvOpSwitch: { - stream << "OpSwitch"; - break; - } - case SpvOpKill: { - stream << "OpKill"; - break; - } - case SpvOpReturn: { - stream << "OpReturn"; - break; - } - case SpvOpReturnValue: { - stream << "OpReturnValue"; - break; - } - case SpvOpUnreachable: { - stream << "OpUnreachable"; - break; - } - case SpvOpLifetimeStart: { - stream << "OpLifetimeStart"; - break; - } - case SpvOpLifetimeStop: { - stream << "OpLifetimeStop"; - break; - } - case SpvOpGroupAsyncCopy: { - stream << "OpGroupAsyncCopy"; - break; - } - case SpvOpGroupWaitEvents: { - stream << "OpGroupWaitEvents"; - break; - } - case SpvOpGroupAll: { - stream << "OpGroupAll"; - break; - } - case SpvOpGroupAny: { - stream << "OpGroupAny"; - break; - } - case SpvOpGroupBroadcast: { - stream << "OpGroupBroadcast"; - break; - } - case SpvOpGroupIAdd: { - stream << "OpGroupIAdd"; - break; - } - case SpvOpGroupFAdd: { - stream << "OpGroupFAdd"; - break; - } - case SpvOpGroupFMin: { - stream << "OpGroupFMin"; - break; - } - case SpvOpGroupUMin: { - stream << "OpGroupUMin"; - break; - } - case SpvOpGroupSMin: { - stream << "OpGroupSMin"; - break; - } - case SpvOpGroupFMax: { - stream << "OpGroupFMax"; - break; - } - case SpvOpGroupUMax: { - stream << "OpGroupUMax"; - break; - } - case SpvOpGroupSMax: { - stream << "OpGroupSMax"; - break; - } - case SpvOpReadPipe: { - stream << "OpReadPipe"; - break; - } - case SpvOpWritePipe: { - stream << "OpWritePipe"; - break; - } - case SpvOpReservedReadPipe: { - stream << "OpReservedReadPipe"; - break; - } - case SpvOpReservedWritePipe: { - stream << "OpReservedWritePipe"; - break; - } - case SpvOpReserveReadPipePackets: { - stream << "OpReserveReadPipePackets"; - break; - } - case SpvOpReserveWritePipePackets: { - stream << "OpReserveWritePipePackets"; - break; - } - case SpvOpCommitReadPipe: { - stream << "OpCommitReadPipe"; - 
break; - } - case SpvOpCommitWritePipe: { - stream << "OpCommitWritePipe"; - break; - } - case SpvOpIsValidReserveId: { - stream << "OpIsValidReserveId"; - break; - } - case SpvOpGetNumPipePackets: { - stream << "OpGetNumPipePackets"; - break; - } - case SpvOpGetMaxPipePackets: { - stream << "OpGetMaxPipePackets"; - break; - } - case SpvOpGroupReserveReadPipePackets: { - stream << "OpGroupReserveReadPipePackets"; - break; - } - case SpvOpGroupReserveWritePipePackets: { - stream << "OpGroupReserveWritePipePackets"; - break; - } - case SpvOpGroupCommitReadPipe: { - stream << "OpGroupCommitReadPipe"; - break; - } - case SpvOpGroupCommitWritePipe: { - stream << "OpGroupCommitWritePipe"; - break; - } - case SpvOpEnqueueMarker: { - stream << "OpEnqueueMarker"; - break; - } - case SpvOpEnqueueKernel: { - stream << "OpEnqueueKernel"; - break; - } - case SpvOpGetKernelNDrangeSubGroupCount: { - stream << "OpGetKernelNDrangeSubGroupCount"; - break; - } - case SpvOpGetKernelNDrangeMaxSubGroupSize: { - stream << "OpGetKernelNDrangeMaxSubGroupSize"; - break; - } - case SpvOpGetKernelWorkGroupSize: { - stream << "OpGetKernelWorkGroupSize"; - break; - } - case SpvOpGetKernelPreferredWorkGroupSizeMultiple: { - stream << "OpGetKernelPreferredWorkGroupSizeMultiple"; - break; - } - case SpvOpRetainEvent: { - stream << "OpRetainEvent"; - break; - } - case SpvOpReleaseEvent: { - stream << "OpReleaseEvent"; - break; - } - case SpvOpCreateUserEvent: { - stream << "OpCreateUserEvent"; - break; - } - case SpvOpIsValidEvent: { - stream << "OpIsValidEvent"; - break; - } - case SpvOpSetUserEventStatus: { - stream << "OpSetUserEventStatus"; - break; - } - case SpvOpCaptureEventProfilingInfo: { - stream << "OpCaptureEventProfilingInfo"; - break; - } - case SpvOpGetDefaultQueue: { - stream << "OpGetDefaultQueue"; - break; - } - case SpvOpBuildNDRange: { - stream << "OpBuildNDRange"; - break; - } - case SpvOpImageSparseSampleImplicitLod: { - stream << "OpImageSparseSampleImplicitLod"; - break; - } - case SpvOpImageSparseSampleExplicitLod: { - stream << "OpImageSparseSampleExplicitLod"; - break; - } - case SpvOpImageSparseSampleDrefImplicitLod: { - stream << "OpImageSparseSampleDrefImplicitLod"; - break; - } - case SpvOpImageSparseSampleDrefExplicitLod: { - stream << "OpImageSparseSampleDrefExplicitLod"; - break; - } - case SpvOpImageSparseSampleProjImplicitLod: { - stream << "OpImageSparseSampleProjImplicitLod"; - break; - } - case SpvOpImageSparseSampleProjExplicitLod: { - stream << "OpImageSparseSampleProjExplicitLod"; - break; - } - case SpvOpImageSparseSampleProjDrefImplicitLod: { - stream << "OpImageSparseSampleProjDrefImplicitLod"; - break; - } - case SpvOpImageSparseSampleProjDrefExplicitLod: { - stream << "OpImageSparseSampleProjDrefExplicitLod"; - break; - } - case SpvOpImageSparseFetch: { - stream << "OpImageSparseFetch"; - break; - } - case SpvOpImageSparseGather: { - stream << "OpImageSparseGather"; - break; - } - case SpvOpImageSparseDrefGather: { - stream << "OpImageSparseDrefGather"; - break; - } - case SpvOpImageSparseTexelsResident: { - stream << "OpImageSparseTexelsResident"; - break; - } - case SpvOpNoLine: { - stream << "OpNoLine"; - break; - } - case SpvOpAtomicFlagTestAndSet: { - stream << "OpAtomicFlagTestAndSet"; - break; - } - case SpvOpAtomicFlagClear: { - stream << "OpAtomicFlagClear"; - break; - } - case SpvOpImageSparseRead: { - stream << "OpImageSparseRead"; - break; - } - case SpvOpDecorateId: { - stream << "OpDecorateId"; - break; - } - case SpvOpSubgroupBallotKHR: { - stream << 
"OpSubgroupBallotKHR"; - break; - } - case SpvOpSubgroupFirstInvocationKHR: { - stream << "OpSubgroupFirstInvocationKHR"; - break; - } - case SpvOpSubgroupAllKHR: { - stream << "OpSubgroupAllKHR"; - break; - } - case SpvOpSubgroupAnyKHR: { - stream << "OpSubgroupAnyKHR"; - break; - } - case SpvOpSubgroupAllEqualKHR: { - stream << "OpSubgroupAllEqualKHR"; - break; - } - case SpvOpSubgroupReadInvocationKHR: { - stream << "OpSubgroupReadInvocationKHR"; - break; - } - case SpvOpGroupIAddNonUniformAMD: { - stream << "OpGroupIAddNonUniformAMD"; - break; - } - case SpvOpGroupFAddNonUniformAMD: { - stream << "OpGroupFAddNonUniformAMD"; - break; - } - case SpvOpGroupFMinNonUniformAMD: { - stream << "OpGroupFMinNonUniformAMD"; - break; - } - case SpvOpGroupUMinNonUniformAMD: { - stream << "OpGroupUMinNonUniformAMD"; - break; - } - case SpvOpGroupSMinNonUniformAMD: { - stream << "OpGroupSMinNonUniformAMD"; - break; - } - case SpvOpGroupFMaxNonUniformAMD: { - stream << "OpGroupFMaxNonUniformAMD"; - break; - } - case SpvOpGroupUMaxNonUniformAMD: { - stream << "OpGroupUMaxNonUniformAMD"; - break; - } - case SpvOpGroupSMaxNonUniformAMD: { - stream << "OpGroupSMaxNonUniformAMD"; - break; - } - case SpvOpFragmentMaskFetchAMD: { - stream << "OpFragmentMaskFetchAMD"; - break; - } - case SpvOpFragmentFetchAMD: { - stream << "OpFragmentFetchAMD"; - break; - } - case SpvOpSubgroupShuffleINTEL: { - stream << "OpSubgroupShuffleINTEL"; - break; - } - case SpvOpSubgroupShuffleDownINTEL: { - stream << "OpSubgroupShuffleDownINTEL"; - break; - } - case SpvOpSubgroupShuffleUpINTEL: { - stream << "OpSubgroupShuffleUpINTEL"; - break; - } - case SpvOpSubgroupShuffleXorINTEL: { - stream << "OpSubgroupShuffleXorINTEL"; - break; - } - case SpvOpSubgroupBlockReadINTEL: { - stream << "OpSubgroupBlockReadINTEL"; - break; - } - case SpvOpSubgroupBlockWriteINTEL: { - stream << "OpSubgroupBlockWriteINTEL"; - break; - } - case SpvOpSubgroupImageBlockReadINTEL: { - stream << "OpSubgroupImageBlockReadINTEL"; - break; - } - case SpvOpSubgroupImageBlockWriteINTEL: { - stream << "OpSubgroupImageBlockWriteINTEL"; - break; - } - case SpvOpDecorateStringGOOGLE: { - stream << "OpDecorateStringGOOGLE"; - break; - } - case SpvOpMemberDecorateStringGOOGLE: { - stream << "OpMemberDecorateStringGOOGLE"; - break; - } - case SpvOpMax: - default: { - stream << "*INVALID*"; - break; - } - }; - - if (inst.has_type()) { - stream << std::string(" %") << std::to_string(inst.type_id()); - } - - for (uint32_t i = 0; i < inst.length(); i++) { - if (inst.is_immediate(i)) { - if (inst.value_type(i) == SpvStringData) { - const char *str = (const char *)inst.data(i); - stream << std::string(" \"") << str << "\""; - break; - } else if (inst.value_type(i) == SpvIntegerData) { - const int *data = (const int *)inst.data(i); - stream << std::string(" ") << std::to_string(*data); - break; - } else if (inst.value_type(i) == SpvFloatData) { - const float *data = (const float *)inst.data(i); - stream << std::string(" ") << std::to_string(*data); - break; - } else if (inst.value_type(i) == SpvBitMaskLiteral) { - stream << std::string(" ") << std::hex << std::showbase << std::uppercase << inst.operand(i) << std::dec; - } else { - stream << std::string(" ") << std::to_string(inst.operand(i)); - } - } else { - stream << std::string(" %") << std::to_string(inst.operand(i)); - } - } - - stream << "\n"; - return stream; + static const std::string invalid_op_name("*INVALID*"); + return invalid_op_name; } // -- +} // namespace } // namespace Internal } // namespace Halide From 
f5d70e8a8787cfcf7df0e068d6cf8c22b0c45430 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:23:53 -0800 Subject: [PATCH 128/166] Fix typos and logic for Vulkan capabilities --- src/Target.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index c71cf9fcb46f..700e9523977b 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -379,16 +379,14 @@ Target::Feature calculate_host_vulkan_capability(Target t) { internal_assert(interface->compute_capability); int major, minor; int err = interface->compute_capability(nullptr, &major, &minor); - internal_assert(err == 0) << "Failed to query cuda compute capability\n"; + internal_assert(err == 0) << "Failed to query vulkan compute capability\n"; int ver = major * 10 + minor; if (ver < 10) { return Target::FeatureEnd; - } else if (ver < 10) { - return Target::VulkanV10; } else if (ver < 12) { - return Target::VulkanV12; + return Target::VulkanV10; } else if (ver < 13) { - return Target::VulkanV13; + return Target::VulkanV12; } else { return Target::VulkanV13; } From b4c9bea6e9e923de1a110b206d625659f4eaf610 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:24:24 -0800 Subject: [PATCH 129/166] Remove leftover debug ifdef --- src/runtime/internal/block_allocator.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index b3301a12bf96..ff81adcc4de2 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -5,8 +5,6 @@ #include "memory_resources.h" #include "region_allocator.h" -// #define DEBUG_INTERNAL - namespace Halide { namespace Runtime { namespace Internal { From fea02d53769c46cc76325b4253f5c592614c65dd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:24:53 -0800 Subject: [PATCH 130/166] Fix typo in comments --- src/runtime/internal/memory_resources.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index a1889541435c..cba4792be1d1 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -134,7 +134,7 @@ ALWAYS_INLINE size_t conform_alignment(size_t requested, size_t required) { return ((required > 0) && (alignment > required)) ? (required * ((alignment / required) + 1)) : alignment; } -// Returns a padded size to accomodate an adjusted offset due to alignment constraints +// Returns a padded size to accommodate an adjusted offset due to alignment constraints // -- Alignment must be power of two! ALWAYS_INLINE size_t aligned_size(size_t offset, size_t size, size_t alignment) { size_t actual_offset = aligned_offset(offset, alignment); From acd5ea5daa56a34f5d81f328da8b6c06cb0a7035 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:25:15 -0800 Subject: [PATCH 131/166] Rename copy_upto(...) method to be copy_up_to(...) 
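The renamed helper copies at most max_chars characters from the source and always NUL-terminates the destination, so callers can pass fixed-size option buffers directly. A minimal usage sketch (illustrative only; the buffer name, size, and source string below are hypothetical, mirroring the fixed-size configuration buffers used by the Vulkan runtime later in this series):

    char device_type[256];                     // hypothetical fixed-size destination
    const char *requested = "discrete-gpu";    // hypothetical source string
    size_t buffer_size = sizeof(device_type) / sizeof(device_type[0]);
    StringUtils::copy_up_to(device_type, requested, buffer_size);  // truncates if needed, then NUL-terminates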
--- src/runtime/internal/string_storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/internal/string_storage.h b/src/runtime/internal/string_storage.h index 83b3acc13623..46b9505ec06a 100644 --- a/src/runtime/internal/string_storage.h +++ b/src/runtime/internal/string_storage.h @@ -68,7 +68,7 @@ struct StringUtils { return size_t(ptr - str); } - static size_t copy_upto(char *dst, const char *src, size_t max_chars) { + static size_t copy_up_to(char *dst, const char *src, size_t max_chars) { size_t length = count_length(src, max_chars); memcpy(dst, src, length); dst[length] = '\0'; From 14d3ab768d394c64b624c4ebc1a973bab3dc9e63 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:26:47 -0800 Subject: [PATCH 132/166] Handle error case for uninitialized buffer allocation (rather than abort) Fix typos in comments --- src/runtime/vulkan.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 11aa495fc798..5278d1ce0add 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -301,7 +301,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { buf->device_interface->impl->use_module(); debug(user_context) - << " Allocated device buffer " << (void *)buf->device + << " allocated device buffer " << (void *)buf->device << " for buffer " << buf << "\n"; // retrieve the buffer from the region @@ -544,7 +544,6 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid } WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_buffer) { - #ifdef DEBUG_RUNTIME debug(user_context) << "halide_copy_to_host (user_context: " << user_context @@ -552,7 +551,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ #endif // Acquire the context so we can use the command queue. This also avoids multiple - // redundant calls to clEnqueueReadBuffer when multiple threads are trying to copy + // redundant calls to enqueue a download when multiple threads are trying to copy // the same buffer. VulkanContext ctx(user_context); if (ctx.error != VK_SUCCESS) { @@ -562,8 +561,10 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ #ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); #endif - - halide_abort_if_false(user_context, halide_buffer->host && halide_buffer->device); + if ((halide_buffer->host == nullptr) || (halide_buffer->device == 0)) { + error(user_context) << "Vulkan: Unable to copy buffer to host ... 
missing host and device pointers!\n"; + return -1; + } device_copy copy_helper = make_device_to_host_copy(halide_buffer); From 14c436331f0721dca8e555d8cb3cbf746a920cfb Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:27:36 -0800 Subject: [PATCH 133/166] Support any arbitrary number of devices and queues for context creation Fix typos in comments --- src/runtime/vulkan_context.h | 63 ++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 62830a07a6d7..8e8cdeb5369d 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -18,15 +18,18 @@ namespace Vulkan { // -------------------------------------------------------------------------- -// An Vulkan context/queue/synchronization lock defined in this module with weak linkage // Vulkan Memory allocator for host-device allocations halide_vulkan_memory_allocator *WEAK cached_allocator = nullptr; + +// Cached instance related handles for device resources VkInstance WEAK cached_instance = nullptr; VkDevice WEAK cached_device = nullptr; VkCommandPool WEAK cached_command_pool = 0; VkQueue WEAK cached_queue = nullptr; VkPhysicalDevice WEAK cached_physical_device = nullptr; uint32_t WEAK cached_queue_family_index = 0; + +// A Vulkan context/queue/synchronization lock defined in this module with weak linkage volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0; // -------------------------------------------------------------------------- @@ -191,25 +194,40 @@ int vk_select_device_for_context(void *user_context, VkInstance *instance, VkDevice *device, VkPhysicalDevice *physical_device, uint32_t *queue_family_index) { - - // For now handle more than 16 devices by just looking at the first 16. 
- VkPhysicalDevice chosen_device = nullptr; - VkPhysicalDevice avail_devices[16]; - uint32_t device_count = sizeof(avail_devices) / sizeof(avail_devices[0]); - VkResult result = vkEnumeratePhysicalDevices(*instance, &device_count, avail_devices); + // query for the number of physical devices available in this instance + uint32_t device_count = 0; + VkResult result = vkEnumeratePhysicalDevices(*instance, &device_count, nullptr); if ((result != VK_SUCCESS) && (result != VK_INCOMPLETE)) { debug(user_context) << "Vulkan: vkEnumeratePhysicalDevices failed with return code: " << vk_get_error_name(result) << "\n"; return halide_error_code_incompatible_device_interface; } - if (device_count == 0) { debug(user_context) << "Vulkan: No devices found.\n"; return halide_error_code_incompatible_device_interface; } + // allocate enough storage for the physical device query results + BlockStorage::Config device_query_storage_config; + device_query_storage_config.entry_size = sizeof(VkPhysicalDevice); + BlockStorage device_query_storage(user_context, device_query_storage_config); + device_query_storage.resize(user_context, device_count); + + VkPhysicalDevice chosen_device = nullptr; + VkPhysicalDevice *avail_devices = (VkPhysicalDevice *)(device_query_storage.data()); + if (avail_devices == nullptr) { + debug(user_context) << "Vulkan: Out of system memory!\n"; + return halide_error_code_out_of_memory; + } + result = vkEnumeratePhysicalDevices(*instance, &device_count, avail_devices); + if ((result != VK_SUCCESS) && (result != VK_INCOMPLETE)) { + debug(user_context) << "Vulkan: vkEnumeratePhysicalDevices failed with return code: " << vk_get_error_name(result) << "\n"; + return halide_error_code_incompatible_device_interface; + } + + // get the configurable device type to search for (e.g. 'cpu', 'gpu', 'integrated-gpu', 'discrete-gpu', ...) const char *dev_type = halide_vulkan_get_device_type(user_context); - // Try to find a device that supports compute. + // try to find a matching device that supports compute. 
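The replacement above uses the standard Vulkan two-call enumeration idiom: query the count with a null output pointer, size the storage, then call again to fill it. A minimal self-contained sketch of the same pattern (illustrative only; it uses the standard Vulkan header and std::vector for brevity, whereas the runtime must rely on BlockStorage as shown above):

    #include <vector>
    #include <vulkan/vulkan.h>

    std::vector<VkPhysicalDevice> enumerate_devices(VkInstance instance) {
        uint32_t device_count = 0;
        vkEnumeratePhysicalDevices(instance, &device_count, nullptr);         // first call: get the count
        std::vector<VkPhysicalDevice> devices(device_count);
        vkEnumeratePhysicalDevices(instance, &device_count, devices.data());  // second call: fill the storage
        return devices;
    }

The selection loop below then walks the enumerated devices and their queue families using the same count-then-fill idiom.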
uint32_t queue_family = 0; for (uint32_t i = 0; (chosen_device == nullptr) && (i < device_count); i++) { VkPhysicalDeviceProperties properties; @@ -238,8 +256,20 @@ int vk_select_device_for_context(void *user_context, } if (matching_device) { - VkQueueFamilyProperties queue_properties[16]; - uint32_t queue_properties_count = sizeof(queue_properties) / sizeof(queue_properties[0]); + // get the number of supported queues for this physical device + uint32_t queue_properties_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties(avail_devices[i], &queue_properties_count, nullptr); + if (queue_properties_count < 1) { + continue; + } + + // allocate enough storage for the queue properties query results + BlockStorage::Config queue_properties_storage_config; + queue_properties_storage_config.entry_size = sizeof(VkQueueFamilyProperties); + BlockStorage queue_properties_storage(user_context, queue_properties_storage_config); + queue_properties_storage.resize(user_context, queue_properties_count); + + VkQueueFamilyProperties *queue_properties = (VkQueueFamilyProperties *)(queue_properties_storage.data()); vkGetPhysicalDeviceQueueFamilyProperties(avail_devices[i], &queue_properties_count, queue_properties); for (uint32_t j = 0; (chosen_device == nullptr) && (j < queue_properties_count); j++) { if (queue_properties[j].queueCount > 0 && @@ -269,17 +299,24 @@ int vk_select_device_for_context(void *user_context, int vk_create_device(void *user_context, const StringTable &requested_layers, VkInstance *instance, VkDevice *device, VkQueue *queue, VkPhysicalDevice *physical_device, uint32_t *queue_family_index, const VkAllocationCallbacks *alloc_callbacks) { debug(user_context) << " vk_create_device (user_context=" << user_context << ")\n"; + + debug(user_context) << " checking for required device extensions ...\n"; StringTable required_device_extensions; vk_get_required_device_extensions(user_context, required_device_extensions); + debug(user_context) << " checking for optional device extensions ...\n"; StringTable optional_device_extensions; vk_get_optional_device_extensions(user_context, optional_device_extensions); + debug(user_context) << " validating supported device extensions ...\n"; StringTable supported_device_extensions; vk_get_supported_device_extensions(user_context, *physical_device, supported_device_extensions); bool valid_device = vk_validate_required_extension_support(user_context, required_device_extensions, supported_device_extensions); - halide_abort_if_false(user_context, valid_device); + if (!valid_device) { + debug(user_context) << "Vulkan: Unable to validate required extension support!\n"; + return halide_error_code_incompatible_device_interface; + } debug(user_context) << " found " << (uint32_t)required_device_extensions.size() << " required extensions for device!\n"; for (int n = 0; n < (int)required_device_extensions.size(); ++n) { @@ -334,7 +371,7 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk } if (vkGetPhysicalDeviceFeatures2KHR) { - debug(user_context) << " qerying for extended device features...\n"; + debug(user_context) << " querying for extended device features...\n"; vkGetPhysicalDeviceFeatures2KHR(*physical_device, &device_features_ext); debug(user_context) << " shader int8 support: " << (shader_f16_i8_ext.shaderInt8 ? "true" : "false") << "...\n"; debug(user_context) << " shader float16 support: " << (shader_f16_i8_ext.shaderFloat16 ?
"true" : "false") << "...\n"; From e08c646938bf54bc26409ae8d4be81975211f014 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:29:08 -0800 Subject: [PATCH 134/166] Add get/set alloc_config methods and API hooks for configuring the VulkanMemoryAllocator --- src/runtime/vulkan_extensions.h | 44 ++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/src/runtime/vulkan_extensions.h b/src/runtime/vulkan_extensions.h index 7f69a17426ee..5bf17048a2ad 100644 --- a/src/runtime/vulkan_extensions.h +++ b/src/runtime/vulkan_extensions.h @@ -28,13 +28,17 @@ WEAK char build_options[1024]; WEAK ScopedSpinLock::AtomicFlag build_options_lock = 0; WEAK bool build_options_initialized = false; +WEAK char alloc_config[1024]; +WEAK ScopedSpinLock::AtomicFlag alloc_config_lock = 0; +WEAK bool alloc_config_initialized = false; + // -------------------------------------------------------------------------- namespace { void vk_set_layer_names_internal(const char *n) { if (n) { size_t buffer_size = sizeof(layer_names) / sizeof(layer_names[0]); - StringUtils::copy_upto(layer_names, n, buffer_size); + StringUtils::copy_up_to(layer_names, n, buffer_size); } else { layer_names[0] = 0; } @@ -55,7 +59,7 @@ const char *vk_get_layer_names_internal(void *user_context) { void vk_set_extension_names_internal(const char *n) { if (n) { size_t buffer_size = sizeof(extension_names) / sizeof(extension_names[0]); - StringUtils::copy_upto(extension_names, n, buffer_size); + StringUtils::copy_up_to(extension_names, n, buffer_size); } else { extension_names[0] = 0; } @@ -73,7 +77,7 @@ const char *vk_get_extension_names_internal(void *user_context) { void vk_set_device_type_internal(const char *n) { if (n) { size_t buffer_size = sizeof(device_type) / sizeof(device_type[0]); - StringUtils::copy_upto(device_type, n, buffer_size); + StringUtils::copy_up_to(device_type, n, buffer_size); } else { device_type[0] = 0; } @@ -91,7 +95,7 @@ const char *vk_get_device_type_internal(void *user_context) { void vk_set_build_options_internal(const char *n) { if (n) { size_t buffer_size = sizeof(build_options) / sizeof(build_options[0]); - StringUtils::copy_upto(build_options, n, buffer_size); + StringUtils::copy_up_to(build_options, n, buffer_size); } else { build_options[0] = 0; } @@ -106,6 +110,24 @@ const char *vk_get_build_options_internal(void *user_context) { return build_options; } +void vk_set_alloc_config_internal(const char *n) { + if (n) { + size_t buffer_size = sizeof(alloc_config) / sizeof(alloc_config[0]); + StringUtils::copy_up_to(alloc_config, n, buffer_size); + } else { + alloc_config[0] = 0; + } + alloc_config_initialized = true; +} + +const char *vk_get_alloc_config_internal(void *user_context) { + if (!alloc_config_initialized) { + const char *name = getenv("HL_VK_ALLOC_CONFIG"); + vk_set_alloc_config_internal(name); + } + return alloc_config; +} + // -------------------------------------------------------------------------- uint32_t vk_get_requested_layers(void *user_context, StringTable &layer_table) { @@ -183,7 +205,7 @@ uint32_t vk_get_optional_device_extensions(void *user_context, StringTable &ext_ return optional_ext_count; } uint32_t vk_get_supported_device_extensions(void *user_context, VkPhysicalDevice physical_device, StringTable &ext_table) { - + debug(user_context) << "vk_get_supported_device_extensions\n"; if (vkEnumerateDeviceExtensionProperties == nullptr) { debug(user_context) << "Vulkan: Missing vkEnumerateDeviceExtensionProperties proc address! 
Invalid loader?!\n"; return 0; @@ -193,7 +215,7 @@ uint32_t vk_get_supported_device_extensions(void *user_context, VkPhysicalDevice vkEnumerateDeviceExtensionProperties(physical_device, nullptr, &avail_ext_count, nullptr); debug(user_context) << "Vulkan: vkEnumerateDeviceExtensionProperties found " << avail_ext_count << " extensions ...\n"; - if (avail_ext_count) { + if (avail_ext_count > 0) { BlockStorage::Config config; config.entry_size = sizeof(VkExtensionProperties); config.minimum_capacity = avail_ext_count; @@ -291,6 +313,16 @@ WEAK const char *halide_vulkan_get_build_options(void *user_context) { return vk_get_build_options_internal(user_context); } +WEAK void halide_vulkan_set_alloc_config(const char *n) { + ScopedSpinLock lock(&alloc_config_lock); + vk_set_alloc_config_internal(n); +} + +WEAK const char *halide_vulkan_get_alloc_config(void *user_context) { + ScopedSpinLock lock(&alloc_config_lock); + return vk_get_alloc_config_internal(user_context); +} + // -------------------------------------------------------------------------- } // extern "C" From 3ca887099ecc33e2d5deee2be26c38349356d026 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:30:38 -0800 Subject: [PATCH 135/166] Remove leftover debug ifdef --- src/runtime/vulkan_internal.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 6f2815b48b85..5d8dbf0484d0 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -14,9 +14,6 @@ #include "vulkan_interface.h" -#ifndef DEBUG_RUNTIME -#define DEBUG_RUNTIME -#endif // -- namespace Halide { From fcf0b50093eec229ad7bb296d884a49b50c8ddf1 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:31:06 -0800 Subject: [PATCH 136/166] Hookup API methods for get/set alloc_config when initializing the VulkanMemoryAllocator --- src/runtime/vulkan_memory.h | 41 +++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index e033fef805fb..d647e5bc2150 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -871,26 +871,31 @@ VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, const VkAllocationCallbacks *alloc_callbacks) { SystemMemoryAllocatorFns system_allocator = {vk_system_malloc, vk_system_free}; - VulkanMemoryConfig config = memory_allocator_config; - const char *min_block_size_env = getenv("HL_VK_MIN_BLOCK_SIZE"); - const char *max_block_size_env = getenv("HL_VK_MAX_BLOCK_SIZE"); - const char *max_block_count_env = getenv("HL_VK_MAX_BLOCK_COUNT"); - - if (!StringUtils::is_empty(min_block_size_env)) { - config.minimum_block_size = atoi(min_block_size_env) * 1024 * 1024; - debug(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.minimum_block_size << " for minimum block size (in bytes)\n"; - } - - if (!StringUtils::is_empty(max_block_size_env)) { - config.maximum_block_size = atoi(max_block_size_env) * 1024 * 1024; - debug(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_size << " for maximum block size (in bytes)\n"; - } - - if (!StringUtils::is_empty(max_block_count_env)) { - config.maximum_block_count = atoi(max_block_count_env); - debug(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_count << " for maximum block count\n"; + // Parse the allocation config string (if specified). 
+ // + // `HL_VK_ALLOC_CONFIG=N:N:N` will tell Halide to configure the Vulkan memory + // allocator use the given constraints specified as three integer values + // separated by a `:` or `;`. These values correspond to `minimum_block_size`, + // `maximum_block_size` and `maximum_block_count`. + // + const char *alloc_config = vk_get_alloc_config_internal(user_context); + if (!StringUtils::is_empty(alloc_config)) { + StringTable alloc_config_values; + alloc_config_values.parse(user_context, alloc_config, HL_VK_ENV_DELIM); + if (alloc_config_values.size() > 0) { + config.minimum_block_size = atoi(alloc_config_values[0]) * 1024 * 1024; + print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.minimum_block_size << " for minimum block size (in bytes)\n"; + } + if (alloc_config_values.size() > 1) { + config.maximum_block_size = atoi(alloc_config_values[1]) * 1024 * 1024; + print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.minimum_block_size << " for minimum block size (in bytes)\n"; + } + if (alloc_config_values.size() > 2) { + config.maximum_block_count = atoi(alloc_config_values[2]); + print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_count << " for maximum block count\n"; + } } return VulkanMemoryAllocator::create(user_context, From f191715b477dc97e672e5a810a7576a48c55e6e7 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 15 Dec 2022 13:32:11 -0800 Subject: [PATCH 137/166] Remove empty lines in main --- test/correctness/interleave_rgb.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/correctness/interleave_rgb.cpp b/test/correctness/interleave_rgb.cpp index d7a8f9d7c110..d3465db5f63e 100644 --- a/test/correctness/interleave_rgb.cpp +++ b/test/correctness/interleave_rgb.cpp @@ -103,7 +103,6 @@ bool test_deinterleave(int x_stride) { } int main(int argc, char **argv) { - for (int x_stride : {3, 4}) { if (!test_interleave(x_stride)) return -1; if (!test_interleave(x_stride)) return -1; @@ -113,7 +112,6 @@ int main(int argc, char **argv) { if (!test_deinterleave(x_stride)) return -1; if (!test_deinterleave(x_stride)) return -1; } - printf("Success!\n"); return 0; } From 069b2945db54961f26a19532d38dfa9fd8a6c45c Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 30 Jan 2023 15:48:35 -0800 Subject: [PATCH 138/166] Add required capability flags for 8-bit and 16-bit uniform and storage buffer access Handle casts for GLSL ops (spec requires all args to be the same type as the return type) --- src/CodeGen_Vulkan_Dev.cpp | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 0cc276d505a2..fb44c2756276 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2201,7 +2201,13 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_glsl_op(SpvId glsl_op_code, Type t operands.reserve(args.size()); for (const Expr &e : args) { e.accept(this); - operands.push_back(builder.current_id()); + SpvId arg_value_id = builder.current_id(); + if(builder.type_of(arg_value_id) != type_id) { + SpvId casted_value_id = cast_type(type, e.type(), arg_value_id); // all GLSL args must match return type + operands.push_back(casted_value_id); + } else { + operands.push_back(arg_value_id); + } } // sanity check the expected number of operands @@ -2580,15 +2586,15 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 DescriptorSet descriptor_set; 
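As a usage note for the allocator-configuration hook added in PATCH 134 and wired up in PATCH 136 above: the same string can be supplied either through the environment or through the new API call, and it is read when the Vulkan memory allocator is created. A small sketch with arbitrary example values (at this point in the series the string carries three values; a later patch extends it to four):

    // Via the environment, using ':' on Linux/OSX/Posix or ';' on Windows:
    //   HL_VK_ALLOC_CONFIG=32:256:16   -> 32 MB minimum block, 256 MB maximum block, at most 16 blocks
    //
    // Or programmatically, before the Vulkan context (and hence the allocator) is created:
    halide_vulkan_set_alloc_config("32:256:16");
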
descriptor_set.entry_point_name = entry_point_name; - // Add required extension support for storage types + // Add required extension support for storage types which are necessary to + // use smaller bit-width types for any halide buffer *or* device argument + // (passed as a runtime array) for (const auto &arg : args) { - if (arg.is_buffer) { - if (arg.type.is_int_or_uint()) { - if (arg.type.bits() == 8) { - builder.require_extension("SPV_KHR_8bit_storage"); - } else if (arg.type.bits() == 16) { - builder.require_extension("SPV_KHR_16bit_storage"); - } + if (arg.type.is_int_or_uint()) { + if (arg.type.bits() == 8) { + builder.require_extension("SPV_KHR_8bit_storage"); + } else if (arg.type.bits() == 16) { + builder.require_extension("SPV_KHR_16bit_storage"); } } } @@ -2599,6 +2605,13 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 SpvBuilder::StructMemberTypes param_struct_members; for (const auto &arg : args) { if (!arg.is_buffer) { + // Add required access capability for smaller bit-width types used as runtime arrays + if (arg.type.bits() == 8) { + builder.require_capability(SpvCapabilityUniformAndStorageBuffer8BitAccess); + } else if (arg.type.bits() == 16) { + builder.require_capability(SpvCapabilityUniformAndStorageBuffer16BitAccess); + } + SpvId arg_type_id = builder.declare_type(arg.type); param_struct_members.push_back(arg_type_id); } @@ -2744,11 +2757,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; } - if (builder.is_capability_required(SpvCapabilityInt64) && !target.has_feature(Target::VulkanInt64)) { - user_error << "Vulkan: Code requires 64-bit integer support (which is not enabled in the target features)! " - << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; - } - if (builder.is_capability_required(SpvCapabilityFloat16) && !target.has_feature(Target::VulkanFloat16)) { user_error << "Vulkan: Code requires 16-bit floating-point support (which is not enabled in the target features)! 
" << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; From 4491f7846b5ab0387a98246bef39fb0cb71a2008 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 30 Jan 2023 15:50:29 -0800 Subject: [PATCH 139/166] Add VkPhysicalDevice8BitStorageFeaturesKHR and related constants --- src/runtime/mini_vulkan.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index 92dc63ea8e31..b6547d3dee4e 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -6049,14 +6049,24 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetValidationCacheDataEXT( #define VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_SPEC_VERSION 1 #define VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME "VK_EXT_shader_viewport_index_layer" -// Provided by VK_VERSION_1_2 +// Provided by VK_VERSION_1_1 typedef struct VkPhysicalDeviceShaderFloat16Int8Features { - uint32_t sType; - void *pNext; - VkBool32 shaderFloat16; - VkBool32 shaderInt8; + uint32_t sType; + void *pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; } VkPhysicalDeviceShaderFloat16Int8FeaturesKHR; -#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES 1000082000 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR 1000082000 + +// Provided by VK_VERSION_1_2 +typedef struct VkPhysicalDevice8BitStorageFeatures { + uint32_t sType; + void* pNext; + VkBool32 storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; +} VkPhysicalDevice8BitStorageFeaturesKHR; +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR 1000177000 // Provided by VK_KHR_portability_enumeration #define VK_KHR_portability_enumeration 1 From eb8a0aea0db5e4e9b2d71298be6e8c9de0ff1e34 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 30 Jan 2023 15:57:32 -0800 Subject: [PATCH 140/166] Query for 8-bit and 16-bit uniform and storage access support. Enable these as part of the device feature query chain. --- src/runtime/vulkan_context.h | 41 ++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 8e8cdeb5369d..4d0844ba1658 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -342,8 +342,17 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk &queue_priority, }; + // Get the API version to determine what device features are valid to search for + VkPhysicalDeviceProperties device_properties = {0}; + debug(user_context) << " querying for device properties ...\n"; + vkGetPhysicalDeviceProperties(*physical_device, &device_properties); + uint32_t major_version = VK_API_VERSION_MAJOR(device_properties.apiVersion); + uint32_t minor_version = VK_API_VERSION_MINOR(device_properties.apiVersion); + bool has_capability_v11 = (major_version >= 1) && (minor_version >= 1); // supports >= v1.1 + bool has_capability_v12 = (major_version >= 1) && (minor_version >= 2); // supports >= v1.2 + debug(user_context) << " found device compute capability v" << major_version << "." 
<< minor_version << " ...\n"; + // Get the device features so that all supported features are enabled when device is created - // VkPhysicalDeviceFeatures device_features = {}; void *extended_features_ptr = nullptr; void *standard_features_ptr = nullptr; @@ -354,28 +363,42 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk debug(user_context) << " shader int64 support: " << (device_features.shaderInt64 ? "true" : "false") << "...\n"; debug(user_context) << " shader int16 support: " << (device_features.shaderInt16 ? "true" : "false") << "...\n"; - // If the instance runtime supports querying extended device features, request them - VkPhysicalDeviceShaderFloat16Int8FeaturesKHR shader_f16_i8_ext = { - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES, + // assemble the chain of features to query, but only add the ones that exist in the API version + VkPhysicalDeviceShaderFloat16Int8FeaturesKHR shader_f16_i8_ext = { // requires v1.2+ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR, nullptr, VK_FALSE, VK_FALSE}; + VkPhysicalDevice8BitStorageFeaturesKHR storage_8bit_ext = { // requires v1.2+ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR, + &shader_f16_i8_ext, VK_FALSE, VK_FALSE, VK_FALSE}; + + VkPhysicalDevice16BitStorageFeaturesKHR storage_16bit_ext = { // requires v1.1+ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR, + (has_capability_v12 ? &storage_8bit_ext : nullptr), + VK_FALSE, VK_FALSE, VK_FALSE, VK_FALSE}; + VkPhysicalDeviceFeatures2KHR device_features_ext = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR, - &shader_f16_i8_ext, - device_features}; + &storage_16bit_ext, device_features}; - // Look for v1.1+ device feature query method + // Look for extended device feature query method (KHR was removed when it was adopted into v1.1+) PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2KHR"); // v1.0+ if (!vkGetPhysicalDeviceFeatures2KHR) { vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(*instance, "vkGetPhysicalDeviceFeatures2"); } - if (vkGetPhysicalDeviceFeatures2KHR) { + // If the instance runtime supports querying extended device features, request them + if (vkGetPhysicalDeviceFeatures2KHR && has_capability_v11) { + debug(user_context) << " querying for extended device features...\n"; vkGetPhysicalDeviceFeatures2KHR(*physical_device, &device_features_ext); debug(user_context) << " shader int8 support: " << (shader_f16_i8_ext.shaderInt8 ? "true" : "false") << "...\n"; debug(user_context) << " shader float16 support: " << (shader_f16_i8_ext.shaderFloat16 ? "true" : "false") << "...\n"; - extended_features_ptr = (void *)(&device_features_ext); // pass v1.1 extended features (which also contains the standard features) + if(has_capability_v12) { + debug(user_context) << " storage buffer 8bit access support: " << (storage_8bit_ext.storageBuffer8BitAccess ? "true" : "false") << "...\n"; + debug(user_context) << " storage buffer 16bit access support: " << (storage_16bit_ext.storageBuffer16BitAccess ? 
"true" : "false") << "...\n"; + } + extended_features_ptr = (void *)(&device_features_ext); // pass extended features (which also contains the standard features) } else { standard_features_ptr = &device_features; // pass v1.0 standard features } From 0be26d7e95d11af7d39910ff68fb6fc1d206c5e3 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 30 Jan 2023 15:58:42 -0800 Subject: [PATCH 141/166] Use VK_WHOLE_SIZE for setting buffer (to pass validation ... otherwise size has to be a multiple of alignment) Remove useless debug asserts for static variables Fix debug logging messages for allocations of scalars (which may not have a dim array) --- src/runtime/vulkan.cpp | 46 ++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 5278d1ce0add..e7c815de0e28 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -38,7 +38,9 @@ WEAK int halide_vulkan_acquire_context(void *user_context, VkQueue *queue, uint32_t *queue_family_index, bool create) { - +#ifdef DEBUG_RUNTIME + halide_start_clock(user_context); +#endif halide_debug_assert(user_context, instance != nullptr); halide_debug_assert(user_context, device != nullptr); halide_debug_assert(user_context, queue != nullptr); @@ -46,10 +48,6 @@ WEAK int halide_vulkan_acquire_context(void *user_context, while (__atomic_test_and_set(&thread_lock, __ATOMIC_ACQUIRE)) {} // If the context has not been initialized, initialize it now. - halide_debug_assert(user_context, &cached_instance != nullptr); - halide_debug_assert(user_context, &cached_device != nullptr); - halide_debug_assert(user_context, &cached_queue != nullptr); - halide_debug_assert(user_context, &cached_physical_device != nullptr); if ((cached_instance == nullptr) && create) { int result = vk_create_context(user_context, reinterpret_cast(&cached_allocator), @@ -60,6 +58,7 @@ WEAK int halide_vulkan_acquire_context(void *user_context, &cached_queue, &cached_queue_family_index); if (result != halide_error_code_success) { + debug(user_context) << "halide_vulkan_acquire_context: FAILED to create context!\n"; __atomic_clear(&thread_lock, __ATOMIC_RELEASE); return result; } @@ -258,27 +257,40 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { } size_t size = buf->size_in_bytes(); - halide_debug_assert(user_context, size != 0); if (buf->device) { - return 0; + MemoryRegion *device_region = (MemoryRegion*)(buf->device); + if(device_region->size >= size) { + debug(user_context) << "Vulkan: Requested allocation for existing device memory ... using existing buffer!\n"; + return 0; + } else { + debug(user_context) << "Vulkan: Requested allocation of different size ... 
reallocating buffer!\n"; + if (halide_can_reuse_device_allocations(user_context)) { + ctx.allocator->release(user_context, device_region); + } else { + ctx.allocator->reclaim(user_context, device_region); + } + buf->device = 0; + } } for (int i = 0; i < buf->dimensions; i++) { halide_debug_assert(user_context, buf->dim[i].stride >= 0); } - debug(user_context) << " allocating buffer: " - << "extents: " << buf->dim[0].extent << "x" - << buf->dim[1].extent << "x" << buf->dim[2].extent << "x" - << buf->dim[3].extent << " " - << "strides: " << buf->dim[0].stride << "x" - << buf->dim[1].stride << "x" << buf->dim[2].stride << "x" - << buf->dim[3].stride << " " - << "type: " << buf->type << " " +#ifdef DEBUG_RUNTIME + debug(user_context) << " allocating buffer: "; + if(buf && buf->dim) { + debug(user_context) << "extents: " << buf->dim[0].extent << "x" + << buf->dim[1].extent << "x" << buf->dim[2].extent << "x" + << buf->dim[3].extent << " " + << "strides: " << buf->dim[0].stride << "x" + << buf->dim[1].stride << "x" << buf->dim[2].stride << "x" + << buf->dim[3].stride << " "; + } + debug(user_context) << "type: " << buf->type << " " << "size_in_bytes: " << (uint64_t)size << " " << "(or " << (size * 1e-6f) << "MB)\n"; -#ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); #endif @@ -335,7 +347,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { } // fill buffer with zero values - vkCmdFillBuffer(command_buffer, *device_buffer, 0, device_region->size, 0); + vkCmdFillBuffer(command_buffer, *device_buffer, 0, VK_WHOLE_SIZE, 0); debug(user_context) << " zeroing device_buffer=" << (void *)device_buffer << " size=" << (uint32_t)device_region->size << "\n"; From 9ca31fec2511de124aa5e2a15a7cdcb0c64cdc50 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 30 Jan 2023 16:00:49 -0800 Subject: [PATCH 142/166] Query for device limits to enforce min alignment constraints for storage and uniform buffers --- src/runtime/vulkan_memory.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index d647e5bc2150..a8dc787df962 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -117,6 +117,7 @@ class VulkanMemoryAllocator { VulkanMemoryConfig config; VkDevice device = nullptr; VkPhysicalDevice physical_device = nullptr; + VkPhysicalDeviceLimits physical_device_limits = {}; const VkAllocationCallbacks *alloc_callbacks = nullptr; BlockAllocator *block_allocator = nullptr; }; @@ -171,6 +172,12 @@ void VulkanMemoryAllocator::initialize(void *user_context, block_allocator_config.minimum_block_size = cfg.minimum_block_size; block_allocator = BlockAllocator::create(user_context, block_allocator_config, allocators); halide_abort_if_false(user_context, block_allocator != nullptr); + + // get the physical device properties to determine limits and allocation requirements + VkPhysicalDeviceProperties physical_device_properties = {0}; + memset(&physical_device_limits, 0, sizeof(VkPhysicalDeviceLimits)); + vkGetPhysicalDeviceProperties(physical_device, &physical_device_properties); + memcpy(&physical_device_limits, &(physical_device_properties.limits), sizeof(VkPhysicalDeviceLimits)); } MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { @@ -488,14 +495,22 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc vkGetBufferMemoryRequirements(instance->device, buffer, 
&memory_requirements); vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); -#if defined(HL_VK_DEBUG_MEM) +// #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" << "size=" << (uint32_t)block->size << ", " << "alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " + << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " << "dedicated=" << (block->dedicated ? "true" : "false") << ")\n"; -#endif +// #endif - block->properties.alignment = memory_requirements.alignment; + if(usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { + block->properties.alignment = instance->physical_device_limits.minStorageBufferOffsetAlignment; + } else if(usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { + block->properties.alignment = instance->physical_device_limits.minUniformBufferOffsetAlignment; + } else { + block->properties.alignment = memory_requirements.alignment; + } block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; From 3bddbfc88af6697b88bcac4d77621b572963d176 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 30 Jan 2023 16:01:39 -0800 Subject: [PATCH 143/166] Fix shutdown sequence to iterate over descriptor sets Avoid bug in validation layer by reordering destruction sequence --- src/runtime/vulkan_resources.h | 54 +++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 229b06c926c0..56fd92517693 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -478,6 +478,9 @@ VkResult vk_update_descriptor_set(void *user_context, descriptor_buffer_info.append(user_context, &scalar_args_descriptor_buffer_info); scalar_args_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); +#ifdef DEBUG_RUNTIME + debug(user_context) << "[0] offset=0 (uniform)\n"; +#endif VkWriteDescriptorSet uniform_buffer_write_descriptor_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type nullptr, // pointer to struct extending this @@ -519,6 +522,10 @@ VkResult vk_update_descriptor_set(void *user_context, descriptor_buffer_info.append(user_context, &device_buffer_info); VkDescriptorBufferInfo *device_buffer_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); +#ifdef DEBUG_RUNTIME + debug(user_context) << "[" << (uint32_t)write_descriptor_set.size() << "] offset=" << (uint32_t)(range_offset) << " (storage)\n"; +#endif + VkWriteDescriptorSet storage_buffer_write_descriptor_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type nullptr, // pointer to struct extending this @@ -1215,12 +1222,21 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat void operator()(VulkanCompilationCacheEntry *cache_entry) { if (cache_entry != nullptr) { - if (cache_entry->shader_module) { - debug(user_context) << " . 
destroying shader module " << (void *)cache_entry->shader_module << "\n"; - vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); - cache_entry->shader_module = {0}; - } if (cache_entry->shader_bindings) { + if (cache_entry->descriptor_set_layouts) { + for (uint32_t n = 0; n < cache_entry->shader_count; n++) { + debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->shader_bindings[n].entry_point_name << "\n"; + vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); + cache_entry->descriptor_set_layouts[n] = {0}; + } + vk_host_free(user_context, cache_entry->descriptor_set_layouts, allocator->callbacks()); + cache_entry->descriptor_set_layouts = nullptr; + } + if (cache_entry->pipeline_layout) { + debug(user_context) << " destroying pipeline layout " << (void *)cache_entry->pipeline_layout << "\n"; + vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); + cache_entry->pipeline_layout = {0}; + } for (uint32_t n = 0; n < cache_entry->shader_count; n++) { if (cache_entry->shader_bindings[n].args_region) { vk_destroy_scalar_uniform_buffer(user_context, allocator, cache_entry->shader_bindings[n].args_region); @@ -1230,10 +1246,6 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat vk_destroy_descriptor_pool(user_context, allocator, cache_entry->shader_bindings[n].descriptor_pool); cache_entry->shader_bindings[n].descriptor_pool = {0}; } - if (cache_entry->shader_bindings[n].compute_pipeline) { - vk_destroy_compute_pipeline(user_context, allocator, cache_entry->shader_bindings[n].compute_pipeline); - cache_entry->shader_bindings[n].compute_pipeline = {0}; - } if (cache_entry->shader_bindings[n].specialization_constants) { vk_host_free(user_context, cache_entry->shader_bindings[n].specialization_constants, allocator->callbacks()); cache_entry->shader_bindings[n].specialization_constants = nullptr; @@ -1242,27 +1254,21 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat vk_host_free(user_context, cache_entry->shader_bindings[n].shared_memory_allocations, allocator->callbacks()); cache_entry->shader_bindings[n].shared_memory_allocations = nullptr; } + if (cache_entry->shader_bindings[n].compute_pipeline) { + vk_destroy_compute_pipeline(user_context, allocator, cache_entry->shader_bindings[n].compute_pipeline); + cache_entry->shader_bindings[n].compute_pipeline = {0}; + } } vk_host_free(user_context, cache_entry->shader_bindings, allocator->callbacks()); cache_entry->shader_bindings = nullptr; - cache_entry->shader_count = 0; - } - if (cache_entry->descriptor_set_layouts) { - for (uint32_t n = 0; n < cache_entry->shader_count; n++) { - debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->shader_bindings[n].entry_point_name << "\n"; - vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); - cache_entry->descriptor_set_layouts[n] = {0}; - } - vk_host_free(user_context, cache_entry->descriptor_set_layouts, allocator->callbacks()); - cache_entry->descriptor_set_layouts = nullptr; } - if (cache_entry->pipeline_layout) { - debug(user_context) << " destroying pipeline layout " << (void *)cache_entry->pipeline_layout << "\n"; - vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); - cache_entry->pipeline_layout = {0}; + if (cache_entry->shader_module) { + debug(user_context) << " . 
destroying shader module " << (void *)cache_entry->shader_module << "\n"; + vkDestroyShaderModule(allocator->current_device(), cache_entry->shader_module, allocator->callbacks()); + cache_entry->shader_module = {0}; } - + cache_entry->shader_count = 0; vk_host_free(user_context, cache_entry, allocator->callbacks()); } } From ad3742e3544eff148107fc504d8cefdc83a81169 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 30 Jan 2023 16:09:47 -0800 Subject: [PATCH 144/166] Clang format & tidy pass --- src/CodeGen_Vulkan_Dev.cpp | 10 +++++----- src/runtime/mini_vulkan.h | 18 +++++++++--------- src/runtime/vulkan.cpp | 8 ++++---- src/runtime/vulkan_context.h | 27 ++++++++++++++------------- src/runtime/vulkan_memory.h | 8 ++++---- src/runtime/vulkan_resources.h | 4 ++-- 6 files changed, 38 insertions(+), 37 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index fb44c2756276..903f49d151f2 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2202,8 +2202,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit_glsl_op(SpvId glsl_op_code, Type t for (const Expr &e : args) { e.accept(this); SpvId arg_value_id = builder.current_id(); - if(builder.type_of(arg_value_id) != type_id) { - SpvId casted_value_id = cast_type(type, e.type(), arg_value_id); // all GLSL args must match return type + if (builder.type_of(arg_value_id) != type_id) { + SpvId casted_value_id = cast_type(type, e.type(), arg_value_id); // all GLSL args must match return type operands.push_back(casted_value_id); } else { operands.push_back(arg_value_id); @@ -2586,8 +2586,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 DescriptorSet descriptor_set; descriptor_set.entry_point_name = entry_point_name; - // Add required extension support for storage types which are necessary to - // use smaller bit-width types for any halide buffer *or* device argument + // Add required extension support for storage types which are necessary to + // use smaller bit-width types for any halide buffer *or* device argument // (passed as a runtime array) for (const auto &arg : args) { if (arg.type.is_int_or_uint()) { @@ -2611,7 +2611,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_device_args(const Stmt &s, uint3 } else if (arg.type.bits() == 16) { builder.require_capability(SpvCapabilityUniformAndStorageBuffer16BitAccess); } - + SpvId arg_type_id = builder.declare_type(arg.type); param_struct_members.push_back(arg_type_id); } diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index b6547d3dee4e..184282f9a878 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -6051,20 +6051,20 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetValidationCacheDataEXT( // Provided by VK_VERSION_1_1 typedef struct VkPhysicalDeviceShaderFloat16Int8Features { - uint32_t sType; - void *pNext; - VkBool32 shaderFloat16; - VkBool32 shaderInt8; + uint32_t sType; + void *pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; } VkPhysicalDeviceShaderFloat16Int8FeaturesKHR; #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR 1000082000 // Provided by VK_VERSION_1_2 typedef struct VkPhysicalDevice8BitStorageFeatures { - uint32_t sType; - void* pNext; - VkBool32 storageBuffer8BitAccess; - VkBool32 uniformAndStorageBuffer8BitAccess; - VkBool32 storagePushConstant8; + uint32_t sType; + void *pNext; + VkBool32 storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; } 
VkPhysicalDevice8BitStorageFeaturesKHR; #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR 1000177000 diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index e7c815de0e28..e963fb1dd32d 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -58,7 +58,7 @@ WEAK int halide_vulkan_acquire_context(void *user_context, &cached_queue, &cached_queue_family_index); if (result != halide_error_code_success) { - debug(user_context) << "halide_vulkan_acquire_context: FAILED to create context!\n"; + debug(user_context) << "halide_vulkan_acquire_context: FAILED to create context!\n"; __atomic_clear(&thread_lock, __ATOMIC_RELEASE); return result; } @@ -258,8 +258,8 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { size_t size = buf->size_in_bytes(); if (buf->device) { - MemoryRegion *device_region = (MemoryRegion*)(buf->device); - if(device_region->size >= size) { + MemoryRegion *device_region = (MemoryRegion *)(buf->device); + if (device_region->size >= size) { debug(user_context) << "Vulkan: Requested allocation for existing device memory ... using existing buffer!\n"; return 0; } else { @@ -279,7 +279,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { #ifdef DEBUG_RUNTIME debug(user_context) << " allocating buffer: "; - if(buf && buf->dim) { + if (buf && buf->dim) { debug(user_context) << "extents: " << buf->dim[0].extent << "x" << buf->dim[1].extent << "x" << buf->dim[2].extent << "x" << buf->dim[3].extent << " " diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 4d0844ba1658..2f94d170486a 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -348,8 +348,8 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk vkGetPhysicalDeviceProperties(*physical_device, &device_properties); uint32_t major_version = VK_API_VERSION_MAJOR(device_properties.apiVersion); uint32_t minor_version = VK_API_VERSION_MINOR(device_properties.apiVersion); - bool has_capability_v11 = (major_version >= 1) && (minor_version >= 1); // supports >= v1.1 - bool has_capability_v12 = (major_version >= 1) && (minor_version >= 2); // supports >= v1.2 + bool has_capability_v11 = (major_version >= 1) && (minor_version >= 1); // supports >= v1.1 + bool has_capability_v12 = (major_version >= 1) && (minor_version >= 2); // supports >= v1.2 debug(user_context) << " found device compute capability v" << major_version << "." << minor_version << " ...\n"; // Get the device features so that all supported features are enabled when device is created @@ -364,18 +364,19 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk debug(user_context) << " shader int16 support: " << (device_features.shaderInt16 ? 
"true" : "false") << "...\n"; // assemble the chain of features to query, but only add the ones that exist in the API version - VkPhysicalDeviceShaderFloat16Int8FeaturesKHR shader_f16_i8_ext = { // requires v1.2+ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR, - nullptr, VK_FALSE, VK_FALSE}; - VkPhysicalDevice8BitStorageFeaturesKHR storage_8bit_ext = { // requires v1.2+ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR, - &shader_f16_i8_ext, VK_FALSE, VK_FALSE, VK_FALSE}; + // note: requires v1.2+ + VkPhysicalDeviceShaderFloat16Int8FeaturesKHR shader_f16_i8_ext = {VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR, + nullptr, VK_FALSE, VK_FALSE}; - VkPhysicalDevice16BitStorageFeaturesKHR storage_16bit_ext = { // requires v1.1+ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR, - (has_capability_v12 ? &storage_8bit_ext : nullptr), - VK_FALSE, VK_FALSE, VK_FALSE, VK_FALSE}; + // note: requires v1.2+ + VkPhysicalDevice8BitStorageFeaturesKHR storage_8bit_ext = {VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR, + &shader_f16_i8_ext, VK_FALSE, VK_FALSE, VK_FALSE}; + + // note: requires v1.1+ + VkPhysicalDevice16BitStorageFeaturesKHR storage_16bit_ext = {VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR, + (has_capability_v12 ? &storage_8bit_ext : nullptr), + VK_FALSE, VK_FALSE, VK_FALSE, VK_FALSE}; VkPhysicalDeviceFeatures2KHR device_features_ext = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR, @@ -394,7 +395,7 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk vkGetPhysicalDeviceFeatures2KHR(*physical_device, &device_features_ext); debug(user_context) << " shader int8 support: " << (shader_f16_i8_ext.shaderInt8 ? "true" : "false") << "...\n"; debug(user_context) << " shader float16 support: " << (shader_f16_i8_ext.shaderFloat16 ? "true" : "false") << "...\n"; - if(has_capability_v12) { + if (has_capability_v12) { debug(user_context) << " storage buffer 8bit access support: " << (storage_8bit_ext.storageBuffer8BitAccess ? "true" : "false") << "...\n"; debug(user_context) << " storage buffer 16bit access support: " << (storage_16bit_ext.storageBuffer16BitAccess ? "true" : "false") << "...\n"; } diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index a8dc787df962..e192f0858556 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -495,18 +495,18 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements); vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); -// #if defined(HL_VK_DEBUG_MEM) + // #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" << "size=" << (uint32_t)block->size << ", " << "alignment=" << (uint32_t)memory_requirements.alignment << ", " << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " << "dedicated=" << (block->dedicated ? 
"true" : "false") << ")\n"; -// #endif + // #endif - if(usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { + if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { block->properties.alignment = instance->physical_device_limits.minStorageBufferOffsetAlignment; - } else if(usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { + } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { block->properties.alignment = instance->physical_device_limits.minUniformBufferOffsetAlignment; } else { block->properties.alignment = memory_requirements.alignment; diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 56fd92517693..2cafee8436ca 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -479,7 +479,7 @@ VkResult vk_update_descriptor_set(void *user_context, scalar_args_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); #ifdef DEBUG_RUNTIME - debug(user_context) << "[0] offset=0 (uniform)\n"; + debug(user_context) << "[0] offset=0 (uniform)\n"; #endif VkWriteDescriptorSet uniform_buffer_write_descriptor_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type @@ -523,7 +523,7 @@ VkResult vk_update_descriptor_set(void *user_context, VkDescriptorBufferInfo *device_buffer_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); #ifdef DEBUG_RUNTIME - debug(user_context) << "[" << (uint32_t)write_descriptor_set.size() << "] offset=" << (uint32_t)(range_offset) << " (storage)\n"; + debug(user_context) << "[" << (uint32_t)write_descriptor_set.size() << "] offset=" << (uint32_t)(range_offset) << " (storage)\n"; #endif VkWriteDescriptorSet storage_buffer_write_descriptor_set = { From ec62988945081cad867e83d63568d977d42185ec Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 3 Feb 2023 09:11:49 -0800 Subject: [PATCH 145/166] Fix logic for locating entry point shader binding ... assume exact match for entry point name Cleanup entry point binding variables and clarify usage --- src/runtime/vulkan.cpp | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index e963fb1dd32d..24c10b1797e1 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -1095,8 +1095,8 @@ WEAK int halide_vulkan_run(void *user_context, // 1a. Locate the correct entry point from the cache bool found_entry_point = false; uint32_t entry_point_index = 0; - for (uint32_t n = 0; n < cache_entry->shader_count; ++n) { - if (strstr(cache_entry->shader_bindings[n].entry_point_name, entry_name) != nullptr) { + for (uint32_t n = 0; (n < cache_entry->shader_count) && !found_entry_point; ++n) { + if (strcmp(cache_entry->shader_bindings[n].entry_point_name, entry_name) == 0) { entry_point_index = n; found_entry_point = true; } @@ -1135,8 +1135,8 @@ WEAK int halide_vulkan_run(void *user_context, } } - VulkanShaderBinding *shader_bindings = (cache_entry->shader_bindings + entry_point_index); - halide_abort_if_false(user_context, shader_bindings != nullptr); + VulkanShaderBinding *entry_point_binding = (cache_entry->shader_bindings + entry_point_index); + halide_abort_if_false(user_context, entry_point_binding != nullptr); VulkanDispatchData dispatch_data = {}; dispatch_data.shared_mem_bytes = shared_mem_bytes; @@ -1148,30 +1148,30 @@ WEAK int halide_vulkan_run(void *user_context, dispatch_data.local_size[2] = threadsZ; // 2c. 
Setup the compute pipeline (eg override any specializations for shared mem or workgroup size) - VkResult result = vk_setup_compute_pipeline(user_context, ctx.allocator, shader_bindings, &dispatch_data, cache_entry->shader_module, cache_entry->pipeline_layout, &(shader_bindings->compute_pipeline)); + VkResult result = vk_setup_compute_pipeline(user_context, ctx.allocator, entry_point_binding, &dispatch_data, cache_entry->shader_module, cache_entry->pipeline_layout, &(entry_point_binding->compute_pipeline)); if (result != VK_SUCCESS) { error(user_context) << "vk_setup_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return halide_error_code_internal_error; } // 2d. Create a descriptor set - if (shader_bindings->descriptor_set == 0) { + if (entry_point_binding->descriptor_set == 0) { // Construct a descriptor pool // // NOTE: while this could be re-used across multiple pipelines, we only know the storage requirements of this kernel's // inputs and outputs ... so create a pool specific to the number of buffers known at this time - uint32_t uniform_buffer_count = shader_bindings->uniform_buffer_count; - uint32_t storage_buffer_count = shader_bindings->storage_buffer_count; - VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(shader_bindings->descriptor_pool)); + uint32_t uniform_buffer_count = entry_point_binding->uniform_buffer_count; + uint32_t storage_buffer_count = entry_point_binding->storage_buffer_count; + VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(entry_point_binding->descriptor_pool)); if (result != VK_SUCCESS) { error(user_context) << "vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } // Create the descriptor set - result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layouts[entry_point_index], shader_bindings->descriptor_pool, &(shader_bindings->descriptor_set)); + result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layouts[entry_point_index], entry_point_binding->descriptor_pool, &(entry_point_binding->descriptor_set)); if (result != VK_SUCCESS) { error(user_context) << "vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; @@ -1179,11 +1179,11 @@ WEAK int halide_vulkan_run(void *user_context, } // 3a. Create a buffer for the scalar parameters - if ((shader_bindings->args_region == nullptr) && shader_bindings->uniform_buffer_count) { + if ((entry_point_binding->args_region == nullptr) && entry_point_binding->uniform_buffer_count) { size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, arg_sizes, args, arg_is_buffer); if (scalar_buffer_size > 0) { - shader_bindings->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); - if (shader_bindings->args_region == nullptr) { + entry_point_binding->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); + if (entry_point_binding->args_region == nullptr) { error(user_context) << "vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; return halide_error_code_internal_error; } @@ -1192,14 +1192,14 @@ WEAK int halide_vulkan_run(void *user_context, // 3b. 
Update uniform buffer with scalar parameters VkBuffer *args_buffer = nullptr; - if ((shader_bindings->args_region != nullptr) && shader_bindings->uniform_buffer_count) { - VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, shader_bindings->args_region, arg_sizes, args, arg_is_buffer); + if ((entry_point_binding->args_region != nullptr) && entry_point_binding->uniform_buffer_count) { + VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, entry_point_binding->args_region, arg_sizes, args, arg_is_buffer); if (result != VK_SUCCESS) { debug(user_context) << "vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; } - args_buffer = reinterpret_cast(shader_bindings->args_region->handle); + args_buffer = reinterpret_cast(entry_point_binding->args_region->handle); if (args_buffer == nullptr) { error(user_context) << "Vulkan: Failed to retrieve scalar args buffer for device memory!\n"; return halide_error_code_internal_error; @@ -1207,7 +1207,7 @@ WEAK int halide_vulkan_run(void *user_context, } // 3c. Update buffer bindings for descriptor set - result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, shader_bindings->uniform_buffer_count, shader_bindings->storage_buffer_count, arg_sizes, args, arg_is_buffer, shader_bindings->descriptor_set); + result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_binding->descriptor_set); if (result != VK_SUCCESS) { debug(user_context) << "vk_update_descriptor_set() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; return result; @@ -1224,9 +1224,9 @@ WEAK int halide_vulkan_run(void *user_context, // 5. Fill the command buffer result = vk_fill_command_buffer_with_dispatch_call(user_context, ctx.device, command_buffer, - shader_bindings->compute_pipeline, + entry_point_binding->compute_pipeline, cache_entry->pipeline_layout, - shader_bindings->descriptor_set, + entry_point_binding->descriptor_set, entry_point_index, blocksX, blocksY, blocksZ); if (result != VK_SUCCESS) { From d69e36cb7893e2619ea441e35b9219fa980d2289 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 3 Feb 2023 09:13:36 -0800 Subject: [PATCH 146/166] Remove accidentally uncommented debug statements --- src/runtime/vulkan_memory.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index e192f0858556..8ae7d5e33e6a 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -495,14 +495,14 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements); vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); - // #if defined(HL_VK_DEBUG_MEM) +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" << "size=" << (uint32_t)block->size << ", " << "alignment=" << (uint32_t)memory_requirements.alignment << ", " << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " << "dedicated=" << (block->dedicated ? 
"true" : "false") << ")\n"; - // #endif +#endif if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { block->properties.alignment = instance->physical_device_limits.minStorageBufferOffsetAlignment; @@ -956,4 +956,4 @@ WEAK const VkAllocationCallbacks *halide_vulkan_get_allocation_callbacks(void *u } // extern "C" -#endif // HALIDE_RUNTIME_VULKAN_MEMORY_H \ No newline at end of file +#endif // HALIDE_RUNTIME_VULKAN_MEMORY_H From 14ef177b40747eb56519a99a973829075d33ef1a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 3 Feb 2023 15:30:01 -0800 Subject: [PATCH 147/166] Cleanup debug output for buffer related updates --- src/runtime/vulkan.cpp | 21 ++++++++++++++++++--- src/runtime/vulkan_resources.h | 27 ++++++++++++++++++--------- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 24c10b1797e1..6d4980393f71 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -313,8 +313,9 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { buf->device_interface->impl->use_module(); debug(user_context) - << " allocated device buffer " << (void *)buf->device - << " for buffer " << buf << "\n"; + << " allocated device region=" << (void *)device_region << "\n" + << " containing device buffer=" << (void *)device_region->handle << "\n" + << " for halide buffer " << buf << "\n"; // retrieve the buffer from the region VkBuffer *device_buffer = reinterpret_cast(device_region->handle); @@ -346,7 +347,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { return result; } - // fill buffer with zero values + // fill buffer with zero values up to the size of the buffer vkCmdFillBuffer(command_buffer, *device_buffer, 0, VK_WHOLE_SIZE, 0); debug(user_context) << " zeroing device_buffer=" << (void *)device_buffer << " size=" << (uint32_t)device_region->size << "\n"; @@ -467,6 +468,13 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid return halide_error_code_internal_error; } +#ifdef DEBUG_RUNTIME + debug(user_context) + << " copying into device region=" << (void *)device_region << "\n" + << " containing device buffer=" << (void *)device_buffer << "\n" + << " from halide buffer=" << halide_buffer << "\n"; +#endif + // create a command buffer VkCommandBuffer command_buffer; VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); @@ -614,6 +622,13 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ return halide_error_code_internal_error; } +#ifdef DEBUG_RUNTIME + debug(user_context) + << " copying from device region=" << (void *)device_region << "\n" + << " containing device buffer=" << (void *)device_buffer << "\n" + << " into halide buffer=" << halide_buffer << "\n"; +#endif + // create a command buffer VkCommandBuffer command_buffer; VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 2cafee8436ca..4ab345411924 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -479,7 +479,10 @@ VkResult vk_update_descriptor_set(void *user_context, scalar_args_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); #ifdef DEBUG_RUNTIME - debug(user_context) << "[0] offset=0 (uniform)\n"; + debug(user_context) << " [" << (uint32_t)write_descriptor_set.size() << "] UNIFORM_BUFFER : " + << 
"buffer=" << (void *)scalar_args_buffer << " " + << "offset=" << (uint32_t)(0) << " " + << "size=VK_WHOLE_SIZE\n"; #endif VkWriteDescriptorSet uniform_buffer_write_descriptor_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // struct type @@ -523,7 +526,11 @@ VkResult vk_update_descriptor_set(void *user_context, VkDescriptorBufferInfo *device_buffer_entry = (VkDescriptorBufferInfo *)descriptor_buffer_info.back(); #ifdef DEBUG_RUNTIME - debug(user_context) << "[" << (uint32_t)write_descriptor_set.size() << "] offset=" << (uint32_t)(range_offset) << " (storage)\n"; + debug(user_context) << " [" << (uint32_t)write_descriptor_set.size() << "] STORAGE_BUFFER : " + << "region=" << (void *)device_region << " " + << "buffer=" << (void *)device_buffer << " " + << "offset=" << (uint32_t)(range_offset) << " " + << "size=" << (uint32_t)(range_size) << "\n"; #endif VkWriteDescriptorSet storage_buffer_write_descriptor_set = { @@ -1072,13 +1079,14 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA shader_bindings[n].dispatch_data.local_size_binding.constant_id[dim] = module_ptr[idx++]; } - debug(user_context) << " [" << n << "] " - << "entry_point_name='" << (const char *)entry_point_name << "'\n"; +#ifdef DEBUG_RUNTIME + + debug(user_context) << " [" << n << "] '" << (const char *)entry_point_name << "'\n"; - debug(user_context) << " uniform_buffer_count=" << uniform_buffer_count << "\n" - << " storage_buffer_count=" << storage_buffer_count << "\n"; + debug(user_context) << " uniform_buffer_count=" << uniform_buffer_count << "\n" + << " storage_buffer_count=" << storage_buffer_count << "\n"; - debug(user_context) << " specialization_constants_count=" << specialization_constants_count << "\n"; + debug(user_context) << " specialization_constants_count=" << specialization_constants_count << "\n"; for (uint32_t sc = 0; sc < specialization_constants_count; sc++) { debug(user_context) << " [" << sc << "] " << "constant_name='" << (const char *)specialization_constants[sc].constant_name << "' " @@ -1086,7 +1094,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA << "type_size=" << specialization_constants[sc].type_size << "\n"; } - debug(user_context) << " shared_memory_allocations_count=" << shared_memory_allocations_count << "\n"; + debug(user_context) << " shared_memory_allocations_count=" << shared_memory_allocations_count << "\n"; for (uint32_t sm = 0; sm < shared_memory_allocations_count; sm++) { debug(user_context) << " [" << sm << "] " << "variable_name='" << (const char *)shared_memory_allocations[sm].variable_name << "' " @@ -1094,11 +1102,12 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA << "type_size=" << shared_memory_allocations[sm].type_size << " " << "array_size=" << shared_memory_allocations[sm].array_size << "\n"; } - debug(user_context) << " local_size_binding=["; + debug(user_context) << " local_size_binding=["; for (uint32_t dim = 0; dim < 3 && (idx < module_entries); dim++) { debug(user_context) << shader_bindings[n].dispatch_data.local_size_binding.constant_id[dim] << " "; } debug(user_context) << "]\n"; +#endif shader_bindings[n].entry_point_name = entry_point_name; // NOTE: module owns string data shader_bindings[n].uniform_buffer_count = uniform_buffer_count; shader_bindings[n].storage_buffer_count = storage_buffer_count; From a452b9b3e0cabf4bdd21f6c10757f4be05961d28 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 9 Feb 2023 11:25:46 -0800 Subject: [PATCH 148/166] Fix split 
and allocate methods in region allocator to fix issues with alignment constraints - discovered a hang if requested size couldn't be fulfilled after adjusting to aligned sizes - cause was incorrect splitting of existing regions Cleanup region allocator iteration, cleanup and shutdown Added maximum_pool_size configuration option to Vulkan Memory Allocator to restrict pool sizes --- README_vulkan.md | 14 +- src/runtime/internal/block_allocator.h | 150 +++++++++++++---- src/runtime/internal/region_allocator.h | 212 ++++++++++++++++-------- src/runtime/vulkan_memory.h | 20 ++- 4 files changed, 284 insertions(+), 112 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index a70d2b3b40ad..9576982c69e4 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -181,9 +181,17 @@ to the first compute device it can find. `HL_VK_ALLOC_CONFIG=...` will tell Halide to configure the Vulkan memory allocator use the given constraints specified as three integer values separated by the appropriate environment variable list delimiter -(e.g. `N:N:N` on Linux/OSX/Posix, or `N;N;N` on Windows). These values -correspond to `minimum_block_size`, `maximum_block_size` and -`maximum_block_count`. +(e.g. `N:N:N:N` on Linux/OSX/Posix, or `N;N;N;N` on Windows). These values +correspond to `maximum_pool_size`, `minimum_block_size`, `maximum_block_size` +and `maximum_block_count`. + +The `maximum_pool_size` constraint will tell Halide to configure the +Vulkan memory allocator to never request more than N megabytes for the +entire pool of allocations for the context. This includes all resource +blocks used for suballocations. Setting this to a non-zero value will +limit the amount device memory used by Halide, which may be useful when +other applications and frameworks are competing for resources. +Default is 0 ... meaning no limit. The `minimum_block_size` constraint will tell Halide to configure the Vulkan memory allocator to always request a minimum of N megabytes for diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index ff81adcc4de2..29823eac6c44 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -40,6 +40,7 @@ class BlockAllocator { // Runtime configuration parameters to adjust the behaviour of the block allocator struct Config { size_t initial_capacity = 0; + size_t maximum_pool_size = 0; size_t minimum_block_size = 0; size_t maximum_block_size = 0; size_t maximum_block_count = 0; @@ -63,6 +64,7 @@ class BlockAllocator { const Config ¤t_config() const; const Config &default_config() const; size_t block_count() const; + size_t pool_size() const; private: // Linked-list for storing the block resources @@ -107,6 +109,9 @@ class BlockAllocator { // Returns true if the given block is compatible with the given properties bool is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const; + // Returns true if the given block is suitable for the request allocation + bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const; + Config config; LinkedList block_list; MemoryAllocators allocators; @@ -167,13 +172,11 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r MemoryRegion *result = reserve_memory_region(user_context, block->allocator, request); if (result == nullptr) { - // Unable to reserve region in an existing block ... create a new block and try again. 
- size_t actual_size = constrain_requested_size(request.size); - block_entry = create_block_entry(user_context, request.properties, actual_size, request.dedicated); + block_entry = create_block_entry(user_context, request.properties, request.size, request.dedicated); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" - << (int32_t)(actual_size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)!\n"; return nullptr; } @@ -284,51 +287,113 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl return result; } -BlockAllocator::BlockEntry * -BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { - BlockEntry *block_entry = nullptr; - for (block_entry = block_list.front(); block_entry != nullptr; block_entry = block_entry->next_ptr) { - - const BlockResource *block = static_cast(block_entry->value); - if (!is_compatible_block(block, properties)) { - continue; - } +bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { + if (!is_compatible_block(block, properties)) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "BlockAllocator: skipping block ... incompatible properties!\n" + << " block_resource=" << (void *)block << "\n" + << " block_size=" << (uint32_t)block->memory.size << "\n" + << " block_reserved=" << (uint32_t)block->reserved << "\n" + << " block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << "\n" + << " block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << "\n" + << " block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << "\n"; + StackBasicPrinter<256>(nullptr) << " request_size=" << (uint32_t)size << "\n" + << " request_usage=" << halide_memory_usage_name(properties.usage) << "\n" + << " request_caching=" << halide_memory_caching_name(properties.caching) << "\n" + << " request_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; +#endif + // skip blocks that are using incompatible memory + return false; + } + if (dedicated && (block->reserved > 0)) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "BlockAllocator: skipping block ... can be used for dedicated allocation!\n" + << " block_resource=" << (void *)block << "\n" + << " block_size=" << (uint32_t)block->memory.size << "\n" + << " block_reserved=" << (uint32_t)block->reserved << "\n"; +#endif // skip blocks that can't be dedicated to a single allocation - if (dedicated && (block->reserved > 0)) { - continue; - } + return false; + } else if (block->memory.dedicated && (block->reserved > 0)) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "BlockAllocator: skipping block ... 
already dedicated to an allocation!\n" + << " block_resource=" << (void *)block << "\n" + << " block_size=" << (uint32_t)block->memory.size << "\n" + << " block_reserved=" << (uint32_t)block->reserved << "\n"; +#endif // skip dedicated blocks that are already allocated - if (block->memory.dedicated && (block->reserved > 0)) { - continue; - } + return false; + } + + size_t available = (block->memory.size - block->reserved); + if (available >= size) { + return true; + } + + return false; +} - size_t available = (block->memory.size - block->reserved); - if (available >= size) { +BlockAllocator::BlockEntry * +BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { + BlockEntry *block_entry = block_list.back(); + while (block_entry != nullptr) { + BlockEntry *prev_entry = block_entry->prev_ptr; + const BlockResource *block = static_cast(block_entry->value); + if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { #ifdef DEBUG_INTERNAL - StackBasicPrinter<256>(nullptr) << "BlockAllocator: find_block_entry (FOUND) (" - << "user_context=" << (void *)(user_context) << " " - << "block_entry=" << (void *)(block_entry) << " " - << "size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; + StackBasicPrinter<256>(nullptr) << "BlockAllocator: found suitable block ...\n" + << " user_context=" << (void *)(user_context) << "\n" + << " block_resource=" << (void *)block << "\n" + << " block_size=" << (uint32_t)block->memory.size << "\n" + << " block_reserved=" << (uint32_t)block->reserved << "\n"; + StackBasicPrinter<256>(nullptr) << " request_size=" << (uint32_t)size << "\n" + << " dedicated=" << (dedicated ? "true" : "false") << "\n" + << " usage=" << halide_memory_usage_name(properties.usage) << "\n" + << " caching=" << halide_memory_caching_name(properties.caching) << "\n" + << " visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; #endif - break; + return block_entry; } + block_entry = prev_entry; } + if (block_entry == nullptr) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "BlockAllocator: couldn't find suitable block!\n" + << " user_context=" << (void *)(user_context) << "\n" + << " request_size=" << (uint32_t)size << "\n" + << " dedicated=" << (dedicated ? "true" : "false") << "\n" + << " usage=" << halide_memory_usage_name(properties.usage) << "\n" + << " caching=" << halide_memory_caching_name(properties.caching) << "\n" + << " visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; +#endif + } return block_entry; } BlockAllocator::BlockEntry * BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "BlockAllocator: reserving block ... !\n" + << " requested_size=" << (uint32_t)size << "\n" + << " requested_is_dedicated=" << (dedicated ? 
"true" : "false") << "\n" + << " requested_usage=" << halide_memory_usage_name(properties.usage) << "\n" + << " requested_caching=" << halide_memory_caching_name(properties.caching) << "\n" + << " requested_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; +#endif BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); if (block_entry == nullptr) { - size_t actual_size = constrain_requested_size(size); - block_entry = create_block_entry(user_context, properties, actual_size, dedicated); +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "BlockAllocator: creating block ... !\n" + << " requested_size=" << (uint32_t)size << "\n" + << " requested_is_dedicated=" << (dedicated ? "true" : "false") << "\n" + << " requested_usage=" << halide_memory_usage_name(properties.usage) << "\n" + << " requested_caching=" << halide_memory_caching_name(properties.caching) << "\n" + << " requested_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; +#endif + block_entry = create_block_entry(user_context, properties, size, dedicated); } if (block_entry) { @@ -373,6 +438,13 @@ void BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocato BlockAllocator::BlockEntry * BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { + if (config.maximum_pool_size && (pool_size() >= config.maximum_pool_size)) { + error(user_context) << "BlockAllocator: No free blocks found! Maximum pool size reached (" + << (int32_t)(config.maximum_pool_size) << " bytes or " + << (int32_t)(config.maximum_pool_size / (1024 * 1024)) << " MB)\n"; + return nullptr; + } + if (config.maximum_block_count && (block_count() >= config.maximum_block_count)) { error(user_context) << "BlockAllocator: No free blocks found! 
Maximum block count reached (" << (int32_t)(config.maximum_block_count) << ")!\n"; @@ -393,7 +465,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p #endif BlockResource *block = static_cast(block_entry->value); - block->memory.size = size; + block->memory.size = constrain_requested_size(size); block->memory.handle = nullptr; block->memory.properties = properties; block->memory.dedicated = dedicated; @@ -507,6 +579,18 @@ size_t BlockAllocator::block_count() const { return block_list.size(); } +size_t BlockAllocator::pool_size() const { + size_t total_size = 0; + BlockEntry const *block_entry = nullptr; + for (block_entry = block_list.front(); block_entry != nullptr; block_entry = block_entry->next_ptr) { + const BlockResource *block = static_cast(block_entry->value); + if (block != nullptr) { + total_size += block->memory.size; + } + } + return total_size; +} + // -- } // namespace Internal diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 8dae7b8b9c16..dfd01f266996 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -65,18 +65,18 @@ class RegionAllocator { BlockRegion *find_block_region(void *user_context, const MemoryRequest &request); // Returns true if block region is unused and available - bool is_available(BlockRegion *region); + bool is_available(const BlockRegion *region) const; // Returns true if neighbouring block regions to the given region can be coalesced into one - bool can_coalesce(BlockRegion *region); + bool can_coalesce(const BlockRegion *region) const; // Merges available neighbouring block regions into the given region BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); - // Returns true if the given region can be split to accomadate the given size - bool can_split(BlockRegion *region, size_t size); + // Returns true if the given region can be split to accomodate the given size + bool can_split(const BlockRegion *region, size_t size) const; - // Splits the given block region into a smaller region to accomadate the given size, followed by empty space for the remaining + // Splits the given block region into a smaller region to accomodate the given size, followed by empty space for the remaining BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); // Creates a new block region and adds it to the region list @@ -94,9 +94,18 @@ class RegionAllocator { // Invokes the deallocation callback to free memory for the block region void free_block_region(void *user_context, BlockRegion *region); + // Returns true if the given block region is the last region in the list + bool is_last_block_region(void *user_context, const BlockRegion *region) const; + // Returns true if the given block region is compatible with the given properties bool is_compatible_block_region(const BlockRegion *region, const MemoryProperties &properties) const; + // Returns true if the given block region is suitable for the requested allocation + bool is_block_region_suitable_for_request(void *user_context, const BlockRegion *region, const MemoryRequest &request) const; + + // Returns the number of active regions for the block; + size_t region_count(void *user_context) const; + BlockResource *block = nullptr; MemoryArena *arena = nullptr; MemoryAllocators allocators; @@ -139,8 +148,10 @@ void RegionAllocator::initialize(void *user_context, BlockResource *mb, const Me MemoryRegion 
*RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { halide_abort_if_false(user_context, request.size > 0); + size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); + size_t actual_size = (actual_alignment > request.size) ? (actual_alignment) : request.size; size_t remaining = block->memory.size - block->reserved; - if (remaining < request.size) { + if (remaining < actual_size) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Unable to reserve more memory from block " << "-- requested size (" << (int32_t)(request.size) << " bytes) " @@ -209,55 +220,94 @@ RegionAllocator *RegionAllocator::find_allocator(void *user_context, MemoryRegio return block_region->block_ptr->allocator; } -BlockRegion *RegionAllocator::find_block_region(void *user_context, const MemoryRequest &request) { - BlockRegion *result = nullptr; - for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { +bool RegionAllocator::is_last_block_region(void *user_context, const BlockRegion *region) const { + return ((region == nullptr) || (region == region->next_ptr) || (region->next_ptr == nullptr)); +} - if (!is_available(block_region)) { - continue; - } +bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, const BlockRegion *region, const MemoryRequest &request) const { + if (!is_available(region)) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "RegionAllocator: skipping block region ... not available! " + << " block_region=" << (void *)region << "\n"; +#endif + return false; + } - // skip incompatible block regions for this request - if (!is_compatible_block_region(block_region, request.properties)) { - continue; - } + // skip incompatible block regions for this request + if (!is_compatible_block_region(region, request.properties)) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "RegionAllocator: skipping block region ... incompatible properties! " + << " block_region=" << (void *)region << "\n"; +#endif + return false; + } - // is the requested size larger than the current region? - if (request.size > block_region->memory.size) { - continue; - } + size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); + size_t actual_size = aligned_size(region->memory.offset, request.size, actual_alignment); - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = aligned_size(block_region->memory.offset, request.size, actual_alignment); + // is the adjusted size larger than the current region? + if (actual_size > region->memory.size) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "RegionAllocator: skipping block region ... not enough space for adjusted size! " + << " block_region=" << (void *)region << "\n"; +#endif + return false; + } + // will the adjusted size fit within the remaining unallocated space? 
+ if ((actual_size + block->reserved) <= block->memory.size) { #ifdef DEBUG_INTERNAL - StackBasicPrinter<256>(nullptr) << "RegionAllocator: conform_alignment (" - << " request=" << (uint32_t)request.alignment - << " required=" << (uint32_t)block->memory.properties.alignment << ") =>" - << " actual_alignment=" << (uint32_t)actual_alignment << "\n"; - - StackBasicPrinter<256>(nullptr) << "RegionAllocator: aligned_size (" - << " offset=" << (uint32_t)block_region->memory.offset - << " request_size=" << (uint32_t)request.size << " " - << " actual_alignment=" << (uint32_t)actual_alignment << ") =>" - << " actual_size=" << (uint32_t)actual_size << "\n"; + StackBasicPrinter<256>(nullptr) << "RegionAllocator: found suitable block region! " + << " block_region=" << (void *)region << "\n"; #endif + return true; // you betcha + } - // is the adjusted size larger than the current region? - if (actual_size > block_region->memory.size) { - continue; + return false; +} + +BlockRegion *RegionAllocator::find_block_region(void *user_context, const MemoryRequest &request) { + BlockRegion *block_region = block->regions; + while (block_region != nullptr) { + if (is_block_region_suitable_for_request(user_context, block_region, request)) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "RegionAllocator: found suitable region ...\n" + << " user_context=" << (void *)(user_context) << "\n" + << " block_resource=" << (void *)block << "\n" + << " block_size=" << (uint32_t)block->memory.size << "\n" + << " block_reserved=" << (uint32_t)block->reserved << "\n"; + StackBasicPrinter<256>(nullptr) << " requested_size=" << (uint32_t)request.size << "\n" + << " requested_is_dedicated=" << (request.dedicated ? "true" : "false") << "\n" + << " requested_usage=" << halide_memory_usage_name(request.properties.usage) << "\n" + << " requested_caching=" << halide_memory_caching_name(request.properties.caching) << "\n" + << " requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << "\n"; +#endif + return block_region; } - // will the adjusted size fit within the remaining unallocated space? - if ((actual_size + block->reserved) <= block->memory.size) { - result = block_region; // best-fit! + if (is_last_block_region(user_context, block_region)) { + block_region = nullptr; // end of list ... nothing found break; } + block_region = block_region->next_ptr; } - return result; + + if (block_region == nullptr) { +#ifdef DEBUG_INTERNAL + StackBasicPrinter<256>(nullptr) << "RegionAllocator: couldn't find suitable region!\n" + << " user_context=" << (void *)(user_context) << "\n" + << " requested_size=" << (uint32_t)request.size << "\n" + << " requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << "\n" + << " requested_usage=" << halide_memory_usage_name(request.properties.usage) << "\n" + << " requested_caching=" << halide_memory_caching_name(request.properties.caching) << "\n" + << " requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << "\n"; +#endif + } + + return block_region; } -bool RegionAllocator::is_available(BlockRegion *block_region) { +bool RegionAllocator::is_available(const BlockRegion *block_region) const { if (block_region == nullptr) { return false; } @@ -270,7 +320,7 @@ bool RegionAllocator::is_available(BlockRegion *block_region) { return true; } -bool RegionAllocator::can_coalesce(BlockRegion *block_region) { +bool RegionAllocator::can_coalesce(const BlockRegion *block_region) const { if (!is_available(block_region)) { return false; } @@ -300,8 +350,8 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe block_region->memory.handle = nullptr; } - if (is_available(block_region->prev_ptr)) { - BlockRegion *prev_region = block_region->prev_ptr; + BlockRegion *prev_region = block_region->prev_ptr; + if (is_available(prev_region) && (prev_region != block_region)) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Coalescing " @@ -318,8 +368,8 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe block_region = prev_region; } - if (is_available(block_region->next_ptr)) { - BlockRegion *next_region = block_region->next_ptr; + BlockRegion *next_region = block_region->next_ptr; + if (is_available(next_region) && (next_region != block_region)) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Coalescing " @@ -338,7 +388,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe return block_region; } -bool RegionAllocator::can_split(BlockRegion *block_region, size_t size) { +bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size) const { return (block_region && (block_region->memory.size > size) && (block_region->usage_count == 0)); } @@ -359,29 +409,30 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion block_region->memory.handle = nullptr; } - alignment = conform_alignment(alignment, block->memory.properties.alignment); + size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); + size_t actual_size = aligned_size(block_region->memory.offset, size, actual_alignment); + size_t actual_offset = aligned_offset(block_region->memory.offset + size, actual_alignment); + size_t empty_size = block_region->memory.size - actual_size; #ifdef DEBUG_INTERNAL - StackBasicPrinter<256>(nullptr) << "RegionAllocator: Conforming alignment (" - << "requested=" << (uint32_t)alignment << " " - << "required=" << (uint32_t)block->memory.properties.alignment << " " - << "actual=" << (uint32_t)alignment << ")\n"; + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Conforming size and alignment \n" + << " requested_size=" << (uint32_t)size << "\n" + << " actual_size=" << (uint32_t)actual_size << "\n" + << " requested_alignment=" << (uint32_t)alignment << " " + << " required_alignment=" << (uint32_t)block->memory.properties.alignment << " " + << " actual_alignment=" << (uint32_t)actual_alignment << ")\n"; #endif - size_t adjusted_size = aligned_size(block_region->memory.offset, size, alignment); - size_t adjusted_offset = aligned_offset(block_region->memory.offset + size, alignment); - size_t empty_size = 
block_region->memory.size - adjusted_size; - #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)adjusted_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; + << "to create empty region (offset=" << (int32_t)actual_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; #endif BlockRegion *next_region = block_region->next_ptr; BlockRegion *empty_region = create_block_region(user_context, block_region->memory.properties, - adjusted_offset, empty_size, + actual_offset, empty_size, block_region->memory.dedicated); halide_abort_if_false(user_context, empty_region != nullptr); @@ -389,8 +440,9 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion if (next_region) { next_region->prev_ptr = empty_region; } + empty_region->prev_ptr = block_region; block_region->next_ptr = empty_region; - block_region->memory.size = size; + block_region->memory.size -= empty_size; return empty_region; } @@ -408,7 +460,6 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo #endif BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); - if (block_region == nullptr) { error(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; return nullptr; @@ -478,13 +529,16 @@ void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *bloc #endif block_region->usage_count = 0; + release_block_region(user_context, block_region); free_block_region(user_context, block_region); arena->reclaim(user_context, block_region); } void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL - StackBasicPrinter<256>(nullptr) << "RegionAllocator: Allocating region (user_context=" << (void *)(user_context) << " size=" << (int32_t)(block_region->memory.size) << " offset=" << (int32_t)block_region->memory.offset << ")!\n"; + StackBasicPrinter<256>(nullptr) << "RegionAllocator: Allocating region (user_context=" << (void *)(user_context) + << " size=" << (int32_t)(block_region->memory.size) + << " offset=" << (int32_t)block_region->memory.offset << ")!\n"; #endif halide_abort_if_false(user_context, allocators.region.allocate != nullptr); halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); @@ -552,8 +606,14 @@ void RegionAllocator::release(void *user_context) { StackBasicPrinter<256>(nullptr) << "RegionAllocator: Releasing all regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; #endif - for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { + + BlockRegion *block_region = block->regions; + while (block_region != nullptr) { release_block_region(user_context, block_region); + if (is_last_block_region(user_context, block_region)) { + break; + } + block_region = block_region->next_ptr; } } @@ -570,10 +630,10 @@ bool RegionAllocator::collect(void *user_context) { << ")\n"; #endif - bool result = false; - for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { + bool has_collected = false; + BlockRegion *block_region = block->regions; + while (block_region != nullptr) { if (can_coalesce(block_region)) { - #ifdef DEBUG_INTERNAL count++; StackBasicPrinter<256>(nullptr) << " collecting 
region (" @@ -584,12 +644,15 @@ bool RegionAllocator::collect(void *user_context) { << ")\n"; #endif block_region = coalesce_block_regions(user_context, block_region); - result = true; + has_collected = true; + } + if (is_last_block_region(user_context, block_region)) { + break; } + block_region = block_region->next_ptr; } - if (result) { - + if (has_collected) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << " collected unused regions (" << "block_ptr=" << (void *)block << " " @@ -598,7 +661,7 @@ bool RegionAllocator::collect(void *user_context) { << ")\n"; #endif } - return result; + return has_collected; } void RegionAllocator::destroy(void *user_context) { @@ -608,7 +671,7 @@ void RegionAllocator::destroy(void *user_context) { #endif for (BlockRegion *block_region = block->regions; block_region != nullptr;) { - if (block_region->next_ptr == nullptr) { + if (is_last_block_region(user_context, block_region)) { destroy_block_region(user_context, block_region); block_region = nullptr; } else { @@ -646,6 +709,17 @@ bool RegionAllocator::is_compatible_block_region(const BlockRegion *block_region return true; } +size_t RegionAllocator::region_count(void *user_context) const { + if (block == nullptr) { + return 0; + } + size_t count = 0; + for (BlockRegion const *region = block->regions; !is_last_block_region(user_context, region); region = region->next_ptr) { + ++count; + } + return count; +} + BlockResource *RegionAllocator::block_resource() const { return block; } diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 8ae7d5e33e6a..a09be01291ec 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -22,9 +22,10 @@ WEAK const VkAllocationCallbacks *custom_allocation_callbacks = nullptr; // nul // Runtime configuration parameters to adjust the behaviour of the block allocator struct VulkanMemoryConfig { - size_t minimum_block_size = 32 * 1024 * 1024; // 32MB - size_t maximum_block_size = 0; //< zero means no constraint - size_t maximum_block_count = 0; //< zero means no constraint + size_t maximum_pool_size = 0; //< Maximum number of bytes to allocate for the entire pool (including all blocks). Specified in bytes. Zero means no constraint + size_t minimum_block_size = 32 * 1024 * 1024; //< Default block size is 32MB + size_t maximum_block_size = 0; //< Specified in bytes. Zero means no constraint + size_t maximum_block_count = 0; //< Maximum number of blocks to allocate. 
Zero means no constraint }; WEAK VulkanMemoryConfig memory_allocator_config; @@ -167,6 +168,7 @@ void VulkanMemoryAllocator::initialize(void *user_context, allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block}; allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region}; BlockAllocator::Config block_allocator_config = {0}; + block_allocator_config.maximum_pool_size = cfg.maximum_pool_size; block_allocator_config.maximum_block_count = cfg.maximum_block_count; block_allocator_config.maximum_block_size = cfg.maximum_block_size; block_allocator_config.minimum_block_size = cfg.minimum_block_size; @@ -900,15 +902,19 @@ VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, StringTable alloc_config_values; alloc_config_values.parse(user_context, alloc_config, HL_VK_ENV_DELIM); if (alloc_config_values.size() > 0) { - config.minimum_block_size = atoi(alloc_config_values[0]) * 1024 * 1024; - print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.minimum_block_size << " for minimum block size (in bytes)\n"; + config.maximum_pool_size = atoi(alloc_config_values[0]) * 1024 * 1024; + print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_pool_size << " for maximum pool size (in bytes)\n"; } if (alloc_config_values.size() > 1) { - config.maximum_block_size = atoi(alloc_config_values[1]) * 1024 * 1024; + config.minimum_block_size = atoi(alloc_config_values[1]) * 1024 * 1024; print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.minimum_block_size << " for minimum block size (in bytes)\n"; } if (alloc_config_values.size() > 2) { - config.maximum_block_count = atoi(alloc_config_values[2]); + config.maximum_block_size = atoi(alloc_config_values[2]) * 1024 * 1024; + print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_size << " for maximum block size (in bytes)\n"; + } + if (alloc_config_values.size() > 3) { + config.maximum_block_count = atoi(alloc_config_values[3]); print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_count << " for maximum block count\n"; } } From 33007c57ff73d76118c0d9048ae0eb2fc167026a Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 14 Feb 2023 14:04:43 -0700 Subject: [PATCH 149/166] Added notes about TARGET_VULKAN=ON being the default now Added links to LunarG MoltenVK SDK installer, and brew packages --- README_vulkan.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index 9576982c69e4..65172e4000dd 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -13,7 +13,7 @@ See [below](#current-status) for details. # Compiling Halide w/Vulkan Support -You'll need to configure Halide and enable the cmake option TARGET_VULKAN. +You'll need to configure Halide and enable the cmake option TARGET_VULKAN (which is now ON by default). For example, on Linux & OSX: @@ -94,9 +94,23 @@ graphics driver, which can cause problems if installed on an NVIDIA-only system. 
You're better off using Halide's Metal backend instead, but it is possible to run Vulkan apps on a Mac via the MoltenVK library: -MoltenVK: +MoltenVK Project: https://github.com/KhronosGroup/MoltenVK +The easiest way to get the necessary dependencies is to use the official MoltenVK SDK +installer provided by LunarG: + +MoltenVK SDK (Latest Release): +https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg + +Alternatively, if you have the [Homebrew][https://brew.sh/] package manager installed +for MacOS, you can use it to install the Vulkan Loader and MoltenVK compatibility +layer: + +``` +$ brew install vulkan-loader molten-vk +``` + # Testing Your Vulkan Environment You can validate that everything is configured correctly by running the `vulkaninfo` From a3a8216ade0b43a8aed1c0645c6a7b1ac6418a58 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Tue, 14 Feb 2023 14:20:52 -0700 Subject: [PATCH 150/166] Fix markdown formatting --- README_vulkan.md | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index 65172e4000dd..635ef802aaf6 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -51,15 +51,9 @@ https://sdk.lunarg.com/sdk/download/latest/windows/vulkan-sdk.exe For Vulkan device drivers, consult the appropriate hardware vendor for your device. A few common ones are listed below. -AMD: -https://www.amd.com/en/technologies/vulkan - -NVIDIA: -https://developer.nvidia.com/vulkan-driver - -INTEL: -https://www.intel.com/content/www/us/en/download-center/home.html - +- [AMD Vulkan Driver](https://www.amd.com/en/technologies/vulkan) +- [NVIDIA Vulkan Driver](https://developer.nvidia.com/vulkan-driver) +- [INTEL Vulkan Driver](https://www.intel.com/content/www/us/en/download-center/home.html) ## Linux @@ -94,16 +88,14 @@ graphics driver, which can cause problems if installed on an NVIDIA-only system. You're better off using Halide's Metal backend instead, but it is possible to run Vulkan apps on a Mac via the MoltenVK library: -MoltenVK Project: -https://github.com/KhronosGroup/MoltenVK +- [MoltenVK Project](https://github.com/KhronosGroup/MoltenVK) The easiest way to get the necessary dependencies is to use the official MoltenVK SDK installer provided by LunarG: -MoltenVK SDK (Latest Release): -https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg +- [MoltenVK SDK (Latest Release)](https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg) -Alternatively, if you have the [Homebrew][https://brew.sh/] package manager installed +Alternatively, if you have the [Homebrew](https://brew.sh/) package manager installed for MacOS, you can use it to install the Vulkan Loader and MoltenVK compatibility layer: From 1dac977b53921a38461df7166f334c0aecdbfb42 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 18 Feb 2023 08:55:41 -0700 Subject: [PATCH 151/166] Fix error code handling in Vulkan runtime and internal datastructures. Refactor all (well nearly all) return values to use halide error codes. Reduce the usage of abort_if() for recoverable errors. 
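To make the new convention concrete, the sketch below illustrates the before/after pattern this refactor applies to the runtime's internal allocator methods: functions that previously returned void and relied on halide_abort_if_false() now report recoverable failures as an int error code that the caller can propagate. This is a minimal standalone illustration only; the stand-in codes approximate the halide_error_code_success / halide_error_code_internal_error values the runtime actually returns, and Region / release_region are hypothetical names rather than types or functions from the patch.

```
#include <cstdio>

// Stand-in error codes for illustration; the runtime returns halide_error_code_t values.
enum { error_success = 0, error_internal_error = -1 };

struct Region {
    int usage_count;
};

// After the refactor: a null argument is a recoverable error reported to the
// caller instead of a hard abort inside the allocator.
int release_region(Region *region) {
    if (region == nullptr) {
        return error_internal_error;
    }
    if (region->usage_count > 0) {
        region->usage_count--;
    }
    return error_success;
}

int main() {
    Region region;
    region.usage_count = 1;
    int ok = release_region(&region);    // returns error_success (0)
    int bad = release_region(nullptr);   // returns error_internal_error, no abort
    std::printf("ok=%d bad=%d\n", ok, bad);
    return 0;
}
```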
--- src/runtime/internal/block_allocator.h | 78 +-- src/runtime/internal/memory_resources.h | 8 +- src/runtime/internal/region_allocator.h | 66 +-- src/runtime/vulkan.cpp | 522 ++++++++++++-------- src/runtime/vulkan_context.h | 43 +- src/runtime/vulkan_extensions.h | 1 + src/runtime/vulkan_internal.h | 158 +++--- src/runtime/vulkan_memory.h | 615 +++++++++++++++--------- src/runtime/vulkan_resources.h | 459 ++++++++++++------ test/runtime/block_allocator.cpp | 16 +- 10 files changed, 1204 insertions(+), 762 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 29823eac6c44..0f7b51d64e01 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -52,12 +52,12 @@ class BlockAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - void retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count - bool collect(void *user_context); //< returns true if any blocks were removed - void release(void *user_context); - void destroy(void *user_context); + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count + bool collect(void *user_context); //< returns true if any blocks were removed + int release(void *user_context); + int destroy(void *user_context); // Access methods const MemoryAllocators ¤t_allocators() const; @@ -80,7 +80,7 @@ class BlockAllocator { RegionAllocator *create_region_allocator(void *user_context, BlockResource *block); // Destroys the given region allocator and all associated memory regions - void destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); + int destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); // Reserves a block of memory for the requested size and returns the corresponding block entry, or nullptr on failure BlockEntry *reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); @@ -92,16 +92,16 @@ class BlockAllocator { BlockEntry *create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); // Releases the block entry from being used, and makes it available for further allocations - void release_block_entry(void *user_context, BlockEntry *block_entry); + int release_block_entry(void *user_context, BlockEntry *block_entry); // Destroys the block entry and removes it from the list - void destroy_block_entry(void *user_context, BlockEntry *block_entry); + int destroy_block_entry(void *user_context, BlockEntry *block_entry); // Invokes the allocation callback to allocate memory for the block region - void alloc_memory_block(void *user_context, BlockResource *block); + int alloc_memory_block(void *user_context, BlockResource *block); // Invokes the deallocation callback to free memory for the memory block - void free_memory_block(void *user_context, BlockResource *block); + int free_memory_block(void *user_context, BlockResource *block); // Returns a constrained size for the requested size based on 
config parameters size_t constrain_requested_size(size_t size) const; @@ -190,31 +190,37 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r return result; } -void BlockAllocator::release(void *user_context, MemoryRegion *memory_region) { - halide_abort_if_false(user_context, memory_region != nullptr); +int BlockAllocator::release(void *user_context, MemoryRegion *memory_region) { + if (memory_region == nullptr) { + return halide_error_code_internal_error; + } RegionAllocator *allocator = RegionAllocator::find_allocator(user_context, memory_region); if (allocator == nullptr) { - return; + return halide_error_code_internal_error; } - allocator->release(user_context, memory_region); + return allocator->release(user_context, memory_region); } -void BlockAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { - halide_abort_if_false(user_context, memory_region != nullptr); +int BlockAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { + if (memory_region == nullptr) { + return halide_error_code_internal_error; + } RegionAllocator *allocator = RegionAllocator::find_allocator(user_context, memory_region); if (allocator == nullptr) { - return; + return halide_error_code_internal_error; } - allocator->reclaim(user_context, memory_region); + return allocator->reclaim(user_context, memory_region); } -void BlockAllocator::retain(void *user_context, MemoryRegion *memory_region) { - halide_abort_if_false(user_context, memory_region != nullptr); +int BlockAllocator::retain(void *user_context, MemoryRegion *memory_region) { + if (memory_region == nullptr) { + return halide_error_code_internal_error; + } RegionAllocator *allocator = RegionAllocator::find_allocator(user_context, memory_region); if (allocator == nullptr) { - return; + return halide_error_code_internal_error; } - allocator->retain(user_context, memory_region); + return allocator->retain(user_context, memory_region); } bool BlockAllocator::collect(void *user_context) { @@ -252,16 +258,17 @@ bool BlockAllocator::collect(void *user_context) { return result; } -void BlockAllocator::release(void *user_context) { +int BlockAllocator::release(void *user_context) { BlockEntry *block_entry = block_list.back(); while (block_entry != nullptr) { BlockEntry *prev_entry = block_entry->prev_ptr; release_block_entry(user_context, block_entry); block_entry = prev_entry; } + return 0; } -void BlockAllocator::destroy(void *user_context) { +int BlockAllocator::destroy(void *user_context) { BlockEntry *block_entry = block_list.back(); while (block_entry != nullptr) { BlockEntry *prev_entry = block_entry->prev_ptr; @@ -269,6 +276,7 @@ void BlockAllocator::destroy(void *user_context) { block_entry = prev_entry; } block_list.destroy(user_context); + return 0; } MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAllocator *allocator, const MemoryRequest &request) { @@ -424,16 +432,16 @@ BlockAllocator::create_region_allocator(void *user_context, BlockResource *block return region_allocator; } -void BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator *region_allocator) { +int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator *region_allocator) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Destroying region allocator (" << "user_context=" << (void *)(user_context) << " " << "region_allocator=" << (void *)(region_allocator) << ")...\n"; #endif if (region_allocator == nullptr) { - return; + return 0; 
} - RegionAllocator::destroy(user_context, region_allocator); + return RegionAllocator::destroy(user_context, region_allocator); } BlockAllocator::BlockEntry * @@ -475,7 +483,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p return block_entry; } -void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { +int BlockAllocator::release_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Releasing block entry (" << "block_entry=" << (void *)(block_entry) << " " @@ -483,11 +491,12 @@ void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::Blo #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { - block->allocator->release(user_context); + return block->allocator->release(user_context); } + return 0; } -void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { +int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Destroying block entry (" << "block_entry=" << (void *)(block_entry) << " " @@ -501,9 +510,10 @@ void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Blo } free_memory_block(user_context, block); block_list.remove(user_context, block_entry); + return 0; } -void BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { +int BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; #endif @@ -511,9 +521,10 @@ void BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block MemoryBlock *memory_block = &(block->memory); allocators.block.allocate(user_context, memory_block); block->reserved = 0; + return 0; } -void BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { +int BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; #endif @@ -523,6 +534,7 @@ void BlockAllocator::free_memory_block(void *user_context, BlockResource *block) memory_block->handle = nullptr; block->reserved = 0; block->memory.size = 0; + return 0; } size_t BlockAllocator::constrain_requested_size(size_t size) const { diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index cba4792be1d1..e70562d7994e 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -184,16 +184,16 @@ struct HalideSystemAllocatorFns { DeallocateSystemFn deallocate = halide_free; }; -typedef void (*AllocateBlockFn)(void *, MemoryBlock *); -typedef void (*DeallocateBlockFn)(void *, MemoryBlock *); +typedef int (*AllocateBlockFn)(void *, MemoryBlock *); +typedef int (*DeallocateBlockFn)(void *, MemoryBlock *); struct MemoryBlockAllocatorFns { AllocateBlockFn allocate = nullptr; DeallocateBlockFn deallocate = nullptr; }; -typedef void (*AllocateRegionFn)(void *, MemoryRegion *); -typedef void (*DeallocateRegionFn)(void *, MemoryRegion *); +typedef int (*AllocateRegionFn)(void *, 
MemoryRegion *); +typedef int (*DeallocateRegionFn)(void *, MemoryRegion *); struct MemoryRegionAllocatorFns { AllocateRegionFn allocate = nullptr; diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index dfd01f266996..a0ad773f8f63 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -40,26 +40,26 @@ class RegionAllocator { // Factory methods for creation / destruction static RegionAllocator *create(void *user_context, BlockResource *block, const MemoryAllocators &ma); - static void destroy(void *user_context, RegionAllocator *region_allocator); + static int destroy(void *user_context, RegionAllocator *region_allocator); // Returns the allocator class instance for the given allocation (or nullptr) static RegionAllocator *find_allocator(void *user_context, MemoryRegion *memory_region); // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - void release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse - void reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate - void retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count - bool collect(void *user_context); //< returns true if any blocks were removed - void release(void *user_context); - void destroy(void *user_context); + int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count + bool collect(void *user_context); //< returns true if any blocks were removed + int release(void *user_context); + int destroy(void *user_context); // Returns the currently managed block resource BlockResource *block_resource() const; private: // Initializes a new instance - void initialize(void *user_context, BlockResource *block, const MemoryAllocators &ma); + int initialize(void *user_context, BlockResource *block, const MemoryAllocators &ma); // Search through allocated block regions (Best-Fit) BlockRegion *find_block_region(void *user_context, const MemoryRequest &request); @@ -83,16 +83,16 @@ class RegionAllocator { BlockRegion *create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated); // Creates a new block region and adds it to the region list - void destroy_block_region(void *user_context, BlockRegion *region); + int destroy_block_region(void *user_context, BlockRegion *region); // Invokes the allocation callback to allocate memory for the block region - void alloc_block_region(void *user_context, BlockRegion *region); + int alloc_block_region(void *user_context, BlockRegion *region); // Releases a block region and leaves it in the list for further allocations - void release_block_region(void *user_context, BlockRegion *region); + int release_block_region(void *user_context, BlockRegion *region); // Invokes the deallocation callback to free memory for the block region - void free_block_region(void *user_context, BlockRegion *region); + int free_block_region(void *user_context, BlockRegion *region); // Returns true if the given block region is the last region in the list bool is_last_block_region(void *user_context, const BlockRegion *region) const; @@ -117,7 +117,6 @@ 
RegionAllocator *RegionAllocator::create(void *user_context, BlockResource *bloc allocators.system.allocate(user_context, sizeof(RegionAllocator))); if (result == nullptr) { - halide_error(user_context, "RegionAllocator: Failed to create instance! Out of memory!\n"); return nullptr; } @@ -125,15 +124,16 @@ RegionAllocator *RegionAllocator::create(void *user_context, BlockResource *bloc return result; } -void RegionAllocator::destroy(void *user_context, RegionAllocator *instance) { +int RegionAllocator::destroy(void *user_context, RegionAllocator *instance) { halide_abort_if_false(user_context, instance != nullptr); const MemoryAllocators &allocators = instance->allocators; instance->destroy(user_context); halide_abort_if_false(user_context, allocators.system.deallocate != nullptr); allocators.system.deallocate(user_context, instance); + return 0; } -void RegionAllocator::initialize(void *user_context, BlockResource *mb, const MemoryAllocators &ma) { +int RegionAllocator::initialize(void *user_context, BlockResource *mb, const MemoryAllocators &ma) { block = mb; allocators = ma; arena = MemoryArena::create(user_context, {sizeof(BlockRegion), MemoryArena::default_capacity, 0}, allocators.system); @@ -144,6 +144,7 @@ void RegionAllocator::initialize(void *user_context, BlockResource *mb, const Me block->memory.properties, 0, block->memory.size, block->memory.dedicated); + return 0; } MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { @@ -182,17 +183,17 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & return reinterpret_cast(block_region); } -void RegionAllocator::release(void *user_context, MemoryRegion *memory_region) { +int RegionAllocator::release(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); halide_abort_if_false(user_context, block_region->block_ptr == block); if (block_region->usage_count > 0) { block_region->usage_count--; } - release_block_region(user_context, block_region); + return release_block_region(user_context, block_region); } -void RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { +int RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); halide_abort_if_false(user_context, block_region->block_ptr == block); @@ -204,13 +205,15 @@ void RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { if (can_coalesce(block_region)) { block_region = coalesce_block_regions(user_context, block_region); } + return 0; } -void RegionAllocator::retain(void *user_context, MemoryRegion *memory_region) { +int RegionAllocator::retain(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); halide_abort_if_false(user_context, block_region != nullptr); halide_abort_if_false(user_context, block_region->block_ptr == block); block_region->usage_count++; + return 0; } RegionAllocator *RegionAllocator::find_allocator(void *user_context, MemoryRegion *memory_region) { @@ -491,18 +494,18 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo return block_region; } -void RegionAllocator::release_block_region(void *user_context, BlockRegion *block_region) { +int RegionAllocator::release_block_region(void *user_context, BlockRegion 
*block_region) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Releasing block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; #endif if (block_region == nullptr) { - return; + return 0; } if (block_region->usage_count > 0) { - return; + return 0; } if (block_region->status != AllocationStatus::Available) { @@ -519,9 +522,10 @@ void RegionAllocator::release_block_region(void *user_context, BlockRegion *bloc block->reserved -= block_region->memory.size; } block_region->status = AllocationStatus::Available; + return 0; } -void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block_region) { +int RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Destroying block region (" << "user_context=" << (void *)(user_context) << " " @@ -532,9 +536,10 @@ void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *bloc release_block_region(user_context, block_region); free_block_region(user_context, block_region); arena->reclaim(user_context, block_region); + return 0; } -void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_region) { +int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Allocating region (user_context=" << (void *)(user_context) << " size=" << (int32_t)(block_region->memory.size) @@ -542,9 +547,10 @@ void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_ #endif halide_abort_if_false(user_context, allocators.region.allocate != nullptr); halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); + int error_code = 0; MemoryRegion *memory_region = &(block_region->memory); if (memory_region->handle == nullptr) { - allocators.region.allocate(user_context, memory_region); + error_code = allocators.region.allocate(user_context, memory_region); memory_region->is_owner = true; #ifdef DEBUG_INTERNAL @@ -571,9 +577,10 @@ void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_ } block_region->status = block_region->memory.dedicated ? 
AllocationStatus::Dedicated : AllocationStatus::InUse; block->reserved += block_region->memory.size; + return error_code; } -void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_region) { +int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_region) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Freeing block region (" << "user_context=" << (void *)(user_context) << " " @@ -599,9 +606,10 @@ void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_r } block_region->usage_count = 0; block_region->status = AllocationStatus::Available; + return 0; } -void RegionAllocator::release(void *user_context) { +int RegionAllocator::release(void *user_context) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Releasing all regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; @@ -615,6 +623,7 @@ void RegionAllocator::release(void *user_context) { } block_region = block_region->next_ptr; } + return 0; } bool RegionAllocator::collect(void *user_context) { @@ -664,7 +673,7 @@ bool RegionAllocator::collect(void *user_context) { return has_collected; } -void RegionAllocator::destroy(void *user_context) { +int RegionAllocator::destroy(void *user_context) { #ifdef DEBUG_INTERNAL StackBasicPrinter<256>(nullptr) << "RegionAllocator: Destroying all block regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; @@ -685,6 +694,7 @@ void RegionAllocator::destroy(void *user_context) { block->allocator = nullptr; MemoryArena::destroy(user_context, arena); arena = nullptr; + return 0; } bool RegionAllocator::is_compatible_block_region(const BlockRegion *block_region, const MemoryProperties &properties) const { diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 6d4980393f71..873ae0ac8c6a 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -49,18 +49,18 @@ WEAK int halide_vulkan_acquire_context(void *user_context, // If the context has not been initialized, initialize it now. 
if ((cached_instance == nullptr) && create) { - int result = vk_create_context(user_context, - reinterpret_cast(&cached_allocator), - &cached_instance, - &cached_device, - &cached_physical_device, - &cached_command_pool, - &cached_queue, - &cached_queue_family_index); - if (result != halide_error_code_success) { + int error_code = vk_create_context(user_context, + reinterpret_cast(&cached_allocator), + &cached_instance, + &cached_device, + &cached_physical_device, + &cached_command_pool, + &cached_queue, + &cached_queue_family_index); + if (error_code != halide_error_code_success) { debug(user_context) << "halide_vulkan_acquire_context: FAILED to create context!\n"; __atomic_clear(&thread_lock, __ATOMIC_RELEASE); - return result; + return error_code; } } @@ -71,12 +71,12 @@ WEAK int halide_vulkan_acquire_context(void *user_context, *command_pool = cached_command_pool; *queue = cached_queue; *queue_family_index = cached_queue_family_index; - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_release_context(void *user_context, VkInstance instance, VkDevice device, VkQueue queue) { __atomic_clear(&thread_lock, __ATOMIC_RELEASE); - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_buffer) { @@ -88,10 +88,14 @@ WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_b // should be allowed to call halide_vulkan_device_free on any halide_buffer_t // including ones that have never been used with a GPU. if (halide_buffer->device == 0) { - return 0; + return halide_error_code_success; } VulkanContext ctx(user_context); + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; + return ctx.error; + } #ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); @@ -116,7 +120,7 @@ WEAK int halide_vulkan_device_free(void *user_context, halide_buffer_t *halide_b debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_compute_capability(void *user_context, int *major, int *minor) { @@ -132,7 +136,8 @@ WEAK int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, << ", size: " << size << "\n"; VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; return ctx.error; } @@ -145,6 +150,7 @@ WEAK int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.device, cache_entry, Halide::Runtime::Internal::Vulkan::vk_compile_shader_module, user_context, ctx.allocator, src, size)) { + error(user_context) << "Vulkan: Failed to setup compilation cache!\n"; return halide_error_code_generic_error; } @@ -153,7 +159,7 @@ WEAK int halide_vulkan_initialize_kernels(void *user_context, void **state_ptr, debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; + return halide_error_code_success; } WEAK void halide_vulkan_finalize_kernels(void *user_context, void *state_ptr) { @@ -166,7 +172,7 @@ WEAK void halide_vulkan_finalize_kernels(void *user_context, void *state_ptr) { #endif VulkanContext ctx(user_context); - if (ctx.error == VK_SUCCESS) { + if (ctx.error == halide_error_code_success) { compilation_cache.release_hold(user_context, ctx.device, state_ptr); } @@ -181,7 
+187,10 @@ WEAK int halide_vulkan_device_sync(void *user_context, halide_buffer_t *) { debug(user_context) << "halide_vulkan_device_sync (user_context: " << user_context << ")\n"; VulkanContext ctx(user_context); - halide_debug_assert(user_context, ctx.error == VK_SUCCESS); + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; + return ctx.error; + } #ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); @@ -194,7 +203,7 @@ WEAK int halide_vulkan_device_sync(void *user_context, halide_buffer_t *) { debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return VK_SUCCESS; + return halide_error_code_success; } WEAK int halide_vulkan_device_release(void *user_context) { @@ -212,10 +221,8 @@ WEAK int halide_vulkan_device_release(void *user_context) { int acquire_status = halide_vulkan_acquire_context(user_context, reinterpret_cast(&allocator), &instance, &device, &physical_device, &command_pool, &queue, &_throwaway, false); - halide_debug_assert(user_context, acquire_status == VK_SUCCESS); - (void)acquire_status; - if (instance != nullptr) { + if ((acquire_status == halide_error_code_success) && (instance != nullptr)) { vkQueueWaitIdle(queue); if (command_pool == cached_command_pool) { cached_command_pool = 0; @@ -243,7 +250,7 @@ WEAK int halide_vulkan_device_release(void *user_context) { halide_vulkan_release_context(user_context, instance, device, queue); } - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { @@ -252,8 +259,9 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { << ", buf: " << buf << ")\n"; VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { - return -1; + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; + return ctx.error; } size_t size = buf->size_in_bytes(); @@ -261,7 +269,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { MemoryRegion *device_region = (MemoryRegion *)(buf->device); if (device_region->size >= size) { debug(user_context) << "Vulkan: Requested allocation for existing device memory ... using existing buffer!\n"; - return 0; + return halide_error_code_success; } else { debug(user_context) << "Vulkan: Requested allocation of different size ... 
reallocating buffer!\n"; if (halide_can_reuse_device_allocations(user_context)) { @@ -305,7 +313,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { MemoryRegion *device_region = ctx.allocator->reserve(user_context, request); if ((device_region == nullptr) || (device_region->handle == nullptr)) { error(user_context) << "Vulkan: Failed to allocate device memory!\n"; - return -1; + return halide_error_code_device_malloc_failed; } buf->device = (uint64_t)device_region; @@ -326,10 +334,10 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { // create a command buffer VkCommandBuffer command_buffer; - VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); - if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: vkCreateCommandBuffer returned: " << vk_get_error_name(result) << "\n"; - return result; + int error_code = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create command buffer!\n"; + return error_code; } // begin the command buffer @@ -341,10 +349,10 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { nullptr // pointer to parent command buffer }; - result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { - debug(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } // fill buffer with zero values up to the size of the buffer @@ -355,8 +363,8 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { // end the command buffer result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { - debug(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } // submit the command buffer @@ -375,31 +383,36 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { result = vkQueueSubmit(ctx.queue, 1, &submit_info, 0); if (result != VK_SUCCESS) { - debug(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } // wait for memset to finish result = vkQueueWaitIdle(ctx.queue); if (result != VK_SUCCESS) { - debug(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } - vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + error_code = vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to destroy command buffer!\n"; + return error_code; + } #ifdef DEBUG_RUNTIME uint64_t t_after = 
halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halide_buffer) { - int err = halide_vulkan_device_malloc(user_context, halide_buffer); - if (err) { - return err; + int error_code = halide_vulkan_device_malloc(user_context, halide_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to allocate device memory!\n"; + return error_code; } debug(user_context) @@ -408,7 +421,8 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid // Acquire the context so we can use the command queue. VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; return ctx.error; } @@ -416,8 +430,10 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid uint64_t t_before = halide_current_time_ns(user_context); #endif - halide_abort_if_false(user_context, halide_buffer->host && halide_buffer->device); - + if ((halide_buffer->host == nullptr) || (halide_buffer->device == 0)) { + error(user_context) << "Vulkan: Missing host/device pointers for halide buffer!\n"; + return halide_error_code_internal_error; + } device_copy copy_helper = make_host_to_device_copy(halide_buffer); // We construct a staging buffer to copy into from host memory. Then, @@ -433,7 +449,7 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid MemoryRegion *staging_region = ctx.allocator->reserve(user_context, request); if ((staging_region == nullptr) || (staging_region->handle == nullptr)) { error(user_context) << "Vulkan: Failed to allocate device memory!\n"; - return -1; + return halide_error_code_device_malloc_failed; } // map the region to a host ptr @@ -455,11 +471,24 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid } // unmap the pointer - ctx.allocator->unmap(user_context, staging_region); + error_code = ctx.allocator->unmap(user_context, staging_region); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to unmap host pointer to device memory!\n"; + return error_code; + } // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); + if (device_region == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve device region for buffer!\n"; + return halide_error_code_internal_error; + } + MemoryRegion *memory_region = ctx.allocator->owner_of(user_context, device_region); + if (memory_region == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve memory region for device!\n"; + return halide_error_code_internal_error; + } // retrieve the buffer from the region VkBuffer *device_buffer = reinterpret_cast(memory_region->handle); @@ -477,10 +506,10 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid // create a command buffer VkCommandBuffer command_buffer; - VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); - if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: vkCreateCommandBuffer returned: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); + if 
(error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create command buffer!\n"; + return error_code; } // begin the command buffer @@ -492,10 +521,10 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid nullptr // pointer to parent command buffer }; - result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { - debug(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_buffer_copy_failed; } // define the src and dst config @@ -507,16 +536,21 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid uint64_t dst_offset = device_region->range.head_offset; // enqueue the copy operation, using the allocated buffers - vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, - src_offset, dst_offset, - halide_buffer->dimensions, - from_host, to_host); + error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + halide_buffer->dimensions, + from_host, to_host); + + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: vk_do_multidimensional_copy failed!\n"; + return error_code; + } // end the command buffer result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { - debug(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_buffer_copy_failed; } //// 13. Submit the command buffer to our command queue @@ -535,15 +569,15 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid result = vkQueueSubmit(ctx.queue, 1, &submit_info, 0); if (result != VK_SUCCESS) { - debug(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_buffer_copy_failed; } //// 14. Wait until the queue is done with the command buffer result = vkQueueWaitIdle(ctx.queue); if (result != VK_SUCCESS) { - debug(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_buffer_copy_failed; } //// 15. 
Reclaim the staging buffer @@ -553,14 +587,18 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid ctx.allocator->reclaim(user_context, staging_region); } - vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + error_code = vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to destroy command buffer!\n"; + return error_code; + } #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_buffer) { @@ -569,12 +607,17 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ << "halide_copy_to_host (user_context: " << user_context << ", halide_buffer: " << halide_buffer << ")\n"; #endif + if (halide_buffer == nullptr) { + error(user_context) << "Vulkan: Failed to copy buffer to host ... invalid halide buffer!\n"; + return halide_error_code_copy_to_host_failed; + } // Acquire the context so we can use the command queue. This also avoids multiple // redundant calls to enqueue a download when multiple threads are trying to copy // the same buffer. VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; return ctx.error; } @@ -582,15 +625,14 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ uint64_t t_before = halide_current_time_ns(user_context); #endif if ((halide_buffer->host == nullptr) || (halide_buffer->device == 0)) { - error(user_context) << "Vulkan: Unable to copy buffer to host ... 
missing host and device pointers!\n"; - return -1; + error(user_context) << "Vulkan: Missing host/device pointers for halide buffer!\n"; + return halide_error_code_internal_error; } device_copy copy_helper = make_device_to_host_copy(halide_buffer); // This is the inverse of copy_to_device: we create a staging buffer, copy into // it, map it so the host can see it, then copy into the host buffer - MemoryRequest request = {0}; request.size = halide_buffer->size_in_bytes(); request.properties.usage = MemoryUsage::TransferDst; @@ -601,7 +643,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ MemoryRegion *staging_region = ctx.allocator->reserve(user_context, request); if ((staging_region == nullptr) || (staging_region->handle == nullptr)) { error(user_context) << "Vulkan: Failed to allocate device memory!\n"; - return -1; + return halide_error_code_device_malloc_failed; } // retrieve the buffer from the region @@ -613,7 +655,16 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); + if (device_region == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve device region for buffer!\n"; + return halide_error_code_internal_error; + } + MemoryRegion *memory_region = ctx.allocator->owner_of(user_context, device_region); + if (memory_region == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve memory region for buffer!\n"; + return halide_error_code_internal_error; + } // retrieve the buffer from the region VkBuffer *device_buffer = reinterpret_cast(memory_region->handle); @@ -631,10 +682,10 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ // create a command buffer VkCommandBuffer command_buffer; - VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_command_buffer returned: " << vk_get_error_name(result) << "\n"; - return -1; + int error_code = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create command buffer!\n"; + return error_code; } // begin the command buffer @@ -646,10 +697,10 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ nullptr // pointer to parent command buffer }; - result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { - error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_buffer_copy_failed; } // define the src and dst config @@ -662,10 +713,15 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ uint64_t dst_offset = 0; // enqueue the copy operation, using the allocated buffers - vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, - src_offset, dst_offset, - halide_buffer->dimensions, - from_host, to_host); + error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + halide_buffer->dimensions, + from_host, to_host); + + if (error_code 
!= halide_error_code_success) { + error(user_context) << "Vulkan: vk_do_multidimensional_copy failed!\n"; + return error_code; + } // end the command buffer result = vkEndCommandBuffer(command_buffer); @@ -690,22 +746,22 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ result = vkQueueSubmit(ctx.queue, 1, &submit_info, 0); if (result != VK_SUCCESS) { - error(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_copy_to_device_failed; } //// 14. Wait until the queue is done with the command buffer result = vkQueueWaitIdle(ctx.queue); if (result != VK_SUCCESS) { - error(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_copy_to_device_failed; } // map the staging region to a host ptr uint8_t *stage_host_ptr = (uint8_t *)ctx.allocator->map(user_context, staging_region); if (stage_host_ptr == nullptr) { error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; - return halide_error_code_internal_error; + return halide_error_code_copy_to_device_failed; } // copy to the (host-visible/coherent) staging buffer @@ -714,7 +770,12 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ copy_memory(copy_helper, user_context); // unmap the pointer and reclaim the staging region - ctx.allocator->unmap(user_context, staging_region); + error_code = ctx.allocator->unmap(user_context, staging_region); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to unmap staging region!\n"; + return error_code; + } + if (halide_can_reuse_device_allocations(user_context)) { ctx.allocator->release(user_context, staging_region); } else { @@ -727,25 +788,24 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *src, const struct halide_device_interface_t *dst_device_interface, struct halide_buffer_t *dst) { if (dst->dimensions > MAX_COPY_DIMS) { - error(user_context) << "Buffer has too many dimensions to copy to/from GPU\n"; - return halide_error_code_device_buffer_copy_failed; + error(user_context) << "Vulkan: Buffer has too many dimensions to copy to/from GPU\n"; + return halide_error_code_buffer_extents_too_large; } // We only handle copies to Vulkan buffers or to host - if (dst_device_interface != nullptr && dst_device_interface != &vulkan_device_interface) { - error(user_context) << "halide_vulkan_buffer_copy: only handle copies to metal buffers or to host\n"; + if ((dst_device_interface != nullptr) && (dst_device_interface != &vulkan_device_interface)) { + error(user_context) << "Vulkan: Unable to copy buffer ... 
only Vulkan allocated device buffers copying to/from host are supported!\n"; return halide_error_code_device_buffer_copy_failed; } - if ((src->device_dirty() || src->host == nullptr) && src->device_interface != &vulkan_device_interface) { - halide_debug_assert(user_context, dst_device_interface == &vulkan_device_interface); + if ((src->device_dirty() || src->host == nullptr) && (src->device_interface != &vulkan_device_interface)) { // This is handled at the higher level. return halide_error_code_incompatible_device_interface; } @@ -756,20 +816,21 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s bool to_host = !dst_device_interface; if (!(from_host || src->device)) { - error(user_context) << "halide_vulkan_buffer_copy: invalid copy source\n"; + error(user_context) << "Vulkan: halide_vulkan_buffer_copy: invalid copy source\n"; return halide_error_code_device_buffer_copy_failed; } if (!(to_host || dst->device)) { - error(user_context) << "halide_vulkan_buffer_copy: invalid copy destination\n"; + error(user_context) << "Vulkan: halide_vulkan_buffer_copy: invalid copy destination\n"; return halide_error_code_device_buffer_copy_failed; } device_copy copy_helper = make_buffer_copy(src, from_host, dst, to_host); - int err = 0; + int error_code = halide_error_code_success; { VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; return ctx.error; } @@ -787,10 +848,15 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s //// wait until the queue is done with the command buffer VkResult wait_result = vkQueueWaitIdle(ctx.queue); if (wait_result != VK_SUCCESS) { - error(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(wait_result) << "\n"; - return wait_result; + error(user_context) << "Vulkan: vkQueueWaitIdle returned " << vk_get_error_name(wait_result) << "\n"; + if (to_host) { + return halide_error_code_copy_to_host_failed; + } else { + return halide_error_code_copy_to_device_failed; + } } + int error_code = halide_error_code_success; if (!from_host && !to_host) { // Device only case debug(user_context) << " buffer copy from: device to: device\n"; @@ -814,7 +880,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s staging_region = ctx.allocator->reserve(user_context, request); if ((staging_region == nullptr) || (staging_region->handle == nullptr)) { error(user_context) << "Vulkan: Failed to allocate device memory!\n"; - return -1; + return halide_error_code_device_malloc_failed; } // use the staging region and buffer from the copy destination @@ -836,14 +902,14 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s staging_region = ctx.allocator->reserve(user_context, request); if ((staging_region == nullptr) || (staging_region->handle == nullptr)) { error(user_context) << "Vulkan: Failed to allocate device memory!\n"; - return -1; + return halide_error_code_device_malloc_failed; } // map the region to a host ptr uint8_t *stage_host_ptr = (uint8_t *)ctx.allocator->map(user_context, staging_region); if (stage_host_ptr == nullptr) { error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; - return halide_error_code_internal_error; + return halide_error_code_copy_to_device_failed; } // copy to the (host-visible/coherent) staging buffer, then restore the dst pointer @@ -853,7 +919,11 @@ WEAK int 
halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s copy_helper.dst = copy_dst_ptr; // unmap the pointer - ctx.allocator->unmap(user_context, staging_region); + error_code = ctx.allocator->unmap(user_context, staging_region); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to unmap staging region!\n"; + return halide_error_code_copy_to_device_failed; + } // use the staging region and buffer from the copy source src_buffer_region = staging_region; @@ -862,7 +932,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s } else if (from_host && to_host) { debug(user_context) << " buffer copy from: host to: host\n"; copy_memory(copy_helper, user_context); - return 0; + return halide_error_code_success; } if (src_buffer_region == nullptr) { @@ -885,10 +955,14 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s // create a command buffer VkCommandBuffer command_buffer; - VkResult result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_command_buffer returned: " << vk_get_error_name(result) << "\n"; - return -1; + error_code = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create command buffer!\n"; + if (to_host) { + return halide_error_code_copy_to_host_failed; + } else { + return halide_error_code_copy_to_device_failed; + } } // begin the command buffer @@ -900,10 +974,14 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s nullptr // pointer to parent command buffer }; - result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { - error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + if (to_host) { + return halide_error_code_copy_to_host_failed; + } else { + return halide_error_code_copy_to_device_failed; + } } // define the src and dst config @@ -921,16 +999,25 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s debug(user_context) << " dst region=" << (void *)dst_memory_region << " buffer=" << (void *)dst_device_buffer << " crop_offset=" << (uint64_t)dst_buffer_region->range.head_offset << " copy_offset=" << dst_offset << "\n"; // enqueue the copy operation, using the allocated buffers - vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, - src_offset, dst_offset, - src->dimensions, - from_host, to_host); + error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, + src_offset, dst_offset, + src->dimensions, + from_host, to_host); + + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: vk_do_multidimensional_copy failed!\n"; + return error_code; + } // end the command buffer result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + if (to_host) { + return halide_error_code_copy_to_host_failed; + } else { + return halide_error_code_copy_to_device_failed; + } } //// submit the command 
buffer to our command queue @@ -974,18 +1061,31 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s copy_memory(copy_helper, user_context); // unmap the pointer and reclaim the staging region - ctx.allocator->unmap(user_context, staging_region); + error_code = ctx.allocator->unmap(user_context, staging_region); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to unmap pointer for staging region!\n"; + return error_code; + } } if (staging_region) { if (halide_can_reuse_device_allocations(user_context)) { - ctx.allocator->release(user_context, staging_region); + error_code = ctx.allocator->release(user_context, staging_region); } else { - ctx.allocator->reclaim(user_context, staging_region); + error_code = ctx.allocator->reclaim(user_context, staging_region); } } - vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to release staging region allocation!\n"; + return error_code; + } + + error_code = vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to destroy command buffer!\n"; + return error_code; + } #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -993,7 +1093,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s #endif } - return err; + return error_code; } WEAK int halide_vulkan_device_crop(void *user_context, @@ -1019,7 +1119,8 @@ WEAK int halide_vulkan_device_release_crop(void *user_context, << ", halide_buffer: " << halide_buffer << ")\n"; VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; return ctx.error; } @@ -1027,18 +1128,25 @@ WEAK int halide_vulkan_device_release_crop(void *user_context, uint64_t t_before = halide_current_time_ns(user_context); #endif - halide_abort_if_false(user_context, halide_buffer->device); - // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(halide_buffer->device); - ctx.allocator->destroy_crop(user_context, device_region); + if (device_region == nullptr) { + error(user_context) << "Vulkan: Failed to retrieve device region for buffer!\n"; + return halide_error_code_internal_error; + } + + int error_code = ctx.allocator->destroy_crop(user_context, device_region); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to destroy crop for device region!\n"; + return error_code; + } #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_run(void *user_context, @@ -1060,7 +1168,8 @@ WEAK int halide_vulkan_run(void *user_context, #endif VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; return ctx.error; } @@ -1101,8 +1210,7 @@ WEAK int halide_vulkan_run(void *user_context, // 1. 
Get the shader module cache entry VulkanCompilationCacheEntry *cache_entry = nullptr; bool found = compilation_cache.lookup(ctx.device, state_ptr, cache_entry); - halide_abort_if_false(user_context, found); - if (cache_entry == nullptr) { + if (!found || (cache_entry == nullptr)) { error(user_context) << "Vulkan: Failed to locate shader module! Unable to proceed!\n"; return halide_error_code_internal_error; } @@ -1116,7 +1224,7 @@ WEAK int halide_vulkan_run(void *user_context, found_entry_point = true; } } - if (!found_entry_point) { + if (!found_entry_point || (entry_point_index >= cache_entry->shader_count)) { error(user_context) << "Vulkan: Failed to locate shader entry point! Unable to proceed!\n"; return halide_error_code_internal_error; } @@ -1125,7 +1233,12 @@ WEAK int halide_vulkan_run(void *user_context, << "] '" << entry_name << "'\n"; // 2. Create objects for execution - halide_abort_if_false(user_context, cache_entry->descriptor_set_layouts != nullptr); + if (cache_entry->descriptor_set_layouts == nullptr) { + error(user_context) << "Vulkan: Missing descriptor set layouts! Unable to proceed!\n"; + return halide_error_code_internal_error; + } + + int error_code = halide_error_code_success; if (cache_entry->pipeline_layout == 0) { // 2a. Create all descriptor set layouts @@ -1134,25 +1247,22 @@ WEAK int halide_vulkan_run(void *user_context, uint32_t uniform_buffer_count = cache_entry->shader_bindings[n].uniform_buffer_count; uint32_t storage_buffer_count = cache_entry->shader_bindings[n].storage_buffer_count; debug(user_context) << " creating descriptor set layout [" << n << "] " << cache_entry->shader_bindings[n].entry_point_name << "\n"; - VkResult result = vk_create_descriptor_set_layout(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(cache_entry->descriptor_set_layouts[n])); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_descriptor_set_layout() failed! Unable to create shader module! Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_create_descriptor_set_layout(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(cache_entry->descriptor_set_layouts[n])); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create descriptor set layout!\n"; + return error_code; } } } // 2b. Create the pipeline layout - VkResult result = vk_create_pipeline_layout(user_context, ctx.allocator, cache_entry->shader_count, cache_entry->descriptor_set_layouts, &(cache_entry->pipeline_layout)); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_pipeline_layout() failed! Unable to create shader module! 
Error: " << vk_get_error_name(result) << "\n"; - return halide_error_code_internal_error; + error_code = vk_create_pipeline_layout(user_context, ctx.allocator, cache_entry->shader_count, cache_entry->descriptor_set_layouts, &(cache_entry->pipeline_layout)); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create pipeline layout!\n"; + return error_code; } } - VulkanShaderBinding *entry_point_binding = (cache_entry->shader_bindings + entry_point_index); - halide_abort_if_false(user_context, entry_point_binding != nullptr); - VulkanDispatchData dispatch_data = {}; dispatch_data.shared_mem_bytes = shared_mem_bytes; dispatch_data.global_size[0] = blocksX; @@ -1162,11 +1272,13 @@ WEAK int halide_vulkan_run(void *user_context, dispatch_data.local_size[1] = threadsY; dispatch_data.local_size[2] = threadsZ; + VulkanShaderBinding *entry_point_binding = (cache_entry->shader_bindings + entry_point_index); + // 2c. Setup the compute pipeline (eg override any specializations for shared mem or workgroup size) - VkResult result = vk_setup_compute_pipeline(user_context, ctx.allocator, entry_point_binding, &dispatch_data, cache_entry->shader_module, cache_entry->pipeline_layout, &(entry_point_binding->compute_pipeline)); - if (result != VK_SUCCESS) { - error(user_context) << "vk_setup_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return halide_error_code_internal_error; + error_code = vk_setup_compute_pipeline(user_context, ctx.allocator, entry_point_binding, &dispatch_data, cache_entry->shader_module, cache_entry->pipeline_layout, &(entry_point_binding->compute_pipeline)); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to setup compute pipeline!\n"; + return error_code; } // 2d. Create a descriptor set @@ -1179,17 +1291,17 @@ WEAK int halide_vulkan_run(void *user_context, uint32_t uniform_buffer_count = entry_point_binding->uniform_buffer_count; uint32_t storage_buffer_count = entry_point_binding->storage_buffer_count; - VkResult result = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(entry_point_binding->descriptor_pool)); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_create_descriptor_pool(user_context, ctx.allocator, uniform_buffer_count, storage_buffer_count, &(entry_point_binding->descriptor_pool)); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Unable to create shader module ... failed to create descriptor pool!\n"; + return error_code; } // Create the descriptor set - result = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layouts[entry_point_index], entry_point_binding->descriptor_pool, &(entry_point_binding->descriptor_set)); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_descriptor_pool() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_create_descriptor_set(user_context, ctx.allocator, cache_entry->descriptor_set_layouts[entry_point_index], entry_point_binding->descriptor_pool, &(entry_point_binding->descriptor_set)); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Unable to create shader module ... 
failed to create descriptor set!\n"; + return error_code; } } @@ -1199,8 +1311,8 @@ WEAK int halide_vulkan_run(void *user_context, if (scalar_buffer_size > 0) { entry_point_binding->args_region = vk_create_scalar_uniform_buffer(user_context, ctx.allocator, scalar_buffer_size); if (entry_point_binding->args_region == nullptr) { - error(user_context) << "vk_create_scalar_uniform_buffer() failed! Unable to create shader module!\n"; - return halide_error_code_internal_error; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer!\n"; + return halide_error_code_out_of_memory; } } } @@ -1208,10 +1320,10 @@ WEAK int halide_vulkan_run(void *user_context, // 3b. Update uniform buffer with scalar parameters VkBuffer *args_buffer = nullptr; if ((entry_point_binding->args_region != nullptr) && entry_point_binding->uniform_buffer_count) { - VkResult result = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, entry_point_binding->args_region, arg_sizes, args, arg_is_buffer); - if (result != VK_SUCCESS) { - debug(user_context) << "vk_update_scalar_uniform_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, entry_point_binding->args_region, arg_sizes, args, arg_is_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to update scalar uniform buffer!\n"; + return error_code; } args_buffer = reinterpret_cast(entry_point_binding->args_region->handle); @@ -1222,50 +1334,54 @@ WEAK int halide_vulkan_run(void *user_context, } // 3c. Update buffer bindings for descriptor set - result = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_binding->descriptor_set); - if (result != VK_SUCCESS) { - debug(user_context) << "vk_update_descriptor_set() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_binding->descriptor_set); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to update descriptor set!\n"; + return error_code; } // 4. Create a command buffer from the command pool VkCommandBuffer command_buffer; - result = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); - if (result != VK_SUCCESS) { - debug(user_context) << "vk_create_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create command buffer!\n"; + return error_code; } // 5. Fill the command buffer - result = vk_fill_command_buffer_with_dispatch_call(user_context, - ctx.device, command_buffer, - entry_point_binding->compute_pipeline, - cache_entry->pipeline_layout, - entry_point_binding->descriptor_set, - entry_point_index, - blocksX, blocksY, blocksZ); - if (result != VK_SUCCESS) { - debug(user_context) << "vk_fill_command_buffer_with_dispatch_call() failed! Unable to proceed! 
Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_fill_command_buffer_with_dispatch_call(user_context, + ctx.device, command_buffer, + entry_point_binding->compute_pipeline, + cache_entry->pipeline_layout, + entry_point_binding->descriptor_set, + entry_point_index, + blocksX, blocksY, blocksZ); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to fill command buffer with dispatch call!\n"; + return error_code; } // 6. Submit the command buffer to our command queue - result = vk_submit_command_buffer(user_context, ctx.queue, command_buffer); - if (result != VK_SUCCESS) { - debug(user_context) << "vk_submit_command_buffer() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_submit_command_buffer(user_context, ctx.queue, command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to submit command buffer!\n"; + return error_code; } // 7. Wait until the queue is done with the command buffer - result = vkQueueWaitIdle(ctx.queue); + VkResult result = vkQueueWaitIdle(ctx.queue); if (result != VK_SUCCESS) { - debug(user_context) << "vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } // 8. Cleanup - vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + error_code = vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); vkResetCommandPool(ctx.device, ctx.command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to destroy command buffer!\n"; + return error_code; + } #ifdef DEBUG_RUNTIME debug(user_context) << "halide_vulkan_run: blocks_allocated=" @@ -1278,7 +1394,7 @@ WEAK int halide_vulkan_run(void *user_context, uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) { @@ -1292,24 +1408,27 @@ WEAK int halide_vulkan_device_and_host_free(void *user_context, struct halide_bu WEAK int halide_vulkan_wrap_vk_buffer(void *user_context, struct halide_buffer_t *buf, uint64_t vk_buffer) { halide_debug_assert(user_context, buf->device == 0); if (buf->device != 0) { - return -2; + error(user_context) << "Vulkan: Unable to wrap buffer ... invalid device pointer!\n"; + return halide_error_code_device_wrap_native_failed; } buf->device = vk_buffer; buf->device_interface = &vulkan_device_interface; buf->device_interface->impl->use_module(); - - return 0; + return halide_error_code_success; } WEAK int halide_vulkan_detach_vk_buffer(void *user_context, halide_buffer_t *buf) { if (buf->device == 0) { - return 0; + return halide_error_code_success; + } + if (buf->device_interface != &vulkan_device_interface) { + error(user_context) << "Vulkan: Unable to detach buffer ... 
invalid device interface!\n"; + return halide_error_code_incompatible_device_interface; } - halide_debug_assert(user_context, buf->device_interface == &vulkan_device_interface); buf->device = 0; buf->device_interface->impl->release_module(); buf->device_interface = nullptr; - return 0; + return halide_error_code_success; } WEAK uintptr_t halide_vulkan_get_vk_buffer(void *user_context, halide_buffer_t *buf) { @@ -1332,13 +1451,14 @@ WEAK int halide_vulkan_release_unused_device_allocations(void *user_context) { << ")\n"; VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { - return -1; + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; + return ctx.error; } // collect all unused allocations ctx.allocator->collect(user_context); - return 0; + return halide_error_code_success; } namespace { diff --git a/src/runtime/vulkan_context.h b/src/runtime/vulkan_context.h index 2f94d170486a..e064ea26643c 100644 --- a/src/runtime/vulkan_context.h +++ b/src/runtime/vulkan_context.h @@ -46,7 +46,7 @@ class VulkanContext { VkPhysicalDevice physical_device; VkQueue queue; uint32_t queue_family_index; // used for operations requiring queue family - VkResult error; + halide_error_code_t error; HALIDE_ALWAYS_INLINE VulkanContext(void *user_context) : user_context(user_context), @@ -57,12 +57,13 @@ class VulkanContext { physical_device(nullptr), queue(nullptr), queue_family_index(0), - error(VK_SUCCESS) { + error(halide_error_code_success) { int result = halide_vulkan_acquire_context(user_context, reinterpret_cast(&allocator), &instance, &device, &physical_device, &command_pool, &queue, &queue_family_index); if (result != halide_error_code_success) { + error = halide_error_code_device_interface_no_device; halide_error_no_device_interface(user_context); } halide_debug_assert(user_context, allocator != nullptr); @@ -178,7 +179,7 @@ int vk_create_instance(void *user_context, const StringTable &requested_layers, VkResult result = vkCreateInstance(&create_info, alloc_callbacks, instance); if (result != VK_SUCCESS) { debug(user_context) << "Vulkan: vkCreateInstance failed with return code: " << vk_get_error_name(result) << "\n"; - return halide_error_code_incompatible_device_interface; + return halide_error_code_device_interface_no_device; } return halide_error_code_success; @@ -199,11 +200,11 @@ int vk_select_device_for_context(void *user_context, VkResult result = vkEnumeratePhysicalDevices(*instance, &device_count, nullptr); if ((result != VK_SUCCESS) && (result != VK_INCOMPLETE)) { debug(user_context) << "Vulkan: vkEnumeratePhysicalDevices failed with return code: " << vk_get_error_name(result) << "\n"; - return halide_error_code_incompatible_device_interface; + return halide_error_code_device_interface_no_device; } if (device_count == 0) { debug(user_context) << "Vulkan: No devices found.\n"; - return halide_error_code_incompatible_device_interface; + return halide_error_code_device_interface_no_device; } // allocate enough storage for the physical device query results @@ -221,7 +222,7 @@ int vk_select_device_for_context(void *user_context, result = vkEnumeratePhysicalDevices(*instance, &device_count, avail_devices); if ((result != VK_SUCCESS) && (result != VK_INCOMPLETE)) { debug(user_context) << "Vulkan: vkEnumeratePhysicalDevices failed with return code: " << vk_get_error_name(result) << "\n"; - return halide_error_code_incompatible_device_interface; + return halide_error_code_device_interface_no_device; } // get the configurable device 
type to search for (e.g. 'cpu', 'gpu', 'integrated-gpu', 'discrete-gpu', ...) @@ -315,7 +316,7 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk bool valid_device = vk_validate_required_extension_support(user_context, required_device_extensions, supported_device_extensions); if (!valid_device) { debug(user_context) << "Vulkan: Unable to validate required extension support!\n"; - return halide_error_code_incompatible_device_interface; + return halide_error_code_device_interface_no_device; } debug(user_context) << " found " << (uint32_t)required_device_extensions.size() << " required extensions for device!\n"; @@ -418,7 +419,7 @@ int vk_create_device(void *user_context, const StringTable &requested_layers, Vk VkResult result = vkCreateDevice(*physical_device, &device_create_info, alloc_callbacks, device); if (result != VK_SUCCESS) { debug(user_context) << "Vulkan: vkCreateDevice failed with return code: " << vk_get_error_name(result) << "\n"; - return halide_error_code_incompatible_device_interface; + return halide_error_code_device_interface_no_device; } vkGetDeviceQueue(cached_device, *queue_family_index, 0, queue); @@ -440,26 +441,26 @@ int vk_create_context(void *user_context, VulkanMemoryAllocator **allocator, } const VkAllocationCallbacks *alloc_callbacks = halide_vulkan_get_allocation_callbacks(user_context); - int status = vk_create_instance(user_context, requested_layers, instance, alloc_callbacks); - if (status != halide_error_code_success) { + int error_code = vk_create_instance(user_context, requested_layers, instance, alloc_callbacks); + if (error_code != halide_error_code_success) { error(user_context) << "Vulkan: Failed to create instance for context!\n"; - return halide_error_code_generic_error; + return error_code; } if (vkCreateDevice == nullptr) { vk_load_vulkan_functions(*instance); } - status = vk_select_device_for_context(user_context, instance, device, physical_device, queue_family_index); - if (status != halide_error_code_success) { + error_code = vk_select_device_for_context(user_context, instance, device, physical_device, queue_family_index); + if (error_code != halide_error_code_success) { error(user_context) << "Vulkan: Failed to select device for context!\n"; - return halide_error_code_generic_error; + return error_code; } - status = vk_create_device(user_context, requested_layers, instance, device, queue, physical_device, queue_family_index, alloc_callbacks); - if (status != halide_error_code_success) { + error_code = vk_create_device(user_context, requested_layers, instance, device, queue, physical_device, queue_family_index, alloc_callbacks); + if (error_code != halide_error_code_success) { error(user_context) << "Vulkan: Failed to create device for context!\n"; - return halide_error_code_generic_error; + return error_code; } *allocator = vk_create_memory_allocator(user_context, *device, *physical_device, alloc_callbacks); @@ -468,10 +469,10 @@ int vk_create_context(void *user_context, VulkanMemoryAllocator **allocator, return halide_error_code_generic_error; } - VkResult result = vk_create_command_pool(user_context, *allocator, *queue_family_index, command_pool); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create command pool for context! 
Error: " << vk_get_error_name(result) << "\n"; - return result; + error_code = vk_create_command_pool(user_context, *allocator, *queue_family_index, command_pool); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create command pool for context!\n"; + return error_code; } return halide_error_code_success; diff --git a/src/runtime/vulkan_extensions.h b/src/runtime/vulkan_extensions.h index 5bf17048a2ad..1eaea7ced487 100644 --- a/src/runtime/vulkan_extensions.h +++ b/src/runtime/vulkan_extensions.h @@ -204,6 +204,7 @@ uint32_t vk_get_optional_device_extensions(void *user_context, StringTable &ext_ } return optional_ext_count; } + uint32_t vk_get_supported_device_extensions(void *user_context, VkPhysicalDevice physical_device, StringTable &ext_table) { debug(user_context) << "vk_get_supported_device_extensions\n"; if (vkEnumerateDeviceExtensionProperties == nullptr) { diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 5d8dbf0484d0..04d702627045 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -85,23 +85,23 @@ bool vk_validate_required_extension_support(void *user_context, // -------------------------------------------------------------------------- // -- Command Pool -VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool); -void vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool); +int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool); +int vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool); // -- Command Buffer -VkResult vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool pool, VkCommandBuffer *command_buffer); -void vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer command_buffer); +int vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool pool, VkCommandBuffer *command_buffer); +int vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer command_buffer); -VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, - VkDevice device, - VkCommandBuffer command_buffer, - VkPipeline compute_pipeline, - VkPipelineLayout pipeline_layout, - VkDescriptorSet descriptor_set, - uint32_t descriptor_set_index, - int blocksX, int blocksY, int blocksZ); +int vk_fill_command_buffer_with_dispatch_call(void *user_context, + VkDevice device, + VkCommandBuffer command_buffer, + VkPipeline compute_pipeline, + VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, + uint32_t descriptor_set_index, + int blocksX, int blocksY, int blocksZ); -VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer); +int vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer); // -- Scalar Uniform Buffer bool vk_needs_scalar_uniform_buffer(void *user_context, @@ -118,25 +118,25 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, size_t scalar_buffer_size); -VkResult vk_update_scalar_uniform_buffer(void *user_context, - VulkanMemoryAllocator *allocator, - MemoryRegion *region, - size_t 
arg_sizes[], - void *args[], - int8_t arg_is_buffer[]); +int vk_update_scalar_uniform_buffer(void *user_context, + VulkanMemoryAllocator *allocator, + MemoryRegion *region, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]); -void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, - MemoryRegion *scalar_args_region); +int vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, + MemoryRegion *scalar_args_region); // -- Descriptor Pool -VkResult vk_create_descriptor_pool(void *user_context, - VulkanMemoryAllocator *allocator, - uint32_t uniform_buffer_count, - uint32_t storage_buffer_count, - VkDescriptorPool *descriptor_pool); +int vk_create_descriptor_pool(void *user_context, + VulkanMemoryAllocator *allocator, + uint32_t uniform_buffer_count, + uint32_t storage_buffer_count, + VkDescriptorPool *descriptor_pool); -VkResult vk_destroy_descriptor_pool(void *user_context, - VulkanMemoryAllocator *allocator, - VkDescriptorPool descriptor_pool); +int vk_destroy_descriptor_pool(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorPool descriptor_pool); // -- Descriptor Set Layout uint32_t vk_count_bindings_for_descriptor_set(void *user_context, @@ -144,62 +144,62 @@ uint32_t vk_count_bindings_for_descriptor_set(void *user_context, void *args[], int8_t arg_is_buffer[]); -VkResult vk_create_descriptor_set_layout(void *user_context, - VulkanMemoryAllocator *allocator, - uint32_t uniform_buffer_count, - uint32_t storage_buffer_count, - VkDescriptorSetLayout *layout); +int vk_create_descriptor_set_layout(void *user_context, + VulkanMemoryAllocator *allocator, + uint32_t uniform_buffer_count, + uint32_t storage_buffer_count, + VkDescriptorSetLayout *layout); -VkResult vk_destroy_descriptor_set_layout(void *user_context, - VulkanMemoryAllocator *allocator, - VkDescriptorSetLayout descriptor_set_layout); +int vk_destroy_descriptor_set_layout(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout descriptor_set_layout); // -- Descriptor Set -VkResult vk_create_descriptor_set(void *user_context, - VulkanMemoryAllocator *allocator, - VkDescriptorSetLayout descriptor_set_layout, - VkDescriptorPool descriptor_pool, - VkDescriptorSet *descriptor_set); - -VkResult vk_update_descriptor_set(void *user_context, - VulkanMemoryAllocator *allocator, - VkBuffer *scalar_args_buffer, - size_t uniform_buffer_count, - size_t storage_buffer_count, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[], - VkDescriptorSet descriptor_set); +int vk_create_descriptor_set(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout descriptor_set_layout, + VkDescriptorPool descriptor_pool, + VkDescriptorSet *descriptor_set); + +int vk_update_descriptor_set(void *user_context, + VulkanMemoryAllocator *allocator, + VkBuffer *scalar_args_buffer, + size_t uniform_buffer_count, + size_t storage_buffer_count, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[], + VkDescriptorSet descriptor_set); // -- Pipeline Layout -VkResult vk_create_pipeline_layout(void *user_context, - VulkanMemoryAllocator *allocator, - uint32_t descriptor_set_count, - VkDescriptorSetLayout *descriptor_set_layouts, - VkPipelineLayout *pipeline_layout); - -VkResult vk_destroy_pipeline_layout(void *user_context, - VulkanMemoryAllocator *allocator, - VkPipelineLayout pipeline_layout); +int vk_create_pipeline_layout(void *user_context, + VulkanMemoryAllocator *allocator, + uint32_t descriptor_set_count, + 
VkDescriptorSetLayout *descriptor_set_layouts, + VkPipelineLayout *pipeline_layout); + +int vk_destroy_pipeline_layout(void *user_context, + VulkanMemoryAllocator *allocator, + VkPipelineLayout pipeline_layout); // -- Compute Pipeline -VkResult vk_create_compute_pipeline(void *user_context, - VulkanMemoryAllocator *allocator, - const char *pipeline_name, - VkShaderModule shader_module, - VkPipelineLayout pipeline_layout, - VkSpecializationInfo *specialization_info, - VkPipeline *compute_pipeline); - -VkResult vk_setup_compute_pipeline(void *user_context, - VulkanMemoryAllocator *allocator, - VulkanShaderBinding *shader_bindings, - VkShaderModule shader_module, - VkPipelineLayout pipeline_layout, - VkPipeline *compute_pipeline); - -VkResult vk_destroy_compute_pipeline(void *user_context, - VulkanMemoryAllocator *allocator, - VkPipeline compute_pipeline); +int vk_create_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + const char *pipeline_name, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + VkSpecializationInfo *specialization_info, + VkPipeline *compute_pipeline); + +int vk_setup_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + VulkanShaderBinding *shader_bindings, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + VkPipeline *compute_pipeline); + +int vk_destroy_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + VkPipeline compute_pipeline); // -- Shader Module VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryAllocator *allocator, diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index a09be01291ec..f0301f5b37bf 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -2,10 +2,11 @@ #define HALIDE_RUNTIME_VULKAN_MEMORY_H #include "internal/block_allocator.h" +#include "internal/memory_resources.h" #include "vulkan_internal.h" // Uncomment to enable verbose memory allocation debugging -#define HL_VK_DEBUG_MEM 1 +// #define HL_VK_DEBUG_MEM 1 namespace Halide { namespace Runtime { @@ -53,21 +54,21 @@ class VulkanMemoryAllocator { const SystemMemoryAllocatorFns &system_allocator, const VkAllocationCallbacks *alloc_callbacks = nullptr); - static void destroy(void *user_context, VulkanMemoryAllocator *allocator); + static int destroy(void *user_context, VulkanMemoryAllocator *allocator); // Public interface methods MemoryRegion *reserve(void *user_context, MemoryRequest &request); - void release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - void reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - void retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count - bool collect(void *user_context); //< returns true if any blocks were removed - void release(void *user_context); - void destroy(void *user_context); + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count + bool collect(void *user_context); //< returns true if any blocks were removed + int release(void *user_context); + int destroy(void *user_context); void *map(void *user_context, MemoryRegion *region); - void unmap(void *user_context, MemoryRegion *region); + int unmap(void *user_context, 
MemoryRegion *region); MemoryRegion *create_crop(void *user_context, MemoryRegion *region, uint64_t offset); - void destroy_crop(void *user_context, MemoryRegion *region); + int destroy_crop(void *user_context, MemoryRegion *region); MemoryRegion *owner_of(void *user_context, MemoryRegion *region); VkDevice current_device() const { @@ -82,11 +83,11 @@ class VulkanMemoryAllocator { static const VulkanMemoryConfig &default_config(); - static void allocate_block(void *user_context, MemoryBlock *block); - static void deallocate_block(void *user_context, MemoryBlock *block); + static int allocate_block(void *instance_ptr, MemoryBlock *block); + static int deallocate_block(void *instance_ptr, MemoryBlock *block); - static void allocate_region(void *user_context, MemoryRegion *region); - static void deallocate_region(void *user_context, MemoryRegion *region); + static int allocate_region(void *instance_ptr, MemoryRegion *region); + static int deallocate_region(void *instance_ptr, MemoryRegion *region); size_t bytes_allocated_for_blocks() const; size_t blocks_allocated() const; @@ -99,10 +100,10 @@ class VulkanMemoryAllocator { static constexpr uint32_t invalid_memory_type = uint32_t(VK_MAX_MEMORY_TYPES); // Initializes a new instance - void initialize(void *user_context, const VulkanMemoryConfig &config, - VkDevice dev, VkPhysicalDevice phys_dev, - const SystemMemoryAllocatorFns &system_allocator, - const VkAllocationCallbacks *alloc_callbacks = nullptr); + int initialize(void *user_context, const VulkanMemoryConfig &config, + VkDevice dev, VkPhysicalDevice phys_dev, + const SystemMemoryAllocatorFns &system_allocator, + const VkAllocationCallbacks *alloc_callbacks = nullptr); uint32_t select_memory_usage(void *user_context, MemoryProperties properties) const; @@ -115,6 +116,7 @@ class VulkanMemoryAllocator { size_t block_count = 0; size_t region_byte_count = 0; size_t region_count = 0; + void *owner_context = nullptr; VulkanMemoryConfig config; VkDevice device = nullptr; VkPhysicalDevice physical_device = nullptr; @@ -128,33 +130,45 @@ VulkanMemoryAllocator *VulkanMemoryAllocator::create(void *user_context, const SystemMemoryAllocatorFns &system_allocator, const VkAllocationCallbacks *alloc_callbacks) { - halide_abort_if_false(user_context, system_allocator.allocate != nullptr); + if (system_allocator.allocate == nullptr) { + error(user_context) << "VulkanBlockAllocator: Unable to create instance! Missing system allocator interface!\n"; + return nullptr; + } + VulkanMemoryAllocator *result = reinterpret_cast( system_allocator.allocate(user_context, sizeof(VulkanMemoryAllocator))); if (result == nullptr) { error(user_context) << "VulkanMemoryAllocator: Failed to create instance! Out of memory!\n"; - return nullptr; + return nullptr; // caller must handle error case for out-of-memory } result->initialize(user_context, cfg, dev, phys_dev, system_allocator, alloc_callbacks); return result; } -void VulkanMemoryAllocator::destroy(void *user_context, VulkanMemoryAllocator *instance) { - halide_abort_if_false(user_context, instance != nullptr); +int VulkanMemoryAllocator::destroy(void *user_context, VulkanMemoryAllocator *instance) { + if (instance == nullptr) { + error(user_context) << "VulkanBlockAllocator: Unable to destroy instance! 
Invalid instance pointer!\n";
+        return halide_error_code_internal_error;
+    }
 
     const BlockAllocator::MemoryAllocators &allocators = instance->block_allocator->current_allocators();
     instance->destroy(user_context);
     BlockAllocator::destroy(user_context, instance->block_allocator);
 
-    halide_abort_if_false(user_context, allocators.system.deallocate != nullptr);
+    if (allocators.system.deallocate == nullptr) {
+        error(user_context) << "VulkanBlockAllocator: Unable to destroy instance! Missing system allocator interface!\n";
+        return halide_error_code_internal_error;
+    }
     allocators.system.deallocate(user_context, instance);
+    return halide_error_code_success;
 }
 
-void VulkanMemoryAllocator::initialize(void *user_context,
-                                       const VulkanMemoryConfig &cfg, VkDevice dev, VkPhysicalDevice phys_dev,
-                                       const SystemMemoryAllocatorFns &system_allocator,
-                                       const VkAllocationCallbacks *callbacks) {
+int VulkanMemoryAllocator::initialize(void *user_context,
+                                      const VulkanMemoryConfig &cfg, VkDevice dev, VkPhysicalDevice phys_dev,
+                                      const SystemMemoryAllocatorFns &system_allocator,
+                                      const VkAllocationCallbacks *callbacks) {
+    owner_context = user_context;
     config = cfg;
     device = dev;
     physical_device = phys_dev;
@@ -173,68 +187,89 @@ void VulkanMemoryAllocator::initialize(void *user_context,
     block_allocator_config.maximum_block_size = cfg.maximum_block_size;
     block_allocator_config.minimum_block_size = cfg.minimum_block_size;
     block_allocator = BlockAllocator::create(user_context, block_allocator_config, allocators);
-    halide_abort_if_false(user_context, block_allocator != nullptr);
+    if (block_allocator == nullptr) {
+        error(user_context) << "VulkanMemoryAllocator: Failed to create BlockAllocator! Out of memory?!\n";
+        return halide_error_code_out_of_memory;
+    }
 
     // get the physical device properties to determine limits and allocation requirements
     VkPhysicalDeviceProperties physical_device_properties = {0};
     memset(&physical_device_limits, 0, sizeof(VkPhysicalDeviceLimits));
     vkGetPhysicalDeviceProperties(physical_device, &physical_device_properties);
     memcpy(&physical_device_limits, &(physical_device_properties.limits), sizeof(VkPhysicalDeviceLimits));
+    return halide_error_code_success;
 }
 
 MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) {
 #if defined(HL_VK_DEBUG_MEM)
-    debug(nullptr) << "VulkanMemoryAllocator: Reserving memory ("
-                   << "user_context=" << user_context << " "
-                   << "block_allocator=" << (void *)(block_allocator) << " "
-                   << "request_size=" << (uint32_t)(request.size) << " "
-                   << "device=" << (void *)(device) << " "
-                   << "physical_device=" << (void *)(physical_device) << ") ...\n";
+    debug(user_context) << "VulkanMemoryAllocator: Reserving memory ("
+                        << "user_context=" << user_context << " "
+                        << "block_allocator=" << (void *)(block_allocator) << " "
+                        << "request_size=" << (uint32_t)(request.size) << " "
+                        << "device=" << (void *)(device) << " "
+                        << "physical_device=" << (void *)(physical_device) << ") ...\n";
 #endif
-    halide_abort_if_false(user_context, device != nullptr);
-    halide_abort_if_false(user_context, physical_device != nullptr);
-    halide_abort_if_false(user_context, block_allocator != nullptr);
+
+    if ((device == nullptr) || (physical_device == nullptr)) {
+        error(user_context) << "VulkanMemoryAllocator: Unable to reserve memory! Invalid device handle!\n";
+        return nullptr;
+    }
+
+    if (block_allocator == nullptr) {
+        error(user_context) << "VulkanMemoryAllocator: Unable to reserve memory! 
Invalid block allocator!\n"; + return nullptr; + } + return block_allocator->reserve(this, request); } void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Mapping region (" - << "user_context=" << user_context << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << " " - << "region=" << (void *)(region) << " " - << "region_size=" << (uint32_t)region->size << " " - << "region_offset=" << (uint32_t)region->offset << " " - << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; + debug(user_context) << "VulkanMemoryAllocator: Mapping region (" + << "user_context=" << user_context << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; #endif - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, physical_device != nullptr); - halide_abort_if_false(user_context, block_allocator != nullptr); + if ((device == nullptr) || (physical_device == nullptr)) { + error(user_context) << "VulkanMemoryAllocator: Unable to map memory! Invalid device handle!\n"; + return nullptr; + } + + if (block_allocator == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Unable to map memory! Invalid block allocator!\n"; + return nullptr; + } MemoryRegion *owner = owner_of(user_context, region); RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { - error(nullptr) << "VulkanMemoryAllocator: Unable to map region! Invalid region allocator handle!\n"; - return nullptr; + error(user_context) << "VulkanMemoryAllocator: Unable to map region! Invalid region allocator handle!\n"; + return nullptr; // NOTE: caller must handle nullptr } BlockResource *block_resource = region_allocator->block_resource(); if (block_resource == nullptr) { - error(nullptr) << "VulkanMemoryAllocator: Unable to map region! Invalid block resource handle!\n"; - return nullptr; + error(user_context) << "VulkanMemoryAllocator: Unable to map region! Invalid block resource handle!\n"; + return nullptr; // NOTE: caller must handle nullptr } VkDeviceMemory *device_memory = reinterpret_cast(block_resource->memory.handle); if (device_memory == nullptr) { - error(nullptr) << "VulkanMemoryAllocator: Unable to map region! Invalid device memory handle!\n"; - return nullptr; + error(user_context) << "VulkanMemoryAllocator: Unable to map region! Invalid device memory handle!\n"; + return nullptr; // NOTE: caller must handle nullptr } uint8_t *mapped_ptr = nullptr; VkDeviceSize memory_offset = region->offset + region->range.head_offset; VkDeviceSize memory_size = region->size - region->range.tail_offset - region->range.head_offset; - halide_abort_if_false(user_context, (region->size - region->range.tail_offset - region->range.head_offset) > 0); + if (((double)region->size - (double)region->range.tail_offset - (double)region->range.head_offset) <= 0.0) { + error(user_context) << "VulkanMemoryAllocator: Unable to map region! 
Invalid memory range !\n"; + return nullptr; + } VkResult result = vkMapMemory(device, *device_memory, memory_offset, memory_size, 0, (void **)(&mapped_ptr)); if (result != VK_SUCCESS) { error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n"; @@ -244,98 +279,126 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { return mapped_ptr; } -void VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { +int VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Unmapping region (" - << "user_context=" << user_context << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << " " - << "region=" << (void *)(region) << " " - << "region_size=" << (uint32_t)region->size << " " - << "region_offset=" << (uint32_t)region->offset << " " - << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; + debug(user_context) << "VulkanMemoryAllocator: Unmapping region (" + << "user_context=" << user_context << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; #endif - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, physical_device != nullptr); + if ((device == nullptr) || (physical_device == nullptr)) { + error(user_context) << "VulkanMemoryAllocator: Unable to unmap region! Invalid device handle!\n"; + return halide_error_code_generic_error; + } MemoryRegion *owner = owner_of(user_context, region); RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { - error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid region allocator handle!\n"; - return; + error(user_context) << "VulkanMemoryAllocator: Unable to unmap region! Invalid region allocator handle!\n"; + return halide_error_code_internal_error; } BlockResource *block_resource = region_allocator->block_resource(); if (block_resource == nullptr) { - error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid block resource handle!\n"; - return; + error(user_context) << "VulkanMemoryAllocator: Unable to unmap region! Invalid block resource handle!\n"; + return halide_error_code_internal_error; } VkDeviceMemory *device_memory = reinterpret_cast(block_resource->memory.handle); if (device_memory == nullptr) { - error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid device memory handle!\n"; - return; + error(user_context) << "VulkanMemoryAllocator: Unable to unmap region! 
Invalid device memory handle!\n"; + return halide_error_code_internal_error; } vkUnmapMemory(device, *device_memory); + return halide_error_code_success; } MemoryRegion *VulkanMemoryAllocator::create_crop(void *user_context, MemoryRegion *region, uint64_t offset) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Cropping region (" - << "user_context=" << user_context << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << " " - << "region=" << (void *)(region) << " " - << "region_size=" << (uint32_t)region->size << " " - << "region_offset=" << (uint32_t)region->offset << " " - << "crop_offset=" << (int64_t)offset << ") ...\n"; + debug(user_context) << "VulkanMemoryAllocator: Cropping region (" + << "user_context=" << user_context << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (int64_t)offset << ") ...\n"; #endif - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, physical_device != nullptr); + if ((device == nullptr) || (physical_device == nullptr)) { + error(user_context) << "VulkanMemoryAllocator: Unable to crop region! Invalid device handle!\n"; + return nullptr; + } MemoryRegion *owner = owner_of(user_context, region); RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { - error(nullptr) << "VulkanMemoryAllocator: Unable to unmap region! Invalid region allocator handle!\n"; - return nullptr; + error(user_context) << "VulkanMemoryAllocator: Unable to unmap region! Invalid region allocator handle!\n"; + return nullptr; // NOTE: caller must handle nullptr } // increment usage count - region_allocator->retain(this, owner); + int error_code = region_allocator->retain(this, owner); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanMemoryAllocator: Unable to crop region! Failed to retain memory region!\n"; + return nullptr; // NOTE: caller must handle nullptr + } // create a new region to return, and copy all the other region's properties const BlockAllocator::MemoryAllocators &allocators = block_allocator->current_allocators(); - halide_abort_if_false(user_context, allocators.system.allocate != nullptr); - MemoryRegion *result = reinterpret_cast( + if (allocators.system.allocate == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Unable to create crop! Missing system allocator interface!\n"; + return nullptr; + } + + MemoryRegion *memory_region = reinterpret_cast( allocators.system.allocate(user_context, sizeof(MemoryRegion))); - halide_abort_if_false(user_context, result != nullptr); - memcpy(result, owner, sizeof(MemoryRegion)); + if (memory_region == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Failed to allocate memory region! 
Out of memory!\n"; + return nullptr; + } + memcpy(memory_region, owner, sizeof(MemoryRegion)); // point the handle to the owner of the allocated region, and update the head offset - result->is_owner = false; - result->handle = (void *)owner; - result->range.head_offset = owner->range.head_offset + offset; - return result; + memory_region->is_owner = false; + memory_region->handle = (void *)owner; + memory_region->range.head_offset = owner->range.head_offset + offset; + return memory_region; } -void VulkanMemoryAllocator::destroy_crop(void *user_context, MemoryRegion *region) { +int VulkanMemoryAllocator::destroy_crop(void *user_context, MemoryRegion *region) { + if (region == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Failed to destroy crop! Invalid memory region!\n"; + return halide_error_code_generic_error; + } MemoryRegion *owner = owner_of(user_context, region); RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, owner); if (region_allocator == nullptr) { - error(nullptr) << "VulkanMemoryAllocator: Unable to destroy crop region! Invalid region allocator handle!\n"; - return; + error(user_context) << "VulkanMemoryAllocator: Unable to destroy crop region! Invalid region allocator handle!\n"; + return halide_error_code_internal_error; } // decrement usage count - region_allocator->release(this, owner); + int error_code = region_allocator->release(this, owner); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanBlockAllocator: Unable to destroy crop region! Region allocator failed to release memory region!\n"; + return error_code; + } // discard the copied region struct const BlockAllocator::MemoryAllocators &allocators = block_allocator->current_allocators(); - halide_abort_if_false(user_context, allocators.system.deallocate != nullptr); + if (allocators.system.deallocate == nullptr) { + error(user_context) << "VulkanBlockAllocator: Unable to destroy crop region! Missing system allocator interface!\n"; + return halide_error_code_internal_error; + } allocators.system.deallocate(user_context, region); + return halide_error_code_success; } MemoryRegion *VulkanMemoryAllocator::owner_of(void *user_context, MemoryRegion *region) { @@ -347,77 +410,104 @@ MemoryRegion *VulkanMemoryAllocator::owner_of(void *user_context, MemoryRegion * } } -void VulkanMemoryAllocator::release(void *user_context, MemoryRegion *region) { +int VulkanMemoryAllocator::release(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Releasing region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " - << "size=" << (uint32_t)region->size << " " - << "offset=" << (uint32_t)region->offset << ") ...\n"; + debug(user_context) << "VulkanMemoryAllocator: Releasing region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, physical_device != nullptr); - + if ((device == nullptr) || (physical_device == nullptr)) { + error(user_context) << "VulkanMemoryAllocator: Unable to release region! Invalid device handle!\n"; + return halide_error_code_generic_error; + } + if (block_allocator == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Unable to release region! 
Invalid block allocator!\n"; + return halide_error_code_generic_error; + } return block_allocator->release(this, region); } -void VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { +int VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Reclaiming region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " - << "size=" << (uint32_t)region->size << " " - << "offset=" << (uint32_t)region->offset << ") ...\n"; + debug(user_context) << "VulkanMemoryAllocator: Reclaiming region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, physical_device != nullptr); - + if ((device == nullptr) || (physical_device == nullptr)) { + error(user_context) << "VulkanMemoryAllocator: Unable to reclaim region! Invalid device handle!\n"; + return halide_error_code_generic_error; + } + if (block_allocator == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Unable to reclaim region! Invalid block allocator!\n"; + return halide_error_code_generic_error; + } return block_allocator->reclaim(this, region); } -void VulkanMemoryAllocator::retain(void *user_context, MemoryRegion *region) { +int VulkanMemoryAllocator::retain(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Retaining region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " - << "size=" << (uint32_t)region->size << " " - << "offset=" << (uint32_t)region->offset << ") ...\n"; + debug(user_context) << "VulkanMemoryAllocator: Retaining region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif + if ((device == nullptr) || (physical_device == nullptr)) { + error(user_context) << "VulkanMemoryAllocator: Unable to retain region! Invalid device handle!\n"; + return halide_error_code_generic_error; + } + if (block_allocator == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Unable to retain region! Invalid block allocator!\n"; + return halide_error_code_generic_error; + } return block_allocator->retain(this, region); } bool VulkanMemoryAllocator::collect(void *user_context) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Collecting unused memory (" - << "user_context=" << user_context << ") ... \n"; + debug(user_context) << "VulkanMemoryAllocator: Collecting unused memory (" + << "user_context=" << user_context << ") ... \n"; #endif - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, physical_device != nullptr); - + if ((device == nullptr) || (physical_device == nullptr) || (block_allocator == nullptr)) { + return false; + } return block_allocator->collect(this); } -void VulkanMemoryAllocator::release(void *user_context) { +int VulkanMemoryAllocator::release(void *user_context) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Releasing block allocator (" - << "user_context=" << user_context << ") ... 
\n"; + debug(user_context) << "VulkanMemoryAllocator: Releasing block allocator (" + << "user_context=" << user_context << ") ... \n"; #endif - halide_abort_if_false(user_context, device != nullptr); - halide_abort_if_false(user_context, physical_device != nullptr); + if ((device == nullptr) || (physical_device == nullptr)) { + error(user_context) << "VulkanMemoryAllocator: Unable to release allocator! Invalid device handle!\n"; + return halide_error_code_generic_error; + } + if (block_allocator == nullptr) { + error(user_context) << "VulkanMemoryAllocator: Unable to release allocator! Invalid block allocator!\n"; + return halide_error_code_generic_error; + } - block_allocator->release(this); + return block_allocator->release(this); } -void VulkanMemoryAllocator::destroy(void *user_context) { +int VulkanMemoryAllocator::destroy(void *user_context) { #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Destroying allocator (" - << "user_context=" << user_context << ") ... \n"; + debug(user_context) << "VulkanMemoryAllocator: Destroying allocator (" + << "user_context=" << user_context << ") ... \n"; #endif - block_allocator->destroy(this); + if (block_allocator != nullptr) { + block_allocator->destroy(this); + } region_count = 0; region_byte_count = 0; block_count = 0; block_byte_count = 0; + return halide_error_code_success; } const VulkanMemoryConfig & @@ -428,29 +518,39 @@ VulkanMemoryAllocator::default_config() { // -- -void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *block) { - VulkanMemoryAllocator *instance = reinterpret_cast(user_context); - halide_abort_if_false(user_context, instance != nullptr); - halide_abort_if_false(user_context, instance->device != nullptr); - halide_abort_if_false(user_context, instance->physical_device != nullptr); - halide_abort_if_false(user_context, block != nullptr); +int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block) { + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanBlockAllocator: Unable to deallocate block! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + + if (block == nullptr) { + error(user_context) << "VulkanBlockAllocator: Unable to deallocate block! Invalid pointer!\n"; + return halide_error_code_internal_error; + } #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" - << "user_context=" << user_context << " " - << "block=" << (void *)(block) << " " - << "size=" << (uint64_t)block->size << ", " - << "dedicated=" << (block->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(block->properties.usage) << " " - << "caching=" << halide_memory_caching_name(block->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; + debug(user_context) << "VulkanMemoryAllocator: Allocating block (" + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << " " + << "size=" << (uint64_t)block->size << ", " + << "dedicated=" << (block->dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(block->properties.usage) << " " + << "caching=" << halide_memory_caching_name(block->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; #endif // Find an appropriate memory type given the flags uint32_t memory_type = instance->select_memory_type(user_context, instance->physical_device, block->properties, 0); if (memory_type == invalid_memory_type) { - debug(nullptr) << "VulkanMemoryAllocator: Unable to find appropriate memory type for device!\n"; - return; + error(user_context) << "VulkanMemoryAllocator: Unable to find appropriate memory type for device!\n"; + return halide_error_code_generic_error; } // Allocate memory @@ -463,14 +563,14 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc VkDeviceMemory *device_memory = (VkDeviceMemory *)vk_host_malloc(nullptr, sizeof(VkDeviceMemory), 0, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, instance->alloc_callbacks); if (device_memory == nullptr) { - error(nullptr) << "VulkanBlockAllocator: Unable to allocate block! Failed to allocate device memory handle!\n"; - return; + debug(user_context) << "VulkanBlockAllocator: Unable to allocate block! Failed to allocate device memory handle!\n"; + return halide_error_code_out_of_memory; } VkResult result = vkAllocateMemory(instance->device, &alloc_info, instance->alloc_callbacks, device_memory); if (result != VK_SUCCESS) { - error(nullptr) << "VulkanMemoryAllocator: Allocation failed! vkAllocateMemory returned: " << vk_get_error_name(result) << "\n"; - return; + debug(user_context) << "VulkanMemoryAllocator: Allocation failed! vkAllocateMemory returned: " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_malloc_failed; } uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties); @@ -488,9 +588,9 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc VkBuffer buffer = {0}; result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, &buffer); if (result != VK_SUCCESS) { - error(nullptr) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" - << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; - return; + debug(user_context) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_malloc_failed; } VkMemoryRequirements memory_requirements = {0}; @@ -498,12 +598,12 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" - << "size=" << (uint32_t)block->size << ", " - << "alignment=" << (uint32_t)memory_requirements.alignment << ", " - << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " - << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " - << "dedicated=" << (block->dedicated ? 
"true" : "false") << ")\n"; + debug(user_context) << "VulkanMemoryAllocator: Block allocated (" + << "size=" << (uint32_t)block->size << ", " + << "alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " + << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " + << "dedicated=" << (block->dedicated ? "true" : "false") << ")\n"; #endif if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { @@ -516,39 +616,50 @@ void VulkanMemoryAllocator::allocate_block(void *user_context, MemoryBlock *bloc block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; + return halide_error_code_success; } -void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *block) { +int VulkanMemoryAllocator::deallocate_block(void *instance_ptr, MemoryBlock *block) { + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Deallocating block (" - << "user_context=" << user_context << " " - << "block=" << (void *)(block) << ") ... \n"; + debug(user_context) << "VulkanMemoryAllocator: Deallocating block (" + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << ") ... \n"; #endif - VulkanMemoryAllocator *instance = reinterpret_cast(user_context); - halide_abort_if_false(user_context, instance != nullptr); - halide_abort_if_false(user_context, instance->device != nullptr); - halide_abort_if_false(user_context, instance->physical_device != nullptr); - halide_abort_if_false(user_context, block != nullptr); + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanBlockAllocator: Unable to deallocate block! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + + if (block == nullptr) { + error(user_context) << "VulkanBlockAllocator: Unable to deallocate block! Invalid pointer!\n"; + return halide_error_code_internal_error; + } #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanBlockAllocator: deallocating block (" - << "size=" << (uint32_t)block->size << ", " - << "dedicated=" << (block->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(block->properties.usage) << " " - << "caching=" << halide_memory_caching_name(block->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; + debug(user_context) << "VulkanBlockAllocator: deallocating block (" + << "size=" << (uint32_t)block->size << ", " + << "dedicated=" << (block->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(block->properties.usage) << " " + << "caching=" << halide_memory_caching_name(block->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; #endif if (block->handle == nullptr) { - error(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid handle!\n"; - return; + error(user_context) << "VulkanBlockAllocator: Unable to deallocate block! 
Invalid handle!\n"; + return halide_error_code_internal_error; } VkDeviceMemory *device_memory = reinterpret_cast(block->handle); if (device_memory == nullptr) { - error(nullptr) << "VulkanBlockAllocator: Unable to deallocate block! Invalid device memory handle!\n"; - return; + error(user_context) << "VulkanBlockAllocator: Unable to deallocate block! Invalid device memory handle!\n"; + return halide_error_code_internal_error; } vkFreeMemory(instance->device, *device_memory, instance->alloc_callbacks); @@ -570,6 +681,7 @@ void VulkanMemoryAllocator::deallocate_block(void *user_context, MemoryBlock *bl block->handle = nullptr; vk_host_free(nullptr, device_memory, instance->alloc_callbacks); device_memory = nullptr; + return halide_error_code_success; } size_t VulkanMemoryAllocator::blocks_allocated() const { @@ -630,8 +742,8 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, break; case MemoryCaching::InvalidCaching: default: - error(nullptr) << "VulkanMemoryAllocator: Unable to convert type! Invalid memory caching request!\n\t" - << "caching=" << halide_memory_caching_name(properties.caching) << "\n"; + error(user_context) << "VulkanMemoryAllocator: Unable to convert type! Invalid memory caching request!\n\t" + << "caching=" << halide_memory_caching_name(properties.caching) << "\n"; return invalid_memory_type; }; @@ -666,10 +778,10 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, } if (result == invalid_memory_type) { - error(nullptr) << "VulkanBlockAllocator: Failed to find appropriate memory type for given properties:\n\t" - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + error(user_context) << "VulkanBlockAllocator: Failed to find appropriate memory type for given properties:\n\t" + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; return invalid_memory_type; } @@ -678,27 +790,38 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, // -- -void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *region) { +int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *region) { + + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Allocating region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + debug(user_context) << "VulkanMemoryAllocator: Allocating region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; #endif - VulkanMemoryAllocator *instance = reinterpret_cast(user_context); - halide_abort_if_false(user_context, instance != nullptr); - halide_abort_if_false(user_context, instance->device != nullptr); - halide_abort_if_false(user_context, instance->physical_device != nullptr); - halide_abort_if_false(user_context, region != nullptr); + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to allocate region! 
Invalid device handle!\n"; + return halide_error_code_internal_error; + } + + if (region == nullptr) { + error(user_context) << "VulkanRegionAllocator: Unable to allocate region! Invalid pointer!\n"; + return halide_error_code_internal_error; + } #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanRegionAllocator: Allocating region (" - << "size=" << (uint32_t)region->size << ", " - << "offset=" << (uint32_t)region->offset << ", " - << "dedicated=" << (region->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(region->properties.usage) << " " - << "caching=" << halide_memory_caching_name(region->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; + debug(user_context) << "VulkanRegionAllocator: Allocating region (" + << "size=" << (uint32_t)region->size << ", " + << "offset=" << (uint32_t)region->offset << ", " + << "dedicated=" << (region->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(region->properties.usage) << " " + << "caching=" << halide_memory_caching_name(region->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; #endif uint32_t usage_flags = instance->select_memory_usage(user_context, region->properties); @@ -714,76 +837,87 @@ void VulkanMemoryAllocator::allocate_region(void *user_context, MemoryRegion *re VkBuffer *buffer = (VkBuffer *)vk_host_malloc(nullptr, sizeof(VkBuffer), 0, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, instance->alloc_callbacks); if (buffer == nullptr) { - error(nullptr) << "VulkanRegionAllocator: Unable to allocate region! Failed to allocate buffer handle!\n"; - return; + error(user_context) << "VulkanRegionAllocator: Unable to allocate region! Failed to allocate buffer handle!\n"; + return halide_error_code_out_of_memory; } VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); if (result != VK_SUCCESS) { - error(nullptr) << "VulkanRegionAllocator: Failed to create buffer!\n\t" - << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; - return; + error(user_context) << "VulkanRegionAllocator: Failed to create buffer!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_malloc_failed; } RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, region); BlockResource *block_resource = region_allocator->block_resource(); if (block_resource == nullptr) { - error(nullptr) << "VulkanBlockAllocator: Unable to allocate region! Invalid block resource handle!\n"; - return; + error(user_context) << "VulkanBlockAllocator: Unable to allocate region! Invalid block resource handle!\n"; + return halide_error_code_internal_error; } VkDeviceMemory *device_memory = reinterpret_cast(block_resource->memory.handle); if (device_memory == nullptr) { - error(nullptr) << "VulkanBlockAllocator: Unable to allocate region! Invalid device memory handle!\n"; - return; + error(user_context) << "VulkanBlockAllocator: Unable to allocate region! 
Invalid device memory handle!\n"; + return halide_error_code_internal_error; } // Finally, bind buffer to the device memory result = vkBindBufferMemory(instance->device, *buffer, *device_memory, region->offset); if (result != VK_SUCCESS) { - error(nullptr) << "VulkanRegionAllocator: Failed to bind buffer!\n\t" - << "vkBindBufferMemory returned: " << vk_get_error_name(result) << "\n"; - return; + error(user_context) << "VulkanRegionAllocator: Failed to bind buffer!\n\t" + << "vkBindBufferMemory returned: " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } region->handle = (void *)buffer; region->is_owner = true; instance->region_byte_count += region->size; instance->region_count++; + return halide_error_code_success; } -void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion *region) { +int VulkanMemoryAllocator::deallocate_region(void *instance_ptr, MemoryRegion *region) { + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Deallocating region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + debug(user_context) << "VulkanMemoryAllocator: Deallocating region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; #endif - VulkanMemoryAllocator *instance = reinterpret_cast(user_context); - halide_abort_if_false(user_context, instance != nullptr); - halide_abort_if_false(user_context, instance->device != nullptr); - halide_abort_if_false(user_context, instance->physical_device != nullptr); - halide_abort_if_false(user_context, region != nullptr); + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to deallocate region! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + + if (region == nullptr) { + error(user_context) << "VulkanRegionAllocator: Unable to deallocate region! Invalid pointer!\n"; + return halide_error_code_internal_error; + } #if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanRegionAllocator: Deallocating region (" - << "size=" << (uint32_t)region->size << ", " - << "offset=" << (uint32_t)region->offset << ", " - << "dedicated=" << (region->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(region->properties.usage) << " " - << "caching=" << halide_memory_caching_name(region->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; + debug(user_context) << "VulkanRegionAllocator: Deallocating region (" + << "size=" << (uint32_t)region->size << ", " + << "offset=" << (uint32_t)region->offset << ", " + << "dedicated=" << (region->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(region->properties.usage) << " " + << "caching=" << halide_memory_caching_name(region->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; #endif if (region->handle == nullptr) { - error(nullptr) << "VulkanRegionAllocator: Unable to deallocate region! Invalid handle!\n"; - return; + error(user_context) << "VulkanRegionAllocator: Unable to deallocate region! 
Invalid handle!\n"; + return halide_error_code_internal_error; } VkBuffer *buffer = reinterpret_cast(region->handle); if (buffer == nullptr) { - error(nullptr) << "VulkanRegionAllocator: Unable to deallocate region! Invalid buffer handle!\n"; - return; + error(user_context) << "VulkanRegionAllocator: Unable to deallocate region! Invalid buffer handle!\n"; + return halide_error_code_internal_error; } vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); @@ -793,6 +927,7 @@ void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion * } else { error(nullptr) << "VulkanRegionAllocator: Region counter invalid ... reseting to zero!\n"; instance->region_count = 0; + return halide_error_code_internal_error; } if (int64_t(instance->region_byte_count) - int64_t(region->size) >= 0) { @@ -800,9 +935,11 @@ void VulkanMemoryAllocator::deallocate_region(void *user_context, MemoryRegion * } else { error(nullptr) << "VulkanRegionAllocator: Region byte counter invalid ... reseting to zero!\n"; instance->region_byte_count = 0; + return halide_error_code_internal_error; } vk_host_free(nullptr, buffer, instance->alloc_callbacks); buffer = nullptr; + return halide_error_code_success; } size_t VulkanMemoryAllocator::regions_allocated() const { diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 4ab345411924..5b7a484fa225 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -73,15 +73,19 @@ namespace { // internalize // -------------------------------------------------------------------------- -VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool) { +int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, uint32_t queue_index, VkCommandPool *command_pool) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_create_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "queue_index: " << queue_index << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create command pool ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + VkCommandPoolCreateInfo command_pool_info = { VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, // struct type @@ -89,30 +93,45 @@ VkResult vk_create_command_pool(void *user_context, VulkanMemoryAllocator *alloc VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, // flags. 
Assume transient short-lived single-use command buffers queue_index // queue family index corresponding to the compute command queue }; - return vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); + + VkResult result = vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: Failed to create command pool!\n"; + return halide_error_code_generic_error; + } + return halide_error_code_success; } -void vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool) { +int vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_destroy_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "command_pool: " << (void *)command_pool << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + vkDestroyCommandPool(allocator->current_device(), command_pool, allocator->callbacks()); + return halide_error_code_success; } // -- -VkResult vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer *command_buffer) { +int vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer *command_buffer) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_create_command_buffer (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "command_pool: " << (void *)command_pool << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + VkCommandBufferAllocateInfo command_buffer_info = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, // struct type @@ -122,29 +141,39 @@ VkResult vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *all 1 // number to allocate }; - return vkAllocateCommandBuffers(allocator->current_device(), &command_buffer_info, command_buffer); + VkResult result = vkAllocateCommandBuffers(allocator->current_device(), &command_buffer_info, command_buffer); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: Failed to allocate command buffers!\n"; + return halide_error_code_generic_error; + } + return halide_error_code_success; } -void vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer command_buffer) { +int vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocator, VkCommandPool command_pool, VkCommandBuffer command_buffer) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_destroy_command_buffer (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "command_pool: " << (void *)command_pool << ", " << "command_buffer: " << (void *)command_buffer << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to destroy command buffer ... 
invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + vkFreeCommandBuffers(allocator->current_device(), command_pool, 1, &command_buffer); + return halide_error_code_success; } -VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, - VkDevice device, - VkCommandBuffer command_buffer, - VkPipeline compute_pipeline, - VkPipelineLayout pipeline_layout, - VkDescriptorSet descriptor_set, - uint32_t descriptor_set_index, - int blocksX, int blocksY, int blocksZ) { +int vk_fill_command_buffer_with_dispatch_call(void *user_context, + VkDevice device, + VkCommandBuffer command_buffer, + VkPipeline compute_pipeline, + VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, + uint32_t descriptor_set_index, + int blocksX, int blocksY, int blocksZ) { #ifdef DEBUG_RUNTIME debug(user_context) @@ -167,7 +196,7 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + return halide_error_code_generic_error; } vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute_pipeline); @@ -178,13 +207,13 @@ VkResult vk_fill_command_buffer_with_dispatch_call(void *user_context, result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return result; + return halide_error_code_generic_error; } - return VK_SUCCESS; + return halide_error_code_success; } -VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer) { +int vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_submit_command_buffer (user_context: " << user_context << ", " @@ -207,10 +236,10 @@ VkResult vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBu VkResult result = vkQueueSubmit(queue, 1, &submit_info, 0); if (result != VK_SUCCESS) { - debug(user_context) << "vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } - return VK_SUCCESS; + return halide_error_code_success; } // -- @@ -249,19 +278,22 @@ uint32_t vk_count_bindings_for_descriptor_set(void *user_context, // -- -VkResult vk_create_descriptor_pool(void *user_context, - VulkanMemoryAllocator *allocator, - uint32_t uniform_buffer_count, - uint32_t storage_buffer_count, - VkDescriptorPool *descriptor_pool) { +int vk_create_descriptor_pool(void *user_context, + VulkanMemoryAllocator *allocator, + uint32_t uniform_buffer_count, + uint32_t storage_buffer_count, + VkDescriptorPool *descriptor_pool) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_create_descriptor_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "uniform_buffer_count: " << (uint32_t)uniform_buffer_count << ", " << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create descriptor pool ... 
invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } BlockStorage::Config pool_config; pool_config.entry_size = sizeof(VkDescriptorPoolSize); @@ -296,43 +328,49 @@ VkResult vk_create_descriptor_pool(void *user_context, VkResult result = vkCreateDescriptorPool(allocator->current_device(), &descriptor_pool_info, allocator->callbacks(), descriptor_pool); if (result != VK_SUCCESS) { - debug(user_context) << "Vulkan: Failed to create descriptor pool! vkCreateDescriptorPool returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: Failed to create descriptor pool! vkCreateDescriptorPool returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } - return VK_SUCCESS; + return halide_error_code_success; } -VkResult vk_destroy_descriptor_pool(void *user_context, - VulkanMemoryAllocator *allocator, - VkDescriptorPool descriptor_pool) { +int vk_destroy_descriptor_pool(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorPool descriptor_pool) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_destroy_descriptor_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } vkDestroyDescriptorPool(allocator->current_device(), descriptor_pool, allocator->callbacks()); - return VK_SUCCESS; + return halide_error_code_success; } // -- -VkResult vk_create_descriptor_set_layout(void *user_context, - VulkanMemoryAllocator *allocator, - uint32_t uniform_buffer_count, - uint32_t storage_buffer_count, - VkDescriptorSetLayout *layout) { +int vk_create_descriptor_set_layout(void *user_context, + VulkanMemoryAllocator *allocator, + uint32_t uniform_buffer_count, + uint32_t storage_buffer_count, + VkDescriptorSetLayout *layout) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_create_descriptor_set_layout (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "uniform_buffer_count: " << uniform_buffer_count << ", " << "storage_buffer_count: " << storage_buffer_count << ", " << "layout: " << (void *)layout << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create descriptor set layout ... 
invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } BlockStorage::Config layout_config; layout_config.entry_size = sizeof(VkDescriptorSetLayoutBinding); @@ -389,35 +427,48 @@ VkResult vk_create_descriptor_set_layout(void *user_context, VkResult result = vkCreateDescriptorSetLayout(allocator->current_device(), &layout_info, allocator->callbacks(), layout); if (result != VK_SUCCESS) { error(user_context) << "vkCreateDescriptorSetLayout returned " << vk_get_error_name(result) << "\n"; - return result; + return halide_error_code_generic_error; } - return VK_SUCCESS; + return halide_error_code_success; } -VkResult vk_destroy_descriptor_set_layout(void *user_context, - VulkanMemoryAllocator *allocator, - VkDescriptorSetLayout descriptor_set_layout) { +int vk_destroy_descriptor_set_layout(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout descriptor_set_layout) { +#ifdef DEBUG_RUNTIME + debug(user_context) + << " vk_destroy_descriptor_set_layout (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "layout: " << (void *)descriptor_set_layout << ")\n"; +#endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } vkDestroyDescriptorSetLayout(allocator->current_device(), descriptor_set_layout, allocator->callbacks()); - return VK_SUCCESS; + return halide_error_code_success; } // -- -VkResult vk_create_descriptor_set(void *user_context, - VulkanMemoryAllocator *allocator, - VkDescriptorSetLayout descriptor_set_layout, - VkDescriptorPool descriptor_pool, - VkDescriptorSet *descriptor_set) { +int vk_create_descriptor_set(void *user_context, + VulkanMemoryAllocator *allocator, + VkDescriptorSetLayout descriptor_set_layout, + VkDescriptorPool descriptor_pool, + VkDescriptorSet *descriptor_set) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_create_descriptor_set (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "descriptor_set_layout: " << (void *)descriptor_set_layout << ", " << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create descriptor set ... 
invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } VkDescriptorSetAllocateInfo descriptor_set_info = { @@ -430,32 +481,35 @@ VkResult vk_create_descriptor_set(void *user_context, VkResult result = vkAllocateDescriptorSets(allocator->current_device(), &descriptor_set_info, descriptor_set); if (result != VK_SUCCESS) { - debug(user_context) << "vkAllocateDescriptorSets returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkAllocateDescriptorSets returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } - return VK_SUCCESS; + return halide_error_code_success; } -VkResult vk_update_descriptor_set(void *user_context, - VulkanMemoryAllocator *allocator, - VkBuffer *scalar_args_buffer, - size_t uniform_buffer_count, - size_t storage_buffer_count, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[], - VkDescriptorSet descriptor_set) { +int vk_update_descriptor_set(void *user_context, + VulkanMemoryAllocator *allocator, + VkBuffer *scalar_args_buffer, + size_t uniform_buffer_count, + size_t storage_buffer_count, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[], + VkDescriptorSet descriptor_set) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_update_descriptor_set (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "scalar_args_buffer: " << (void *)scalar_args_buffer << ", " << "uniform_buffer_count: " << (uint32_t)uniform_buffer_count << ", " << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ", " << "descriptor_set: " << (void *)descriptor_set << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } BlockStorage::Config dbi_config; dbi_config.minimum_capacity = storage_buffer_count + uniform_buffer_count; @@ -511,7 +565,7 @@ VkResult vk_update_descriptor_set(void *user_context, VkBuffer *device_buffer = reinterpret_cast(owner->handle); if (device_buffer == nullptr) { error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; - return VK_ERROR_INITIALIZATION_FAILED; + return halide_error_code_internal_error; } VkDeviceSize range_offset = device_region->range.head_offset; @@ -551,7 +605,7 @@ VkResult vk_update_descriptor_set(void *user_context, // issue the update call to populate the descriptor set vkUpdateDescriptorSets(allocator->current_device(), (uint32_t)write_descriptor_set.size(), (const VkWriteDescriptorSet *)write_descriptor_set.data(), 0, nullptr); - return VK_SUCCESS; + return halide_error_code_success; } // -- @@ -579,10 +633,14 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, debug(user_context) << " vk_create_scalar_uniform_buffer (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "scalar_buffer_size: " << (uint32_t)scalar_buffer_size << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... 
invalid allocator pointer!\n"; + return nullptr; + } + MemoryRequest request = {0}; request.size = scalar_buffer_size; request.properties.usage = MemoryUsage::UniformStorage; @@ -592,7 +650,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, // allocate a new region MemoryRegion *region = allocator->reserve(user_context, request); if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to allocate device memory!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!\n"; return nullptr; } @@ -600,12 +658,12 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, return region; } -VkResult vk_update_scalar_uniform_buffer(void *user_context, - VulkanMemoryAllocator *allocator, - MemoryRegion *region, - size_t arg_sizes[], - void *args[], - int8_t arg_is_buffer[]) { +int vk_update_scalar_uniform_buffer(void *user_context, + VulkanMemoryAllocator *allocator, + MemoryRegion *region, + size_t arg_sizes[], + void *args[], + int8_t arg_is_buffer[]) { #ifdef DEBUG_RUNTIME debug(user_context) @@ -613,16 +671,21 @@ VkResult vk_update_scalar_uniform_buffer(void *user_context, << "region: " << (void *)region << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Invalid memory region used for updating scalar uniform buffer!\n"; - return VK_INCOMPLETE; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid memory region!\n"; + return halide_error_code_internal_error; } // map the region to a host ptr uint8_t *host_ptr = (uint8_t *)allocator->map(user_context, region); if (host_ptr == nullptr) { - error(user_context) << "Vulkan: Failed to map host pointer to device memory!\n"; - return VK_INCOMPLETE; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!\n"; + return halide_error_code_internal_error; } // copy to the (host-visible/coherent) scalar uniform buffer @@ -636,43 +699,56 @@ VkResult vk_update_scalar_uniform_buffer(void *user_context, // unmap the pointer to the buffer for the region allocator->unmap(user_context, region); - return VK_SUCCESS; + return halide_error_code_success; } -void vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, - MemoryRegion *scalar_args_region) { +int vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator *allocator, + MemoryRegion *scalar_args_region) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_destroy_scalar_uniform_buffer (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "scalar_args_region: " << (void *)scalar_args_region << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ... 
invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } if (!scalar_args_region) { - return; + return halide_error_code_success; + } + + int error_code = halide_error_code_success; + if (halide_can_reuse_device_allocations(user_context)) { + error_code = allocator->release(user_context, scalar_args_region); + } else { + error_code = allocator->reclaim(user_context, scalar_args_region); } - allocator->reclaim(user_context, scalar_args_region); + return error_code; } // -- -VkResult vk_create_pipeline_layout(void *user_context, - VulkanMemoryAllocator *allocator, - uint32_t descriptor_set_count, - VkDescriptorSetLayout *descriptor_set_layouts, - VkPipelineLayout *pipeline_layout) { +int vk_create_pipeline_layout(void *user_context, + VulkanMemoryAllocator *allocator, + uint32_t descriptor_set_count, + VkDescriptorSetLayout *descriptor_set_layouts, + VkPipelineLayout *pipeline_layout) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_create_pipeline_layout (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "descriptor_set_count: " << descriptor_set_count << ", " << "descriptor_set_layouts: " << (void *)descriptor_set_layouts << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create pipeline layout ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } VkPipelineLayoutCreateInfo pipeline_layout_info = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // structure type @@ -686,46 +762,53 @@ VkResult vk_create_pipeline_layout(void *user_context, VkResult result = vkCreatePipelineLayout(allocator->current_device(), &pipeline_layout_info, allocator->callbacks(), pipeline_layout); if (result != VK_SUCCESS) { - debug(user_context) << "vkCreatePipelineLayout returned " << vk_get_error_name(result) << "\n"; - return result; + error(user_context) << "Vulkan: vkCreatePipelineLayout returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; } - return VK_SUCCESS; + return halide_error_code_success; } -VkResult vk_destroy_pipeline_layout(void *user_context, - VulkanMemoryAllocator *allocator, - VkPipelineLayout pipeline_layout) { +int vk_destroy_pipeline_layout(void *user_context, + VulkanMemoryAllocator *allocator, + VkPipelineLayout pipeline_layout) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_destroy_pipeline_layout (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to destroy pipeline layout ... 
invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + vkDestroyPipelineLayout(allocator->current_device(), pipeline_layout, allocator->callbacks()); - return VK_SUCCESS; + return halide_error_code_success; } // -- -VkResult vk_create_compute_pipeline(void *user_context, - VulkanMemoryAllocator *allocator, - const char *pipeline_name, - VkShaderModule shader_module, - VkPipelineLayout pipeline_layout, - VkSpecializationInfo *specialization_info, - VkPipeline *compute_pipeline) { +int vk_create_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + const char *pipeline_name, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + VkSpecializationInfo *specialization_info, + VkPipeline *compute_pipeline) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_create_compute_pipeline (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "shader_module: " << (void *)shader_module << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to create compute pipeline ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } VkComputePipelineCreateInfo compute_pipeline_info = { @@ -750,36 +833,52 @@ VkResult vk_create_compute_pipeline(void *user_context, VkResult result = vkCreateComputePipelines(allocator->current_device(), 0, 1, &compute_pipeline_info, allocator->callbacks(), compute_pipeline); if (result != VK_SUCCESS) { error(user_context) << "Vulkan: Failed to create compute pipeline! vkCreateComputePipelines returned " << vk_get_error_name(result) << "\n"; - return result; + return halide_error_code_generic_error; } - return VK_SUCCESS; + return halide_error_code_success; } -VkResult vk_setup_compute_pipeline(void *user_context, - VulkanMemoryAllocator *allocator, - VulkanShaderBinding *shader_bindings, - VulkanDispatchData *dispatch_data, - VkShaderModule shader_module, - VkPipelineLayout pipeline_layout, - VkPipeline *compute_pipeline) { +int vk_setup_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + VulkanShaderBinding *shader_bindings, + VulkanDispatchData *dispatch_data, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + VkPipeline *compute_pipeline) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_setup_compute_pipeline (user_context: " << user_context << ", " << "entry_point_name: '" << shader_bindings->entry_point_name << "', " << "allocator: " << (void *)allocator << ", " - << "device: " << (void *)allocator->current_device() << ", " << "shader_bindings: " << (void *)shader_bindings << ", " + << "dispatch_data: " << (void *)dispatch_data << ", " << "shader_module: " << (void *)shader_module << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif - halide_abort_if_false(user_context, allocator != nullptr); - halide_abort_if_false(user_context, shader_bindings != nullptr); - halide_abort_if_false(user_context, dispatch_data != nullptr); + + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + + if (shader_bindings == nullptr) { + error(user_context) << "Vulkan: Failed to setup compute pipeline ... 
invalid shader bindings!\n"; + return halide_error_code_generic_error; + } + + if (dispatch_data == nullptr) { + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!\n"; + return halide_error_code_generic_error; + } VkResult result = VK_SUCCESS; const char *entry_point_name = shader_bindings->entry_point_name; + if (entry_point_name == nullptr) { + error(user_context) << "Vulkan: Failed to setup compute pipeline ... missing entry point name!\n"; + return halide_error_code_generic_error; + } uint32_t dispatch_constant_index = 0; uint32_t dispatch_constant_ids[4] = {0, 0, 0, 0}; @@ -801,6 +900,7 @@ VkResult vk_setup_compute_pipeline(void *user_context, // dynamic allocation if (shared_mem_constant_id > 0) { error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is suported!!\n"; + result = VK_ERROR_TOO_MANY_OBJECTS; break; } shared_mem_constant_id = allocation->constant_id; @@ -847,7 +947,16 @@ VkResult vk_setup_compute_pipeline(void *user_context, break; } } - halide_abort_if_false(user_context, found_index != invalid_index); + if (found_index == invalid_index) { + error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!\n"; + result = VK_ERROR_INITIALIZATION_FAILED; + } + } + + // don't even attempt to create the pipeline layout if we encountered errors in the shader binding + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: Failed to decode shader bindings! " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; + } // Prepare specialization mapping for all dispatch constants @@ -872,32 +981,38 @@ VkResult vk_setup_compute_pipeline(void *user_context, // Recreate the pipeline with the requested shared memory allocation if (shader_bindings->compute_pipeline) { - vk_destroy_compute_pipeline(user_context, allocator, shader_bindings->compute_pipeline); + int error_code = vk_destroy_compute_pipeline(user_context, allocator, shader_bindings->compute_pipeline); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to destroy compute pipeline!\n"; + return halide_error_code_generic_error; + } shader_bindings->compute_pipeline = {0}; } - result = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, &specialization_info, &(shader_bindings->compute_pipeline)); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_compute_pipeline() failed! Unable to proceed! Error: " << vk_get_error_name(result) << "\n"; + int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, &specialization_info, &(shader_bindings->compute_pipeline)); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + return error_code; + } } else { // Construct and re-use the fixed pipeline if (shader_bindings->compute_pipeline == 0) { - result = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, nullptr, &(shader_bindings->compute_pipeline)); - if (result != VK_SUCCESS) { - error(user_context) << "vk_create_compute_pipeline() failed! Unable to proceed! 
Error: " << vk_get_error_name(result) << "\n"; + int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, nullptr, &(shader_bindings->compute_pipeline)); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + return error_code; } } } - return result; + return halide_error_code_success; } -VkResult vk_destroy_compute_pipeline(void *user_context, - VulkanMemoryAllocator *allocator, - VkPipeline compute_pipeline) { +int vk_destroy_compute_pipeline(void *user_context, + VulkanMemoryAllocator *allocator, + VkPipeline compute_pipeline) { #ifdef DEBUG_RUNTIME debug(user_context) << " vk_destroy_compute_pipeline (user_context: " << user_context << ", " @@ -905,8 +1020,13 @@ VkResult vk_destroy_compute_pipeline(void *user_context, << "device: " << (void *)allocator->current_device() << ", " << "compute_pipeline: " << (void *)compute_pipeline << ")\n"; #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to destroy compute pipeline ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + vkDestroyPipeline(allocator->current_device(), compute_pipeline, allocator->callbacks()); - return VK_SUCCESS; + return halide_error_code_success; } // -------------------------------------------------------------------------- @@ -921,7 +1041,16 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA uint64_t t_before = halide_current_time_ns(user_context); #endif - halide_debug_assert(user_context, module_ptr != nullptr); + + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!\n"; + return nullptr; + } + + if ((module_ptr == nullptr) || (module_size < (2 * sizeof(uint32_t)))) { + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!\n"; + return nullptr; + } // Decode the sidecar for the module that lists the descriptor sets // corresponding to each entry point contained in the module. @@ -963,11 +1092,11 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA // Both vk_decode_shader_bindings() and vk_compile_shader_module() will // need to be updated if the header encoding ever changes! // - halide_debug_assert(user_context, module_size >= (2 * sizeof(uint32_t))); uint32_t module_entries = module_size / sizeof(uint32_t); uint32_t idx = 1; // skip past the header_word_count uint32_t shader_count = module_ptr[idx++]; if (shader_count < 1) { + error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!\n"; return nullptr; // no descriptors } @@ -1138,6 +1267,16 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan uint64_t t_before = halide_current_time_ns(user_context); #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!\n"; + return nullptr; + } + + if ((ptr == nullptr) || (size <= 0)) { + error(user_context) << "Vulkan: Failed to compile shader modules ... 
invalid program source buffer!\n"; + return nullptr; + } + const uint32_t *module_ptr = (const uint32_t *)ptr; const uint32_t module_size = (const uint32_t)size; @@ -1177,12 +1316,17 @@ VulkanCompilationCacheEntry *vk_compile_shader_module(void *user_context, Vulkan } memset(cache_entry, 0, sizeof(VulkanCompilationCacheEntry)); - // decode the entry point data and save it in the cache entry - cache_entry->shader_bindings = vk_decode_shader_bindings(user_context, allocator, module_ptr, module_size); - if (cache_entry->shader_bindings != nullptr) { - cache_entry->shader_count = shader_count; + // decode the entry point data and extract the shader bindings + VulkanShaderBinding *decoded_bindings = vk_decode_shader_bindings(user_context, allocator, module_ptr, module_size); + if (decoded_bindings == nullptr) { + error(user_context) << "Vulkan: Failed to decode shader bindings!\n"; + return nullptr; } + // save the shader bindings in the cache entry + cache_entry->shader_bindings = decoded_bindings; + cache_entry->shader_count = shader_count; + VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &cache_entry->shader_module); if ((result != VK_SUCCESS)) { error(user_context) << "Vulkan: vkCreateShaderModule Failed! Error returned: " << vk_get_error_name(result) << "\n"; @@ -1220,6 +1364,11 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat uint64_t t_before = halide_current_time_ns(user_context); #endif + if (allocator == nullptr) { + error(user_context) << "Vulkan: Failed to destroy shader modules ... invalid allocator pointer!\n"; + return halide_error_code_generic_error; + } + // Functor to match compilation cache destruction call with scoped params struct DestroyShaderModule { void *user_context = nullptr; @@ -1290,7 +1439,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return VK_SUCCESS; + return halide_error_code_success; } // -------------------------------------------------------------------------- @@ -1314,7 +1463,7 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff VkBuffer *dst_buffer = reinterpret_cast(c.dst); if (!src_buffer || !dst_buffer) { error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; - return -1; + return halide_error_code_internal_error; } vkCmdCopyBuffer(command_buffer, *src_buffer, *dst_buffer, 1, &buffer_copy); @@ -1341,7 +1490,7 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff } } } - return 0; + return halide_error_code_success; } int vk_device_crop_from_offset(void *user_context, @@ -1350,7 +1499,8 @@ int vk_device_crop_from_offset(void *user_context, struct halide_buffer_t *dst) { VulkanContext ctx(user_context); - if (ctx.error != VK_SUCCESS) { + if (ctx.error != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to acquire context!\n"; return ctx.error; } @@ -1358,20 +1508,23 @@ int vk_device_crop_from_offset(void *user_context, uint64_t t_before = halide_current_time_ns(user_context); #endif - halide_abort_if_false(user_context, src->device); if (offset < 0) { error(user_context) << "Vulkan: Invalid offset for device crop!\n"; - return halide_error_code_internal_error; + return halide_error_code_device_crop_failed; } // get the allocated region for the device MemoryRegion 
*device_region = reinterpret_cast(src->device); + if (device_region == nullptr) { + error(user_context) << "Vulkan: Failed to crop region! Invalid device region!\n"; + return halide_error_code_device_crop_failed; + } // create the cropped region from the allocated region MemoryRegion *cropped_region = ctx.allocator->create_crop(user_context, device_region, (uint64_t)offset); if ((cropped_region == nullptr) || (cropped_region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to crop region!\n"; - return halide_error_code_internal_error; + error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!\n"; + return halide_error_code_device_crop_failed; } // update the destination to the cropped region @@ -1383,7 +1536,7 @@ int vk_device_crop_from_offset(void *user_context, debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; + return halide_error_code_success; } // -------------------------------------------------------------------------- diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index c7659705cce6..2e61d0abb38c 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -10,7 +10,7 @@ namespace { size_t allocated_region_memory = 0; size_t allocated_block_memory = 0; -void allocate_block(void *user_context, MemoryBlock *block) { +int allocate_block(void *user_context, MemoryBlock *block) { block->handle = allocate_system(user_context, block->size); allocated_block_memory += block->size; @@ -19,9 +19,11 @@ << "block_size=" << int32_t(block->size) << " " << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " << ") !\n"; + + return halide_error_code_success; } -void deallocate_block(void *user_context, MemoryBlock *block) { +int deallocate_block(void *user_context, MemoryBlock *block) { deallocate_system(user_context, block->handle); allocated_block_memory -= block->size; @@ -30,9 +32,11 @@ << "block_size=" << int32_t(block->size) << " " << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " << ") !\n"; + + return halide_error_code_success; } -void allocate_region(void *user_context, MemoryRegion *region) { +int allocate_region(void *user_context, MemoryRegion *region) { region->handle = (void *)1; allocated_region_memory += region->size; @@ -41,9 +45,11 @@ << "region_size=" << int32_t(region->size) << " " << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " << ") !\n"; + + return halide_error_code_success; } -void deallocate_region(void *user_context, MemoryRegion *region) { +int deallocate_region(void *user_context, MemoryRegion *region) { region->handle = (void *)0; allocated_region_memory -= region->size; @@ -52,6 +58,8 @@ << "region_size=" << int32_t(region->size) << " " << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " << ") !\n"; + + return halide_error_code_success; } } // end namespace From bf9854e99bc94278a3ab802f61b02b33e0245483 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 18 Feb 2023 08:58:55 -0700 Subject: [PATCH 152/166] Fix typo in error message --- src/CodeGen_Vulkan_Dev.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp 
index 903f49d151f2..ec9a482a9dc8 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2763,7 +2763,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::compile(std::vector &module) { } if (builder.is_capability_required(SpvCapabilityFloat64) && !target.has_feature(Target::VulkanFloat64)) { - user_error << "Vulkan: Code requires 16-bit floating-point support (which is not enabled in the target features)! " + user_error << "Vulkan: Code requires 64-bit floating-point support (which is not enabled in the target features)! " << "Either enable the target feature, or adjust the algorithm to avoid using this data type!"; } From f158b96746aab89c67f049b0205a48bf33e057f5 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 18 Feb 2023 08:59:21 -0700 Subject: [PATCH 153/166] Fix typo in readme --- README_vulkan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_vulkan.md b/README_vulkan.md index 635ef802aaf6..ed8aeadc6bd3 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -185,7 +185,7 @@ Halide will search for the first 'gpu' like device it can find, or fall back to the first compute device it can find. `HL_VK_ALLOC_CONFIG=...` will tell Halide to configure the Vulkan memory -allocator use the given constraints specified as three integer values +allocator use the given constraints specified as four integer values separated by the appropriate environment variable list delimiter (e.g. `N:N:N:N` on Linux/OSX/Posix, or `N;N;N;N` on Windows). These values correspond to `maximum_pool_size`, `minimum_block_size`, `maximum_block_size` From c5b7f944323add48c6a967bd789d63b3aa57f91e Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 18 Feb 2023 08:59:50 -0700 Subject: [PATCH 154/166] Skip GPU allocation cache test on MacOSX since MoltenVK only supports 30 buffers to be allocated --- test/correctness/gpu_allocation_cache.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/correctness/gpu_allocation_cache.cpp b/test/correctness/gpu_allocation_cache.cpp index efdb5ebb1fbe..9c3def15e7e3 100644 --- a/test/correctness/gpu_allocation_cache.cpp +++ b/test/correctness/gpu_allocation_cache.cpp @@ -24,7 +24,10 @@ int main(int argc, char **argv) { printf("[SKIP] Allocation cache not yet implemented for D3D12Compute.\n"); return 0; } - + if (target.has_feature(Target::Vulkan) && ((target.os == Target::IOS) || target.os == Target::OSX)) { + printf("[SKIP] Skipping test for Vulkan on iOS/OSX (MoltenVK only allows 30 buffers to be allocated)!\n"); + return 0; + } const int N = 30; Var x, y, xi, yi; From e867ff393a5ec6e3ab42f935d8fb69fc9924b0ea Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 18 Feb 2023 09:00:52 -0700 Subject: [PATCH 155/166] Skip widening reduction test on Vulkan for Mac OSX/IOS since MoltenVK fails to translate calls with vector types for builtins like min/max. 
etc --- test/correctness/widening_reduction.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/correctness/widening_reduction.cpp b/test/correctness/widening_reduction.cpp index 73abb1fa660a..f392cf021830 100644 --- a/test/correctness/widening_reduction.cpp +++ b/test/correctness/widening_reduction.cpp @@ -7,6 +7,13 @@ using namespace Halide::ConciseCasts; using namespace Halide::Internal; int main(int arch, char **argv) { + + Halide::Target target = get_jit_target_from_environment(); + if (target.has_feature(Target::Vulkan) && ((target.os == Target::IOS) || target.os == Target::OSX)) { + printf("[SKIP] Skipping test for Vulkan on iOS/OSX (MoltenVK fails to convert max/min intrinsics correctly)!\n"); + return 0; + } + const int W = 256, H = 256; Buffer in(W, H); @@ -39,7 +46,6 @@ int main(int arch, char **argv) { f(x, y) = u8_sat(sum(i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y)) / 16); // Schedule. - Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16); } else if (target.has_feature(Target::HVX)) { @@ -81,7 +87,6 @@ int main(int arch, char **argv) { g(x, y) = u8_sat((f(x, y)[0] + f(x, y)[1]) / 16); // Schedule. - Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { g.gpu_tile(x, y, xi, yi, 16, 16); } else if (target.has_feature(Target::HVX)) { @@ -122,7 +127,6 @@ int main(int arch, char **argv) { g(x, y) = u8_sat((f(x, y) + f(x + 1, y)) / 2); // Schedule. - Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { g.gpu_tile(x, y, xi, yi, 16, 16); } else if (target.has_feature(Target::HVX)) { From baa303f1240055f6fed9d35306594cd1e8d9ce83 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 18 Feb 2023 09:02:14 -0700 Subject: [PATCH 156/166] Skip doubles in vector cast test on Vulkan for Mac OSX/IOS since Molten doesn't support them --- test/correctness/vector_cast.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/correctness/vector_cast.cpp b/test/correctness/vector_cast.cpp index c5afbfc330d0..e5fef413729d 100644 --- a/test/correctness/vector_cast.cpp +++ b/test/correctness/vector_cast.cpp @@ -30,6 +30,13 @@ bool is_type_supported(int vec_width, const Target &target) { if (target.has_feature(Target::HVX)) { device = DeviceAPI::Hexagon; } + if (target.has_feature(Target::Vulkan)) { + if (type_of() == Float(64)) { + if ((target.os == Target::OSX || target.os == Target::IOS)) { + return false; // MoltenVK doesn't support Float64 + } + } + } return target.supports_type(type_of().with_lanes(vec_width), device); } From b2186e750e319c33e415519641554a96dc255e40 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Sat, 18 Feb 2023 09:02:54 -0700 Subject: [PATCH 157/166] Skip gpu_dynamic_shared and gpu_specialize test for Vulkan on Mac OSX/IOS since MoltenVK doesn't support the dynamic shared memory allocation or dynamic grid size. 
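For context, the schedules these two tests exercise stage an intermediate Func in GPU shared (workgroup-local) memory whose size is only known at runtime, which is the dynamic shared allocation path called out above. A minimal sketch of such a schedule follows; it is illustrative only and not code from the tests, and the Param name "radius" and the tile size are assumptions:

#include "Halide.h"
#include <cstdio>

using namespace Halide;

int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    if (!target.has_gpu_feature()) {
        printf("[SKIP] No GPU target enabled.\n");
        return 0;
    }

    // Illustrative runtime parameter: the footprint of 'f' needed by each GPU
    // block depends on it, so the shared allocation cannot be sized at compile time.
    Param<int> radius("radius");

    Func f("f"), g("g");
    Var x("x"), xo("xo"), xi("xi");

    f(x) = x;
    g(x) = f(x) + f(x + radius);

    g.gpu_tile(x, xo, xi, 32);
    f.compute_at(g, xo)                    // staged once per GPU block ...
        .store_in(MemoryType::GPUShared);  // ... in dynamically sized shared memory

    radius.set(8);
    g.realize({256}, target);
    printf("Success!\n");
    return 0;
}

In the Vulkan runtime earlier in this series, such allocations are sized through a specialization constant when the compute pipeline is set up (see vk_setup_compute_pipeline); the skips here reflect that MoltenVK does not yet handle that dynamic path.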
--- test/correctness/gpu_dynamic_shared.cpp | 4 ++++ test/correctness/gpu_specialize.cpp | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index 7df99b759d4d..db6ecf73cc45 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -24,6 +24,10 @@ int main(int argc, char **argv) { printf("[SKIP] Vulkan %d.%d is less than required 1.2.\n", major, minor); return 0; } + if ((t.os == Target::IOS) || (t.os == Target::OSX)) { + printf("[SKIP] Skipping test for Vulkan on iOS/OSX (MoltenVK doesn't support dynamic LocalSizeId yet)!\n"); + return 0; + } } // Check dynamic allocations per-block and per-thread into both diff --git a/test/correctness/gpu_specialize.cpp b/test/correctness/gpu_specialize.cpp index f21a61e0fbda..3a849dc5be7f 100644 --- a/test/correctness/gpu_specialize.cpp +++ b/test/correctness/gpu_specialize.cpp @@ -4,10 +4,15 @@ using namespace Halide; int main(int argc, char **argv) { - if (!get_jit_target_from_environment().has_gpu_feature()) { + Halide::Target target = get_jit_target_from_environment(); + if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; } + if (target.has_feature(Target::Vulkan) && ((target.os == Target::IOS) || target.os == Target::OSX)) { + printf("[SKIP] Skipping test for Vulkan on iOS/OSX (MoltenVK doesn't support dynamically allocated shared mem)!\n"); + return 0; + } { Func f("f"), g("g"), h("h"); From 3bc28cf4bc74d14195d0b8130fd4653ecf7fa158 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Thu, 23 Mar 2023 11:06:27 -0700 Subject: [PATCH 158/166] Clang format / tidy pass --- src/FuseGPUThreadLoops.cpp | 2 +- src/runtime/internal/block_allocator.h | 6 +- src/runtime/mini_webgpu.h | 734 +++++++++++----------- test/correctness/gpu_allocation_cache.cpp | 3 +- test/correctness/simd_op_check.h | 2 +- 5 files changed, 374 insertions(+), 373 deletions(-) diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index cf33f343c88a..3c678b5bd693 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -1029,7 +1029,7 @@ class ExtractSharedAndHeapAllocations : public IRMutator { num_threads_var_name(unique_name('t')), may_merge_allocs_of_different_type(device_api != DeviceAPI::OpenGLCompute && device_api != DeviceAPI::D3D12Compute && - device_api != DeviceAPI::Vulkan && + device_api != DeviceAPI::Vulkan && device_api != DeviceAPI::WebGPU) { } }; // namespace Internal diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index e88e82cef22c..498f0c71a84c 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -419,9 +419,9 @@ BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties & RegionAllocator * BlockAllocator::create_region_allocator(void *user_context, BlockResource *block) { #ifdef DEBUG_RUNTIME - debug(user_context) << "BlockAllocator: Creating region allocator (" - << "user_context=" << (void *)(user_context) << " " - << "block_resource=" << (void *)(block) << ")...\n"; + debug(user_context) << "BlockAllocator: Creating region allocator (" + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)(block) << ")...\n"; #endif halide_abort_if_false(user_context, block != nullptr); RegionAllocator *region_allocator = RegionAllocator::create( diff --git a/src/runtime/mini_webgpu.h b/src/runtime/mini_webgpu.h index 
54d2bef37492..d00a5cef43ae 100644 --- a/src/runtime/mini_webgpu.h +++ b/src/runtime/mini_webgpu.h @@ -31,21 +31,21 @@ #define WEBGPU_H_ #if defined(WGPU_SHARED_LIBRARY) -# if defined(_WIN32) -# if defined(WGPU_IMPLEMENTATION) -# define WGPU_EXPORT __declspec(dllexport) -# else -# define WGPU_EXPORT __declspec(dllimport) -# endif -# else // defined(_WIN32) -# if defined(WGPU_IMPLEMENTATION) -# define WGPU_EXPORT __attribute__((visibility("default"))) -# else -# define WGPU_EXPORT -# endif -# endif // defined(_WIN32) -#else // defined(WGPU_SHARED_LIBRARY) -# define WGPU_EXPORT +#if defined(_WIN32) +#if defined(WGPU_IMPLEMENTATION) +#define WGPU_EXPORT __declspec(dllexport) +#else +#define WGPU_EXPORT __declspec(dllimport) +#endif +#else // defined(_WIN32) +#if defined(WGPU_IMPLEMENTATION) +#define WGPU_EXPORT __attribute__((visibility("default"))) +#else +#define WGPU_EXPORT +#endif +#endif // defined(_WIN32) +#else // defined(WGPU_SHARED_LIBRARY) +#define WGPU_EXPORT #endif // defined(WGPU_SHARED_LIBRARY) #define WGPU_ARRAY_LAYER_COUNT_UNDEFINED (0xffffffffUL) @@ -59,30 +59,30 @@ typedef uint32_t WGPUFlags; -typedef struct WGPUAdapterImpl* WGPUAdapter; -typedef struct WGPUBindGroupImpl* WGPUBindGroup; -typedef struct WGPUBindGroupLayoutImpl* WGPUBindGroupLayout; -typedef struct WGPUBufferImpl* WGPUBuffer; -typedef struct WGPUCommandBufferImpl* WGPUCommandBuffer; -typedef struct WGPUCommandEncoderImpl* WGPUCommandEncoder; -typedef struct WGPUComputePassEncoderImpl* WGPUComputePassEncoder; -typedef struct WGPUComputePipelineImpl* WGPUComputePipeline; -typedef struct WGPUDeviceImpl* WGPUDevice; -typedef struct WGPUExternalTextureImpl* WGPUExternalTexture; -typedef struct WGPUInstanceImpl* WGPUInstance; -typedef struct WGPUPipelineLayoutImpl* WGPUPipelineLayout; -typedef struct WGPUQuerySetImpl* WGPUQuerySet; -typedef struct WGPUQueueImpl* WGPUQueue; -typedef struct WGPURenderBundleImpl* WGPURenderBundle; -typedef struct WGPURenderBundleEncoderImpl* WGPURenderBundleEncoder; -typedef struct WGPURenderPassEncoderImpl* WGPURenderPassEncoder; -typedef struct WGPURenderPipelineImpl* WGPURenderPipeline; -typedef struct WGPUSamplerImpl* WGPUSampler; -typedef struct WGPUShaderModuleImpl* WGPUShaderModule; -typedef struct WGPUSurfaceImpl* WGPUSurface; -typedef struct WGPUSwapChainImpl* WGPUSwapChain; -typedef struct WGPUTextureImpl* WGPUTexture; -typedef struct WGPUTextureViewImpl* WGPUTextureView; +typedef struct WGPUAdapterImpl *WGPUAdapter; +typedef struct WGPUBindGroupImpl *WGPUBindGroup; +typedef struct WGPUBindGroupLayoutImpl *WGPUBindGroupLayout; +typedef struct WGPUBufferImpl *WGPUBuffer; +typedef struct WGPUCommandBufferImpl *WGPUCommandBuffer; +typedef struct WGPUCommandEncoderImpl *WGPUCommandEncoder; +typedef struct WGPUComputePassEncoderImpl *WGPUComputePassEncoder; +typedef struct WGPUComputePipelineImpl *WGPUComputePipeline; +typedef struct WGPUDeviceImpl *WGPUDevice; +typedef struct WGPUExternalTextureImpl *WGPUExternalTexture; +typedef struct WGPUInstanceImpl *WGPUInstance; +typedef struct WGPUPipelineLayoutImpl *WGPUPipelineLayout; +typedef struct WGPUQuerySetImpl *WGPUQuerySet; +typedef struct WGPUQueueImpl *WGPUQueue; +typedef struct WGPURenderBundleImpl *WGPURenderBundle; +typedef struct WGPURenderBundleEncoderImpl *WGPURenderBundleEncoder; +typedef struct WGPURenderPassEncoderImpl *WGPURenderPassEncoder; +typedef struct WGPURenderPipelineImpl *WGPURenderPipeline; +typedef struct WGPUSamplerImpl *WGPUSampler; +typedef struct WGPUShaderModuleImpl *WGPUShaderModule; +typedef struct 
WGPUSurfaceImpl *WGPUSurface; +typedef struct WGPUSwapChainImpl *WGPUSwapChain; +typedef struct WGPUTextureImpl *WGPUTexture; +typedef struct WGPUTextureViewImpl *WGPUTextureView; typedef enum WGPUAdapterType { WGPUAdapterType_DiscreteGPU = 0x00000000, @@ -680,35 +680,35 @@ typedef enum WGPUTextureUsage { typedef WGPUFlags WGPUTextureUsageFlags; typedef struct WGPUChainedStruct { - struct WGPUChainedStruct const * next; + struct WGPUChainedStruct const *next; WGPUSType sType; } WGPUChainedStruct; typedef struct WGPUChainedStructOut { - struct WGPUChainedStructOut * next; + struct WGPUChainedStructOut *next; WGPUSType sType; } WGPUChainedStructOut; typedef struct WGPUAdapterProperties { - WGPUChainedStructOut * nextInChain; + WGPUChainedStructOut *nextInChain; uint32_t vendorID; - char const * vendorName; - char const * architecture; + char const *vendorName; + char const *architecture; uint32_t deviceID; - char const * name; - char const * driverDescription; + char const *name; + char const *driverDescription; WGPUAdapterType adapterType; WGPUBackendType backendType; } WGPUAdapterProperties; typedef struct WGPUBindGroupEntry { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; uint32_t binding; - WGPUBuffer buffer; // nullable + WGPUBuffer buffer; // nullable uint64_t offset; uint64_t size; - WGPUSampler sampler; // nullable - WGPUTextureView textureView; // nullable + WGPUSampler sampler; // nullable + WGPUTextureView textureView; // nullable } WGPUBindGroupEntry; typedef struct WGPUBlendComponent { @@ -718,15 +718,15 @@ typedef struct WGPUBlendComponent { } WGPUBlendComponent; typedef struct WGPUBufferBindingLayout { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUBufferBindingType type; bool hasDynamicOffset; uint64_t minBindingSize; } WGPUBufferBindingLayout; typedef struct WGPUBufferDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable WGPUBufferUsageFlags usage; uint64_t size; bool mappedAtCreation; @@ -740,18 +740,18 @@ typedef struct WGPUColor { } WGPUColor; typedef struct WGPUCommandBufferDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable } WGPUCommandBufferDescriptor; typedef struct WGPUCommandEncoderDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable } WGPUCommandEncoderDescriptor; typedef struct WGPUCompilationMessage { - WGPUChainedStruct const * nextInChain; - char const * message; // nullable + WGPUChainedStruct const *nextInChain; + char const *message; // nullable WGPUCompilationMessageType type; uint64_t lineNum; uint64_t linePos; @@ -769,19 +769,19 @@ typedef struct WGPUComputePassTimestampWrite { } WGPUComputePassTimestampWrite; typedef struct WGPUConstantEntry { - WGPUChainedStruct const * nextInChain; - char const * key; + WGPUChainedStruct const *nextInChain; + char const *key; double value; } WGPUConstantEntry; typedef struct WGPUCopyTextureForBrowserOptions { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; bool flipY; bool needsColorSpaceConversion; WGPUAlphaMode srcAlphaMode; - float const * srcTransferFunctionParameters; // nullable - float const * conversionMatrix; // nullable - float const * dstTransferFunctionParameters; // nullable + float 
const *srcTransferFunctionParameters; // nullable + float const *conversionMatrix; // nullable + float const *dstTransferFunctionParameters; // nullable WGPUAlphaMode dstAlphaMode; bool internalUsage; } WGPUCopyTextureForBrowserOptions; @@ -801,7 +801,7 @@ typedef struct WGPUDawnBufferDescriptorErrorInfoFromWireClient { // Can be chained in WGPUDeviceDescriptor typedef struct WGPUDawnCacheDeviceDescriptor { WGPUChainedStruct chain; - char const * isolationKey; + char const *isolationKey; } WGPUDawnCacheDeviceDescriptor; // Can be chained in WGPUCommandEncoderDescriptor @@ -814,7 +814,7 @@ typedef struct WGPUDawnEncoderInternalUsageDescriptor { typedef struct WGPUDawnInstanceDescriptor { WGPUChainedStruct chain; uint32_t additionalRuntimeSearchPathsCount; - const char* const * additionalRuntimeSearchPaths; + const char *const *additionalRuntimeSearchPaths; } WGPUDawnInstanceDescriptor; // Can be chained in WGPUShaderModuleDescriptor @@ -834,18 +834,18 @@ typedef struct WGPUDawnTextureInternalUsageDescriptor { typedef struct WGPUDawnTogglesDescriptor { WGPUChainedStruct chain; uint32_t enabledTogglesCount; - const char* const * enabledToggles; + const char *const *enabledToggles; uint32_t disabledTogglesCount; - const char* const * disabledToggles; + const char *const *disabledToggles; } WGPUDawnTogglesDescriptor; // Can be chained in WGPUDeviceDescriptor typedef struct WGPUDawnTogglesDeviceDescriptor { WGPUChainedStruct chain; uint32_t forceEnabledTogglesCount; - const char* const * forceEnabledToggles; + const char *const *forceEnabledToggles; uint32_t forceDisabledTogglesCount; - const char* const * forceDisabledToggles; + const char *const *forceDisabledToggles; } WGPUDawnTogglesDeviceDescriptor; typedef struct WGPUExtent2D { @@ -871,7 +871,7 @@ typedef struct WGPUExternalTextureBindingLayout { } WGPUExternalTextureBindingLayout; typedef struct WGPUInstanceDescriptor { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; } WGPUInstanceDescriptor; typedef struct WGPULimits { @@ -910,7 +910,7 @@ typedef struct WGPULimits { } WGPULimits; typedef struct WGPUMultisampleState { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; uint32_t count; uint32_t mask; bool alphaToCoverageEnabled; @@ -928,10 +928,10 @@ typedef struct WGPUOrigin3D { } WGPUOrigin3D; typedef struct WGPUPipelineLayoutDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable uint32_t bindGroupLayoutCount; - WGPUBindGroupLayout const * bindGroupLayouts; + WGPUBindGroupLayout const *bindGroupLayouts; } WGPUPipelineLayoutDescriptor; // Can be chained in WGPUPrimitiveState @@ -941,7 +941,7 @@ typedef struct WGPUPrimitiveDepthClipControl { } WGPUPrimitiveDepthClipControl; typedef struct WGPUPrimitiveState { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUPrimitiveTopology topology; WGPUIndexFormat stripIndexFormat; WGPUFrontFace frontFace; @@ -949,29 +949,29 @@ typedef struct WGPUPrimitiveState { } WGPUPrimitiveState; typedef struct WGPUQuerySetDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable WGPUQueryType type; uint32_t count; - WGPUPipelineStatisticName const * pipelineStatistics; + WGPUPipelineStatisticName const *pipelineStatistics; uint32_t pipelineStatisticsCount; } WGPUQuerySetDescriptor; typedef struct 
WGPUQueueDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable } WGPUQueueDescriptor; typedef struct WGPURenderBundleDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable } WGPURenderBundleDescriptor; typedef struct WGPURenderBundleEncoderDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable uint32_t colorFormatsCount; - WGPUTextureFormat const * colorFormats; + WGPUTextureFormat const *colorFormats; WGPUTextureFormat depthStencilFormat; uint32_t sampleCount; bool depthReadOnly; @@ -1003,20 +1003,20 @@ typedef struct WGPURenderPassTimestampWrite { } WGPURenderPassTimestampWrite; typedef struct WGPURequestAdapterOptions { - WGPUChainedStruct const * nextInChain; - WGPUSurface compatibleSurface; // nullable + WGPUChainedStruct const *nextInChain; + WGPUSurface compatibleSurface; // nullable WGPUPowerPreference powerPreference; bool forceFallbackAdapter; } WGPURequestAdapterOptions; typedef struct WGPUSamplerBindingLayout { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUSamplerBindingType type; } WGPUSamplerBindingLayout; typedef struct WGPUSamplerDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable WGPUAddressMode addressModeU; WGPUAddressMode addressModeV; WGPUAddressMode addressModeW; @@ -1030,21 +1030,21 @@ typedef struct WGPUSamplerDescriptor { } WGPUSamplerDescriptor; typedef struct WGPUShaderModuleDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable } WGPUShaderModuleDescriptor; // Can be chained in WGPUShaderModuleDescriptor typedef struct WGPUShaderModuleSPIRVDescriptor { WGPUChainedStruct chain; uint32_t codeSize; - uint32_t const * code; + uint32_t const *code; } WGPUShaderModuleSPIRVDescriptor; // Can be chained in WGPUShaderModuleDescriptor typedef struct WGPUShaderModuleWGSLDescriptor { WGPUChainedStruct chain; - char const * source; + char const *source; } WGPUShaderModuleWGSLDescriptor; typedef struct WGPUStencilFaceState { @@ -1055,71 +1055,71 @@ typedef struct WGPUStencilFaceState { } WGPUStencilFaceState; typedef struct WGPUStorageTextureBindingLayout { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUStorageTextureAccess access; WGPUTextureFormat format; WGPUTextureViewDimension viewDimension; } WGPUStorageTextureBindingLayout; typedef struct WGPUSurfaceDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable } WGPUSurfaceDescriptor; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromAndroidNativeWindow { WGPUChainedStruct chain; - void * window; + void *window; } WGPUSurfaceDescriptorFromAndroidNativeWindow; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromCanvasHTMLSelector { WGPUChainedStruct chain; - char const * selector; + char const *selector; } WGPUSurfaceDescriptorFromCanvasHTMLSelector; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromMetalLayer { 
WGPUChainedStruct chain; - void * layer; + void *layer; } WGPUSurfaceDescriptorFromMetalLayer; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWaylandSurface { WGPUChainedStruct chain; - void * display; - void * surface; + void *display; + void *surface; } WGPUSurfaceDescriptorFromWaylandSurface; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsCoreWindow { WGPUChainedStruct chain; - void * coreWindow; + void *coreWindow; } WGPUSurfaceDescriptorFromWindowsCoreWindow; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsHWND { WGPUChainedStruct chain; - void * hinstance; - void * hwnd; + void *hinstance; + void *hwnd; } WGPUSurfaceDescriptorFromWindowsHWND; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsSwapChainPanel { WGPUChainedStruct chain; - void * swapChainPanel; + void *swapChainPanel; } WGPUSurfaceDescriptorFromWindowsSwapChainPanel; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromXlibWindow { WGPUChainedStruct chain; - void * display; + void *display; uint32_t window; } WGPUSurfaceDescriptorFromXlibWindow; typedef struct WGPUSwapChainDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable WGPUTextureUsageFlags usage; WGPUTextureFormat format; uint32_t width; @@ -1129,22 +1129,22 @@ typedef struct WGPUSwapChainDescriptor { } WGPUSwapChainDescriptor; typedef struct WGPUTextureBindingLayout { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUTextureSampleType sampleType; WGPUTextureViewDimension viewDimension; bool multisampled; } WGPUTextureBindingLayout; typedef struct WGPUTextureDataLayout { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; uint64_t offset; uint32_t bytesPerRow; uint32_t rowsPerImage; } WGPUTextureDataLayout; typedef struct WGPUTextureViewDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable WGPUTextureFormat format; WGPUTextureViewDimension dimension; uint32_t baseMipLevel; @@ -1161,15 +1161,15 @@ typedef struct WGPUVertexAttribute { } WGPUVertexAttribute; typedef struct WGPUBindGroupDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable WGPUBindGroupLayout layout; uint32_t entryCount; - WGPUBindGroupEntry const * entries; + WGPUBindGroupEntry const *entries; } WGPUBindGroupDescriptor; typedef struct WGPUBindGroupLayoutEntry { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; uint32_t binding; WGPUShaderStageFlags visibility; WGPUBufferBindingLayout buffer; @@ -1184,20 +1184,20 @@ typedef struct WGPUBlendState { } WGPUBlendState; typedef struct WGPUCompilationInfo { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; uint32_t messageCount; - WGPUCompilationMessage const * messages; + WGPUCompilationMessage const *messages; } WGPUCompilationInfo; typedef struct WGPUComputePassDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable uint32_t timestampWriteCount; - WGPUComputePassTimestampWrite const * timestampWrites; + 
WGPUComputePassTimestampWrite const *timestampWrites; } WGPUComputePassDescriptor; typedef struct WGPUDepthStencilState { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUTextureFormat format; bool depthWriteEnabled; WGPUCompareFunction depthCompare; @@ -1211,35 +1211,35 @@ typedef struct WGPUDepthStencilState { } WGPUDepthStencilState; typedef struct WGPUExternalTextureDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable WGPUTextureView plane0; - WGPUTextureView plane1; // nullable + WGPUTextureView plane1; // nullable WGPUOrigin2D visibleOrigin; WGPUExtent2D visibleSize; bool doYuvToRgbConversionOnly; - float const * yuvToRgbConversionMatrix; // nullable - float const * srcTransferFunctionParameters; - float const * dstTransferFunctionParameters; - float const * gamutConversionMatrix; + float const *yuvToRgbConversionMatrix; // nullable + float const *srcTransferFunctionParameters; + float const *dstTransferFunctionParameters; + float const *gamutConversionMatrix; bool flipY; WGPUExternalTextureRotation rotation; } WGPUExternalTextureDescriptor; typedef struct WGPUImageCopyBuffer { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUTextureDataLayout layout; WGPUBuffer buffer; } WGPUImageCopyBuffer; typedef struct WGPUImageCopyExternalTexture { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUExternalTexture externalTexture; WGPUOrigin3D origin; } WGPUImageCopyExternalTexture; typedef struct WGPUImageCopyTexture { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUTexture texture; uint32_t mipLevel; WGPUOrigin3D origin; @@ -1247,34 +1247,34 @@ typedef struct WGPUImageCopyTexture { } WGPUImageCopyTexture; typedef struct WGPUProgrammableStageDescriptor { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUShaderModule module; - char const * entryPoint; + char const *entryPoint; uint32_t constantCount; - WGPUConstantEntry const * constants; + WGPUConstantEntry const *constants; } WGPUProgrammableStageDescriptor; typedef struct WGPURenderPassColorAttachment { - WGPUTextureView view; // nullable - WGPUTextureView resolveTarget; // nullable + WGPUTextureView view; // nullable + WGPUTextureView resolveTarget; // nullable WGPULoadOp loadOp; WGPUStoreOp storeOp; WGPUColor clearValue; } WGPURenderPassColorAttachment; typedef struct WGPURequiredLimits { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPULimits limits; } WGPURequiredLimits; typedef struct WGPUSupportedLimits { - WGPUChainedStructOut * nextInChain; + WGPUChainedStructOut *nextInChain; WGPULimits limits; } WGPUSupportedLimits; typedef struct WGPUTextureDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable WGPUTextureUsageFlags usage; WGPUTextureDimension dimension; WGPUExtent3D size; @@ -1282,164 +1282,164 @@ typedef struct WGPUTextureDescriptor { uint32_t mipLevelCount; uint32_t sampleCount; uint32_t viewFormatCount; - WGPUTextureFormat const * viewFormats; + WGPUTextureFormat const *viewFormats; } WGPUTextureDescriptor; typedef struct WGPUVertexBufferLayout { uint64_t arrayStride; WGPUVertexStepMode stepMode; uint32_t attributeCount; - WGPUVertexAttribute const * attributes; + WGPUVertexAttribute const *attributes; } 
WGPUVertexBufferLayout; typedef struct WGPUBindGroupLayoutDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable uint32_t entryCount; - WGPUBindGroupLayoutEntry const * entries; + WGPUBindGroupLayoutEntry const *entries; } WGPUBindGroupLayoutDescriptor; typedef struct WGPUColorTargetState { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUTextureFormat format; - WGPUBlendState const * blend; // nullable + WGPUBlendState const *blend; // nullable WGPUColorWriteMaskFlags writeMask; } WGPUColorTargetState; typedef struct WGPUComputePipelineDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable - WGPUPipelineLayout layout; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable + WGPUPipelineLayout layout; // nullable WGPUProgrammableStageDescriptor compute; } WGPUComputePipelineDescriptor; typedef struct WGPUDeviceDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable uint32_t requiredFeaturesCount; - WGPUFeatureName const * requiredFeatures; - WGPURequiredLimits const * requiredLimits; // nullable + WGPUFeatureName const *requiredFeatures; + WGPURequiredLimits const *requiredLimits; // nullable WGPUQueueDescriptor defaultQueue; } WGPUDeviceDescriptor; typedef struct WGPURenderPassDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable uint32_t colorAttachmentCount; - WGPURenderPassColorAttachment const * colorAttachments; - WGPURenderPassDepthStencilAttachment const * depthStencilAttachment; // nullable - WGPUQuerySet occlusionQuerySet; // nullable + WGPURenderPassColorAttachment const *colorAttachments; + WGPURenderPassDepthStencilAttachment const *depthStencilAttachment; // nullable + WGPUQuerySet occlusionQuerySet; // nullable uint32_t timestampWriteCount; - WGPURenderPassTimestampWrite const * timestampWrites; + WGPURenderPassTimestampWrite const *timestampWrites; } WGPURenderPassDescriptor; typedef struct WGPUVertexState { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUShaderModule module; - char const * entryPoint; + char const *entryPoint; uint32_t constantCount; - WGPUConstantEntry const * constants; + WGPUConstantEntry const *constants; uint32_t bufferCount; - WGPUVertexBufferLayout const * buffers; + WGPUVertexBufferLayout const *buffers; } WGPUVertexState; typedef struct WGPUFragmentState { - WGPUChainedStruct const * nextInChain; + WGPUChainedStruct const *nextInChain; WGPUShaderModule module; - char const * entryPoint; + char const *entryPoint; uint32_t constantCount; - WGPUConstantEntry const * constants; + WGPUConstantEntry const *constants; uint32_t targetCount; - WGPUColorTargetState const * targets; + WGPUColorTargetState const *targets; } WGPUFragmentState; typedef struct WGPURenderPipelineDescriptor { - WGPUChainedStruct const * nextInChain; - char const * label; // nullable - WGPUPipelineLayout layout; // nullable + WGPUChainedStruct const *nextInChain; + char const *label; // nullable + WGPUPipelineLayout layout; // nullable WGPUVertexState vertex; WGPUPrimitiveState primitive; - WGPUDepthStencilState const * depthStencil; // nullable + WGPUDepthStencilState const *depthStencil; // nullable WGPUMultisampleState multisample; - 
WGPUFragmentState const * fragment; // nullable + WGPUFragmentState const *fragment; // nullable } WGPURenderPipelineDescriptor; #ifdef __cplusplus extern "C" { #endif -typedef void (*WGPUBufferMapCallback)(WGPUBufferMapAsyncStatus status, void * userdata); -typedef void (*WGPUCompilationInfoCallback)(WGPUCompilationInfoRequestStatus status, WGPUCompilationInfo const * compilationInfo, void * userdata); -typedef void (*WGPUCreateComputePipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPUComputePipeline pipeline, char const * message, void * userdata); -typedef void (*WGPUCreateRenderPipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPURenderPipeline pipeline, char const * message, void * userdata); -typedef void (*WGPUDeviceLostCallback)(WGPUDeviceLostReason reason, char const * message, void * userdata); -typedef void (*WGPUErrorCallback)(WGPUErrorType type, char const * message, void * userdata); -typedef void (*WGPULoggingCallback)(WGPULoggingType type, char const * message, void * userdata); +typedef void (*WGPUBufferMapCallback)(WGPUBufferMapAsyncStatus status, void *userdata); +typedef void (*WGPUCompilationInfoCallback)(WGPUCompilationInfoRequestStatus status, WGPUCompilationInfo const *compilationInfo, void *userdata); +typedef void (*WGPUCreateComputePipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPUComputePipeline pipeline, char const *message, void *userdata); +typedef void (*WGPUCreateRenderPipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPURenderPipeline pipeline, char const *message, void *userdata); +typedef void (*WGPUDeviceLostCallback)(WGPUDeviceLostReason reason, char const *message, void *userdata); +typedef void (*WGPUErrorCallback)(WGPUErrorType type, char const *message, void *userdata); +typedef void (*WGPULoggingCallback)(WGPULoggingType type, char const *message, void *userdata); typedef void (*WGPUProc)(); -typedef void (*WGPUQueueWorkDoneCallback)(WGPUQueueWorkDoneStatus status, void * userdata); -typedef void (*WGPURequestAdapterCallback)(WGPURequestAdapterStatus status, WGPUAdapter adapter, char const * message, void * userdata); -typedef void (*WGPURequestDeviceCallback)(WGPURequestDeviceStatus status, WGPUDevice device, char const * message, void * userdata); +typedef void (*WGPUQueueWorkDoneCallback)(WGPUQueueWorkDoneStatus status, void *userdata); +typedef void (*WGPURequestAdapterCallback)(WGPURequestAdapterStatus status, WGPUAdapter adapter, char const *message, void *userdata); +typedef void (*WGPURequestDeviceCallback)(WGPURequestDeviceStatus status, WGPUDevice device, char const *message, void *userdata); #if !defined(WGPU_SKIP_PROCS) -typedef WGPUInstance (*WGPUProcCreateInstance)(WGPUInstanceDescriptor const * descriptor); -typedef WGPUProc (*WGPUProcGetProcAddress)(WGPUDevice device, char const * procName); +typedef WGPUInstance (*WGPUProcCreateInstance)(WGPUInstanceDescriptor const *descriptor); +typedef WGPUProc (*WGPUProcGetProcAddress)(WGPUDevice device, char const *procName); // Procs of Adapter -typedef WGPUDevice (*WGPUProcAdapterCreateDevice)(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor /* nullable */); -typedef size_t (*WGPUProcAdapterEnumerateFeatures)(WGPUAdapter adapter, WGPUFeatureName * features); -typedef bool (*WGPUProcAdapterGetLimits)(WGPUAdapter adapter, WGPUSupportedLimits * limits); -typedef void (*WGPUProcAdapterGetProperties)(WGPUAdapter adapter, WGPUAdapterProperties * properties); +typedef WGPUDevice (*WGPUProcAdapterCreateDevice)(WGPUAdapter adapter, 
WGPUDeviceDescriptor const *descriptor /* nullable */); +typedef size_t (*WGPUProcAdapterEnumerateFeatures)(WGPUAdapter adapter, WGPUFeatureName *features); +typedef bool (*WGPUProcAdapterGetLimits)(WGPUAdapter adapter, WGPUSupportedLimits *limits); +typedef void (*WGPUProcAdapterGetProperties)(WGPUAdapter adapter, WGPUAdapterProperties *properties); typedef bool (*WGPUProcAdapterHasFeature)(WGPUAdapter adapter, WGPUFeatureName feature); -typedef void (*WGPUProcAdapterRequestDevice)(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor /* nullable */, WGPURequestDeviceCallback callback, void * userdata); +typedef void (*WGPUProcAdapterRequestDevice)(WGPUAdapter adapter, WGPUDeviceDescriptor const *descriptor /* nullable */, WGPURequestDeviceCallback callback, void *userdata); typedef void (*WGPUProcAdapterReference)(WGPUAdapter adapter); typedef void (*WGPUProcAdapterRelease)(WGPUAdapter adapter); // Procs of BindGroup -typedef void (*WGPUProcBindGroupSetLabel)(WGPUBindGroup bindGroup, char const * label); +typedef void (*WGPUProcBindGroupSetLabel)(WGPUBindGroup bindGroup, char const *label); typedef void (*WGPUProcBindGroupReference)(WGPUBindGroup bindGroup); typedef void (*WGPUProcBindGroupRelease)(WGPUBindGroup bindGroup); // Procs of BindGroupLayout -typedef void (*WGPUProcBindGroupLayoutSetLabel)(WGPUBindGroupLayout bindGroupLayout, char const * label); +typedef void (*WGPUProcBindGroupLayoutSetLabel)(WGPUBindGroupLayout bindGroupLayout, char const *label); typedef void (*WGPUProcBindGroupLayoutReference)(WGPUBindGroupLayout bindGroupLayout); typedef void (*WGPUProcBindGroupLayoutRelease)(WGPUBindGroupLayout bindGroupLayout); // Procs of Buffer typedef void (*WGPUProcBufferDestroy)(WGPUBuffer buffer); -typedef void const * (*WGPUProcBufferGetConstMappedRange)(WGPUBuffer buffer, size_t offset, size_t size); +typedef void const *(*WGPUProcBufferGetConstMappedRange)(WGPUBuffer buffer, size_t offset, size_t size); typedef WGPUBufferMapState (*WGPUProcBufferGetMapState)(WGPUBuffer buffer); -typedef void * (*WGPUProcBufferGetMappedRange)(WGPUBuffer buffer, size_t offset, size_t size); +typedef void *(*WGPUProcBufferGetMappedRange)(WGPUBuffer buffer, size_t offset, size_t size); typedef uint64_t (*WGPUProcBufferGetSize)(WGPUBuffer buffer); typedef WGPUBufferUsage (*WGPUProcBufferGetUsage)(WGPUBuffer buffer); -typedef void (*WGPUProcBufferMapAsync)(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void * userdata); -typedef void (*WGPUProcBufferSetLabel)(WGPUBuffer buffer, char const * label); +typedef void (*WGPUProcBufferMapAsync)(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void *userdata); +typedef void (*WGPUProcBufferSetLabel)(WGPUBuffer buffer, char const *label); typedef void (*WGPUProcBufferUnmap)(WGPUBuffer buffer); typedef void (*WGPUProcBufferReference)(WGPUBuffer buffer); typedef void (*WGPUProcBufferRelease)(WGPUBuffer buffer); // Procs of CommandBuffer -typedef void (*WGPUProcCommandBufferSetLabel)(WGPUCommandBuffer commandBuffer, char const * label); +typedef void (*WGPUProcCommandBufferSetLabel)(WGPUCommandBuffer commandBuffer, char const *label); typedef void (*WGPUProcCommandBufferReference)(WGPUCommandBuffer commandBuffer); typedef void (*WGPUProcCommandBufferRelease)(WGPUCommandBuffer commandBuffer); // Procs of CommandEncoder -typedef WGPUComputePassEncoder (*WGPUProcCommandEncoderBeginComputePass)(WGPUCommandEncoder commandEncoder, 
WGPUComputePassDescriptor const * descriptor /* nullable */); -typedef WGPURenderPassEncoder (*WGPUProcCommandEncoderBeginRenderPass)(WGPUCommandEncoder commandEncoder, WGPURenderPassDescriptor const * descriptor); +typedef WGPUComputePassEncoder (*WGPUProcCommandEncoderBeginComputePass)(WGPUCommandEncoder commandEncoder, WGPUComputePassDescriptor const *descriptor /* nullable */); +typedef WGPURenderPassEncoder (*WGPUProcCommandEncoderBeginRenderPass)(WGPUCommandEncoder commandEncoder, WGPURenderPassDescriptor const *descriptor); typedef void (*WGPUProcCommandEncoderClearBuffer)(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t offset, uint64_t size); typedef void (*WGPUProcCommandEncoderCopyBufferToBuffer)(WGPUCommandEncoder commandEncoder, WGPUBuffer source, uint64_t sourceOffset, WGPUBuffer destination, uint64_t destinationOffset, uint64_t size); -typedef void (*WGPUProcCommandEncoderCopyBufferToTexture)(WGPUCommandEncoder commandEncoder, WGPUImageCopyBuffer const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); -typedef void (*WGPUProcCommandEncoderCopyTextureToBuffer)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyBuffer const * destination, WGPUExtent3D const * copySize); -typedef void (*WGPUProcCommandEncoderCopyTextureToTexture)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); -typedef void (*WGPUProcCommandEncoderCopyTextureToTextureInternal)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); -typedef WGPUCommandBuffer (*WGPUProcCommandEncoderFinish)(WGPUCommandEncoder commandEncoder, WGPUCommandBufferDescriptor const * descriptor /* nullable */); -typedef void (*WGPUProcCommandEncoderInjectValidationError)(WGPUCommandEncoder commandEncoder, char const * message); -typedef void (*WGPUProcCommandEncoderInsertDebugMarker)(WGPUCommandEncoder commandEncoder, char const * markerLabel); +typedef void (*WGPUProcCommandEncoderCopyBufferToTexture)(WGPUCommandEncoder commandEncoder, WGPUImageCopyBuffer const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); +typedef void (*WGPUProcCommandEncoderCopyTextureToBuffer)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyBuffer const *destination, WGPUExtent3D const *copySize); +typedef void (*WGPUProcCommandEncoderCopyTextureToTexture)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); +typedef void (*WGPUProcCommandEncoderCopyTextureToTextureInternal)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); +typedef WGPUCommandBuffer (*WGPUProcCommandEncoderFinish)(WGPUCommandEncoder commandEncoder, WGPUCommandBufferDescriptor const *descriptor /* nullable */); +typedef void (*WGPUProcCommandEncoderInjectValidationError)(WGPUCommandEncoder commandEncoder, char const *message); +typedef void (*WGPUProcCommandEncoderInsertDebugMarker)(WGPUCommandEncoder commandEncoder, char const *markerLabel); typedef void (*WGPUProcCommandEncoderPopDebugGroup)(WGPUCommandEncoder commandEncoder); -typedef void (*WGPUProcCommandEncoderPushDebugGroup)(WGPUCommandEncoder commandEncoder, char const * groupLabel); +typedef void 
(*WGPUProcCommandEncoderPushDebugGroup)(WGPUCommandEncoder commandEncoder, char const *groupLabel); typedef void (*WGPUProcCommandEncoderResolveQuerySet)(WGPUCommandEncoder commandEncoder, WGPUQuerySet querySet, uint32_t firstQuery, uint32_t queryCount, WGPUBuffer destination, uint64_t destinationOffset); -typedef void (*WGPUProcCommandEncoderSetLabel)(WGPUCommandEncoder commandEncoder, char const * label); -typedef void (*WGPUProcCommandEncoderWriteBuffer)(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t bufferOffset, uint8_t const * data, uint64_t size); +typedef void (*WGPUProcCommandEncoderSetLabel)(WGPUCommandEncoder commandEncoder, char const *label); +typedef void (*WGPUProcCommandEncoderWriteBuffer)(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t bufferOffset, uint8_t const *data, uint64_t size); typedef void (*WGPUProcCommandEncoderWriteTimestamp)(WGPUCommandEncoder commandEncoder, WGPUQuerySet querySet, uint32_t queryIndex); typedef void (*WGPUProcCommandEncoderReference)(WGPUCommandEncoder commandEncoder); typedef void (*WGPUProcCommandEncoderRelease)(WGPUCommandEncoder commandEncoder); @@ -1451,11 +1451,11 @@ typedef void (*WGPUProcComputePassEncoderDispatchWorkgroups)(WGPUComputePassEnco typedef void (*WGPUProcComputePassEncoderDispatchWorkgroupsIndirect)(WGPUComputePassEncoder computePassEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); typedef void (*WGPUProcComputePassEncoderEnd)(WGPUComputePassEncoder computePassEncoder); typedef void (*WGPUProcComputePassEncoderEndPass)(WGPUComputePassEncoder computePassEncoder); -typedef void (*WGPUProcComputePassEncoderInsertDebugMarker)(WGPUComputePassEncoder computePassEncoder, char const * markerLabel); +typedef void (*WGPUProcComputePassEncoderInsertDebugMarker)(WGPUComputePassEncoder computePassEncoder, char const *markerLabel); typedef void (*WGPUProcComputePassEncoderPopDebugGroup)(WGPUComputePassEncoder computePassEncoder); -typedef void (*WGPUProcComputePassEncoderPushDebugGroup)(WGPUComputePassEncoder computePassEncoder, char const * groupLabel); -typedef void (*WGPUProcComputePassEncoderSetBindGroup)(WGPUComputePassEncoder computePassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); -typedef void (*WGPUProcComputePassEncoderSetLabel)(WGPUComputePassEncoder computePassEncoder, char const * label); +typedef void (*WGPUProcComputePassEncoderPushDebugGroup)(WGPUComputePassEncoder computePassEncoder, char const *groupLabel); +typedef void (*WGPUProcComputePassEncoderSetBindGroup)(WGPUComputePassEncoder computePassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); +typedef void (*WGPUProcComputePassEncoderSetLabel)(WGPUComputePassEncoder computePassEncoder, char const *label); typedef void (*WGPUProcComputePassEncoderSetPipeline)(WGPUComputePassEncoder computePassEncoder, WGPUComputePipeline pipeline); typedef void (*WGPUProcComputePassEncoderWriteTimestamp)(WGPUComputePassEncoder computePassEncoder, WGPUQuerySet querySet, uint32_t queryIndex); typedef void (*WGPUProcComputePassEncoderReference)(WGPUComputePassEncoder computePassEncoder); @@ -1463,46 +1463,46 @@ typedef void (*WGPUProcComputePassEncoderRelease)(WGPUComputePassEncoder compute // Procs of ComputePipeline typedef WGPUBindGroupLayout (*WGPUProcComputePipelineGetBindGroupLayout)(WGPUComputePipeline computePipeline, uint32_t groupIndex); -typedef void (*WGPUProcComputePipelineSetLabel)(WGPUComputePipeline 
computePipeline, char const * label); +typedef void (*WGPUProcComputePipelineSetLabel)(WGPUComputePipeline computePipeline, char const *label); typedef void (*WGPUProcComputePipelineReference)(WGPUComputePipeline computePipeline); typedef void (*WGPUProcComputePipelineRelease)(WGPUComputePipeline computePipeline); // Procs of Device -typedef WGPUBindGroup (*WGPUProcDeviceCreateBindGroup)(WGPUDevice device, WGPUBindGroupDescriptor const * descriptor); -typedef WGPUBindGroupLayout (*WGPUProcDeviceCreateBindGroupLayout)(WGPUDevice device, WGPUBindGroupLayoutDescriptor const * descriptor); -typedef WGPUBuffer (*WGPUProcDeviceCreateBuffer)(WGPUDevice device, WGPUBufferDescriptor const * descriptor); -typedef WGPUCommandEncoder (*WGPUProcDeviceCreateCommandEncoder)(WGPUDevice device, WGPUCommandEncoderDescriptor const * descriptor /* nullable */); -typedef WGPUComputePipeline (*WGPUProcDeviceCreateComputePipeline)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor); -typedef void (*WGPUProcDeviceCreateComputePipelineAsync)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallback callback, void * userdata); -typedef WGPUBuffer (*WGPUProcDeviceCreateErrorBuffer)(WGPUDevice device, WGPUBufferDescriptor const * descriptor); +typedef WGPUBindGroup (*WGPUProcDeviceCreateBindGroup)(WGPUDevice device, WGPUBindGroupDescriptor const *descriptor); +typedef WGPUBindGroupLayout (*WGPUProcDeviceCreateBindGroupLayout)(WGPUDevice device, WGPUBindGroupLayoutDescriptor const *descriptor); +typedef WGPUBuffer (*WGPUProcDeviceCreateBuffer)(WGPUDevice device, WGPUBufferDescriptor const *descriptor); +typedef WGPUCommandEncoder (*WGPUProcDeviceCreateCommandEncoder)(WGPUDevice device, WGPUCommandEncoderDescriptor const *descriptor /* nullable */); +typedef WGPUComputePipeline (*WGPUProcDeviceCreateComputePipeline)(WGPUDevice device, WGPUComputePipelineDescriptor const *descriptor); +typedef void (*WGPUProcDeviceCreateComputePipelineAsync)(WGPUDevice device, WGPUComputePipelineDescriptor const *descriptor, WGPUCreateComputePipelineAsyncCallback callback, void *userdata); +typedef WGPUBuffer (*WGPUProcDeviceCreateErrorBuffer)(WGPUDevice device, WGPUBufferDescriptor const *descriptor); typedef WGPUExternalTexture (*WGPUProcDeviceCreateErrorExternalTexture)(WGPUDevice device); -typedef WGPUTexture (*WGPUProcDeviceCreateErrorTexture)(WGPUDevice device, WGPUTextureDescriptor const * descriptor); -typedef WGPUExternalTexture (*WGPUProcDeviceCreateExternalTexture)(WGPUDevice device, WGPUExternalTextureDescriptor const * externalTextureDescriptor); -typedef WGPUPipelineLayout (*WGPUProcDeviceCreatePipelineLayout)(WGPUDevice device, WGPUPipelineLayoutDescriptor const * descriptor); -typedef WGPUQuerySet (*WGPUProcDeviceCreateQuerySet)(WGPUDevice device, WGPUQuerySetDescriptor const * descriptor); -typedef WGPURenderBundleEncoder (*WGPUProcDeviceCreateRenderBundleEncoder)(WGPUDevice device, WGPURenderBundleEncoderDescriptor const * descriptor); -typedef WGPURenderPipeline (*WGPUProcDeviceCreateRenderPipeline)(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor); -typedef void (*WGPUProcDeviceCreateRenderPipelineAsync)(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void * userdata); -typedef WGPUSampler (*WGPUProcDeviceCreateSampler)(WGPUDevice device, WGPUSamplerDescriptor const * descriptor /* nullable */); -typedef WGPUShaderModule (*WGPUProcDeviceCreateShaderModule)(WGPUDevice 
device, WGPUShaderModuleDescriptor const * descriptor); -typedef WGPUSwapChain (*WGPUProcDeviceCreateSwapChain)(WGPUDevice device, WGPUSurface surface /* nullable */, WGPUSwapChainDescriptor const * descriptor); -typedef WGPUTexture (*WGPUProcDeviceCreateTexture)(WGPUDevice device, WGPUTextureDescriptor const * descriptor); +typedef WGPUTexture (*WGPUProcDeviceCreateErrorTexture)(WGPUDevice device, WGPUTextureDescriptor const *descriptor); +typedef WGPUExternalTexture (*WGPUProcDeviceCreateExternalTexture)(WGPUDevice device, WGPUExternalTextureDescriptor const *externalTextureDescriptor); +typedef WGPUPipelineLayout (*WGPUProcDeviceCreatePipelineLayout)(WGPUDevice device, WGPUPipelineLayoutDescriptor const *descriptor); +typedef WGPUQuerySet (*WGPUProcDeviceCreateQuerySet)(WGPUDevice device, WGPUQuerySetDescriptor const *descriptor); +typedef WGPURenderBundleEncoder (*WGPUProcDeviceCreateRenderBundleEncoder)(WGPUDevice device, WGPURenderBundleEncoderDescriptor const *descriptor); +typedef WGPURenderPipeline (*WGPUProcDeviceCreateRenderPipeline)(WGPUDevice device, WGPURenderPipelineDescriptor const *descriptor); +typedef void (*WGPUProcDeviceCreateRenderPipelineAsync)(WGPUDevice device, WGPURenderPipelineDescriptor const *descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void *userdata); +typedef WGPUSampler (*WGPUProcDeviceCreateSampler)(WGPUDevice device, WGPUSamplerDescriptor const *descriptor /* nullable */); +typedef WGPUShaderModule (*WGPUProcDeviceCreateShaderModule)(WGPUDevice device, WGPUShaderModuleDescriptor const *descriptor); +typedef WGPUSwapChain (*WGPUProcDeviceCreateSwapChain)(WGPUDevice device, WGPUSurface surface /* nullable */, WGPUSwapChainDescriptor const *descriptor); +typedef WGPUTexture (*WGPUProcDeviceCreateTexture)(WGPUDevice device, WGPUTextureDescriptor const *descriptor); typedef void (*WGPUProcDeviceDestroy)(WGPUDevice device); -typedef size_t (*WGPUProcDeviceEnumerateFeatures)(WGPUDevice device, WGPUFeatureName * features); -typedef void (*WGPUProcDeviceForceLoss)(WGPUDevice device, WGPUDeviceLostReason type, char const * message); +typedef size_t (*WGPUProcDeviceEnumerateFeatures)(WGPUDevice device, WGPUFeatureName *features); +typedef void (*WGPUProcDeviceForceLoss)(WGPUDevice device, WGPUDeviceLostReason type, char const *message); typedef WGPUAdapter (*WGPUProcDeviceGetAdapter)(WGPUDevice device); -typedef bool (*WGPUProcDeviceGetLimits)(WGPUDevice device, WGPUSupportedLimits * limits); +typedef bool (*WGPUProcDeviceGetLimits)(WGPUDevice device, WGPUSupportedLimits *limits); typedef WGPUQueue (*WGPUProcDeviceGetQueue)(WGPUDevice device); typedef bool (*WGPUProcDeviceHasFeature)(WGPUDevice device, WGPUFeatureName feature); -typedef void (*WGPUProcDeviceInjectError)(WGPUDevice device, WGPUErrorType type, char const * message); -typedef bool (*WGPUProcDevicePopErrorScope)(WGPUDevice device, WGPUErrorCallback callback, void * userdata); +typedef void (*WGPUProcDeviceInjectError)(WGPUDevice device, WGPUErrorType type, char const *message); +typedef bool (*WGPUProcDevicePopErrorScope)(WGPUDevice device, WGPUErrorCallback callback, void *userdata); typedef void (*WGPUProcDevicePushErrorScope)(WGPUDevice device, WGPUErrorFilter filter); -typedef void (*WGPUProcDeviceSetDeviceLostCallback)(WGPUDevice device, WGPUDeviceLostCallback callback, void * userdata); -typedef void (*WGPUProcDeviceSetLabel)(WGPUDevice device, char const * label); -typedef void (*WGPUProcDeviceSetLoggingCallback)(WGPUDevice device, WGPULoggingCallback callback, void * userdata); 
-typedef void (*WGPUProcDeviceSetUncapturedErrorCallback)(WGPUDevice device, WGPUErrorCallback callback, void * userdata); +typedef void (*WGPUProcDeviceSetDeviceLostCallback)(WGPUDevice device, WGPUDeviceLostCallback callback, void *userdata); +typedef void (*WGPUProcDeviceSetLabel)(WGPUDevice device, char const *label); +typedef void (*WGPUProcDeviceSetLoggingCallback)(WGPUDevice device, WGPULoggingCallback callback, void *userdata); +typedef void (*WGPUProcDeviceSetUncapturedErrorCallback)(WGPUDevice device, WGPUErrorCallback callback, void *userdata); typedef void (*WGPUProcDeviceTick)(WGPUDevice device); -typedef void (*WGPUProcDeviceValidateTextureDescriptor)(WGPUDevice device, WGPUTextureDescriptor const * descriptor); +typedef void (*WGPUProcDeviceValidateTextureDescriptor)(WGPUDevice device, WGPUTextureDescriptor const *descriptor); typedef void (*WGPUProcDeviceReference)(WGPUDevice device); typedef void (*WGPUProcDeviceRelease)(WGPUDevice device); @@ -1510,18 +1510,18 @@ typedef void (*WGPUProcDeviceRelease)(WGPUDevice device); typedef void (*WGPUProcExternalTextureDestroy)(WGPUExternalTexture externalTexture); typedef void (*WGPUProcExternalTextureExpire)(WGPUExternalTexture externalTexture); typedef void (*WGPUProcExternalTextureRefresh)(WGPUExternalTexture externalTexture); -typedef void (*WGPUProcExternalTextureSetLabel)(WGPUExternalTexture externalTexture, char const * label); +typedef void (*WGPUProcExternalTextureSetLabel)(WGPUExternalTexture externalTexture, char const *label); typedef void (*WGPUProcExternalTextureReference)(WGPUExternalTexture externalTexture); typedef void (*WGPUProcExternalTextureRelease)(WGPUExternalTexture externalTexture); // Procs of Instance -typedef WGPUSurface (*WGPUProcInstanceCreateSurface)(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor); -typedef void (*WGPUProcInstanceRequestAdapter)(WGPUInstance instance, WGPURequestAdapterOptions const * options /* nullable */, WGPURequestAdapterCallback callback, void * userdata); +typedef WGPUSurface (*WGPUProcInstanceCreateSurface)(WGPUInstance instance, WGPUSurfaceDescriptor const *descriptor); +typedef void (*WGPUProcInstanceRequestAdapter)(WGPUInstance instance, WGPURequestAdapterOptions const *options /* nullable */, WGPURequestAdapterCallback callback, void *userdata); typedef void (*WGPUProcInstanceReference)(WGPUInstance instance); typedef void (*WGPUProcInstanceRelease)(WGPUInstance instance); // Procs of PipelineLayout -typedef void (*WGPUProcPipelineLayoutSetLabel)(WGPUPipelineLayout pipelineLayout, char const * label); +typedef void (*WGPUProcPipelineLayoutSetLabel)(WGPUPipelineLayout pipelineLayout, char const *label); typedef void (*WGPUProcPipelineLayoutReference)(WGPUPipelineLayout pipelineLayout); typedef void (*WGPUProcPipelineLayoutRelease)(WGPUPipelineLayout pipelineLayout); @@ -1529,18 +1529,18 @@ typedef void (*WGPUProcPipelineLayoutRelease)(WGPUPipelineLayout pipelineLayout) typedef void (*WGPUProcQuerySetDestroy)(WGPUQuerySet querySet); typedef uint32_t (*WGPUProcQuerySetGetCount)(WGPUQuerySet querySet); typedef WGPUQueryType (*WGPUProcQuerySetGetType)(WGPUQuerySet querySet); -typedef void (*WGPUProcQuerySetSetLabel)(WGPUQuerySet querySet, char const * label); +typedef void (*WGPUProcQuerySetSetLabel)(WGPUQuerySet querySet, char const *label); typedef void (*WGPUProcQuerySetReference)(WGPUQuerySet querySet); typedef void (*WGPUProcQuerySetRelease)(WGPUQuerySet querySet); // Procs of Queue -typedef void (*WGPUProcQueueCopyExternalTextureForBrowser)(WGPUQueue 
queue, WGPUImageCopyExternalTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options); -typedef void (*WGPUProcQueueCopyTextureForBrowser)(WGPUQueue queue, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options); -typedef void (*WGPUProcQueueOnSubmittedWorkDone)(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void * userdata); -typedef void (*WGPUProcQueueSetLabel)(WGPUQueue queue, char const * label); -typedef void (*WGPUProcQueueSubmit)(WGPUQueue queue, uint32_t commandCount, WGPUCommandBuffer const * commands); -typedef void (*WGPUProcQueueWriteBuffer)(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const * data, size_t size); -typedef void (*WGPUProcQueueWriteTexture)(WGPUQueue queue, WGPUImageCopyTexture const * destination, void const * data, size_t dataSize, WGPUTextureDataLayout const * dataLayout, WGPUExtent3D const * writeSize); +typedef void (*WGPUProcQueueCopyExternalTextureForBrowser)(WGPUQueue queue, WGPUImageCopyExternalTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize, WGPUCopyTextureForBrowserOptions const *options); +typedef void (*WGPUProcQueueCopyTextureForBrowser)(WGPUQueue queue, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize, WGPUCopyTextureForBrowserOptions const *options); +typedef void (*WGPUProcQueueOnSubmittedWorkDone)(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void *userdata); +typedef void (*WGPUProcQueueSetLabel)(WGPUQueue queue, char const *label); +typedef void (*WGPUProcQueueSubmit)(WGPUQueue queue, uint32_t commandCount, WGPUCommandBuffer const *commands); +typedef void (*WGPUProcQueueWriteBuffer)(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const *data, size_t size); +typedef void (*WGPUProcQueueWriteTexture)(WGPUQueue queue, WGPUImageCopyTexture const *destination, void const *data, size_t dataSize, WGPUTextureDataLayout const *dataLayout, WGPUExtent3D const *writeSize); typedef void (*WGPUProcQueueReference)(WGPUQueue queue); typedef void (*WGPUProcQueueRelease)(WGPUQueue queue); @@ -1553,13 +1553,13 @@ typedef void (*WGPUProcRenderBundleEncoderDraw)(WGPURenderBundleEncoder renderBu typedef void (*WGPUProcRenderBundleEncoderDrawIndexed)(WGPURenderBundleEncoder renderBundleEncoder, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t baseVertex, uint32_t firstInstance); typedef void (*WGPUProcRenderBundleEncoderDrawIndexedIndirect)(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); typedef void (*WGPUProcRenderBundleEncoderDrawIndirect)(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); -typedef WGPURenderBundle (*WGPUProcRenderBundleEncoderFinish)(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderBundleDescriptor const * descriptor /* nullable */); -typedef void (*WGPUProcRenderBundleEncoderInsertDebugMarker)(WGPURenderBundleEncoder renderBundleEncoder, char const * markerLabel); +typedef WGPURenderBundle (*WGPUProcRenderBundleEncoderFinish)(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderBundleDescriptor const *descriptor /* nullable */); +typedef void (*WGPUProcRenderBundleEncoderInsertDebugMarker)(WGPURenderBundleEncoder renderBundleEncoder, 
char const *markerLabel); typedef void (*WGPUProcRenderBundleEncoderPopDebugGroup)(WGPURenderBundleEncoder renderBundleEncoder); -typedef void (*WGPUProcRenderBundleEncoderPushDebugGroup)(WGPURenderBundleEncoder renderBundleEncoder, char const * groupLabel); -typedef void (*WGPUProcRenderBundleEncoderSetBindGroup)(WGPURenderBundleEncoder renderBundleEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); +typedef void (*WGPUProcRenderBundleEncoderPushDebugGroup)(WGPURenderBundleEncoder renderBundleEncoder, char const *groupLabel); +typedef void (*WGPUProcRenderBundleEncoderSetBindGroup)(WGPURenderBundleEncoder renderBundleEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); typedef void (*WGPUProcRenderBundleEncoderSetIndexBuffer)(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer buffer, WGPUIndexFormat format, uint64_t offset, uint64_t size); -typedef void (*WGPUProcRenderBundleEncoderSetLabel)(WGPURenderBundleEncoder renderBundleEncoder, char const * label); +typedef void (*WGPUProcRenderBundleEncoderSetLabel)(WGPURenderBundleEncoder renderBundleEncoder, char const *label); typedef void (*WGPUProcRenderBundleEncoderSetPipeline)(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderPipeline pipeline); typedef void (*WGPUProcRenderBundleEncoderSetVertexBuffer)(WGPURenderBundleEncoder renderBundleEncoder, uint32_t slot, WGPUBuffer buffer, uint64_t offset, uint64_t size); typedef void (*WGPUProcRenderBundleEncoderReference)(WGPURenderBundleEncoder renderBundleEncoder); @@ -1574,14 +1574,14 @@ typedef void (*WGPUProcRenderPassEncoderDrawIndirect)(WGPURenderPassEncoder rend typedef void (*WGPUProcRenderPassEncoderEnd)(WGPURenderPassEncoder renderPassEncoder); typedef void (*WGPUProcRenderPassEncoderEndOcclusionQuery)(WGPURenderPassEncoder renderPassEncoder); typedef void (*WGPUProcRenderPassEncoderEndPass)(WGPURenderPassEncoder renderPassEncoder); -typedef void (*WGPUProcRenderPassEncoderExecuteBundles)(WGPURenderPassEncoder renderPassEncoder, uint32_t bundleCount, WGPURenderBundle const * bundles); -typedef void (*WGPUProcRenderPassEncoderInsertDebugMarker)(WGPURenderPassEncoder renderPassEncoder, char const * markerLabel); +typedef void (*WGPUProcRenderPassEncoderExecuteBundles)(WGPURenderPassEncoder renderPassEncoder, uint32_t bundleCount, WGPURenderBundle const *bundles); +typedef void (*WGPUProcRenderPassEncoderInsertDebugMarker)(WGPURenderPassEncoder renderPassEncoder, char const *markerLabel); typedef void (*WGPUProcRenderPassEncoderPopDebugGroup)(WGPURenderPassEncoder renderPassEncoder); -typedef void (*WGPUProcRenderPassEncoderPushDebugGroup)(WGPURenderPassEncoder renderPassEncoder, char const * groupLabel); -typedef void (*WGPUProcRenderPassEncoderSetBindGroup)(WGPURenderPassEncoder renderPassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); -typedef void (*WGPUProcRenderPassEncoderSetBlendConstant)(WGPURenderPassEncoder renderPassEncoder, WGPUColor const * color); +typedef void (*WGPUProcRenderPassEncoderPushDebugGroup)(WGPURenderPassEncoder renderPassEncoder, char const *groupLabel); +typedef void (*WGPUProcRenderPassEncoderSetBindGroup)(WGPURenderPassEncoder renderPassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); +typedef void (*WGPUProcRenderPassEncoderSetBlendConstant)(WGPURenderPassEncoder renderPassEncoder, WGPUColor const 
*color); typedef void (*WGPUProcRenderPassEncoderSetIndexBuffer)(WGPURenderPassEncoder renderPassEncoder, WGPUBuffer buffer, WGPUIndexFormat format, uint64_t offset, uint64_t size); -typedef void (*WGPUProcRenderPassEncoderSetLabel)(WGPURenderPassEncoder renderPassEncoder, char const * label); +typedef void (*WGPUProcRenderPassEncoderSetLabel)(WGPURenderPassEncoder renderPassEncoder, char const *label); typedef void (*WGPUProcRenderPassEncoderSetPipeline)(WGPURenderPassEncoder renderPassEncoder, WGPURenderPipeline pipeline); typedef void (*WGPUProcRenderPassEncoderSetScissorRect)(WGPURenderPassEncoder renderPassEncoder, uint32_t x, uint32_t y, uint32_t width, uint32_t height); typedef void (*WGPUProcRenderPassEncoderSetStencilReference)(WGPURenderPassEncoder renderPassEncoder, uint32_t reference); @@ -1593,18 +1593,18 @@ typedef void (*WGPUProcRenderPassEncoderRelease)(WGPURenderPassEncoder renderPas // Procs of RenderPipeline typedef WGPUBindGroupLayout (*WGPUProcRenderPipelineGetBindGroupLayout)(WGPURenderPipeline renderPipeline, uint32_t groupIndex); -typedef void (*WGPUProcRenderPipelineSetLabel)(WGPURenderPipeline renderPipeline, char const * label); +typedef void (*WGPUProcRenderPipelineSetLabel)(WGPURenderPipeline renderPipeline, char const *label); typedef void (*WGPUProcRenderPipelineReference)(WGPURenderPipeline renderPipeline); typedef void (*WGPUProcRenderPipelineRelease)(WGPURenderPipeline renderPipeline); // Procs of Sampler -typedef void (*WGPUProcSamplerSetLabel)(WGPUSampler sampler, char const * label); +typedef void (*WGPUProcSamplerSetLabel)(WGPUSampler sampler, char const *label); typedef void (*WGPUProcSamplerReference)(WGPUSampler sampler); typedef void (*WGPUProcSamplerRelease)(WGPUSampler sampler); // Procs of ShaderModule -typedef void (*WGPUProcShaderModuleGetCompilationInfo)(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void * userdata); -typedef void (*WGPUProcShaderModuleSetLabel)(WGPUShaderModule shaderModule, char const * label); +typedef void (*WGPUProcShaderModuleGetCompilationInfo)(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void *userdata); +typedef void (*WGPUProcShaderModuleSetLabel)(WGPUShaderModule shaderModule, char const *label); typedef void (*WGPUProcShaderModuleReference)(WGPUShaderModule shaderModule); typedef void (*WGPUProcShaderModuleRelease)(WGPUShaderModule shaderModule); @@ -1620,7 +1620,7 @@ typedef void (*WGPUProcSwapChainReference)(WGPUSwapChain swapChain); typedef void (*WGPUProcSwapChainRelease)(WGPUSwapChain swapChain); // Procs of Texture -typedef WGPUTextureView (*WGPUProcTextureCreateView)(WGPUTexture texture, WGPUTextureViewDescriptor const * descriptor /* nullable */); +typedef WGPUTextureView (*WGPUProcTextureCreateView)(WGPUTexture texture, WGPUTextureViewDescriptor const *descriptor /* nullable */); typedef void (*WGPUProcTextureDestroy)(WGPUTexture texture); typedef uint32_t (*WGPUProcTextureGetDepthOrArrayLayers)(WGPUTexture texture); typedef WGPUTextureDimension (*WGPUProcTextureGetDimension)(WGPUTexture texture); @@ -1630,12 +1630,12 @@ typedef uint32_t (*WGPUProcTextureGetMipLevelCount)(WGPUTexture texture); typedef uint32_t (*WGPUProcTextureGetSampleCount)(WGPUTexture texture); typedef WGPUTextureUsage (*WGPUProcTextureGetUsage)(WGPUTexture texture); typedef uint32_t (*WGPUProcTextureGetWidth)(WGPUTexture texture); -typedef void (*WGPUProcTextureSetLabel)(WGPUTexture texture, char const * label); +typedef void (*WGPUProcTextureSetLabel)(WGPUTexture texture, char const 
*label); typedef void (*WGPUProcTextureReference)(WGPUTexture texture); typedef void (*WGPUProcTextureRelease)(WGPUTexture texture); // Procs of TextureView -typedef void (*WGPUProcTextureViewSetLabel)(WGPUTextureView textureView, char const * label); +typedef void (*WGPUProcTextureViewSetLabel)(WGPUTextureView textureView, char const *label); typedef void (*WGPUProcTextureViewReference)(WGPUTextureView textureView); typedef void (*WGPUProcTextureViewRelease)(WGPUTextureView textureView); @@ -1643,64 +1643,64 @@ typedef void (*WGPUProcTextureViewRelease)(WGPUTextureView textureView); #if !defined(WGPU_SKIP_DECLARATIONS) -WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPUInstanceDescriptor const * descriptor); -WGPU_EXPORT WGPUProc wgpuGetProcAddress(WGPUDevice device, char const * procName); +WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPUInstanceDescriptor const *descriptor); +WGPU_EXPORT WGPUProc wgpuGetProcAddress(WGPUDevice device, char const *procName); // Methods of Adapter -WGPU_EXPORT WGPUDevice wgpuAdapterCreateDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor /* nullable */); -WGPU_EXPORT size_t wgpuAdapterEnumerateFeatures(WGPUAdapter adapter, WGPUFeatureName * features); -WGPU_EXPORT bool wgpuAdapterGetLimits(WGPUAdapter adapter, WGPUSupportedLimits * limits); -WGPU_EXPORT void wgpuAdapterGetProperties(WGPUAdapter adapter, WGPUAdapterProperties * properties); +WGPU_EXPORT WGPUDevice wgpuAdapterCreateDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const *descriptor /* nullable */); +WGPU_EXPORT size_t wgpuAdapterEnumerateFeatures(WGPUAdapter adapter, WGPUFeatureName *features); +WGPU_EXPORT bool wgpuAdapterGetLimits(WGPUAdapter adapter, WGPUSupportedLimits *limits); +WGPU_EXPORT void wgpuAdapterGetProperties(WGPUAdapter adapter, WGPUAdapterProperties *properties); WGPU_EXPORT bool wgpuAdapterHasFeature(WGPUAdapter adapter, WGPUFeatureName feature); -WGPU_EXPORT void wgpuAdapterRequestDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor /* nullable */, WGPURequestDeviceCallback callback, void * userdata); +WGPU_EXPORT void wgpuAdapterRequestDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const *descriptor /* nullable */, WGPURequestDeviceCallback callback, void *userdata); WGPU_EXPORT void wgpuAdapterReference(WGPUAdapter adapter); WGPU_EXPORT void wgpuAdapterRelease(WGPUAdapter adapter); // Methods of BindGroup -WGPU_EXPORT void wgpuBindGroupSetLabel(WGPUBindGroup bindGroup, char const * label); +WGPU_EXPORT void wgpuBindGroupSetLabel(WGPUBindGroup bindGroup, char const *label); WGPU_EXPORT void wgpuBindGroupReference(WGPUBindGroup bindGroup); WGPU_EXPORT void wgpuBindGroupRelease(WGPUBindGroup bindGroup); // Methods of BindGroupLayout -WGPU_EXPORT void wgpuBindGroupLayoutSetLabel(WGPUBindGroupLayout bindGroupLayout, char const * label); +WGPU_EXPORT void wgpuBindGroupLayoutSetLabel(WGPUBindGroupLayout bindGroupLayout, char const *label); WGPU_EXPORT void wgpuBindGroupLayoutReference(WGPUBindGroupLayout bindGroupLayout); WGPU_EXPORT void wgpuBindGroupLayoutRelease(WGPUBindGroupLayout bindGroupLayout); // Methods of Buffer WGPU_EXPORT void wgpuBufferDestroy(WGPUBuffer buffer); -WGPU_EXPORT void const * wgpuBufferGetConstMappedRange(WGPUBuffer buffer, size_t offset, size_t size); +WGPU_EXPORT void const *wgpuBufferGetConstMappedRange(WGPUBuffer buffer, size_t offset, size_t size); WGPU_EXPORT WGPUBufferMapState wgpuBufferGetMapState(WGPUBuffer buffer); -WGPU_EXPORT void * wgpuBufferGetMappedRange(WGPUBuffer buffer, size_t offset, size_t size); 
+WGPU_EXPORT void *wgpuBufferGetMappedRange(WGPUBuffer buffer, size_t offset, size_t size); WGPU_EXPORT uint64_t wgpuBufferGetSize(WGPUBuffer buffer); WGPU_EXPORT WGPUBufferUsage wgpuBufferGetUsage(WGPUBuffer buffer); -WGPU_EXPORT void wgpuBufferMapAsync(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void * userdata); -WGPU_EXPORT void wgpuBufferSetLabel(WGPUBuffer buffer, char const * label); +WGPU_EXPORT void wgpuBufferMapAsync(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void *userdata); +WGPU_EXPORT void wgpuBufferSetLabel(WGPUBuffer buffer, char const *label); WGPU_EXPORT void wgpuBufferUnmap(WGPUBuffer buffer); WGPU_EXPORT void wgpuBufferReference(WGPUBuffer buffer); WGPU_EXPORT void wgpuBufferRelease(WGPUBuffer buffer); // Methods of CommandBuffer -WGPU_EXPORT void wgpuCommandBufferSetLabel(WGPUCommandBuffer commandBuffer, char const * label); +WGPU_EXPORT void wgpuCommandBufferSetLabel(WGPUCommandBuffer commandBuffer, char const *label); WGPU_EXPORT void wgpuCommandBufferReference(WGPUCommandBuffer commandBuffer); WGPU_EXPORT void wgpuCommandBufferRelease(WGPUCommandBuffer commandBuffer); // Methods of CommandEncoder -WGPU_EXPORT WGPUComputePassEncoder wgpuCommandEncoderBeginComputePass(WGPUCommandEncoder commandEncoder, WGPUComputePassDescriptor const * descriptor /* nullable */); -WGPU_EXPORT WGPURenderPassEncoder wgpuCommandEncoderBeginRenderPass(WGPUCommandEncoder commandEncoder, WGPURenderPassDescriptor const * descriptor); +WGPU_EXPORT WGPUComputePassEncoder wgpuCommandEncoderBeginComputePass(WGPUCommandEncoder commandEncoder, WGPUComputePassDescriptor const *descriptor /* nullable */); +WGPU_EXPORT WGPURenderPassEncoder wgpuCommandEncoderBeginRenderPass(WGPUCommandEncoder commandEncoder, WGPURenderPassDescriptor const *descriptor); WGPU_EXPORT void wgpuCommandEncoderClearBuffer(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t offset, uint64_t size); WGPU_EXPORT void wgpuCommandEncoderCopyBufferToBuffer(WGPUCommandEncoder commandEncoder, WGPUBuffer source, uint64_t sourceOffset, WGPUBuffer destination, uint64_t destinationOffset, uint64_t size); -WGPU_EXPORT void wgpuCommandEncoderCopyBufferToTexture(WGPUCommandEncoder commandEncoder, WGPUImageCopyBuffer const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); -WGPU_EXPORT void wgpuCommandEncoderCopyTextureToBuffer(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyBuffer const * destination, WGPUExtent3D const * copySize); -WGPU_EXPORT void wgpuCommandEncoderCopyTextureToTexture(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); -WGPU_EXPORT void wgpuCommandEncoderCopyTextureToTextureInternal(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); -WGPU_EXPORT WGPUCommandBuffer wgpuCommandEncoderFinish(WGPUCommandEncoder commandEncoder, WGPUCommandBufferDescriptor const * descriptor /* nullable */); -WGPU_EXPORT void wgpuCommandEncoderInjectValidationError(WGPUCommandEncoder commandEncoder, char const * message); -WGPU_EXPORT void wgpuCommandEncoderInsertDebugMarker(WGPUCommandEncoder commandEncoder, char const * markerLabel); +WGPU_EXPORT void wgpuCommandEncoderCopyBufferToTexture(WGPUCommandEncoder commandEncoder, WGPUImageCopyBuffer const *source, 
WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); +WGPU_EXPORT void wgpuCommandEncoderCopyTextureToBuffer(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyBuffer const *destination, WGPUExtent3D const *copySize); +WGPU_EXPORT void wgpuCommandEncoderCopyTextureToTexture(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); +WGPU_EXPORT void wgpuCommandEncoderCopyTextureToTextureInternal(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); +WGPU_EXPORT WGPUCommandBuffer wgpuCommandEncoderFinish(WGPUCommandEncoder commandEncoder, WGPUCommandBufferDescriptor const *descriptor /* nullable */); +WGPU_EXPORT void wgpuCommandEncoderInjectValidationError(WGPUCommandEncoder commandEncoder, char const *message); +WGPU_EXPORT void wgpuCommandEncoderInsertDebugMarker(WGPUCommandEncoder commandEncoder, char const *markerLabel); WGPU_EXPORT void wgpuCommandEncoderPopDebugGroup(WGPUCommandEncoder commandEncoder); -WGPU_EXPORT void wgpuCommandEncoderPushDebugGroup(WGPUCommandEncoder commandEncoder, char const * groupLabel); +WGPU_EXPORT void wgpuCommandEncoderPushDebugGroup(WGPUCommandEncoder commandEncoder, char const *groupLabel); WGPU_EXPORT void wgpuCommandEncoderResolveQuerySet(WGPUCommandEncoder commandEncoder, WGPUQuerySet querySet, uint32_t firstQuery, uint32_t queryCount, WGPUBuffer destination, uint64_t destinationOffset); -WGPU_EXPORT void wgpuCommandEncoderSetLabel(WGPUCommandEncoder commandEncoder, char const * label); -WGPU_EXPORT void wgpuCommandEncoderWriteBuffer(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t bufferOffset, uint8_t const * data, uint64_t size); +WGPU_EXPORT void wgpuCommandEncoderSetLabel(WGPUCommandEncoder commandEncoder, char const *label); +WGPU_EXPORT void wgpuCommandEncoderWriteBuffer(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t bufferOffset, uint8_t const *data, uint64_t size); WGPU_EXPORT void wgpuCommandEncoderWriteTimestamp(WGPUCommandEncoder commandEncoder, WGPUQuerySet querySet, uint32_t queryIndex); WGPU_EXPORT void wgpuCommandEncoderReference(WGPUCommandEncoder commandEncoder); WGPU_EXPORT void wgpuCommandEncoderRelease(WGPUCommandEncoder commandEncoder); @@ -1712,11 +1712,11 @@ WGPU_EXPORT void wgpuComputePassEncoderDispatchWorkgroups(WGPUComputePassEncoder WGPU_EXPORT void wgpuComputePassEncoderDispatchWorkgroupsIndirect(WGPUComputePassEncoder computePassEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); WGPU_EXPORT void wgpuComputePassEncoderEnd(WGPUComputePassEncoder computePassEncoder); WGPU_EXPORT void wgpuComputePassEncoderEndPass(WGPUComputePassEncoder computePassEncoder); -WGPU_EXPORT void wgpuComputePassEncoderInsertDebugMarker(WGPUComputePassEncoder computePassEncoder, char const * markerLabel); +WGPU_EXPORT void wgpuComputePassEncoderInsertDebugMarker(WGPUComputePassEncoder computePassEncoder, char const *markerLabel); WGPU_EXPORT void wgpuComputePassEncoderPopDebugGroup(WGPUComputePassEncoder computePassEncoder); -WGPU_EXPORT void wgpuComputePassEncoderPushDebugGroup(WGPUComputePassEncoder computePassEncoder, char const * groupLabel); -WGPU_EXPORT void wgpuComputePassEncoderSetBindGroup(WGPUComputePassEncoder computePassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); -WGPU_EXPORT void 
wgpuComputePassEncoderSetLabel(WGPUComputePassEncoder computePassEncoder, char const * label); +WGPU_EXPORT void wgpuComputePassEncoderPushDebugGroup(WGPUComputePassEncoder computePassEncoder, char const *groupLabel); +WGPU_EXPORT void wgpuComputePassEncoderSetBindGroup(WGPUComputePassEncoder computePassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); +WGPU_EXPORT void wgpuComputePassEncoderSetLabel(WGPUComputePassEncoder computePassEncoder, char const *label); WGPU_EXPORT void wgpuComputePassEncoderSetPipeline(WGPUComputePassEncoder computePassEncoder, WGPUComputePipeline pipeline); WGPU_EXPORT void wgpuComputePassEncoderWriteTimestamp(WGPUComputePassEncoder computePassEncoder, WGPUQuerySet querySet, uint32_t queryIndex); WGPU_EXPORT void wgpuComputePassEncoderReference(WGPUComputePassEncoder computePassEncoder); @@ -1724,46 +1724,46 @@ WGPU_EXPORT void wgpuComputePassEncoderRelease(WGPUComputePassEncoder computePas // Methods of ComputePipeline WGPU_EXPORT WGPUBindGroupLayout wgpuComputePipelineGetBindGroupLayout(WGPUComputePipeline computePipeline, uint32_t groupIndex); -WGPU_EXPORT void wgpuComputePipelineSetLabel(WGPUComputePipeline computePipeline, char const * label); +WGPU_EXPORT void wgpuComputePipelineSetLabel(WGPUComputePipeline computePipeline, char const *label); WGPU_EXPORT void wgpuComputePipelineReference(WGPUComputePipeline computePipeline); WGPU_EXPORT void wgpuComputePipelineRelease(WGPUComputePipeline computePipeline); // Methods of Device -WGPU_EXPORT WGPUBindGroup wgpuDeviceCreateBindGroup(WGPUDevice device, WGPUBindGroupDescriptor const * descriptor); -WGPU_EXPORT WGPUBindGroupLayout wgpuDeviceCreateBindGroupLayout(WGPUDevice device, WGPUBindGroupLayoutDescriptor const * descriptor); -WGPU_EXPORT WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, WGPUBufferDescriptor const * descriptor); -WGPU_EXPORT WGPUCommandEncoder wgpuDeviceCreateCommandEncoder(WGPUDevice device, WGPUCommandEncoderDescriptor const * descriptor /* nullable */); -WGPU_EXPORT WGPUComputePipeline wgpuDeviceCreateComputePipeline(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor); -WGPU_EXPORT void wgpuDeviceCreateComputePipelineAsync(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallback callback, void * userdata); -WGPU_EXPORT WGPUBuffer wgpuDeviceCreateErrorBuffer(WGPUDevice device, WGPUBufferDescriptor const * descriptor); +WGPU_EXPORT WGPUBindGroup wgpuDeviceCreateBindGroup(WGPUDevice device, WGPUBindGroupDescriptor const *descriptor); +WGPU_EXPORT WGPUBindGroupLayout wgpuDeviceCreateBindGroupLayout(WGPUDevice device, WGPUBindGroupLayoutDescriptor const *descriptor); +WGPU_EXPORT WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, WGPUBufferDescriptor const *descriptor); +WGPU_EXPORT WGPUCommandEncoder wgpuDeviceCreateCommandEncoder(WGPUDevice device, WGPUCommandEncoderDescriptor const *descriptor /* nullable */); +WGPU_EXPORT WGPUComputePipeline wgpuDeviceCreateComputePipeline(WGPUDevice device, WGPUComputePipelineDescriptor const *descriptor); +WGPU_EXPORT void wgpuDeviceCreateComputePipelineAsync(WGPUDevice device, WGPUComputePipelineDescriptor const *descriptor, WGPUCreateComputePipelineAsyncCallback callback, void *userdata); +WGPU_EXPORT WGPUBuffer wgpuDeviceCreateErrorBuffer(WGPUDevice device, WGPUBufferDescriptor const *descriptor); WGPU_EXPORT WGPUExternalTexture wgpuDeviceCreateErrorExternalTexture(WGPUDevice device); -WGPU_EXPORT WGPUTexture 
wgpuDeviceCreateErrorTexture(WGPUDevice device, WGPUTextureDescriptor const * descriptor); -WGPU_EXPORT WGPUExternalTexture wgpuDeviceCreateExternalTexture(WGPUDevice device, WGPUExternalTextureDescriptor const * externalTextureDescriptor); -WGPU_EXPORT WGPUPipelineLayout wgpuDeviceCreatePipelineLayout(WGPUDevice device, WGPUPipelineLayoutDescriptor const * descriptor); -WGPU_EXPORT WGPUQuerySet wgpuDeviceCreateQuerySet(WGPUDevice device, WGPUQuerySetDescriptor const * descriptor); -WGPU_EXPORT WGPURenderBundleEncoder wgpuDeviceCreateRenderBundleEncoder(WGPUDevice device, WGPURenderBundleEncoderDescriptor const * descriptor); -WGPU_EXPORT WGPURenderPipeline wgpuDeviceCreateRenderPipeline(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor); -WGPU_EXPORT void wgpuDeviceCreateRenderPipelineAsync(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void * userdata); -WGPU_EXPORT WGPUSampler wgpuDeviceCreateSampler(WGPUDevice device, WGPUSamplerDescriptor const * descriptor /* nullable */); -WGPU_EXPORT WGPUShaderModule wgpuDeviceCreateShaderModule(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor); -WGPU_EXPORT WGPUSwapChain wgpuDeviceCreateSwapChain(WGPUDevice device, WGPUSurface surface /* nullable */, WGPUSwapChainDescriptor const * descriptor); -WGPU_EXPORT WGPUTexture wgpuDeviceCreateTexture(WGPUDevice device, WGPUTextureDescriptor const * descriptor); +WGPU_EXPORT WGPUTexture wgpuDeviceCreateErrorTexture(WGPUDevice device, WGPUTextureDescriptor const *descriptor); +WGPU_EXPORT WGPUExternalTexture wgpuDeviceCreateExternalTexture(WGPUDevice device, WGPUExternalTextureDescriptor const *externalTextureDescriptor); +WGPU_EXPORT WGPUPipelineLayout wgpuDeviceCreatePipelineLayout(WGPUDevice device, WGPUPipelineLayoutDescriptor const *descriptor); +WGPU_EXPORT WGPUQuerySet wgpuDeviceCreateQuerySet(WGPUDevice device, WGPUQuerySetDescriptor const *descriptor); +WGPU_EXPORT WGPURenderBundleEncoder wgpuDeviceCreateRenderBundleEncoder(WGPUDevice device, WGPURenderBundleEncoderDescriptor const *descriptor); +WGPU_EXPORT WGPURenderPipeline wgpuDeviceCreateRenderPipeline(WGPUDevice device, WGPURenderPipelineDescriptor const *descriptor); +WGPU_EXPORT void wgpuDeviceCreateRenderPipelineAsync(WGPUDevice device, WGPURenderPipelineDescriptor const *descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void *userdata); +WGPU_EXPORT WGPUSampler wgpuDeviceCreateSampler(WGPUDevice device, WGPUSamplerDescriptor const *descriptor /* nullable */); +WGPU_EXPORT WGPUShaderModule wgpuDeviceCreateShaderModule(WGPUDevice device, WGPUShaderModuleDescriptor const *descriptor); +WGPU_EXPORT WGPUSwapChain wgpuDeviceCreateSwapChain(WGPUDevice device, WGPUSurface surface /* nullable */, WGPUSwapChainDescriptor const *descriptor); +WGPU_EXPORT WGPUTexture wgpuDeviceCreateTexture(WGPUDevice device, WGPUTextureDescriptor const *descriptor); WGPU_EXPORT void wgpuDeviceDestroy(WGPUDevice device); -WGPU_EXPORT size_t wgpuDeviceEnumerateFeatures(WGPUDevice device, WGPUFeatureName * features); -WGPU_EXPORT void wgpuDeviceForceLoss(WGPUDevice device, WGPUDeviceLostReason type, char const * message); +WGPU_EXPORT size_t wgpuDeviceEnumerateFeatures(WGPUDevice device, WGPUFeatureName *features); +WGPU_EXPORT void wgpuDeviceForceLoss(WGPUDevice device, WGPUDeviceLostReason type, char const *message); WGPU_EXPORT WGPUAdapter wgpuDeviceGetAdapter(WGPUDevice device); -WGPU_EXPORT bool wgpuDeviceGetLimits(WGPUDevice device, WGPUSupportedLimits 
* limits); +WGPU_EXPORT bool wgpuDeviceGetLimits(WGPUDevice device, WGPUSupportedLimits *limits); WGPU_EXPORT WGPUQueue wgpuDeviceGetQueue(WGPUDevice device); WGPU_EXPORT bool wgpuDeviceHasFeature(WGPUDevice device, WGPUFeatureName feature); -WGPU_EXPORT void wgpuDeviceInjectError(WGPUDevice device, WGPUErrorType type, char const * message); -WGPU_EXPORT bool wgpuDevicePopErrorScope(WGPUDevice device, WGPUErrorCallback callback, void * userdata); +WGPU_EXPORT void wgpuDeviceInjectError(WGPUDevice device, WGPUErrorType type, char const *message); +WGPU_EXPORT bool wgpuDevicePopErrorScope(WGPUDevice device, WGPUErrorCallback callback, void *userdata); WGPU_EXPORT void wgpuDevicePushErrorScope(WGPUDevice device, WGPUErrorFilter filter); -WGPU_EXPORT void wgpuDeviceSetDeviceLostCallback(WGPUDevice device, WGPUDeviceLostCallback callback, void * userdata); -WGPU_EXPORT void wgpuDeviceSetLabel(WGPUDevice device, char const * label); -WGPU_EXPORT void wgpuDeviceSetLoggingCallback(WGPUDevice device, WGPULoggingCallback callback, void * userdata); -WGPU_EXPORT void wgpuDeviceSetUncapturedErrorCallback(WGPUDevice device, WGPUErrorCallback callback, void * userdata); +WGPU_EXPORT void wgpuDeviceSetDeviceLostCallback(WGPUDevice device, WGPUDeviceLostCallback callback, void *userdata); +WGPU_EXPORT void wgpuDeviceSetLabel(WGPUDevice device, char const *label); +WGPU_EXPORT void wgpuDeviceSetLoggingCallback(WGPUDevice device, WGPULoggingCallback callback, void *userdata); +WGPU_EXPORT void wgpuDeviceSetUncapturedErrorCallback(WGPUDevice device, WGPUErrorCallback callback, void *userdata); WGPU_EXPORT void wgpuDeviceTick(WGPUDevice device); -WGPU_EXPORT void wgpuDeviceValidateTextureDescriptor(WGPUDevice device, WGPUTextureDescriptor const * descriptor); +WGPU_EXPORT void wgpuDeviceValidateTextureDescriptor(WGPUDevice device, WGPUTextureDescriptor const *descriptor); WGPU_EXPORT void wgpuDeviceReference(WGPUDevice device); WGPU_EXPORT void wgpuDeviceRelease(WGPUDevice device); @@ -1771,18 +1771,18 @@ WGPU_EXPORT void wgpuDeviceRelease(WGPUDevice device); WGPU_EXPORT void wgpuExternalTextureDestroy(WGPUExternalTexture externalTexture); WGPU_EXPORT void wgpuExternalTextureExpire(WGPUExternalTexture externalTexture); WGPU_EXPORT void wgpuExternalTextureRefresh(WGPUExternalTexture externalTexture); -WGPU_EXPORT void wgpuExternalTextureSetLabel(WGPUExternalTexture externalTexture, char const * label); +WGPU_EXPORT void wgpuExternalTextureSetLabel(WGPUExternalTexture externalTexture, char const *label); WGPU_EXPORT void wgpuExternalTextureReference(WGPUExternalTexture externalTexture); WGPU_EXPORT void wgpuExternalTextureRelease(WGPUExternalTexture externalTexture); // Methods of Instance -WGPU_EXPORT WGPUSurface wgpuInstanceCreateSurface(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor); -WGPU_EXPORT void wgpuInstanceRequestAdapter(WGPUInstance instance, WGPURequestAdapterOptions const * options /* nullable */, WGPURequestAdapterCallback callback, void * userdata); +WGPU_EXPORT WGPUSurface wgpuInstanceCreateSurface(WGPUInstance instance, WGPUSurfaceDescriptor const *descriptor); +WGPU_EXPORT void wgpuInstanceRequestAdapter(WGPUInstance instance, WGPURequestAdapterOptions const *options /* nullable */, WGPURequestAdapterCallback callback, void *userdata); WGPU_EXPORT void wgpuInstanceReference(WGPUInstance instance); WGPU_EXPORT void wgpuInstanceRelease(WGPUInstance instance); // Methods of PipelineLayout -WGPU_EXPORT void wgpuPipelineLayoutSetLabel(WGPUPipelineLayout pipelineLayout, char const 
* label); +WGPU_EXPORT void wgpuPipelineLayoutSetLabel(WGPUPipelineLayout pipelineLayout, char const *label); WGPU_EXPORT void wgpuPipelineLayoutReference(WGPUPipelineLayout pipelineLayout); WGPU_EXPORT void wgpuPipelineLayoutRelease(WGPUPipelineLayout pipelineLayout); @@ -1790,18 +1790,18 @@ WGPU_EXPORT void wgpuPipelineLayoutRelease(WGPUPipelineLayout pipelineLayout); WGPU_EXPORT void wgpuQuerySetDestroy(WGPUQuerySet querySet); WGPU_EXPORT uint32_t wgpuQuerySetGetCount(WGPUQuerySet querySet); WGPU_EXPORT WGPUQueryType wgpuQuerySetGetType(WGPUQuerySet querySet); -WGPU_EXPORT void wgpuQuerySetSetLabel(WGPUQuerySet querySet, char const * label); +WGPU_EXPORT void wgpuQuerySetSetLabel(WGPUQuerySet querySet, char const *label); WGPU_EXPORT void wgpuQuerySetReference(WGPUQuerySet querySet); WGPU_EXPORT void wgpuQuerySetRelease(WGPUQuerySet querySet); // Methods of Queue -WGPU_EXPORT void wgpuQueueCopyExternalTextureForBrowser(WGPUQueue queue, WGPUImageCopyExternalTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options); -WGPU_EXPORT void wgpuQueueCopyTextureForBrowser(WGPUQueue queue, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options); -WGPU_EXPORT void wgpuQueueOnSubmittedWorkDone(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void * userdata); -WGPU_EXPORT void wgpuQueueSetLabel(WGPUQueue queue, char const * label); -WGPU_EXPORT void wgpuQueueSubmit(WGPUQueue queue, uint32_t commandCount, WGPUCommandBuffer const * commands); -WGPU_EXPORT void wgpuQueueWriteBuffer(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const * data, size_t size); -WGPU_EXPORT void wgpuQueueWriteTexture(WGPUQueue queue, WGPUImageCopyTexture const * destination, void const * data, size_t dataSize, WGPUTextureDataLayout const * dataLayout, WGPUExtent3D const * writeSize); +WGPU_EXPORT void wgpuQueueCopyExternalTextureForBrowser(WGPUQueue queue, WGPUImageCopyExternalTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize, WGPUCopyTextureForBrowserOptions const *options); +WGPU_EXPORT void wgpuQueueCopyTextureForBrowser(WGPUQueue queue, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize, WGPUCopyTextureForBrowserOptions const *options); +WGPU_EXPORT void wgpuQueueOnSubmittedWorkDone(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void *userdata); +WGPU_EXPORT void wgpuQueueSetLabel(WGPUQueue queue, char const *label); +WGPU_EXPORT void wgpuQueueSubmit(WGPUQueue queue, uint32_t commandCount, WGPUCommandBuffer const *commands); +WGPU_EXPORT void wgpuQueueWriteBuffer(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const *data, size_t size); +WGPU_EXPORT void wgpuQueueWriteTexture(WGPUQueue queue, WGPUImageCopyTexture const *destination, void const *data, size_t dataSize, WGPUTextureDataLayout const *dataLayout, WGPUExtent3D const *writeSize); WGPU_EXPORT void wgpuQueueReference(WGPUQueue queue); WGPU_EXPORT void wgpuQueueRelease(WGPUQueue queue); @@ -1814,13 +1814,13 @@ WGPU_EXPORT void wgpuRenderBundleEncoderDraw(WGPURenderBundleEncoder renderBundl WGPU_EXPORT void wgpuRenderBundleEncoderDrawIndexed(WGPURenderBundleEncoder renderBundleEncoder, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t baseVertex, uint32_t firstInstance); 
WGPU_EXPORT void wgpuRenderBundleEncoderDrawIndexedIndirect(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); WGPU_EXPORT void wgpuRenderBundleEncoderDrawIndirect(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); -WGPU_EXPORT WGPURenderBundle wgpuRenderBundleEncoderFinish(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderBundleDescriptor const * descriptor /* nullable */); -WGPU_EXPORT void wgpuRenderBundleEncoderInsertDebugMarker(WGPURenderBundleEncoder renderBundleEncoder, char const * markerLabel); +WGPU_EXPORT WGPURenderBundle wgpuRenderBundleEncoderFinish(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderBundleDescriptor const *descriptor /* nullable */); +WGPU_EXPORT void wgpuRenderBundleEncoderInsertDebugMarker(WGPURenderBundleEncoder renderBundleEncoder, char const *markerLabel); WGPU_EXPORT void wgpuRenderBundleEncoderPopDebugGroup(WGPURenderBundleEncoder renderBundleEncoder); -WGPU_EXPORT void wgpuRenderBundleEncoderPushDebugGroup(WGPURenderBundleEncoder renderBundleEncoder, char const * groupLabel); -WGPU_EXPORT void wgpuRenderBundleEncoderSetBindGroup(WGPURenderBundleEncoder renderBundleEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); +WGPU_EXPORT void wgpuRenderBundleEncoderPushDebugGroup(WGPURenderBundleEncoder renderBundleEncoder, char const *groupLabel); +WGPU_EXPORT void wgpuRenderBundleEncoderSetBindGroup(WGPURenderBundleEncoder renderBundleEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); WGPU_EXPORT void wgpuRenderBundleEncoderSetIndexBuffer(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer buffer, WGPUIndexFormat format, uint64_t offset, uint64_t size); -WGPU_EXPORT void wgpuRenderBundleEncoderSetLabel(WGPURenderBundleEncoder renderBundleEncoder, char const * label); +WGPU_EXPORT void wgpuRenderBundleEncoderSetLabel(WGPURenderBundleEncoder renderBundleEncoder, char const *label); WGPU_EXPORT void wgpuRenderBundleEncoderSetPipeline(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderPipeline pipeline); WGPU_EXPORT void wgpuRenderBundleEncoderSetVertexBuffer(WGPURenderBundleEncoder renderBundleEncoder, uint32_t slot, WGPUBuffer buffer, uint64_t offset, uint64_t size); WGPU_EXPORT void wgpuRenderBundleEncoderReference(WGPURenderBundleEncoder renderBundleEncoder); @@ -1835,14 +1835,14 @@ WGPU_EXPORT void wgpuRenderPassEncoderDrawIndirect(WGPURenderPassEncoder renderP WGPU_EXPORT void wgpuRenderPassEncoderEnd(WGPURenderPassEncoder renderPassEncoder); WGPU_EXPORT void wgpuRenderPassEncoderEndOcclusionQuery(WGPURenderPassEncoder renderPassEncoder); WGPU_EXPORT void wgpuRenderPassEncoderEndPass(WGPURenderPassEncoder renderPassEncoder); -WGPU_EXPORT void wgpuRenderPassEncoderExecuteBundles(WGPURenderPassEncoder renderPassEncoder, uint32_t bundleCount, WGPURenderBundle const * bundles); -WGPU_EXPORT void wgpuRenderPassEncoderInsertDebugMarker(WGPURenderPassEncoder renderPassEncoder, char const * markerLabel); +WGPU_EXPORT void wgpuRenderPassEncoderExecuteBundles(WGPURenderPassEncoder renderPassEncoder, uint32_t bundleCount, WGPURenderBundle const *bundles); +WGPU_EXPORT void wgpuRenderPassEncoderInsertDebugMarker(WGPURenderPassEncoder renderPassEncoder, char const *markerLabel); WGPU_EXPORT void wgpuRenderPassEncoderPopDebugGroup(WGPURenderPassEncoder renderPassEncoder); -WGPU_EXPORT void 
wgpuRenderPassEncoderPushDebugGroup(WGPURenderPassEncoder renderPassEncoder, char const * groupLabel); -WGPU_EXPORT void wgpuRenderPassEncoderSetBindGroup(WGPURenderPassEncoder renderPassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); -WGPU_EXPORT void wgpuRenderPassEncoderSetBlendConstant(WGPURenderPassEncoder renderPassEncoder, WGPUColor const * color); +WGPU_EXPORT void wgpuRenderPassEncoderPushDebugGroup(WGPURenderPassEncoder renderPassEncoder, char const *groupLabel); +WGPU_EXPORT void wgpuRenderPassEncoderSetBindGroup(WGPURenderPassEncoder renderPassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); +WGPU_EXPORT void wgpuRenderPassEncoderSetBlendConstant(WGPURenderPassEncoder renderPassEncoder, WGPUColor const *color); WGPU_EXPORT void wgpuRenderPassEncoderSetIndexBuffer(WGPURenderPassEncoder renderPassEncoder, WGPUBuffer buffer, WGPUIndexFormat format, uint64_t offset, uint64_t size); -WGPU_EXPORT void wgpuRenderPassEncoderSetLabel(WGPURenderPassEncoder renderPassEncoder, char const * label); +WGPU_EXPORT void wgpuRenderPassEncoderSetLabel(WGPURenderPassEncoder renderPassEncoder, char const *label); WGPU_EXPORT void wgpuRenderPassEncoderSetPipeline(WGPURenderPassEncoder renderPassEncoder, WGPURenderPipeline pipeline); WGPU_EXPORT void wgpuRenderPassEncoderSetScissorRect(WGPURenderPassEncoder renderPassEncoder, uint32_t x, uint32_t y, uint32_t width, uint32_t height); WGPU_EXPORT void wgpuRenderPassEncoderSetStencilReference(WGPURenderPassEncoder renderPassEncoder, uint32_t reference); @@ -1854,18 +1854,18 @@ WGPU_EXPORT void wgpuRenderPassEncoderRelease(WGPURenderPassEncoder renderPassEn // Methods of RenderPipeline WGPU_EXPORT WGPUBindGroupLayout wgpuRenderPipelineGetBindGroupLayout(WGPURenderPipeline renderPipeline, uint32_t groupIndex); -WGPU_EXPORT void wgpuRenderPipelineSetLabel(WGPURenderPipeline renderPipeline, char const * label); +WGPU_EXPORT void wgpuRenderPipelineSetLabel(WGPURenderPipeline renderPipeline, char const *label); WGPU_EXPORT void wgpuRenderPipelineReference(WGPURenderPipeline renderPipeline); WGPU_EXPORT void wgpuRenderPipelineRelease(WGPURenderPipeline renderPipeline); // Methods of Sampler -WGPU_EXPORT void wgpuSamplerSetLabel(WGPUSampler sampler, char const * label); +WGPU_EXPORT void wgpuSamplerSetLabel(WGPUSampler sampler, char const *label); WGPU_EXPORT void wgpuSamplerReference(WGPUSampler sampler); WGPU_EXPORT void wgpuSamplerRelease(WGPUSampler sampler); // Methods of ShaderModule -WGPU_EXPORT void wgpuShaderModuleGetCompilationInfo(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void * userdata); -WGPU_EXPORT void wgpuShaderModuleSetLabel(WGPUShaderModule shaderModule, char const * label); +WGPU_EXPORT void wgpuShaderModuleGetCompilationInfo(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void *userdata); +WGPU_EXPORT void wgpuShaderModuleSetLabel(WGPUShaderModule shaderModule, char const *label); WGPU_EXPORT void wgpuShaderModuleReference(WGPUShaderModule shaderModule); WGPU_EXPORT void wgpuShaderModuleRelease(WGPUShaderModule shaderModule); @@ -1881,7 +1881,7 @@ WGPU_EXPORT void wgpuSwapChainReference(WGPUSwapChain swapChain); WGPU_EXPORT void wgpuSwapChainRelease(WGPUSwapChain swapChain); // Methods of Texture -WGPU_EXPORT WGPUTextureView wgpuTextureCreateView(WGPUTexture texture, WGPUTextureViewDescriptor const * descriptor /* nullable */); +WGPU_EXPORT WGPUTextureView 
wgpuTextureCreateView(WGPUTexture texture, WGPUTextureViewDescriptor const *descriptor /* nullable */);
 WGPU_EXPORT void wgpuTextureDestroy(WGPUTexture texture);
 WGPU_EXPORT uint32_t wgpuTextureGetDepthOrArrayLayers(WGPUTexture texture);
 WGPU_EXPORT WGPUTextureDimension wgpuTextureGetDimension(WGPUTexture texture);
@@ -1891,19 +1891,19 @@ WGPU_EXPORT uint32_t wgpuTextureGetMipLevelCount(WGPUTexture texture);
 WGPU_EXPORT uint32_t wgpuTextureGetSampleCount(WGPUTexture texture);
 WGPU_EXPORT WGPUTextureUsage wgpuTextureGetUsage(WGPUTexture texture);
 WGPU_EXPORT uint32_t wgpuTextureGetWidth(WGPUTexture texture);
-WGPU_EXPORT void wgpuTextureSetLabel(WGPUTexture texture, char const * label);
+WGPU_EXPORT void wgpuTextureSetLabel(WGPUTexture texture, char const *label);
 WGPU_EXPORT void wgpuTextureReference(WGPUTexture texture);
 WGPU_EXPORT void wgpuTextureRelease(WGPUTexture texture);

 // Methods of TextureView
-WGPU_EXPORT void wgpuTextureViewSetLabel(WGPUTextureView textureView, char const * label);
+WGPU_EXPORT void wgpuTextureViewSetLabel(WGPUTextureView textureView, char const *label);
 WGPU_EXPORT void wgpuTextureViewReference(WGPUTextureView textureView);
 WGPU_EXPORT void wgpuTextureViewRelease(WGPUTextureView textureView);

 #endif // !defined(WGPU_SKIP_DECLARATIONS)

 #ifdef __cplusplus
-} // extern "C"
+}  // extern "C"
 #endif

-#endif // WEBGPU_H_
+#endif  // WEBGPU_H_
diff --git a/test/correctness/gpu_allocation_cache.cpp b/test/correctness/gpu_allocation_cache.cpp
index c0eb982fdf66..cbb864bd6409 100644
--- a/test/correctness/gpu_allocation_cache.cpp
+++ b/test/correctness/gpu_allocation_cache.cpp
@@ -31,7 +31,8 @@ int main(int argc, char **argv) {
     if (target.has_feature(Target::WebGPU)) {
         printf("[SKIP] Allocation cache not yet implemented for WebGPU.\n");
         return 0;
-    } const int N = 30;
+    }
+    const int N = 30;
     Var x, y, xi, yi;

     // Fixed size, overlapping lifetimes, looped 300 times. Should have 3 allocations live and OOM if there's a leak.
diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h
index 8c71612be3bc..9dd4c853329c 100644
--- a/test/correctness/simd_op_check.h
+++ b/test/correctness/simd_op_check.h
@@ -373,7 +373,7 @@ class SimdOpCheckTest {
        std::cout << "simd_op_check test seed: " << seed << "\n";

        for (const auto &t : targets_to_test) {
-           if(!t.supported()) {
+           if (!t.supported()) {
                std::cout << "[SKIP] Unsupported target: " << t << "\n";
                return 0;
            }
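The hunks above only adjust pointer spacing and brace placement in mini_webgpu.h and two correctness tests, and the commit that follows reverts mini_webgpu.h to the version on main, so the API surface itself never changes. For readers skimming this churn who are unfamiliar with the header, the short C sketch below illustrates how the callback-style entry points declared in mini_webgpu.h are typically driven. It is not part of this patch and not Halide runtime code; handle_adapter and request_default_adapter are hypothetical helper names.

/* Illustrative sketch only, assuming the declarations from src/runtime/mini_webgpu.h. */
#include "mini_webgpu.h"

static void handle_adapter(WGPURequestAdapterStatus status, WGPUAdapter adapter,
                           char const *message, void *userdata) {
    /* 'message' is only meaningful on failure; 'userdata' round-trips caller state. */
    (void)message;
    if (status == WGPURequestAdapterStatus_Success) {
        *(WGPUAdapter *)userdata = adapter;
    }
}

static void request_default_adapter(WGPUInstance instance, WGPUAdapter *out_adapter) {
    WGPURequestAdapterOptions options = {0};  /* nextInChain and compatibleSurface stay null */
    options.powerPreference = WGPUPowerPreference_HighPerformance;
    /* The callback may fire later, once the implementation processes pending events. */
    wgpuInstanceRequestAdapter(instance, &options, handle_adapter, out_adapter);
}
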
From 28bcdc93f4b04b6ef96b71acedca171e8f0a497d Mon Sep 17 00:00:00 2001
From: Derek Gerstmann
Date: Fri, 7 Apr 2023 10:38:17 -0700
Subject: [PATCH 159/166] Resolve conflicts for mini_webgpu.h ... revert to
 main

---
 src/runtime/mini_webgpu.h | 736 +++++++++++++++++++-------------------
 1 file changed, 370 insertions(+), 366 deletions(-)

diff --git a/src/runtime/mini_webgpu.h b/src/runtime/mini_webgpu.h
index d00a5cef43ae..856aaca693af 100644
--- a/src/runtime/mini_webgpu.h
+++ b/src/runtime/mini_webgpu.h
@@ -30,22 +30,24 @@
 #ifndef WEBGPU_H_
 #define WEBGPU_H_

+// clang-format off
+
 #if defined(WGPU_SHARED_LIBRARY)
-#if defined(_WIN32)
-#if defined(WGPU_IMPLEMENTATION)
-#define WGPU_EXPORT __declspec(dllexport)
-#else
-#define WGPU_EXPORT __declspec(dllimport)
-#endif
-#else // defined(_WIN32)
-#if defined(WGPU_IMPLEMENTATION)
-#define WGPU_EXPORT __attribute__((visibility("default")))
-#else
-#define WGPU_EXPORT
-#endif
-#endif // defined(_WIN32)
-#else // defined(WGPU_SHARED_LIBRARY)
-#define WGPU_EXPORT
+# if defined(_WIN32)
+# if defined(WGPU_IMPLEMENTATION)
+# define WGPU_EXPORT __declspec(dllexport)
+# else
+# define WGPU_EXPORT __declspec(dllimport)
+# endif
+# else // defined(_WIN32)
+# if defined(WGPU_IMPLEMENTATION)
+# define WGPU_EXPORT __attribute__((visibility("default")))
+# else
+# define WGPU_EXPORT
+# endif
+# endif // defined(_WIN32)
+#else // defined(WGPU_SHARED_LIBRARY)
+# define WGPU_EXPORT
 #endif // defined(WGPU_SHARED_LIBRARY)

 #define WGPU_ARRAY_LAYER_COUNT_UNDEFINED (0xffffffffUL)
@@ -59,30 +61,30 @@

 typedef uint32_t WGPUFlags;

-typedef struct WGPUAdapterImpl *WGPUAdapter;
-typedef struct WGPUBindGroupImpl *WGPUBindGroup;
-typedef struct WGPUBindGroupLayoutImpl *WGPUBindGroupLayout;
-typedef struct WGPUBufferImpl *WGPUBuffer;
-typedef struct WGPUCommandBufferImpl *WGPUCommandBuffer;
-typedef struct WGPUCommandEncoderImpl *WGPUCommandEncoder;
-typedef struct WGPUComputePassEncoderImpl *WGPUComputePassEncoder;
-typedef struct WGPUComputePipelineImpl *WGPUComputePipeline;
-typedef struct WGPUDeviceImpl *WGPUDevice;
-typedef struct WGPUExternalTextureImpl *WGPUExternalTexture;
-typedef struct WGPUInstanceImpl *WGPUInstance;
-typedef struct WGPUPipelineLayoutImpl *WGPUPipelineLayout;
-typedef struct WGPUQuerySetImpl *WGPUQuerySet;
-typedef struct WGPUQueueImpl *WGPUQueue;
-typedef struct WGPURenderBundleImpl *WGPURenderBundle;
-typedef struct WGPURenderBundleEncoderImpl *WGPURenderBundleEncoder;
-typedef struct WGPURenderPassEncoderImpl *WGPURenderPassEncoder;
-typedef struct WGPURenderPipelineImpl *WGPURenderPipeline;
-typedef struct WGPUSamplerImpl *WGPUSampler;
-typedef struct WGPUShaderModuleImpl *WGPUShaderModule;
-typedef struct WGPUSurfaceImpl *WGPUSurface;
-typedef struct WGPUSwapChainImpl *WGPUSwapChain;
-typedef struct WGPUTextureImpl *WGPUTexture;
-typedef struct WGPUTextureViewImpl *WGPUTextureView;
+typedef struct WGPUAdapterImpl* WGPUAdapter;
+typedef struct WGPUBindGroupImpl* WGPUBindGroup;
+typedef struct WGPUBindGroupLayoutImpl* WGPUBindGroupLayout;
+typedef struct WGPUBufferImpl* WGPUBuffer;
+typedef struct WGPUCommandBufferImpl* WGPUCommandBuffer;
+typedef struct WGPUCommandEncoderImpl* WGPUCommandEncoder;
+typedef struct WGPUComputePassEncoderImpl* WGPUComputePassEncoder;
+typedef struct WGPUComputePipelineImpl* WGPUComputePipeline;
+typedef struct WGPUDeviceImpl* WGPUDevice;
+typedef struct WGPUExternalTextureImpl* WGPUExternalTexture;
+typedef struct WGPUInstanceImpl* WGPUInstance;
+typedef struct WGPUPipelineLayoutImpl* WGPUPipelineLayout;
+typedef struct WGPUQuerySetImpl* WGPUQuerySet;
+typedef struct WGPUQueueImpl* WGPUQueue;
+typedef struct WGPURenderBundleImpl* WGPURenderBundle;
+typedef struct WGPURenderBundleEncoderImpl* 
WGPURenderBundleEncoder; +typedef struct WGPURenderPassEncoderImpl* WGPURenderPassEncoder; +typedef struct WGPURenderPipelineImpl* WGPURenderPipeline; +typedef struct WGPUSamplerImpl* WGPUSampler; +typedef struct WGPUShaderModuleImpl* WGPUShaderModule; +typedef struct WGPUSurfaceImpl* WGPUSurface; +typedef struct WGPUSwapChainImpl* WGPUSwapChain; +typedef struct WGPUTextureImpl* WGPUTexture; +typedef struct WGPUTextureViewImpl* WGPUTextureView; typedef enum WGPUAdapterType { WGPUAdapterType_DiscreteGPU = 0x00000000, @@ -680,35 +682,35 @@ typedef enum WGPUTextureUsage { typedef WGPUFlags WGPUTextureUsageFlags; typedef struct WGPUChainedStruct { - struct WGPUChainedStruct const *next; + struct WGPUChainedStruct const * next; WGPUSType sType; } WGPUChainedStruct; typedef struct WGPUChainedStructOut { - struct WGPUChainedStructOut *next; + struct WGPUChainedStructOut * next; WGPUSType sType; } WGPUChainedStructOut; typedef struct WGPUAdapterProperties { - WGPUChainedStructOut *nextInChain; + WGPUChainedStructOut * nextInChain; uint32_t vendorID; - char const *vendorName; - char const *architecture; + char const * vendorName; + char const * architecture; uint32_t deviceID; - char const *name; - char const *driverDescription; + char const * name; + char const * driverDescription; WGPUAdapterType adapterType; WGPUBackendType backendType; } WGPUAdapterProperties; typedef struct WGPUBindGroupEntry { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; uint32_t binding; - WGPUBuffer buffer; // nullable + WGPUBuffer buffer; // nullable uint64_t offset; uint64_t size; - WGPUSampler sampler; // nullable - WGPUTextureView textureView; // nullable + WGPUSampler sampler; // nullable + WGPUTextureView textureView; // nullable } WGPUBindGroupEntry; typedef struct WGPUBlendComponent { @@ -718,15 +720,15 @@ typedef struct WGPUBlendComponent { } WGPUBlendComponent; typedef struct WGPUBufferBindingLayout { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUBufferBindingType type; bool hasDynamicOffset; uint64_t minBindingSize; } WGPUBufferBindingLayout; typedef struct WGPUBufferDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable WGPUBufferUsageFlags usage; uint64_t size; bool mappedAtCreation; @@ -740,18 +742,18 @@ typedef struct WGPUColor { } WGPUColor; typedef struct WGPUCommandBufferDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable } WGPUCommandBufferDescriptor; typedef struct WGPUCommandEncoderDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable } WGPUCommandEncoderDescriptor; typedef struct WGPUCompilationMessage { - WGPUChainedStruct const *nextInChain; - char const *message; // nullable + WGPUChainedStruct const * nextInChain; + char const * message; // nullable WGPUCompilationMessageType type; uint64_t lineNum; uint64_t linePos; @@ -769,19 +771,19 @@ typedef struct WGPUComputePassTimestampWrite { } WGPUComputePassTimestampWrite; typedef struct WGPUConstantEntry { - WGPUChainedStruct const *nextInChain; - char const *key; + WGPUChainedStruct const * nextInChain; + char const * key; double value; } WGPUConstantEntry; typedef struct WGPUCopyTextureForBrowserOptions { - WGPUChainedStruct const *nextInChain; + 
WGPUChainedStruct const * nextInChain; bool flipY; bool needsColorSpaceConversion; WGPUAlphaMode srcAlphaMode; - float const *srcTransferFunctionParameters; // nullable - float const *conversionMatrix; // nullable - float const *dstTransferFunctionParameters; // nullable + float const * srcTransferFunctionParameters; // nullable + float const * conversionMatrix; // nullable + float const * dstTransferFunctionParameters; // nullable WGPUAlphaMode dstAlphaMode; bool internalUsage; } WGPUCopyTextureForBrowserOptions; @@ -801,7 +803,7 @@ typedef struct WGPUDawnBufferDescriptorErrorInfoFromWireClient { // Can be chained in WGPUDeviceDescriptor typedef struct WGPUDawnCacheDeviceDescriptor { WGPUChainedStruct chain; - char const *isolationKey; + char const * isolationKey; } WGPUDawnCacheDeviceDescriptor; // Can be chained in WGPUCommandEncoderDescriptor @@ -814,7 +816,7 @@ typedef struct WGPUDawnEncoderInternalUsageDescriptor { typedef struct WGPUDawnInstanceDescriptor { WGPUChainedStruct chain; uint32_t additionalRuntimeSearchPathsCount; - const char *const *additionalRuntimeSearchPaths; + const char* const * additionalRuntimeSearchPaths; } WGPUDawnInstanceDescriptor; // Can be chained in WGPUShaderModuleDescriptor @@ -834,18 +836,18 @@ typedef struct WGPUDawnTextureInternalUsageDescriptor { typedef struct WGPUDawnTogglesDescriptor { WGPUChainedStruct chain; uint32_t enabledTogglesCount; - const char *const *enabledToggles; + const char* const * enabledToggles; uint32_t disabledTogglesCount; - const char *const *disabledToggles; + const char* const * disabledToggles; } WGPUDawnTogglesDescriptor; // Can be chained in WGPUDeviceDescriptor typedef struct WGPUDawnTogglesDeviceDescriptor { WGPUChainedStruct chain; uint32_t forceEnabledTogglesCount; - const char *const *forceEnabledToggles; + const char* const * forceEnabledToggles; uint32_t forceDisabledTogglesCount; - const char *const *forceDisabledToggles; + const char* const * forceDisabledToggles; } WGPUDawnTogglesDeviceDescriptor; typedef struct WGPUExtent2D { @@ -871,7 +873,7 @@ typedef struct WGPUExternalTextureBindingLayout { } WGPUExternalTextureBindingLayout; typedef struct WGPUInstanceDescriptor { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; } WGPUInstanceDescriptor; typedef struct WGPULimits { @@ -910,7 +912,7 @@ typedef struct WGPULimits { } WGPULimits; typedef struct WGPUMultisampleState { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; uint32_t count; uint32_t mask; bool alphaToCoverageEnabled; @@ -928,10 +930,10 @@ typedef struct WGPUOrigin3D { } WGPUOrigin3D; typedef struct WGPUPipelineLayoutDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable uint32_t bindGroupLayoutCount; - WGPUBindGroupLayout const *bindGroupLayouts; + WGPUBindGroupLayout const * bindGroupLayouts; } WGPUPipelineLayoutDescriptor; // Can be chained in WGPUPrimitiveState @@ -941,7 +943,7 @@ typedef struct WGPUPrimitiveDepthClipControl { } WGPUPrimitiveDepthClipControl; typedef struct WGPUPrimitiveState { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUPrimitiveTopology topology; WGPUIndexFormat stripIndexFormat; WGPUFrontFace frontFace; @@ -949,29 +951,29 @@ typedef struct WGPUPrimitiveState { } WGPUPrimitiveState; typedef struct WGPUQuerySetDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * 
nextInChain; + char const * label; // nullable WGPUQueryType type; uint32_t count; - WGPUPipelineStatisticName const *pipelineStatistics; + WGPUPipelineStatisticName const * pipelineStatistics; uint32_t pipelineStatisticsCount; } WGPUQuerySetDescriptor; typedef struct WGPUQueueDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable } WGPUQueueDescriptor; typedef struct WGPURenderBundleDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable } WGPURenderBundleDescriptor; typedef struct WGPURenderBundleEncoderDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable uint32_t colorFormatsCount; - WGPUTextureFormat const *colorFormats; + WGPUTextureFormat const * colorFormats; WGPUTextureFormat depthStencilFormat; uint32_t sampleCount; bool depthReadOnly; @@ -1003,20 +1005,20 @@ typedef struct WGPURenderPassTimestampWrite { } WGPURenderPassTimestampWrite; typedef struct WGPURequestAdapterOptions { - WGPUChainedStruct const *nextInChain; - WGPUSurface compatibleSurface; // nullable + WGPUChainedStruct const * nextInChain; + WGPUSurface compatibleSurface; // nullable WGPUPowerPreference powerPreference; bool forceFallbackAdapter; } WGPURequestAdapterOptions; typedef struct WGPUSamplerBindingLayout { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUSamplerBindingType type; } WGPUSamplerBindingLayout; typedef struct WGPUSamplerDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable WGPUAddressMode addressModeU; WGPUAddressMode addressModeV; WGPUAddressMode addressModeW; @@ -1030,21 +1032,21 @@ typedef struct WGPUSamplerDescriptor { } WGPUSamplerDescriptor; typedef struct WGPUShaderModuleDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable } WGPUShaderModuleDescriptor; // Can be chained in WGPUShaderModuleDescriptor typedef struct WGPUShaderModuleSPIRVDescriptor { WGPUChainedStruct chain; uint32_t codeSize; - uint32_t const *code; + uint32_t const * code; } WGPUShaderModuleSPIRVDescriptor; // Can be chained in WGPUShaderModuleDescriptor typedef struct WGPUShaderModuleWGSLDescriptor { WGPUChainedStruct chain; - char const *source; + char const * source; } WGPUShaderModuleWGSLDescriptor; typedef struct WGPUStencilFaceState { @@ -1055,71 +1057,71 @@ typedef struct WGPUStencilFaceState { } WGPUStencilFaceState; typedef struct WGPUStorageTextureBindingLayout { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUStorageTextureAccess access; WGPUTextureFormat format; WGPUTextureViewDimension viewDimension; } WGPUStorageTextureBindingLayout; typedef struct WGPUSurfaceDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable } WGPUSurfaceDescriptor; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromAndroidNativeWindow { WGPUChainedStruct chain; - void *window; + void * window; } WGPUSurfaceDescriptorFromAndroidNativeWindow; // Can be chained in WGPUSurfaceDescriptor typedef struct 
WGPUSurfaceDescriptorFromCanvasHTMLSelector { WGPUChainedStruct chain; - char const *selector; + char const * selector; } WGPUSurfaceDescriptorFromCanvasHTMLSelector; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromMetalLayer { WGPUChainedStruct chain; - void *layer; + void * layer; } WGPUSurfaceDescriptorFromMetalLayer; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWaylandSurface { WGPUChainedStruct chain; - void *display; - void *surface; + void * display; + void * surface; } WGPUSurfaceDescriptorFromWaylandSurface; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsCoreWindow { WGPUChainedStruct chain; - void *coreWindow; + void * coreWindow; } WGPUSurfaceDescriptorFromWindowsCoreWindow; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsHWND { WGPUChainedStruct chain; - void *hinstance; - void *hwnd; + void * hinstance; + void * hwnd; } WGPUSurfaceDescriptorFromWindowsHWND; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsSwapChainPanel { WGPUChainedStruct chain; - void *swapChainPanel; + void * swapChainPanel; } WGPUSurfaceDescriptorFromWindowsSwapChainPanel; // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromXlibWindow { WGPUChainedStruct chain; - void *display; + void * display; uint32_t window; } WGPUSurfaceDescriptorFromXlibWindow; typedef struct WGPUSwapChainDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable WGPUTextureUsageFlags usage; WGPUTextureFormat format; uint32_t width; @@ -1129,22 +1131,22 @@ typedef struct WGPUSwapChainDescriptor { } WGPUSwapChainDescriptor; typedef struct WGPUTextureBindingLayout { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUTextureSampleType sampleType; WGPUTextureViewDimension viewDimension; bool multisampled; } WGPUTextureBindingLayout; typedef struct WGPUTextureDataLayout { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; uint64_t offset; uint32_t bytesPerRow; uint32_t rowsPerImage; } WGPUTextureDataLayout; typedef struct WGPUTextureViewDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable WGPUTextureFormat format; WGPUTextureViewDimension dimension; uint32_t baseMipLevel; @@ -1161,15 +1163,15 @@ typedef struct WGPUVertexAttribute { } WGPUVertexAttribute; typedef struct WGPUBindGroupDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable WGPUBindGroupLayout layout; uint32_t entryCount; - WGPUBindGroupEntry const *entries; + WGPUBindGroupEntry const * entries; } WGPUBindGroupDescriptor; typedef struct WGPUBindGroupLayoutEntry { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; uint32_t binding; WGPUShaderStageFlags visibility; WGPUBufferBindingLayout buffer; @@ -1184,20 +1186,20 @@ typedef struct WGPUBlendState { } WGPUBlendState; typedef struct WGPUCompilationInfo { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; uint32_t messageCount; - WGPUCompilationMessage const *messages; + WGPUCompilationMessage const * messages; } WGPUCompilationInfo; typedef struct 
WGPUComputePassDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable uint32_t timestampWriteCount; - WGPUComputePassTimestampWrite const *timestampWrites; + WGPUComputePassTimestampWrite const * timestampWrites; } WGPUComputePassDescriptor; typedef struct WGPUDepthStencilState { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUTextureFormat format; bool depthWriteEnabled; WGPUCompareFunction depthCompare; @@ -1211,35 +1213,35 @@ typedef struct WGPUDepthStencilState { } WGPUDepthStencilState; typedef struct WGPUExternalTextureDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable WGPUTextureView plane0; - WGPUTextureView plane1; // nullable + WGPUTextureView plane1; // nullable WGPUOrigin2D visibleOrigin; WGPUExtent2D visibleSize; bool doYuvToRgbConversionOnly; - float const *yuvToRgbConversionMatrix; // nullable - float const *srcTransferFunctionParameters; - float const *dstTransferFunctionParameters; - float const *gamutConversionMatrix; + float const * yuvToRgbConversionMatrix; // nullable + float const * srcTransferFunctionParameters; + float const * dstTransferFunctionParameters; + float const * gamutConversionMatrix; bool flipY; WGPUExternalTextureRotation rotation; } WGPUExternalTextureDescriptor; typedef struct WGPUImageCopyBuffer { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUTextureDataLayout layout; WGPUBuffer buffer; } WGPUImageCopyBuffer; typedef struct WGPUImageCopyExternalTexture { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUExternalTexture externalTexture; WGPUOrigin3D origin; } WGPUImageCopyExternalTexture; typedef struct WGPUImageCopyTexture { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUTexture texture; uint32_t mipLevel; WGPUOrigin3D origin; @@ -1247,34 +1249,34 @@ typedef struct WGPUImageCopyTexture { } WGPUImageCopyTexture; typedef struct WGPUProgrammableStageDescriptor { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUShaderModule module; - char const *entryPoint; + char const * entryPoint; uint32_t constantCount; - WGPUConstantEntry const *constants; + WGPUConstantEntry const * constants; } WGPUProgrammableStageDescriptor; typedef struct WGPURenderPassColorAttachment { - WGPUTextureView view; // nullable - WGPUTextureView resolveTarget; // nullable + WGPUTextureView view; // nullable + WGPUTextureView resolveTarget; // nullable WGPULoadOp loadOp; WGPUStoreOp storeOp; WGPUColor clearValue; } WGPURenderPassColorAttachment; typedef struct WGPURequiredLimits { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPULimits limits; } WGPURequiredLimits; typedef struct WGPUSupportedLimits { - WGPUChainedStructOut *nextInChain; + WGPUChainedStructOut * nextInChain; WGPULimits limits; } WGPUSupportedLimits; typedef struct WGPUTextureDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable WGPUTextureUsageFlags usage; WGPUTextureDimension dimension; WGPUExtent3D size; @@ -1282,164 +1284,164 @@ typedef struct WGPUTextureDescriptor { uint32_t mipLevelCount; uint32_t sampleCount; uint32_t viewFormatCount; - WGPUTextureFormat const *viewFormats; + 
WGPUTextureFormat const * viewFormats; } WGPUTextureDescriptor; typedef struct WGPUVertexBufferLayout { uint64_t arrayStride; WGPUVertexStepMode stepMode; uint32_t attributeCount; - WGPUVertexAttribute const *attributes; + WGPUVertexAttribute const * attributes; } WGPUVertexBufferLayout; typedef struct WGPUBindGroupLayoutDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable uint32_t entryCount; - WGPUBindGroupLayoutEntry const *entries; + WGPUBindGroupLayoutEntry const * entries; } WGPUBindGroupLayoutDescriptor; typedef struct WGPUColorTargetState { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUTextureFormat format; - WGPUBlendState const *blend; // nullable + WGPUBlendState const * blend; // nullable WGPUColorWriteMaskFlags writeMask; } WGPUColorTargetState; typedef struct WGPUComputePipelineDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable - WGPUPipelineLayout layout; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable + WGPUPipelineLayout layout; // nullable WGPUProgrammableStageDescriptor compute; } WGPUComputePipelineDescriptor; typedef struct WGPUDeviceDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable uint32_t requiredFeaturesCount; - WGPUFeatureName const *requiredFeatures; - WGPURequiredLimits const *requiredLimits; // nullable + WGPUFeatureName const * requiredFeatures; + WGPURequiredLimits const * requiredLimits; // nullable WGPUQueueDescriptor defaultQueue; } WGPUDeviceDescriptor; typedef struct WGPURenderPassDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable + WGPUChainedStruct const * nextInChain; + char const * label; // nullable uint32_t colorAttachmentCount; - WGPURenderPassColorAttachment const *colorAttachments; - WGPURenderPassDepthStencilAttachment const *depthStencilAttachment; // nullable - WGPUQuerySet occlusionQuerySet; // nullable + WGPURenderPassColorAttachment const * colorAttachments; + WGPURenderPassDepthStencilAttachment const * depthStencilAttachment; // nullable + WGPUQuerySet occlusionQuerySet; // nullable uint32_t timestampWriteCount; - WGPURenderPassTimestampWrite const *timestampWrites; + WGPURenderPassTimestampWrite const * timestampWrites; } WGPURenderPassDescriptor; typedef struct WGPUVertexState { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUShaderModule module; - char const *entryPoint; + char const * entryPoint; uint32_t constantCount; - WGPUConstantEntry const *constants; + WGPUConstantEntry const * constants; uint32_t bufferCount; - WGPUVertexBufferLayout const *buffers; + WGPUVertexBufferLayout const * buffers; } WGPUVertexState; typedef struct WGPUFragmentState { - WGPUChainedStruct const *nextInChain; + WGPUChainedStruct const * nextInChain; WGPUShaderModule module; - char const *entryPoint; + char const * entryPoint; uint32_t constantCount; - WGPUConstantEntry const *constants; + WGPUConstantEntry const * constants; uint32_t targetCount; - WGPUColorTargetState const *targets; + WGPUColorTargetState const * targets; } WGPUFragmentState; typedef struct WGPURenderPipelineDescriptor { - WGPUChainedStruct const *nextInChain; - char const *label; // nullable - WGPUPipelineLayout layout; // nullable + WGPUChainedStruct const * nextInChain; + char const * 
label; // nullable + WGPUPipelineLayout layout; // nullable WGPUVertexState vertex; WGPUPrimitiveState primitive; - WGPUDepthStencilState const *depthStencil; // nullable + WGPUDepthStencilState const * depthStencil; // nullable WGPUMultisampleState multisample; - WGPUFragmentState const *fragment; // nullable + WGPUFragmentState const * fragment; // nullable } WGPURenderPipelineDescriptor; #ifdef __cplusplus extern "C" { #endif -typedef void (*WGPUBufferMapCallback)(WGPUBufferMapAsyncStatus status, void *userdata); -typedef void (*WGPUCompilationInfoCallback)(WGPUCompilationInfoRequestStatus status, WGPUCompilationInfo const *compilationInfo, void *userdata); -typedef void (*WGPUCreateComputePipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPUComputePipeline pipeline, char const *message, void *userdata); -typedef void (*WGPUCreateRenderPipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPURenderPipeline pipeline, char const *message, void *userdata); -typedef void (*WGPUDeviceLostCallback)(WGPUDeviceLostReason reason, char const *message, void *userdata); -typedef void (*WGPUErrorCallback)(WGPUErrorType type, char const *message, void *userdata); -typedef void (*WGPULoggingCallback)(WGPULoggingType type, char const *message, void *userdata); +typedef void (*WGPUBufferMapCallback)(WGPUBufferMapAsyncStatus status, void * userdata); +typedef void (*WGPUCompilationInfoCallback)(WGPUCompilationInfoRequestStatus status, WGPUCompilationInfo const * compilationInfo, void * userdata); +typedef void (*WGPUCreateComputePipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPUComputePipeline pipeline, char const * message, void * userdata); +typedef void (*WGPUCreateRenderPipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPURenderPipeline pipeline, char const * message, void * userdata); +typedef void (*WGPUDeviceLostCallback)(WGPUDeviceLostReason reason, char const * message, void * userdata); +typedef void (*WGPUErrorCallback)(WGPUErrorType type, char const * message, void * userdata); +typedef void (*WGPULoggingCallback)(WGPULoggingType type, char const * message, void * userdata); typedef void (*WGPUProc)(); -typedef void (*WGPUQueueWorkDoneCallback)(WGPUQueueWorkDoneStatus status, void *userdata); -typedef void (*WGPURequestAdapterCallback)(WGPURequestAdapterStatus status, WGPUAdapter adapter, char const *message, void *userdata); -typedef void (*WGPURequestDeviceCallback)(WGPURequestDeviceStatus status, WGPUDevice device, char const *message, void *userdata); +typedef void (*WGPUQueueWorkDoneCallback)(WGPUQueueWorkDoneStatus status, void * userdata); +typedef void (*WGPURequestAdapterCallback)(WGPURequestAdapterStatus status, WGPUAdapter adapter, char const * message, void * userdata); +typedef void (*WGPURequestDeviceCallback)(WGPURequestDeviceStatus status, WGPUDevice device, char const * message, void * userdata); #if !defined(WGPU_SKIP_PROCS) -typedef WGPUInstance (*WGPUProcCreateInstance)(WGPUInstanceDescriptor const *descriptor); -typedef WGPUProc (*WGPUProcGetProcAddress)(WGPUDevice device, char const *procName); +typedef WGPUInstance (*WGPUProcCreateInstance)(WGPUInstanceDescriptor const * descriptor); +typedef WGPUProc (*WGPUProcGetProcAddress)(WGPUDevice device, char const * procName); // Procs of Adapter -typedef WGPUDevice (*WGPUProcAdapterCreateDevice)(WGPUAdapter adapter, WGPUDeviceDescriptor const *descriptor /* nullable */); -typedef size_t (*WGPUProcAdapterEnumerateFeatures)(WGPUAdapter adapter, WGPUFeatureName *features); -typedef 
bool (*WGPUProcAdapterGetLimits)(WGPUAdapter adapter, WGPUSupportedLimits *limits); -typedef void (*WGPUProcAdapterGetProperties)(WGPUAdapter adapter, WGPUAdapterProperties *properties); +typedef WGPUDevice (*WGPUProcAdapterCreateDevice)(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor /* nullable */); +typedef size_t (*WGPUProcAdapterEnumerateFeatures)(WGPUAdapter adapter, WGPUFeatureName * features); +typedef bool (*WGPUProcAdapterGetLimits)(WGPUAdapter adapter, WGPUSupportedLimits * limits); +typedef void (*WGPUProcAdapterGetProperties)(WGPUAdapter adapter, WGPUAdapterProperties * properties); typedef bool (*WGPUProcAdapterHasFeature)(WGPUAdapter adapter, WGPUFeatureName feature); -typedef void (*WGPUProcAdapterRequestDevice)(WGPUAdapter adapter, WGPUDeviceDescriptor const *descriptor /* nullable */, WGPURequestDeviceCallback callback, void *userdata); +typedef void (*WGPUProcAdapterRequestDevice)(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor /* nullable */, WGPURequestDeviceCallback callback, void * userdata); typedef void (*WGPUProcAdapterReference)(WGPUAdapter adapter); typedef void (*WGPUProcAdapterRelease)(WGPUAdapter adapter); // Procs of BindGroup -typedef void (*WGPUProcBindGroupSetLabel)(WGPUBindGroup bindGroup, char const *label); +typedef void (*WGPUProcBindGroupSetLabel)(WGPUBindGroup bindGroup, char const * label); typedef void (*WGPUProcBindGroupReference)(WGPUBindGroup bindGroup); typedef void (*WGPUProcBindGroupRelease)(WGPUBindGroup bindGroup); // Procs of BindGroupLayout -typedef void (*WGPUProcBindGroupLayoutSetLabel)(WGPUBindGroupLayout bindGroupLayout, char const *label); +typedef void (*WGPUProcBindGroupLayoutSetLabel)(WGPUBindGroupLayout bindGroupLayout, char const * label); typedef void (*WGPUProcBindGroupLayoutReference)(WGPUBindGroupLayout bindGroupLayout); typedef void (*WGPUProcBindGroupLayoutRelease)(WGPUBindGroupLayout bindGroupLayout); // Procs of Buffer typedef void (*WGPUProcBufferDestroy)(WGPUBuffer buffer); -typedef void const *(*WGPUProcBufferGetConstMappedRange)(WGPUBuffer buffer, size_t offset, size_t size); +typedef void const * (*WGPUProcBufferGetConstMappedRange)(WGPUBuffer buffer, size_t offset, size_t size); typedef WGPUBufferMapState (*WGPUProcBufferGetMapState)(WGPUBuffer buffer); -typedef void *(*WGPUProcBufferGetMappedRange)(WGPUBuffer buffer, size_t offset, size_t size); +typedef void * (*WGPUProcBufferGetMappedRange)(WGPUBuffer buffer, size_t offset, size_t size); typedef uint64_t (*WGPUProcBufferGetSize)(WGPUBuffer buffer); typedef WGPUBufferUsage (*WGPUProcBufferGetUsage)(WGPUBuffer buffer); -typedef void (*WGPUProcBufferMapAsync)(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void *userdata); -typedef void (*WGPUProcBufferSetLabel)(WGPUBuffer buffer, char const *label); +typedef void (*WGPUProcBufferMapAsync)(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void * userdata); +typedef void (*WGPUProcBufferSetLabel)(WGPUBuffer buffer, char const * label); typedef void (*WGPUProcBufferUnmap)(WGPUBuffer buffer); typedef void (*WGPUProcBufferReference)(WGPUBuffer buffer); typedef void (*WGPUProcBufferRelease)(WGPUBuffer buffer); // Procs of CommandBuffer -typedef void (*WGPUProcCommandBufferSetLabel)(WGPUCommandBuffer commandBuffer, char const *label); +typedef void (*WGPUProcCommandBufferSetLabel)(WGPUCommandBuffer commandBuffer, char const * label); typedef void 
(*WGPUProcCommandBufferReference)(WGPUCommandBuffer commandBuffer); typedef void (*WGPUProcCommandBufferRelease)(WGPUCommandBuffer commandBuffer); // Procs of CommandEncoder -typedef WGPUComputePassEncoder (*WGPUProcCommandEncoderBeginComputePass)(WGPUCommandEncoder commandEncoder, WGPUComputePassDescriptor const *descriptor /* nullable */); -typedef WGPURenderPassEncoder (*WGPUProcCommandEncoderBeginRenderPass)(WGPUCommandEncoder commandEncoder, WGPURenderPassDescriptor const *descriptor); +typedef WGPUComputePassEncoder (*WGPUProcCommandEncoderBeginComputePass)(WGPUCommandEncoder commandEncoder, WGPUComputePassDescriptor const * descriptor /* nullable */); +typedef WGPURenderPassEncoder (*WGPUProcCommandEncoderBeginRenderPass)(WGPUCommandEncoder commandEncoder, WGPURenderPassDescriptor const * descriptor); typedef void (*WGPUProcCommandEncoderClearBuffer)(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t offset, uint64_t size); typedef void (*WGPUProcCommandEncoderCopyBufferToBuffer)(WGPUCommandEncoder commandEncoder, WGPUBuffer source, uint64_t sourceOffset, WGPUBuffer destination, uint64_t destinationOffset, uint64_t size); -typedef void (*WGPUProcCommandEncoderCopyBufferToTexture)(WGPUCommandEncoder commandEncoder, WGPUImageCopyBuffer const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); -typedef void (*WGPUProcCommandEncoderCopyTextureToBuffer)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyBuffer const *destination, WGPUExtent3D const *copySize); -typedef void (*WGPUProcCommandEncoderCopyTextureToTexture)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); -typedef void (*WGPUProcCommandEncoderCopyTextureToTextureInternal)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); -typedef WGPUCommandBuffer (*WGPUProcCommandEncoderFinish)(WGPUCommandEncoder commandEncoder, WGPUCommandBufferDescriptor const *descriptor /* nullable */); -typedef void (*WGPUProcCommandEncoderInjectValidationError)(WGPUCommandEncoder commandEncoder, char const *message); -typedef void (*WGPUProcCommandEncoderInsertDebugMarker)(WGPUCommandEncoder commandEncoder, char const *markerLabel); +typedef void (*WGPUProcCommandEncoderCopyBufferToTexture)(WGPUCommandEncoder commandEncoder, WGPUImageCopyBuffer const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); +typedef void (*WGPUProcCommandEncoderCopyTextureToBuffer)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyBuffer const * destination, WGPUExtent3D const * copySize); +typedef void (*WGPUProcCommandEncoderCopyTextureToTexture)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); +typedef void (*WGPUProcCommandEncoderCopyTextureToTextureInternal)(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); +typedef WGPUCommandBuffer (*WGPUProcCommandEncoderFinish)(WGPUCommandEncoder commandEncoder, WGPUCommandBufferDescriptor const * descriptor /* nullable */); +typedef void (*WGPUProcCommandEncoderInjectValidationError)(WGPUCommandEncoder commandEncoder, char const * message); +typedef void (*WGPUProcCommandEncoderInsertDebugMarker)(WGPUCommandEncoder 
commandEncoder, char const * markerLabel); typedef void (*WGPUProcCommandEncoderPopDebugGroup)(WGPUCommandEncoder commandEncoder); -typedef void (*WGPUProcCommandEncoderPushDebugGroup)(WGPUCommandEncoder commandEncoder, char const *groupLabel); +typedef void (*WGPUProcCommandEncoderPushDebugGroup)(WGPUCommandEncoder commandEncoder, char const * groupLabel); typedef void (*WGPUProcCommandEncoderResolveQuerySet)(WGPUCommandEncoder commandEncoder, WGPUQuerySet querySet, uint32_t firstQuery, uint32_t queryCount, WGPUBuffer destination, uint64_t destinationOffset); -typedef void (*WGPUProcCommandEncoderSetLabel)(WGPUCommandEncoder commandEncoder, char const *label); -typedef void (*WGPUProcCommandEncoderWriteBuffer)(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t bufferOffset, uint8_t const *data, uint64_t size); +typedef void (*WGPUProcCommandEncoderSetLabel)(WGPUCommandEncoder commandEncoder, char const * label); +typedef void (*WGPUProcCommandEncoderWriteBuffer)(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t bufferOffset, uint8_t const * data, uint64_t size); typedef void (*WGPUProcCommandEncoderWriteTimestamp)(WGPUCommandEncoder commandEncoder, WGPUQuerySet querySet, uint32_t queryIndex); typedef void (*WGPUProcCommandEncoderReference)(WGPUCommandEncoder commandEncoder); typedef void (*WGPUProcCommandEncoderRelease)(WGPUCommandEncoder commandEncoder); @@ -1451,11 +1453,11 @@ typedef void (*WGPUProcComputePassEncoderDispatchWorkgroups)(WGPUComputePassEnco typedef void (*WGPUProcComputePassEncoderDispatchWorkgroupsIndirect)(WGPUComputePassEncoder computePassEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); typedef void (*WGPUProcComputePassEncoderEnd)(WGPUComputePassEncoder computePassEncoder); typedef void (*WGPUProcComputePassEncoderEndPass)(WGPUComputePassEncoder computePassEncoder); -typedef void (*WGPUProcComputePassEncoderInsertDebugMarker)(WGPUComputePassEncoder computePassEncoder, char const *markerLabel); +typedef void (*WGPUProcComputePassEncoderInsertDebugMarker)(WGPUComputePassEncoder computePassEncoder, char const * markerLabel); typedef void (*WGPUProcComputePassEncoderPopDebugGroup)(WGPUComputePassEncoder computePassEncoder); -typedef void (*WGPUProcComputePassEncoderPushDebugGroup)(WGPUComputePassEncoder computePassEncoder, char const *groupLabel); -typedef void (*WGPUProcComputePassEncoderSetBindGroup)(WGPUComputePassEncoder computePassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); -typedef void (*WGPUProcComputePassEncoderSetLabel)(WGPUComputePassEncoder computePassEncoder, char const *label); +typedef void (*WGPUProcComputePassEncoderPushDebugGroup)(WGPUComputePassEncoder computePassEncoder, char const * groupLabel); +typedef void (*WGPUProcComputePassEncoderSetBindGroup)(WGPUComputePassEncoder computePassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); +typedef void (*WGPUProcComputePassEncoderSetLabel)(WGPUComputePassEncoder computePassEncoder, char const * label); typedef void (*WGPUProcComputePassEncoderSetPipeline)(WGPUComputePassEncoder computePassEncoder, WGPUComputePipeline pipeline); typedef void (*WGPUProcComputePassEncoderWriteTimestamp)(WGPUComputePassEncoder computePassEncoder, WGPUQuerySet querySet, uint32_t queryIndex); typedef void (*WGPUProcComputePassEncoderReference)(WGPUComputePassEncoder computePassEncoder); @@ -1463,46 +1465,46 @@ typedef void 
(*WGPUProcComputePassEncoderRelease)(WGPUComputePassEncoder compute // Procs of ComputePipeline typedef WGPUBindGroupLayout (*WGPUProcComputePipelineGetBindGroupLayout)(WGPUComputePipeline computePipeline, uint32_t groupIndex); -typedef void (*WGPUProcComputePipelineSetLabel)(WGPUComputePipeline computePipeline, char const *label); +typedef void (*WGPUProcComputePipelineSetLabel)(WGPUComputePipeline computePipeline, char const * label); typedef void (*WGPUProcComputePipelineReference)(WGPUComputePipeline computePipeline); typedef void (*WGPUProcComputePipelineRelease)(WGPUComputePipeline computePipeline); // Procs of Device -typedef WGPUBindGroup (*WGPUProcDeviceCreateBindGroup)(WGPUDevice device, WGPUBindGroupDescriptor const *descriptor); -typedef WGPUBindGroupLayout (*WGPUProcDeviceCreateBindGroupLayout)(WGPUDevice device, WGPUBindGroupLayoutDescriptor const *descriptor); -typedef WGPUBuffer (*WGPUProcDeviceCreateBuffer)(WGPUDevice device, WGPUBufferDescriptor const *descriptor); -typedef WGPUCommandEncoder (*WGPUProcDeviceCreateCommandEncoder)(WGPUDevice device, WGPUCommandEncoderDescriptor const *descriptor /* nullable */); -typedef WGPUComputePipeline (*WGPUProcDeviceCreateComputePipeline)(WGPUDevice device, WGPUComputePipelineDescriptor const *descriptor); -typedef void (*WGPUProcDeviceCreateComputePipelineAsync)(WGPUDevice device, WGPUComputePipelineDescriptor const *descriptor, WGPUCreateComputePipelineAsyncCallback callback, void *userdata); -typedef WGPUBuffer (*WGPUProcDeviceCreateErrorBuffer)(WGPUDevice device, WGPUBufferDescriptor const *descriptor); +typedef WGPUBindGroup (*WGPUProcDeviceCreateBindGroup)(WGPUDevice device, WGPUBindGroupDescriptor const * descriptor); +typedef WGPUBindGroupLayout (*WGPUProcDeviceCreateBindGroupLayout)(WGPUDevice device, WGPUBindGroupLayoutDescriptor const * descriptor); +typedef WGPUBuffer (*WGPUProcDeviceCreateBuffer)(WGPUDevice device, WGPUBufferDescriptor const * descriptor); +typedef WGPUCommandEncoder (*WGPUProcDeviceCreateCommandEncoder)(WGPUDevice device, WGPUCommandEncoderDescriptor const * descriptor /* nullable */); +typedef WGPUComputePipeline (*WGPUProcDeviceCreateComputePipeline)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor); +typedef void (*WGPUProcDeviceCreateComputePipelineAsync)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallback callback, void * userdata); +typedef WGPUBuffer (*WGPUProcDeviceCreateErrorBuffer)(WGPUDevice device, WGPUBufferDescriptor const * descriptor); typedef WGPUExternalTexture (*WGPUProcDeviceCreateErrorExternalTexture)(WGPUDevice device); -typedef WGPUTexture (*WGPUProcDeviceCreateErrorTexture)(WGPUDevice device, WGPUTextureDescriptor const *descriptor); -typedef WGPUExternalTexture (*WGPUProcDeviceCreateExternalTexture)(WGPUDevice device, WGPUExternalTextureDescriptor const *externalTextureDescriptor); -typedef WGPUPipelineLayout (*WGPUProcDeviceCreatePipelineLayout)(WGPUDevice device, WGPUPipelineLayoutDescriptor const *descriptor); -typedef WGPUQuerySet (*WGPUProcDeviceCreateQuerySet)(WGPUDevice device, WGPUQuerySetDescriptor const *descriptor); -typedef WGPURenderBundleEncoder (*WGPUProcDeviceCreateRenderBundleEncoder)(WGPUDevice device, WGPURenderBundleEncoderDescriptor const *descriptor); -typedef WGPURenderPipeline (*WGPUProcDeviceCreateRenderPipeline)(WGPUDevice device, WGPURenderPipelineDescriptor const *descriptor); -typedef void (*WGPUProcDeviceCreateRenderPipelineAsync)(WGPUDevice device, 
WGPURenderPipelineDescriptor const *descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void *userdata); -typedef WGPUSampler (*WGPUProcDeviceCreateSampler)(WGPUDevice device, WGPUSamplerDescriptor const *descriptor /* nullable */); -typedef WGPUShaderModule (*WGPUProcDeviceCreateShaderModule)(WGPUDevice device, WGPUShaderModuleDescriptor const *descriptor); -typedef WGPUSwapChain (*WGPUProcDeviceCreateSwapChain)(WGPUDevice device, WGPUSurface surface /* nullable */, WGPUSwapChainDescriptor const *descriptor); -typedef WGPUTexture (*WGPUProcDeviceCreateTexture)(WGPUDevice device, WGPUTextureDescriptor const *descriptor); +typedef WGPUTexture (*WGPUProcDeviceCreateErrorTexture)(WGPUDevice device, WGPUTextureDescriptor const * descriptor); +typedef WGPUExternalTexture (*WGPUProcDeviceCreateExternalTexture)(WGPUDevice device, WGPUExternalTextureDescriptor const * externalTextureDescriptor); +typedef WGPUPipelineLayout (*WGPUProcDeviceCreatePipelineLayout)(WGPUDevice device, WGPUPipelineLayoutDescriptor const * descriptor); +typedef WGPUQuerySet (*WGPUProcDeviceCreateQuerySet)(WGPUDevice device, WGPUQuerySetDescriptor const * descriptor); +typedef WGPURenderBundleEncoder (*WGPUProcDeviceCreateRenderBundleEncoder)(WGPUDevice device, WGPURenderBundleEncoderDescriptor const * descriptor); +typedef WGPURenderPipeline (*WGPUProcDeviceCreateRenderPipeline)(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor); +typedef void (*WGPUProcDeviceCreateRenderPipelineAsync)(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void * userdata); +typedef WGPUSampler (*WGPUProcDeviceCreateSampler)(WGPUDevice device, WGPUSamplerDescriptor const * descriptor /* nullable */); +typedef WGPUShaderModule (*WGPUProcDeviceCreateShaderModule)(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor); +typedef WGPUSwapChain (*WGPUProcDeviceCreateSwapChain)(WGPUDevice device, WGPUSurface surface /* nullable */, WGPUSwapChainDescriptor const * descriptor); +typedef WGPUTexture (*WGPUProcDeviceCreateTexture)(WGPUDevice device, WGPUTextureDescriptor const * descriptor); typedef void (*WGPUProcDeviceDestroy)(WGPUDevice device); -typedef size_t (*WGPUProcDeviceEnumerateFeatures)(WGPUDevice device, WGPUFeatureName *features); -typedef void (*WGPUProcDeviceForceLoss)(WGPUDevice device, WGPUDeviceLostReason type, char const *message); +typedef size_t (*WGPUProcDeviceEnumerateFeatures)(WGPUDevice device, WGPUFeatureName * features); +typedef void (*WGPUProcDeviceForceLoss)(WGPUDevice device, WGPUDeviceLostReason type, char const * message); typedef WGPUAdapter (*WGPUProcDeviceGetAdapter)(WGPUDevice device); -typedef bool (*WGPUProcDeviceGetLimits)(WGPUDevice device, WGPUSupportedLimits *limits); +typedef bool (*WGPUProcDeviceGetLimits)(WGPUDevice device, WGPUSupportedLimits * limits); typedef WGPUQueue (*WGPUProcDeviceGetQueue)(WGPUDevice device); typedef bool (*WGPUProcDeviceHasFeature)(WGPUDevice device, WGPUFeatureName feature); -typedef void (*WGPUProcDeviceInjectError)(WGPUDevice device, WGPUErrorType type, char const *message); -typedef bool (*WGPUProcDevicePopErrorScope)(WGPUDevice device, WGPUErrorCallback callback, void *userdata); +typedef void (*WGPUProcDeviceInjectError)(WGPUDevice device, WGPUErrorType type, char const * message); +typedef bool (*WGPUProcDevicePopErrorScope)(WGPUDevice device, WGPUErrorCallback callback, void * userdata); typedef void (*WGPUProcDevicePushErrorScope)(WGPUDevice device, WGPUErrorFilter filter); 
-typedef void (*WGPUProcDeviceSetDeviceLostCallback)(WGPUDevice device, WGPUDeviceLostCallback callback, void *userdata); -typedef void (*WGPUProcDeviceSetLabel)(WGPUDevice device, char const *label); -typedef void (*WGPUProcDeviceSetLoggingCallback)(WGPUDevice device, WGPULoggingCallback callback, void *userdata); -typedef void (*WGPUProcDeviceSetUncapturedErrorCallback)(WGPUDevice device, WGPUErrorCallback callback, void *userdata); +typedef void (*WGPUProcDeviceSetDeviceLostCallback)(WGPUDevice device, WGPUDeviceLostCallback callback, void * userdata); +typedef void (*WGPUProcDeviceSetLabel)(WGPUDevice device, char const * label); +typedef void (*WGPUProcDeviceSetLoggingCallback)(WGPUDevice device, WGPULoggingCallback callback, void * userdata); +typedef void (*WGPUProcDeviceSetUncapturedErrorCallback)(WGPUDevice device, WGPUErrorCallback callback, void * userdata); typedef void (*WGPUProcDeviceTick)(WGPUDevice device); -typedef void (*WGPUProcDeviceValidateTextureDescriptor)(WGPUDevice device, WGPUTextureDescriptor const *descriptor); +typedef void (*WGPUProcDeviceValidateTextureDescriptor)(WGPUDevice device, WGPUTextureDescriptor const * descriptor); typedef void (*WGPUProcDeviceReference)(WGPUDevice device); typedef void (*WGPUProcDeviceRelease)(WGPUDevice device); @@ -1510,18 +1512,18 @@ typedef void (*WGPUProcDeviceRelease)(WGPUDevice device); typedef void (*WGPUProcExternalTextureDestroy)(WGPUExternalTexture externalTexture); typedef void (*WGPUProcExternalTextureExpire)(WGPUExternalTexture externalTexture); typedef void (*WGPUProcExternalTextureRefresh)(WGPUExternalTexture externalTexture); -typedef void (*WGPUProcExternalTextureSetLabel)(WGPUExternalTexture externalTexture, char const *label); +typedef void (*WGPUProcExternalTextureSetLabel)(WGPUExternalTexture externalTexture, char const * label); typedef void (*WGPUProcExternalTextureReference)(WGPUExternalTexture externalTexture); typedef void (*WGPUProcExternalTextureRelease)(WGPUExternalTexture externalTexture); // Procs of Instance -typedef WGPUSurface (*WGPUProcInstanceCreateSurface)(WGPUInstance instance, WGPUSurfaceDescriptor const *descriptor); -typedef void (*WGPUProcInstanceRequestAdapter)(WGPUInstance instance, WGPURequestAdapterOptions const *options /* nullable */, WGPURequestAdapterCallback callback, void *userdata); +typedef WGPUSurface (*WGPUProcInstanceCreateSurface)(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor); +typedef void (*WGPUProcInstanceRequestAdapter)(WGPUInstance instance, WGPURequestAdapterOptions const * options /* nullable */, WGPURequestAdapterCallback callback, void * userdata); typedef void (*WGPUProcInstanceReference)(WGPUInstance instance); typedef void (*WGPUProcInstanceRelease)(WGPUInstance instance); // Procs of PipelineLayout -typedef void (*WGPUProcPipelineLayoutSetLabel)(WGPUPipelineLayout pipelineLayout, char const *label); +typedef void (*WGPUProcPipelineLayoutSetLabel)(WGPUPipelineLayout pipelineLayout, char const * label); typedef void (*WGPUProcPipelineLayoutReference)(WGPUPipelineLayout pipelineLayout); typedef void (*WGPUProcPipelineLayoutRelease)(WGPUPipelineLayout pipelineLayout); @@ -1529,18 +1531,18 @@ typedef void (*WGPUProcPipelineLayoutRelease)(WGPUPipelineLayout pipelineLayout) typedef void (*WGPUProcQuerySetDestroy)(WGPUQuerySet querySet); typedef uint32_t (*WGPUProcQuerySetGetCount)(WGPUQuerySet querySet); typedef WGPUQueryType (*WGPUProcQuerySetGetType)(WGPUQuerySet querySet); -typedef void (*WGPUProcQuerySetSetLabel)(WGPUQuerySet querySet, char 
const *label); +typedef void (*WGPUProcQuerySetSetLabel)(WGPUQuerySet querySet, char const * label); typedef void (*WGPUProcQuerySetReference)(WGPUQuerySet querySet); typedef void (*WGPUProcQuerySetRelease)(WGPUQuerySet querySet); // Procs of Queue -typedef void (*WGPUProcQueueCopyExternalTextureForBrowser)(WGPUQueue queue, WGPUImageCopyExternalTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize, WGPUCopyTextureForBrowserOptions const *options); -typedef void (*WGPUProcQueueCopyTextureForBrowser)(WGPUQueue queue, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize, WGPUCopyTextureForBrowserOptions const *options); -typedef void (*WGPUProcQueueOnSubmittedWorkDone)(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void *userdata); -typedef void (*WGPUProcQueueSetLabel)(WGPUQueue queue, char const *label); -typedef void (*WGPUProcQueueSubmit)(WGPUQueue queue, uint32_t commandCount, WGPUCommandBuffer const *commands); -typedef void (*WGPUProcQueueWriteBuffer)(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const *data, size_t size); -typedef void (*WGPUProcQueueWriteTexture)(WGPUQueue queue, WGPUImageCopyTexture const *destination, void const *data, size_t dataSize, WGPUTextureDataLayout const *dataLayout, WGPUExtent3D const *writeSize); +typedef void (*WGPUProcQueueCopyExternalTextureForBrowser)(WGPUQueue queue, WGPUImageCopyExternalTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options); +typedef void (*WGPUProcQueueCopyTextureForBrowser)(WGPUQueue queue, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options); +typedef void (*WGPUProcQueueOnSubmittedWorkDone)(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void * userdata); +typedef void (*WGPUProcQueueSetLabel)(WGPUQueue queue, char const * label); +typedef void (*WGPUProcQueueSubmit)(WGPUQueue queue, uint32_t commandCount, WGPUCommandBuffer const * commands); +typedef void (*WGPUProcQueueWriteBuffer)(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const * data, size_t size); +typedef void (*WGPUProcQueueWriteTexture)(WGPUQueue queue, WGPUImageCopyTexture const * destination, void const * data, size_t dataSize, WGPUTextureDataLayout const * dataLayout, WGPUExtent3D const * writeSize); typedef void (*WGPUProcQueueReference)(WGPUQueue queue); typedef void (*WGPUProcQueueRelease)(WGPUQueue queue); @@ -1553,13 +1555,13 @@ typedef void (*WGPUProcRenderBundleEncoderDraw)(WGPURenderBundleEncoder renderBu typedef void (*WGPUProcRenderBundleEncoderDrawIndexed)(WGPURenderBundleEncoder renderBundleEncoder, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t baseVertex, uint32_t firstInstance); typedef void (*WGPUProcRenderBundleEncoderDrawIndexedIndirect)(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); typedef void (*WGPUProcRenderBundleEncoderDrawIndirect)(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); -typedef WGPURenderBundle (*WGPUProcRenderBundleEncoderFinish)(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderBundleDescriptor const *descriptor /* nullable */); -typedef void (*WGPUProcRenderBundleEncoderInsertDebugMarker)(WGPURenderBundleEncoder 
renderBundleEncoder, char const *markerLabel); +typedef WGPURenderBundle (*WGPUProcRenderBundleEncoderFinish)(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderBundleDescriptor const * descriptor /* nullable */); +typedef void (*WGPUProcRenderBundleEncoderInsertDebugMarker)(WGPURenderBundleEncoder renderBundleEncoder, char const * markerLabel); typedef void (*WGPUProcRenderBundleEncoderPopDebugGroup)(WGPURenderBundleEncoder renderBundleEncoder); -typedef void (*WGPUProcRenderBundleEncoderPushDebugGroup)(WGPURenderBundleEncoder renderBundleEncoder, char const *groupLabel); -typedef void (*WGPUProcRenderBundleEncoderSetBindGroup)(WGPURenderBundleEncoder renderBundleEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); +typedef void (*WGPUProcRenderBundleEncoderPushDebugGroup)(WGPURenderBundleEncoder renderBundleEncoder, char const * groupLabel); +typedef void (*WGPUProcRenderBundleEncoderSetBindGroup)(WGPURenderBundleEncoder renderBundleEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); typedef void (*WGPUProcRenderBundleEncoderSetIndexBuffer)(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer buffer, WGPUIndexFormat format, uint64_t offset, uint64_t size); -typedef void (*WGPUProcRenderBundleEncoderSetLabel)(WGPURenderBundleEncoder renderBundleEncoder, char const *label); +typedef void (*WGPUProcRenderBundleEncoderSetLabel)(WGPURenderBundleEncoder renderBundleEncoder, char const * label); typedef void (*WGPUProcRenderBundleEncoderSetPipeline)(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderPipeline pipeline); typedef void (*WGPUProcRenderBundleEncoderSetVertexBuffer)(WGPURenderBundleEncoder renderBundleEncoder, uint32_t slot, WGPUBuffer buffer, uint64_t offset, uint64_t size); typedef void (*WGPUProcRenderBundleEncoderReference)(WGPURenderBundleEncoder renderBundleEncoder); @@ -1574,14 +1576,14 @@ typedef void (*WGPUProcRenderPassEncoderDrawIndirect)(WGPURenderPassEncoder rend typedef void (*WGPUProcRenderPassEncoderEnd)(WGPURenderPassEncoder renderPassEncoder); typedef void (*WGPUProcRenderPassEncoderEndOcclusionQuery)(WGPURenderPassEncoder renderPassEncoder); typedef void (*WGPUProcRenderPassEncoderEndPass)(WGPURenderPassEncoder renderPassEncoder); -typedef void (*WGPUProcRenderPassEncoderExecuteBundles)(WGPURenderPassEncoder renderPassEncoder, uint32_t bundleCount, WGPURenderBundle const *bundles); -typedef void (*WGPUProcRenderPassEncoderInsertDebugMarker)(WGPURenderPassEncoder renderPassEncoder, char const *markerLabel); +typedef void (*WGPUProcRenderPassEncoderExecuteBundles)(WGPURenderPassEncoder renderPassEncoder, uint32_t bundleCount, WGPURenderBundle const * bundles); +typedef void (*WGPUProcRenderPassEncoderInsertDebugMarker)(WGPURenderPassEncoder renderPassEncoder, char const * markerLabel); typedef void (*WGPUProcRenderPassEncoderPopDebugGroup)(WGPURenderPassEncoder renderPassEncoder); -typedef void (*WGPUProcRenderPassEncoderPushDebugGroup)(WGPURenderPassEncoder renderPassEncoder, char const *groupLabel); -typedef void (*WGPUProcRenderPassEncoderSetBindGroup)(WGPURenderPassEncoder renderPassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); -typedef void (*WGPUProcRenderPassEncoderSetBlendConstant)(WGPURenderPassEncoder renderPassEncoder, WGPUColor const *color); +typedef void (*WGPUProcRenderPassEncoderPushDebugGroup)(WGPURenderPassEncoder renderPassEncoder, char const * 
groupLabel); +typedef void (*WGPUProcRenderPassEncoderSetBindGroup)(WGPURenderPassEncoder renderPassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); +typedef void (*WGPUProcRenderPassEncoderSetBlendConstant)(WGPURenderPassEncoder renderPassEncoder, WGPUColor const * color); typedef void (*WGPUProcRenderPassEncoderSetIndexBuffer)(WGPURenderPassEncoder renderPassEncoder, WGPUBuffer buffer, WGPUIndexFormat format, uint64_t offset, uint64_t size); -typedef void (*WGPUProcRenderPassEncoderSetLabel)(WGPURenderPassEncoder renderPassEncoder, char const *label); +typedef void (*WGPUProcRenderPassEncoderSetLabel)(WGPURenderPassEncoder renderPassEncoder, char const * label); typedef void (*WGPUProcRenderPassEncoderSetPipeline)(WGPURenderPassEncoder renderPassEncoder, WGPURenderPipeline pipeline); typedef void (*WGPUProcRenderPassEncoderSetScissorRect)(WGPURenderPassEncoder renderPassEncoder, uint32_t x, uint32_t y, uint32_t width, uint32_t height); typedef void (*WGPUProcRenderPassEncoderSetStencilReference)(WGPURenderPassEncoder renderPassEncoder, uint32_t reference); @@ -1593,18 +1595,18 @@ typedef void (*WGPUProcRenderPassEncoderRelease)(WGPURenderPassEncoder renderPas // Procs of RenderPipeline typedef WGPUBindGroupLayout (*WGPUProcRenderPipelineGetBindGroupLayout)(WGPURenderPipeline renderPipeline, uint32_t groupIndex); -typedef void (*WGPUProcRenderPipelineSetLabel)(WGPURenderPipeline renderPipeline, char const *label); +typedef void (*WGPUProcRenderPipelineSetLabel)(WGPURenderPipeline renderPipeline, char const * label); typedef void (*WGPUProcRenderPipelineReference)(WGPURenderPipeline renderPipeline); typedef void (*WGPUProcRenderPipelineRelease)(WGPURenderPipeline renderPipeline); // Procs of Sampler -typedef void (*WGPUProcSamplerSetLabel)(WGPUSampler sampler, char const *label); +typedef void (*WGPUProcSamplerSetLabel)(WGPUSampler sampler, char const * label); typedef void (*WGPUProcSamplerReference)(WGPUSampler sampler); typedef void (*WGPUProcSamplerRelease)(WGPUSampler sampler); // Procs of ShaderModule -typedef void (*WGPUProcShaderModuleGetCompilationInfo)(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void *userdata); -typedef void (*WGPUProcShaderModuleSetLabel)(WGPUShaderModule shaderModule, char const *label); +typedef void (*WGPUProcShaderModuleGetCompilationInfo)(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void * userdata); +typedef void (*WGPUProcShaderModuleSetLabel)(WGPUShaderModule shaderModule, char const * label); typedef void (*WGPUProcShaderModuleReference)(WGPUShaderModule shaderModule); typedef void (*WGPUProcShaderModuleRelease)(WGPUShaderModule shaderModule); @@ -1620,7 +1622,7 @@ typedef void (*WGPUProcSwapChainReference)(WGPUSwapChain swapChain); typedef void (*WGPUProcSwapChainRelease)(WGPUSwapChain swapChain); // Procs of Texture -typedef WGPUTextureView (*WGPUProcTextureCreateView)(WGPUTexture texture, WGPUTextureViewDescriptor const *descriptor /* nullable */); +typedef WGPUTextureView (*WGPUProcTextureCreateView)(WGPUTexture texture, WGPUTextureViewDescriptor const * descriptor /* nullable */); typedef void (*WGPUProcTextureDestroy)(WGPUTexture texture); typedef uint32_t (*WGPUProcTextureGetDepthOrArrayLayers)(WGPUTexture texture); typedef WGPUTextureDimension (*WGPUProcTextureGetDimension)(WGPUTexture texture); @@ -1630,12 +1632,12 @@ typedef uint32_t (*WGPUProcTextureGetMipLevelCount)(WGPUTexture texture); typedef uint32_t 
(*WGPUProcTextureGetSampleCount)(WGPUTexture texture); typedef WGPUTextureUsage (*WGPUProcTextureGetUsage)(WGPUTexture texture); typedef uint32_t (*WGPUProcTextureGetWidth)(WGPUTexture texture); -typedef void (*WGPUProcTextureSetLabel)(WGPUTexture texture, char const *label); +typedef void (*WGPUProcTextureSetLabel)(WGPUTexture texture, char const * label); typedef void (*WGPUProcTextureReference)(WGPUTexture texture); typedef void (*WGPUProcTextureRelease)(WGPUTexture texture); // Procs of TextureView -typedef void (*WGPUProcTextureViewSetLabel)(WGPUTextureView textureView, char const *label); +typedef void (*WGPUProcTextureViewSetLabel)(WGPUTextureView textureView, char const * label); typedef void (*WGPUProcTextureViewReference)(WGPUTextureView textureView); typedef void (*WGPUProcTextureViewRelease)(WGPUTextureView textureView); @@ -1643,64 +1645,64 @@ typedef void (*WGPUProcTextureViewRelease)(WGPUTextureView textureView); #if !defined(WGPU_SKIP_DECLARATIONS) -WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPUInstanceDescriptor const *descriptor); -WGPU_EXPORT WGPUProc wgpuGetProcAddress(WGPUDevice device, char const *procName); +WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPUInstanceDescriptor const * descriptor); +WGPU_EXPORT WGPUProc wgpuGetProcAddress(WGPUDevice device, char const * procName); // Methods of Adapter -WGPU_EXPORT WGPUDevice wgpuAdapterCreateDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const *descriptor /* nullable */); -WGPU_EXPORT size_t wgpuAdapterEnumerateFeatures(WGPUAdapter adapter, WGPUFeatureName *features); -WGPU_EXPORT bool wgpuAdapterGetLimits(WGPUAdapter adapter, WGPUSupportedLimits *limits); -WGPU_EXPORT void wgpuAdapterGetProperties(WGPUAdapter adapter, WGPUAdapterProperties *properties); +WGPU_EXPORT WGPUDevice wgpuAdapterCreateDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor /* nullable */); +WGPU_EXPORT size_t wgpuAdapterEnumerateFeatures(WGPUAdapter adapter, WGPUFeatureName * features); +WGPU_EXPORT bool wgpuAdapterGetLimits(WGPUAdapter adapter, WGPUSupportedLimits * limits); +WGPU_EXPORT void wgpuAdapterGetProperties(WGPUAdapter adapter, WGPUAdapterProperties * properties); WGPU_EXPORT bool wgpuAdapterHasFeature(WGPUAdapter adapter, WGPUFeatureName feature); -WGPU_EXPORT void wgpuAdapterRequestDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const *descriptor /* nullable */, WGPURequestDeviceCallback callback, void *userdata); +WGPU_EXPORT void wgpuAdapterRequestDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor /* nullable */, WGPURequestDeviceCallback callback, void * userdata); WGPU_EXPORT void wgpuAdapterReference(WGPUAdapter adapter); WGPU_EXPORT void wgpuAdapterRelease(WGPUAdapter adapter); // Methods of BindGroup -WGPU_EXPORT void wgpuBindGroupSetLabel(WGPUBindGroup bindGroup, char const *label); +WGPU_EXPORT void wgpuBindGroupSetLabel(WGPUBindGroup bindGroup, char const * label); WGPU_EXPORT void wgpuBindGroupReference(WGPUBindGroup bindGroup); WGPU_EXPORT void wgpuBindGroupRelease(WGPUBindGroup bindGroup); // Methods of BindGroupLayout -WGPU_EXPORT void wgpuBindGroupLayoutSetLabel(WGPUBindGroupLayout bindGroupLayout, char const *label); +WGPU_EXPORT void wgpuBindGroupLayoutSetLabel(WGPUBindGroupLayout bindGroupLayout, char const * label); WGPU_EXPORT void wgpuBindGroupLayoutReference(WGPUBindGroupLayout bindGroupLayout); WGPU_EXPORT void wgpuBindGroupLayoutRelease(WGPUBindGroupLayout bindGroupLayout); // Methods of Buffer WGPU_EXPORT void wgpuBufferDestroy(WGPUBuffer buffer); -WGPU_EXPORT void const 
*wgpuBufferGetConstMappedRange(WGPUBuffer buffer, size_t offset, size_t size); +WGPU_EXPORT void const * wgpuBufferGetConstMappedRange(WGPUBuffer buffer, size_t offset, size_t size); WGPU_EXPORT WGPUBufferMapState wgpuBufferGetMapState(WGPUBuffer buffer); -WGPU_EXPORT void *wgpuBufferGetMappedRange(WGPUBuffer buffer, size_t offset, size_t size); +WGPU_EXPORT void * wgpuBufferGetMappedRange(WGPUBuffer buffer, size_t offset, size_t size); WGPU_EXPORT uint64_t wgpuBufferGetSize(WGPUBuffer buffer); WGPU_EXPORT WGPUBufferUsage wgpuBufferGetUsage(WGPUBuffer buffer); -WGPU_EXPORT void wgpuBufferMapAsync(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void *userdata); -WGPU_EXPORT void wgpuBufferSetLabel(WGPUBuffer buffer, char const *label); +WGPU_EXPORT void wgpuBufferMapAsync(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void * userdata); +WGPU_EXPORT void wgpuBufferSetLabel(WGPUBuffer buffer, char const * label); WGPU_EXPORT void wgpuBufferUnmap(WGPUBuffer buffer); WGPU_EXPORT void wgpuBufferReference(WGPUBuffer buffer); WGPU_EXPORT void wgpuBufferRelease(WGPUBuffer buffer); // Methods of CommandBuffer -WGPU_EXPORT void wgpuCommandBufferSetLabel(WGPUCommandBuffer commandBuffer, char const *label); +WGPU_EXPORT void wgpuCommandBufferSetLabel(WGPUCommandBuffer commandBuffer, char const * label); WGPU_EXPORT void wgpuCommandBufferReference(WGPUCommandBuffer commandBuffer); WGPU_EXPORT void wgpuCommandBufferRelease(WGPUCommandBuffer commandBuffer); // Methods of CommandEncoder -WGPU_EXPORT WGPUComputePassEncoder wgpuCommandEncoderBeginComputePass(WGPUCommandEncoder commandEncoder, WGPUComputePassDescriptor const *descriptor /* nullable */); -WGPU_EXPORT WGPURenderPassEncoder wgpuCommandEncoderBeginRenderPass(WGPUCommandEncoder commandEncoder, WGPURenderPassDescriptor const *descriptor); +WGPU_EXPORT WGPUComputePassEncoder wgpuCommandEncoderBeginComputePass(WGPUCommandEncoder commandEncoder, WGPUComputePassDescriptor const * descriptor /* nullable */); +WGPU_EXPORT WGPURenderPassEncoder wgpuCommandEncoderBeginRenderPass(WGPUCommandEncoder commandEncoder, WGPURenderPassDescriptor const * descriptor); WGPU_EXPORT void wgpuCommandEncoderClearBuffer(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t offset, uint64_t size); WGPU_EXPORT void wgpuCommandEncoderCopyBufferToBuffer(WGPUCommandEncoder commandEncoder, WGPUBuffer source, uint64_t sourceOffset, WGPUBuffer destination, uint64_t destinationOffset, uint64_t size); -WGPU_EXPORT void wgpuCommandEncoderCopyBufferToTexture(WGPUCommandEncoder commandEncoder, WGPUImageCopyBuffer const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); -WGPU_EXPORT void wgpuCommandEncoderCopyTextureToBuffer(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyBuffer const *destination, WGPUExtent3D const *copySize); -WGPU_EXPORT void wgpuCommandEncoderCopyTextureToTexture(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); -WGPU_EXPORT void wgpuCommandEncoderCopyTextureToTextureInternal(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize); -WGPU_EXPORT WGPUCommandBuffer wgpuCommandEncoderFinish(WGPUCommandEncoder commandEncoder, WGPUCommandBufferDescriptor const *descriptor /* nullable */); -WGPU_EXPORT void 
wgpuCommandEncoderInjectValidationError(WGPUCommandEncoder commandEncoder, char const *message); -WGPU_EXPORT void wgpuCommandEncoderInsertDebugMarker(WGPUCommandEncoder commandEncoder, char const *markerLabel); +WGPU_EXPORT void wgpuCommandEncoderCopyBufferToTexture(WGPUCommandEncoder commandEncoder, WGPUImageCopyBuffer const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); +WGPU_EXPORT void wgpuCommandEncoderCopyTextureToBuffer(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyBuffer const * destination, WGPUExtent3D const * copySize); +WGPU_EXPORT void wgpuCommandEncoderCopyTextureToTexture(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); +WGPU_EXPORT void wgpuCommandEncoderCopyTextureToTextureInternal(WGPUCommandEncoder commandEncoder, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize); +WGPU_EXPORT WGPUCommandBuffer wgpuCommandEncoderFinish(WGPUCommandEncoder commandEncoder, WGPUCommandBufferDescriptor const * descriptor /* nullable */); +WGPU_EXPORT void wgpuCommandEncoderInjectValidationError(WGPUCommandEncoder commandEncoder, char const * message); +WGPU_EXPORT void wgpuCommandEncoderInsertDebugMarker(WGPUCommandEncoder commandEncoder, char const * markerLabel); WGPU_EXPORT void wgpuCommandEncoderPopDebugGroup(WGPUCommandEncoder commandEncoder); -WGPU_EXPORT void wgpuCommandEncoderPushDebugGroup(WGPUCommandEncoder commandEncoder, char const *groupLabel); +WGPU_EXPORT void wgpuCommandEncoderPushDebugGroup(WGPUCommandEncoder commandEncoder, char const * groupLabel); WGPU_EXPORT void wgpuCommandEncoderResolveQuerySet(WGPUCommandEncoder commandEncoder, WGPUQuerySet querySet, uint32_t firstQuery, uint32_t queryCount, WGPUBuffer destination, uint64_t destinationOffset); -WGPU_EXPORT void wgpuCommandEncoderSetLabel(WGPUCommandEncoder commandEncoder, char const *label); -WGPU_EXPORT void wgpuCommandEncoderWriteBuffer(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t bufferOffset, uint8_t const *data, uint64_t size); +WGPU_EXPORT void wgpuCommandEncoderSetLabel(WGPUCommandEncoder commandEncoder, char const * label); +WGPU_EXPORT void wgpuCommandEncoderWriteBuffer(WGPUCommandEncoder commandEncoder, WGPUBuffer buffer, uint64_t bufferOffset, uint8_t const * data, uint64_t size); WGPU_EXPORT void wgpuCommandEncoderWriteTimestamp(WGPUCommandEncoder commandEncoder, WGPUQuerySet querySet, uint32_t queryIndex); WGPU_EXPORT void wgpuCommandEncoderReference(WGPUCommandEncoder commandEncoder); WGPU_EXPORT void wgpuCommandEncoderRelease(WGPUCommandEncoder commandEncoder); @@ -1712,11 +1714,11 @@ WGPU_EXPORT void wgpuComputePassEncoderDispatchWorkgroups(WGPUComputePassEncoder WGPU_EXPORT void wgpuComputePassEncoderDispatchWorkgroupsIndirect(WGPUComputePassEncoder computePassEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); WGPU_EXPORT void wgpuComputePassEncoderEnd(WGPUComputePassEncoder computePassEncoder); WGPU_EXPORT void wgpuComputePassEncoderEndPass(WGPUComputePassEncoder computePassEncoder); -WGPU_EXPORT void wgpuComputePassEncoderInsertDebugMarker(WGPUComputePassEncoder computePassEncoder, char const *markerLabel); +WGPU_EXPORT void wgpuComputePassEncoderInsertDebugMarker(WGPUComputePassEncoder computePassEncoder, char const * markerLabel); WGPU_EXPORT void wgpuComputePassEncoderPopDebugGroup(WGPUComputePassEncoder computePassEncoder); -WGPU_EXPORT 
void wgpuComputePassEncoderPushDebugGroup(WGPUComputePassEncoder computePassEncoder, char const *groupLabel); -WGPU_EXPORT void wgpuComputePassEncoderSetBindGroup(WGPUComputePassEncoder computePassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); -WGPU_EXPORT void wgpuComputePassEncoderSetLabel(WGPUComputePassEncoder computePassEncoder, char const *label); +WGPU_EXPORT void wgpuComputePassEncoderPushDebugGroup(WGPUComputePassEncoder computePassEncoder, char const * groupLabel); +WGPU_EXPORT void wgpuComputePassEncoderSetBindGroup(WGPUComputePassEncoder computePassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); +WGPU_EXPORT void wgpuComputePassEncoderSetLabel(WGPUComputePassEncoder computePassEncoder, char const * label); WGPU_EXPORT void wgpuComputePassEncoderSetPipeline(WGPUComputePassEncoder computePassEncoder, WGPUComputePipeline pipeline); WGPU_EXPORT void wgpuComputePassEncoderWriteTimestamp(WGPUComputePassEncoder computePassEncoder, WGPUQuerySet querySet, uint32_t queryIndex); WGPU_EXPORT void wgpuComputePassEncoderReference(WGPUComputePassEncoder computePassEncoder); @@ -1724,46 +1726,46 @@ WGPU_EXPORT void wgpuComputePassEncoderRelease(WGPUComputePassEncoder computePas // Methods of ComputePipeline WGPU_EXPORT WGPUBindGroupLayout wgpuComputePipelineGetBindGroupLayout(WGPUComputePipeline computePipeline, uint32_t groupIndex); -WGPU_EXPORT void wgpuComputePipelineSetLabel(WGPUComputePipeline computePipeline, char const *label); +WGPU_EXPORT void wgpuComputePipelineSetLabel(WGPUComputePipeline computePipeline, char const * label); WGPU_EXPORT void wgpuComputePipelineReference(WGPUComputePipeline computePipeline); WGPU_EXPORT void wgpuComputePipelineRelease(WGPUComputePipeline computePipeline); // Methods of Device -WGPU_EXPORT WGPUBindGroup wgpuDeviceCreateBindGroup(WGPUDevice device, WGPUBindGroupDescriptor const *descriptor); -WGPU_EXPORT WGPUBindGroupLayout wgpuDeviceCreateBindGroupLayout(WGPUDevice device, WGPUBindGroupLayoutDescriptor const *descriptor); -WGPU_EXPORT WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, WGPUBufferDescriptor const *descriptor); -WGPU_EXPORT WGPUCommandEncoder wgpuDeviceCreateCommandEncoder(WGPUDevice device, WGPUCommandEncoderDescriptor const *descriptor /* nullable */); -WGPU_EXPORT WGPUComputePipeline wgpuDeviceCreateComputePipeline(WGPUDevice device, WGPUComputePipelineDescriptor const *descriptor); -WGPU_EXPORT void wgpuDeviceCreateComputePipelineAsync(WGPUDevice device, WGPUComputePipelineDescriptor const *descriptor, WGPUCreateComputePipelineAsyncCallback callback, void *userdata); -WGPU_EXPORT WGPUBuffer wgpuDeviceCreateErrorBuffer(WGPUDevice device, WGPUBufferDescriptor const *descriptor); +WGPU_EXPORT WGPUBindGroup wgpuDeviceCreateBindGroup(WGPUDevice device, WGPUBindGroupDescriptor const * descriptor); +WGPU_EXPORT WGPUBindGroupLayout wgpuDeviceCreateBindGroupLayout(WGPUDevice device, WGPUBindGroupLayoutDescriptor const * descriptor); +WGPU_EXPORT WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, WGPUBufferDescriptor const * descriptor); +WGPU_EXPORT WGPUCommandEncoder wgpuDeviceCreateCommandEncoder(WGPUDevice device, WGPUCommandEncoderDescriptor const * descriptor /* nullable */); +WGPU_EXPORT WGPUComputePipeline wgpuDeviceCreateComputePipeline(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor); +WGPU_EXPORT void wgpuDeviceCreateComputePipelineAsync(WGPUDevice device, 
WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallback callback, void * userdata); +WGPU_EXPORT WGPUBuffer wgpuDeviceCreateErrorBuffer(WGPUDevice device, WGPUBufferDescriptor const * descriptor); WGPU_EXPORT WGPUExternalTexture wgpuDeviceCreateErrorExternalTexture(WGPUDevice device); -WGPU_EXPORT WGPUTexture wgpuDeviceCreateErrorTexture(WGPUDevice device, WGPUTextureDescriptor const *descriptor); -WGPU_EXPORT WGPUExternalTexture wgpuDeviceCreateExternalTexture(WGPUDevice device, WGPUExternalTextureDescriptor const *externalTextureDescriptor); -WGPU_EXPORT WGPUPipelineLayout wgpuDeviceCreatePipelineLayout(WGPUDevice device, WGPUPipelineLayoutDescriptor const *descriptor); -WGPU_EXPORT WGPUQuerySet wgpuDeviceCreateQuerySet(WGPUDevice device, WGPUQuerySetDescriptor const *descriptor); -WGPU_EXPORT WGPURenderBundleEncoder wgpuDeviceCreateRenderBundleEncoder(WGPUDevice device, WGPURenderBundleEncoderDescriptor const *descriptor); -WGPU_EXPORT WGPURenderPipeline wgpuDeviceCreateRenderPipeline(WGPUDevice device, WGPURenderPipelineDescriptor const *descriptor); -WGPU_EXPORT void wgpuDeviceCreateRenderPipelineAsync(WGPUDevice device, WGPURenderPipelineDescriptor const *descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void *userdata); -WGPU_EXPORT WGPUSampler wgpuDeviceCreateSampler(WGPUDevice device, WGPUSamplerDescriptor const *descriptor /* nullable */); -WGPU_EXPORT WGPUShaderModule wgpuDeviceCreateShaderModule(WGPUDevice device, WGPUShaderModuleDescriptor const *descriptor); -WGPU_EXPORT WGPUSwapChain wgpuDeviceCreateSwapChain(WGPUDevice device, WGPUSurface surface /* nullable */, WGPUSwapChainDescriptor const *descriptor); -WGPU_EXPORT WGPUTexture wgpuDeviceCreateTexture(WGPUDevice device, WGPUTextureDescriptor const *descriptor); +WGPU_EXPORT WGPUTexture wgpuDeviceCreateErrorTexture(WGPUDevice device, WGPUTextureDescriptor const * descriptor); +WGPU_EXPORT WGPUExternalTexture wgpuDeviceCreateExternalTexture(WGPUDevice device, WGPUExternalTextureDescriptor const * externalTextureDescriptor); +WGPU_EXPORT WGPUPipelineLayout wgpuDeviceCreatePipelineLayout(WGPUDevice device, WGPUPipelineLayoutDescriptor const * descriptor); +WGPU_EXPORT WGPUQuerySet wgpuDeviceCreateQuerySet(WGPUDevice device, WGPUQuerySetDescriptor const * descriptor); +WGPU_EXPORT WGPURenderBundleEncoder wgpuDeviceCreateRenderBundleEncoder(WGPUDevice device, WGPURenderBundleEncoderDescriptor const * descriptor); +WGPU_EXPORT WGPURenderPipeline wgpuDeviceCreateRenderPipeline(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor); +WGPU_EXPORT void wgpuDeviceCreateRenderPipelineAsync(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void * userdata); +WGPU_EXPORT WGPUSampler wgpuDeviceCreateSampler(WGPUDevice device, WGPUSamplerDescriptor const * descriptor /* nullable */); +WGPU_EXPORT WGPUShaderModule wgpuDeviceCreateShaderModule(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor); +WGPU_EXPORT WGPUSwapChain wgpuDeviceCreateSwapChain(WGPUDevice device, WGPUSurface surface /* nullable */, WGPUSwapChainDescriptor const * descriptor); +WGPU_EXPORT WGPUTexture wgpuDeviceCreateTexture(WGPUDevice device, WGPUTextureDescriptor const * descriptor); WGPU_EXPORT void wgpuDeviceDestroy(WGPUDevice device); -WGPU_EXPORT size_t wgpuDeviceEnumerateFeatures(WGPUDevice device, WGPUFeatureName *features); -WGPU_EXPORT void wgpuDeviceForceLoss(WGPUDevice device, WGPUDeviceLostReason type, char const *message); 
+WGPU_EXPORT size_t wgpuDeviceEnumerateFeatures(WGPUDevice device, WGPUFeatureName * features); +WGPU_EXPORT void wgpuDeviceForceLoss(WGPUDevice device, WGPUDeviceLostReason type, char const * message); WGPU_EXPORT WGPUAdapter wgpuDeviceGetAdapter(WGPUDevice device); -WGPU_EXPORT bool wgpuDeviceGetLimits(WGPUDevice device, WGPUSupportedLimits *limits); +WGPU_EXPORT bool wgpuDeviceGetLimits(WGPUDevice device, WGPUSupportedLimits * limits); WGPU_EXPORT WGPUQueue wgpuDeviceGetQueue(WGPUDevice device); WGPU_EXPORT bool wgpuDeviceHasFeature(WGPUDevice device, WGPUFeatureName feature); -WGPU_EXPORT void wgpuDeviceInjectError(WGPUDevice device, WGPUErrorType type, char const *message); -WGPU_EXPORT bool wgpuDevicePopErrorScope(WGPUDevice device, WGPUErrorCallback callback, void *userdata); +WGPU_EXPORT void wgpuDeviceInjectError(WGPUDevice device, WGPUErrorType type, char const * message); +WGPU_EXPORT bool wgpuDevicePopErrorScope(WGPUDevice device, WGPUErrorCallback callback, void * userdata); WGPU_EXPORT void wgpuDevicePushErrorScope(WGPUDevice device, WGPUErrorFilter filter); -WGPU_EXPORT void wgpuDeviceSetDeviceLostCallback(WGPUDevice device, WGPUDeviceLostCallback callback, void *userdata); -WGPU_EXPORT void wgpuDeviceSetLabel(WGPUDevice device, char const *label); -WGPU_EXPORT void wgpuDeviceSetLoggingCallback(WGPUDevice device, WGPULoggingCallback callback, void *userdata); -WGPU_EXPORT void wgpuDeviceSetUncapturedErrorCallback(WGPUDevice device, WGPUErrorCallback callback, void *userdata); +WGPU_EXPORT void wgpuDeviceSetDeviceLostCallback(WGPUDevice device, WGPUDeviceLostCallback callback, void * userdata); +WGPU_EXPORT void wgpuDeviceSetLabel(WGPUDevice device, char const * label); +WGPU_EXPORT void wgpuDeviceSetLoggingCallback(WGPUDevice device, WGPULoggingCallback callback, void * userdata); +WGPU_EXPORT void wgpuDeviceSetUncapturedErrorCallback(WGPUDevice device, WGPUErrorCallback callback, void * userdata); WGPU_EXPORT void wgpuDeviceTick(WGPUDevice device); -WGPU_EXPORT void wgpuDeviceValidateTextureDescriptor(WGPUDevice device, WGPUTextureDescriptor const *descriptor); +WGPU_EXPORT void wgpuDeviceValidateTextureDescriptor(WGPUDevice device, WGPUTextureDescriptor const * descriptor); WGPU_EXPORT void wgpuDeviceReference(WGPUDevice device); WGPU_EXPORT void wgpuDeviceRelease(WGPUDevice device); @@ -1771,18 +1773,18 @@ WGPU_EXPORT void wgpuDeviceRelease(WGPUDevice device); WGPU_EXPORT void wgpuExternalTextureDestroy(WGPUExternalTexture externalTexture); WGPU_EXPORT void wgpuExternalTextureExpire(WGPUExternalTexture externalTexture); WGPU_EXPORT void wgpuExternalTextureRefresh(WGPUExternalTexture externalTexture); -WGPU_EXPORT void wgpuExternalTextureSetLabel(WGPUExternalTexture externalTexture, char const *label); +WGPU_EXPORT void wgpuExternalTextureSetLabel(WGPUExternalTexture externalTexture, char const * label); WGPU_EXPORT void wgpuExternalTextureReference(WGPUExternalTexture externalTexture); WGPU_EXPORT void wgpuExternalTextureRelease(WGPUExternalTexture externalTexture); // Methods of Instance -WGPU_EXPORT WGPUSurface wgpuInstanceCreateSurface(WGPUInstance instance, WGPUSurfaceDescriptor const *descriptor); -WGPU_EXPORT void wgpuInstanceRequestAdapter(WGPUInstance instance, WGPURequestAdapterOptions const *options /* nullable */, WGPURequestAdapterCallback callback, void *userdata); +WGPU_EXPORT WGPUSurface wgpuInstanceCreateSurface(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor); +WGPU_EXPORT void wgpuInstanceRequestAdapter(WGPUInstance instance, 
WGPURequestAdapterOptions const * options /* nullable */, WGPURequestAdapterCallback callback, void * userdata); WGPU_EXPORT void wgpuInstanceReference(WGPUInstance instance); WGPU_EXPORT void wgpuInstanceRelease(WGPUInstance instance); // Methods of PipelineLayout -WGPU_EXPORT void wgpuPipelineLayoutSetLabel(WGPUPipelineLayout pipelineLayout, char const *label); +WGPU_EXPORT void wgpuPipelineLayoutSetLabel(WGPUPipelineLayout pipelineLayout, char const * label); WGPU_EXPORT void wgpuPipelineLayoutReference(WGPUPipelineLayout pipelineLayout); WGPU_EXPORT void wgpuPipelineLayoutRelease(WGPUPipelineLayout pipelineLayout); @@ -1790,18 +1792,18 @@ WGPU_EXPORT void wgpuPipelineLayoutRelease(WGPUPipelineLayout pipelineLayout); WGPU_EXPORT void wgpuQuerySetDestroy(WGPUQuerySet querySet); WGPU_EXPORT uint32_t wgpuQuerySetGetCount(WGPUQuerySet querySet); WGPU_EXPORT WGPUQueryType wgpuQuerySetGetType(WGPUQuerySet querySet); -WGPU_EXPORT void wgpuQuerySetSetLabel(WGPUQuerySet querySet, char const *label); +WGPU_EXPORT void wgpuQuerySetSetLabel(WGPUQuerySet querySet, char const * label); WGPU_EXPORT void wgpuQuerySetReference(WGPUQuerySet querySet); WGPU_EXPORT void wgpuQuerySetRelease(WGPUQuerySet querySet); // Methods of Queue -WGPU_EXPORT void wgpuQueueCopyExternalTextureForBrowser(WGPUQueue queue, WGPUImageCopyExternalTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize, WGPUCopyTextureForBrowserOptions const *options); -WGPU_EXPORT void wgpuQueueCopyTextureForBrowser(WGPUQueue queue, WGPUImageCopyTexture const *source, WGPUImageCopyTexture const *destination, WGPUExtent3D const *copySize, WGPUCopyTextureForBrowserOptions const *options); -WGPU_EXPORT void wgpuQueueOnSubmittedWorkDone(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void *userdata); -WGPU_EXPORT void wgpuQueueSetLabel(WGPUQueue queue, char const *label); -WGPU_EXPORT void wgpuQueueSubmit(WGPUQueue queue, uint32_t commandCount, WGPUCommandBuffer const *commands); -WGPU_EXPORT void wgpuQueueWriteBuffer(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const *data, size_t size); -WGPU_EXPORT void wgpuQueueWriteTexture(WGPUQueue queue, WGPUImageCopyTexture const *destination, void const *data, size_t dataSize, WGPUTextureDataLayout const *dataLayout, WGPUExtent3D const *writeSize); +WGPU_EXPORT void wgpuQueueCopyExternalTextureForBrowser(WGPUQueue queue, WGPUImageCopyExternalTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options); +WGPU_EXPORT void wgpuQueueCopyTextureForBrowser(WGPUQueue queue, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options); +WGPU_EXPORT void wgpuQueueOnSubmittedWorkDone(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void * userdata); +WGPU_EXPORT void wgpuQueueSetLabel(WGPUQueue queue, char const * label); +WGPU_EXPORT void wgpuQueueSubmit(WGPUQueue queue, uint32_t commandCount, WGPUCommandBuffer const * commands); +WGPU_EXPORT void wgpuQueueWriteBuffer(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const * data, size_t size); +WGPU_EXPORT void wgpuQueueWriteTexture(WGPUQueue queue, WGPUImageCopyTexture const * destination, void const * data, size_t dataSize, WGPUTextureDataLayout const * dataLayout, WGPUExtent3D const * writeSize); WGPU_EXPORT void wgpuQueueReference(WGPUQueue queue); 
WGPU_EXPORT void wgpuQueueRelease(WGPUQueue queue); @@ -1814,13 +1816,13 @@ WGPU_EXPORT void wgpuRenderBundleEncoderDraw(WGPURenderBundleEncoder renderBundl WGPU_EXPORT void wgpuRenderBundleEncoderDrawIndexed(WGPURenderBundleEncoder renderBundleEncoder, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t baseVertex, uint32_t firstInstance); WGPU_EXPORT void wgpuRenderBundleEncoderDrawIndexedIndirect(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); WGPU_EXPORT void wgpuRenderBundleEncoderDrawIndirect(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer indirectBuffer, uint64_t indirectOffset); -WGPU_EXPORT WGPURenderBundle wgpuRenderBundleEncoderFinish(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderBundleDescriptor const *descriptor /* nullable */); -WGPU_EXPORT void wgpuRenderBundleEncoderInsertDebugMarker(WGPURenderBundleEncoder renderBundleEncoder, char const *markerLabel); +WGPU_EXPORT WGPURenderBundle wgpuRenderBundleEncoderFinish(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderBundleDescriptor const * descriptor /* nullable */); +WGPU_EXPORT void wgpuRenderBundleEncoderInsertDebugMarker(WGPURenderBundleEncoder renderBundleEncoder, char const * markerLabel); WGPU_EXPORT void wgpuRenderBundleEncoderPopDebugGroup(WGPURenderBundleEncoder renderBundleEncoder); -WGPU_EXPORT void wgpuRenderBundleEncoderPushDebugGroup(WGPURenderBundleEncoder renderBundleEncoder, char const *groupLabel); -WGPU_EXPORT void wgpuRenderBundleEncoderSetBindGroup(WGPURenderBundleEncoder renderBundleEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); +WGPU_EXPORT void wgpuRenderBundleEncoderPushDebugGroup(WGPURenderBundleEncoder renderBundleEncoder, char const * groupLabel); +WGPU_EXPORT void wgpuRenderBundleEncoderSetBindGroup(WGPURenderBundleEncoder renderBundleEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); WGPU_EXPORT void wgpuRenderBundleEncoderSetIndexBuffer(WGPURenderBundleEncoder renderBundleEncoder, WGPUBuffer buffer, WGPUIndexFormat format, uint64_t offset, uint64_t size); -WGPU_EXPORT void wgpuRenderBundleEncoderSetLabel(WGPURenderBundleEncoder renderBundleEncoder, char const *label); +WGPU_EXPORT void wgpuRenderBundleEncoderSetLabel(WGPURenderBundleEncoder renderBundleEncoder, char const * label); WGPU_EXPORT void wgpuRenderBundleEncoderSetPipeline(WGPURenderBundleEncoder renderBundleEncoder, WGPURenderPipeline pipeline); WGPU_EXPORT void wgpuRenderBundleEncoderSetVertexBuffer(WGPURenderBundleEncoder renderBundleEncoder, uint32_t slot, WGPUBuffer buffer, uint64_t offset, uint64_t size); WGPU_EXPORT void wgpuRenderBundleEncoderReference(WGPURenderBundleEncoder renderBundleEncoder); @@ -1835,14 +1837,14 @@ WGPU_EXPORT void wgpuRenderPassEncoderDrawIndirect(WGPURenderPassEncoder renderP WGPU_EXPORT void wgpuRenderPassEncoderEnd(WGPURenderPassEncoder renderPassEncoder); WGPU_EXPORT void wgpuRenderPassEncoderEndOcclusionQuery(WGPURenderPassEncoder renderPassEncoder); WGPU_EXPORT void wgpuRenderPassEncoderEndPass(WGPURenderPassEncoder renderPassEncoder); -WGPU_EXPORT void wgpuRenderPassEncoderExecuteBundles(WGPURenderPassEncoder renderPassEncoder, uint32_t bundleCount, WGPURenderBundle const *bundles); -WGPU_EXPORT void wgpuRenderPassEncoderInsertDebugMarker(WGPURenderPassEncoder renderPassEncoder, char const *markerLabel); +WGPU_EXPORT void wgpuRenderPassEncoderExecuteBundles(WGPURenderPassEncoder 
renderPassEncoder, uint32_t bundleCount, WGPURenderBundle const * bundles); +WGPU_EXPORT void wgpuRenderPassEncoderInsertDebugMarker(WGPURenderPassEncoder renderPassEncoder, char const * markerLabel); WGPU_EXPORT void wgpuRenderPassEncoderPopDebugGroup(WGPURenderPassEncoder renderPassEncoder); -WGPU_EXPORT void wgpuRenderPassEncoderPushDebugGroup(WGPURenderPassEncoder renderPassEncoder, char const *groupLabel); -WGPU_EXPORT void wgpuRenderPassEncoderSetBindGroup(WGPURenderPassEncoder renderPassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const *dynamicOffsets); -WGPU_EXPORT void wgpuRenderPassEncoderSetBlendConstant(WGPURenderPassEncoder renderPassEncoder, WGPUColor const *color); +WGPU_EXPORT void wgpuRenderPassEncoderPushDebugGroup(WGPURenderPassEncoder renderPassEncoder, char const * groupLabel); +WGPU_EXPORT void wgpuRenderPassEncoderSetBindGroup(WGPURenderPassEncoder renderPassEncoder, uint32_t groupIndex, WGPUBindGroup group, uint32_t dynamicOffsetCount, uint32_t const * dynamicOffsets); +WGPU_EXPORT void wgpuRenderPassEncoderSetBlendConstant(WGPURenderPassEncoder renderPassEncoder, WGPUColor const * color); WGPU_EXPORT void wgpuRenderPassEncoderSetIndexBuffer(WGPURenderPassEncoder renderPassEncoder, WGPUBuffer buffer, WGPUIndexFormat format, uint64_t offset, uint64_t size); -WGPU_EXPORT void wgpuRenderPassEncoderSetLabel(WGPURenderPassEncoder renderPassEncoder, char const *label); +WGPU_EXPORT void wgpuRenderPassEncoderSetLabel(WGPURenderPassEncoder renderPassEncoder, char const * label); WGPU_EXPORT void wgpuRenderPassEncoderSetPipeline(WGPURenderPassEncoder renderPassEncoder, WGPURenderPipeline pipeline); WGPU_EXPORT void wgpuRenderPassEncoderSetScissorRect(WGPURenderPassEncoder renderPassEncoder, uint32_t x, uint32_t y, uint32_t width, uint32_t height); WGPU_EXPORT void wgpuRenderPassEncoderSetStencilReference(WGPURenderPassEncoder renderPassEncoder, uint32_t reference); @@ -1854,18 +1856,18 @@ WGPU_EXPORT void wgpuRenderPassEncoderRelease(WGPURenderPassEncoder renderPassEn // Methods of RenderPipeline WGPU_EXPORT WGPUBindGroupLayout wgpuRenderPipelineGetBindGroupLayout(WGPURenderPipeline renderPipeline, uint32_t groupIndex); -WGPU_EXPORT void wgpuRenderPipelineSetLabel(WGPURenderPipeline renderPipeline, char const *label); +WGPU_EXPORT void wgpuRenderPipelineSetLabel(WGPURenderPipeline renderPipeline, char const * label); WGPU_EXPORT void wgpuRenderPipelineReference(WGPURenderPipeline renderPipeline); WGPU_EXPORT void wgpuRenderPipelineRelease(WGPURenderPipeline renderPipeline); // Methods of Sampler -WGPU_EXPORT void wgpuSamplerSetLabel(WGPUSampler sampler, char const *label); +WGPU_EXPORT void wgpuSamplerSetLabel(WGPUSampler sampler, char const * label); WGPU_EXPORT void wgpuSamplerReference(WGPUSampler sampler); WGPU_EXPORT void wgpuSamplerRelease(WGPUSampler sampler); // Methods of ShaderModule -WGPU_EXPORT void wgpuShaderModuleGetCompilationInfo(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void *userdata); -WGPU_EXPORT void wgpuShaderModuleSetLabel(WGPUShaderModule shaderModule, char const *label); +WGPU_EXPORT void wgpuShaderModuleGetCompilationInfo(WGPUShaderModule shaderModule, WGPUCompilationInfoCallback callback, void * userdata); +WGPU_EXPORT void wgpuShaderModuleSetLabel(WGPUShaderModule shaderModule, char const * label); WGPU_EXPORT void wgpuShaderModuleReference(WGPUShaderModule shaderModule); WGPU_EXPORT void wgpuShaderModuleRelease(WGPUShaderModule shaderModule); @@ -1881,7 +1883,7 @@ WGPU_EXPORT 
void wgpuSwapChainReference(WGPUSwapChain swapChain); WGPU_EXPORT void wgpuSwapChainRelease(WGPUSwapChain swapChain); // Methods of Texture -WGPU_EXPORT WGPUTextureView wgpuTextureCreateView(WGPUTexture texture, WGPUTextureViewDescriptor const *descriptor /* nullable */); +WGPU_EXPORT WGPUTextureView wgpuTextureCreateView(WGPUTexture texture, WGPUTextureViewDescriptor const * descriptor /* nullable */); WGPU_EXPORT void wgpuTextureDestroy(WGPUTexture texture); WGPU_EXPORT uint32_t wgpuTextureGetDepthOrArrayLayers(WGPUTexture texture); WGPU_EXPORT WGPUTextureDimension wgpuTextureGetDimension(WGPUTexture texture); @@ -1891,19 +1893,21 @@ WGPU_EXPORT uint32_t wgpuTextureGetMipLevelCount(WGPUTexture texture); WGPU_EXPORT uint32_t wgpuTextureGetSampleCount(WGPUTexture texture); WGPU_EXPORT WGPUTextureUsage wgpuTextureGetUsage(WGPUTexture texture); WGPU_EXPORT uint32_t wgpuTextureGetWidth(WGPUTexture texture); -WGPU_EXPORT void wgpuTextureSetLabel(WGPUTexture texture, char const *label); +WGPU_EXPORT void wgpuTextureSetLabel(WGPUTexture texture, char const * label); WGPU_EXPORT void wgpuTextureReference(WGPUTexture texture); WGPU_EXPORT void wgpuTextureRelease(WGPUTexture texture); // Methods of TextureView -WGPU_EXPORT void wgpuTextureViewSetLabel(WGPUTextureView textureView, char const *label); +WGPU_EXPORT void wgpuTextureViewSetLabel(WGPUTextureView textureView, char const * label); WGPU_EXPORT void wgpuTextureViewReference(WGPUTextureView textureView); WGPU_EXPORT void wgpuTextureViewRelease(WGPUTextureView textureView); #endif // !defined(WGPU_SKIP_DECLARATIONS) #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif +// clang-format on + #endif // WEBGPU_H_ From b25bce5b9082eb2720968fc706c9bab105058772 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 7 Apr 2023 10:39:26 -0700 Subject: [PATCH 160/166] Use unique intrinsic var names for each kernel Cleanup constant value declarations with template helper methods Add comments on workgroup size usage --- src/CodeGen_Vulkan_Dev.cpp | 190 +++++++++++++++++++------------------ 1 file changed, 97 insertions(+), 93 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index ec9a482a9dc8..3d8af0ed869d 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -26,12 +26,6 @@ namespace { // anonymous // -- -template -ValueT lower_int_uint_div(CodeGenT *cg, Expr a, Expr b); - -template -ValueT lower_int_uint_mod(CodeGenT *cg, Expr a, Expr b); - class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { public: CodeGen_Vulkan_Dev(Target target); @@ -159,6 +153,15 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { template SpvFactory::BlockVariables emit_if_then_else(const Expr &condition, StmtOrExpr then_case, StmtOrExpr else_case); + template + SpvId declare_constant_int(Type value_type, int64_t value); + + template + SpvId declare_constant_uint(Type value_type, uint64_t value); + + template + SpvId declare_constant_float(Type value_type, float value); + // Map from Halide built-in names to extended GLSL intrinsics for SPIR-V using BuiltinMap = std::unordered_map; const BuiltinMap glsl_builtin = { @@ -266,7 +269,11 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { using DescriptorSetTable = std::vector; DescriptorSetTable descriptor_set_table; - // The workgroup size. May vary between kernels. + // The workgroup size ... this indicates the extents of the 1-3 dimensional index space + // used as part of the kernel dispatch. 
It can also be used to adjust the layout for work + // items (aka GPU threads), based on logical groupings. If a zero sized workgroup is + // encountered during CodeGen, it is assumed that the extents are dynamic and specified + // at runtime uint32_t workgroup_size[3]; // Current index of kernel for module @@ -459,23 +466,39 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Variable *var) { builder.update_id(variable_id); } +template +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_constant_int(Type value_type, int64_t value) { + const T typed_value = (T)(value); + SpvId constant_id = builder.declare_constant(value_type, &typed_value); + builder.update_id(constant_id); + return constant_id; +} + +template +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_constant_uint(Type value_type, uint64_t value) { + const T typed_value = (T)(value); + SpvId constant_id = builder.declare_constant(value_type, &typed_value); + builder.update_id(constant_id); + return constant_id; +} + +template +SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_constant_float(Type value_type, float value) { + const T typed_value = (T)(value); + SpvId constant_id = builder.declare_constant(value_type, &typed_value); + builder.update_id(constant_id); + return constant_id; +} + void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IntImm *imm) { if (imm->type.bits() == 8) { - const int8_t value = (int8_t)(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_int(imm->type, imm->value); } else if (imm->type.bits() == 16) { - const int16_t value = (int16_t)(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_int(imm->type, imm->value); } else if (imm->type.bits() == 32) { - const int32_t value = (int32_t)(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_int(imm->type, imm->value); } else if (imm->type.bits() == 64) { - const int64_t value = (int64_t)(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_int(imm->type, imm->value); } else { internal_error << "Vulkan backend currently only supports 8-bit, 16-bit, 32-bit or 64-bit signed integers!\n"; } @@ -483,21 +506,13 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const IntImm *imm) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const UIntImm *imm) { if (imm->type.bits() == 8) { - const uint8_t value = (uint8_t)(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_uint(imm->type, imm->value); } else if (imm->type.bits() == 16) { - const uint16_t value = (uint16_t)(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_uint(imm->type, imm->value); } else if (imm->type.bits() == 32) { - const uint32_t value = (uint32_t)(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_uint(imm->type, imm->value); } else if (imm->type.bits() == 64) { - const uint64_t value = (uint64_t)(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_uint(imm->type, imm->value); } else { internal_error << "Vulkan backend currently only supports 8-bit, 16-bit, 32-bit 
or 64-bit unsigned integers!\n"; } @@ -511,79 +526,66 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const StringImm *imm) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { if (imm->type.bits() == 16) { if (imm->type.is_bfloat()) { - const bfloat16_t value = bfloat16_t(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_float(imm->type, imm->value); } else { - const float16_t value = float16_t(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_float(imm->type, imm->value); } } else if (imm->type.bits() == 32) { - const float value = float(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_float(imm->type, imm->value); } else if (imm->type.bits() == 64) { - const double value = double(imm->value); - SpvId constant_id = builder.declare_constant(imm->type, &value); - builder.update_id(constant_id); + declare_constant_float(imm->type, imm->value); } else { internal_error << "Vulkan backend currently only supports 32-bit or 64-bit floats\n"; } } +template +void fill_bytes_with_value(uint8_t *bytes, int count, int value) { + T *v = reinterpret_cast(bytes); + for (int i = 0; i < count; ++i) { + v[i] = (T)value; + } +} + SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type value_type, SpvId value_id) { if (!value_type.is_bool()) { value_id = cast_type(Bool(), value_type, value_id); } + + const int true_value = 1; + const int false_value = 0; + std::vector true_data(target_type.bytes(), (uint8_t)0); std::vector false_data(target_type.bytes(), (uint8_t)0); - for (int i = 0; i < target_type.lanes(); ++i) { - if (target_type.is_int_or_uint() && target_type.bits() == 8) { - int8_t *td = reinterpret_cast(&true_data[0]); - int8_t *fd = reinterpret_cast(&false_data[0]); - td[i] = (int8_t)1; - fd[i] = (int8_t)0; - } else if (target_type.is_int_or_uint() && target_type.bits() == 16) { - int16_t *td = reinterpret_cast(&true_data[0]); - int16_t *fd = reinterpret_cast(&false_data[0]); - td[i] = (int16_t)1; - fd[i] = (int16_t)0; - } else if (target_type.is_int_or_uint() && target_type.bits() == 32) { - int32_t *td = reinterpret_cast(&true_data[0]); - int32_t *fd = reinterpret_cast(&false_data[0]); - td[i] = (int32_t)1; - fd[i] = (int32_t)0; - } else if (target_type.is_int_or_uint() && target_type.bits() == 64) { - int64_t *td = reinterpret_cast(&true_data[0]); - int64_t *fd = reinterpret_cast(&false_data[0]); - td[i] = (int64_t)1; - fd[i] = (int64_t)0; - } else if (target_type.is_float() && target_type.bits() == 16) { - if (target_type.is_bfloat()) { - bfloat16_t *td = reinterpret_cast(&true_data[0]); - bfloat16_t *fd = reinterpret_cast(&false_data[0]); - td[i] = (bfloat16_t)1.0f; - fd[i] = (bfloat16_t)0.0f; - } else { - float16_t *td = reinterpret_cast(&true_data[0]); - float16_t *fd = reinterpret_cast(&false_data[0]); - td[i] = (float16_t)1.0f; - fd[i] = (float16_t)0.0f; - } - } else if (target_type.is_float() && target_type.bits() == 32) { - float *td = reinterpret_cast(&true_data[0]); - float *fd = reinterpret_cast(&false_data[0]); - td[i] = (float)1.0f; - fd[i] = (float)0.0f; - } else if (target_type.is_float() && target_type.bits() == 64) { - double *td = reinterpret_cast(&true_data[0]); - double *fd = reinterpret_cast(&false_data[0]); - td[i] = (double)1.0; - fd[i] = (double)0.0; + + if 
(target_type.is_int_or_uint() && target_type.bits() == 8) { + fill_bytes_with_value(&true_data[0], target_type.lanes(), true_value); + fill_bytes_with_value(&false_data[0], target_type.lanes(), false_value); + } else if (target_type.is_int_or_uint() && target_type.bits() == 16) { + fill_bytes_with_value(&true_data[0], target_type.lanes(), true_value); + fill_bytes_with_value(&false_data[0], target_type.lanes(), false_value); + } else if (target_type.is_int_or_uint() && target_type.bits() == 32) { + fill_bytes_with_value(&true_data[0], target_type.lanes(), true_value); + fill_bytes_with_value(&false_data[0], target_type.lanes(), false_value); + } else if (target_type.is_int_or_uint() && target_type.bits() == 64) { + fill_bytes_with_value(&true_data[0], target_type.lanes(), true_value); + fill_bytes_with_value(&false_data[0], target_type.lanes(), false_value); + } else if (target_type.is_float() && target_type.bits() == 16) { + if (target_type.is_bfloat()) { + fill_bytes_with_value(&true_data[0], target_type.lanes(), true_value); + fill_bytes_with_value(&false_data[0], target_type.lanes(), false_value); } else { - user_error << "Unhandled type cast from value type '" << value_type << "' to target type '" << target_type << "'!"; - } + fill_bytes_with_value(&true_data[0], target_type.lanes(), true_value); + fill_bytes_with_value(&false_data[0], target_type.lanes(), false_value); + } + } else if (target_type.is_float() && target_type.bits() == 32) { + fill_bytes_with_value(&true_data[0], target_type.lanes(), true_value); + fill_bytes_with_value(&false_data[0], target_type.lanes(), false_value); + } else if (target_type.is_float() && target_type.bits() == 64) { + fill_bytes_with_value(&true_data[0], target_type.lanes(), true_value); + fill_bytes_with_value(&false_data[0], target_type.lanes(), false_value); + } else { + user_error << "Unhandled type cast from value type '" << value_type << "' to target type '" << target_type << "'!"; } SpvId result_id = builder.reserve_id(SpvResultId); @@ -1660,11 +1662,12 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { // This should always be true at this point in codegen internal_assert(is_const_zero(op->min)); auto intrinsic = simt_intrinsic(op->name); + const std::string intrinsic_var_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + intrinsic.first; // Intrinsics are inserted when adding the kernel - internal_assert(symbol_table.contains(intrinsic.first)); - SpvId intrinsic_id = symbol_table.get(intrinsic.first).first; - SpvStorageClass storage_class = symbol_table.get(intrinsic.first).second; + internal_assert(symbol_table.contains(intrinsic_var_name)); + SpvId intrinsic_id = symbol_table.get(intrinsic_var_name).first; + SpvStorageClass storage_class = symbol_table.get(intrinsic_var_name).second; // extract and cast to the extent type (which is what's expected by Halide's for loops) Type unsigned_type = UInt(32); @@ -2559,10 +2562,11 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_entry_point(const Stmt &s, SpvId SpvStorageClass storage_class = SpvStorageClassInput; SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, storage_class); - SpvId intrinsic_var_id = builder.declare_global_variable(intrinsic_name, intrinsic_ptr_type_id, storage_class); + const std::string intrinsic_var_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + intrinsic_name; + SpvId intrinsic_var_id = 
builder.declare_global_variable(intrinsic_var_name, intrinsic_ptr_type_id, storage_class); SpvId intrinsic_loaded_id = builder.reserve_id(); builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_var_id)); - symbol_table.push(intrinsic_name, {intrinsic_loaded_id, storage_class}); + symbol_table.push(intrinsic_var_name, {intrinsic_loaded_id, storage_class}); // Annotate that this is the specific builtin SpvBuiltIn built_in_kind = map_simt_builtin(intrinsic_name); From ff9f2b0de0bb3aa065866c68d33d764636b722dd Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 7 Apr 2023 10:41:07 -0700 Subject: [PATCH 161/166] Wrap debug output under ifdef DEBUG_RUNTIME_INTERNAL macro guard Add nearest_multiple constraint to block/region allocator --- src/runtime/internal/block_allocator.h | 50 +++++++++------- src/runtime/internal/block_storage.h | 46 +++++++-------- src/runtime/internal/memory_arena.h | 2 +- src/runtime/internal/memory_resources.h | 16 +++++- src/runtime/internal/pointer_table.h | 8 +-- src/runtime/internal/region_allocator.h | 76 +++++++++++++------------ 6 files changed, 111 insertions(+), 87 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 498f0c71a84c..3ff850e5b19f 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -42,10 +42,11 @@ class BlockAllocator { // Runtime configuration parameters to adjust the behaviour of the block allocator struct Config { size_t initial_capacity = 0; - size_t maximum_pool_size = 0; - size_t minimum_block_size = 0; - size_t maximum_block_size = 0; - size_t maximum_block_count = 0; + size_t maximum_pool_size = 0; //< Maximum number of bytes to allocate for the entire pool (including all blocks). Specified in bytes. Zero means no constraint + size_t minimum_block_size = 0; //< Minimum block size in bytes. Zero mean no constraint. + size_t maximum_block_size = 0; //< Maximum block size in bytes. Zero means no constraint + size_t maximum_block_count = 0; //< Maximum number of blocks to allocate. Zero means no constraint + size_t nearest_multiple = 0; //< Always round up the requested region sizes to the given integer value. 
Zero means no constraint }; // Factory methods for creation / destruction @@ -151,7 +152,7 @@ void BlockAllocator::initialize(void *user_context, const Config &cfg, const Mem } MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &request) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Reserve (" << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)request.offset << " " @@ -237,13 +238,13 @@ bool BlockAllocator::collect(void *user_context) { continue; } -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL uint64_t reserved = block->reserved; #endif bool collected = block->allocator->collect(user_context); if (collected) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "Collected block (" << "block=" << (void *)block << " " << "reserved=" << (uint32_t)block->reserved << " " @@ -285,7 +286,7 @@ int BlockAllocator::destroy(void *user_context) { MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAllocator *allocator, const MemoryRequest &request) { MemoryRegion *result = allocator->reserve(user_context, request); if (result == nullptr) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Failed to allocate region of size (" << (int32_t)(request.size) << " bytes)!\n"; #endif @@ -300,7 +301,7 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { if (!is_compatible_block(block, properties)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... incompatible properties!\n" << " block_resource=" << (void *)block << "\n" << " block_size=" << (uint32_t)block->memory.size << "\n" @@ -318,7 +319,7 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } if (dedicated && (block->reserved > 0)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation!\n" << " block_resource=" << (void *)block << "\n" << " block_size=" << (uint32_t)block->memory.size << "\n" @@ -328,7 +329,7 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo return false; } else if (block->memory.dedicated && (block->reserved > 0)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... 
already dedicated to an allocation!\n" << " block_resource=" << (void *)block << "\n" << " block_size=" << (uint32_t)block->memory.size << "\n" @@ -353,7 +354,7 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro BlockEntry *prev_entry = block_entry->prev_ptr; const BlockResource *block = static_cast(block_entry->value); if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: found suitable block ...\n" << " user_context=" << (void *)(user_context) << "\n" << " block_resource=" << (void *)block << "\n" @@ -371,7 +372,7 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro } if (block_entry == nullptr) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: couldn't find suitable block!\n" << " user_context=" << (void *)(user_context) << "\n" << " request_size=" << (uint32_t)size << "\n" @@ -386,7 +387,7 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro BlockAllocator::BlockEntry * BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: reserving block ... !\n" << " requested_size=" << (uint32_t)size << "\n" << " requested_is_dedicated=" << (dedicated ? "true" : "false") << "\n" @@ -396,7 +397,7 @@ BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties & #endif BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); if (block_entry == nullptr) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: creating block ... !\n" << " requested_size=" << (uint32_t)size << "\n" << " requested_is_dedicated=" << (dedicated ? 
"true" : "false") << "\n" @@ -418,7 +419,7 @@ BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties & RegionAllocator * BlockAllocator::create_region_allocator(void *user_context, BlockResource *block) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Creating region allocator (" << "user_context=" << (void *)(user_context) << " " << "block_resource=" << (void *)(block) << ")...\n"; @@ -436,7 +437,7 @@ BlockAllocator::create_region_allocator(void *user_context, BlockResource *block } int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator *region_allocator) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Destroying region allocator (" << "user_context=" << (void *)(user_context) << " " << "region_allocator=" << (void *)(region_allocator) << ")...\n"; @@ -468,7 +469,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p return nullptr; } -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Creating block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " @@ -479,6 +480,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p block->memory.size = constrain_requested_size(size); block->memory.handle = nullptr; block->memory.properties = properties; + block->memory.properties.nearest_multiple = max(config.nearest_multiple, properties.nearest_multiple); block->memory.dedicated = dedicated; block->reserved = 0; block->allocator = create_region_allocator(user_context, block); @@ -487,7 +489,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p } int BlockAllocator::release_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Releasing block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << ")...\n"; @@ -500,7 +502,7 @@ int BlockAllocator::release_block_entry(void *user_context, BlockAllocator::Bloc } int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Destroying block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " @@ -517,7 +519,7 @@ int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Bloc } int BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; #endif halide_abort_if_false(user_context, allocators.block.allocate != nullptr); @@ -528,7 +530,7 @@ int BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) } int BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; #endif halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); @@ -542,6 +544,9 @@ int 
BlockAllocator::free_memory_block(void *user_context, BlockResource *block) size_t BlockAllocator::constrain_requested_size(size_t size) const { size_t actual_size = size; + if (config.nearest_multiple) { + actual_size = (((actual_size + config.nearest_multiple - 1) / config.nearest_multiple) * config.nearest_multiple); + } if (config.minimum_block_size) { actual_size = ((actual_size < config.minimum_block_size) ? config.minimum_block_size : @@ -552,6 +557,7 @@ size_t BlockAllocator::constrain_requested_size(size_t size) const { config.maximum_block_size : actual_size); } + return actual_size; } diff --git a/src/runtime/internal/block_storage.h b/src/runtime/internal/block_storage.h index 41cd7b07aa4e..c99709d9e4ae 100644 --- a/src/runtime/internal/block_storage.h +++ b/src/runtime/internal/block_storage.h @@ -215,7 +215,7 @@ void BlockStorage::resize(void *user_context, size_t entry_count, bool realloc) return; } -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockStorage: Resize (" << "requested_size=" << (int32_t)requested_size << " " << "current_size=" << (int32_t)current_size << " " @@ -258,15 +258,15 @@ void BlockStorage::remove(void *user_context, size_t index, size_t entry_count) size_t src_offset = (index + entry_count) * config.entry_size; size_t bytes = (last_index - index - entry_count) * config.entry_size; -#ifdef DEBUG_RUNTIME - debug(0) << "BlockStorage: Remove (" - << "index=" << (int32_t)index << " " - << "entry_count=" << (int32_t)entry_count << " " - << "entry_size=" << (int32_t)config.entry_size << " " - << "last_index=" << (int32_t)last_index << " " - << "src_offset=" << (int32_t)src_offset << " " - << "dst_offset=" << (int32_t)dst_offset << " " - << "bytes=" << (int32_t)bytes << ")...\n"; +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "BlockStorage: Remove (" + << "index=" << (int32_t)index << " " + << "entry_count=" << (int32_t)entry_count << " " + << "entry_size=" << (int32_t)config.entry_size << " " + << "last_index=" << (int32_t)last_index << " " + << "src_offset=" << (int32_t)src_offset << " " + << "dst_offset=" << (int32_t)dst_offset << " " + << "bytes=" << (int32_t)bytes << ")...\n"; #endif void *dst_ptr = offset_address(ptr, dst_offset); void *src_ptr = offset_address(ptr, src_offset); @@ -281,13 +281,13 @@ void BlockStorage::replace(void *user_context, size_t index, const void *array, size_t remaining = count - index; #if DEBUG - debug(0) << "BlockStorage: Replace (" - << "index=" << (int32_t)index << " " - << "array_size=" << (int32_t)array_size << " " - << "entry_size=" << (int32_t)config.entry_size << " " - << "offset=" << (int32_t)offset << " " - << "remaining=" << (int32_t)remaining << " " - << "capacity=" << (int32_t)capacity << ")...\n"; + debug(user_context) << "BlockStorage: Replace (" + << "index=" << (int32_t)index << " " + << "array_size=" << (int32_t)array_size << " " + << "entry_size=" << (int32_t)config.entry_size << " " + << "offset=" << (int32_t)offset << " " + << "remaining=" << (int32_t)remaining << " " + << "capacity=" << (int32_t)capacity << ")...\n"; #endif halide_abort_if_false(user_context, remaining > 0); @@ -389,12 +389,12 @@ void BlockStorage::allocate(void *user_context, size_t new_capacity) { size_t block_count = (requested_bytes / block_size); block_count += (requested_bytes % block_size) ? 
1 : 0; size_t alloc_size = block_count * block_size; -#ifdef DEBUG_RUNTIME - debug(0) << "BlockStorage: Allocating (" - << "requested_bytes=" << (int32_t)requested_bytes << " " - << "block_size=" << (int32_t)block_size << " " - << "block_count=" << (int32_t)block_count << " " - << "alloc_size=" << (int32_t)alloc_size << ") ...\n"; +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "BlockStorage: Allocating (" + << "requested_bytes=" << (int32_t)requested_bytes << " " + << "block_size=" << (int32_t)block_size << " " + << "block_count=" << (int32_t)block_count << " " + << "alloc_size=" << (int32_t)alloc_size << ") ...\n"; #endif void *new_ptr = alloc_size ? allocator.allocate(user_context, alloc_size) : nullptr; if (count != 0 && ptr != nullptr && new_ptr != nullptr) { diff --git a/src/runtime/internal/memory_arena.h b/src/runtime/internal/memory_arena.h index fe4f46d0863e..9c069b47f687 100644 --- a/src/runtime/internal/memory_arena.h +++ b/src/runtime/internal/memory_arena.h @@ -271,7 +271,7 @@ void *MemoryArena::create_entry(void *user_context, Block *block, uint32_t index void *entry_ptr = lookup_entry(user_context, block, index); block->free_index = block->indices[index]; block->status[index] = AllocationStatus::InUse; -#if DEBUG_RUNTIME +#if DEBUG_RUNTIME_INTERNAL memset(entry_ptr, 0, config.entry_size); #endif return entry_ptr; diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index 0b6e441a62ff..e30afb0dd4ea 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -57,7 +57,8 @@ struct MemoryProperties { MemoryVisibility visibility = MemoryVisibility::InvalidVisibility; MemoryUsage usage = MemoryUsage::InvalidUsage; MemoryCaching caching = MemoryCaching::InvalidCaching; - size_t alignment = 0; //< required alignment of allocations (zero for no constraint) + size_t alignment = 0; //< required alignment of allocations (zero for no constraint) + size_t nearest_multiple = 0; //< require the allocation size to round up to the nearest multiple (zero means no rounding) }; // Client-facing struct for exchanging memory block allocation requests @@ -145,6 +146,19 @@ ALWAYS_INLINE size_t aligned_size(size_t offset, size_t size, size_t alignment) return actual_size; } +// Returns a padded size to accommodate an adjusted offset due to alignment constraints rounded up to the nearest multiple +// -- Alignment must be power of two! +ALWAYS_INLINE size_t conform_size(size_t offset, size_t size, size_t alignment, size_t nearest_multiple) { + size_t adjusted_size = aligned_size(offset, size, alignment); + adjusted_size = (alignment > adjusted_size) ? alignment : adjusted_size; + if (nearest_multiple > 0) { + size_t rounded_size = (((adjusted_size + nearest_multiple - 1) / nearest_multiple) * nearest_multiple); + return rounded_size; + } else { + return adjusted_size; + } +} + // Clamps the given value to be within the [min_value, max_value] range ALWAYS_INLINE size_t clamped_size(size_t value, size_t min_value, size_t max_value) { size_t result = (value < min_value) ? 
min_value : value; diff --git a/src/runtime/internal/pointer_table.h b/src/runtime/internal/pointer_table.h index 58f84d1bd3eb..51f250ee2396 100644 --- a/src/runtime/internal/pointer_table.h +++ b/src/runtime/internal/pointer_table.h @@ -185,7 +185,7 @@ void PointerTable::resize(void *user_context, size_t entry_count, bool realloc) size_t actual_size = current_size; count = requested_size; -#ifdef DEBUG_INTERNAL +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "PointerTable: Resize (" << "requested_size=" << (int32_t)requested_size << " " << "current_size=" << (int32_t)current_size << " " @@ -235,7 +235,7 @@ void PointerTable::remove(void *user_context, size_t index, size_t entry_count) size_t src_offset = (index + entry_count) * sizeof(void *); size_t bytes = (last_index - index - entry_count) * sizeof(void *); -#ifdef DEBUG_INTERNAL +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "PointerTable: Remove (" << "index=" << (int32_t)index << " " << "entry_count=" << (int32_t)entry_count << " " @@ -254,7 +254,7 @@ void PointerTable::replace(void *user_context, size_t index, const void **array, size_t remaining = count - index; size_t copy_count = min(remaining, array_size); -#ifdef DEBUG_INTERNAL +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "PointerTable: Replace (" << "index=" << (int32_t)index << " " << "array_size=" << (int32_t)array_size << " " @@ -332,7 +332,7 @@ void PointerTable::allocate(void *user_context, size_t new_capacity) { halide_debug_assert(user_context, allocator.allocate != nullptr); size_t bytes = new_capacity * sizeof(void *); -#ifdef DEBUG_INTERNAL +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "PointerTable: Allocating (bytes=" << (int32_t)bytes << " allocator=" << (void *)allocator.allocate << ")...\n"; #endif diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 8cd2548018b0..0bf3765301d7 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -149,10 +149,10 @@ int RegionAllocator::initialize(void *user_context, BlockResource *mb, const Mem MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { halide_abort_if_false(user_context, request.size > 0); size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = (actual_alignment > request.size) ? 
(actual_alignment) : request.size; + size_t actual_size = conform_size(request.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); size_t remaining = block->memory.size - block->reserved; if (remaining < actual_size) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " << "-- requested size (" << (int32_t)(request.size) << " bytes) " << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; @@ -162,7 +162,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & BlockRegion *block_region = find_block_region(user_context, request); if (block_region == nullptr) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" << (int32_t)(request.size) << " bytes)!\n"; #endif @@ -170,7 +170,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & } if (can_split(block_region, request.size)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; #endif @@ -216,8 +216,12 @@ int RegionAllocator::retain(void *user_context, MemoryRegion *memory_region) { RegionAllocator *RegionAllocator::find_allocator(void *user_context, MemoryRegion *memory_region) { BlockRegion *block_region = reinterpret_cast(memory_region); - halide_abort_if_false(user_context, block_region != nullptr); - halide_abort_if_false(user_context, block_region->block_ptr != nullptr); + if (block_region == nullptr) { + return nullptr; + } + if (block_region->block_ptr == nullptr) { + return nullptr; + } return block_region->block_ptr->allocator; } @@ -227,7 +231,7 @@ bool RegionAllocator::is_last_block_region(void *user_context, const BlockRegion bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, const BlockRegion *region, const MemoryRequest &request) const { if (!is_available(region)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: skipping block region ... not available! " << " block_region=" << (void *)region << "\n"; #endif @@ -236,7 +240,7 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c // skip incompatible block regions for this request if (!is_compatible_block_region(region, request.properties)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: skipping block region ... incompatible properties! " << " block_region=" << (void *)region << "\n"; #endif @@ -244,11 +248,11 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c } size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = aligned_size(region->memory.offset, request.size, actual_alignment); + size_t actual_size = conform_size(region->memory.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); // is the adjusted size larger than the current region? if (actual_size > region->memory.size) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: skipping block region ... not enough space for adjusted size! 
" << " block_region=" << (void *)region << "\n"; #endif @@ -257,7 +261,7 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c // will the adjusted size fit within the remaining unallocated space? if ((actual_size + block->reserved) <= block->memory.size) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: found suitable block region! " << " block_region=" << (void *)region << "\n"; #endif @@ -271,7 +275,7 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory BlockRegion *block_region = block->regions; while (block_region != nullptr) { if (is_block_region_suitable_for_request(user_context, block_region, request)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: found suitable region ...\n" << " user_context=" << (void *)(user_context) << "\n" << " block_resource=" << (void *)block << "\n" @@ -294,7 +298,7 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory } if (block_region == nullptr) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: couldn't find suitable region!\n" << " user_context=" << (void *)(user_context) << "\n" << " requested_size=" << (uint32_t)request.size << "\n" @@ -337,7 +341,7 @@ bool RegionAllocator::can_coalesce(const BlockRegion *block_region) const { BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRegion *block_region) { if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "Freeing region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -354,7 +358,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe BlockRegion *prev_region = block_region->prev_ptr; if (is_available(prev_region) && (prev_region != block_region)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; @@ -372,7 +376,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe BlockRegion *next_region = block_region->next_ptr; if (is_available(next_region) && (next_region != block_region)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; @@ -396,7 +400,7 @@ bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size) co BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Split deallocate region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ 
-411,11 +415,11 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion } size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t actual_size = aligned_size(block_region->memory.offset, size, actual_alignment); + size_t actual_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); size_t actual_offset = aligned_offset(block_region->memory.offset + size, actual_alignment); size_t empty_size = block_region->memory.size - actual_size; -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Conforming size and alignment \n" << " requested_size=" << (uint32_t)size << "\n" << " actual_size=" << (uint32_t)actual_size << "\n" @@ -424,7 +428,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion << " actual_alignment=" << (uint32_t)actual_alignment << ")\n"; #endif -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " << "to create empty region (offset=" << (int32_t)actual_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; @@ -448,7 +452,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion } BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Creating block region (" << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)offset << " " @@ -466,7 +470,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo return nullptr; } -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Added block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; @@ -481,7 +485,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo block_region->block_ptr = block; block_region->usage_count = 0; -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "Creating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -493,7 +497,7 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo } int RegionAllocator::release_block_region(void *user_context, BlockRegion *block_region) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Releasing block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; @@ -508,7 +512,7 @@ int RegionAllocator::release_block_region(void *user_context, BlockRegion *block if (block_region->status != AllocationStatus::Available) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "Releasing region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -524,7 +528,7 @@ int RegionAllocator::release_block_region(void *user_context, BlockRegion *block } int RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block_region) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL 
debug(user_context) << "RegionAllocator: Destroying block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << ") ...\n"; @@ -538,7 +542,7 @@ int RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block } int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_region) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Allocating region (user_context=" << (void *)(user_context) << " size=" << (int32_t)(block_region->memory.size) << " offset=" << (int32_t)block_region->memory.offset << ")!\n"; @@ -551,7 +555,7 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r error_code = allocators.region.allocate(user_context, memory_region); memory_region->is_owner = true; -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "Allocating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -563,7 +567,7 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r } else { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "Re-using region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -579,7 +583,7 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r } int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_region) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Freeing block region (" << "user_context=" << (void *)(user_context) << " " << "block_region=" << (void *)(block_region) << " " @@ -587,7 +591,7 @@ int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_re << "usage_count=" << (uint32_t)block_region->usage_count << ") ...\n"; #endif if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "Freeing region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -608,7 +612,7 @@ int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_re } int RegionAllocator::release(void *user_context) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Releasing all regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; #endif @@ -625,7 +629,7 @@ int RegionAllocator::release(void *user_context) { } bool RegionAllocator::collect(void *user_context) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Collecting free block regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; @@ -641,7 +645,7 @@ bool RegionAllocator::collect(void *user_context) { BlockRegion *block_region = block->regions; while (block_region != nullptr) { if (can_coalesce(block_region)) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL count++; debug(user_context) << " collecting region (" << "block_ptr=" << (void *)block_region->block_ptr << " " @@ -660,7 +664,7 @@ bool RegionAllocator::collect(void *user_context) { } if (has_collected) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " collected unused regions (" << "block_ptr=" << (void *)block << " " << "region_count=" << (uint32_t)count << " " @@ -672,7 +676,7 @@ bool 
RegionAllocator::collect(void *user_context) { } int RegionAllocator::destroy(void *user_context) { -#ifdef DEBUG_RUNTIME +#ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Destroying all block regions (" << "user_context=" << (void *)(user_context) << ") ...\n"; #endif From 3e1c15745091f51c70af2da6d2978d5bdf120329 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 7 Apr 2023 10:43:58 -0700 Subject: [PATCH 162/166] Add vk_clear_device_buffer utility method Add nearest_multiple constraint to vulkan memory allocator + fixes correctness/multiple_outputs test Add vkCreateBuffer/vkDestroyBuffer debug output + for gpu_object_lifetime_tracker Cleanup shutdown for shader_module destruction --- src/runtime/vulkan.cpp | 66 +------ src/runtime/vulkan_internal.h | 6 +- src/runtime/vulkan_memory.h | 315 ++++++++++++++++++++++----------- src/runtime/vulkan_resources.h | 29 +-- 4 files changed, 239 insertions(+), 177 deletions(-) diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index 873ae0ac8c6a..a4765e4f56fa 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -320,10 +320,12 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { buf->device_interface = &vulkan_device_interface; buf->device_interface->impl->use_module(); +#ifdef DEBUG_RUNTIME debug(user_context) << " allocated device region=" << (void *)device_region << "\n" << " containing device buffer=" << (void *)device_region->handle << "\n" << " for halide buffer " << buf << "\n"; +#endif // retrieve the buffer from the region VkBuffer *device_buffer = reinterpret_cast(device_region->handle); @@ -332,69 +334,7 @@ WEAK int halide_vulkan_device_malloc(void *user_context, halide_buffer_t *buf) { return halide_error_code_internal_error; } - // create a command buffer - VkCommandBuffer command_buffer; - int error_code = vk_create_command_buffer(user_context, ctx.allocator, ctx.command_pool, &command_buffer); - if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create command buffer!\n"; - return error_code; - } - - // begin the command buffer - VkCommandBufferBeginInfo command_buffer_begin_info = - { - VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type - nullptr, // pointer to struct extending this - VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags - nullptr // pointer to parent command buffer - }; - - VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return halide_error_code_generic_error; - } - - // fill buffer with zero values up to the size of the buffer - vkCmdFillBuffer(command_buffer, *device_buffer, 0, VK_WHOLE_SIZE, 0); - debug(user_context) << " zeroing device_buffer=" << (void *)device_buffer - << " size=" << (uint32_t)device_region->size << "\n"; - - // end the command buffer - result = vkEndCommandBuffer(command_buffer); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; - return halide_error_code_generic_error; - } - - // submit the command buffer - VkSubmitInfo submit_info = - { - VK_STRUCTURE_TYPE_SUBMIT_INFO, // struct type - nullptr, // pointer to struct extending this - 0, // wait semaphore count - nullptr, // semaphores - nullptr, // pipeline stages where semaphore waits occur - 1, // how many command buffers to execute - &command_buffer, // the 
command buffers - 0, // number of semaphores to signal - nullptr // the semaphores to signal - }; - - result = vkQueueSubmit(ctx.queue, 1, &submit_info, 0); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; - return halide_error_code_generic_error; - } - - // wait for memset to finish - result = vkQueueWaitIdle(ctx.queue); - if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; - return halide_error_code_generic_error; - } - - error_code = vk_destroy_command_buffer(user_context, ctx.allocator, ctx.command_pool, command_buffer); + int error_code = vk_clear_device_buffer(user_context, ctx.allocator, ctx.command_pool, ctx.queue, *device_buffer); if (error_code != halide_error_code_success) { error(user_context) << "Vulkan: Failed to destroy command buffer!\n"; return error_code; diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index 04d702627045..05eb03361d15 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -40,7 +40,11 @@ VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, VkDevice d const VkAllocationCallbacks *alloc_callbacks); int vk_destroy_memory_allocator(void *user_context, VulkanMemoryAllocator *allocator); - +int vk_clear_device_buffer(void *user_context, + VulkanMemoryAllocator *allocator, + VkCommandPool command_pool, + VkQueue command_queue, + VkBuffer device_buffer); // -------------------------------------------------------------------------- // Context // -------------------------------------------------------------------------- diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index f0301f5b37bf..c925538265dc 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -27,6 +27,7 @@ struct VulkanMemoryConfig { size_t minimum_block_size = 32 * 1024 * 1024; //< Default block size is 32MB size_t maximum_block_size = 0; //< Specified in bytes. Zero means no constraint size_t maximum_block_count = 0; //< Maximum number of blocks to allocate. Zero means no constraint + size_t nearest_multiple = 32; //< Always round up the requested region sizes to the given integer value. 
Zero means no constraint }; WEAK VulkanMemoryConfig memory_allocator_config; @@ -202,12 +203,12 @@ int VulkanMemoryAllocator::initialize(void *user_context, MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Reserving memory (" - << "user_context=" << user_context << " " - << "block_allocator=" << (void *)(block_allocator) << " " - << "request_size=" << (uint32_t)(request.size) << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << ") ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" + << "user_context=" << user_context << " " + << "block_allocator=" << (void *)(block_allocator) << " " + << "request_size=" << (uint32_t)(request.size) << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << ") ...\n"; #endif if ((device == nullptr) || (physical_device == nullptr)) { @@ -225,14 +226,14 @@ MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest & void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Mapping region (" - << "user_context=" << user_context << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << " " - << "region=" << (void *)(region) << " " - << "region_size=" << (uint32_t)region->size << " " - << "region_offset=" << (uint32_t)region->offset << " " - << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Mapping region (" + << "user_context=" << user_context << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; #endif if ((device == nullptr) || (physical_device == nullptr)) { error(user_context) << "VulkanMemoryAllocator: Unable to map memory! Invalid device handle!\n"; @@ -263,13 +264,22 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { return nullptr; // NOTE: caller must handle nullptr } - uint8_t *mapped_ptr = nullptr; + void *mapped_ptr = nullptr; VkDeviceSize memory_offset = region->offset + region->range.head_offset; VkDeviceSize memory_size = region->size - region->range.tail_offset - region->range.head_offset; if (((double)region->size - (double)region->range.tail_offset - (double)region->range.head_offset) <= 0.0) { error(user_context) << "VulkanMemoryAllocator: Unable to map region! 
Invalid memory range !\n"; return nullptr; } + debug(nullptr) << "VulkanMemoryAllocator: MapMemory (" + << "user_context=" << user_context << "\n" + << " region_size=" << (uint32_t)region->size << "\n" + << " region_offset=" << (uint32_t)region->offset << "\n" + << " region_range.head_offset=" << (uint32_t)region->range.head_offset << "\n" + << " region_range.tail_offset=" << (uint32_t)region->range.tail_offset << "\n" + << " memory_offset=" << (uint32_t)memory_offset << "\n" + << " memory_size=" << (uint32_t)memory_size << ") ...\n"; + VkResult result = vkMapMemory(device, *device_memory, memory_offset, memory_size, 0, (void **)(&mapped_ptr)); if (result != VK_SUCCESS) { error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n"; @@ -281,14 +291,14 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { int VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Unmapping region (" - << "user_context=" << user_context << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << " " - << "region=" << (void *)(region) << " " - << "region_size=" << (uint32_t)region->size << " " - << "region_offset=" << (uint32_t)region->offset << " " - << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Unmapping region (" + << "user_context=" << user_context << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (uint32_t)region->range.head_offset << ") ...\n"; #endif if ((device == nullptr) || (physical_device == nullptr)) { error(user_context) << "VulkanMemoryAllocator: Unable to unmap region! Invalid device handle!\n"; @@ -320,14 +330,14 @@ int VulkanMemoryAllocator::unmap(void *user_context, MemoryRegion *region) { MemoryRegion *VulkanMemoryAllocator::create_crop(void *user_context, MemoryRegion *region, uint64_t offset) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Cropping region (" - << "user_context=" << user_context << " " - << "device=" << (void *)(device) << " " - << "physical_device=" << (void *)(physical_device) << " " - << "region=" << (void *)(region) << " " - << "region_size=" << (uint32_t)region->size << " " - << "region_offset=" << (uint32_t)region->offset << " " - << "crop_offset=" << (int64_t)offset << ") ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Cropping region (" + << "user_context=" << user_context << " " + << "device=" << (void *)(device) << " " + << "physical_device=" << (void *)(physical_device) << " " + << "region=" << (void *)(region) << " " + << "region_size=" << (uint32_t)region->size << " " + << "region_offset=" << (uint32_t)region->offset << " " + << "crop_offset=" << (int64_t)offset << ") ...\n"; #endif if ((device == nullptr) || (physical_device == nullptr)) { error(user_context) << "VulkanMemoryAllocator: Unable to crop region! 
Invalid device handle!\n"; @@ -412,11 +422,11 @@ MemoryRegion *VulkanMemoryAllocator::owner_of(void *user_context, MemoryRegion * int VulkanMemoryAllocator::release(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Releasing region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " - << "size=" << (uint32_t)region->size << " " - << "offset=" << (uint32_t)region->offset << ") ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Releasing region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif if ((device == nullptr) || (physical_device == nullptr)) { error(user_context) << "VulkanMemoryAllocator: Unable to release region! Invalid device handle!\n"; @@ -431,11 +441,11 @@ int VulkanMemoryAllocator::release(void *user_context, MemoryRegion *region) { int VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Reclaiming region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " - << "size=" << (uint32_t)region->size << " " - << "offset=" << (uint32_t)region->offset << ") ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Reclaiming region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif if ((device == nullptr) || (physical_device == nullptr)) { error(user_context) << "VulkanMemoryAllocator: Unable to reclaim region! Invalid device handle!\n"; @@ -450,11 +460,11 @@ int VulkanMemoryAllocator::reclaim(void *user_context, MemoryRegion *region) { int VulkanMemoryAllocator::retain(void *user_context, MemoryRegion *region) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Retaining region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << " " - << "size=" << (uint32_t)region->size << " " - << "offset=" << (uint32_t)region->offset << ") ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Retaining region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << " " + << "size=" << (uint32_t)region->size << " " + << "offset=" << (uint32_t)region->offset << ") ...\n"; #endif if ((device == nullptr) || (physical_device == nullptr)) { error(user_context) << "VulkanMemoryAllocator: Unable to retain region! Invalid device handle!\n"; @@ -469,8 +479,8 @@ int VulkanMemoryAllocator::retain(void *user_context, MemoryRegion *region) { bool VulkanMemoryAllocator::collect(void *user_context) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Collecting unused memory (" - << "user_context=" << user_context << ") ... \n"; + debug(nullptr) << "VulkanMemoryAllocator: Collecting unused memory (" + << "user_context=" << user_context << ") ... \n"; #endif if ((device == nullptr) || (physical_device == nullptr) || (block_allocator == nullptr)) { return false; @@ -480,8 +490,8 @@ bool VulkanMemoryAllocator::collect(void *user_context) { int VulkanMemoryAllocator::release(void *user_context) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Releasing block allocator (" - << "user_context=" << user_context << ") ... 
\n"; + debug(nullptr) << "VulkanMemoryAllocator: Releasing block allocator (" + << "user_context=" << user_context << ") ... \n"; #endif if ((device == nullptr) || (physical_device == nullptr)) { error(user_context) << "VulkanMemoryAllocator: Unable to release allocator! Invalid device handle!\n"; @@ -497,8 +507,8 @@ int VulkanMemoryAllocator::release(void *user_context) { int VulkanMemoryAllocator::destroy(void *user_context) { #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Destroying allocator (" - << "user_context=" << user_context << ") ... \n"; + debug(nullptr) << "VulkanMemoryAllocator: Destroying allocator (" + << "user_context=" << user_context << ") ... \n"; #endif if (block_allocator != nullptr) { block_allocator->destroy(this); @@ -536,14 +546,14 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block } #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Allocating block (" - << "user_context=" << user_context << " " - << "block=" << (void *)(block) << " " - << "size=" << (uint64_t)block->size << ", " - << "dedicated=" << (block->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(block->properties.usage) << " " - << "caching=" << halide_memory_caching_name(block->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; + debug(nullptr) << "VulkanMemoryAllocator: Allocating block (" + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << " " + << "size=" << (uint64_t)block->size << ", " + << "dedicated=" << (block->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(block->properties.usage) << " " + << "caching=" << halide_memory_caching_name(block->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; #endif // Find an appropriate memory type given the flags @@ -563,15 +573,18 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block VkDeviceMemory *device_memory = (VkDeviceMemory *)vk_host_malloc(nullptr, sizeof(VkDeviceMemory), 0, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, instance->alloc_callbacks); if (device_memory == nullptr) { - debug(user_context) << "VulkanBlockAllocator: Unable to allocate block! Failed to allocate device memory handle!\n"; + debug(nullptr) << "VulkanBlockAllocator: Unable to allocate block! Failed to allocate device memory handle!\n"; return halide_error_code_out_of_memory; } VkResult result = vkAllocateMemory(instance->device, &alloc_info, instance->alloc_callbacks, device_memory); if (result != VK_SUCCESS) { - debug(user_context) << "VulkanMemoryAllocator: Allocation failed! vkAllocateMemory returned: " << vk_get_error_name(result) << "\n"; + debug(nullptr) << "VulkanMemoryAllocator: Allocation failed! 
vkAllocateMemory returned: " << vk_get_error_name(result) << "\n"; return halide_error_code_device_malloc_failed; } +#ifdef DEBUG_RUNTIME + debug(nullptr) << "vkAllocateMemory: Allocated memory for device region (" << (uint64_t)block->size << " bytes) ...\n"; +#endif uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties); @@ -588,8 +601,8 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block VkBuffer buffer = {0}; result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, &buffer); if (result != VK_SUCCESS) { - debug(user_context) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" - << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; + debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; return halide_error_code_device_malloc_failed; } @@ -598,12 +611,12 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Block allocated (" - << "size=" << (uint32_t)block->size << ", " - << "alignment=" << (uint32_t)memory_requirements.alignment << ", " - << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " - << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " - << "dedicated=" << (block->dedicated ? "true" : "false") << ")\n"; + debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" + << "size=" << (uint32_t)block->size << ", " + << "alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " + << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " + << "dedicated=" << (block->dedicated ? "true" : "false") << ")\n"; #endif if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { @@ -627,9 +640,9 @@ int VulkanMemoryAllocator::deallocate_block(void *instance_ptr, MemoryBlock *blo void *user_context = instance->owner_context; #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Deallocating block (" - << "user_context=" << user_context << " " - << "block=" << (void *)(block) << ") ... \n"; + debug(nullptr) << "VulkanMemoryAllocator: Deallocating block (" + << "user_context=" << user_context << " " + << "block=" << (void *)(block) << ") ... \n"; #endif if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { @@ -643,12 +656,12 @@ int VulkanMemoryAllocator::deallocate_block(void *instance_ptr, MemoryBlock *blo } #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanBlockAllocator: deallocating block (" - << "size=" << (uint32_t)block->size << ", " - << "dedicated=" << (block->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(block->properties.usage) << " " - << "caching=" << halide_memory_caching_name(block->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; + debug(nullptr) << "VulkanBlockAllocator: deallocating block (" + << "size=" << (uint32_t)block->size << ", " + << "dedicated=" << (block->dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(block->properties.usage) << " " + << "caching=" << halide_memory_caching_name(block->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(block->properties.visibility) << ")\n"; #endif if (block->handle == nullptr) { @@ -663,6 +676,9 @@ int VulkanMemoryAllocator::deallocate_block(void *instance_ptr, MemoryBlock *blo } vkFreeMemory(instance->device, *device_memory, instance->alloc_callbacks); +#ifdef DEBUG_RUNTIME + debug(nullptr) << "vkFreeMemory: Deallocated memory for device region (" << (uint64_t)block->size << " bytes) ...\n"; +#endif if (instance->block_count > 0) { instance->block_count--; @@ -799,9 +815,9 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg void *user_context = instance->owner_context; #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Allocating region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + debug(nullptr) << "VulkanMemoryAllocator: Allocating region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; #endif if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { @@ -815,13 +831,13 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg } #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanRegionAllocator: Allocating region (" - << "size=" << (uint32_t)region->size << ", " - << "offset=" << (uint32_t)region->offset << ", " - << "dedicated=" << (region->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(region->properties.usage) << " " - << "caching=" << halide_memory_caching_name(region->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; + debug(nullptr) << "VulkanRegionAllocator: Allocating region (" + << "size=" << (uint32_t)region->size << ", " + << "offset=" << (uint32_t)region->offset << ", " + << "dedicated=" << (region->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(region->properties.usage) << " " + << "caching=" << halide_memory_caching_name(region->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; #endif uint32_t usage_flags = instance->select_memory_usage(user_context, region->properties); @@ -847,8 +863,16 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; return halide_error_code_device_malloc_failed; } +#ifdef DEBUG_RUNTIME + debug(nullptr) << "vkCreateBuffer: Created buffer for device region (" << (uint64_t)region->size << " bytes) ...\n"; +#endif RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, region); + if (region_allocator == nullptr) { + error(user_context) << "VulkanBlockAllocator: Unable to allocate region! Invalid region allocator!\n"; + return halide_error_code_internal_error; + } + BlockResource *block_resource = region_allocator->block_resource(); if (block_resource == nullptr) { error(user_context) << "VulkanBlockAllocator: Unable to allocate region! 
Invalid block resource handle!\n"; @@ -884,9 +908,9 @@ int VulkanMemoryAllocator::deallocate_region(void *instance_ptr, MemoryRegion *r void *user_context = instance->owner_context; #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanMemoryAllocator: Deallocating region (" - << "user_context=" << user_context << " " - << "region=" << (void *)(region) << ") ... \n"; + debug(nullptr) << "VulkanMemoryAllocator: Deallocating region (" + << "user_context=" << user_context << " " + << "region=" << (void *)(region) << ") ... \n"; #endif if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { @@ -900,13 +924,13 @@ int VulkanMemoryAllocator::deallocate_region(void *instance_ptr, MemoryRegion *r } #if defined(HL_VK_DEBUG_MEM) - debug(user_context) << "VulkanRegionAllocator: Deallocating region (" - << "size=" << (uint32_t)region->size << ", " - << "offset=" << (uint32_t)region->offset << ", " - << "dedicated=" << (region->dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(region->properties.usage) << " " - << "caching=" << halide_memory_caching_name(region->properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; + debug(nullptr) << "VulkanRegionAllocator: Deallocating region (" + << "size=" << (uint32_t)region->size << ", " + << "offset=" << (uint32_t)region->offset << ", " + << "dedicated=" << (region->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(region->properties.usage) << " " + << "caching=" << halide_memory_caching_name(region->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(region->properties.visibility) << ")\n"; #endif if (region->handle == nullptr) { @@ -921,6 +945,9 @@ int VulkanMemoryAllocator::deallocate_region(void *instance_ptr, MemoryRegion *r } vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); +#ifdef DEBUG_RUNTIME + debug(nullptr) << "vkDestroyBuffer: Destroyed buffer for device region (" << (uint64_t)region->size << " bytes) ...\n"; +#endif region->handle = nullptr; if (instance->region_count > 0) { instance->region_count--; @@ -1054,6 +1081,10 @@ VulkanMemoryAllocator *vk_create_memory_allocator(void *user_context, config.maximum_block_count = atoi(alloc_config_values[3]); print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.maximum_block_count << " for maximum block count\n"; } + if (alloc_config_values.size() > 4) { + config.nearest_multiple = atoi(alloc_config_values[4]); + print(user_context) << "Vulkan: Configuring allocator with " << (uint32_t)config.nearest_multiple << " for nearest multiple\n"; + } } return VulkanMemoryAllocator::create(user_context, @@ -1071,6 +1102,92 @@ int vk_destroy_memory_allocator(void *user_context, VulkanMemoryAllocator *alloc // -------------------------------------------------------------------------- +int vk_clear_device_buffer(void *user_context, + VulkanMemoryAllocator *allocator, + VkCommandPool command_pool, + VkQueue command_queue, + VkBuffer device_buffer) { + +#ifdef DEBUG_RUNTIME + debug(user_context) + << " vk_clear_device_buffer (user_context: " << user_context << ", " + << "allocator: " << (void *)allocator << ", " + << "command_pool: " << (void *)command_pool << ", " + << "command_queue: " << (void *)command_queue << ", " + << "device_buffer: " << (void *)device_buffer << ")\n"; +#endif + + // create a command buffer + VkCommandBuffer command_buffer; + int error_code = 
vk_create_command_buffer(user_context, allocator, command_pool, &command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to create command buffer!\n"; + return error_code; + } + + // begin the command buffer + VkCommandBufferBeginInfo command_buffer_begin_info = + { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // struct type + nullptr, // pointer to struct extending this + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, // flags + nullptr // pointer to parent command buffer + }; + + VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; + } + + // fill buffer with zero values up to the size of the buffer + vkCmdFillBuffer(command_buffer, device_buffer, 0, VK_WHOLE_SIZE, 0); + + // end the command buffer + result = vkEndCommandBuffer(command_buffer); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; + } + + // submit the command buffer + VkSubmitInfo submit_info = + { + VK_STRUCTURE_TYPE_SUBMIT_INFO, // struct type + nullptr, // pointer to struct extending this + 0, // wait semaphore count + nullptr, // semaphores + nullptr, // pipeline stages where semaphore waits occur + 1, // how many command buffers to execute + &command_buffer, // the command buffers + 0, // number of semaphores to signal + nullptr // the semaphores to signal + }; + + result = vkQueueSubmit(command_queue, 1, &submit_info, 0); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; + } + + // wait for memset to finish + result = vkQueueWaitIdle(command_queue); + if (result != VK_SUCCESS) { + error(user_context) << "Vulkan: vkQueueWaitIdle returned " << vk_get_error_name(result) << "\n"; + return halide_error_code_generic_error; + } + + error_code = vk_destroy_command_buffer(user_context, allocator, command_pool, command_buffer); + if (error_code != halide_error_code_success) { + error(user_context) << "Vulkan: Failed to destroy command buffer!\n"; + return error_code; + } + + return halide_error_code_success; +} + +// -------------------------------------------------------------------------- + } // namespace } // namespace Vulkan } // namespace Internal diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index 5b7a484fa225..d4b7bf866d11 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -1380,21 +1380,21 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat void operator()(VulkanCompilationCacheEntry *cache_entry) { if (cache_entry != nullptr) { - if (cache_entry->shader_bindings) { - if (cache_entry->descriptor_set_layouts) { - for (uint32_t n = 0; n < cache_entry->shader_count; n++) { - debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->shader_bindings[n].entry_point_name << "\n"; - vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); - cache_entry->descriptor_set_layouts[n] = {0}; - } - vk_host_free(user_context, cache_entry->descriptor_set_layouts, allocator->callbacks()); - cache_entry->descriptor_set_layouts = nullptr; - } - if (cache_entry->pipeline_layout) { - 
debug(user_context) << " destroying pipeline layout " << (void *)cache_entry->pipeline_layout << "\n"; - vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); - cache_entry->pipeline_layout = {0}; + if (cache_entry->descriptor_set_layouts) { + for (uint32_t n = 0; n < cache_entry->shader_count; n++) { + debug(user_context) << " destroying descriptor set layout [" << n << "] " << cache_entry->shader_bindings[n].entry_point_name << "\n"; + vk_destroy_descriptor_set_layout(user_context, allocator, cache_entry->descriptor_set_layouts[n]); + cache_entry->descriptor_set_layouts[n] = {0}; } + vk_host_free(user_context, cache_entry->descriptor_set_layouts, allocator->callbacks()); + cache_entry->descriptor_set_layouts = nullptr; + } + if (cache_entry->pipeline_layout) { + debug(user_context) << " destroying pipeline layout " << (void *)cache_entry->pipeline_layout << "\n"; + vk_destroy_pipeline_layout(user_context, allocator, cache_entry->pipeline_layout); + cache_entry->pipeline_layout = {0}; + } + if (cache_entry->shader_bindings) { for (uint32_t n = 0; n < cache_entry->shader_count; n++) { if (cache_entry->shader_bindings[n].args_region) { vk_destroy_scalar_uniform_buffer(user_context, allocator, cache_entry->shader_bindings[n].args_region); @@ -1428,6 +1428,7 @@ int vk_destroy_shader_modules(void *user_context, VulkanMemoryAllocator *allocat } cache_entry->shader_count = 0; vk_host_free(user_context, cache_entry, allocator->callbacks()); + cache_entry = nullptr; } } }; From 4fd417fc68dd6724c51c4a8948f95a76d5530a97 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 7 Apr 2023 10:46:27 -0700 Subject: [PATCH 163/166] Add note about nearest_multiple constraint for vulkan memory allocator --- README_vulkan.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README_vulkan.md b/README_vulkan.md index ed8aeadc6bd3..017dd56aed73 100644 --- a/README_vulkan.md +++ b/README_vulkan.md @@ -185,11 +185,11 @@ Halide will search for the first 'gpu' like device it can find, or fall back to the first compute device it can find. `HL_VK_ALLOC_CONFIG=...` will tell Halide to configure the Vulkan memory -allocator use the given constraints specified as four integer values +allocator use the given constraints specified as 5x integer values separated by the appropriate environment variable list delimiter -(e.g. `N:N:N:N` on Linux/OSX/Posix, or `N;N;N;N` on Windows). These values -correspond to `maximum_pool_size`, `minimum_block_size`, `maximum_block_size` -and `maximum_block_count`. +(e.g. `N:N:N:N:N` on Linux/OSX/Posix, or `N;N;N;N;N` on Windows). These values +correspond to `maximum_pool_size`, `minimum_block_size`, `maximum_block_size`, +`maximum_block_count` and `nearest_multiple`. The `maximum_pool_size` constraint will tell Halide to configure the Vulkan memory allocator to never request more than N megabytes for the @@ -216,6 +216,11 @@ Vulkan memory allocator to never exceed a total of N block allocations. Decreasing this value may free up more memory but may impact performance, and/or restrict allocations. Default is 0 ... meaning no limit. +The `nearest_multiple` constraint will tell Halide to configure the +Vulkan memory allocator to always round up the requested allocation sizes +to the given integer value. This is useful for architectures that +require specific alignments for subregions allocated within a block. +Default is 32 ... setting this to zero means no constraint. 
# Debug Environment Variables From b087283e7b58f51903ae63db8e7ec192aba35a91 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 7 Apr 2023 10:46:56 -0700 Subject: [PATCH 164/166] Hookup gpu_object_lifetime_tracker with Vulkan debug statements --- test/common/gpu_object_lifetime_tracker.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/common/gpu_object_lifetime_tracker.h b/test/common/gpu_object_lifetime_tracker.h index d8f18716261c..436c44b6262c 100644 --- a/test/common/gpu_object_lifetime_tracker.h +++ b/test/common/gpu_object_lifetime_tracker.h @@ -22,7 +22,7 @@ class GpuObjectLifetimeTracker { } }; - std::array object_types = {{ + std::array object_types = {{ {"Caching compiled kernel:", "Releasing cached compilation:"}, // OpenCL objects @@ -45,6 +45,16 @@ class GpuObjectLifetimeTracker { {"halide_remote_load_library", "halide_remote_release_library"}, {"ion_alloc", "ion_free"}, + // Vulkan objects + {"vk_create_context", "vk_destroy_context", true}, + {"vk_create_command_pool", "vk_destroy_command_pool"}, + {"vk_create_command_buffer", "vk_destroy_command_buffer"}, + {"vk_create_pipeline_layout", "vk_destroy_pipeline_layout"}, + {"vk_create_compute_pipeline", "vk_destroy_compute_pipeline"}, + {"vk_create_descriptor_pool", "vk_destroy_descriptor_pool"}, + {"Vulkan: Allocated memory for device region", "Vulkan: Deallocated memory for device region"}, + {"Vulkan: Created buffer", "Vulkan: Destroyed buffer"}, + // WebGPU objects {"wgpuCreateInstance", "wgpuInstanceRelease", true}, {"wgpuDeviceCreateBuffer", "wgpuBufferRelease"}, From 03a260882b7795c869d1d12d6301d45208d73ebf Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 7 Apr 2023 10:56:27 -0700 Subject: [PATCH 165/166] Skip dynamic shared memory portion of test for Vulkan on iOS/OSX. --- test/correctness/gpu_reuse_shared_memory.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/correctness/gpu_reuse_shared_memory.cpp b/test/correctness/gpu_reuse_shared_memory.cpp index a557e08ed58f..422775ac2021 100644 --- a/test/correctness/gpu_reuse_shared_memory.cpp +++ b/test/correctness/gpu_reuse_shared_memory.cpp @@ -191,6 +191,8 @@ int main(int argc, char **argv) { printf("Running dynamic shared test\n"); if (t.has_feature(Target::OpenGLCompute) && memory_type == MemoryType::GPUShared) { printf("Skipping test because GL doesn't support dynamic sizes for shared memory\n"); + } else if (t.has_feature(Target::Vulkan) && ((t.os == Target::IOS) || t.os == Target::OSX)) { + printf("Skipping test for Vulkan on iOS/OSX (MoltenVK doesn't support dynamic sizes for shared memory)!\n"); } else { if (dynamic_shared_test(memory_type) != 0) { return 1; From bf21d5fd2daf776b4d96957ed61f1d6aeefaac67 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 24 Apr 2023 12:46:30 -0700 Subject: [PATCH 166/166] Fix stale comment for float type support. Fix incorrect lowering for intrinsic. 
--- src/CodeGen_Vulkan_Dev.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 3d8af0ed869d..ce6e16d44316 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -535,7 +535,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const FloatImm *imm) { } else if (imm->type.bits() == 64) { declare_constant_float(imm->type, imm->value); } else { - internal_error << "Vulkan backend currently only supports 32-bit or 64-bit floats\n"; + internal_error << "Vulkan backend currently only supports 16-bit, 32-bit or 64-bit floats\n"; } } @@ -1217,8 +1217,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) { } else if (op->is_intrinsic()) { Expr lowered = lower_intrinsic(op); if (lowered.defined()) { - Expr e = lower_intrinsic(op); - e.accept(this); + lowered.accept(this); } else { internal_error << "Unhandled intrinsic in Vulkan backend: " << op->name << "\n"; } @@ -2795,6 +2794,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::add_kernel(const Stmt &s, // Add function definition // TODO: can we use one of the function control annotations? + // https://github.com/halide/Halide/issues/7533 // Discover the workgroup size find_workgroup_size(s);