Skip to content

Commit

Permalink
Merge pull request #164 from EthicalML/160_op_memory
Browse files Browse the repository at this point in the history
Amend memory hierarchy to enable for push constants and functional interface for more flexible operations
  • Loading branch information
axsaucedo authored Feb 28, 2021
2 parents 2834519 + 4fddf74 commit 672cf22
Show file tree
Hide file tree
Showing 61 changed files with 3,080 additions and 4,804 deletions.
1 change: 1 addition & 0 deletions .ccls
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
-I./external/googletest/googletest/include/
-I./external/glslang/
-I./external/spdlog/include/
-I./external/fmt/include/
-I./src/include/
-I./single_include/
-I./vk_ndk_wrapper_include/
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ MK_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
mk_cmake:
cmake \
-Bbuild \
$(MK_CMAKE_EXTRA_FLAGS) \
-DKOMPUTE_EXTRA_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \
-DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \
-DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \
Expand All @@ -69,6 +68,7 @@ mk_cmake:
-DKOMPUTE_OPT_BUILD_SINGLE_HEADER=1 \
-DKOMPUTE_OPT_ENABLE_SPDLOG=1 \
-DKOMPUTE_OPT_CODE_COVERAGE=1 \
$(MK_CMAKE_EXTRA_FLAGS) \
-G "Unix Makefiles"

mk_build_all:
Expand Down
142 changes: 105 additions & 37 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,35 +56,65 @@ int main() {
// 2. Create and initialise Kompute Tensors through manager
auto tensorInA = mgr.tensor({ 2., 2., 2. });
auto tensorInB = mgr.tensor({ 1., 2., 3. });
auto tensorOut = mgr.tensor({ 0., 0., 0. });
auto tensorOutA = mgr.tensor({ 0., 0., 0. });
auto tensorOutB = mgr.tensor({ 0., 0., 0. });

// 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path)
std::string shaderString = (R"(
std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};

// 3. Create algorithm based on shader (supports buffers & push/spec constants)
std::string shader = (R"(
#version 450

layout (local_size_x = 1) in;

// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer bina { float tina[]; };
layout(set = 0, binding = 1) buffer binb { float tinb[]; };
layout(set = 0, binding = 2) buffer bout { float tout[]; };
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };

// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;

    // Kompute also supports spec constants on initialization
layout(constant_id = 0) const float const_one = 0;

void main() {
uint index = gl_GlobalInvocationID.x;
tout[index] = tina[index] * tinb[index];
out_a[index] += in_a[index] * in_b[index];
out_b[index] += const_one * push_const.val;
}
)");

// 3. Run operation with string shader synchronously
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
kp::Shader::compile_source(shaderString));
kp::Workgroup workgroup({3, 1, 1});
kp::Constants specConsts({ 2 });

auto algorithm = mgr.algorithm(params, kp::Shader::compile_source(shader), workgroup, specConsts);

kp::Constants pushConstsA({ 2.0 });
kp::Constants pushConstsB({ 3.0 });

// 4. Map results back from GPU memory to print the results
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorInA, tensorInB, tensorOut });
// 4. Run operation synchronously using sequence
mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsA)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsB)
->eval();

// Prints the output which is Output: { 2, 4, 6 }
for (const float& elem : tensorOut->data()) std::cout << elem << " ";
// 5. Sync results from the GPU asynchronously
    auto sq = mgr.sequence();
sq->evalAsync<kp::OpTensorSyncLocal>(params);

// ... Do other work asynchronously whilst GPU finishes

sq->evalAwait();

// Prints the first output which is: { 4, 8, 12 }
for (const float& elem : tensorOutA->data()) std::cout << elem << " ";
// Prints the second output which is: { 10, 10, 10 }
for (const float& elem : tensorOutB->data()) std::cout << elem << " ";
}

```
Expand All @@ -94,34 +124,72 @@ int main() {
The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows.

```python

# 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue)
mgr = Manager()
mgr = kp.Manager()

# 2. Create and initialise Kompute Tensors through manager
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
tensor_out_a = mgr.tensor([0, 0, 0])
tensor_out_b = mgr.tensor([0, 0, 0])

params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]

# 3. Create algorithm based on shader (supports buffers & push/spec constants)
shader = """
#version 450
layout (local_size_x = 1) in;
// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;
    // Kompute also supports spec constants on initialization
layout(constant_id = 0) const float const_one = 0;
void main() {
uint index = gl_GlobalInvocationID.x;
out_a[index] += in_a[index] * in_b[index];
out_b[index] += const_one * push_const.val;
}
"""

workgroup = (3, 1, 1)
spec_consts = [2]
push_consts_a = [2]
push_consts_b = [3]

algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts)

# 2. Create and initialise Kompute Tensors (can be initialized with List[] or np.Array)
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
# 4. Run operation synchronously using sequence
(mgr.sequence()
.record(kp.OpTensorSyncDevice(params))
.record(kp.OpAlgoDispatch(algo, push_consts_a))
.record(kp.OpAlgoDispatch(algo, push_consts_b))
.eval())

mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
# 5. Sync results from the GPU asynchronously
sq = mgr.sequence()
sq.eval_async(kp.OpTensorSyncLocal(params))

# 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path)
@python2shader
def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
data1=("buffer", 0, Array(f32)),
data2=("buffer", 1, Array(f32)),
data3=("buffer", 2, Array(f32))):
i = index.x
data3[i] = data1[i] * data2[i]
# ... Do other work asynchronously whilst GPU finishes

# 4. Run multiplication operation synchronously
mgr.eval_algo_data_def(
[tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
sq.eval_await()

# 5. Map results back from GPU memory to print the results
mgr.eval_tensor_sync_local_def([tensor_out])
# Prints the first output which is: { 4, 8, 12 }
print(tensor_out_a)
# Prints the second output which is: { 10, 10, 10 }
print(tensor_out_b)

# Prints [2.0, 4.0, 6.0]
print(tensor_out.data())
```

### Interactive Notebooks & Hands on Videos
Expand Down Expand Up @@ -199,7 +267,7 @@ The core architecture of Kompute includes the following:
* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) code executed in the GPU
* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed on the GPU

To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).

Expand Down
Binary file modified docs/images/kompute-vulkan-architecture.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
93 changes: 47 additions & 46 deletions examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,61 +20,62 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
uint32_t ITERATIONS = 100;
float learningRate = 0.1;

std::shared_ptr<kp::Tensor> xI{ new kp::Tensor(xIData) };
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor(xJData) };

std::shared_ptr<kp::Tensor> y{ new kp::Tensor(yData) };

std::shared_ptr<kp::Tensor> wIn{ new kp::Tensor({ 0.001, 0.001 }) };
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor(zerosData) };
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor(zerosData) };

std::shared_ptr<kp::Tensor> bIn{ new kp::Tensor({ 0 }) };
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor(zerosData) };
{
kp::Manager mgr;

std::shared_ptr<kp::Tensor> lOut{ new kp::Tensor(zerosData) };
std::shared_ptr<kp::Tensor> xI = mgr.tensor(xIData);
std::shared_ptr<kp::Tensor> xJ = mgr.tensor(xJData);

std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };
std::shared_ptr<kp::Tensor> y = mgr.tensor(yData);

{
kp::Manager mgr;
std::shared_ptr<kp::Tensor> wIn = mgr.tensor({ 0.001, 0.001 });
std::shared_ptr<kp::Tensor> wOutI = mgr.tensor(zerosData);
std::shared_ptr<kp::Tensor> wOutJ = mgr.tensor(zerosData);

{
mgr.rebuild(params);
std::shared_ptr<kp::Tensor> bIn = mgr.tensor({ 0 });
std::shared_ptr<kp::Tensor> bOut = mgr.tensor(zerosData);

std::shared_ptr<kp::Sequence> sq = mgr.sequence();
std::shared_ptr<kp::Tensor> lOut = mgr.tensor(zerosData);

// Record op algo base
sq->begin();
std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };

sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
std::vector<uint32_t> spirv(
(uint32_t*)kp::shader_data::shaders_glsl_logisticregression_comp_spv,
(uint32_t*)(kp::shader_data::shaders_glsl_logisticregression_comp_spv
+ kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));

// Newer versions of Android are able to use shaderc to read raw string
sq->record<kp::OpAlgoBase>(
params, kp::Shader::compile_source(LR_SHADER));
std::shared_ptr<kp::Algorithm> algo =
mgr.algorithm(params, spirv, kp::Workgroup({ 5 }), kp::Constants({ 5.0 }));

sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
mgr.sequence()->eval<kp::OpTensorSyncDevice>(params);

sq->end();
std::shared_ptr<kp::Sequence> sq = mgr.sequence()
->record<kp::OpTensorSyncDevice>({ wIn, bIn })
->record<kp::OpAlgoDispatch>(algo)
->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });

// Iterate across all expected iterations
for (size_t i = 0; i < ITERATIONS; i++) {
// Iterate across all expected iterations
for (size_t i = 0; i < ITERATIONS; i++) {

sq->eval();
sq->eval();

for (size_t j = 0; j < bOut->size(); j++) {
wIn->data()[0] -= learningRate * wOutI->data()[j];
wIn->data()[1] -= learningRate * wOutJ->data()[j];
bIn->data()[0] -= learningRate * bOut->data()[j];
}
for (size_t j = 0; j < bOut->size(); j++) {
wIn->data()[0] -= learningRate * wOutI->data()[j];
wIn->data()[1] -= learningRate * wOutJ->data()[j];
bIn->data()[0] -= learningRate * bOut->data()[j];
}
}
}

this->mWeights = kp::Tensor(wIn->data());
this->mBias = kp::Tensor(bIn->data());
KP_LOG_INFO("RESULT: <<<<<<<<<<<<<<<<<<<");
KP_LOG_INFO("{}", wIn->data()[0]);
KP_LOG_INFO("{}", wIn->data()[1]);
KP_LOG_INFO("{}", bIn->data()[0]);

this->mWeights = wIn;
this->mBias = bIn;
}
}

std::vector<float> KomputeModelML::predict(std::vector<float> xI, std::vector<float> xJ) {
Expand All @@ -88,9 +89,9 @@ std::vector<float> KomputeModelML::predict(std::vector<float> xI, std::vector<fl
for (size_t i = 0; i < xI.size(); i++) {
float xIVal = xI[i];
float xJVal = xJ[i];
float result = (xIVal * this->mWeights.data()[0]
+ xJVal * this->mWeights.data()[1]
+ this->mBias.data()[0]);
float result = (xIVal * this->mWeights->data()[0]
+ xJVal * this->mWeights->data()[1]
+ this->mBias->data()[0]);

// Instead of using sigmoid we'll just return full numbers
float var = result > 0 ? 1 : 0;
Expand All @@ -103,13 +104,13 @@ std::vector<float> KomputeModelML::predict(std::vector<float> xI, std::vector<fl
std::vector<float> KomputeModelML::get_params() {
std::vector<float> retVector;

if(this->mWeights.size() + this->mBias.size() == 0) {
if(this->mWeights->size() + this->mBias->size() == 0) {
return retVector;
}

retVector.push_back(this->mWeights.data()[0]);
retVector.push_back(this->mWeights.data()[1]);
retVector.push_back(this->mBias.data()[0]);
retVector.push_back(this->mWeights->data()[0]);
retVector.push_back(this->mWeights->data()[1]);
retVector.push_back(this->mBias->data()[0]);
retVector.push_back(99.0);

return retVector;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <vector>
#include <string>
#include <memory>

#include "kompute/Kompute.hpp"

Expand All @@ -20,8 +21,8 @@ class KomputeModelML {
std::vector<float> get_params();

private:
kp::Tensor mWeights;
kp::Tensor mBias;
std::shared_ptr<kp::Tensor> mWeights;
std::shared_ptr<kp::Tensor> mBias;

};

Expand Down
11 changes: 7 additions & 4 deletions examples/array_multiplication/src/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,14 @@ int main()
}
)");

mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
kp::Shader::compile_source(shader));
std::vector<std::shared_ptr<kp::Tensor>> params = { tensorInA, tensorInB, tensorOut };

mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(params, kp::Shader::compile_source(shader));

mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algo)
->record<kp::OpTensorSyncLocal>(params);

// prints "Output { 0 4 12 }"
std::cout<< "Output: { ";
Expand Down
Loading

0 comments on commit 672cf22

Please sign in to comment.