Extend non-spdlog print functions to use fmt::format / fmt::print #159

Merged: 4 commits on Feb 21, 2021
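For context, this change routes Kompute's non-spdlog print/format fallbacks through fmt. A minimal sketch of that pattern is below; the macro name and message prefix are hypothetical and only illustrate the idea, they are not the PR's actual code:

```cpp
#include <fmt/core.h>

// Hypothetical fallback macro for when spdlog is disabled; the real
// Kompute macros may differ. It formats the message with fmt::format
// and prints it with fmt::print.
#ifndef KOMPUTE_ENABLE_SPDLOG
#define KP_LOG_INFO(...) fmt::print("[KOMPUTE] {}\n", fmt::format(__VA_ARGS__))
#endif

int main() {
    KP_LOG_INFO("Evaluating {} tensors on queue {}", 3, "queueOne");
    return 0;
}
```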
4 changes: 4 additions & 0 deletions .gitmodules
@@ -18,3 +18,7 @@
path = external/glslang
url = https://github.com/KhronosGroup/glslang/
branch = 11.1.0
[submodule "external/fmt"]
path = external/fmt
url = https://github.com/fmtlib/fmt
branch = 7.1.3
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -25,12 +25,18 @@ set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, se

if(KOMPUTE_OPT_ENABLE_SPDLOG)
set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DKOMPUTE_ENABLE_SPDLOG=1")
set(SPDLOG_FMT_EXTERNAL ON CACHE BOOL "Enables external fmt as its current dep" FORCE)
if(KOMPUTE_OPT_INSTALL)
# Enable install parameters for spdlog (overrides parameters passed)
set(SPDLOG_INSTALL ON CACHE BOOL "Enables install of glslang" FORCE)
set(SPDLOG_INSTALL ON CACHE BOOL "Enables install of spdlog" FORCE)
endif()
endif()

if(KOMPUTE_OPT_INSTALL)
# Enable install parameters for fmt (overrides parameters passed)
set(FMT_INSTALL ON CACHE BOOL "Enables install of fmt" FORCE)
endif()

if(KOMPUTE_OPT_ANDOID_BUILD)
set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DVK_USE_PLATFORM_ANDROID_KHR")
endif()
2 changes: 1 addition & 1 deletion README.md
@@ -78,7 +78,7 @@ int main() {
// 3. Run operation with string shader synchronously
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
std::vector<uint32_t>(shaderString.begin(), shaderString.end()));
kp::Shader::compile_source(shaderString));

// 4. Map results back from GPU memory to print the results
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorInA, tensorInB, tensorOut });
279 changes: 14 additions & 265 deletions docs/overview/advanced-examples.rst
@@ -45,7 +45,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));

// Create tensors data explicitly in GPU with an operation
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
mgr.rebuild({ tensorA, tensorB });

// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -67,7 +67,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path
// Run Kompute operation on the parameters provided with dispatch layout
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorA, tensorB },
std::vector<char>(shader.begin(), shader.end()));
kp::Shader::compile_source(shader));

// Sync the GPU memory back to the local tensor
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@@ -105,7 +105,7 @@ Record commands in a single submit by using a Sequence to send in batch to GPU.
sq->begin();

// Record batch commands to send to GPU
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
sq->record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});

// Stop recording
@@ -146,7 +146,7 @@ You can submit operations asynchronously with the async/await commands in the kp
auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));

// Create tensors data explicitly in GPU with an operation
mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
mgr.rebuild({ tensor });

// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -174,6 +174,8 @@ You can submit operations asynchronously with the async/await commands in the kp
}
)");

std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);

// We can now await for the previous submitted command
// The first parameter can be the amount of time to wait
// The time provided is in nanoseconds
@@ -182,7 +184,7 @@ You can submit operations asynchronously with the async/await commands in the kp
// Run Async Kompute operation on the parameters provided
mgr.evalOpAsyncDefault<kp::OpAlgoBase>(
{ tensor },
std::vector<char>(shader.begin(), shader.end()));
spirv);

// Here we can do other work

@@ -234,7 +236,7 @@ Back to `examples list <#simple-examples>`_.
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));

// We run the first step synchronously on the default sequence
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
mgr.rebuild({ tensorA, tensorB });

// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -262,17 +264,19 @@ Back to `examples list <#simple-examples>`_.
}
)");

std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);

// Run the first parallel operation in the `queueOne` sequence
mgr.evalOpAsync<kp::OpAlgoBase>(
{ tensorA },
"queueOne",
std::vector<char>(shader.begin(), shader.end()));
spirv);

// Run the second parallel operation in the `queueTwo` sequence
mgr.evalOpAsync<kp::OpAlgoBase>(
{ tensorB },
"queueTwo",
std::vector<char>(shader.begin(), shader.end()));
spirv);

// Here we can do other work

@@ -308,7 +312,7 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "")
{
// Perform your custom steps such as reading from a shader file
this->mShaderFilePath = "shaders/glsl/opmult.comp";
this->mShaderFilePath = "shaders/glsl/opmult.comp.spv";
}
}

@@ -323,7 +327,7 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));

// Create tensors data explicitly in GPU with an operation
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
mgr.rebuild({ tensorLhs, tensorRhs, tensorOut });

// Run Kompute operation on the parameters provided with dispatch layout
mgr.evalOpDefault<kp::OpMyCustom<3, 1, 1>>(
@@ -334,258 +338,3 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
}


Logistic Regression Example
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Logistic regression is often seen as the hello world of machine learning, so we will be using it for our examples. Back to `examples list <#simple-examples>`_.

.. image:: ../images/logistic-regression.jpg
:width: 300px


In summary, we have:


* Vector ``X`` with input data (with a pair of inputs ``Xi`` and ``Xj``\ )
* Output ``Y`` with expected predictions

With this we will:

* Optimize the function simplified as ``Y = WX + b``
* We'll want our program to learn the parameters ``W`` and ``b``

We will have to convert this into Kompute terminology.
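
For reference, these are the model, loss, and per-sample gradients that the compute shader at the end of this example implements, with :math:`m` denoting the number of samples (the specialization constant ``M`` in the shader):

.. math::

   \hat{y} = \sigma(w \cdot x + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}

   L(\hat{y}, y) = -(y \log \hat{y} + (1 - y) \log(1 - \hat{y}))

   \frac{\partial L}{\partial z} = \hat{y} - y, \qquad
   \nabla_w L = \frac{1}{m}\, x \, (\hat{y} - y), \qquad
   \frac{\partial L}{\partial b} = \frac{1}{m}\, (\hat{y} - y)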

First, specifically for the inputs, we will be using the following:

* Two vectors for the variable ``X``: vector ``Xi`` and vector ``Xj``
* One vector ``Y`` for the true predictions
* A vector ``W`` containing the two input weight values to use for inference
* A vector ``B`` containing a single input parameter for ``b``

.. code-block:: cpp
:linenos:

std::vector<float> wInVec = { 0.001, 0.001 };
std::vector<float> bInVec = { 0 };

std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })};
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })};

std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 })};

std::shared_ptr<kp::Tensor> wIn{
new kp::Tensor(wInVec, kp::Tensor::TensorTypes::eStaging)};

std::shared_ptr<kp::Tensor> bIn{
new kp::Tensor(bInVec, kp::Tensor::TensorTypes::eStaging)};


We will have the following output vectors:

* Two output vectors ``Wi`` and ``Wj`` to store all the deltas to perform gradient descent on ``W``
* One output vector ``Bout`` to store all the deltas to perform gradient descent on ``B``

.. code-block:: cpp
:linenos:

std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })};

std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })};


For simplicity, we will store all the tensors inside a ``params`` variable:

.. code-block:: cpp
:linenos:

std::vector<std::shared_ptr<kp::Tensor>> params =
{xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut};


Now that we have the inputs and outputs, we can use them in the processing. The workflow we will be using is the following:

1. Create a Sequence to record and submit GPU commands
2. Submit OpCreateTensor to create all the tensors
3. Record the OpAlgo with the Logistic Regression shader
4. Loop across the number of iterations:

   * 4-a. Submit the algo operation on the LR shader
   * 4-b. Re-calculate the weights from the loss

5. Print output weights and bias

1. Create a sequence to record and submit GPU commands

.. code-block:: cpp
:linenos:

kp::Manager mgr;

if (std::shared_ptr<kp::Sequence> sq =
mgr.sequence("createTensors").lock())
{
// ...



2. Submit OpCreateTensor to create all the tensors

.. code-block:: cpp
:linenos:

{
// ... continuing from codeblock above

sq->begin();

sq->record<kp::OpCreateTensor>(params);

sq->end();
sq->eval();


3. Record the OpAlgo with the Logistic Regression shader

Once we re-record, all the instructions that were recorded previously are cleared.

Because of this, we can now record the new commands, which will consist of the following:


.. code-block:: cpp
:linenos:

{
// ... continuing from codeblock above

sq->begin();

sq->record<kp::OpTensorSyncDevice>({wIn, bIn});

sq->record<kp::OpAlgoBase>(
params,
false, // Whether to copy output from device
"test/shaders/glsl/test_logistic_regression.comp");

sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});

sq->end();



4. Loop across the number of iterations + 4-a. Submit the algo operation on the LR shader

.. code-block:: cpp
:linenos:

{
// ... continuing from codeblock above

uint32_t ITERATIONS = 100;

for (size_t i = 0; i < ITERATIONS; i++)
{
// Run evaluation which passes data through shader once
sq->eval();



4-b. Re-calculate weights from loss


Once the shader code is executed, we are able to use the outputs from the shader calculation.

In this case we basically want to apply all the calculated weight and bias deltas from the back-propagation step by subtracting them from the current parameters.
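
In other words, each iteration applies the summed per-sample deltas to the parameters (the learning rate ``0.1`` is already applied inside the shader when it writes its outputs):

.. math::

   w \leftarrow w - \eta \sum_{j=1}^{m} \frac{1}{m}\, x_j \, (\hat{y}_j - y_j), \qquad
   b \leftarrow b - \eta \sum_{j=1}^{m} \frac{1}{m}\, (\hat{y}_j - y_j)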

.. code-block:: cpp
:linenos:

{
// ...
for (size_t i = 0; i < ITERATIONS; i++)
{
// ... continuing from codeblock above

// Run evaluation which passes data through shader once
sq->eval();

// Subtract the resulting weights and biases
for(size_t j = 0; j < bOut->size(); j++) {
wInVec[0] -= wOutI->data()[j];
wInVec[1] -= wOutJ->data()[j];
bInVec[0] -= bOut->data()[j];
}
// Set the data for the GPU to use in the next iteration
wIn->mapDataIntoHostMemory();
bIn->mapDataIntoHostMemory();
}

5. Print output weights and bias

.. code-block:: cpp
:linenos:

std::cout << "Weight i: " << wIn->data()[0] << std::endl;
std::cout << "Weight j: " << wIn->data()[1] << std::endl;
std::cout << "Bias: " << bIn->data()[0] << std::endl;



Logistic Regression Compute Shader
----------------------------------

Finally, you can see the shader used for the logistic regression use case below:

.. code-block:: cpp
:linenos:

#version 450

layout (constant_id = 0) const uint M = 0;

layout (local_size_x = 1) in;

layout(set = 0, binding = 0) buffer bxi { float xi[]; };
layout(set = 0, binding = 1) buffer bxj { float xj[]; };
layout(set = 0, binding = 2) buffer by { float y[]; };
layout(set = 0, binding = 3) buffer bwin { float win[]; };
layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
layout(set = 0, binding = 6) buffer bbin { float bin[]; };
layout(set = 0, binding = 7) buffer bbout { float bout[]; };

float learningRate = 0.1;
float m = float(M);

float sigmoid(float z) {
return 1.0 / (1.0 + exp(-z));
}

float inference(vec2 x, vec2 w, float b) {
float z = dot(w, x) + b;
float yHat = sigmoid(z);
return yHat;
}

float calculateLoss(float yHat, float y) {
return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat));
}

void main() {
uint idx = gl_GlobalInvocationID.x;

vec2 wCurr = vec2(win[0], win[1]);
float bCurr = bin[0];

vec2 xCurr = vec2(xi[idx], xj[idx]);
float yCurr = y[idx];

float yHat = inference(xCurr, wCurr, bCurr);
float loss = calculateLoss(yHat, yCurr);

float dZ = yHat - yCurr;
vec2 dW = (1. / m) * xCurr * dZ;
float dB = (1. / m) * dZ;
wouti[idx] = learningRate * dW.x;
woutj[idx] = learningRate * dW.y;
bout[idx] = learningRate * dB;
}