eStorage tensor fix, eval clears operations in sequence, added one test and modified tests that were broken #304

Closed · wants to merge 14 commits
2 changes: 1 addition & 1 deletion .gitmodules
@@ -5,7 +5,7 @@
[submodule "external/Vulkan-Headers"]
path = external/Vulkan-Headers
url = https://github.com/KhronosGroup/Vulkan-Headers
branch = v1.2.158
branch = v1.3.227
[submodule "external/spdlog"]
path = external/spdlog
url = https://github.com/gabime/spdlog
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM nvidia/vulkan:1.1.121
FROM nvidia/vulkan:1.3-470

RUN apt update -y
RUN apt install g++ -y
2 changes: 1 addition & 1 deletion docker-builders/VulkanSDK.Dockerfile
@@ -1,6 +1,6 @@
FROM amd64/ubuntu:20.04

ARG VULKAN_SDK_VERSION=1.2.154.0
ARG VULKAN_SDK_VERSION=1.3.227

# First install vulkan
RUN apt-get update
2 changes: 1 addition & 1 deletion docs/overview/build-system.rst
@@ -65,7 +65,7 @@ Compile Flags
- Sets the default api version to use for kompute api
* - -DKOMPUTE_VK_API_MAJOR_VERSION=1
- Major version to use for the Vulkan SDK
* - -DKOMPUTE_VK_API_MINOR_VERSION=1
* - -DKOMPUTE_VK_API_MINOR_VERSION=3
- Minor version to use for the Vulkan SDK
* - -DKOMPUTE_ENABLE_SPDLOG=1
- Enables the build with SPDLOG and FMT dependencies (must be installed)
2 changes: 2 additions & 0 deletions python/src/docstrings.hpp
@@ -765,6 +765,8 @@ static const char *__doc_kp_Tensor_getStagingBufferUsageFlags = R"doc()doc";

static const char *__doc_kp_Tensor_getStagingMemoryPropertyFlags = R"doc()doc";

static const char *__doc_kp_Tensor_isDeviceOnlyTensor = R"doc()doc";

static const char *__doc_kp_Tensor_isInit =
R"doc(Check whether tensor is initialized based on the created gpu
resources.
2 changes: 1 addition & 1 deletion python/src/main.cpp
@@ -114,6 +114,7 @@ PYBIND11_MODULE(kp, m) {
.def("tensor_type", &kp::Tensor::tensorType, DOC(kp, Tensor, tensorType))
.def("data_type", &kp::Tensor::dataType, DOC(kp, Tensor, dataType))
.def("is_init", &kp::Tensor::isInit, DOC(kp, Tensor, isInit))
.def("is_device_only_tensor", &kp::Tensor::isDeviceOnlyTensor, DOC(kp, Tensor, isDeviceOnlyTensor))
.def("destroy", &kp::Tensor::destroy, DOC(kp, Tensor, destroy));

py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence")
@@ -340,4 +341,3 @@ PYBIND11_MODULE(kp, m) {
m.attr("__version__") = "dev";
#endif
}

25 changes: 16 additions & 9 deletions python/test/test_logistic_regression.py
@@ -72,24 +72,31 @@ def compute_shader(
# Create a managed sequence
sq = mgr.sequence()

# Record operation to sync memory from local to GPU memory
sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in]))

# Record operation to execute GPU shader against all our parameters
sq.record(kp.OpAlgoDispatch(mgr.algorithm(params, compute_shader.to_spirv())))

# Record operation to sync memory from GPU to local memory
sq.record(kp.OpTensorSyncLocal([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]))

ITERATIONS = 100
learning_rate = 0.1

# Perform machine learning training and inference across all input X and Y
for i_iter in range(ITERATIONS):

# Execute an iteration of the algorithm
# We have to call eval after each record because recorded commands
# are not guaranteed to run in order

# Also, eval now clears the recorded operations, so we have to record them again

# Record operation to sync memory from local to GPU memory
sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in]))
sq.eval()

# Record operation to execute GPU shader against all our parameters
sq.record(kp.OpAlgoDispatch(mgr.algorithm(params, compute_shader.to_spirv())))
sq.eval()

# Record operation to sync memory from GPU to local memory
sq.record(kp.OpTensorSyncLocal([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]))
sq.eval()


# Calculate the parameters based on the respective derivatives calculated
for j_iter in range(tensor_b_out.size()):
tensor_w_in.data()[0] -= learning_rate * tensor_w_out_i.data()[j_iter]
2,639 changes: 1,332 additions & 1,307 deletions single_include/kompute/Kompute.hpp

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions src/OpTensorCopy.cpp
@@ -60,11 +60,16 @@ OpTensorCopy::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
KP_LOG_DEBUG("Kompute OpTensorCopy postEval called");

void* data = this->mTensors[0]->rawData();
if (!this->mTensors[0]->isDeviceOnlyTensor()) {
KP_LOG_DEBUG("Copying raw data on host memory to another tensors");
void* data = this->mTensors[0]->rawData();

// Copy the data from the first tensor into all the tensors
for (size_t i = 1; i < this->mTensors.size(); i++) {
this->mTensors[i]->setRawData(data);
// Copy the data from the first tensor into all the tensors
for (auto tensor : this->mTensors) {
if (!tensor->isDeviceOnlyTensor()) {
tensor->setRawData(data);
}
}
}
}

13 changes: 12 additions & 1 deletion src/Sequence.cpp
@@ -49,7 +49,16 @@ Sequence::begin()
}

KP_LOG_INFO("Kompute Sequence command now started recording");
this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
const auto commandBufferBeginInfo = vk::CommandBufferBeginInfo
{
// This has to be set because otherwise chaining multiple evals in a row would
// submit the same work multiple times. For example:
// mgr.sequence()
// ->eval<kp::OpTensorSyncDevice>({tensor_a})
// ->eval<kp::OpTensorCopy>({tensor_a, tensor_b})
vk::CommandBufferUsageFlagBits::eOneTimeSubmit
};
this->mCommandBuffer->begin(commandBufferBeginInfo);
this->mRecording = true;

// latch the first timestamp before any commands are submitted
@@ -170,6 +179,8 @@ Sequence::evalAwait(uint64_t waitFor)
this->mOperations[i]->postEval(*this->mCommandBuffer);
}

this->mOperations.clear();

return shared_from_this();
}

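For context, a minimal sketch of the chained-eval usage that the comment above refers to; the tensor values are illustrative and the snippet only assumes the public kp::Manager / kp::Sequence API already used in this PR. Without eOneTimeSubmit, work recorded for an earlier eval in the chain could be submitted again by a later one.

```cpp
#include <kompute/Kompute.hpp>

int main()
{
    kp::Manager mgr;

    // Illustrative host-visible tensors
    auto tensorA = mgr.tensor({ 1.0f, 2.0f, 3.0f });
    auto tensorB = mgr.tensor({ 0.0f, 0.0f, 0.0f });

    // Each eval() records into the same command buffer of the same sequence,
    // which is why the buffer is now begun with eOneTimeSubmit
    mgr.sequence()
        ->eval<kp::OpTensorSyncDevice>({ tensorA })
        ->eval<kp::OpTensorCopy>({ tensorA, tensorB })
        ->eval<kp::OpTensorSyncLocal>({ tensorB });

    return 0;
}
```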
22 changes: 16 additions & 6 deletions src/Tensor.cpp
@@ -53,9 +53,11 @@ Tensor::rebuild(void* data,
}

this->allocateMemoryCreateGPUResources();
this->mapRawData();

memcpy(this->mRawData, data, this->memorySize());
if (!this->isDeviceOnlyTensor()) {
this->mapRawData();
this->setRawData(data);
}
}

Tensor::TensorTypes
@@ -64,6 +66,12 @@ Tensor::tensorType()
return this->mTensorType;
}

bool
Tensor::isDeviceOnlyTensor()
{
return this->mTensorType == TensorTypes::eStorage;
}

bool
Tensor::isInit()
{
@@ -121,7 +129,7 @@ Tensor::mapRawData()
hostVisibleMemory = this->mStagingMemory;
} else {
KP_LOG_WARN(
"Kompute Tensor mapping data not supported on storage tensor");
"Kompute Tensor mapping data not supported on {} tensor", this->mTensorType);
return;
}

@@ -131,7 +139,6 @@
// flush
this->mRawData = this->mDevice->mapMemory(
*hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());

}

void
@@ -148,7 +155,7 @@ Tensor::unmapRawData()
hostVisibleMemory = this->mStagingMemory;
} else {
KP_LOG_WARN(
"Kompute Tensor mapping data not supported on storage tensor");
"Kompute Tensor mapping data not supported on {} tensor", this->mTensorType);
return;
}

@@ -486,7 +493,10 @@ Tensor::destroy()
}

// Unmap the current memory data
this->unmapRawData();
if (!this->isDeviceOnlyTensor()) {
this->unmapRawData();
}

if (this->mFreePrimaryBuffer) {
if (!this->mPrimaryBuffer) {
2 changes: 1 addition & 1 deletion src/include/kompute/Core.hpp
@@ -25,7 +25,7 @@ typedef std::vector<float> Constants;
#define KOMPUTE_VK_API_MAJOR_VERSION 1
#endif // KOMPUTE_VK_API_MAJOR_VERSION
#ifndef KOMPUTE_VK_API_MINOR_VERSION
#define KOMPUTE_VK_API_MINOR_VERSION 1
#define KOMPUTE_VK_API_MINOR_VERSION 3
#endif // KOMPUTE_VK_API_MINOR_VERSION
#define KOMPUTE_VK_API_VERSION \
VK_MAKE_VERSION( \
10 changes: 10 additions & 0 deletions src/include/kompute/Sequence.hpp
@@ -90,6 +90,8 @@ class Sequence : public std::enable_shared_from_this<Sequence>
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job synchronously (with a barrier).
*
* It also clears the operations recorded in the sequence.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
@@ -99,6 +101,8 @@ class Sequence : public std::enable_shared_from_this<Sequence>
* Resets all the recorded and stored operations, records the operation
* provided and submits into the gpu as a submit job synchronously (with a
* barrier).
*
* It also clears the operations recorded in the sequence.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
@@ -107,6 +111,8 @@ class Sequence : public std::enable_shared_from_this<Sequence>
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* It also clears the operations recorded in the sequence.
*
* @param tensors Vector of tensors to use for the operation
* @param TArgs Template parameters that are used to initialise operation
@@ -123,6 +129,8 @@ class Sequence : public std::enable_shared_from_this<Sequence>
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* It also clears the operations recorded in the sequence.
*
* @param algorithm Algorithm to use for the record often used for OpAlgo
* operations
@@ -196,6 +204,8 @@ class Sequence : public std::enable_shared_from_this<Sequence>
/**
* Eval Await waits for the fence to finish processing and then once it
* finishes, it runs the postEval of all operations.
*
* It also clears the operations recorded in the sequence.
*
* @param waitFor Number of milliseconds to wait before timing out.
* @return shared_ptr<Sequence> of the Sequence class itself
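A hedged usage sketch of the documented behaviour: since eval() now clears the recorded operations, a sequence has to be re-recorded before it is evaluated again. The tensor names are illustrative; only the kp::Sequence record/eval API shown in this PR is assumed.

```cpp
#include <memory>

#include <kompute/Kompute.hpp>

int main()
{
    kp::Manager mgr;
    auto tensorA = mgr.tensor({ 1.0f, 2.0f, 3.0f });

    std::shared_ptr<kp::Sequence> sq = mgr.sequence();

    // First pass: record and submit
    sq->record<kp::OpTensorSyncDevice>({ tensorA });
    sq->eval(); // runs the recorded operation and clears it from the sequence

    // Second pass: the sequence is empty again, so the operation has to be
    // recorded once more before the next eval()
    sq->record<kp::OpTensorSyncDevice>({ tensorA });
    sq->eval();

    return 0;
}
```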
9 changes: 9 additions & 0 deletions src/include/kompute/Tensor.hpp
@@ -86,6 +86,15 @@ class Tensor
*/
bool isInit();

/**
* Tells whether the data of the tensor is located only on the device. Used for
* determining whether mapping and copying of raw data is needed in rebuild.
* For example, eStorage tensors are device only.
*
* @return Boolean stating whether the data in the tensor is device only
*/
bool isDeviceOnlyTensor();

/**
* Retrieve the tensor type of the Tensor
*
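A small sketch of how the new helper might be used to guard host-side access. The function name and the float element type are assumptions for illustration only; they are not part of this PR.

```cpp
#include <iostream>
#include <memory>

#include <kompute/Kompute.hpp>

// Hypothetical helper: only touch raw data when the tensor actually has
// host-visible memory (eStorage tensors are device only)
void printIfHostVisible(const std::shared_ptr<kp::Tensor>& tensor)
{
    if (tensor->isDeviceOnlyTensor()) {
        std::cout << "device-only tensor, nothing to read on the host\n";
        return;
    }

    // Assumes the tensor holds float elements
    const float* data = static_cast<const float*>(tensor->rawData());
    for (uint32_t i = 0; i < tensor->size(); i++) {
        std::cout << data[i] << " ";
    }
    std::cout << "\n";
}
```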
33 changes: 33 additions & 0 deletions test/TestOpTensorCopy.cpp
@@ -156,3 +156,36 @@ TEST(TestOpTensorCopy, SingleTensorShouldFail)
EXPECT_THROW(mgr.sequence()->eval<kp::OpTensorCopy>({ tensorA }),
std::runtime_error);
}

TEST(TestOpTensorCopy, eStorageTensorCopy)
{
kp::Manager mgr;

const auto vec_in = std::vector<float>{ 0.0, 1.2, 3.2 };
const auto vec_out_with_wrong_values =
std::vector<float>(vec_in.size(), 0.0);

auto tensor_in = mgr.tensor(vec_in, kp::Tensor::TensorTypes::eDevice);
auto tensor_temp = mgr.tensor(nullptr,
vec_in.size(),
sizeof(decltype(vec_in.back())),
kp::Tensor::TensorDataTypes::eFloat,
kp::Tensor::TensorTypes::eStorage);
auto tensor_out = mgr.tensor(vec_out_with_wrong_values, kp::Tensor::TensorTypes::eDevice);

EXPECT_TRUE(tensor_in->isInit());

// For eStorage tensors isInit() will return false if a nullptr was provided as the data pointer
EXPECT_TRUE(!tensor_temp->isInit());

EXPECT_TRUE(tensor_out->isInit());

mgr.sequence()
->eval<kp::OpTensorSyncDevice>({tensor_in})
->eval<kp::OpTensorCopy>({tensor_in, tensor_temp})
->eval<kp::OpTensorCopy>({tensor_temp, tensor_out})
->eval<kp::OpTensorSyncLocal>({tensor_out});

EXPECT_EQ(tensor_in->vector(), tensor_out->vector());

}
6 changes: 4 additions & 2 deletions test/TestSequence.cpp
@@ -97,8 +97,10 @@ TEST(TestSequence, RerecordSequence)

algo->rebuild({ tensorOut, tensorA, tensorB }, spirv);

// Refresh and trigger a rerecord
sq->rerecord();
// Re-record manually (we cannot call rerecord() because the operations recorded in the sequence get cleared in eval())
sq->record<kp::OpAlgoDispatch>(algo)->record<kp::OpTensorSyncLocal>(
{ tensorA, tensorB, tensorOut });

sq->eval();

EXPECT_EQ(tensorB->vector(), std::vector<float>({ 2, 8, 18 }));