Skip to content

Commit

Permalink
Merge pull request #164 from EthicalML/160_op_memory
Browse files Browse the repository at this point in the history
Amend memory hierarchy to enable for push constants and functional interface for more flexible operations
  • Loading branch information
axsaucedo authored Feb 28, 2021
2 parents 2834519 + 4fddf74 commit 672cf22
Show file tree
Hide file tree
Showing 61 changed files with 3,080 additions and 4,804 deletions.
1 change: 1 addition & 0 deletions .ccls
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
-I./external/googletest/googletest/include/
-I./external/glslang/
-I./external/spdlog/include/
-I./external/fmt/include/
-I./src/include/
-I./single_include/
-I./vk_ndk_wrapper_include/
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ MK_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
mk_cmake:
cmake \
-Bbuild \
$(MK_CMAKE_EXTRA_FLAGS) \
-DKOMPUTE_EXTRA_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \
-DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \
-DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \
Expand All @@ -69,6 +68,7 @@ mk_cmake:
-DKOMPUTE_OPT_BUILD_SINGLE_HEADER=1 \
-DKOMPUTE_OPT_ENABLE_SPDLOG=1 \
-DKOMPUTE_OPT_CODE_COVERAGE=1 \
$(MK_CMAKE_EXTRA_FLAGS) \
-G "Unix Makefiles"

mk_build_all:
Expand Down
142 changes: 105 additions & 37 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,35 +56,65 @@ int main() {
// 2. Create and initialise Kompute Tensors through manager
auto tensorInA = mgr.tensor({ 2., 2., 2. });
auto tensorInB = mgr.tensor({ 1., 2., 3. });
auto tensorOut = mgr.tensor({ 0., 0., 0. });
auto tensorOutA = mgr.tensor({ 0., 0., 0. });
auto tensorOutB = mgr.tensor({ 0., 0., 0. });

// 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path)
std::string shaderString = (R"(
std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};

// 3. Create algorithm based on shader (supports buffers & push/spec constants)
std::string shader = (R"(
#version 450

layout (local_size_x = 1) in;

// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer bina { float tina[]; };
layout(set = 0, binding = 1) buffer binb { float tinb[]; };
layout(set = 0, binding = 2) buffer bout { float tout[]; };
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };

// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;

    // Kompute also supports spec constants on initialization
layout(constant_id = 0) const float const_one = 0;

void main() {
uint index = gl_GlobalInvocationID.x;
tout[index] = tina[index] * tinb[index];
out_a[index] += in_a[index] * in_b[index];
out_b[index] += const_one * push_const.val;
}
)");

// 3. Run operation with string shader synchronously
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
kp::Shader::compile_source(shaderString));
kp::Workgroup workgroup({3, 1, 1});
kp::Constants specConsts({ 2 });

auto algorithm = mgr.algorithm(params, kp::Shader::compile_source(shader), workgroup, specConsts);

kp::Constants pushConstsA({ 2.0 });
kp::Constants pushConstsB({ 3.0 });

// 4. Map results back from GPU memory to print the results
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorInA, tensorInB, tensorOut });
// 4. Run operation synchronously using sequence
mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsA)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsB)
->eval();

// Prints the output which is Output: { 2, 4, 6 }
for (const float& elem : tensorOut->data()) std::cout << elem << " ";
// 5. Sync results from the GPU asynchronously
    auto sq = mgr.sequence();
sq->evalAsync<kp::OpTensorSyncLocal>(params);

// ... Do other work asynchronously whilst GPU finishes

sq->evalAwait();

// Prints the first output which is: { 4, 8, 12 }
for (const float& elem : tensorOutA->data()) std::cout << elem << " ";
// Prints the second output which is: { 10, 10, 10 }
for (const float& elem : tensorOutB->data()) std::cout << elem << " ";
}

```
Expand All @@ -94,34 +124,72 @@ int main() {
The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows.

```python

# 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue)
mgr = Manager()
mgr = kp.Manager()

# 2. Create and initialise Kompute Tensors through manager
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
tensor_out_a = mgr.tensor([0, 0, 0])
tensor_out_b = mgr.tensor([0, 0, 0])

params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]

# 3. Create algorithm based on shader (supports buffers & push/spec constants)
shader = """
#version 450
layout (local_size_x = 1) in;
// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;
    // Kompute also supports spec constants on initialization
layout(constant_id = 0) const float const_one = 0;
void main() {
uint index = gl_GlobalInvocationID.x;
out_a[index] += in_a[index] * in_b[index];
out_b[index] += const_one * push_const.val;
}
"""

workgroup = (3, 1, 1)
spec_consts = [2]
push_consts_a = [2]
push_consts_b = [3]

algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts)

# 2. Create and initialise Kompute Tensors (can be initialized with List[] or np.Array)
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
# 4. Run operation synchronously using sequence
(mgr.sequence()
.record(kp.OpTensorSyncDevice(params))
.record(kp.OpAlgoDispatch(algo, push_consts_a))
.record(kp.OpAlgoDispatch(algo, push_consts_b))
.eval())

mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
# 5. Sync results from the GPU asynchronously
sq = mgr.sequence()
sq.eval_async(kp.OpTensorSyncLocal(params))

# 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path)
@python2shader
def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
data1=("buffer", 0, Array(f32)),
data2=("buffer", 1, Array(f32)),
data3=("buffer", 2, Array(f32))):
i = index.x
data3[i] = data1[i] * data2[i]
# ... Do other work asynchronously whilst GPU finishes

# 4. Run multiplication operation synchronously
mgr.eval_algo_data_def(
[tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
sq.eval_await()

# 5. Map results back from GPU memory to print the results
mgr.eval_tensor_sync_local_def([tensor_out])
# Prints the first output which is: { 4, 8, 12 }
print(tensor_out_a)
# Prints the second output which is: { 10, 10, 10 }
print(tensor_out_b)

# Prints [2.0, 4.0, 6.0]
print(tensor_out.data())
```

### Interactive Notebooks & Hands on Videos
Expand Down Expand Up @@ -199,7 +267,7 @@ The core architecture of Kompute includes the following:
* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) code executed in the GPU
* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed on the GPU

To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).

Expand Down
Binary file modified docs/images/kompute-vulkan-architecture.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
93 changes: 47 additions & 46 deletions examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,61 +20,62 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
uint32_t ITERATIONS = 100;
float learningRate = 0.1;

std::shared_ptr<kp::Tensor> xI{ new kp::Tensor(xIData) };
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor(xJData) };

std::shared_ptr<kp::Tensor> y{ new kp::Tensor(yData) };

std::shared_ptr<kp::Tensor> wIn{ new kp::Tensor({ 0.001, 0.001 }) };
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor(zerosData) };
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor(zerosData) };

std::shared_ptr<kp::Tensor> bIn{ new kp::Tensor({ 0 }) };
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor(zerosData) };
{
kp::Manager mgr;

std::shared_ptr<kp::Tensor> lOut{ new kp::Tensor(zerosData) };
std::shared_ptr<kp::Tensor> xI = mgr.tensor(xIData);
std::shared_ptr<kp::Tensor> xJ = mgr.tensor(xJData);

std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };
std::shared_ptr<kp::Tensor> y = mgr.tensor(yData);

{
kp::Manager mgr;
std::shared_ptr<kp::Tensor> wIn = mgr.tensor({ 0.001, 0.001 });
std::shared_ptr<kp::Tensor> wOutI = mgr.tensor(zerosData);
std::shared_ptr<kp::Tensor> wOutJ = mgr.tensor(zerosData);

{
mgr.rebuild(params);
std::shared_ptr<kp::Tensor> bIn = mgr.tensor({ 0 });
std::shared_ptr<kp::Tensor> bOut = mgr.tensor(zerosData);

std::shared_ptr<kp::Sequence> sq = mgr.sequence();
std::shared_ptr<kp::Tensor> lOut = mgr.tensor(zerosData);

// Record op algo base
sq->begin();
std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };

sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
std::vector<uint32_t> spirv(
(uint32_t*)kp::shader_data::shaders_glsl_logisticregression_comp_spv,
(uint32_t*)(kp::shader_data::shaders_glsl_logisticregression_comp_spv
+ kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));

// Newer versions of Android are able to use shaderc to read raw string
sq->record<kp::OpAlgoBase>(
params, kp::Shader::compile_source(LR_SHADER));
std::shared_ptr<kp::Algorithm> algo =
mgr.algorithm(params, spirv, kp::Workgroup({ 5 }), kp::Constants({ 5.0 }));

sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
mgr.sequence()->eval<kp::OpTensorSyncDevice>(params);

sq->end();
std::shared_ptr<kp::Sequence> sq = mgr.sequence()
->record<kp::OpTensorSyncDevice>({ wIn, bIn })
->record<kp::OpAlgoDispatch>(algo)
->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });

// Iterate across all expected iterations
for (size_t i = 0; i < ITERATIONS; i++) {
// Iterate across all expected iterations
for (size_t i = 0; i < ITERATIONS; i++) {

sq->eval();
sq->eval();

for (size_t j = 0; j < bOut->size(); j++) {
wIn->data()[0] -= learningRate * wOutI->data()[j];
wIn->data()[1] -= learningRate * wOutJ->data()[j];
bIn->data()[0] -= learningRate * bOut->data()[j];
}
for (size_t j = 0; j < bOut->size(); j++) {
wIn->data()[0] -= learningRate * wOutI->data()[j];
wIn->data()[1] -= learningRate * wOutJ->data()[j];
bIn->data()[0] -= learningRate * bOut->data()[j];
}
}
}

this->mWeights = kp::Tensor(wIn->data());
this->mBias = kp::Tensor(bIn->data());
KP_LOG_INFO("RESULT: <<<<<<<<<<<<<<<<<<<");
KP_LOG_INFO("{}", wIn->data()[0]);
KP_LOG_INFO("{}", wIn->data()[1]);
KP_LOG_INFO("{}", bIn->data()[0]);

this->mWeights = wIn;
this->mBias = bIn;
}
}

std::vector<float> KomputeModelML::predict(std::vector<float> xI, std::vector<float> xJ) {
Expand All @@ -88,9 +89,9 @@ std::vector<float> KomputeModelML::predict(std::vector<float> xI, std::vector<fl
for (size_t i = 0; i < xI.size(); i++) {
float xIVal = xI[i];
float xJVal = xJ[i];
float result = (xIVal * this->mWeights.data()[0]
+ xJVal * this->mWeights.data()[1]
+ this->mBias.data()[0]);
float result = (xIVal * this->mWeights->data()[0]
+ xJVal * this->mWeights->data()[1]
+ this->mBias->data()[0]);

// Instead of using sigmoid we'll just return full numbers
float var = result > 0 ? 1 : 0;
Expand All @@ -103,13 +104,13 @@ std::vector<float> KomputeModelML::predict(std::vector<float> xI, std::vector<fl
std::vector<float> KomputeModelML::get_params() {
std::vector<float> retVector;

if(this->mWeights.size() + this->mBias.size() == 0) {
if(this->mWeights->size() + this->mBias->size() == 0) {
return retVector;
}

retVector.push_back(this->mWeights.data()[0]);
retVector.push_back(this->mWeights.data()[1]);
retVector.push_back(this->mBias.data()[0]);
retVector.push_back(this->mWeights->data()[0]);
retVector.push_back(this->mWeights->data()[1]);
retVector.push_back(this->mBias->data()[0]);
retVector.push_back(99.0);

return retVector;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <vector>
#include <string>
#include <memory>

#include "kompute/Kompute.hpp"

Expand All @@ -20,8 +21,8 @@ class KomputeModelML {
std::vector<float> get_params();

private:
kp::Tensor mWeights;
kp::Tensor mBias;
std::shared_ptr<kp::Tensor> mWeights;
std::shared_ptr<kp::Tensor> mBias;

};

Expand Down
11 changes: 7 additions & 4 deletions examples/array_multiplication/src/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,14 @@ int main()
}
)");

mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
kp::Shader::compile_source(shader));
std::vector<std::shared_ptr<kp::Tensor>> params = { tensorInA, tensorInB, tensorOut };

mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(params, kp::Shader::compile_source(shader));

mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algo)
->record<kp::OpTensorSyncLocal>(params);

// prints "Output { 0 4 12 }"
std::cout<< "Output: { ";
Expand Down
Loading

0 comments on commit 672cf22

Please sign in to comment.