Amend memory hierarchy to enable for push constants and functional interface for more flexible operations #164

Merged · 24 commits · Feb 28, 2021

Commits
9aae5d6  Initial checkpoint with reasonable workflow (axsaucedo, Feb 24, 2021)
635fdb0  Added baseline functionality including new memory models (axsaucedo, Feb 24, 2021)
f35a62e  Updated core initial running test (axsaucedo, Feb 24, 2021)
3f12882  Working initial base (axsaucedo, Feb 25, 2021)
6378583  Further tests added to new structure (axsaucedo, Feb 25, 2021)
fb617d1  Initial base set of tests aligned with new interface (axsaucedo, Feb 25, 2021)
3304767  Updated to enable for opmult to work (axsaucedo, Feb 26, 2021)
b3abbf1  Added algo executions tests (axsaucedo, Feb 26, 2021)
fd0f0d3  Fixed all OpTensorSync tests (axsaucedo, Feb 26, 2021)
7fb1515  Added TensorOpTensorCreate tests (axsaucedo, Feb 27, 2021)
08a5543  Removed the brainstorming code in the py file (axsaucedo, Feb 27, 2021)
2b09c55  Updated testopshaders (axsaucedo, Feb 27, 2021)
f206c62  Added TensorOpCopy tests (axsaucedo, Feb 27, 2021)
9788c79  Updated sequence and sequence tests (axsaucedo, Feb 27, 2021)
9d206c3  Update cmakelists to align with required setup (axsaucedo, Feb 27, 2021)
198fb46  Fixed integration tests fails due to pipeline not freed (axsaucedo, Feb 27, 2021)
4c4d073  Python implementation (axsaucedo, Feb 27, 2021)
91d3b9a  All python tests pass (axsaucedo, Feb 28, 2021)
7dc1f35  Added support for push constants (axsaucedo, Feb 28, 2021)
38f356f  Updated python to align with current configuration (axsaucedo, Feb 28, 2021)
ddb7770  Updated examples in readme (axsaucedo, Feb 28, 2021)
75315db  Updated ref architecture (axsaucedo, Feb 28, 2021)
63e220a  Reformat (axsaucedo, Feb 28, 2021)
4fddf74  Updated examples (axsaucedo, Feb 28, 2021)
1 change: 1 addition & 0 deletions .ccls
@@ -19,6 +19,7 @@
-I./external/googletest/googletest/include/
-I./external/glslang/
-I./external/spdlog/include/
-I./external/fmt/include/
-I./src/include/
-I./single_include/
-I./vk_ndk_wrapper_include/
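
The new include path points the ccls language server at fmt; this is presumably needed to resolve the fmt-style logging calls (`KP_LOG_INFO("{}", ...)`) that appear in the updated examples further down in this diff.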
2 changes: 1 addition & 1 deletion Makefile
@@ -57,7 +57,6 @@ MK_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
mk_cmake:
cmake \
-Bbuild \
$(MK_CMAKE_EXTRA_FLAGS) \
-DKOMPUTE_EXTRA_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \
-DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \
-DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \
@@ -69,6 +68,7 @@ mk_cmake:
-DKOMPUTE_OPT_BUILD_SINGLE_HEADER=1 \
-DKOMPUTE_OPT_ENABLE_SPDLOG=1 \
-DKOMPUTE_OPT_CODE_COVERAGE=1 \
$(MK_CMAKE_EXTRA_FLAGS) \
-G "Unix Makefiles"

mk_build_all:
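Note that this is a reordering rather than a pure addition: `$(MK_CMAKE_EXTRA_FLAGS)` now comes after the built-in `-D` definitions instead of before them. Because CMake honours the last value when the same variable is defined twice on the command line, this lets callers override the defaults, e.g. `make mk_cmake MK_CMAKE_EXTRA_FLAGS="-DKOMPUTE_OPT_CODE_COVERAGE=0"`.
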
142 changes: 105 additions & 37 deletions README.md
@@ -56,35 +56,65 @@ int main() {
// 2. Create and initialise Kompute Tensors through manager
auto tensorInA = mgr.tensor({ 2., 2., 2. });
auto tensorInB = mgr.tensor({ 1., 2., 3. });
auto tensorOut = mgr.tensor({ 0., 0., 0. });
auto tensorOutA = mgr.tensor({ 0., 0., 0. });
auto tensorOutB = mgr.tensor({ 0., 0., 0. });

// 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path)
std::string shaderString = (R"(
std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};

// 3. Create algorithm based on shader (supports buffers & push/spec constants)
std::string shader = (R"(
#version 450

layout (local_size_x = 1) in;

// Each tensor's binding index corresponds to its position in the params vector
layout(set = 0, binding = 0) buffer bina { float tina[]; };
layout(set = 0, binding = 1) buffer binb { float tinb[]; };
layout(set = 0, binding = 2) buffer bout { float tout[]; };
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };

// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;

// Kompute also supports spec constants on initialization
layout(constant_id = 0) const float const_one = 0;

void main() {
uint index = gl_GlobalInvocationID.x;
tout[index] = tina[index] * tinb[index];
out_a[index] += in_a[index] * in_b[index];
out_b[index] += const_one * push_const.val;
}
)");

// 3. Run operation with string shader synchronously
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
kp::Shader::compile_source(shaderString));
kp::Workgroup workgroup({3, 1, 1});
kp::Constants specConsts({ 2 });

auto algorithm = mgr.algorithm(params, kp::Shader::compile_source(shader), workgroup, specConsts);

kp::Constants pushConstsA({ 2.0 });
kp::Constants pushConstsB({ 3.0 });

// 4. Map results back from GPU memory to print the results
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorInA, tensorInB, tensorOut });
// 4. Run operation synchronously using sequence
mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsA)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsB)
->eval();

// Prints the output which is Output: { 2, 4, 6 }
for (const float& elem : tensorOut->data()) std::cout << elem << " ";
// 5. Sync results from the GPU asynchronously
auto sq = mgr.sequence();
sq->evalAsync<kp::OpTensorSyncLocal>(params);

// ... Do other work asynchronously whilst GPU finishes

sq->evalAwait();

// Prints the first output which is: { 4, 8, 12 }
for (const float& elem : tensorOutA->data()) std::cout << elem << " ";
// Prints the second output which is: { 10, 10, 10 }
for (const float& elem : tensorOutB->data()) std::cout << elem << " ";
}

```
@@ -94,34 +124,72 @@ int main() {
The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables experimentation whilst ensuring high performance and fast development workflows.

```python

# 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue)
mgr = Manager()
mgr = kp.Manager()

# 2. Create and initialise Kompute Tensors through manager
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
tensor_out_a = mgr.tensor([0, 0, 0])
tensor_out_b = mgr.tensor([0, 0, 0])

params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]

# 3. Create algorithm based on shader (supports buffers & push/spec constants)
shader = """
#version 450

layout (local_size_x = 1) in;

// Each tensor's binding index corresponds to its position in the params vector
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };

// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;

// Kompute also supports spec constants on initialization
layout(constant_id = 0) const float const_one = 0;

void main() {
uint index = gl_GlobalInvocationID.x;
out_a[index] += in_a[index] * in_b[index];
out_b[index] += const_one * push_const.val;
}
"""

workgroup = (3, 1, 1)
spec_consts = [2]
push_consts_a = [2]
push_consts_b = [3]

algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts)

# 2. Create and initialise Kompute Tensors (can be initialized with List[] or np.Array)
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
# 4. Run operation synchronously using sequence
(mgr.sequence()
.record(kp.OpTensorSyncDevice(params))
.record(kp.OpAlgoDispatch(algo, push_consts_a))
.record(kp.OpAlgoDispatch(algo, push_consts_b))
.eval())

mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
# 5. Sync results from the GPU asynchronously
sq = mgr.sequence()
sq.eval_async(kp.OpTensorSyncLocal(params))

# 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path)
@python2shader
def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
data1=("buffer", 0, Array(f32)),
data2=("buffer", 1, Array(f32)),
data3=("buffer", 2, Array(f32))):
i = index.x
data3[i] = data1[i] * data2[i]
# ... Do other work asynchronously whilst GPU finishes

# 4. Run multiplication operation synchronously
mgr.eval_algo_data_def(
[tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
sq.eval_await()

# 5. Map results back from GPU memory to print the results
mgr.eval_tensor_sync_local_def([tensor_out])
# Prints the first output which is: { 4, 8, 12 }
print(tensor_out_a.data())
# Prints the second output which is: { 10, 10, 10 }
print(tensor_out_b.data())

# Prints [2.0, 4.0, 6.0]
print(tensor_out.data())
```

### Interactive Notebooks & Hands on Videos
@@ -199,7 +267,7 @@ The core architecture of Kompute includes the following:
* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) code executed in the GPU
* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed in the GPU

To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).

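
One detail worth drawing out of the README diff: synchronous and asynchronous execution use the same operation types and differ only in how the sequence is submitted. A small sketch of the async pattern, reusing the names from the C++ example above:

```cpp
// Submit the download without blocking the CPU
auto sq = mgr.sequence();
sq->evalAsync<kp::OpTensorSyncLocal>(params);

// ... do other CPU work while the GPU transfer completes ...

sq->evalAwait(); // block until the submitted operations have finished
```
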
Binary file modified docs/images/kompute-vulkan-architecture.jpg
93 changes: 47 additions & 46 deletions examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
@@ -20,61 +20,62 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
uint32_t ITERATIONS = 100;
float learningRate = 0.1;

std::shared_ptr<kp::Tensor> xI{ new kp::Tensor(xIData) };
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor(xJData) };

std::shared_ptr<kp::Tensor> y{ new kp::Tensor(yData) };

std::shared_ptr<kp::Tensor> wIn{ new kp::Tensor({ 0.001, 0.001 }) };
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor(zerosData) };
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor(zerosData) };

std::shared_ptr<kp::Tensor> bIn{ new kp::Tensor({ 0 }) };
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor(zerosData) };
{
kp::Manager mgr;

std::shared_ptr<kp::Tensor> lOut{ new kp::Tensor(zerosData) };
std::shared_ptr<kp::Tensor> xI = mgr.tensor(xIData);
std::shared_ptr<kp::Tensor> xJ = mgr.tensor(xJData);

std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };
std::shared_ptr<kp::Tensor> y = mgr.tensor(yData);

{
kp::Manager mgr;
std::shared_ptr<kp::Tensor> wIn = mgr.tensor({ 0.001, 0.001 });
std::shared_ptr<kp::Tensor> wOutI = mgr.tensor(zerosData);
std::shared_ptr<kp::Tensor> wOutJ = mgr.tensor(zerosData);

{
mgr.rebuild(params);
std::shared_ptr<kp::Tensor> bIn = mgr.tensor({ 0 });
std::shared_ptr<kp::Tensor> bOut = mgr.tensor(zerosData);

std::shared_ptr<kp::Sequence> sq = mgr.sequence();
std::shared_ptr<kp::Tensor> lOut = mgr.tensor(zerosData);

// Record op algo base
sq->begin();
std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };

sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
std::vector<uint32_t> spirv(
(uint32_t*)kp::shader_data::shaders_glsl_logisticregression_comp_spv,
(uint32_t*)(kp::shader_data::shaders_glsl_logisticregression_comp_spv
+ kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));

// Newer versions of Android are able to use shaderc to read raw string
sq->record<kp::OpAlgoBase>(
params, kp::Shader::compile_source(LR_SHADER));
std::shared_ptr<kp::Algorithm> algo =
mgr.algorithm(params, spirv, kp::Workgroup({ 5 }), kp::Constants({ 5.0 }));

sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
mgr.sequence()->eval<kp::OpTensorSyncDevice>(params);

sq->end();
std::shared_ptr<kp::Sequence> sq = mgr.sequence()
->record<kp::OpTensorSyncDevice>({ wIn, bIn })
->record<kp::OpAlgoDispatch>(algo)
->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });

// Iterate across all expected iterations
for (size_t i = 0; i < ITERATIONS; i++) {
// Iterate across all expected iterations
for (size_t i = 0; i < ITERATIONS; i++) {

sq->eval();
sq->eval();

for (size_t j = 0; j < bOut->size(); j++) {
wIn->data()[0] -= learningRate * wOutI->data()[j];
wIn->data()[1] -= learningRate * wOutJ->data()[j];
bIn->data()[0] -= learningRate * bOut->data()[j];
}
for (size_t j = 0; j < bOut->size(); j++) {
wIn->data()[0] -= learningRate * wOutI->data()[j];
wIn->data()[1] -= learningRate * wOutJ->data()[j];
bIn->data()[0] -= learningRate * bOut->data()[j];
}
}
}

this->mWeights = kp::Tensor(wIn->data());
this->mBias = kp::Tensor(bIn->data());
KP_LOG_INFO("RESULT: <<<<<<<<<<<<<<<<<<<");
KP_LOG_INFO("{}", wIn->data()[0]);
KP_LOG_INFO("{}", wIn->data()[1]);
KP_LOG_INFO("{}", bIn->data()[0]);

this->mWeights = wIn;
this->mBias = bIn;
}
}

std::vector<float> KomputeModelML::predict(std::vector<float> xI, std::vector<float> xJ) {
@@ -88,9 +89,9 @@ std::vector<float> KomputeModelML::predict(std::vector<float> xI, std::vector<fl
for (size_t i = 0; i < xI.size(); i++) {
float xIVal = xI[i];
float xJVal = xJ[i];
float result = (xIVal * this->mWeights.data()[0]
+ xJVal * this->mWeights.data()[1]
+ this->mBias.data()[0]);
float result = (xIVal * this->mWeights->data()[0]
+ xJVal * this->mWeights->data()[1]
+ this->mBias->data()[0]);

// Instead of using sigmoid we'll just return full numbers
float var = result > 0 ? 1 : 0;
@@ -103,13 +104,13 @@ std::vector<float> KomputeModelML::predict(std::vector<fl
std::vector<float> KomputeModelML::get_params() {
std::vector<float> retVector;

if(this->mWeights.size() + this->mBias.size() == 0) {
if(this->mWeights->size() + this->mBias->size() == 0) {
return retVector;
}

retVector.push_back(this->mWeights.data()[0]);
retVector.push_back(this->mWeights.data()[1]);
retVector.push_back(this->mBias.data()[0]);
retVector.push_back(this->mWeights->data()[0]);
retVector.push_back(this->mWeights->data()[1]);
retVector.push_back(this->mBias->data()[0]);
retVector.push_back(99.0);

return retVector;
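
The rewritten `train()` records its three operations once and then re-evaluates the same sequence on every iteration, with only the CPU-side weight updates in between. Stripped to its skeleton (tensor and algorithm setup elided, names as in the diff above):

```cpp
std::shared_ptr<kp::Sequence> sq = mgr.sequence()
    ->record<kp::OpTensorSyncDevice>({ wIn, bIn })                 // upload current weights
    ->record<kp::OpAlgoDispatch>(algo)                             // logistic regression pass
    ->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut }); // download outputs

for (size_t i = 0; i < ITERATIONS; i++) {
    sq->eval(); // re-submits the same recorded command buffer

    for (size_t j = 0; j < bOut->size(); j++) {
        wIn->data()[0] -= learningRate * wOutI->data()[j];
        wIn->data()[1] -= learningRate * wOutJ->data()[j];
        bIn->data()[0] -= learningRate * bOut->data()[j];
    }
}
```
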
examples/android/android-simple/app/src/main/cpp/KomputeModelML.hpp
@@ -4,6 +4,7 @@

#include <vector>
#include <string>
#include <memory>

#include "kompute/Kompute.hpp"

@@ -20,8 +21,8 @@ class KomputeModelML {
std::vector<float> get_params();

private:
kp::Tensor mWeights;
kp::Tensor mBias;
std::shared_ptr<kp::Tensor> mWeights;
std::shared_ptr<kp::Tensor> mBias;

};

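
The matching header change follows from the amended memory hierarchy: tensors are created and owned through the manager, so the model now stores `std::shared_ptr<kp::Tensor>` handles (hence the added `<memory>` include and the `->` accessors in `predict()` and `get_params()` above) instead of holding `kp::Tensor` objects by value.
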
11 changes: 7 additions & 4 deletions examples/array_multiplication/src/Main.cpp
@@ -37,11 +37,14 @@ int main()
}
)");

mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
kp::Shader::compile_source(shader));
std::vector<std::shared_ptr<kp::Tensor>> params = { tensorInA, tensorInB, tensorOut };

mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(params, kp::Shader::compile_source(shader));

mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algo)
->record<kp::OpTensorSyncLocal>(params)
->eval();

// prints "Output { 0 4 12 }"
std::cout<< "Output: { ";