Extend non-spdlog print functions to use fmt::format / fmt::print #159

Merged: 4 commits on Feb 21, 2021
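For context, this change routes Kompute's non-spdlog print/format fallbacks through fmt. A minimal sketch of that pattern is below; the macro name and message prefix are hypothetical and only illustrate the idea, they are not the PR's actual code:

```cpp
#include <fmt/core.h>

// Hypothetical fallback macro for when spdlog is disabled; the real
// Kompute macros may differ. It formats the message with fmt::format
// and prints it with fmt::print.
#ifndef KOMPUTE_ENABLE_SPDLOG
#define KP_LOG_INFO(...) fmt::print("[KOMPUTE] {}\n", fmt::format(__VA_ARGS__))
#endif

int main() {
    KP_LOG_INFO("Evaluating {} tensors on queue {}", 3, "queueOne");
    return 0;
}
```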
4 changes: 4 additions & 0 deletions .gitmodules
@@ -18,3 +18,7 @@
path = external/glslang
url = https://github.com/KhronosGroup/glslang/
branch = 11.1.0
[submodule "external/fmt"]
path = external/fmt
url = https://github.com/fmtlib/fmt
branch = 7.1.3
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -25,12 +25,18 @@ set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, se

if(KOMPUTE_OPT_ENABLE_SPDLOG)
set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DKOMPUTE_ENABLE_SPDLOG=1")
set(SPDLOG_FMT_EXTERNAL ON CACHE BOOL "Enables external fmt as its current dep" FORCE)
if(KOMPUTE_OPT_INSTALL)
# Enable install parameters for spdlog (overrides parameters passed)
set(SPDLOG_INSTALL ON CACHE BOOL "Enables install of glslang" FORCE)
set(SPDLOG_INSTALL ON CACHE BOOL "Enables install of spdlog" FORCE)
endif()
endif()

if(KOMPUTE_OPT_INSTALL)
# Enable install parameters for fmt (overrides parameters passed)
set(FMT_INSTALL ON CACHE BOOL "Enables install of fmt" FORCE)
endif()

if(KOMPUTE_OPT_ANDOID_BUILD)
set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DVK_USE_PLATFORM_ANDROID_KHR")
endif()
2 changes: 1 addition & 1 deletion README.md
@@ -78,7 +78,7 @@ int main() {
// 3. Run operation with string shader synchronously
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
std::vector<uint32_t>(shaderString.begin(), shaderString.end()));
kp::Shader::compile_source(shaderString));

// 4. Map results back from GPU memory to print the results
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorInA, tensorInB, tensorOut });
279 changes: 14 additions & 265 deletions docs/overview/advanced-examples.rst
@@ -45,7 +45,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));

// Create tensors data explicitly in GPU with an operation
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
mgr.rebuild({ tensorA, tensorB });

// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -67,7 +67,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path
// Run Kompute operation on the parameters provided with dispatch layout
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorA, tensorB },
std::vector<char>(shader.begin(), shader.end()));
kp::Shader::compile_source(shader));

// Sync the GPU memory back to the local tensor
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@@ -105,7 +105,7 @@ Record commands in a single submit by using a Sequence to send in batch to GPU.
sq->begin();

// Record batch commands to send to GPU
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
sq->record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});

// Stop recording
@@ -146,7 +146,7 @@ You can submit operations asynchronously with the async/await commands in the kp
auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));

// Create tensors data explicitly in GPU with an operation
mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
mgr.rebuild({ tensor });

// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -174,6 +174,8 @@ You can submit operations asynchronously with the async/await commands in the kp
}
)");

std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);

// We can now await for the previous submitted command
// The first parameter can be the amount of time to wait
// The time provided is in nanoseconds
@@ -182,7 +184,7 @@ You can submit operations asynchronously with the async/await commands in the kp
// Run Async Kompute operation on the parameters provided
mgr.evalOpAsyncDefault<kp::OpAlgoBase>(
{ tensor },
std::vector<char>(shader.begin(), shader.end()));
spirv);

// Here we can do other work

@@ -234,7 +236,7 @@ Back to `examples list <#simple-examples>`_.
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));

// We run the first step synchronously on the default sequence
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
mgr.rebuild({ tensorA, tensorB });

// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -262,17 +264,19 @@ Back to `examples list <#simple-examples>`_.
}
)");

std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);

// Run the first parallel operation in the `queueOne` sequence
mgr.evalOpAsync<kp::OpAlgoBase>(
{ tensorA },
"queueOne",
std::vector<char>(shader.begin(), shader.end()));
spirv);

// Run the second parallel operation in the `queueTwo` sequence
mgr.evalOpAsync<kp::OpAlgoBase>(
{ tensorB },
"queueTwo",
std::vector<char>(shader.begin(), shader.end()));
spirv);

// Here we can do other work

@@ -308,7 +312,7 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "")
{
// Perform your custom steps such as reading from a shader file
this->mShaderFilePath = "shaders/glsl/opmult.comp";
this->mShaderFilePath = "shaders/glsl/opmult.comp.spv";
}
}

@@ -323,7 +327,7 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));

// Create tensors data explicitly in GPU with an operation
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
mgr.rebuild({ tensorLhs, tensorRhs, tensorOut });

// Run Kompute operation on the parameters provided with dispatch layout
mgr.evalOpDefault<kp::OpMyCustom<3, 1, 1>>(
@@ -334,258 +338,3 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
}


Logistic Regression Example
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Logistic regression is often seen as the hello world of machine learning, so we will be using it for our examples. Back to `examples list <#simple-examples>`_.

.. image:: ../images/logistic-regression.jpg
:width: 300px


In summary, we have:


* Vector ``X`` with input data (with a pair of inputs ``Xi`` and ``Xj``\ )
* Output ``Y`` with expected predictions

With this we will:

* Optimize the function simplified as ``Y = WX + b``
* We'll want our program to learn the parameters ``W`` and ``b``

We will have to convert this into Kompute terminology.
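
For reference, these are the model, loss, and per-sample gradients that the compute shader at the end of this example implements, with :math:`m` denoting the number of samples (the specialization constant ``M`` in the shader):

.. math::

   \hat{y} = \sigma(w \cdot x + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}

   L(\hat{y}, y) = -(y \log \hat{y} + (1 - y) \log(1 - \hat{y}))

   \frac{\partial L}{\partial z} = \hat{y} - y, \qquad
   \nabla_w L = \frac{1}{m}\, x \, (\hat{y} - y), \qquad
   \frac{\partial L}{\partial b} = \frac{1}{m}\, (\hat{y} - y)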

First, specifically for the inputs, we will be using the following:

* Two vectors for the variable ``X``: vector ``Xi`` and vector ``Xj``
* One vector ``Y`` for the true predictions
* A vector ``W`` containing the two input weight values to use for inference
* A vector ``B`` containing a single input parameter for ``b``

.. code-block:: cpp
:linenos:

std::vector<float> wInVec = { 0.001, 0.001 };
std::vector<float> bInVec = { 0 };

std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })};
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })};

std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 })};

std::shared_ptr<kp::Tensor> wIn{
new kp::Tensor(wInVec, kp::Tensor::TensorTypes::eStaging)};

std::shared_ptr<kp::Tensor> bIn{
new kp::Tensor(bInVec, kp::Tensor::TensorTypes::eStaging)};


We will have the following output vectors:

* Two output vectors ``Wi`` and ``Wj`` to store all the deltas to perform gradient descent on ``W``
* One output vector ``Bout`` to store all the deltas to perform gradient descent on ``B``

.. code-block:: cpp
:linenos:

std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })};

std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })};


For simplicity, we will store all the tensors inside a ``params`` variable:

.. code-block:: cpp
:linenos:

std::vector<std::shared_ptr<kp::Tensor>> params =
{xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut};


Now that we have the inputs and outputs, we can use them in the processing. The workflow we will be using is the following:

1. Create a Sequence to record and submit GPU commands
2. Submit OpCreateTensor to create all the tensors
3. Record the OpAlgo with the Logistic Regression shader
4. Loop across the number of iterations:

   * 4-a. Submit the algo operation on the LR shader
   * 4-b. Re-calculate the weights from the loss

5. Print output weights and bias

1. Create a sequence to record and submit GPU commands

.. code-block:: cpp
:linenos:

kp::Manager mgr;

if (std::shared_ptr<kp::Sequence> sq =
mgr.sequence("createTensors").lock())
{
// ...



2. Submit OpCreateTensor to create all the tensors

.. code-block:: cpp
:linenos:

{
// ... continuing from codeblock above

sq->begin();

sq->record<kp::OpCreateTensor>(params);

sq->end();
sq->eval();


3. Record the OpAlgo with the Logistic Regression shader

Once we re-record, all the instructions that were recorded previously are cleared.

Because of this, we can now record the new commands, which will consist of the following:


.. code-block:: cpp
:linenos:

{
// ... continuing from codeblock above

sq->begin();

sq->record<kp::OpTensorSyncDevice>({wIn, bIn});

sq->record<kp::OpAlgoBase>(
params,
false, // Whether to copy output from device
"test/shaders/glsl/test_logistic_regression.comp");

sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});

sq->end();



4. Loop across the number of iterations + 4-a. Submit the algo operation on the LR shader

.. code-block:: cpp
:linenos:

{
// ... continuing from codeblock above

uint32_t ITERATIONS = 100;

for (size_t i = 0; i < ITERATIONS; i++)
{
// Run evaluation which passes data through shader once
sq->eval();



4-b. Re-calculate weights from loss


Once the shader code is executed, we are able to use the outputs from the shader calculation.

In this case we basically want to apply all the calculated weight and bias deltas from the back-propagation step by subtracting them from the current parameters.
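
In other words, each iteration applies the summed per-sample deltas to the parameters (the learning rate ``0.1`` is already applied inside the shader when it writes its outputs):

.. math::

   w \leftarrow w - \eta \sum_{j=1}^{m} \frac{1}{m}\, x_j \, (\hat{y}_j - y_j), \qquad
   b \leftarrow b - \eta \sum_{j=1}^{m} \frac{1}{m}\, (\hat{y}_j - y_j)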

.. code-block:: cpp
:linenos:

{
// ...
for (size_t i = 0; i < ITERATIONS; i++)
{
// ... continuing from codeblock above

// Run evaluation which passes data through shader once
sq->eval();

// Subtract the resulting weights and biases
for(size_t j = 0; j < bOut->size(); j++) {
wInVec[0] -= wOutI->data()[j];
wInVec[1] -= wOutJ->data()[j];
bInVec[0] -= bOut->data()[j];
}
// Set the data for the GPU to use in the next iteration
wIn->mapDataIntoHostMemory();
bIn->mapDataIntoHostMemory();
}

5. Print output weights and bias

.. code-block:: cpp
:linenos:

std::cout << "Weight i: " << wIn->data()[0] << std::endl;
std::cout << "Weight j: " << wIn->data()[1] << std::endl;
std::cout << "Bias: " << bIn->data()[0] << std::endl;



Logistic Regression Compute Shader
----------------------------------

Finally, you can see the shader used for the logistic regression use case below:

.. code-block:: cpp
:linenos:

#version 450

layout (constant_id = 0) const uint M = 0;

layout (local_size_x = 1) in;

layout(set = 0, binding = 0) buffer bxi { float xi[]; };
layout(set = 0, binding = 1) buffer bxj { float xj[]; };
layout(set = 0, binding = 2) buffer by { float y[]; };
layout(set = 0, binding = 3) buffer bwin { float win[]; };
layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
layout(set = 0, binding = 6) buffer bbin { float bin[]; };
layout(set = 0, binding = 7) buffer bbout { float bout[]; };

float learningRate = 0.1;
float m = float(M);

float sigmoid(float z) {
return 1.0 / (1.0 + exp(-z));
}

float inference(vec2 x, vec2 w, float b) {
float z = dot(w, x) + b;
float yHat = sigmoid(z);
return yHat;
}

float calculateLoss(float yHat, float y) {
return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat));
}

void main() {
uint idx = gl_GlobalInvocationID.x;

vec2 wCurr = vec2(win[0], win[1]);
float bCurr = bin[0];

vec2 xCurr = vec2(xi[idx], xj[idx]);
float yCurr = y[idx];

float yHat = inference(xCurr, wCurr, bCurr);
float loss = calculateLoss(yHat, yCurr);

float dZ = yHat - yCurr;
vec2 dW = (1. / m) * xCurr * dZ;
float dB = (1. / m) * dZ;
wouti[idx] = learningRate * dW.x;
woutj[idx] = learningRate * dW.y;
bout[idx] = learningRate * dB;
}