-
Notifications
You must be signed in to change notification settings - Fork 539
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
215 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
.. meta:: | ||
:description: This chapter describes how to use HIP graphs. | ||
:keywords: ROCm, HIP, graph, stream | ||
|
||
.. _how_to_HIP_graph: | ||
|
||
******************************************************************************** | ||
Using HIP graphs | ||
******************************************************************************** | ||
|
||
This chapter explains how to create and use HIP graphs. To get a better understanding of | ||
HIP graphs see :ref:`the understand-chapter about HIP graphs<understand_HIP_graph>`. | ||
|
||
There are two different ways of creating graphs: Capturing kernel launches from a stream, or explicitly creating graphs. | ||
|
||
Either way ends up with a ``hipGraph_t``, which is a template for a graph. | ||
In order to actually launch a graph, the template needs to be instantiated using ``hipGraphInstantiate``, | ||
which results in an executable graph of type ``hipGraphExec_t``. | ||
This executable graph can then be launched with ``hipGraphLaunch``, | ||
replaying the operations within the graph. | ||
|
||
Stream capture | ||
================================= | ||
|
||
The easy way to integrate graphs into already existing code is to use stream capture. | ||
|
||
When starting to capture operations for a graph using ``hipStreamBeginCapture``, | ||
the operations assigned to the stream are captured into a graph instead of being | ||
executed. That graph is returned when calling ``hipStreamEndCapture``, which | ||
also stops capturing operations. | ||
|
||
The following code is an example of how to use the HIP graph API to capture a graph from a stream | ||
|
||
.. code-block:: cpp | ||
#include <hip/hip_runtime.h> | ||
#include <vector> | ||
#define HIP_CHECK(c){if(c != hipSuccess) return -1;} | ||
__global__ void kernelA(double* arrayA, size_t size); | ||
__global__ void kernelB(int* arrayB, size_t size); | ||
__global__ void kernelC(double* arrayA, int* arrayB, size_t size); | ||
int main(){ | ||
size_t array_size = 1U << 20; | ||
int numOfBlocks = 1024; | ||
int threadsPerBlock = 1024; | ||
double* d_arrayA; | ||
int* d_arrayB; | ||
std::vector<double> h_array(array_size); | ||
HIP_CHECK(hipMalloc(&d_arrayA, array_size * sizeof(*d_arrayA))); | ||
HIP_CHECK(hipMalloc(&d_arrayB, array_size * sizeof(*d_arrayB))); | ||
hipStream_t captureStream; | ||
HIP_CHECK(hipStreamCreate(&captureStream)); | ||
// Start capturing the operations | ||
HIP_CHECK(hipStreamBeginCapture(captureStream, hipStreamCaptureModeGlobal)); | ||
HIP_CHECK(hipMemcpy(d_arrayA, &h_array, array_size * sizeof(*d_arrayA), hipMemcpyHostToDevice)); | ||
kernelA<<<numOfBlocks, threadsPerBlock, 0, captureStream>>>(d_arrayA, array_size); | ||
kernelB<<<numOfBlocks, threadsPerBlock, 0, captureStream>>>(d_arrayB, array_size); | ||
HIP_CHECK(hipDeviceSynchronize()); | ||
kernelC<<<numOfBlocks, threadsPerBlock, 0, captureStream>>>(d_arrayA, d_arrayB, array_size); | ||
HIP_CHECK(hipMemcpy(&h_array, d_arrayA, array_size * sizeof(*d_arrayA), hipMemcpyDeviceToHost)); | ||
hipGraph_t graph; | ||
HIP_CHECK(hipStreamEndCapture(captureStream, &graph)); | ||
// Create an executable graph from the captured graph. | ||
hipGraphExec_t graphExec; | ||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); | ||
// Actually launch the graph. The stream does not have | ||
// to be the same as the one used for capturing. | ||
HIP_CHECK(hipGraphLaunch(graphExec, captureStream)); | ||
HIP_CHECK(hipGraphExecDestroy(graphExec)); | ||
HIP_CHECK(hipFree(d_arrayA)); | ||
HIP_CHECK(hipFree(d_arrayB)); | ||
HIP_CHECK(hipGraphDestroy(graph)); | ||
HIP_CHECK(hipStreamDestroy(captureStream)); | ||
} | ||
Direct graph creation | ||
================================= | ||
|
||
Graphs can also be created directly using the HIP graph API, giving more fine-grained control over the graph. | ||
The nodes are represented by ``hipGraphNode_t``, and the specific parameters | ||
have a separate type each, e.g. ``hipKernelNodeParams``. Depending on the | ||
operation, the function to call for adding the node varies. For kernel nodes | ||
it is ``hipGraphAddKernelNode``, or for memory copies it is ``hipGraphAddMemcpyNode``. | ||
For a full list see the :doc:`HIP graph API documentation<../doxygen/html/group___graph>`. | ||
|
||
.. code-block:: cpp | ||
#include <hip/hip_runtime.h> | ||
#include <vector> | ||
#define HIP_CHECK(c){if(c != hipSuccess) return -1;} | ||
__global__ void kernelA(double* arrayA, size_t size); | ||
__global__ void kernelB(int* arrayB, size_t size); | ||
__global__ void kernelC(double* arrayA, int* arrayB, size_t size); | ||
int main(){ | ||
size_t array_size = 1U << 20; | ||
int numberOfBlocks = 1024; | ||
int threadsPerBlock = 1024; | ||
double* d_arrayA; | ||
int* d_arrayB; | ||
std::vector<double> h_array(array_size); | ||
HIP_CHECK(hipMalloc(&d_arrayA, array_size * sizeof(*d_arrayA))); | ||
HIP_CHECK(hipMalloc(&d_arrayB, array_size * sizeof(*d_arrayB))); | ||
// Set up parameters for kernel and copy nodes | ||
hipKernelNodeParams kernelAParams, kernelBParams, kernelCParams; | ||
hipMemcpy3DParms cpyToDevParams, cpyToHostParams; | ||
void* kernelAArgs[] = {static_cast<void*>(&d_arrayA), static_cast<void*>(&array_size)}; | ||
kernelAParams.func = reinterpret_cast<void*>(kernelA); | ||
kernelAParams.gridDim = numberOfBlocks; | ||
kernelAParams.blockDim = threadsPerBlock; | ||
kernelAParams.sharedMemBytes = 0; | ||
kernelAParams.kernelParams = kernelAArgs; | ||
kernelAParams.extra = nullptr; | ||
void* kernelBArgs[] = {static_cast<void*>(&d_arrayB), static_cast<void*>(&array_size)}; | ||
kernelBParams.func = reinterpret_cast<void*>(kernelB); | ||
kernelAParams.gridDim = numberOfBlocks; | ||
kernelAParams.blockDim = threadsPerBlock; | ||
kernelAParams.sharedMemBytes = 0; | ||
kernelAParams.kernelParams = kernelBArgs; | ||
kernelAParams.extra = nullptr; | ||
void* kernelCArgs[] = {static_cast<void*>(&d_arrayA), static_cast<void*>(&d_arrayB), static_cast<void*>(&array_size)}; | ||
kernelCParams.func = reinterpret_cast<void*>(kernelC); | ||
kernelAParams.gridDim = numberOfBlocks; | ||
kernelAParams.blockDim = threadsPerBlock; | ||
kernelAParams.sharedMemBytes = 0; | ||
kernelAParams.kernelParams = kernelCArgs; | ||
kernelAParams.extra = nullptr; | ||
cpyToDevParams.srcArray = nullptr; | ||
cpyToDevParams.srcPos = make_hipPos(0, 0, 0); | ||
cpyToDevParams.srcPtr = make_hipPitchedPtr(h_array.data(), array_size * sizeof(h_array[0]), array_size, 1); | ||
cpyToDevParams.dstArray = nullptr; | ||
cpyToDevParams.dstPos = make_hipPos(0, 0, 0); | ||
cpyToDevParams.dstPtr = make_hipPitchedPtr(d_arrayA, array_size * sizeof(*d_arrayA), array_size, 1); | ||
cpyToDevParams.extent = make_hipExtent(array_size * sizeof(*d_arrayA), 1, 1); | ||
cpyToDevParams.kind = hipMemcpyHostToDevice; | ||
cpyToHostParams.srcArray = nullptr; | ||
cpyToHostParams.srcPos = make_hipPos(0, 0, 0); | ||
cpyToHostParams.srcPtr = make_hipPitchedPtr(d_arrayA, array_size * sizeof(*d_arrayA), array_size, 1); | ||
cpyToHostParams.dstArray = nullptr; | ||
cpyToHostParams.dstPos = make_hipPos(0, 0, 0); | ||
cpyToHostParams.dstPtr = make_hipPitchedPtr(h_array.data(), array_size * sizeof(h_array[0]), array_size, 1); | ||
cpyToHostParams.extent = make_hipExtent(array_size * sizeof(*d_arrayA), 1, 1); | ||
cpyToHostParams.kind = hipMemcpyDeviceToHost; | ||
// Create graph and add nodes with their respective parameters | ||
hipGraph_t graph; | ||
hipGraphNode_t kernelANode, kernelBNode, kernelCNode, cpyToDevNode, cpyToHostNode; | ||
HIP_CHECK(hipGraphCreate(&graph, 0)); | ||
// Add copy operations | ||
HIP_CHECK(hipGraphAddMemcpyNode(&cpyToDevNode, graph, nullptr, 0, &cpyToDevParams)); | ||
HIP_CHECK(hipGraphAddMemcpyNode(&cpyToHostNode, graph, nullptr, 0, &cpyToHostParams)); | ||
// Add kernels to graph | ||
HIP_CHECK(hipGraphAddKernelNode(&kernelANode, graph, nullptr, 0, &kernelAParams)); | ||
HIP_CHECK(hipGraphAddKernelNode(&kernelBNode, graph, nullptr, 0, &kernelBParams)); | ||
HIP_CHECK(hipGraphAddKernelNode(&kernelCNode, graph, nullptr, 0, &kernelCParams)); | ||
// Add dependencies between nodes | ||
// kernels A and B have to wait for the copy operation | ||
HIP_CHECK(hipGraphAddDependencies(graph, &cpyToDevNode, &kernelANode, 1)); | ||
HIP_CHECK(hipGraphAddDependencies(graph, &cpyToDevNode, &kernelBNode, 1)); | ||
// kernel C is dependent on kernels A and B | ||
HIP_CHECK(hipGraphAddDependencies(graph, &kernelANode, &kernelCNode, 1)); | ||
HIP_CHECK(hipGraphAddDependencies(graph, &kernelBNode, &kernelCNode, 1)); | ||
// The copy back to the host has to wait for kernel C to finish | ||
HIP_CHECK(hipGraphAddDependencies(graph, &kernelCNode, &cpyToHostNode, 1)); | ||
// Instantiate graph the just created graph in order to execute it | ||
hipGraphExec_t graphExec; | ||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); | ||
// Launch the executable graph | ||
hipStream_t graphStream; | ||
HIP_CHECK(hipStreamCreate(&graphStream)); | ||
HIP_CHECK(hipGraphLaunch(graphExec, graphStream)); | ||
HIP_CHECK(hipGraphExecDestroy(graphExec)); | ||
HIP_CHECK(hipFree(d_arrayA)); | ||
HIP_CHECK(hipFree(d_arrayB)); | ||
HIP_CHECK(hipGraphDestroy(graph)); | ||
HIP_CHECK(hipStreamDestroy(graphStream)); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters