Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

An example using cuBLAS library in alpaka #2430

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ add_subdirectory("randomCells2D/")
add_subdirectory("reduce/")
add_subdirectory("tagSpecialization/")
add_subdirectory("vectorAdd/")
add_subdirectory("useCuBLASInAlpaka/")

58 changes: 58 additions & 0 deletions example/useCuBLASInAlpaka/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#
# Copyright 2023 Benjamin Worpitz, Jan Stephan
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.25)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME useCuBLASInAlpaka)

project(${_TARGET_NAME} LANGUAGES CXX)

# Check if the cmake variables to see if the Acc option is Cuda Only.
if(NOT alpaka_ACC_GPU_CUDA_ONLY_MODE)
# Print a warning and skip target creation
message(WARNING "Skipping build of 'useCuBLASInAlpaka' because alpaka_ACC_GPU_CUDA_ONLY_MODE is not enabled.")
return()
endif()

# Add cuBLAS library
find_package(CUDA REQUIRED)
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} cublas)

#-------------------------------------------------------------------------------
# Find alpaka.

if(NOT TARGET alpaka::alpaka)
option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

if(alpaka_USE_SOURCE_TREE)
# Don't build the examples recursively
set(alpaka_BUILD_EXAMPLES OFF)
add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
else()
find_package(alpaka REQUIRED)
endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
${_TARGET_NAME}
src/useCuBLASInAlpaka.cpp)
target_link_libraries(
${_TARGET_NAME}
PUBLIC alpaka::alpaka ${CUDA_LIBRARIES})

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
#set_target_properties(${_TARGET_NAME} PROPERTIES CUDA_STANDARD 17)
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
178 changes: 178 additions & 0 deletions example/useCuBLASInAlpaka/src/useCuBLASInAlpaka.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/* Copyright 2023 Mehmet Yusufoglu, Rene Widera,
* SPDX-License-Identifier: ISC
*/
/*
* This example uses cuBLAS library functions in alpaka. A cuBLAS function cublasSgemm is called by using alpaka
* buffers and queue. Since the code needs only AccGpuCuda backend. Make sure the correct alpaka cmake backend flag is
* set for alpaka.
*/
#include <alpaka/alpaka.hpp>

#include <cublas_v2.h>

#include <cmath>
#include <iostream>

// Index type
using Idx = std::size_t;
// Set data type
using DataType = float;

// Initialize the matrix in column-major order (1D buffer)
void initializeMatrix(DataType* buffer, Idx rows, Idx cols)
{
for(Idx j = 0; j < rows; ++j)
{
for(Idx i = 0; i < cols; ++i)
{
// generate some values and set buffer
buffer[i + j * cols] = static_cast<DataType>((i + j * cols) % 10);
}
}
}

auto main() -> int
{
using Dim1D = alpaka::DimInt<1>;

// Define matrix dimensions, A is MxK and B is KxN
Idx const M = 4; // Rows in A and C
Idx const N = 2; // Columns in B and C
Idx const K = 3; // Columns in A and rows in B

// Define the accelerator and queue
// Use Cuda Accelerator. Cmake Acc flags should be set to Cuda-Only
using Acc = alpaka::TagToAcc<alpaka::TagGpuCudaRt, Dim1D, Idx>;
using Queue = alpaka::Queue<Acc, alpaka::Blocking>;

auto const platformHost = alpaka::PlatformCpu{};
auto const devHost = alpaka::getDevByIdx(platformHost, 0);
auto const platformAcc = alpaka::Platform<Acc>{};
auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);

Queue queue(devAcc);

// Allocate 1D host memory
auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, M * K);
auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, K * N);
auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, M * N);

DataType* hostA = std::data(bufHostA);
DataType* hostB = std::data(bufHostB);
DataType* hostC = std::data(bufHostC);

// Initialize host matrices with some values
initializeMatrix(hostA, M, K);
initializeMatrix(hostB, K, N);
std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s

// Print initialized matrices
std::cout << "Matrix A (Host):" << std::endl;
for(Idx j = 0; j < M; ++j)
{
for(Idx i = 0; i < K; ++i)
{
std::cout << hostA[i + j * K] << " ";
}
std::cout << std::endl;
}

std::cout << "Matrix B (Host):" << std::endl;
for(Idx j = 0; j < K; ++j)
{
for(Idx i = 0; i < N; ++i)
{
std::cout << hostB[i + j * N] << " ";
}
std::cout << std::endl;
}

// Allocate 1D device memory
auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, M * K);
auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, K * N);
auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, M * N);

// Copy data to device
alpaka::memcpy(queue, bufDevA, bufHostA);
alpaka::memcpy(queue, bufDevB, bufHostB);
alpaka::memcpy(queue, bufDevC, bufHostC);
alpaka::wait(queue);

std::cout << "Copied matrices A and B to the device." << std::endl;

// Get the native CUDA stream from Alpaka queue
auto alpakaStream = alpaka::getNativeHandle(queue);

// cuBLAS setup
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
cublasSetStream(cublasHandle, alpakaStream);

// Perform matrix multiplication: C = A * B
float alpha = 1.0f, beta = 0.0f; // Set beta to 0.0f to overwrite C
cublasSgemm(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N, // No transpose for A and B
M,
N,
K, // Dimensions: C = A * B
&alpha,
std::data(bufDevA),
M, // Leading dimension of A
std::data(bufDevB),
K, // Leading dimension of B
&beta,
std::data(bufDevC),
M // Leading dimension of C
);

alpaka::wait(queue); // Wait for multiplication to complete
std::cout << "Matrix multiplication completed." << std::endl;

// Copy result back to host
alpaka::memcpy(queue, bufHostC, bufDevC);
alpaka::wait(queue);
std::cout << "Copied result matrix C back to the host." << std::endl;

// Print result matrix C
std::cout << "Matrix C (Host):" << std::endl;
for(Idx j = 0; j < M; ++j)
{
for(Idx i = 0; i < N; ++i)
{
std::cout << hostC[i + j * N] << " ";
}
std::cout << std::endl;
}

// Expected values of elements of C
std::vector<DataType> expectedResult{20, 23, 6, 9, 56, 68, 30, 42};

// Verify the result
bool success = true;
for(Idx j = 0; j < M; ++j)
{
for(Idx i = 0; i < N; ++i)
{
if(std::fabs(hostC[i + j * N] - expectedResult[i + j * N]) > 1e-5f)
{ // Allow small floating-point errors
std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * N]
<< " != " << expectedResult[i + j * N] << std::endl;
success = false;
}
}
}

std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N
<< (success ? " succeeded!" : " failed!") << std::endl;

if(!success)
{
return EXIT_FAILURE;
}

// Cleanup cuBLAS
cublasDestroy(cublasHandle);
return EXIT_SUCCESS;
}