Skip to content

Commit

Permalink
Run cuBLAS functions from alpaka
Browse files Browse the repository at this point in the history
  • Loading branch information
mehmetyusufoglu committed Nov 28, 2024
1 parent 8fefd70 commit 4f98208
Show file tree
Hide file tree
Showing 3 changed files with 238 additions and 0 deletions.
2 changes: 2 additions & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ add_subdirectory("randomCells2D/")
add_subdirectory("reduce/")
add_subdirectory("tagSpecialization/")
add_subdirectory("vectorAdd/")
add_subdirectory("useCuBLASInAlpaka/")

58 changes: 58 additions & 0 deletions example/useCuBLASInAlpaka/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#
# Copyright 2023 Benjamin Worpitz, Jan Stephan
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.25)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME useCuBLASInAlpaka)

project(${_TARGET_NAME} LANGUAGES CXX)

# Check if the cmake variables to see if the Acc option is Cuda Only.
if(NOT alpaka_ACC_GPU_CUDA_ONLY_MODE)
# Print a warning and skip target creation
message(WARNING "Skipping build of 'useCuBLASInAlpaka' because alpaka_ACC_GPU_CUDA_ONLY_MODE is not enabled.")
return()
endif()

# Add cuBLAS library
find_package(CUDA REQUIRED)
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} cublas)

#-------------------------------------------------------------------------------
# Find alpaka.

if(NOT TARGET alpaka::alpaka)
option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

if(alpaka_USE_SOURCE_TREE)
# Don't build the examples recursively
set(alpaka_BUILD_EXAMPLES OFF)
add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
else()
find_package(alpaka REQUIRED)
endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
${_TARGET_NAME}
src/useCuBLASInAlpaka.cpp)
target_link_libraries(
${_TARGET_NAME}
PUBLIC alpaka::alpaka ${CUDA_LIBRARIES})

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
set_target_properties(${_TARGET_NAME} PROPERTIES CUDA_STANDARD 14)
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
178 changes: 178 additions & 0 deletions example/useCuBLASInAlpaka/src/useCuBLASInAlpaka.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/* Copyright 2023 Mehmet Yusufoglu, Rene Widera,
* SPDX-License-Identifier: ISC
*/
/*
* This example uses cuBLAS library functions in alpaka. A cuBLAS function cublasSgemm is called by using alpaka
* buffers and queue. Since the code needs only AccGpuCuda backend. Make sure the correct alpaka cmake backend flag is
* set for alpaka.
*/
#include <alpaka/alpaka.hpp>

#include <cublas_v2.h>

#include <cmath>
#include <iostream>

// Index type
using Idx = std::size_t;
// Set data type
using DataType = float;

// Initialize the matrix in column-major order (1D buffer)
void initializeMatrix(DataType* buffer, Idx rows, Idx cols)
{
for(Idx j = 0; j < rows; ++j)
{
for(Idx i = 0; i < cols; ++i)
{
// generate some values and set buffer
buffer[i + j * cols] = static_cast<DataType>((i + j * cols) % 10);
}
}
}

auto main() -> int
{
using Dim1D = alpaka::DimInt<1>;

// Define matrix dimensions, A is MxK and B is KxN
Idx const M = 4; // Rows in A and C
Idx const N = 2; // Columns in B and C
Idx const K = 3; // Columns in A and rows in B

// Define the accelerator and queue
// Use Cuda Accelerator. Cmake Acc flags should be set to Cuda-Only
using Acc = alpaka::TagToAcc<alpaka::TagGpuCudaRt, Dim1D, Idx>;
using Queue = alpaka::Queue<Acc, alpaka::Blocking>;

auto const platformHost = alpaka::PlatformCpu{};
auto const devHost = alpaka::getDevByIdx(platformHost, 0);
auto const platformAcc = alpaka::Platform<Acc>{};
auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);

Queue queue(devAcc);

// Allocate 1D host memory
auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, M * K);
auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, K * N);
auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, M * N);

DataType* hostA = std::data(bufHostA);
DataType* hostB = std::data(bufHostB);
DataType* hostC = std::data(bufHostC);

// Initialize host matrices with some values
initializeMatrix(hostA, M, K);
initializeMatrix(hostB, K, N);
std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s

// Print initialized matrices
std::cout << "Matrix A (Host):" << std::endl;
for(Idx j = 0; j < M; ++j)
{
for(Idx i = 0; i < K; ++i)
{
std::cout << hostA[i + j * K] << " ";
}
std::cout << std::endl;
}

std::cout << "Matrix B (Host):" << std::endl;
for(Idx j = 0; j < K; ++j)
{
for(Idx i = 0; i < N; ++i)
{
std::cout << hostB[i + j * N] << " ";
}
std::cout << std::endl;
}

// Allocate 1D device memory
auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, M * K);
auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, K * N);
auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, M * N);

// Copy data to device
alpaka::memcpy(queue, bufDevA, bufHostA);
alpaka::memcpy(queue, bufDevB, bufHostB);
alpaka::memcpy(queue, bufDevC, bufHostC);
alpaka::wait(queue);

std::cout << "Copied matrices A and B to the device." << std::endl;

// Get the native CUDA stream from Alpaka queue
auto alpakaStream = alpaka::getNativeHandle(queue);

// cuBLAS setup
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
cublasSetStream(cublasHandle, alpakaStream);

// Perform matrix multiplication: C = A * B
float alpha = 1.0f, beta = 0.0f; // Set beta to 0.0f to overwrite C
cublasSgemm(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N, // No transpose for A and B
M,
N,
K, // Dimensions: C = A * B
&alpha,
std::data(bufDevA),
M, // Leading dimension of A
std::data(bufDevB),
K, // Leading dimension of B
&beta,
std::data(bufDevC),
M // Leading dimension of C
);

alpaka::wait(queue); // Wait for multiplication to complete
std::cout << "Matrix multiplication completed." << std::endl;

// Copy result back to host
alpaka::memcpy(queue, bufHostC, bufDevC);
alpaka::wait(queue);
std::cout << "Copied result matrix C back to the host." << std::endl;

// Print result matrix C
std::cout << "Matrix C (Host):" << std::endl;
for(Idx j = 0; j < M; ++j)
{
for(Idx i = 0; i < N; ++i)
{
std::cout << hostC[i + j * N] << " ";
}
std::cout << std::endl;
}

// Expected values of elements of C
std::vector<DataType> expectedResult{20, 23, 6, 9, 56, 68, 30, 42};

// Verify the result
bool success = true;
for(Idx j = 0; j < M; ++j)
{
for(Idx i = 0; i < N; ++i)
{
if(std::fabs(hostC[i + j * N] - expectedResult[i + j * N]) > 1e-5f)
{ // Allow small floating-point errors
std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * N]
<< " != " << expectedResult[i + j * N] << std::endl;
success = false;
}
}
}

std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N
<< (success ? " succeeded!" : " failed!") << std::endl;

if(!success)
{
return EXIT_FAILURE;
}

// Cleanup cuBLAS
cublasDestroy(cublasHandle);
return EXIT_SUCCESS;
}

0 comments on commit 4f98208

Please sign in to comment.