Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial overhaul of Profile mode #200

Merged
merged 9 commits into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
308 changes: 169 additions & 139 deletions sample/vcopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,157 +5,187 @@
#include <iostream>
using namespace std;


// Evaluate the HIP call `x` exactly once and abort if it did not return hipSuccess.
//
// The call is deliberately NOT placed inside assert(): assert() expands to nothing
// when NDEBUG is defined, which would silently drop the wrapped HIP call itself
// in such builds. The macro remains a single void expression, so it can be used
// as a statement exactly as before.
#define HIP_ASSERT(x)                                                          \
    (((x) == hipSuccess)                                                       \
         ? (void)0                                                             \
         : (fprintf(stderr, "HIP_ASSERT failed: %s (%s:%d)\n", #x, __FILE__,   \
                    __LINE__),                                                 \
            abort()))

// HIP kernel. Each thread takes care of one element of c
__global__ void vecCopy(double *a, double *b, double *c, int n,int stride)
{
__global__ void vecCopy(double *a, double *b, double *c, int n, int stride) {
// Get our global thread ID
int id = blockIdx.x*blockDim.x+threadIdx.x;

if (id < n) {
c[id] = a[id];
}
if (id < n)
c[id] = a[id];
}

void usage()
{
printf("\nUsage: vcopy [n] [blocksize] {dev}\n\n");
// Test-only twin of the vecCopy kernel: every thread whose global index is
// below n copies one element of a into c. The b and stride parameters are
// accepted for signature parity but are not used by the body.
__global__ void vecCopy_2(double *a, double *b, double *c, int n, int stride) {
    // Global thread index computed from the block and thread coordinates
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads indexed at or beyond n have no element to copy
    if (tid >= n)
        return;

    c[tid] = a[tid];
}

/**
 * Print the vcopy command-line help to stdout and terminate the process.
 *
 * The option spellings listed here are the ones the argument parser in main()
 * actually accepts (-n/--vec, -b/--blockSize, -d/--device, -i/--iter,
 * --multikernel, -h/--help).
 *
 * This function never returns: it always calls exit(1).
 */
void usage() {
    std::cout << "Usage: vcopy [OPTIONS]\n"
                 "Required:\n"
                 "   -n/--vec <value>          Set the num of threads (vector elements)\n"
                 "   -b/--blockSize <value>    Set the block size\n"
                 "Optional:\n"
                 "   -d/--device <value>       Set the device ID [Default: 0]\n"
                 "   -i/--iter <value>         Set the num of iterations [Default: 1]\n"
                 "   --multikernel             Also launch a second, duplicate kernel\n"
                 "   -h/--help                 Display this help message\n";
    exit(1);
}

int main( int argc, char* argv[] )
{
// Size of vectors
int n; //64 MB
int blockSize, gridSize;

// Host input vectors
double *h_a;
double *h_b;
//Host output vector
double *h_c;
//Host output vector for verification
double *h_verify_c;

// Device input vectors
double *d_a;
double *d_b;
//Device output vector
double *d_c;

int stride = 1;
int devId = 0;

if(argc < 3)
usage();
if(argc > 3)
devId = atoi(argv[3]);

n = atoi(argv[1]);
blockSize = atoi(argv[2]);

int numGpuDevices;
HIP_ASSERT(hipGetDeviceCount(&numGpuDevices));
if(devId >= numGpuDevices)
devId = 0;
HIP_ASSERT(hipSetDevice(devId));

printf("vcopy testing on GCD %d\n", devId);

assert(n > 0);
assert(blockSize > 0);

// Size, in bytes, of each vector
size_t bytes = n*sizeof(double)*stride;

// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
h_verify_c = (double*)malloc(bytes);

printf("Finished allocating vectors on the CPU\n");
// Allocate memory for each vector on GPU
HIP_ASSERT(hipMalloc(&d_a, bytes));
HIP_ASSERT(hipMalloc(&d_b, bytes));
HIP_ASSERT(hipMalloc(&d_c, bytes));

printf("Finished allocating vectors on the GPU\n");

int i;
// Initialize vectors on host
for( i = 0; i < n; i++ ) {
h_a[i] = i;
h_b[i] = i;
}


// Copy host vectors to device
HIP_ASSERT(hipMemcpy( d_a, h_a, bytes, hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(d_b, h_b, bytes, hipMemcpyHostToDevice));

printf("Finished copying vectors to the GPU\n");


// Number of thread blocks in grid
gridSize = (int)ceil((float)n/blockSize);
//gridSize = 1;
int main(int argc, char* argv[]) {
// Size of vectors
int n; //64 MB
int blockSize, gridSize;

// Launch multiple kernels
bool multiKernel = false;

// Host input vectors
double *h_a;
double *h_b;
//Host output vector
double *h_c;
//Host output vector for verification
double *h_verify_c;

// Device input vectors
double *d_a;
double *d_b;
// Device output vector
double *d_c;

int stride = 1;
int devId = 0;
int numIter = 1;

for (int i = 0; i < argc; i++){
std::string arg = argv[i];
if ((arg == "--blockSize" || arg == "-b") && i+1 < argc)
blockSize = std::atoi(argv[i+1]);

int tot_waves = (blockSize*gridSize)/64;
float num_bytes_kb = ((sizeof(double))*n)/(1024);
float num_bytes_wave = (1.0*num_bytes_kb)/(1.0*tot_waves);

printf("sw thinks it moved %f KB per wave \n", (2.0*num_bytes_wave));

printf("Total threads: %d, Grid Size: %d block Size:%d, Wavefronts:%d:\n", n, gridSize, blockSize, tot_waves);
printf("Launching the kernel on the GPU\n");
// Execute the kernel
hipLaunchKernelGGL(vecCopy, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n,stride);
hipDeviceSynchronize( );
else if ((arg == "--vec" || arg == "-n") && i+1 < argc)
n = std::atoi(argv[i+1]);

else if ((arg == "--device" || arg == "-d") && i+1 < argc)
devId = std::atoi(argv[i+1]);

else if ((arg == "--iter" || arg == "-i") && i+1 < argc)
numIter = std::atoi(argv[i+1]);

else if (arg == "--multikernel")
multiKernel = true;

else if (arg == "--help" || arg == "-h")
usage();
}

if (blockSize == 0)
usage();

if (n == 0)
usage();


int numGpuDevices;
HIP_ASSERT(hipGetDeviceCount(&numGpuDevices));
if(devId >= numGpuDevices)
devId = 0;
HIP_ASSERT(hipSetDevice(devId));

printf("vcopy testing on GCD %d\n", devId);

assert(n > 0);
assert(blockSize > 0);

// Size, in bytes, of each vector
size_t bytes = n*sizeof(double)*stride;

// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
h_verify_c = (double*)malloc(bytes);

printf("Finished allocating vectors on the CPU\n");

// Allocate memory for each vector on GPU
HIP_ASSERT(hipMalloc(&d_a, bytes));
HIP_ASSERT(hipMalloc(&d_b, bytes));
HIP_ASSERT(hipMalloc(&d_c, bytes));

printf("Finished allocating vectors on the GPU\n");

// Initialize vectors on host
for(int i = 0; i < n; i++) {
h_a[i] = i;
h_b[i] = i;
}

// Copy host vectors to device
HIP_ASSERT(hipMemcpy(d_a, h_a, bytes, hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(d_b, h_b, bytes, hipMemcpyHostToDevice));

printf("Finished copying vectors to the GPU\n");

// Number of thread blocks in grid
gridSize = (int)ceil((float)n/blockSize);
int tot_waves = (blockSize*gridSize)/64;
float num_bytes_kb = ((sizeof(double))*n)/(1024);
float num_bytes_wave = (1.0*num_bytes_kb)/(1.0*tot_waves);

printf("sw thinks it moved %f KB per wave \n", (2.0*num_bytes_wave));
printf("Total threads: %d, Grid Size: %d block Size:%d, Wavefronts:%d:\n", n, gridSize, blockSize, tot_waves);
printf("Launching the kernel on the GPU\n");

// Execute the kernel
for(int i = 0; i < numIter; i++){
hipLaunchKernelGGL(vecCopy, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n, stride);
hipDeviceSynchronize();
printf("Finished executing kernel\n");
// Copy array back to host
HIP_ASSERT(hipMemcpy( h_c, d_c, bytes, hipMemcpyDeviceToHost));
printf("Finished copying the output vector from the GPU to the CPU\n");

//Compute for CPU
for(i=0; i <n; i++)
{
// h_verify_c[i*stride] = h_a[i*stride] + h_b[i*stride];
h_verify_c[i*stride] = h_a[i*stride] ;
}


//Verify results
for(i=0; i <n; i++)
{
if (abs(h_verify_c[i*stride] - h_c[i*stride]) > 1e-5)
{
printf("Error at position i %d, Expected: %f, Found: %f \n", i, h_c[i], d_c[i]);
}
}

// printf("Printing few elements from the output vector\n");

for(i=0; i < 20; i++)
{
// printf("Output[%d]:%f\n",i, h_c[i]);
// Optionally, launch a second kernel. Only here for testing purposes
if (multiKernel){
hipLaunchKernelGGL(vecCopy_2, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n, stride);
hipDeviceSynchronize();
printf("Finished executing kernel\n");
}

printf("Releasing GPU memory\n");

// Release device memory
HIP_ASSERT(hipFree(d_a));
HIP_ASSERT(hipFree(d_b));
HIP_ASSERT(hipFree(d_c));

// Release host memory
printf("Releasing CPU memory\n");
free(h_a);
free(h_b);
free(h_c);

return 0;
}

// Copy array back to host
HIP_ASSERT(hipMemcpy( h_c, d_c, bytes, hipMemcpyDeviceToHost));
printf("Finished copying the output vector from the GPU to the CPU\n");

// Compute for CPU
for(int i=0; i<n; i++) {
// h_verify_c[i*stride] = h_a[i*stride] + h_b[i*stride];
h_verify_c[i*stride] = h_a[i*stride] ;
}

// Verify results
for(int i = 0; i < n; i++) {
if (abs(h_verify_c[i*stride] - h_c[i*stride]) > 1e-5)
printf("Error at position i %d, Expected: %f, Found: %f \n", i, h_c[i], d_c[i]);
}
//printf("Printing few elements from the output vector\n");
for(int i = 0; i < 20; i++) {
//printf("Output[%d]:%f\n",i, h_c[i]);
}

printf("Releasing GPU memory\n");

// Release device memory
HIP_ASSERT(hipFree(d_a));
HIP_ASSERT(hipFree(d_b));
HIP_ASSERT(hipFree(d_c));

// Release host memory
printf("Releasing CPU memory\n");
free(h_a);
free(h_b);
free(h_c);

return 0;
}
Loading