-
Notifications
You must be signed in to change notification settings - Fork 1
/
gpu_heater.cu
127 lines (114 loc) · 3.28 KB
/
gpu_heater.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/*
* Nvidia Tesla C2075 GPU
* Jens Lang, 2013
* This programme creates high-resolution power profiles for GPU routines executed on Nvidia GPUs.
* It needs the CUDA, NVML and PAPI libraries. It should be compiled with gcc using the switch
* -std=c++11. For further information, please refer to:
* Lang, Jens; Rünger, Gudula: High-Resolution Power Profiling of GPU Functions Using Low-Resolution
* Measurement. In: Wolf, F.; Mohr, B.; an Mey, D. (Hrsg.): Euro-Par 2013 Parallel Processing
* (LNCS, Bd. 8097): S. 801–812. Springer – ISBN 978-3-642-40046-9, 2013. DOI: 10.1007/978-3-642-40047-6_80
*/
#include <papi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "nvml.h"
#include "gpu_heater.h"
/*
 * Edge length of the square matrices used to load the GPU.
 * Other sizes that have been used: 2300, 1024.
 */
int M = 19456;

/*
 * Linear offset of element (i, j) in a matrix stored with leading dimension
 * ld, i.e. i * ld + j.
 * Every argument and the whole expansion are parenthesised so that compound
 * expressions such as IDX2C(i + 1, j, n) expand correctly.
 */
#define IDX2C(i,j,ld) (((i)*(ld))+(j))
/* Element type of the matrices; float matches the cublasSgemm call below. */
typedef float matrix_t;

/* cuBLAS context: created in init_heater(), destroyed in shutdown_heater(). */
cublasHandle_t handle;

/* Device-side M x M matrices: A and B are the inputs, C receives the product. */
matrix_t *devPtrA, *devPtrB, *devPtrC;
/*
 * Puts load on the GPU by running a single-precision matrix multiplication
 *     C = alpha * A * B + beta * C
 * on the M x M device matrices devPtrA, devPtrB and devPtrC, and blocks until
 * the device has finished the work.
 *
 * Requires a successful init_heater() call beforehand (valid cuBLAS handle
 * and allocated device matrices). Failures are reported on stderr; the
 * function returns early on the first failure.
 */
void heatup_gpu()
{
    const matrix_t alpha = 2.3f;
    const matrix_t beta = 5.7f;
    /* Number of back-to-back multiplications per call. */
    const int repetitions = 1;

    for (int rep = 0; rep < repetitions; rep++) {
        const cublasStatus_t gemmStat = cublasSgemm(
            handle, CUBLAS_OP_N, CUBLAS_OP_N,
            M, M, M,
            &alpha,
            devPtrA, M,
            devPtrB, M,
            &beta,
            devPtrC, M);
        if (gemmStat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasSgemm failed (status %d)\n", (int)gemmStat);
            return;
        }

        /* Wait for the launched GPU work to complete. cudaDeviceSynchronize()
         * replaces the deprecated cudaThreadSynchronize() and also surfaces
         * errors that occurred while the multiplication was executing. */
        const cudaError_t syncStat = cudaDeviceSynchronize();
        if (syncStat != cudaSuccess) {
            fprintf(stderr, "GPU synchronization failed: %s\n",
                    cudaGetErrorString(syncStat));
            return;
        }
    }
}
/*
 * Releases everything init_heater() may have acquired before a failure:
 * the host staging buffers, the three device matrices and, if it was
 * already created, the cuBLAS handle. The globals are reset to NULL so that
 * a later shutdown_heater() call does not release stale pointers.
 */
static void abort_heater_init(matrix_t* hostA, matrix_t* hostB, bool handleCreated)
{
    free(hostA);
    free(hostB);

    /* cudaFree(NULL) is a no-op, so buffers that were never allocated are fine. */
    cudaFree(devPtrA);
    cudaFree(devPtrB);
    cudaFree(devPtrC);
    devPtrA = NULL;
    devPtrB = NULL;
    devPtrC = NULL;

    if (handleCreated) {
        cublasDestroy(handle);
        handle = NULL;
    }
}

/*
 * Prepares the GPU load generator:
 *   1. allocates two M x M host matrices and fills them with increasing values,
 *   2. allocates the device matrices devPtrA, devPtrB and devPtrC,
 *   3. zero-fills devPtrC (it is read by the SGEMM because beta != 0),
 *   4. creates the cuBLAS handle and uploads A and B to the device,
 *   5. frees the host matrices, which are no longer needed after the upload.
 *
 * On any failure a message is printed, all resources acquired so far are
 * released and the function returns without a usable heater.
 */
void init_heater()
{
    if (M <= 0) {
        printf ("invalid matrix size\n");
        return;
    }

    /* Compute sizes in size_t so that M * M cannot overflow int. */
    const size_t elems = (size_t)M * (size_t)M;
    const size_t bytes = elems * sizeof(matrix_t);

    matrix_t* a = (matrix_t *)malloc (bytes);
    matrix_t* b = (matrix_t *)malloc (bytes);
    if (!a || !b) {
        printf ("host memory allocation failed\n");
        free (a);
        free (b);
        return;
    }

    /* i is the outer loop so that the writes walk memory contiguously. */
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < M; j++) {
            const size_t idx = IDX2C((size_t)i, j, M);
            a[idx] = (matrix_t)((long long)idx + 1);
            b[idx] = (matrix_t)((long long)idx - 1);
        }
    }

    if (cudaMalloc ((void**)&devPtrA, bytes) != cudaSuccess ||
        cudaMalloc ((void**)&devPtrB, bytes) != cudaSuccess ||
        cudaMalloc ((void**)&devPtrC, bytes) != cudaSuccess) {
        printf ("device memory allocation failed\n");
        abort_heater_init (a, b, false);
        return;
    }

    /* C is an input of C = alpha*A*B + beta*C; give it defined contents. */
    if (cudaMemset (devPtrC, 0, bytes) != cudaSuccess) {
        printf ("device memory initialization failed\n");
        abort_heater_init (a, b, false);
        return;
    }

    if (cublasCreate (&handle) != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        abort_heater_init (a, b, false);
        return;
    }

    if (cublasSetMatrix (M, M, sizeof(*a), a, M, devPtrA, M) != CUBLAS_STATUS_SUCCESS ||
        cublasSetMatrix (M, M, sizeof(*b), b, M, devPtrB, M) != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed\n");
        abort_heater_init (a, b, true);
        return;
    }

    /* The data now lives on the device; the host copies are not used again. */
    free (a);
    free (b);

    printf("#Size of matrices: %d\n", M);
}
/*
 * Releases the resources held by the heater: the three device matrices and
 * the cuBLAS handle. The globals are reset to NULL afterwards, so calling
 * this function more than once does not free anything twice.
 */
void shutdown_heater()
{
    /* Free all device matrices; cudaFree(NULL) is a no-op. */
    cudaFree (devPtrA);
    cudaFree (devPtrB);
    cudaFree (devPtrC);
    devPtrA = NULL;
    devPtrB = NULL;
    devPtrC = NULL;

    // shutdown cuBLAS
    if (handle != NULL) {
        cublasDestroy(handle);
        handle = NULL;
    }
}