#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Multiplies two square matrices together using a *single* block of threads and
global memory only. Each thread computes one element of the resulting matrix.
"""
import numpy as np
import pycuda.driver as drv
from pycuda import compiler, gpuarray

# -- initialize the device
import pycuda.autoinit
kernel_code_template = """
__global__ void MatrixMulKernel(float *a, float *b, float *c)
{
    // 2D Thread ID (assuming that only *one* block will be executed)
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Pvalue is used to store the element of the matrix
    // that is computed by the thread
    float Pvalue = 0;

    // Each thread loads one row of M and one column of N,
    // to produce one element of P.
    for (int k = 0; k < %(MATRIX_SIZE)s; ++k) {
        float Aelement = a[ty * %(MATRIX_SIZE)s + k];
        float Belement = b[k * %(MATRIX_SIZE)s + tx];
        Pvalue += Aelement * Belement;
    }

    // Write the matrix to device memory;
    // each thread writes one element
    c[ty * %(MATRIX_SIZE)s + tx] = Pvalue;
}
"""
# define the (square) matrix size
# note that we'll only use *one* block of threads here
# as a consequence this number (squared) can't exceed max_threads,
# see http://documen.tician.de/pycuda/util.html#pycuda.tools.DeviceData
# for more information on how to get this number for your device
MATRIX_SIZE = 20
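# Optional sanity check -- a minimal sketch assuming pycuda.autoinit created a
# context above: the single block launched below contains
# MATRIX_SIZE * MATRIX_SIZE threads, which must not exceed the device's
# MAX_THREADS_PER_BLOCK attribute.
dev = drv.Context.get_device()
max_threads = dev.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK)
assert MATRIX_SIZE * MATRIX_SIZE <= max_threads, \
    "block of %d threads exceeds device limit of %d" % (
        MATRIX_SIZE * MATRIX_SIZE, max_threads)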
# create two random square matrices
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
# compute reference on the CPU to verify GPU computation
c_cpu = np.dot(a_cpu, b_cpu)
# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
# create empty gpu array for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
# get the kernel code from the template
# by specifying the constant MATRIX_SIZE
kernel_code = kernel_code_template % {
    'MATRIX_SIZE': MATRIX_SIZE
}
# compile the kernel code
mod = compiler.SourceModule(kernel_code)
# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")
# call the kernel on the card
matrixmul(
    # inputs
    a_gpu, b_gpu,
    # output
    c_gpu,
    # (only one) block of MATRIX_SIZE x MATRIX_SIZE threads
    block=(MATRIX_SIZE, MATRIX_SIZE, 1),
)
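# Note: PyCUDA's grid parameter defaults to (1, 1), so exactly one block runs,
# as the kernel assumes. Kernel launches are asynchronous; the .get() calls
# below implicitly synchronize before copying the result back to the host.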
# print the results
print "-" * 80
print "Matrix A (GPU):"
print a_gpu.get()
print "-" * 80
print "Matrix B (GPU):"
print b_gpu.get()
print "-" * 80
print "Matrix C (GPU):"
print c_gpu.get()
print "-" * 80
print "CPU-GPU difference:"
print c_cpu - c_gpu.get()
np.allclose(c_cpu, c_gpu.get())
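
# Optional: a minimal timing sketch using pycuda.driver events (assumes the
# drv import above). This times only the kernel itself, not the
# host <-> device copies, by recording events around a second launch.
start, end = drv.Event(), drv.Event()
start.record()
matrixmul(a_gpu, b_gpu, c_gpu, block=(MATRIX_SIZE, MATRIX_SIZE, 1))
end.record()
end.synchronize()
print("Kernel time: %.3f ms" % start.time_till(end))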