-
Notifications
You must be signed in to change notification settings - Fork 5
/
kernelExample.py
98 lines (76 loc) · 2.61 KB
/
kernelExample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
import scipy as sp
from scipy.stats import norm
from pycuda.compiler import SourceModule
import math
# Here's the kernel, essentially identical to that used in the CUDA and RCUDA examples.
# It evaluates the normal density N(mu, sigma) at each element x[idx] and writes the
# result to vals[idx]; passing dbg != 0 enables per-thread printf tracing on the device.
# Note the doubled backslashes (\\t, \\n): they must survive Python string parsing so
# the CUDA compiler sees single escapes.
mod = SourceModule("""
#include <stdio.h>
#define SQRT_TWO_PI 2.506628274631000
__global__ void calc_loglik(double *vals, double *x, int n, double mu, double sigma, int dbg)
{
// note that this assumes no third dimension to the grid
int myblock = blockIdx.x + blockIdx.y * gridDim.x;
// size of each block (within grid of blocks)
int blocksize = blockDim.x * blockDim.y * blockDim.z;
// id of thread in a given block
int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x;
// assign overall id/index of the thread
int idx = myblock * blocksize + subthread;
if (idx < n) {
if (dbg){
printf("thread idx: %04d\\t x[%d] = %f\\t (n=%d,mu=%f,sigma=%f)\\n",idx,idx,x[idx],n,mu,sigma);
}
double std = (x[idx] - mu)/sigma;
double e = exp( - 0.5 * std * std);
vals[idx] = e / ( sigma * SQRT_TWO_PI);
} else {
if (dbg){
printf("thread idx: %04d\\t (>=n=%d)\\n",idx,n);
}
}
return;
}
""")
# Look up the compiled __global__ function and wrap it as a Python callable.
calc_loglik = mod.get_function("calc_loglik")
# Arguments must be numpy datatypes, i.e. a plain Python int (n = 1000) will not
# work when passed directly to the kernel: the int must match the C "int" parameter.
n = np.int32(134217728)
# Launch configuration: 1024 threads per block, arranged in a square 2-D grid just
# large enough to cover all n elements (surplus threads exit via the idx < n guard).
threads_per_block = 1024
block_dims = (threads_per_block, 1, 1)
# Exact integer ceiling-division, then the smallest square grid holding that many
# blocks; avoids float-rounding surprises from sqrt(n/threads_per_block) at large n.
num_blocks = (int(n) + threads_per_block - 1) // threads_per_block
grid_d = int(math.ceil(math.sqrt(num_blocks)))
grid_dims = (grid_d, grid_d, 1)
print("Generating random normals...")
x = np.random.normal(size = n)
# Evaluate the density at N(0.3, 1.5); scalars must be float64 to match the
# kernel's "double" parameters.
mu = np.float64(0.3)
sigma = np.float64(1.5)
dbg = False  # set True to enable per-thread printf tracing inside the kernel
verbose = np.int32(dbg)
# Allocate storage for the result:
out = np.zeros_like(x)
# Create two CUDA events, used as timers for both the GPU and CPU runs:
start = drv.Event()
end = drv.Event()
# Launch the kernel; drv.In/drv.Out perform the host<->device copies automatically.
print("Running GPU code...")
start.record()
calc_loglik(drv.Out(out), drv.In(x), n, mu, sigma, verbose,
            block=block_dims, grid=grid_dims)
end.record()  # end timing
# wait for the GPU to finish before reading the elapsed time
end.synchronize()
gpu_secs = start.time_till(end) * 1e-3  # time_till returns milliseconds
print("Time for calculation (GPU): %fs" % gpu_secs)
# Scipy reference implementation on the CPU, timed with the same event pair:
print("Running Scipy CPU code...")
start.record()
out2 = norm.pdf(x, loc=mu, scale=sigma)
end.record()  # end timing
# calculate the run length
end.synchronize()
cpu_secs = start.time_till(end) * 1e-3
print("Time for calculation (CPU): %fs" % cpu_secs)
# Spot-check that the first few GPU values agree with the CPU reference.
print("Output from GPU: %f %f %f" % (out[0], out[1], out[2]))
print("Output from CPU: %f %f %f" % (out2[0], out2[1], out2[2]))