forked from gorgonia/gorgonia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cuda.go
190 lines (160 loc) · 4.63 KB
/
cuda.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
// +build cuda
package gorgonia
// for non-cuda builds, look at noextern.go
import (
"log"
"github.com/chewxy/cu"
)
// CUDA is true in this build: the "cuda" build tag is set, so CUDA-backed
// execution is available (contrast with noextern.go for non-CUDA builds).
const CUDA = true

// cudaStdLib maps kernel-module names to their CUDA source/PTX text.
// It is populated by the generated code produced by the cudagen tool below.
var cudaStdLib map[string]string

//go:generate cudagen
// CUDAMachine is a representation of CUDA capable VMs.
type CUDAMachine interface {
	External

	// Contexts returns the CUDA contexts in use, one per device.
	Contexts() []cu.Context
	// Modules returns the loaded CUDA modules, keyed by name, per device.
	Modules() map[string][]cu.Module
	// Functions returns the loaded CUDA functions, keyed by name, per device.
	Functions() map[string][]cu.Function

	// ElemGridSize computes the launch configuration (grid and block
	// dimensions) for an elementwise kernel over n elements on device dev.
	ElemGridSize(n, dev int) (gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ int)
}
// ExternMetadata holds any metadata for CUDA related stuff.
// The slices in there are indexed by deviceID
type ExternMetadata struct {
	warp []int // WarpSize per device
	mtpb []int // MaxThreadsPerBlock per device
	mgdx []int // MaxGridDimX per device
	mgdy []int // MaxGridDimY per device
	mgdz []int // MaxGridDimZ per device
	mbdx []int // MaxBlockDimX per device
	mbdy []int // MaxBlockDimY per device
	mbdz []int // MaxBlockDimZ per device

	c []cu.Context // one context per device, created in init
	d []cu.Device  // device handles, parallel to c
	m map[string][]cu.Module   // loaded modules by name
	f map[string][]cu.Function // loaded functions by name
}
// ElemGridSize calculates the CUDA launch configuration (grid and block
// dimensions) for an elementwise operation over n elements on device dev.
//
// Blocks are filled to the device's MaxThreadsPerBlock first, then the
// required block count spills into the X, Y and Z grid dimensions in that
// order. If dev is not a valid device index, all six dimensions are
// returned as zero.
func (md ExternMetadata) ElemGridSize(n, dev int) (gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ int) {
	// Bounds guard. The original check was `dev > len(md.warp)`, which let
	// dev == len(md.warp) through (and had an empty body anyway), so an
	// out-of-range device panicked on the slice indexing below.
	if dev < 0 || dev >= len(md.mtpb) {
		return // all zeros: no valid launch configuration for this device
	}

	maxThreads := md.mtpb[dev]
	maxGridX := md.mgdx[dev]
	maxGridY := md.mgdy[dev]
	maxGridZ := md.mgdz[dev]

	blockDimX = 1
	blockDimY = 1
	blockDimZ = 1
	gridDimX = 1
	gridDimY = 1
	gridDimZ = 1

	blocks := calcBlocks(n, maxThreads)
	switch {
	case blocks == 1:
		// Everything fits in a single block.
		blockDimX = n
	case blocks >= maxGridX*maxGridY*maxGridZ:
		// what kind of monstrosity is this??!
		// NOTE(review): n exceeds even the full 3D grid; this silently
		// returns the 1x1x1 defaults — consider surfacing an error.
	case blocks >= maxGridX*maxGridY:
		gridDimX = maxGridX
		gridDimY = maxGridY
		// NOTE(review): this uses blocks % (maxGridX*maxGridY); verify it
		// shouldn't be the quotient blocks / (maxGridX*maxGridY) instead.
		gridDimZ = calcBlocks(blocks%(maxGridX*maxGridY), maxGridZ)
		blockDimX = maxThreads
	case blocks >= maxGridX:
		gridDimX = maxGridX
		gridDimY = calcBlocks(blocks%maxGridX, maxGridY)
		blockDimX = maxThreads
	default:
		gridDimX = blocks
		blockDimX = maxThreads
	}
	return
}
// HasFunc reports whether the external (cgo/cuda/openCL) device has a loaded
// function with the given name.
//
// Note that BLAS names will always return false, even if using a BLAS that
// requires cgo calls (like Intel MKL).
func (m ExternMetadata) HasFunc(name string) bool {
	_, found := m.f[name]
	return found
}
// Contexts returns the CUDA contexts used by this CUDAMachine, one per device.
func (m ExternMetadata) Contexts() []cu.Context {
	return m.c
}

// Modules returns the CUDA modules loaded into this CUDAMachine, keyed by name.
func (m ExternMetadata) Modules() map[string][]cu.Module {
	return m.m
}

// Functions returns the CUDA functions loaded into this CUDAMachine, keyed by name.
func (m ExternMetadata) Functions() map[string][]cu.Function {
	return m.f
}
// init enumerates the CUDA devices present, creates a context for each, and
// records the per-device launch limits (warp size, max threads per block,
// max grid/block dimensions) that ElemGridSize relies on. On any failure the
// partially-initialized state is torn down via cleanup and init returns.
//
// The receiver must be a pointer: the original value receiver populated a
// copy of the struct, so every slice, map and context created here was
// silently discarded when init returned.
func (m *ExternMetadata) init() {
	devices, err := cu.NumDevices()
	if err != nil {
		cudaLogf("Failed to get number of devices: %v", err)
		return
	}
	if devices == 0 {
		// No CUDA devices available; leave the metadata zero-valued.
		return
	}

	m.c = make([]cu.Context, devices)
	m.d = make([]cu.Device, devices)
	m.warp = make([]int, devices)
	m.mtpb = make([]int, devices)
	m.mgdx = make([]int, devices)
	m.mgdy = make([]int, devices)
	m.mgdz = make([]int, devices)
	m.mbdx = make([]int, devices)
	m.mbdy = make([]int, devices)
	m.mbdz = make([]int, devices)

	for i := range m.c {
		dev, err := cu.GetDevice(i)
		if err != nil {
			cudaLogf("Failed to get device %d: %v", i, err)
			m.cleanup()
			return
		}

		ctx, err := dev.MakeContext(cu.SchedAuto)
		if err != nil {
			if err == cu.OutOfMemory {
				// Report how much memory is left to aid debugging.
				var free, total int64
				if free, total, err = cu.MemInfo(); err != nil {
					cudaLogf("Error while getting mem info: %v", err)
				}
				cudaLogf("Out of memory. Free: %v, total %v", free, total)
				m.cleanup()
				return
			}
			cudaLogf("Failed to make context for device %d. Error: %v", i, err)
			m.cleanup()
			return
		}

		// Fetch all launch-limit attributes in one call; the order here
		// must match the assignment order below.
		var attrs []int
		if attrs, err = dev.Attributes(cu.WarpSize, cu.MaxThreadsPerBlock, cu.MaxGridDimX, cu.MaxGridDimY, cu.MaxGridDimZ, cu.MaxBlockDimX, cu.MaxBlockDimY, cu.MaxBlockDimZ); err != nil {
			cudaLogf("Failed to get attributes for device %d. Error: %v", i, err)
			m.cleanup()
			return
		}

		m.warp[i] = attrs[0]
		m.mtpb[i] = attrs[1]
		m.mgdx[i] = attrs[2]
		m.mgdy[i] = attrs[3]
		m.mgdz[i] = attrs[4]
		m.mbdx[i] = attrs[5]
		m.mbdy[i] = attrs[6]
		m.mbdz[i] = attrs[7]

		m.c[i] = ctx
		m.d[i] = dev
	}

	// Make device 0's context current by default.
	if len(m.c) > 0 {
		cu.SetCurrent(m.c[0])
	}
	m.m = make(map[string][]cu.Module)
	m.f = make(map[string][]cu.Function)
}
// cleanup drops the metadata's references to contexts, devices, modules and
// functions so they can be reclaimed, leaving the struct in its zero state.
//
// The receiver must be a pointer: the original value receiver assigned nil
// to fields of a copy, leaving the caller's struct untouched.
func (m *ExternMetadata) cleanup() {
	m.c = nil
	m.m = nil
	m.f = nil
	m.d = nil
}
// init announces once at program start-up that this is the CUDA-enabled
// build (the "cuda" build tag selected this file over noextern.go).
func init() {
	log.Print("Using CUDA build")
}
// calcBlocks returns the number of blocks of size maxThreads needed to cover
// n items — i.e. the ceiling of n/maxThreads. It lives here to avoid mixing
// with any potential ceilInt operation.
func calcBlocks(n, maxThreads int) int {
	rounded := n + maxThreads - 1
	return rounded / maxThreads
}