@@ -34,6 +34,38 @@ void bitonic_512_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a
34
34
clEnqueueNDRangeKernel (other.cmdQueue , other.bitonic512 , 1 , NULL , &a_size, &localWorkSize, 0 , NULL , NULL );
35
35
}
36
36
37
+ void bitonic_1024_gpu (cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other)
38
+ {
39
+ const int kernelSize = (a_N >> 1 );
40
+
41
+ int iSize = kernelSize;
42
+ size_t a_size = kernelSize;
43
+ size_t localWorkSize = 512 ;
44
+
45
+ clSetKernelArg (other.bitonic1024 , 0 , sizeof (cl_mem), (void *)&a_buffer);
46
+ clSetKernelArg (other.bitonic1024 , 1 , sizeof (cl_int), (void *)&stage);
47
+ clSetKernelArg (other.bitonic1024 , 2 , sizeof (cl_int), (void *)&passOfStage);
48
+ clSetKernelArg (other.bitonic1024 , 3 , sizeof (cl_int), (void *)&a_invertModeOn);
49
+
50
+ clEnqueueNDRangeKernel (other.cmdQueue , other.bitonic1024 , 1 , NULL , &a_size, &localWorkSize, 0 , NULL , NULL );
51
+ }
52
+
53
+ void bitonic_2048_gpu (cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other)
54
+ {
55
+ const int kernelSize = (a_N >> 1 );
56
+
57
+ int iSize = kernelSize;
58
+ size_t a_size = kernelSize;
59
+ size_t localWorkSize = 1024 ;
60
+
61
+ clSetKernelArg (other.bitonic2048 , 0 , sizeof (cl_mem), (void *)&a_buffer);
62
+ clSetKernelArg (other.bitonic2048 , 1 , sizeof (cl_int), (void *)&stage);
63
+ clSetKernelArg (other.bitonic2048 , 2 , sizeof (cl_int), (void *)&passOfStage);
64
+ clSetKernelArg (other.bitonic2048 , 3 , sizeof (cl_int), (void *)&a_invertModeOn);
65
+
66
+ clEnqueueNDRangeKernel (other.cmdQueue , other.bitonic2048 , 1 , NULL , &a_size, &localWorkSize, 0 , NULL , NULL );
67
+ }
68
+
37
69
38
70
void bitonic_sort_gpu_simple (cl_mem a_data, int a_N, BitonicCLArgs other)
39
71
{
@@ -61,13 +93,31 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other)
61
93
for (int temp = a_N; temp > 2 ; temp >>= 1 )
62
94
numStages++;
63
95
96
+ // not all devices can have large work groups!
97
+ //
98
+ size_t maxWorkGroupSize = 0 ;
99
+ if (other.dev != 0 )
100
+ clGetDeviceInfo (other.dev , CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (size_t ), &maxWorkGroupSize, NULL );
101
+ else
102
+ maxWorkGroupSize = 256 ;
103
+
64
104
// up, form bitonic sequence with half arrays
65
105
//
66
106
for (int stage = 0 ; stage < numStages; stage++)
67
107
{
68
108
for (int passOfStage = stage; passOfStage >= 0 ; passOfStage--)
69
109
{
70
- if (passOfStage > 0 && passOfStage <= 8 )
110
+ if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024 )
111
+ {
112
+ bitonic_2048_gpu (a_data, a_N, stage, passOfStage, 1 , other);
113
+ break ;
114
+ }
115
+ else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512 )
116
+ {
117
+ bitonic_1024_gpu (a_data, a_N, stage, passOfStage, 1 , other);
118
+ break ;
119
+ }
120
+ else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256 )
71
121
{
72
122
bitonic_512_gpu (a_data, a_N, stage, passOfStage, 1 , other);
73
123
break ;
@@ -81,7 +131,17 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other)
81
131
//
82
132
for (int passOfStage = numStages; passOfStage >= 0 ; passOfStage--)
83
133
{
84
- if (passOfStage > 0 && passOfStage <= 8 )
134
+ if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024 )
135
+ {
136
+ bitonic_2048_gpu (a_data, a_N, numStages - 1 , passOfStage, 0 , other);
137
+ break ;
138
+ }
139
+ else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512 )
140
+ {
141
+ bitonic_1024_gpu (a_data, a_N, numStages - 1 , passOfStage, 0 , other);
142
+ break ;
143
+ }
144
+ else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256 )
85
145
{
86
146
bitonic_512_gpu (a_data, a_N, numStages - 1 , passOfStage, 0 , other);
87
147
break ;
0 commit comments