Skip to content

Commit 0becdfe

Browse files
committed
50% optimisations for nvidia (and other GPUs with large work groups);
1 parent 5a9d76c commit 0becdfe

7 files changed

+196
-24
lines changed

bitonic_sort.vcxproj

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<?xml version="1.0" encoding="utf-8"?>
2-
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
2+
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
33
<ItemGroup Label="ProjectConfigurations">
44
<ProjectConfiguration Include="Debug|Win32">
55
<Configuration>Debug</Configuration>
@@ -22,31 +22,32 @@
2222
<ProjectGuid>{F89622BD-0044-4082-A6C6-F5D9678EBAE5}</ProjectGuid>
2323
<Keyword>Win32Proj</Keyword>
2424
<RootNamespace>bitonic_sort</RootNamespace>
25+
<WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
2526
</PropertyGroup>
2627
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
2728
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
2829
<ConfigurationType>Application</ConfigurationType>
2930
<UseDebugLibraries>true</UseDebugLibraries>
30-
<PlatformToolset>v120</PlatformToolset>
31+
<PlatformToolset>v141</PlatformToolset>
3132
<CharacterSet>Unicode</CharacterSet>
3233
</PropertyGroup>
3334
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
3435
<ConfigurationType>Application</ConfigurationType>
3536
<UseDebugLibraries>true</UseDebugLibraries>
36-
<PlatformToolset>v120</PlatformToolset>
37+
<PlatformToolset>v141</PlatformToolset>
3738
<CharacterSet>Unicode</CharacterSet>
3839
</PropertyGroup>
3940
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
4041
<ConfigurationType>Application</ConfigurationType>
4142
<UseDebugLibraries>false</UseDebugLibraries>
42-
<PlatformToolset>v120</PlatformToolset>
43+
<PlatformToolset>v141</PlatformToolset>
4344
<WholeProgramOptimization>true</WholeProgramOptimization>
4445
<CharacterSet>Unicode</CharacterSet>
4546
</PropertyGroup>
4647
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
4748
<ConfigurationType>Application</ConfigurationType>
4849
<UseDebugLibraries>false</UseDebugLibraries>
49-
<PlatformToolset>v120</PlatformToolset>
50+
<PlatformToolset>v141</PlatformToolset>
5051
<WholeProgramOptimization>true</WholeProgramOptimization>
5152
<CharacterSet>Unicode</CharacterSet>
5253
</PropertyGroup>

bitonic_sort_gpu.cpp

+62-2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,38 @@ void bitonic_512_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a
3434
clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic512, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
3535
}
3636

37+
void bitonic_1024_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other)
38+
{
39+
const int kernelSize = (a_N >> 1);
40+
41+
int iSize = kernelSize;
42+
size_t a_size = kernelSize;
43+
size_t localWorkSize = 512;
44+
45+
clSetKernelArg(other.bitonic1024, 0, sizeof(cl_mem), (void*)&a_buffer);
46+
clSetKernelArg(other.bitonic1024, 1, sizeof(cl_int), (void*)&stage);
47+
clSetKernelArg(other.bitonic1024, 2, sizeof(cl_int), (void*)&passOfStage);
48+
clSetKernelArg(other.bitonic1024, 3, sizeof(cl_int), (void*)&a_invertModeOn);
49+
50+
clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic1024, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
51+
}
52+
53+
void bitonic_2048_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other)
54+
{
55+
const int kernelSize = (a_N >> 1);
56+
57+
int iSize = kernelSize;
58+
size_t a_size = kernelSize;
59+
size_t localWorkSize = 1024;
60+
61+
clSetKernelArg(other.bitonic2048, 0, sizeof(cl_mem), (void*)&a_buffer);
62+
clSetKernelArg(other.bitonic2048, 1, sizeof(cl_int), (void*)&stage);
63+
clSetKernelArg(other.bitonic2048, 2, sizeof(cl_int), (void*)&passOfStage);
64+
clSetKernelArg(other.bitonic2048, 3, sizeof(cl_int), (void*)&a_invertModeOn);
65+
66+
clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic2048, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
67+
}
68+
3769

3870
void bitonic_sort_gpu_simple(cl_mem a_data, int a_N, BitonicCLArgs other)
3971
{
@@ -61,13 +93,31 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other)
6193
for (int temp = a_N; temp > 2; temp >>= 1)
6294
numStages++;
6395

96+
// not all devices can have large work groups!
97+
//
98+
size_t maxWorkGroupSize = 0;
99+
if (other.dev != 0)
100+
clGetDeviceInfo(other.dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL);
101+
else
102+
maxWorkGroupSize = 256;
103+
64104
// up, form bitonic sequence with half allays
65105
//
66106
for (int stage = 0; stage < numStages; stage++)
67107
{
68108
for (int passOfStage = stage; passOfStage >= 0; passOfStage--)
69109
{
70-
if (passOfStage > 0 && passOfStage <= 8)
110+
if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024)
111+
{
112+
bitonic_2048_gpu(a_data, a_N, stage, passOfStage, 1, other);
113+
break;
114+
}
115+
else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512)
116+
{
117+
bitonic_1024_gpu(a_data, a_N, stage, passOfStage, 1, other);
118+
break;
119+
}
120+
else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256)
71121
{
72122
bitonic_512_gpu(a_data, a_N, stage, passOfStage, 1, other);
73123
break;
@@ -81,7 +131,17 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other)
81131
//
82132
for (int passOfStage = numStages; passOfStage >= 0; passOfStage--)
83133
{
84-
if (passOfStage > 0 && passOfStage <= 8)
134+
if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024)
135+
{
136+
bitonic_2048_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
137+
break;
138+
}
139+
else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512)
140+
{
141+
bitonic_1024_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
142+
break;
143+
}
144+
else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256)
85145
{
86146
bitonic_512_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
87147
break;

bitonic_sort_gpu.h

+4
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,12 @@
55
struct BitonicCLArgs
66
{
77
cl_kernel bitonic512;
8+
cl_kernel bitonic1024;
9+
cl_kernel bitonic2048;
810
cl_kernel bitonicPassK;
11+
912
cl_command_queue cmdQueue;
13+
cl_device_id dev;
1014
};
1115

1216
void bitonic_sort_gpu(cl_mem a_buffer, int a_N, BitonicCLArgs other);

clew/clew.vcxproj

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<?xml version="1.0" encoding="utf-8"?>
2-
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
2+
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
33
<ItemGroup Label="ProjectConfigurations">
44
<ProjectConfiguration Include="Debug|Win32">
55
<Configuration>Debug</Configuration>
@@ -22,31 +22,32 @@
2222
<ProjectGuid>{5F13E40F-C0F1-4EF4-A775-AB8BC703DE88}</ProjectGuid>
2323
<Keyword>Win32Proj</Keyword>
2424
<RootNamespace>clew</RootNamespace>
25+
<WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
2526
</PropertyGroup>
2627
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
2728
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
2829
<ConfigurationType>StaticLibrary</ConfigurationType>
2930
<UseDebugLibraries>true</UseDebugLibraries>
30-
<PlatformToolset>v120</PlatformToolset>
31+
<PlatformToolset>v141</PlatformToolset>
3132
<CharacterSet>Unicode</CharacterSet>
3233
</PropertyGroup>
3334
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
3435
<ConfigurationType>StaticLibrary</ConfigurationType>
3536
<UseDebugLibraries>true</UseDebugLibraries>
36-
<PlatformToolset>v120</PlatformToolset>
37+
<PlatformToolset>v141</PlatformToolset>
3738
<CharacterSet>Unicode</CharacterSet>
3839
</PropertyGroup>
3940
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
4041
<ConfigurationType>StaticLibrary</ConfigurationType>
4142
<UseDebugLibraries>false</UseDebugLibraries>
42-
<PlatformToolset>v120</PlatformToolset>
43+
<PlatformToolset>v141</PlatformToolset>
4344
<WholeProgramOptimization>true</WholeProgramOptimization>
4445
<CharacterSet>Unicode</CharacterSet>
4546
</PropertyGroup>
4647
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
4748
<ConfigurationType>StaticLibrary</ConfigurationType>
4849
<UseDebugLibraries>false</UseDebugLibraries>
49-
<PlatformToolset>v120</PlatformToolset>
50+
<PlatformToolset>v141</PlatformToolset>
5051
<WholeProgramOptimization>true</WholeProgramOptimization>
5152
<CharacterSet>Unicode</CharacterSet>
5253
</PropertyGroup>

main.cpp

+14-7
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ int main(int argc, const char** argv)
6868

6969
cl_kernel bitonicPassK = bitonicProgs.kernel("bitonic_pass_kernel");
7070
cl_kernel bitonicOpt = bitonicProgs.kernel("bitonic_512");
71+
cl_kernel bitonicOpt2 = bitonicProgs.kernel("bitonic_1024");
72+
cl_kernel bitonicOpt3 = bitonicProgs.kernel("bitonic_2048");
7173

7274
auto gpuData = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int2)*data3.size(), &data3[0], &ciErr1);
7375

@@ -81,10 +83,12 @@ int main(int argc, const char** argv)
8183

8284
{
8385
BitonicCLArgs args;
86+
args.dev = device;
8487
args.cmdQueue = cmdQueue;
8588
args.bitonicPassK = bitonicPassK;
8689
args.bitonic512 = nullptr; // bitonic_sort_gpu_simple don't use shmem kernel
87-
90+
args.bitonic1024 = nullptr; // bitonic_sort_gpu_simple don't use shmem kernel
91+
args.bitonic2048 = nullptr;
8892
bitonic_sort_gpu_simple(gpuData, int(data2.size()), args);
8993
}
9094

@@ -109,9 +113,9 @@ int main(int argc, const char** argv)
109113
}
110114

111115
if (passed)
112-
std::cout << "gpu test sort simple PASSED!" << std::endl;
116+
std::cout << "gpu test sort simple\tPASSED!" << std::endl;
113117
else
114-
std::cout << "gpu test sort simple FAILED! (" << faileId << ")" << std::endl;
118+
std::cout << "gpu test sort simple\tFAILED! (" << faileId << ")" << std::endl;
115119

116120
//
117121
//
@@ -123,9 +127,12 @@ int main(int argc, const char** argv)
123127

124128
{
125129
BitonicCLArgs args;
126-
args.cmdQueue = cmdQueue;
130+
args.dev = device;
131+
args.cmdQueue = cmdQueue;
127132
args.bitonicPassK = bitonicPassK;
128-
args.bitonic512 = bitonicOpt;
133+
args.bitonic512 = bitonicOpt;
134+
args.bitonic1024 = bitonicOpt2;
135+
args.bitonic2048 = bitonicOpt3;
129136

130137
bitonic_sort_gpu(gpuData, int(data3.size()), args);
131138
}
@@ -151,9 +158,9 @@ int main(int argc, const char** argv)
151158
}
152159

153160
if (passed2)
154-
std::cout << "gpu test sort opt PASSED!" << std::endl;
161+
std::cout << "gpu test sort opt\tPASSED!" << std::endl;
155162
else
156-
std::cout << "gpu test sort opt FAILED! (" << faileId << ")" << std::endl;
163+
std::cout << "gpu test sort opt\tFAILED! (" << faileId << ")" << std::endl;
157164

158165
std::cout << std::endl;
159166
std::cout << "[CPU]: std::sort time = " << time1 << " ms" << std::endl;

sort.cl

+98
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,101 @@ __kernel void bitonic_512(__global ElemT* theArray, int stage, int passOfStageBe
8585
theArray[blockId*512 + lid + 256] = s_array[lid + 256];
8686

8787
}
88+
89+
__kernel void bitonic_1024(__global ElemT* theArray, int stage, int passOfStageBegin, int a_invertModeOn)
90+
{
91+
int tid = get_global_id(0);
92+
int lid = get_local_id(0);
93+
94+
int blockId = tid / 512;
95+
96+
__local ElemT s_array[1024];
97+
98+
s_array[lid + 0 ] = theArray[blockId * 1024 + lid + 0];
99+
s_array[lid + 512] = theArray[blockId * 1024 + lid + 512];
100+
101+
barrier(CLK_LOCAL_MEM_FENCE);
102+
103+
for (int passOfStage = passOfStageBegin; passOfStage >= 0; passOfStage--)
104+
{
105+
const int j = lid;
106+
const int r = 1 << (passOfStage);
107+
const int lmask = r - 1;
108+
109+
const int left = ((j >> passOfStage) << (passOfStage + 1)) + (j & lmask);
110+
const int right = left + r;
111+
112+
const ElemT a = s_array[left];
113+
const ElemT b = s_array[right];
114+
115+
const bool cmpRes = compare(a, b);
116+
117+
const ElemT minElem = cmpRes ? a : b;
118+
const ElemT maxElem = cmpRes ? b : a;
119+
120+
const int oddEven = tid >> stage; // (j >> stage)
121+
122+
const bool isSwap = (oddEven & 1) & a_invertModeOn;
123+
124+
const int minId = isSwap ? right : left;
125+
const int maxId = isSwap ? left : right;
126+
127+
s_array[minId] = minElem;
128+
s_array[maxId] = maxElem;
129+
130+
barrier(CLK_LOCAL_MEM_FENCE);
131+
}
132+
133+
theArray[blockId * 1024 + lid + 0] = s_array[lid + 0];
134+
theArray[blockId * 1024 + lid + 512] = s_array[lid + 512];
135+
}
136+
137+
138+
__kernel void bitonic_2048(__global ElemT* theArray, int stage, int passOfStageBegin, int a_invertModeOn)
139+
{
140+
int tid = get_global_id(0);
141+
int lid = get_local_id(0);
142+
143+
int blockId = tid / 1024;
144+
145+
__local ElemT s_array[2048];
146+
147+
s_array[lid + 0 ] = theArray[blockId * 2048 + lid + 0];
148+
s_array[lid + 1024] = theArray[blockId * 2048 + lid + 1024];
149+
150+
barrier(CLK_LOCAL_MEM_FENCE);
151+
152+
for (int passOfStage = passOfStageBegin; passOfStage >= 0; passOfStage--)
153+
{
154+
const int j = lid;
155+
const int r = 1 << (passOfStage);
156+
const int lmask = r - 1;
157+
158+
const int left = ((j >> passOfStage) << (passOfStage + 1)) + (j & lmask);
159+
const int right = left + r;
160+
161+
const ElemT a = s_array[left];
162+
const ElemT b = s_array[right];
163+
164+
const bool cmpRes = compare(a, b);
165+
166+
const ElemT minElem = cmpRes ? a : b;
167+
const ElemT maxElem = cmpRes ? b : a;
168+
169+
const int oddEven = tid >> stage; // (j >> stage)
170+
171+
const bool isSwap = (oddEven & 1) & a_invertModeOn;
172+
173+
const int minId = isSwap ? right : left;
174+
const int maxId = isSwap ? left : right;
175+
176+
s_array[minId] = minElem;
177+
s_array[maxId] = maxElem;
178+
179+
barrier(CLK_LOCAL_MEM_FENCE);
180+
}
181+
182+
theArray[blockId * 2048 + lid + 0] = s_array[lid + 0];
183+
theArray[blockId * 2048 + lid + 1024] = s_array[lid + 1024];
184+
}
185+

vsgl3/vsgl3.vcxproj

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<?xml version="1.0" encoding="utf-8"?>
2-
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
2+
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
33
<ItemGroup Label="ProjectConfigurations">
44
<ProjectConfiguration Include="Debug|Win32">
55
<Configuration>Debug</Configuration>
@@ -21,29 +21,30 @@
2121
<PropertyGroup Label="Globals">
2222
<ProjectGuid>{2758DD4A-78F6-452F-BBF5-4E86B46BD2EA}</ProjectGuid>
2323
<RootNamespace>vsgl3</RootNamespace>
24+
<WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
2425
</PropertyGroup>
2526
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
2627
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
2728
<ConfigurationType>StaticLibrary</ConfigurationType>
2829
<CharacterSet>MultiByte</CharacterSet>
2930
<WholeProgramOptimization>true</WholeProgramOptimization>
30-
<PlatformToolset>v120</PlatformToolset>
31+
<PlatformToolset>v141</PlatformToolset>
3132
</PropertyGroup>
3233
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
3334
<ConfigurationType>StaticLibrary</ConfigurationType>
3435
<CharacterSet>MultiByte</CharacterSet>
35-
<PlatformToolset>v120</PlatformToolset>
36+
<PlatformToolset>v141</PlatformToolset>
3637
</PropertyGroup>
3738
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
3839
<ConfigurationType>StaticLibrary</ConfigurationType>
3940
<CharacterSet>MultiByte</CharacterSet>
4041
<WholeProgramOptimization>true</WholeProgramOptimization>
41-
<PlatformToolset>v120</PlatformToolset>
42+
<PlatformToolset>v141</PlatformToolset>
4243
</PropertyGroup>
4344
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
4445
<ConfigurationType>StaticLibrary</ConfigurationType>
4546
<CharacterSet>MultiByte</CharacterSet>
46-
<PlatformToolset>v120</PlatformToolset>
47+
<PlatformToolset>v141</PlatformToolset>
4748
</PropertyGroup>
4849
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
4950
<ImportGroup Label="ExtensionSettings">

0 commit comments

Comments
 (0)