50% optimisations for nvidia (and other GPUs with large work groups);

FROL256 · FROL256 · commit 0becdfe10e07 · 2018-02-20T18:38:37.000+03:00
diff --git a/bitonic_sort.vcxproj b/bitonic_sort.vcxproj
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
     <ProjectConfiguration Include="Debug|Win32">
       <Configuration>Debug</Configuration>
@@ -22,31 +22,32 @@
     <ProjectGuid>{F89622BD-0044-4082-A6C6-F5D9678EBAE5}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>bitonic_sort</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
diff --git a/bitonic_sort_gpu.cpp b/bitonic_sort_gpu.cpp
@@ -34,6 +34,38 @@ void bitonic_512_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a
   clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic512, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
 }
 
+void bitonic_1024_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other)
+{
+  const int kernelSize = (a_N >> 1);
+
+  int    iSize         = kernelSize;
+  size_t a_size        = kernelSize;
+  size_t localWorkSize = 512;
+
+  clSetKernelArg(other.bitonic1024, 0, sizeof(cl_mem), (void*)&a_buffer);
+  clSetKernelArg(other.bitonic1024, 1, sizeof(cl_int), (void*)&stage);
+  clSetKernelArg(other.bitonic1024, 2, sizeof(cl_int), (void*)&passOfStage);
+  clSetKernelArg(other.bitonic1024, 3, sizeof(cl_int), (void*)&a_invertModeOn);
+
+  clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic1024, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
+}
+
+void bitonic_2048_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other)
+{
+  const int kernelSize = (a_N >> 1);
+
+  int    iSize         = kernelSize;
+  size_t a_size        = kernelSize;
+  size_t localWorkSize = 1024;
+
+  clSetKernelArg(other.bitonic2048, 0, sizeof(cl_mem), (void*)&a_buffer);
+  clSetKernelArg(other.bitonic2048, 1, sizeof(cl_int), (void*)&stage);
+  clSetKernelArg(other.bitonic2048, 2, sizeof(cl_int), (void*)&passOfStage);
+  clSetKernelArg(other.bitonic2048, 3, sizeof(cl_int), (void*)&a_invertModeOn);
+
+  clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic2048, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
+}
+
 
 void bitonic_sort_gpu_simple(cl_mem a_data, int a_N, BitonicCLArgs other)
 {
@@ -61,13 +93,31 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other)
   for (int temp = a_N; temp > 2; temp >>= 1)
     numStages++;
 
+  // not all devices can have large work groups!
+  //
+  size_t maxWorkGroupSize = 0;
+  if (other.dev != 0)
+    clGetDeviceInfo(other.dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL);
+  else
+    maxWorkGroupSize = 256;
+
   // up, form bitonic sequence with half allays
   //
   for (int stage = 0; stage < numStages; stage++)
   {
     for (int passOfStage = stage; passOfStage >= 0; passOfStage--)
     {
-      if (passOfStage > 0 && passOfStage <= 8)
+      if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024)
+      {
+        bitonic_2048_gpu(a_data, a_N, stage, passOfStage, 1, other);
+        break;
+      }
+      else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512)
+      {
+        bitonic_1024_gpu(a_data, a_N, stage, passOfStage, 1, other);
+        break;
+      }
+      else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256)
       {
         bitonic_512_gpu(a_data, a_N, stage, passOfStage, 1, other);
         break;
@@ -81,7 +131,17 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other)
   //
   for (int passOfStage = numStages; passOfStage >= 0; passOfStage--)
   {
-    if (passOfStage > 0 && passOfStage <= 8)
+    if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024)
+    {
+      bitonic_2048_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
+      break;
+    }
+    else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512)
+    {
+      bitonic_1024_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
+      break;
+    }
+    else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256)
     {
       bitonic_512_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
       break;
diff --git a/bitonic_sort_gpu.h b/bitonic_sort_gpu.h
@@ -5,8 +5,12 @@
 struct BitonicCLArgs
 {
   cl_kernel        bitonic512;
+  cl_kernel        bitonic1024;
+  cl_kernel        bitonic2048;
   cl_kernel        bitonicPassK;
+
   cl_command_queue cmdQueue;
+  cl_device_id     dev;
 };
 
 void bitonic_sort_gpu(cl_mem a_buffer, int a_N, BitonicCLArgs other);
diff --git a/clew/clew.vcxproj b/clew/clew.vcxproj
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
     <ProjectConfiguration Include="Debug|Win32">
       <Configuration>Debug</Configuration>
@@ -22,31 +22,32 @@
     <ProjectGuid>{5F13E40F-C0F1-4EF4-A775-AB8BC703DE88}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>clew</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
diff --git a/main.cpp b/main.cpp
@@ -68,6 +68,8 @@ int main(int argc, const char** argv)
 
     cl_kernel bitonicPassK = bitonicProgs.kernel("bitonic_pass_kernel");
     cl_kernel bitonicOpt   = bitonicProgs.kernel("bitonic_512");
+    cl_kernel bitonicOpt2  = bitonicProgs.kernel("bitonic_1024");
+    cl_kernel bitonicOpt3  = bitonicProgs.kernel("bitonic_2048");
 
     auto gpuData = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int2)*data3.size(), &data3[0], &ciErr1);
 
@@ -81,10 +83,12 @@ int main(int argc, const char** argv)
 
     {
       BitonicCLArgs args;
+      args.dev          = device;
       args.cmdQueue     = cmdQueue;
       args.bitonicPassK = bitonicPassK;
       args.bitonic512   = nullptr;  // bitonic_sort_gpu_simple don't use shmem kernel
-
+      args.bitonic1024  = nullptr;  // bitonic_sort_gpu_simple don't use shmem kernel
+      args.bitonic2048  = nullptr;
       bitonic_sort_gpu_simple(gpuData, int(data2.size()), args);
     }
 
@@ -109,9 +113,9 @@ int main(int argc, const char** argv)
     }
 
     if (passed)
-      std::cout << "gpu test sort simple PASSED!" << std::endl;
+      std::cout << "gpu test sort simple\tPASSED!" << std::endl;
     else
-      std::cout << "gpu test sort simple FAILED! (" << faileId << ")" << std::endl;
+      std::cout << "gpu test sort simple\tFAILED! (" << faileId << ")" << std::endl;
 
     // 
     //
@@ -123,9 +127,12 @@ int main(int argc, const char** argv)
 
     {
       BitonicCLArgs args;
-      args.cmdQueue = cmdQueue;
+      args.dev          = device;
+      args.cmdQueue     = cmdQueue;
       args.bitonicPassK = bitonicPassK;
-      args.bitonic512 = bitonicOpt;
+      args.bitonic512   = bitonicOpt;
+      args.bitonic1024  = bitonicOpt2;
+      args.bitonic2048  = bitonicOpt3;
 
       bitonic_sort_gpu(gpuData, int(data3.size()), args);
     }
@@ -151,9 +158,9 @@ int main(int argc, const char** argv)
     }
 
     if (passed2)
-      std::cout << "gpu test sort opt PASSED!" << std::endl;
+      std::cout << "gpu test sort opt\tPASSED!" << std::endl;
     else
-      std::cout << "gpu test sort opt FAILED! (" << faileId << ")" << std::endl;
+      std::cout << "gpu test sort opt\tFAILED! (" << faileId << ")" << std::endl;
 
     std::cout << std::endl;
     std::cout << "[CPU]: std::sort time      = " << time1 << " ms" << std::endl;
diff --git a/sort.cl b/sort.cl
@@ -85,3 +85,101 @@ __kernel void bitonic_512(__global ElemT* theArray, int stage, int passOfStageBe
   theArray[blockId*512 + lid + 256] = s_array[lid + 256];
 
 }
+
+__kernel void bitonic_1024(__global ElemT* theArray, int stage, int passOfStageBegin, int a_invertModeOn)
+{
+  int tid = get_global_id(0);
+  int lid = get_local_id(0);
+
+  int blockId = tid / 512;
+
+  __local ElemT s_array[1024];
+
+  s_array[lid + 0  ] = theArray[blockId * 1024 + lid + 0];
+  s_array[lid + 512] = theArray[blockId * 1024 + lid + 512];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int passOfStage = passOfStageBegin; passOfStage >= 0; passOfStage--)
+  {
+    const int j = lid;
+    const int r = 1 << (passOfStage);
+    const int lmask = r - 1;
+
+    const int left = ((j >> passOfStage) << (passOfStage + 1)) + (j & lmask);
+    const int right = left + r;
+
+    const ElemT a = s_array[left];
+    const ElemT b = s_array[right];
+
+    const bool cmpRes = compare(a, b);
+
+    const ElemT minElem = cmpRes ? a : b;
+    const ElemT maxElem = cmpRes ? b : a;
+
+    const int oddEven = tid >> stage; // (j >> stage)
+
+    const bool isSwap = (oddEven & 1) & a_invertModeOn;
+
+    const int minId = isSwap ? right : left;
+    const int maxId = isSwap ? left : right;
+
+    s_array[minId] = minElem;
+    s_array[maxId] = maxElem;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  theArray[blockId * 1024 + lid + 0]   = s_array[lid + 0];
+  theArray[blockId * 1024 + lid + 512] = s_array[lid + 512];
+}
+
+
+__kernel void bitonic_2048(__global ElemT* theArray, int stage, int passOfStageBegin, int a_invertModeOn)
+{
+  int tid = get_global_id(0);
+  int lid = get_local_id(0);
+
+  int blockId = tid / 1024;
+
+  __local ElemT s_array[2048];
+
+  s_array[lid + 0   ] = theArray[blockId * 2048 + lid + 0];
+  s_array[lid + 1024] = theArray[blockId * 2048 + lid + 1024];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int passOfStage = passOfStageBegin; passOfStage >= 0; passOfStage--)
+  {
+    const int j = lid;
+    const int r = 1 << (passOfStage);
+    const int lmask = r - 1;
+
+    const int left = ((j >> passOfStage) << (passOfStage + 1)) + (j & lmask);
+    const int right = left + r;
+
+    const ElemT a = s_array[left];
+    const ElemT b = s_array[right];
+
+    const bool cmpRes = compare(a, b);
+
+    const ElemT minElem = cmpRes ? a : b;
+    const ElemT maxElem = cmpRes ? b : a;
+
+    const int oddEven = tid >> stage; // (j >> stage)
+
+    const bool isSwap = (oddEven & 1) & a_invertModeOn;
+
+    const int minId = isSwap ? right : left;
+    const int maxId = isSwap ? left : right;
+
+    s_array[minId] = minElem;
+    s_array[maxId] = maxElem;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  theArray[blockId * 2048 + lid + 0]    = s_array[lid + 0];
+  theArray[blockId * 2048 + lid + 1024] = s_array[lid + 1024];
+}
+
diff --git a/vsgl3/vsgl3.vcxproj b/vsgl3/vsgl3.vcxproj
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
     <ProjectConfiguration Include="Debug|Win32">
       <Configuration>Debug</Configuration>
@@ -21,29 +21,30 @@
   <PropertyGroup Label="Globals">
     <ProjectGuid>{2758DD4A-78F6-452F-BBF5-4E86B46BD2EA}</ProjectGuid>
     <RootNamespace>vsgl3</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <CharacterSet>MultiByte</CharacterSet>
     <WholeProgramOptimization>true</WholeProgramOptimization>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <CharacterSet>MultiByte</CharacterSet>
     <WholeProgramOptimization>true</WholeProgramOptimization>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v141</PlatformToolset>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">