Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify cudacompat layer to use a 1-dimensional grid #586

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 15 additions & 33 deletions HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,26 @@
#include <cstdint>
#include <cstring>

// include the CUDA runtime header to define some of the attributes, types and symbols also on the CPU
fwyzard marked this conversation as resolved.
Show resolved Hide resolved
#include <cuda_runtime.h>

// make sure functions are inlined to avoid multiple definitions
#undef __global__
#define __global__ inline __attribute__((always_inline))

#undef __forceinline__
#define __forceinline__ inline __attribute__((always_inline))

namespace cms {
namespace cudacompat {

#ifndef __CUDA_RUNTIME_H__
struct dim3 {
uint32_t x, y, z;
};
#endif
// run serially with a single thread
// 1-dimensional block
const dim3 threadIdx = {0, 0, 0};
const dim3 blockDim = {1, 1, 1};

extern thread_local dim3 blockIdx;
extern thread_local dim3 gridDim;
// 1-dimensional grid
const dim3 blockIdx = {0, 0, 0};
const dim3 gridDim = {1, 1, 1};

template <typename T1, typename T2>
T1 atomicCAS(T1* address, T1 compare, T2 val) {
Expand Down Expand Up @@ -78,35 +83,12 @@ namespace cms {
return *x;
}

inline void resetGrid() {
blockIdx = {0, 0, 0};
gridDim = {1, 1, 1};
}

} // namespace cudacompat
} // namespace cms

// some not needed as done by cuda runtime...
#ifndef __CUDA_RUNTIME_H__
#define __host__
#define __device__
#define __global__
#define __shared__
#define __forceinline__
#endif

// make sure functions are inlined to avoid multiple definitions
#ifndef __CUDA_ARCH__
#undef __global__
#define __global__ inline __attribute__((always_inline))
#undef __forceinline__
#define __forceinline__ inline __attribute__((always_inline))
#endif

#ifndef __CUDA_ARCH__
// make the cudacompat implementation available in the global namespace
using namespace cms::cudacompat;
#endif

#endif
#endif // __CUDACC__

#endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
17 changes: 0 additions & 17 deletions HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc

This file was deleted.

17 changes: 9 additions & 8 deletions RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@
#include <vector>

#ifdef __CUDACC__

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#endif

#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
Expand All @@ -33,7 +32,6 @@ int main(void) {
auto h_x = std::make_unique<uint16_t[]>(numElements);
auto h_y = std::make_unique<uint16_t[]>(numElements);
auto h_adc = std::make_unique<uint16_t[]>(numElements);

auto h_clus = std::make_unique<int[]>(numElements);

#ifdef __CUDACC__
Expand All @@ -46,11 +44,9 @@ int main(void) {
auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(MaxNumModules, nullptr);
auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(MaxNumModules, nullptr);
#else

auto h_moduleStart = std::make_unique<uint32_t[]>(MaxNumModules + 1);
auto h_clusInModule = std::make_unique<uint32_t[]>(MaxNumModules);
auto h_moduleId = std::make_unique<uint32_t[]>(MaxNumModules);

#endif

// later random number
Expand Down Expand Up @@ -301,9 +297,12 @@ int main(void) {

cudaDeviceSynchronize();
#else

h_moduleStart[0] = nModules;
countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n);
memset(h_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t));
#ifdef TODO_FIX_CLUSTERIZER_FOR_ANY_GRID_SIZE
// FIXME the findClus kernel should be rewritten to avoid relying on a predefined grid size
gridDim.x = MaxNumModules; // not needed in the kernel for this specific case
assert(blockIdx.x == 0);
for (; blockIdx.x < gridDim.x; ++blockIdx.x)
Expand All @@ -315,7 +314,7 @@ int main(void) {
h_moduleId.get(),
h_clus.get(),
n);
resetGrid();
#endif // TODO_FIX_CLUSTERIZER_FOR_ANY_GRID_SIZE

nModules = h_moduleStart[0];
auto nclus = h_clusInModule.get();
Expand All @@ -330,12 +329,14 @@ int main(void) {
if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0))
std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;

#ifdef TODO_FIX_CLUSTERIZER_FOR_ANY_GRID_SIZE
// FIXME the clusterChargeCut kernel should be rewritten to avoid relying on a predefined grid size
gridDim.x = MaxNumModules; // not needed in the kernel in this specific case
assert(blockIdx.x == 0);
for (; blockIdx.x < gridDim.x; ++blockIdx.x)
clusterChargeCut(
h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n);
resetGrid();
#endif // TODO_FIX_CLUSTERIZER_FOR_ANY_GRID_SIZE

#endif

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,6 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
// filled creates view
SiPixelDigisCUDA::DeviceConstView digiView{xx_.data(), yy_.data(), adc_.data(), moduleInd_.data(), clus_.data()};
assert(digiView.adc(0) != 0);
// not needed...
cms::cudacompat::resetGrid();
// we run on blockId.x==0
gpuPixelRecHits::getHits(&cpeView, &bsHost, &digiView, ndigi, &clusterView, output->view());
for (auto h = fc; h < lc; ++h)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ namespace gpuVertexFinder {
loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
cudaCheck(cudaGetLastError());
#else
cms::cudacompat::resetGrid();
init(soa, ws_d.get());
loadTracks(tksoa, soa, ws_d.get(), ptMin);
#endif
Expand Down Expand Up @@ -157,10 +156,9 @@ namespace gpuVertexFinder {
// std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
fitVertices(soa, ws_d.get(), 50.);
// one block per vertex!
blockIdx.x = 0;
gridDim.x = 1;
assert(blockIdx.x == 0);
fwyzard marked this conversation as resolved.
Show resolved Hide resolved
assert(gridDim.x == 1);
fwyzard marked this conversation as resolved.
Show resolved Hide resolved
splitVertices(soa, ws_d.get(), 9.f);
resetGrid();
fitVertices(soa, ws_d.get(), 5000.);
sortByPt2(soa, ws_d.get());
#endif
Expand Down
3 changes: 1 addition & 2 deletions RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,9 @@ int main() {
cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
#else
gridDim.x = 1;
assert(gridDim.x == 1);
assert(blockIdx.x == 0);
fwyzard marked this conversation as resolved.
Show resolved Hide resolved
splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
resetGrid();
nv = ws_d->nvIntermediate;
#endif
std::cout << "after split " << nv << std::endl;
Expand Down