Skip to content

Commit

Permalink
Merge pull request #15 from hatakeyamak/PFRecHitAndCluster_GPU_12_5_dev
Browse files Browse the repository at this point in the history
Introduce buildDetIdMapKH. Fix customization.
  • Loading branch information
hatakeyamak authored Oct 7, 2022
2 parents 36240d9 + b755d72 commit 9d63cfe
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def customizeHcalPFOnlyForProfilingGPUOnly(process):
process.consume_step = cms.EndPath(process.consumer)

process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False)
process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False)
#process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False)
#process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False)

return process

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ namespace PFClustering {
namespace HCAL {
struct ConfigurationParameters {
uint32_t maxRH = 4000; // previously: 2000
uint32_t maxPFCFracs = 200000; // previously: 80000
uint32_t maxPFCFracs = 300000; // previously: 80000
uint32_t maxNeighbors = 8;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,9 @@ namespace PFRecHit {
uint32_t size,
uint32_t const* rh_detIdRef, // Reference table index -> detId
int* rh_inputToFullIdx, // Map for input rechit detId -> reference table index
int* rh_fullToInputIdx, // Map for reference table index -> input rechit index
int* rh_fullToInputIdx, // Map for reference table index -> input rechit index
uint32_t const* recHits_did) // Input rechit detIds
{
{

int first = blockIdx.x*blockDim.x + threadIdx.x;
for (int i = first; i < size; i += gridDim.x * blockDim.x) {
Expand All @@ -129,6 +129,46 @@ namespace PFRecHit {
}
}

__global__ void buildDetIdMapKH(
uint32_t size,
uint32_t const* rh_detIdRef, // Reference table index -> detId
int* rh_inputToFullIdx, // Map for input rechit detId -> reference table index
int* rh_fullToInputIdx, // Map for reference table index -> input rechit index
uint32_t const* recHits_did) // Input rechit detIds
{

int first = blockIdx.x*blockDim.x + threadIdx.x;
for (int i = first; i < size; i += gridDim.x * blockDim.x) {
auto detId = recHits_did[i];

// Get subdetector encoded in detId to narrow the range of reference table values to search
// cmssdt.cern.ch/lxr/source/DataFormats/DetId/interface/DetId.h#0048
uint32_t subdet = (detId >> DetId::kSubdetOffset) & DetId::kSubdetMask;
uint32_t minval, maxval;
if (subdet == HcalBarrel) {
minval = 0;
maxval = constantsGPU_d.nValidBarrelIds;
} else if (subdet == HcalEndcap) {
minval = constantsGPU_d.nValidEndcapIds;
maxval = (constantsGPU_d.nValidBarrelIds + constantsGPU_d.nValidEndcapIds);
} else {
printf("Rechit %u detId %u has invalid subdetector %u!\n", blockIdx.x, detId, subdet);
return;
}

// Search all valid rechits for matching detId
for (uint32_t j = minval; j < maxval; j += 1) {
if (detId == rh_detIdRef[j]) {
// Found it
rh_inputToFullIdx[i] = j; // Input rechit index -> reference table index
rh_fullToInputIdx[j] = i; // Reference table index -> input rechit index
return;
}
}
}

}

// Build detId map with 1 block per input rechit
// Searches by detId for the matching index in reference table
__global__ void buildDetIdMapPerBlock(
Expand Down Expand Up @@ -495,7 +535,7 @@ namespace PFRecHit {
cms::cuda::device::unique_ptr<uint32_t[]> d_nPFRHCleaned; // Number of cleaned PFRecHits
cms::cuda::host::unique_ptr<uint32_t[]> h_nPFRHOut;
cms::cuda::host::unique_ptr<uint32_t[]> h_nPFRHCleaned;

d_nPFRHOut = cms::cuda::make_device_unique<uint32_t[]>(sizeof(uint32_t) , cudaStream);
d_nPFRHCleaned = cms::cuda::make_device_unique<uint32_t[]>(sizeof(uint32_t) , cudaStream);

Expand Down Expand Up @@ -530,21 +570,22 @@ namespace PFRecHit {
#endif

// // First build the mapping for input rechits to reference table indices
buildDetIdMapPerBlock<<<nRHIn, 256, 0, cudaStream>>>(nRHIn,
persistentDataGPU.rh_detId.get(),
scratchDataGPU.rh_inputToFullIdx.get(),
scratchDataGPU.rh_fullToInputIdx.get(),
HBHERecHits_asInput.did.get());
cudaCheck(cudaGetLastError());

// First build the mapping for input rechits to reference table indices
// buildDetIdMapHackathon<<<(nRHIn + threadsPerBlock - 1)/threadsPerBlock, threadsPerBlock, 0, cudaStream>>>(nRHIn,
// buildDetIdMapPerBlock<<<nRHIn, 256, 0, cudaStream>>>(nRHIn,
// persistentDataGPU.rh_detId.get(),
// scratchDataGPU.rh_inputToFullIdx.get(),
// scratchDataGPU.rh_fullToInputIdx.get(),
// HBHERecHits_asInput.did.get());
// cudaCheck(cudaGetLastError());

// First build the mapping for input rechits to reference table indices
// buildDetIdMapHackathon<<<(nRHIn + threadsPerBlock - 1)/threadsPerBlock, threadsPerBlock, 0, cudaStream>>>(nRHIn,
buildDetIdMapKH<<<(nRHIn + threadsPerBlock - 1)/threadsPerBlock, threadsPerBlock, 0, cudaStream>>>(nRHIn,
persistentDataGPU.rh_detId.get(),
scratchDataGPU.rh_inputToFullIdx.get(),
scratchDataGPU.rh_fullToInputIdx.get(),
HBHERecHits_asInput.did.get());
cudaCheck(cudaGetLastError());


// Debugging function used to check the mapping of input index <-> reference table index
// testDetIdMap<<<(nRHIn + threadsPerBlock - 1)/threadsPerBlock, threadsPerBlock, 0, cudaStream>>>(nRHIn,
Expand All @@ -566,7 +607,7 @@ namespace PFRecHit {
// Apply PFRecHit threshold & quality tests

//applyQTests<<<(nRHIn+127)/128, 256, 0, cudaStream>>>(nRHIn, scratchDataGPU.rh_mask.get(), HBHERecHits_asInput.did.get(), HBHERecHits_asInput.energy.get());

applyDepthThresholdQTests<<<(nRHIn + threadsPerBlock - 1) / threadsPerBlock, threadsPerBlock, 0, cudaStream>>>(
nRHIn, scratchDataGPU.rh_mask.get(), HBHERecHits_asInput.did.get(), HBHERecHits_asInput.energy.get());
cudaCheck(cudaGetLastError());
Expand Down Expand Up @@ -595,7 +636,7 @@ namespace PFRecHit {
cudaEventElapsedTime(&timer[3], start, stop);
printf("\napplyMask took %f ms\n\n", timer[3]);
#endif

cms::cuda::copyAsync(h_nPFRHOut, d_nPFRHOut, sizeof(uint32_t), cudaStream);
cms::cuda::copyAsync(h_nPFRHCleaned, d_nPFRHCleaned, sizeof(uint32_t), cudaStream);

Expand Down

0 comments on commit 9d63cfe

Please sign in to comment.