Update throughput calculation, a6000 config
joydddd committed Feb 9, 2024
1 parent 216f2b2 commit e1248fb
Showing 10 changed files with 126 additions and 34 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -1,5 +1,5 @@
CFLAGS= -O2 -g -DNDEBUG
CDEBUG_FLAGS= -g -Wall -Wextra -Wno-unused-parameter -Wno-unused-variable -Wno-sign-compare -Wno-unused-function -Wno-c++17-extensions -DDEBUG_PRINT -O2 -Wno-\#warnings #-O0 -DNDEBUG
CDEBUG_FLAGS= -g -DDEBUG_PRINT -O2 #-Wall -Wextra -Wno-unused-parameter -Wno-unused-variable -Wno-sign-compare -Wno-unused-function -Wno-c++17-extensions -Wno-\#warnings #-O0 -DNDEBUG
CPPFLAGS= -DHAVE_KALLOC -D__AMD_SPLIT_KERNELS__ # -Wno-unused-but-set-variable -Wno-unused-variable
CPPFLAGS+= $(if $(MICRO_BATCH),-DMICRO_BATCH=\($(MICRO_BATCH)\))
INCLUDES= -I .
27 changes: 27 additions & 0 deletions a6000_config.json
@@ -0,0 +1,27 @@
{
"//config is for": "aac cloud. Fits one batch + 5% x 4 long buffer avg_read_n 10k",
"num_streams": 1,
"min_n": 512,
"//min_n": "queries with less anchors will be handled on cpu",
"long_seg_buffer_size": 258880000,
"max_total_n": 893440000,
"max_read": 893440,
"avg_read_n": 20000,
"//avg_read_n": "expect average number of anchors per read, not used if max_total_n and max_read are specified",
"range_kernel": {
"blockdim": 512,
"cut_check_anchors": 10,
"//cut_check_anchors": "Number of anchors to check to attemp a cut",
"anchor_per_block": 32768,
"//anchor_per_block": "Number of anchors each block handle. Must be int * blockdim"
},
"score_kernel": {
"short_blockdim": 64,
"long_blockdim": 64,
"mid_blockdim": 64,
"//blockdim config": "options are not used: static config specified at compile time (make ... LONG_BLOCK_SIZE=1024)",
"short_griddim": 2688,
"long_griddim": 1024,
"mid_griddim": 2688
}
}
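
The range_kernel comments above encode a divisibility constraint: anchor_per_block must be a whole multiple of blockdim (here 32768 = 64 x 512). A minimal, illustrative C check of that relationship (not part of this commit; the field names are taken from the JSON above):

/* check_a6000_config.c -- illustrative only, not part of the repository */
#include <assert.h>
#include <stdio.h>

int main(void) {
    const int blockdim         = 512;    /* range_kernel.blockdim */
    const int anchor_per_block = 32768;  /* range_kernel.anchor_per_block */

    /* "Must be int * blockdim": each block handles a whole number of rows */
    assert(anchor_per_block % blockdim == 0);
    printf("anchors per block = %d rows x %d threads\n",
           anchor_per_block / blockdim, blockdim);   /* 64 rows x 512 threads */
    return 0;
}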
3 changes: 2 additions & 1 deletion gpu/gpu.mk
@@ -29,7 +29,7 @@ CUDATESTFLAG = -G
HIPCC = hipcc
HIPFLAGS = -DUSEHIP
HIPTESTFLAGS = -G -Rpass-analysis=kernel-resource-usage
LIBS += -L${ROCM_PATH}/lib -lroctx64 -lroctracer64
HIPLIBS = -L${ROCM_PATH}/lib -lroctx64 -lroctracer64

###################################################
############ DEBUG Options ###################
@@ -38,6 +38,7 @@ ifeq ($(GPU), AMD)
GPU_CC = $(HIPCC)
GPU_FLAGS = $(HIPFLAGS)
GPU_TESTFL = $(HIPTESTFLAGS)
LIBS += $(HIPLIBS)
else
GPU_CC = $(NVCC)
GPU_FLAGS = $(CUDAFLAGS)
65 changes: 51 additions & 14 deletions gpu/plchain.cu
@@ -424,6 +424,7 @@ void plchain_debug_analysis(stream_ptr_t stream){
chain_read_t* reads = stream.reads;
deviceMemPtr* dev_mem = &stream.dev_mem;
hostMemPtr* host_mem = &stream.host_mems[uid];
longMemPtr* long_mem = &stream.long_mem;
size_t cut_num = stream.host_mems[uid].cut_num;

unsigned int num_mid_seg, num_long_seg;
@@ -441,9 +442,11 @@ void plchain_debug_analysis(stream_ptr_t stream){
size_t* cut = (size_t*)malloc(sizeof(size_t) * cut_num);
cudaMemcpy(cut, dev_mem->d_cut, sizeof(size_t) * cut_num,
cudaMemcpyDeviceToHost);
seg_t* long_segs_og = (seg_t*)malloc(sizeof(seg_t) * num_long_seg);
cudaMemcpy(long_segs_og, dev_mem->d_long_seg_og, sizeof(seg_t) * num_long_seg,
seg_t* long_segs = (seg_t*)malloc(sizeof(seg_t) * num_long_seg);
cudaMemcpy(long_segs, dev_mem->d_long_seg, sizeof(seg_t) * num_long_seg,
cudaMemcpyDeviceToHost);
int32_t* long_range = (int32_t*)malloc(sizeof(int32_t) * *(long_mem->total_long_segs_n));
cudaMemcpy(long_range, dev_mem->d_range_long, sizeof(int32_t) * *(long_mem->total_long_segs_n), cudaMemcpyDeviceToHost);

// Calculate total workload (sc pairs)
size_t total_sc_pairs = 0;
@@ -452,8 +455,8 @@ void plchain_debug_analysis(stream_ptr_t stream){
}
size_t long_seg_sc_pairs = 0;
for (int long_seg_id = 0; long_seg_id < num_long_seg; long_seg_id++) {
for(size_t i = long_segs_og[long_seg_id].start_idx; i < long_segs_og[long_seg_id].end_idx; i++){
long_seg_sc_pairs += range[i];
for(size_t i = long_segs[long_seg_id].start_idx; i < long_segs[long_seg_id].end_idx; i++){
long_seg_sc_pairs += long_range[i];
}
}
fprintf(stderr, "[DEBUG] Total workload (sc pairs) in batch: %lu, in long segs %lu\n", total_sc_pairs, long_seg_sc_pairs);
@@ -462,7 +465,7 @@ void plchain_debug_analysis(stream_ptr_t stream){
// calculate kernel throughput
float long_kernel_runtime_ms = 0;
cudaEventElapsedTime(&long_kernel_runtime_ms, stream.long_kernel_event, stream.stopevent);
float long_kernel_througput = long_seg_sc_pairs / long_kernel_runtime_ms;
float long_kernel_througput = long_seg_sc_pairs / long_kernel_runtime_ms / (float)1000;
fprintf(stderr, "[DEBUG] Long Seg kernel throughput: %.2f Mpairs/s\n", long_kernel_througput);

// Check range w.r.t input (MAKE SURE INPUT RANGE EXISTS)
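
The hunk above appears to be the throughput fix named in the commit title: long_seg_sc_pairs / long_kernel_runtime_ms has units of pairs per millisecond, so the added division by 1000 is what makes the printed number actually match the "Mpairs/s" label. A standalone sketch of the conversion (illustrative values, not from the repository):

/* throughput_units.c -- illustrative only */
#include <stdio.h>

int main(void) {
    size_t sc_pairs   = 250000000;  /* hypothetical: 2.5e8 score pairs in long segs */
    float  runtime_ms = 125.0f;     /* hypothetical long-kernel runtime */

    /* pairs/ms -> Mpairs/s: 1 pair/ms = 1e3 pairs/s, and 1 Mpair = 1e6 pairs */
    float mpairs_per_s = sc_pairs / runtime_ms / 1000.0f;
    printf("%.2f Mpairs/s\n", mpairs_per_s);   /* prints 2000.00 Mpairs/s */
    return 0;
}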
@@ -524,9 +527,28 @@ int plchain_finish_batch(streamSetup_t stream_setup, int stream_id, Misc misc, v
#ifdef DEBUG_PRINT
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, stream_setup.streams[stream_id].startevent, stream_setup.streams[stream_id].stopevent);
fprintf(stderr, "[Info] %s (%s:%d) last launch runtime: %f ms\n", __func__, __FILE__, __LINE__, milliseconds);
// fprintf(stderr, "[Info] %s (%s:%d) last launch runtime: %f ms\n", __func__, __FILE__, __LINE__, milliseconds);
#endif

#ifdef DEBUG_PRINT
float long_kernel_runtime_ms = 0;
float short_kernel_runtime_ms[MICRO_BATCH];
cudaEventElapsedTime(&long_kernel_runtime_ms, stream_setup.streams[stream_id].long_kernel_event, stream_setup.streams[stream_id].stopevent);
float long_kernel_througput = *stream_setup.streams[stream_id].long_mem.total_long_segs_n / long_kernel_runtime_ms / (float)1000;
for (int uid = 0; uid < MICRO_BATCH; uid++){
cudaEventElapsedTime(&short_kernel_runtime_ms[uid], stream_setup.streams[stream_id].short_kernel_event[uid],
uid + 1 < MICRO_BATCH ? stream_setup.streams[stream_id].short_kernel_event[uid+1] : stream_setup.streams[stream_id].long_kernel_event);
}
fprintf(stderr, " %9.2f %%\n", (float)(*stream_setup.streams[stream_id].long_mem.total_long_segs_n)/stream_setup.long_seg_buffer_size_stream*100);
fprintf(stderr, "Runtime(s) = ");
for (int uid = 0; uid < MICRO_BATCH; uid++) fprintf(stderr, " %11.2f", short_kernel_runtime_ms[uid] / 1000);
fprintf(stderr, " %9.2f %%\n", long_kernel_runtime_ms / 1000);
fprintf(stderr, "Throutput (Ma/s) = ");
for (int uid = 0; uid < MICRO_BATCH; uid++) fprintf(stderr, " ");
fprintf(stderr, " %11.2f\n", long_kernel_througput);
fprintf(stderr, "[Info] %s finish (%s:%d) n_read %d long seg buffer usage %.2f%%. Long seg kernel throughput %.2f Manchors/s\n", __func__, __FILE__, __LINE__, n_reads, (float)(*stream_setup.streams[stream_id].long_mem.total_long_segs_n)/stream_setup.long_seg_buffer_size_stream*100, long_kernel_througput);
#endif // DEBUG_PRINT

// DEBUG: debug analysis that involves sychronizating the whole device
#if defined(DEBUG_CHECK)
plchain_debug_analysis(stream_setup.streams[stream_id]);
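
The new DEBUG_PRINT block above times each short-kernel microbatch as the interval between its cudaEvent and the next microbatch's event, falling back to long_kernel_event for the last one. A minimal, self-contained CUDA sketch of that event-pair timing pattern (illustrative only; the kernel and stage count are stand-ins, not the project's):

// event_timing_sketch.cu -- illustrative only
#include <cstdio>
#include <cuda_runtime.h>

__global__ void stage_kernel(int stage) { /* stand-in for a chaining kernel */ }

int main() {
    const int STAGES = 4;                 // stand-in for MICRO_BATCH
    cudaEvent_t ev[STAGES + 1];
    for (int i = 0; i <= STAGES; i++) cudaEventCreate(&ev[i]);

    cudaEventRecord(ev[0]);
    for (int s = 0; s < STAGES; s++) {
        stage_kernel<<<1, 64>>>(s);
        cudaEventRecord(ev[s + 1]);       // end of stage s == start of stage s+1
    }
    cudaEventSynchronize(ev[STAGES]);

    for (int s = 0; s < STAGES; s++) {
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, ev[s], ev[s + 1]);
        printf("stage %d: %.3f ms\n", s, ms);
    }
    return 0;
}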
@@ -550,12 +572,7 @@ int plchain_finish_batch(streamSetup_t stream_setup, int stream_id, Misc misc, v
// accumulate n_reads
n_reads += stream_setup.streams[stream_id].host_mems[uid].size;
}
#ifdef DEBUG_PRINT
float long_kernel_runtime_ms = 0;
cudaEventElapsedTime(&long_kernel_runtime_ms, stream_setup.streams[stream_id].long_kernel_event, stream_setup.streams[stream_id].stopevent);
float long_kernel_througput = *stream_setup.streams[stream_id].long_mem.total_long_segs_n / long_kernel_runtime_ms;
fprintf(stderr, "[Info] %s finish (%s:%d) n_read %d long seg buffer usage %.2f%%. Long seg kernel throughput %.2f Manchors/s\n", __func__, __FILE__, __LINE__, n_reads, (float)(*stream_setup.streams[stream_id].long_mem.total_long_segs_n)/stream_setup.long_seg_buffer_size_stream*100, long_kernel_througput);
#endif // DEBUG_PRINT

return n_reads;
}

@@ -592,8 +609,19 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str

stream_setup.streams[stream_id].reads = reads;
int read_start = 0;
#ifdef DEBUG_PRINT
fprintf(stderr, "----------------------------------------------------------------\n ");
for (int uid = 0; uid < MICRO_BATCH; uid++){
fprintf(stderr, " Short%d", uid);
}
fprintf(stderr, " Long\n");
fprintf(stderr, "----------------------------------------------------------------\n");
fprintf(stderr, "Mem Usage = ");
#endif
for (int uid = 0; uid < MICRO_BATCH; uid++) {
#ifdef USEHIP
roctxRangePushA("microbatch");
#endif
// decide the size of micro batch
size_t batch_n = 0;
int read_end = 0;
@@ -611,7 +639,9 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
cut_num += (reads[read_end].n - 1) / an_p_cut + 1;
}
#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] %s (%s:%d) MICROBATCH#%d batch_n %lu, read_start %d, read_end %d usage %.2f %%\n", __func__, __FILE__, __LINE__, uid, batch_n, read_start, read_end, (float)batch_n/stream_setup.max_anchors_stream*100);
// fprintf(stderr, "[Info] %s (%s:%d) MICROBATCH#%d batch_n %lu, read_start %d, read_end %d usage %.2f %%\n", __func__, __FILE__, __LINE__, uid, batch_n, read_start, read_end, (float)batch_n/stream_setup.max_anchors_stream*100);

fprintf(stderr, " %9.2f %%", (float)batch_n/stream_setup.max_anchors_stream*100);
#endif // DEBUG_PRINT
// // sanity check
// if (stream_setup.max_anchors_stream < total_n){
@@ -628,15 +658,20 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
assert(stream_setup.max_range_grid >= griddim);
assert(stream_setup.max_num_cut >= cut_num);
// work on micro batch
#ifdef USEHIP
roctxRangePushA("reorg");
#endif
// step1: reorg input
plmem_reorg_input_arr(reads + read_start, read_end - read_start,
&stream_setup.streams[stream_id].host_mems[uid],
range_kernel_config);
// step2: copy to device
#ifdef USEHIP
roctxRangePop();
#endif


cudaEventRecord(stream_setup.streams[stream_id].short_kernel_event[uid],
stream_setup.streams[stream_id].cudastream);
plmem_async_h2d_short_memcpy(&stream_setup.streams[stream_id], uid);
// step3: range selection
plrange_async_range_selection(&stream_setup.streams[stream_id].dev_mem,
@@ -648,7 +683,9 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
plmem_async_d2h_short_memcpy(&stream_setup.streams[stream_id], uid);
// update index
read_start = read_end;
#ifdef USEHIP
roctxRangePop();
#endif
}

cudaEventRecord(stream_setup.streams[stream_id].long_kernel_event,
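
The roctxRangePushA/roctxRangePop markers added in this file are wrapped in #ifdef USEHIP so CUDA builds, which do not link roctracer, still compile; on NVIDIA the markers are simply compiled out. An alternative sketch (illustrative only; assumes the roctracer and NVTX headers are available at the paths shown) that keeps markers on both backends behind one macro:

// profile_markers.h -- illustrative only, not part of this commit
#ifdef USEHIP
#include <roctracer/roctx.h>               // assumed header location for ROC-TX
#define PROFILE_PUSH(name) roctxRangePushA(name)
#define PROFILE_POP()      roctxRangePop()
#else
#include <nvToolsExt.h>                    // NVTX, shipped with the CUDA toolkit
#define PROFILE_PUSH(name) nvtxRangePushA(name)
#define PROFILE_POP()      nvtxRangePop()
#endif

// usage, mirroring the microbatch loop above:
//   PROFILE_PUSH("microbatch");
//   ... launch range/score kernels ...
//   PROFILE_POP();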
29 changes: 23 additions & 6 deletions gpu/plmem.cu
@@ -13,6 +13,14 @@

void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch,
int range_grid_size, size_t buffer_size_long) {
#ifdef DEBUG_PRINT
size_t host_mem_size;
host_mem_size = anchor_per_batch * (sizeof(int32_t) + sizeof(int32_t) +
sizeof(int8_t) + sizeof(int32_t) + sizeof(int32_t) + sizeof(uint16_t));
host_mem_size += range_grid_size * (sizeof(size_t) + sizeof(size_t) + sizeof(size_t));
fprintf(stderr, "[Info] Host Malloc Pinned Memory Size %.2f GB\n", (float)host_mem_size / OneG);
#endif

// data array
cudaMallocHost((void**)&host_mem->ax, anchor_per_batch * sizeof(int32_t));
cudaMallocHost((void**)&host_mem->ay, anchor_per_batch * sizeof(int32_t));
@@ -34,6 +42,12 @@ void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch,
}

void plmem_malloc_long_mem(longMemPtr *long_mem, size_t buffer_size_long) {
#ifdef DEBUG_PRINT
size_t host_mem_size;
host_mem_size = buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t);
host_mem_size += buffer_size_long * (sizeof(int32_t) + sizeof(uint16_t));
fprintf(stderr, "[Info] Host Malloc Pinned Memory Size %.2f GB (long seg)\n", (float)host_mem_size / OneG);
#endif
// data array
cudaMallocHost((void**)&long_mem->long_segs, buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t));
cudaMallocHost((void**)&long_mem->f_long, buffer_size_long * sizeof(int32_t));
@@ -46,6 +60,8 @@ void plmem_malloc_long_mem(longMemPtr *long_mem, size_t buffer_size_long) {
void plmem_free_host_mem(hostMemPtr *host_mem) {
cudaFreeHost(host_mem->ax);
cudaFreeHost(host_mem->ay);
cudaFreeHost(host_mem->sid);
cudaFreeHost(host_mem->xrev);
cudaFreeHost(host_mem->f);
cudaFreeHost(host_mem->p);
cudaFreeHost(host_mem->start_idx);
@@ -576,17 +592,18 @@ void plmem_stream_initialize(size_t *max_total_n_,
for (int j = 0; j < MICRO_BATCH; j++) {
plmem_malloc_host_mem(&stream_setup.streams[i].host_mems[j], max_anchors_stream,
max_range_grid, long_seg_buffer_size);
cudaEventCreate(&stream_setup.streams[i].short_kernel_event[j]);
}
// one stream has one long mem and one device mem
plmem_malloc_long_mem(&stream_setup.streams[i].long_mem, long_seg_buffer_size);
plmem_malloc_device_mem(&stream_setup.streams[i].dev_mem, max_anchors_stream,
max_range_grid, max_num_cut);
cudaMemsetAsync(&stream_setup.streams[i].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[i].cudastream);
cudaMemsetAsync(&stream_setup.streams[i].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[i].cudastream);
cudaMemsetAsync(&stream_setup.streams[i].dev_mem.d_total_n_long, 0, sizeof(size_t),
stream_setup.streams[i].cudastream);
cudaMemset(stream_setup.streams[i].dev_mem.d_long_seg_count, 0, sizeof(unsigned int));
cudaCheck();
cudaMemset(stream_setup.streams[i].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int));
cudaCheck();
cudaMemset(stream_setup.streams[i].dev_mem.d_total_n_long, 0, sizeof(size_t));
cudaCheck();
}

cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
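
The last plmem.cu hunk replaces cudaMemsetAsync(&...d_long_seg_count, ...) with cudaMemset(...d_long_seg_count, ...): the earlier form passed the address of the host-side pointer variable rather than the device pointer it holds, which is not a valid device address. A small illustrative sketch of the difference (not part of this commit):

// memset_counter_sketch.cu -- illustrative only
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    unsigned int *d_count = nullptr;
    cudaMalloc(&d_count, sizeof(unsigned int));

    cudaMemset(d_count, 0, sizeof(unsigned int));      // correct: d_count is a device pointer
    // cudaMemset(&d_count, 0, sizeof(unsigned int));  // wrong: &d_count is a host address

    unsigned int h_count = 42;
    cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    printf("counter after reset = %u\n", h_count);     // prints 0
    cudaFree(d_count);
    return 0;
}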
11 changes: 6 additions & 5 deletions gpu/plmem.cuh
@@ -74,11 +74,11 @@ typedef struct {

// cut
size_t *d_cut; // cut
unsigned int *d_long_seg_count;
seg_t *d_long_seg;
seg_t *d_long_seg_og;
unsigned int *d_mid_seg_count;
seg_t *d_mid_seg;
unsigned int *d_long_seg_count; // total number of long seg (aggregated accross micro batches)
seg_t *d_long_seg; // start & end idx of long segs in the long seg buffer (aggregated across micro batches)
seg_t *d_long_seg_og; // start & end idx of long seg in the micro batch. (aggregated accross micro batches)
unsigned int *d_mid_seg_count; // private to micro batch
seg_t *d_mid_seg; // private to micro batch

// long segement buffer
int32_t *d_ax_long, *d_ay_long;
@@ -97,6 +97,7 @@ typedef struct stream_ptr_t{
deviceMemPtr dev_mem;
cudaStream_t cudastream;
cudaEvent_t stopevent, startevent, long_kernel_event;
cudaEvent_t short_kernel_event[MICRO_BATCH];
bool busy = false;
} stream_ptr_t;

2 changes: 1 addition & 1 deletion gpu/plrange.cu
@@ -255,7 +255,7 @@ void plrange_async_range_selection(deviceMemPtr* dev_mem, cudaStream_t* stream)
dev_mem->d_range, dev_mem->d_cut, dev_mem->d_cut_start_idx, total_n, range_kernel_config);
cudaCheck();
#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] %s (%s:%d): Batch total_n %lu, Range Kernel Launched, grid %d cut %d\n", __func__, __FILE__, __LINE__, total_n, DimGrid.x, cut_num);
// fprintf(stderr, "[Info] %s (%s:%d): Batch total_n %lu, Range Kernel Launched, grid %d cut %d\n", __func__, __FILE__, __LINE__, total_n, DimGrid.x, cut_num);
#endif
}

16 changes: 10 additions & 6 deletions gpu/plscore.cu
@@ -290,16 +290,16 @@ __global__ void score_generation_short(
size_t long_seg_start_idx;
if (tid == 0) {
/* Allocate space in long seg buffer */
long_seg_start_idx = atomicAdd(total_n_long, end_idx - start_idx);
long_seg_start_idx = atomicAdd((unsigned long long int*)total_n_long, (unsigned long long int)end_idx - start_idx);
if (long_seg_start_idx + (end_idx - start_idx) >= buffer_size_long){ // long segement buffer is full
atomicSub(total_n_long, end_idx - start_idx); // rollback total_n_long
atomicAdd((unsigned long long int*)total_n_long, (unsigned long long int)end_idx - start_idx); // rollback total_n_long
long_seg_start_idx = SIZE_MAX;
// fallback to mid kernel
int mid_seg_idx = atomicAdd(mid_seg_count, 1);
int mid_seg_idx = atomicAdd((unsigned long long int*)mid_seg_count, 1);
mid_seg[mid_seg_idx].start_idx = start_idx;
mid_seg[mid_seg_idx].end_idx = end_idx;
} else {
int long_seg_idx = atomicAdd(long_seg_count, 1);
int long_seg_idx = atomicAdd((unsigned long long int*)long_seg_count, 1);
long_seg[long_seg_idx].start_idx = long_seg_start_idx;
long_seg[long_seg_idx].end_idx = long_seg_start_idx + (end_idx - start_idx);
long_seg_og[long_seg_idx].start_idx = start_idx;
@@ -312,7 +312,11 @@
}
}
// broadcast long_seg_start_idx to all scalar registers
#ifdef USEHIP
long_seg_start_idx = __builtin_amdgcn_readfirstlane(long_seg_start_idx);
#else
long_seg_start_idx = __shfl_sync(0xffffffff, long_seg_start_idx, 0);
#endif
if (long_seg_start_idx == SIZE_MAX)
continue; // failed to allocate long_seg buffer
for (uint64_t idx = tid; idx < end_idx - start_idx; idx += blockDim.x){
@@ -536,7 +540,7 @@ void plscore_async_short_mid_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* str
cudaCheck();

#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] %s (%s:%d) short mid score generation success\n", __func__, __FILE__, __LINE__);
// fprintf(stderr, "[Info] %s (%s:%d) short mid score kernel launched\n", __func__, __FILE__, __LINE__);
#endif

cudaCheck();
@@ -562,7 +566,7 @@ void plscore_async_long_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream)
cudaCheck();

#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] %s (%s:%d) long score generation success\n", __func__, __FILE__, __LINE__);
// fprintf(stderr, "[Info] %s (%s:%d) long score generation launched\n", __func__, __FILE__, __LINE__);
#endif

cudaCheck();
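
Two portability points in the plscore.cu hunks above: the atomicAdd calls are explicitly cast to select the unsigned long long overload, and the broadcast of lane 0's long_seg_start_idx now uses __builtin_amdgcn_readfirstlane on AMD versus __shfl_sync on NVIDIA. A minimal, self-contained sketch of that lane-0 broadcast pattern (illustrative only; the value and lane choices are stand-ins):

// lane0_broadcast_sketch.cu -- illustrative only
#include <cstdio>
#include <cuda_runtime.h>

__global__ void broadcast_demo(unsigned int *out) {
    int tid = threadIdx.x;
    unsigned int idx = 0;
    if (tid == 0) idx = 12345;                      // e.g. an offset returned by atomicAdd
#ifdef USEHIP
    idx = __builtin_amdgcn_readfirstlane(idx);      // AMD: read lane 0's value into a scalar
#else
    idx = __shfl_sync(0xffffffff, idx, 0);          // NVIDIA: warp shuffle from lane 0
#endif
    if (tid == 5) *out = idx;                       // every lane now holds lane 0's value
}

int main() {
    unsigned int *d_out = nullptr, h_out = 0;
    cudaMalloc(&d_out, sizeof(unsigned int));
    broadcast_demo<<<1, 32>>>(d_out);
    cudaMemcpy(&h_out, d_out, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    printf("lane 5 sees %u\n", h_out);              // prints 12345
    cudaFree(d_out);
    return 0;
}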
Empty file modified scripts/acc_integrated.slurm
100644 → 100755
Empty file.
5 changes: 5 additions & 0 deletions scripts/mbit10_integrated.sh
@@ -0,0 +1,5 @@
#!/bin/bash

make clean
make MICRO_BATCH=4 GPU=NV GPU_CONFIG=a6000_config.json SHORT_BLOCK_SIZE=32 LONG_BLOCK_SIZE=1024 MID_BLOCK_SIZE=512 MID_CUT=1 LONG_CUT=100 DEBUG_ANALYSIS=1
./minimap2 -t 1 --max-chain-skip=2147483647 data/hg38.mmi data/random_500MBases_90kto100k.fa #--gpu-chain
