Skip to content

Commit

Permalink
add timer by event recorder
Browse files Browse the repository at this point in the history
  • Loading branch information
xenshinu committed Sep 5, 2023
1 parent 10e7197 commit 1d5d7ab
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ verf
trace
ncu
nsys
profile_output*
workloads
1 change: 1 addition & 0 deletions gpu/hipify.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#define cudaEventRecord hipEventRecord
#define cudaEventQuery hipEventQuery
#define cudaEventDestroy hipEventDestroy
#define cudaEventElapsedTime hipEventElapsedTime
#define cudaCheck() { \
hipError_t err = hipGetLastError(); \
if (hipSuccess != err) { \
Expand Down
21 changes: 20 additions & 1 deletion gpu/plchain.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ void plchain_backtracking(hostMemPtr *host_mem, chain_read_t *reads, Misc misc,

uint16_t* p_hostmem = host_mem->p;
int32_t* f = host_mem->f;
// FIXME: DISABLED BACKTRACK, REMOVE THE RETURN HERE
return;
for (int i = 0; i < n_read; i++) {
int64_t* p;
KMALLOC(km, p, reads[i].n);
Expand Down Expand Up @@ -270,7 +272,9 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
cudaMemcpyDeviceToHost);
cudaMemcpy(&num_long_seg, dev_mem->d_long_seg_count, sizeof(unsigned int),
cudaMemcpyDeviceToHost);
fprintf(stderr, "[DEBUG] total segs: %lu, short:%lu mid: %u long: %u\n", cut_num, cut_num - num_mid_seg - num_long_seg, num_mid_seg, num_long_seg);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, stream_setup.streams[stream_id].startevent, stream_setup.streams[stream_id].cudaevent);
fprintf(stderr, "[DEBUG] total segs: %lu, short:%lu mid: %u long: %u, last launch runtime: %f ms\n", cut_num, cut_num - num_mid_seg - num_long_seg, num_mid_seg, num_long_seg, milliseconds);
#ifdef DEBUG_CHECK
// check range
int32_t* range = (int32_t*)malloc(sizeof(int32_t) * total_n);
Expand Down Expand Up @@ -309,6 +313,8 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
stream_setup.streams[stream_id].busy = false;
}

cudaEventRecord(stream_setup.streams[stream_id].startevent,
stream_setup.streams[stream_id].cudastream);
// size sanity check
size_t total_n = 0, cut_num = 0;
int griddim = 0;
Expand Down Expand Up @@ -505,6 +511,19 @@ void finish_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t*
chain_read_t* reads;
int n_read;
cudaStreamSynchronize(stream_setup.streams[t].cudastream);
deviceMemPtr* dev_mem = &stream_setup.streams[t].dev_mem;
size_t cut_num = stream_setup.streams[t].host_mem.cut_num;
// DEBUG: print seg numbers for each kernel

unsigned int num_mid_seg, num_long_seg;
cudaMemcpy(&num_mid_seg, dev_mem->d_mid_seg_count, sizeof(unsigned int),
cudaMemcpyDeviceToHost);
cudaMemcpy(&num_long_seg, dev_mem->d_long_seg_count, sizeof(unsigned int),
cudaMemcpyDeviceToHost);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, stream_setup.streams[t].startevent, stream_setup.streams[t].cudaevent);
fprintf(stderr, "[DEBUG] total segs: %lu, short:%lu mid: %u long: %u, last launch runtime: %f ms\n", cut_num, cut_num - num_mid_seg - num_long_seg, num_mid_seg, num_long_seg, milliseconds);

cudaCheck();
plchain_backtracking(&stream_setup.streams[t].host_mem,
stream_setup.streams[t].reads, misc, km);
Expand Down
1 change: 1 addition & 0 deletions gpu/plmem.cu
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ void plmem_stream_initialize(size_t *max_total_n_,
stream_setup.streams[i].busy = false;
cudaStreamCreate(&stream_setup.streams[i].cudastream);
cudaEventCreate(&stream_setup.streams[i].cudaevent);
cudaEventCreate(&stream_setup.streams[i].startevent);
cudaCheck();
plmem_malloc_host_mem(&stream_setup.streams[i].host_mem, max_anchors_stream,
max_range_grid);
Expand Down
2 changes: 1 addition & 1 deletion gpu/plmem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ typedef struct stream_ptr_t{
hostMemPtr host_mem;
deviceMemPtr dev_mem;
cudaStream_t cudastream;
cudaEvent_t cudaevent;
cudaEvent_t cudaevent, startevent;
bool busy = false;
} stream_ptr_t;

Expand Down
47 changes: 45 additions & 2 deletions gpu/plscore.cu
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,49 @@ __device__ int32_t comput_sc(const int32_t ai_x, const int32_t ai_y, const int32
/* Chaining-score DP over one segment of anchors, computed by a single
 * block/wavefront ("_one_wf").
 *
 * For every anchor index in [start_idx, end_idx) the score f[] is first
 * initialised to MM_QSPAN and the predecessor offset p[] to 0. Then the
 * anchors are walked sequentially; for anchor i, the threads of the block
 * cooperatively score the range[i] candidate successors i+1 .. i+range[i]
 * with comput_sc() and record the best extension in f[] / p[].
 *
 * anchors_x, anchors_y, sid : per-anchor coordinates and segment ids
 * range                     : per-anchor successor-window sizes
 * start_idx, end_idx        : half-open anchor range of this segment
 * f, p                      : output score / predecessor-offset arrays
 *
 * NOTE(review): the __syncthreads() barriers are commented out, so
 * correctness appears to rely on implicit intra-wavefront synchrony
 * (i.e. blockDim.x no larger than the warp/wavefront size, as the
 * "_one_wf" suffix suggests) — confirm against the launch configuration.
 *
 * FIX: removed the local `bid` (blockIdx.x) — it was assigned but never
 * used anywhere in this function.
 */
inline __device__ void compute_sc_seg_one_wf(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t* range,
                                size_t start_idx, size_t end_idx,
                                int32_t* f, uint16_t* p
){
    Misc blk_misc = misc;  // per-thread snapshot of the global chaining parameters
    int tid = threadIdx.x;
    // init f and p: every anchor starts as a chain containing only itself
    for (size_t i = start_idx + tid; i < end_idx; i += blockDim.x) {
        f[i] = MM_QSPAN;
        p[i] = 0;
    }
    // __syncthreads();  // disabled: see NOTE(review) in the header comment
    // assert(range[end_idx-1] == 0);
    for (size_t i = start_idx; i < end_idx; i++) {
        int32_t range_i = range[i];
        // if (range_i + i >= end_idx)
        //     printf("range_i %d i %lu start_idx %lu, end_idx %lu\n", range_i, i, start_idx, end_idx);
        // assert(range_i + i < end_idx);
        for (int32_t j = tid; j < range_i; j += blockDim.x) {
            int32_t sc = comput_sc(
                anchors_x[i+j+1],
                anchors_y[i+j+1],
                anchors_x[i],
                anchors_y[i],
                sid[i+j+1],
                sid[i],
                blk_misc.max_dist_x, blk_misc.max_dist_y, blk_misc.bw, blk_misc.chn_pen_gap,
                blk_misc.chn_pen_skip, blk_misc.is_cdna, blk_misc.n_seg);
            if (sc == INT32_MIN) continue;  // comput_sc signals "no chain" with INT32_MIN
            sc += f[i];
            // keep the best-scoring predecessor; ties resolved in favour of the new one
            if (sc >= f[i+j+1] && sc != MM_QSPAN) {
                f[i+j+1] = sc;
                p[i+j+1] = j+1;
            }
        }
        // __syncthreads();  // disabled: see NOTE(review) in the header comment
    }

}


inline __device__ void compute_sc_long_seg_one_wf(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t* range,
size_t start_idx, size_t end_idx,
int32_t* f, uint16_t* p
){
Misc blk_misc = misc;
int tid = threadIdx.x;
Expand Down Expand Up @@ -290,8 +333,8 @@ __global__ void score_generation_long(const int32_t* anchors_x, const int32_t* a

for(int segid = bid; segid < *long_seg_count; segid += gridDim.x){
seg_t seg = long_seg[segid];
compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
// compute_sc_long_seg_one_wf(anchors_x, anchors_y, range, seg.start_idx, seg.end_idx, f, p);
// compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
compute_sc_long_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
}
}
__global__ void score_generation_naive(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t *range,
Expand Down
8 changes: 4 additions & 4 deletions gpu_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"num_streams": 1,
"min_n": 512,
"//min_n": "queries with less anchors will be handled on cpu",
"max_total_n": 1268440000,
"max_read": 10120000,
"max_total_n": 1368440000,
"max_read": 15120000,
"avg_read_n": 20000,
"//avg_read_n": "expect average number of anchors per read",
"range_kernel": {
Expand All @@ -17,8 +17,8 @@
"short_blockdim": 64,
"long_blockdim": 64,
"mid_blockdim": 64,
"short_griddim": 2016,
"short_griddim": 16128,
"long_griddim": 2016,
"mid_griddim": 8064
"mid_griddim": 16128
}
}
1 change: 1 addition & 0 deletions map.c
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,7 @@ static void worker_for(void *_data, long i_in, int tid) // kt_for() callback
tr->is_full = 0;
tr->has_launched = 1;
// endof gpu kernel
// FIXME: This is wrong without sync stream
b->timers[MM_TIME_CHAIN] += realtime() - t1;
} else {
// cpu kernel
Expand Down

0 comments on commit 1d5d7ab

Please sign in to comment.