Skip to content

Commit

Permalink
add timer by event recorder
Browse files Browse the repository at this point in the history
  • Loading branch information
xenshinu committed Sep 5, 2023
1 parent 10e7197 commit 1d5d7ab
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ verf
trace
ncu
nsys
profile_output*
workloads
1 change: 1 addition & 0 deletions gpu/hipify.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#define cudaEventRecord hipEventRecord
#define cudaEventQuery hipEventQuery
#define cudaEventDestroy hipEventDestroy
#define cudaEventElapsedTime hipEventElapsedTime
#define cudaCheck() { \
hipError_t err = hipGetLastError(); \
if (hipSuccess != err) { \
Expand Down
21 changes: 20 additions & 1 deletion gpu/plchain.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ void plchain_backtracking(hostMemPtr *host_mem, chain_read_t *reads, Misc misc,

uint16_t* p_hostmem = host_mem->p;
int32_t* f = host_mem->f;
// FIXME: DISABLED BACKTRACK, REMOVE THE RETURN HERE
return;
for (int i = 0; i < n_read; i++) {
int64_t* p;
KMALLOC(km, p, reads[i].n);
Expand Down Expand Up @@ -270,7 +272,9 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
cudaMemcpyDeviceToHost);
cudaMemcpy(&num_long_seg, dev_mem->d_long_seg_count, sizeof(unsigned int),
cudaMemcpyDeviceToHost);
fprintf(stderr, "[DEBUG] total segs: %lu, short:%lu mid: %u long: %u\n", cut_num, cut_num - num_mid_seg - num_long_seg, num_mid_seg, num_long_seg);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, stream_setup.streams[stream_id].startevent, stream_setup.streams[stream_id].cudaevent);
fprintf(stderr, "[DEBUG] total segs: %lu, short:%lu mid: %u long: %u, last launch runtime: %f ms\n", cut_num, cut_num - num_mid_seg - num_long_seg, num_mid_seg, num_long_seg, milliseconds);
#ifdef DEBUG_CHECK
// check range
int32_t* range = (int32_t*)malloc(sizeof(int32_t) * total_n);
Expand Down Expand Up @@ -309,6 +313,8 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
stream_setup.streams[stream_id].busy = false;
}

cudaEventRecord(stream_setup.streams[stream_id].startevent,
stream_setup.streams[stream_id].cudastream);
// size sanity check
size_t total_n = 0, cut_num = 0;
int griddim = 0;
Expand Down Expand Up @@ -505,6 +511,19 @@ void finish_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t*
chain_read_t* reads;
int n_read;
cudaStreamSynchronize(stream_setup.streams[t].cudastream);
deviceMemPtr* dev_mem = &stream_setup.streams[t].dev_mem;
size_t cut_num = stream_setup.streams[t].host_mem.cut_num;
// DEBUG: print seg numbers for each kernel

unsigned int num_mid_seg, num_long_seg;
cudaMemcpy(&num_mid_seg, dev_mem->d_mid_seg_count, sizeof(unsigned int),
cudaMemcpyDeviceToHost);
cudaMemcpy(&num_long_seg, dev_mem->d_long_seg_count, sizeof(unsigned int),
cudaMemcpyDeviceToHost);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, stream_setup.streams[t].startevent, stream_setup.streams[t].cudaevent);
fprintf(stderr, "[DEBUG] total segs: %lu, short:%lu mid: %u long: %u, last launch runtime: %f ms\n", cut_num, cut_num - num_mid_seg - num_long_seg, num_mid_seg, num_long_seg, milliseconds);

cudaCheck();
plchain_backtracking(&stream_setup.streams[t].host_mem,
stream_setup.streams[t].reads, misc, km);
Expand Down
1 change: 1 addition & 0 deletions gpu/plmem.cu
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ void plmem_stream_initialize(size_t *max_total_n_,
stream_setup.streams[i].busy = false;
cudaStreamCreate(&stream_setup.streams[i].cudastream);
cudaEventCreate(&stream_setup.streams[i].cudaevent);
cudaEventCreate(&stream_setup.streams[i].startevent);
cudaCheck();
plmem_malloc_host_mem(&stream_setup.streams[i].host_mem, max_anchors_stream,
max_range_grid);
Expand Down
2 changes: 1 addition & 1 deletion gpu/plmem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ typedef struct stream_ptr_t{
hostMemPtr host_mem;
deviceMemPtr dev_mem;
cudaStream_t cudastream;
cudaEvent_t cudaevent;
cudaEvent_t cudaevent, startevent;
bool busy = false;
} stream_ptr_t;

Expand Down
47 changes: 45 additions & 2 deletions gpu/plscore.cu
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,49 @@ __device__ int32_t comput_sc(const int32_t ai_x, const int32_t ai_y, const int32
/* Chaining-score DP over one segment of anchors, computed by a single
 * block/wavefront ("_one_wf").
 *
 * For every anchor index in [start_idx, end_idx) the score f[] is first
 * initialised to MM_QSPAN and the predecessor offset p[] to 0. Then the
 * anchors are walked sequentially; for anchor i, the threads of the block
 * cooperatively score the range[i] candidate successors i+1 .. i+range[i]
 * with comput_sc() and record the best extension in f[] / p[].
 *
 * anchors_x, anchors_y, sid : per-anchor coordinates and segment ids
 * range                     : per-anchor successor-window sizes
 * start_idx, end_idx        : half-open anchor range of this segment
 * f, p                      : output score / predecessor-offset arrays
 *
 * NOTE(review): the __syncthreads() barriers are commented out, so
 * correctness appears to rely on implicit intra-wavefront synchrony
 * (i.e. blockDim.x no larger than the warp/wavefront size, as the
 * "_one_wf" suffix suggests) — confirm against the launch configuration.
 *
 * FIX: removed the local `bid` (blockIdx.x) — it was assigned but never
 * used anywhere in this function.
 */
inline __device__ void compute_sc_seg_one_wf(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t* range,
                                size_t start_idx, size_t end_idx,
                                int32_t* f, uint16_t* p
){
    Misc blk_misc = misc;  // per-thread snapshot of the global chaining parameters
    int tid = threadIdx.x;
    // init f and p: every anchor starts as a chain containing only itself
    for (size_t i = start_idx + tid; i < end_idx; i += blockDim.x) {
        f[i] = MM_QSPAN;
        p[i] = 0;
    }
    // __syncthreads();  // disabled: see NOTE(review) in the header comment
    // assert(range[end_idx-1] == 0);
    for (size_t i = start_idx; i < end_idx; i++) {
        int32_t range_i = range[i];
        // if (range_i + i >= end_idx)
        //     printf("range_i %d i %lu start_idx %lu, end_idx %lu\n", range_i, i, start_idx, end_idx);
        // assert(range_i + i < end_idx);
        for (int32_t j = tid; j < range_i; j += blockDim.x) {
            int32_t sc = comput_sc(
                anchors_x[i+j+1],
                anchors_y[i+j+1],
                anchors_x[i],
                anchors_y[i],
                sid[i+j+1],
                sid[i],
                blk_misc.max_dist_x, blk_misc.max_dist_y, blk_misc.bw, blk_misc.chn_pen_gap,
                blk_misc.chn_pen_skip, blk_misc.is_cdna, blk_misc.n_seg);
            if (sc == INT32_MIN) continue;  // comput_sc signals "no chain" with INT32_MIN
            sc += f[i];
            // keep the best-scoring predecessor; ties resolved in favour of the new one
            if (sc >= f[i+j+1] && sc != MM_QSPAN) {
                f[i+j+1] = sc;
                p[i+j+1] = j+1;
            }
        }
        // __syncthreads();  // disabled: see NOTE(review) in the header comment
    }

}


inline __device__ void compute_sc_long_seg_one_wf(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t* range,
size_t start_idx, size_t end_idx,
int32_t* f, uint16_t* p
){
Misc blk_misc = misc;
int tid = threadIdx.x;
Expand Down Expand Up @@ -290,8 +333,8 @@ __global__ void score_generation_long(const int32_t* anchors_x, const int32_t* a

for(int segid = bid; segid < *long_seg_count; segid += gridDim.x){
seg_t seg = long_seg[segid];
compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
// compute_sc_long_seg_one_wf(anchors_x, anchors_y, range, seg.start_idx, seg.end_idx, f, p);
// compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
compute_sc_long_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
}
}
__global__ void score_generation_naive(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t *range,
Expand Down
8 changes: 4 additions & 4 deletions gpu_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"num_streams": 1,
"min_n": 512,
"//min_n": "queries with less anchors will be handled on cpu",
"max_total_n": 1268440000,
"max_read": 10120000,
"max_total_n": 1368440000,
"max_read": 15120000,
"avg_read_n": 20000,
"//avg_read_n": "expect average number of anchors per read",
"range_kernel": {
Expand All @@ -17,8 +17,8 @@
"short_blockdim": 64,
"long_blockdim": 64,
"mid_blockdim": 64,
"short_griddim": 2016,
"short_griddim": 16128,
"long_griddim": 2016,
"mid_griddim": 8064
"mid_griddim": 16128
}
}
1 change: 1 addition & 0 deletions map.c
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,7 @@ static void worker_for(void *_data, long i_in, int tid) // kt_for() callback
tr->is_full = 0;
tr->has_launched = 1;
// endof gpu kernel
// FIXME: This is wrong without sync stream
b->timers[MM_TIME_CHAIN] += realtime() - t1;
} else {
// cpu kernel
Expand Down

0 comments on commit 1d5d7ab

Please sign in to comment.