diff --git a/.gitignore b/.gitignore
index fb58534d..8037a31a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,7 +16,7 @@ verf
 trace
 ncu
 nsys
-profile_output*
+*_output*
 workloads
 .cmake/**
 .depend
\ No newline at end of file
diff --git a/gpu/plchain.cu b/gpu/plchain.cu
index 895a268a..bf97ba5d 100644
--- a/gpu/plchain.cu
+++ b/gpu/plchain.cu
@@ -200,13 +200,14 @@ void plchain_cal_score_launch(chain_read_t **reads_, int *n_read_, Misc misc, st
     int stream_id = plchain_schedule_stream(stream_setup, batchid);
     if (stream_setup.streams[stream_id].busy) {
 #ifdef DEBUG_PRINT
-        fprintf(stderr, "[Info] %s (%s:%d) stream %d sync, total_n %lu\n", __func__, __FILE__, __LINE__, stream_id, stream_setup.streams[stream_id].host_mem.total_n);
+        fprintf(stderr, "[Info] %s (%s:%d) stream %d sync, total_n %lu\n", __func__, __FILE__, __LINE__, stream_id, stream_setup.streams[stream_id].host_mems[0].total_n);
 #endif  // DEBUG_PRINT
         // cleanup previous batch in the stream
-        plchain_backtracking(&stream_setup.streams[stream_id].host_mem,
+        plchain_backtracking(&stream_setup.streams[stream_id].host_mems[0],
                              stream_setup.streams[stream_id].reads, misc, km);
+        *reads_ = stream_setup.streams[stream_id].reads;
-        *n_read_ = stream_setup.streams[stream_id].host_mem.size;
+        *n_read_ = stream_setup.streams[stream_id].host_mems[0].size;
         stream_setup.streams[stream_id].busy = false;
     }
 
@@ -236,7 +237,7 @@ void plchain_cal_score_launch(chain_read_t **reads_, int *n_read_, Misc misc, st
     assert(stream_setup.max_num_cut >= cut_num);
 
     plmem_reorg_input_arr(reads, n_read,
-                          &stream_setup.streams[stream_id].host_mem,
+                          &stream_setup.streams[stream_id].host_mems[0],
                           range_kernel_config);
 
     plmem_async_h2d_memcpy(&stream_setup.streams[stream_id]);
@@ -416,11 +417,14 @@ void plchain_cal_sc_pair_density(size_t total_n, size_t num_cut, deviceMemPtr* d
 
 #ifdef DEBUG_CHECK
 void plchain_debug_analysis(stream_ptr_t stream){
-    size_t total_n = stream.host_mem.total_n;
+    // TODO: analyze all pending host mems, not just the current one
+    // TODO: this needs to be recalculated
+    size_t uid = 0;
+    size_t total_n = stream.host_mems[uid].total_n;
     chain_read_t* reads = stream.reads;
     deviceMemPtr* dev_mem = &stream.dev_mem;
-    hostMemPtr* host_mem = &stream.host_mem;
-    size_t cut_num = stream.host_mem.cut_num;
+    hostMemPtr* host_mem = &stream.host_mems[uid];
+    size_t cut_num = stream.host_mems[uid].cut_num;
 
     unsigned int num_mid_seg, num_long_seg;
     cudaMemcpy(&num_mid_seg, dev_mem->d_mid_seg_count, sizeof(unsigned int),
@@ -466,7 +470,7 @@ void plchain_debug_analysis(stream_ptr_t stream){
 #if defined(DEBUG_VERBOSE) && 0
     int32_t* ax = (int32_t*) malloc(sizeof(int32_t) * dev_mem->buffer_size_long);
     cudaMemcpy(ax, dev_mem->d_ax_long, sizeof(int32_t) * dev_mem->buffer_size_long, cudaMemcpyDeviceToHost);
-    debug_print_segs(host_mem->long_segs, reads, host_mem->long_segs_num, stream.host_mem.size);
+    debug_print_segs(host_mem->long_segs, reads, host_mem->long_segs_num, stream.host_mems[uid].size);
     debug_check_anchors(host_mem->long_segs, host_mem->long_segs_num, ax, host_mem->ax);
 #endif
 
@@ -493,6 +497,7 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
     chain_read_t* reads = *reads_;
     *reads_ = NULL;
     int n_read = *n_read_;
+    fprintf(stderr, "[Debug] %s (%s:%d) n_read %d\n", __func__, __FILE__, __LINE__, n_read);
     *n_read_ = 0;
     /* sync stream and process previous batch */
     int stream_id = thread_id;
@@ -500,67 +505,106 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
         cudaStreamSynchronize(stream_setup.streams[stream_id].cudastream);
 #ifdef DEBUG_PRINT
-    float milliseconds = 0;
-    cudaEventElapsedTime(&milliseconds, stream_setup.streams[stream_id].startevent, stream_setup.streams[stream_id].cudaevent);
-    fprintf(stderr, "[Info] %s (%s:%d) last launch runtime: %f ms\n", __func__, __FILE__, __LINE__, milliseconds);
+        float milliseconds = 0;
+        cudaEventElapsedTime(&milliseconds, stream_setup.streams[stream_id].startevent, stream_setup.streams[stream_id].cudaevent);
+        fprintf(stderr, "[Info] %s (%s:%d) last launch runtime: %f ms\n", __func__, __FILE__, __LINE__, milliseconds);
 #endif
         // DEBUG: debug analysis that involves synchronizing the whole device
 #if defined(DEBUG_CHECK)
         plchain_debug_analysis(stream_setup.streams[stream_id]);
 #endif  // DEBUG_CHECK
+        // reset values
+        cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
+                        stream_setup.streams[stream_id].cudastream);
+        cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
+                        stream_setup.streams[stream_id].cudastream);
+        cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_total_n_long, 0, sizeof(size_t),
+                        stream_setup.streams[stream_id].cudastream);
+        seg_t* long_segs = stream_setup.streams[stream_id].long_mem.long_segs;
+        size_t long_seg_idx = 0;
+        for (int uid = 0; uid < MICRO_BATCH; uid++) {
+            // reorg long segs back into each host mem ptr
+            // NOTE: this is the number of long segs up to and including this microbatch
+            size_t long_segs_num = stream_setup.streams[stream_id].host_mems[uid].long_segs_num;
+            for (; long_seg_idx < long_segs_num; long_seg_idx++) {
+                // TODO: write long_segs + long_seg_idx to f/p
+            }
+
+            // backtrack after p/f is copied
+            plchain_backtracking(&stream_setup.streams[stream_id].host_mems[uid],
+                                stream_setup.streams[stream_id].reads + *n_read_, misc, km);
+            // accumulate n_reads
+            *n_read_ += stream_setup.streams[stream_id].host_mems[uid].size;
+        }
 
-        // cleanup previous batch in the stream
-        plchain_backtracking(&stream_setup.streams[stream_id].host_mem,
-                             stream_setup.streams[stream_id].reads, misc, km);
         *reads_ = stream_setup.streams[stream_id].reads;
-        *n_read_ = stream_setup.streams[stream_id].host_mem.size;
+        fprintf(stderr, "[Debug] %s finish (%s:%d) n_read %d\n", __func__, __FILE__, __LINE__, *n_read_);
         stream_setup.streams[stream_id].busy = false;
     }
 
     cudaEventRecord(stream_setup.streams[stream_id].startevent,
                     stream_setup.streams[stream_id].cudastream);
-    // size sanity check
-    size_t total_n = 0, cut_num = 0;
-    int griddim = 0;
+    size_t total_n = 0;
     for (int i = 0; i < n_read; i++) {
         total_n += reads[i].n;
-        int an_p_block = range_kernel_config.anchor_per_block;
-        int an_p_cut = range_kernel_config.blockdim;
-        int block_num = (reads[i].n - 1) / an_p_block + 1;
-        griddim += block_num;
-        cut_num += (reads[i].n - 1) / an_p_cut + 1;
-    }
-    if (stream_setup.max_anchors_stream < total_n){
-        fprintf(stderr, "max_anchors_stream %lu total_n %lu n_read %d\n",
-                stream_setup.max_anchors_stream, total_n, n_read);
-    }
+    }  // compute total_n first
 
-    if (stream_setup.max_range_grid < griddim) {
-        fprintf(stderr, "max_range_grid %d griddim %d, total_n %lu n_read %d\n",
-                stream_setup.max_range_grid, griddim, total_n, n_read);
-    }
+    stream_setup.streams[stream_id].reads = reads;
+    int read_start = 0;
+    for (int uid = 0; uid < MICRO_BATCH; uid++) {
+        // decide the size of this micro batch
+        size_t batch_n = 0;
+        int read_end = 0;
+        size_t cut_num = 0;
+        int griddim = 0;
+        for (read_end = read_start; read_end < n_read; read_end++) {
+            if (batch_n > (total_n - 1) / MICRO_BATCH + 1) {
+                break;
+            }
+            batch_n += reads[read_end].n;
+            int an_p_block = range_kernel_config.anchor_per_block;
+            int an_p_cut = range_kernel_config.blockdim;
+            int block_num = (reads[read_end].n - 1) / an_p_block + 1;
+            griddim += block_num;
+            cut_num += (reads[read_end].n - 1) / an_p_cut + 1;
+        }
+        fprintf(stderr, "[Debug] %s (%s:%d) batch_n %lu, read_start %d, read_end %d\n", __func__, __FILE__, __LINE__, batch_n, read_start, read_end);
+        // sanity check
+        if (stream_setup.max_anchors_stream < total_n){
+            fprintf(stderr, "max_anchors_stream %lu total_n %lu n_read %d\n",
+                    stream_setup.max_anchors_stream, total_n, n_read);
+        }
 
-    assert(stream_setup.max_anchors_stream >= total_n);
-    assert(stream_setup.max_range_grid >= griddim);
-    assert(stream_setup.max_num_cut >= cut_num);
+        if (stream_setup.max_range_grid < griddim) {
+            fprintf(stderr, "max_range_grid %d griddim %d, total_n %lu n_read %d\n",
+                    stream_setup.max_range_grid, griddim, total_n, n_read);
+        }
 
-    plmem_reorg_input_arr(reads, n_read,
-                          &stream_setup.streams[stream_id].host_mem,
+        assert(stream_setup.max_anchors_stream >= total_n);
+        assert(stream_setup.max_range_grid >= griddim);
+        assert(stream_setup.max_num_cut >= cut_num);
+        // work on micro batch
+        plmem_reorg_input_arr(reads + read_start, read_end - read_start,
+                          &stream_setup.streams[stream_id].host_mems[uid],
                           range_kernel_config);
 
-    plmem_async_h2d_memcpy(&stream_setup.streams[stream_id]);
-    plrange_async_range_selection(&stream_setup.streams[stream_id].dev_mem,
-                                  &stream_setup.streams[stream_id].cudastream);
-    // plscore_async_naive_forward_dp(&stream_setup.streams[stream_id].dev_mem,
-    //                                &stream_setup.streams[stream_id].cudastream);
-    plscore_async_long_short_forward_dp(&stream_setup.streams[stream_id].dev_mem,
+        plmem_async_h2d_short_memcpy(&stream_setup.streams[stream_id], uid);
+        plrange_async_range_selection(&stream_setup.streams[stream_id].dev_mem,
+                                  &stream_setup.streams[stream_id].cudastream);
+        plscore_async_short_mid_forward_dp(&stream_setup.streams[stream_id].dev_mem,
+                                  &stream_setup.streams[stream_id].cudastream);
+        plmem_async_d2h_short_memcpy(&stream_setup.streams[stream_id], uid);
+        // update index
+        read_start = read_end;
+    }
+
+    plscore_async_long_forward_dp(&stream_setup.streams[stream_id].dev_mem,
                                  &stream_setup.streams[stream_id].cudastream);
-    plmem_async_d2h_memcpy(&stream_setup.streams[stream_id]);
+    plmem_async_d2h_long_memcpy(&stream_setup.streams[stream_id]);
 
     cudaEventRecord(stream_setup.streams[stream_id].cudaevent,
                     stream_setup.streams[stream_id].cudastream);
     stream_setup.streams[stream_id].busy = true;
-    stream_setup.streams[stream_id].reads = reads;
     cudaCheck();
 }
 
@@ -710,7 +754,7 @@ void finish_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t*
     }
 
     chain_read_t* reads;
-    int n_read;
+    int n_read = 0;
     cudaStreamSynchronize(stream_setup.streams[t].cudastream);
     cudaCheck();
 
@@ -725,15 +769,39 @@ void finish_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t*
         plchain_debug_analysis(stream_setup.streams[t]);
 #endif  // DEBUG_CHECK
 
-    plchain_backtracking(&stream_setup.streams[t].host_mem,
-                        stream_setup.streams[t].reads, misc, km);
+    // TODO: backtrack multiple pending batches
+    // reset values
+    cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
+                    stream_setup.streams[t].cudastream);
+    cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
+                    stream_setup.streams[t].cudastream);
+    cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_total_n_long, 0, sizeof(size_t),
+                    stream_setup.streams[t].cudastream);
+    seg_t* long_segs = stream_setup.streams[t].long_mem.long_segs;
+    size_t long_seg_idx = 0;
+    for (int uid = 0; uid < MICRO_BATCH; uid++) {
+        // reorg long segs back into each host mem ptr
+        // NOTE: this is the number of long segs up to and including this microbatch
+        size_t long_segs_num = stream_setup.streams[t].host_mems[uid].long_segs_num;
+        for (; long_seg_idx < long_segs_num; long_seg_idx++) {
+            // TODO: write long_segs + long_seg_idx to f/p
+        }
+
+        // backtrack after p/f is copied (offset by the reads already processed)
+        plchain_backtracking(&stream_setup.streams[t].host_mems[uid],
+                            stream_setup.streams[t].reads + n_read, misc, km);
+        // accumulate n_reads
+        n_read += stream_setup.streams[t].host_mems[uid].size;
+    }
 
     reads = stream_setup.streams[t].reads;
-    n_read = stream_setup.streams[t].host_mem.size;
+    fprintf(stderr, "[Debug] %s finish (%s:%d) n_read %d\n", __func__, __FILE__, __LINE__, n_read);
+    stream_setup.streams[t].busy = false;
+
     for (int i = 0; i < n_read; i++) {
         post_chaining_helper(mi, opt, &reads[i], misc, km);
     }
     stream_setup.streams[t].busy = false;
-
+    // FIXME: return an array of reads
     *reads_ = reads;
     *n_read_ = n_read;
 
@@ -742,7 +810,9 @@ void free_stream_gpu(int n_threads){
     for (int t = 0; t < n_threads; t++){
-        plmem_free_host_mem(&stream_setup.streams[t].host_mem);
+        for (int i = 0; i < MICRO_BATCH; i++){
+            plmem_free_host_mem(&stream_setup.streams[t].host_mems[i]);
+        }
         plmem_free_device_mem(&stream_setup.streams[t].dev_mem);
     }
 #ifdef DEBUG_PRINT
diff --git a/gpu/plmem.cu b/gpu/plmem.cu
index 833ee672..9a4c7f76 100644
--- a/gpu/plmem.cu
+++ b/gpu/plmem.cu
@@ -21,9 +21,17 @@ void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch,
     cudaMallocHost((void**)&host_mem->start_idx, range_grid_size * sizeof(size_t));
     cudaMallocHost((void**)&host_mem->read_end_idx, range_grid_size * sizeof(size_t));
     cudaMallocHost((void**)&host_mem->cut_start_idx, range_grid_size * sizeof(size_t));
-    cudaMallocHost((void**)&host_mem->long_segs, buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t));
-    cudaMallocHost((void**)&host_mem->f_long, buffer_size_long * sizeof(int32_t));
-    cudaMallocHost((void**)&host_mem->p_long, buffer_size_long * sizeof(uint16_t));
+    // cudaMallocHost((void**)&host_mem->long_segs, buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t));
+    // cudaMallocHost((void**)&host_mem->f_long, buffer_size_long * sizeof(int32_t));
+    // cudaMallocHost((void**)&host_mem->p_long, buffer_size_long * sizeof(uint16_t));
+    cudaCheck();
+}
+
+void plmem_malloc_long_mem(longMemPtr *long_mem, size_t buffer_size_long) {
+    // data array
+    cudaMallocHost((void**)&long_mem->long_segs, buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t));
+    cudaMallocHost((void**)&long_mem->f_long, buffer_size_long * sizeof(int32_t));
+    cudaMallocHost((void**)&long_mem->p_long, buffer_size_long * sizeof(uint16_t));
     cudaCheck();
 }
 
@@ -68,7 +76,7 @@ void plmem_malloc_device_mem(deviceMemPtr *dev_mem, size_t anchor_per_batch, int
     cudaMalloc(&dev_mem->d_range_long, dev_mem->buffer_size_long * sizeof(int32_t));
     cudaMalloc(&dev_mem->d_total_n_long, sizeof(size_t));
     cudaMalloc(&dev_mem->d_f_long, sizeof(int32_t) * dev_mem->buffer_size_long);
-    cudaMalloc(&dev_mem->d_p_long, sizeof(uint16_t) * dev_mem->buffer_size_long);
+    cudaMalloc(&dev_mem->d_p_long, sizeof(uint16_t) * dev_mem->buffer_size_long);
     cudaCheck();
 }
 
@@ -154,8 +162,47 @@ void plmem_reorg_input_arr(chain_read_t *reads, int n_read,
     host_mem->griddim = griddim;
 }
 
+void plmem_async_h2d_short_memcpy(stream_ptr_t* stream_ptrs, size_t uid) {
+    hostMemPtr *host_mem = &stream_ptrs->host_mems[uid];
+    deviceMemPtr *dev_mem = &stream_ptrs->dev_mem;
+    cudaStream_t *stream = &stream_ptrs->cudastream;
+    cudaMemcpyAsync(dev_mem->d_ax, host_mem->ax,
+                    sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice,
+                    *stream);
+    cudaMemcpyAsync(dev_mem->d_ay, host_mem->ay,
+                    sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice,
+                    *stream);
+    cudaMemcpyAsync(dev_mem->d_sid, host_mem->sid,
+                    sizeof(int8_t) * host_mem->total_n, cudaMemcpyHostToDevice,
+                    *stream);
+    cudaMemcpyAsync(dev_mem->d_xrev, host_mem->xrev,
+                    sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice,
+                    *stream);
+    cudaMemcpyAsync(dev_mem->d_start_idx, host_mem->start_idx,
+                    sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice,
+                    *stream);
+    cudaMemcpyAsync(dev_mem->d_read_end_idx, host_mem->read_end_idx,
+                    sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice,
+                    *stream);
+    cudaMemcpyAsync(dev_mem->d_cut_start_idx, host_mem->cut_start_idx,
+                    sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice,
+                    *stream);
+    cudaMemsetAsync(dev_mem->d_cut, 0xff,
+                    sizeof(size_t) * host_mem->cut_num, *stream);
+    cudaMemsetAsync(dev_mem->d_f, 0, sizeof(int32_t) * host_mem->total_n,
+                    *stream);
+    cudaMemsetAsync(dev_mem->d_p, 0, sizeof(uint16_t) * host_mem->total_n,
+                    *stream);
+    cudaCheck();
+    dev_mem->total_n = host_mem->total_n;
+    dev_mem->num_cut = host_mem->cut_num;
+    dev_mem->size = host_mem->size;
+    dev_mem->griddim = host_mem->griddim;
+}
+
 void plmem_async_h2d_memcpy(stream_ptr_t* stream_ptrs) {
-    hostMemPtr *host_mem = &stream_ptrs->host_mem;
+    size_t uid = 0;
+    hostMemPtr *host_mem = &stream_ptrs->host_mems[uid];
     deviceMemPtr *dev_mem = &stream_ptrs->dev_mem;
     cudaStream_t *stream = &stream_ptrs->cudastream;
     cudaMemcpyAsync(dev_mem->d_ax, host_mem->ax,
@@ -216,7 +263,9 @@ void plmem_sync_h2d_memcpy(hostMemPtr *host_mem, deviceMemPtr *dev_mem) {
 }
 
 void plmem_async_d2h_memcpy(stream_ptr_t *stream_ptrs) {
-    hostMemPtr *host_mem = &stream_ptrs->host_mem;
+    size_t uid = 0;
+    hostMemPtr *host_mem = &stream_ptrs->host_mems[uid];
+    longMemPtr *long_mem = &stream_ptrs->long_mem;
     deviceMemPtr *dev_mem = &stream_ptrs->dev_mem;
     cudaStream_t *stream = &stream_ptrs->cudastream;
     cudaMemcpyAsync(host_mem->f, dev_mem->d_f,
@@ -225,14 +274,49 @@ void plmem_async_d2h_memcpy(stream_ptr_t *stream_ptrs) {
     cudaMemcpyAsync(host_mem->p, dev_mem->d_p,
                     sizeof(uint16_t) * host_mem->total_n,
                     cudaMemcpyDeviceToHost, *stream);
-    cudaMemcpyAsync(host_mem->long_segs, dev_mem->d_long_seg_og,
+    cudaMemcpyAsync(long_mem->long_segs, dev_mem->d_long_seg_og,
                     dev_mem->buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t),
                     cudaMemcpyDeviceToHost, *stream);
     cudaMemcpyAsync(&host_mem->long_segs_num, dev_mem->d_long_seg_count,
                     sizeof(unsigned int), cudaMemcpyDeviceToHost, *stream);
-    cudaMemcpyAsync(host_mem->f_long, dev_mem->d_f_long, sizeof(int32_t)*dev_mem->buffer_size_long,
+    cudaMemcpyAsync(long_mem->f_long, dev_mem->d_f_long, sizeof(int32_t)*dev_mem->buffer_size_long,
+                    cudaMemcpyDeviceToHost, *stream);
+    cudaMemcpyAsync(long_mem->p_long, dev_mem->d_p_long, sizeof(uint16_t)*dev_mem->buffer_size_long,
+                    cudaMemcpyDeviceToHost, *stream);
+    cudaCheck();
+}
+
+void plmem_async_d2h_short_memcpy(stream_ptr_t *stream_ptrs, size_t uid) {
+    hostMemPtr *host_mem = &stream_ptrs->host_mems[uid];
+    deviceMemPtr *dev_mem = &stream_ptrs->dev_mem;
+    cudaStream_t *stream = &stream_ptrs->cudastream;
+    // TODO: aggregate f and p
+    cudaMemcpyAsync(host_mem->f, dev_mem->d_f,
+                    sizeof(int32_t) * host_mem->total_n, cudaMemcpyDeviceToHost,
+                    *stream);
+    cudaMemcpyAsync(host_mem->p, dev_mem->d_p,
+                    sizeof(uint16_t) * host_mem->total_n,
+                    cudaMemcpyDeviceToHost, *stream);
+    // copy back d_long_seg_count into long_segs_num; this is a cumulative value
+    cudaMemcpyAsync(&host_mem->long_segs_num, dev_mem->d_long_seg_count,
+                    sizeof(unsigned int), cudaMemcpyDeviceToHost, *stream);
+    cudaCheck();
+}
+
+void plmem_async_d2h_long_memcpy(stream_ptr_t *stream_ptrs) {
+    // TODO: aggregate uids
+    // size_t uid = 0;  // unused for now: all micro batches share one long buffer
+    longMemPtr *long_mem = &stream_ptrs->long_mem;
+    deviceMemPtr *dev_mem = &stream_ptrs->dev_mem;
+    cudaStream_t *stream = &stream_ptrs->cudastream;
+    cudaMemcpyAsync(long_mem->long_segs, dev_mem->d_long_seg_og,
+                    dev_mem->buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t),
+                    cudaMemcpyDeviceToHost, *stream);
+    // cudaMemcpyAsync(&long_mem->total_long_segs_num, dev_mem->d_long_seg_count,
+    //                 sizeof(unsigned int), cudaMemcpyDeviceToHost, *stream);
+    cudaMemcpyAsync(long_mem->f_long, dev_mem->d_f_long, sizeof(int32_t)*dev_mem->buffer_size_long,
                     cudaMemcpyDeviceToHost, *stream);
-    cudaMemcpyAsync(host_mem->p_long, dev_mem->d_p_long, sizeof(uint16_t)*dev_mem->buffer_size_long,
+    cudaMemcpyAsync(long_mem->p_long, dev_mem->d_p_long, sizeof(uint16_t)*dev_mem->buffer_size_long,
                     cudaMemcpyDeviceToHost, *stream);
     cudaCheck();
 }
@@ -457,10 +541,21 @@ void plmem_stream_initialize(size_t *max_total_n_,
     cudaEventCreate(&stream_setup.streams[i].startevent);
     cudaCheck();
     stream_setup.streams[i].dev_mem.buffer_size_long = long_seg_buffer_size;
-    plmem_malloc_host_mem(&stream_setup.streams[i].host_mem, max_anchors_stream,
+    // one stream has multiple host mems
+    for (int j = 0; j < MICRO_BATCH; j++) {
+        plmem_malloc_host_mem(&stream_setup.streams[i].host_mems[j], max_anchors_stream,
                           max_range_grid, long_seg_buffer_size);
+    }
+    // one stream has one long mem and one device mem
+    plmem_malloc_long_mem(&stream_setup.streams[i].long_mem, long_seg_buffer_size);
     plmem_malloc_device_mem(&stream_setup.streams[i].dev_mem, max_anchors_stream,
                             max_range_grid, max_num_cut);
+    cudaMemsetAsync(stream_setup.streams[i].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
+                    stream_setup.streams[i].cudastream);
+    cudaMemsetAsync(stream_setup.streams[i].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
+                    stream_setup.streams[i].cudastream);
+    cudaMemsetAsync(stream_setup.streams[i].dev_mem.d_total_n_long, 0, sizeof(size_t),
+                    stream_setup.streams[i].cudastream);
     }
 
     *max_total_n_ = max_anchors_stream;
@@ -475,7 +570,10 @@ void plmem_stream_cleanup() {
         cudaStreamDestroy(stream_setup.streams[i].cudastream);
         cudaEventDestroy(stream_setup.streams[i].cudaevent);
         cudaCheck();
-        plmem_free_host_mem(&stream_setup.streams[i].host_mem);
+        // free multiple host mems
+        for (int j = 0; j < MICRO_BATCH; j++) {
+            plmem_free_host_mem(&stream_setup.streams[i].host_mems[j]);
+        }
         plmem_free_device_mem(&stream_setup.streams[i].dev_mem);
     }
     delete[] stream_setup.streams;
diff --git a/gpu/plmem.cuh b/gpu/plmem.cuh
index 095cb72a..efd99f01 100644
--- a/gpu/plmem.cuh
+++ b/gpu/plmem.cuh
@@ -5,6 +5,7 @@
 #include "plutils.h"
 
 #define MEM_GPU (16-4) // 16 - 4 GB as memory pool = 16760832(0xffc000) KB
+#define MICRO_BATCH 4
 
 typedef struct {
     int index;  // read index / batch index
@@ -23,10 +24,8 @@ typedef struct {
     uint16_t *p;       // predecessor
 
     // array size: number of cuts in the batch / long_seg_cut
-    seg_t *long_segs;
+    // total number of long segs up to and including this batch
     unsigned int long_segs_num;
-    int32_t *f_long;   // score for long segs
-    uint16_t *p_long;  // predecessor for long segs
 
     // start index for each block in range selection
     /***** range selection block assignment
@@ -43,6 +42,14 @@ typedef struct {
     size_t *cut_start_idx;
 } hostMemPtr;
 
+typedef struct {
+    // array size: number of cuts in the batch / long_seg_cut
+    seg_t *long_segs;
+    unsigned int total_long_segs_num;  // sum of micro batch long_segs_num
+    int32_t *f_long;   // score for long segs
+    uint16_t *p_long;  // predecessor for long segs
+} longMemPtr;
+
 typedef struct {
     int size;
     int griddim;
@@ -83,7 +90,8 @@ typedef struct {
 
 typedef struct stream_ptr_t{
     chain_read_t *reads;
-    hostMemPtr host_mem;
+    hostMemPtr host_mems[MICRO_BATCH];
+    longMemPtr long_mem;
    deviceMemPtr dev_mem;
     cudaStream_t cudastream;
     cudaEvent_t cudaevent, startevent;
@@ -108,6 +116,7 @@ void plmem_stream_cleanup();
 // alloc and free
 void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch,
                            int range_grid_size, size_t buffer_size_long);
+void plmem_malloc_long_mem(longMemPtr *long_mem, size_t buffer_size_long);
 void plmem_free_host_mem(hostMemPtr *host_mem);
 void plmem_malloc_device_mem(deviceMemPtr *dev_mem, size_t anchor_per_batch,
                              int range_grid_size, int num_cut);
@@ -117,7 +126,10 @@ void plmem_free_device_mem(deviceMemPtr *dev_mem);
 void plmem_reorg_input_arr(chain_read_t *reads, int n_read,
                            hostMemPtr *host_mem, range_kernel_config_t config);
 void plmem_async_h2d_memcpy(stream_ptr_t *stream_ptrs);
+void plmem_async_h2d_short_memcpy(stream_ptr_t *stream_ptrs, size_t uid);
 void plmem_sync_h2d_memcpy(hostMemPtr *host_mem, deviceMemPtr *dev_mem);
 void plmem_async_d2h_memcpy(stream_ptr_t *stream_ptrs);
+void plmem_async_d2h_short_memcpy(stream_ptr_t *stream_ptrs, size_t uid);
+void plmem_async_d2h_long_memcpy(stream_ptr_t *stream_ptrs);
 void plmem_sync_d2h_memcpy(hostMemPtr *host_mem, deviceMemPtr *dev_mem);
 #endif  // _PLMEM_CUH_
\ No newline at end of file
diff --git a/gpu/plscore.cu b/gpu/plscore.cu
index 9cbaae90..d478a543 100644
--- a/gpu/plscore.cu
+++ b/gpu/plscore.cu
@@ -108,7 +108,7 @@ inline __device__ void compute_sc_seg_one_wf(int32_t* anchors_x, int32_t* anchor
 
 }
 
-inline __device__ void compute_sc_long_seg(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t* range,
+inline __device__ void compute_sc_seg_multi_wf(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t* range,
                                 size_t start_idx, size_t end_idx,
                                 int32_t* f, uint16_t* p
                                 ){
@@ -350,7 +350,7 @@ __global__ void score_generation_mid(int32_t* anchors_x, int32_t* anchors_y, int
     for(int segid = bid; segid < *long_seg_count; segid += gridDim.x){
         seg_t seg = long_seg[segid];
         // compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
-        compute_sc_long_seg(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
+        compute_sc_seg_multi_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
     }
 }
 
@@ -367,7 +367,7 @@ __global__ void score_generation_long(int32_t* anchors_x, int32_t* anchors_y, in
     for(int segid = bid; segid < *long_seg_count; segid += gridDim.x){
         seg_t seg = long_seg[segid];
         // compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
-        compute_sc_long_seg(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
+        compute_sc_seg_multi_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
     }
 }
 
 __global__ void score_generation_naive(int32_t* anchors_x, int32_t* anchors_y, int8_t* sid, int32_t *range,
@@ -425,6 +425,71 @@ void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* st
     dim3 midDimGrid(score_kernel_config.mid_griddim, 1, 1);
     dim3 longDimGrid(score_kernel_config.long_griddim, 1, 1);
 
+    // Run kernel
+    // printf("Grid Dim, %d\n", DimGrid.x);
+    #ifdef __SHORT_BLOCK_SIZE__
+    // fprintf(stderr, "short block size: %d\n", __SHORT_BLOCK_SIZE__);
+    score_generation_short<__SHORT_BLOCK_SIZE__><<<shortDimGrid, __SHORT_BLOCK_SIZE__, 0, *stream>>>(
+        dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range,
+        dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num,
+        dev_mem->d_ax_long, dev_mem->d_ay_long, dev_mem->d_sid_long, dev_mem->d_range_long,
+        dev_mem->d_total_n_long, buffer_size_long,
+        dev_mem->d_long_seg, dev_mem->d_long_seg_og, dev_mem->d_long_seg_count,
+        dev_mem->d_mid_seg, dev_mem->d_mid_seg_count);
+    #else
+    dim3 shortDimBlock(score_kernel_config.short_blockdim, 1, 1);
+    score_generation_short<<<shortDimGrid, shortDimBlock, 0, *stream>>>(
+        dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range,
+        dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num,
+        dev_mem->d_ax_long, dev_mem->d_ay_long, dev_mem->d_sid_long, dev_mem->d_range_long,
+        dev_mem->d_total_n_long, buffer_size_long,
+        dev_mem->d_long_seg, dev_mem->d_long_seg_og, dev_mem->d_long_seg_count,
+        dev_mem->d_mid_seg, dev_mem->d_mid_seg_count);
+    #endif
+    cudaCheck();
+
+    #ifdef __MID_BLOCK_SIZE__
+    // fprintf(stderr, "mid block size: %d\n", __MID_BLOCK_SIZE__);
+    score_generation_mid<__MID_BLOCK_SIZE__><<<midDimGrid, __MID_BLOCK_SIZE__, 0, *stream>>>(
+        dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, dev_mem->d_mid_seg,
+        dev_mem->d_mid_seg_count, dev_mem->d_f, dev_mem->d_p);
+    #else
+    dim3 midDimBlock(score_kernel_config.mid_blockdim, 1, 1);
+    score_generation_mid<<<midDimGrid, midDimBlock, 0, *stream>>>(
+        dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, dev_mem->d_mid_seg,
+        dev_mem->d_mid_seg_count, dev_mem->d_f, dev_mem->d_p);
+    #endif
+    cudaCheck();
+
+
+    #ifdef __LONG_BLOCK_SIZE__
+    // fprintf(stderr, "long block size: %d\n", __LONG_BLOCK_SIZE__);
+    score_generation_long<__LONG_BLOCK_SIZE__><<<longDimGrid, __LONG_BLOCK_SIZE__, 0, *stream>>>(
+        dev_mem->d_ax_long, dev_mem->d_ay_long, dev_mem->d_sid_long, dev_mem->d_range_long, dev_mem->d_long_seg,
+        dev_mem->d_long_seg_count, dev_mem->d_f_long, dev_mem->d_p_long);
+    #else
+    dim3 longDimBlock(score_kernel_config.long_blockdim, 1, 1);
+    score_generation_long<<<longDimGrid, longDimBlock, 0, *stream>>>(
+        dev_mem->d_ax_long, dev_mem->d_ay_long, dev_mem->d_sid_long, dev_mem->d_range_long, dev_mem->d_long_seg,
+        dev_mem->d_long_seg_count, dev_mem->d_f_long, dev_mem->d_p_long);
+    #endif
+    cudaCheck();
+
+#ifdef DEBUG_PRINT
+    fprintf(stderr, "[Info] %s (%s:%d) score generation success\n", __func__, __FILE__, __LINE__);
+#endif
+
+    cudaCheck();
+}
+
+
+void plscore_async_short_mid_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream) {
+    size_t total_n = dev_mem->total_n;
+    size_t cut_num = dev_mem->num_cut;
+    size_t buffer_size_long = dev_mem->buffer_size_long;
+    dim3 shortDimGrid(score_kernel_config.short_griddim, 1, 1);
+    dim3 midDimGrid(score_kernel_config.mid_griddim, 1, 1);
+
     // Run kernel
     // printf("Grid Dim, %d\n", DimGrid.x);
     cudaMemsetAsync(dev_mem->d_long_seg_count, 0, sizeof(unsigned int),
@@ -468,6 +533,18 @@ void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* st
 #endif
     cudaCheck();
 
+#ifdef DEBUG_PRINT
+    fprintf(stderr, "[Info] %s (%s:%d) short mid score generation success\n", __func__, __FILE__, __LINE__);
+#endif
+
+    cudaCheck();
+}
+
+void plscore_async_long_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream) {
+    size_t total_n = dev_mem->total_n;
+    size_t cut_num = dev_mem->num_cut;
+    size_t buffer_size_long = dev_mem->buffer_size_long;
+    dim3 longDimGrid(score_kernel_config.long_griddim, 1, 1);
 
 #ifdef __LONG_BLOCK_SIZE__
     // fprintf(stderr, "long block size: %d\n", __LONG_BLOCK_SIZE__);
@@ -483,7 +560,7 @@ void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* st
     cudaCheck();
 
 #ifdef DEBUG_PRINT
-    fprintf(stderr, "[Info] %s (%s:%d) score generation success\n", __func__, __FILE__, __LINE__);
+    fprintf(stderr, "[Info] %s (%s:%d) long score generation success\n", __func__, __FILE__, __LINE__);
 #endif
 
     cudaCheck();
diff --git a/gpu/plscore.cuh b/gpu/plscore.cuh
index b464ea05..ca4c69da 100644
--- a/gpu/plscore.cuh
+++ b/gpu/plscore.cuh
@@ -25,6 +25,8 @@ extern "C"{
 void plscore_upload_misc(Misc misc);
 void plscore_async_naive_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream);
 void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem,cudaStream_t* stream);
+void plscore_async_short_mid_forward_dp(deviceMemPtr* dev_mem,cudaStream_t* stream);
+void plscore_async_long_forward_dp(deviceMemPtr* dev_mem,cudaStream_t* stream);
 void plscore_sync_long_short_forward_dp(deviceMemPtr* dev_mem, Misc misc_);
 void plscore_sync_naive_forward_dp(deviceMemPtr* dev_mem, Misc misc_);
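
Reviewer note (illustration only, not part of the patch): the new control flow splits each stream's batch into MICRO_BATCH micro batches that share one stream and one long-segment device buffer. Per micro batch the host reorganizes a chunk, copies it H2D, runs range selection plus the short/mid kernels (which defer long segments into the shared buffer via an atomic counter), and copies the short results D2H; after all micro batches are issued, a single long kernel drains the accumulated segments and one final D2H fetches its results. The standalone toy below is a minimal sketch of that accumulate-then-drain pattern; all names in it (short_kernel, long_kernel, N_MICRO, LONG_CUTOFF, etc.) are hypothetical stand-ins, not APIs from this codebase.

// toy_microbatch.cu -- compile with: nvcc toy_microbatch.cu -o toy_microbatch
#include <cstdio>
#include <cuda_runtime.h>

#define N_MICRO 4          // plays the role of MICRO_BATCH
#define N_ITEMS 1024
#define LONG_CUTOFF 900    // items above this are "long" and get deferred

// Per micro batch: handle "short" items inline, defer "long" ones by
// appending their global index to a queue shared across micro batches.
__global__ void short_kernel(const int *in, int *out, int off, int n,
                             int *long_idx, unsigned int *long_count) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    int g = off + i;                        // global item index
    if (in[g] > LONG_CUTOFF) {
        unsigned int slot = atomicAdd(long_count, 1u);
        long_idx[slot] = g;                 // accumulates across micro batches
    } else {
        out[g] = in[g] * 2;                 // "short" work done inline
    }
}

// One drain pass over everything deferred by all micro batches.
__global__ void long_kernel(const int *in, int *out_long,
                            const int *long_idx, const unsigned int *long_count) {
    for (unsigned int s = blockIdx.x * blockDim.x + threadIdx.x;
         s < *long_count; s += gridDim.x * blockDim.x)
        out_long[s] = in[long_idx[s]] * 3;
}

int main() {
    static int h_in[N_ITEMS], h_out[N_ITEMS], h_out_long[N_ITEMS];
    unsigned int h_long_count = 0;
    for (int i = 0; i < N_ITEMS; i++) h_in[i] = i;

    int *d_in, *d_out, *d_out_long, *d_long_idx;
    unsigned int *d_long_count;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMalloc(&d_out_long, sizeof(h_out_long));
    cudaMalloc(&d_long_idx, N_ITEMS * sizeof(int));
    cudaMalloc(&d_long_count, sizeof(unsigned int));

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // reset the shared counter before a new batch, as the patch does
    cudaMemsetAsync(d_long_count, 0, sizeof(unsigned int), stream);
    cudaMemsetAsync(d_out, 0, sizeof(h_out), stream);

    int per_batch = N_ITEMS / N_MICRO;
    for (int uid = 0; uid < N_MICRO; uid++) {            // micro-batch loop
        int off = uid * per_batch;
        cudaMemcpyAsync(d_in + off, h_in + off, per_batch * sizeof(int),
                        cudaMemcpyHostToDevice, stream); // H2D, this chunk only
        short_kernel<<<(per_batch + 255) / 256, 256, 0, stream>>>(
            d_in, d_out, off, per_batch, d_long_idx, d_long_count);
        cudaMemcpyAsync(h_out + off, d_out + off, per_batch * sizeof(int),
                        cudaMemcpyDeviceToHost, stream); // D2H of short results
    }
    // drain the accumulated long queue once, then fetch its results
    long_kernel<<<32, 256, 0, stream>>>(d_in, d_out_long, d_long_idx, d_long_count);
    cudaMemcpyAsync(&h_long_count, d_long_count, sizeof(unsigned int),
                    cudaMemcpyDeviceToHost, stream);
    cudaMemcpyAsync(h_out_long, d_out_long, sizeof(h_out_long),
                    cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    printf("long items deferred: %u\n", h_long_count);
    return 0;
}

The design point the toy mirrors: deferring long work keeps the short/mid kernels load-balanced within each micro batch, while the shared counter (d_long_seg_count in the patch) lets a single long-kernel launch amortize the irregular segments across the whole batch.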