diff --git a/aac_config.json b/aac_config.json
new file mode 100644
index 00000000..fc294ad6
--- /dev/null
+++ b/aac_config.json
@@ -0,0 +1,26 @@
+{
+    "//config is for": "aac cloud. Fits one batch + 5% x 4 long buffer avg_read_n 10k",
+    "num_streams": 1,
+    "min_n": 512,
+    "//min_n": "queries with fewer anchors will be handled on the CPU",
+    "long_seg_buffer_size": 507376000,
+    "max_total_n": 2536880000,
+    "max_read": 253688,
+    "avg_read_n": 20000,
+    "//avg_read_n": "expected average number of anchors per read; not used if max_total_n and max_read are specified",
+    "range_kernel": {
+        "blockdim": 512,
+        "cut_check_anchors": 10,
+        "//cut_check_anchors": "Number of anchors to check to attempt a cut",
+        "anchor_per_block": 32768,
+        "//anchor_per_block": "Number of anchors each block handles. Must be an integer multiple of blockdim"
+    },
+    "score_kernel": {
+        "short_blockdim": 64,
+        "long_blockdim": 64,
+        "mid_blockdim": 64,
+        "short_griddim": 16128,
+        "long_griddim": 2016,
+        "mid_griddim": 16128
+    }
+}
\ No newline at end of file
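The config above mirrors the fields that plmem_config_kernels/plmem_config_batch read through cJSON (see plmem_parse_gpu_config in gpu/plmem.cu further down). As a minimal sketch of how the "Must be an integer multiple of blockdim" note could be checked at load time, assuming a vendored cJSON header; check_range_kernel() and the include path are illustrative and not part of this patch:

/* Sketch only: validate range_kernel.anchor_per_block against blockdim with cJSON. */
#include <stdio.h>
#include "cJSON.h"

int check_range_kernel(const char *json_text) {
    cJSON *root = cJSON_Parse(json_text);
    if (root == NULL) return -1;                       /* malformed JSON */
    cJSON *range = cJSON_GetObjectItem(root, "range_kernel");
    if (range == NULL) { cJSON_Delete(root); return -1; }
    int blockdim = cJSON_GetObjectItem(range, "blockdim")->valueint;
    int anchor_per_block = cJSON_GetObjectItem(range, "anchor_per_block")->valueint;
    int ok = (blockdim > 0) && (anchor_per_block % blockdim == 0);
    if (!ok)
        fprintf(stderr, "anchor_per_block (%d) must be a multiple of blockdim (%d)\n",
                anchor_per_block, blockdim);
    cJSON_Delete(root);
    return ok ? 0 : -1;
}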
diff --git a/gpu/hipify.cuh b/gpu/hipify.cuh
index 4e719fe2..c1a4f275 100644
--- a/gpu/hipify.cuh
+++ b/gpu/hipify.cuh
@@ -31,6 +31,7 @@
 #define cudaEventDestroy hipEventDestroy
 #define cudaEventElapsedTime hipEventElapsedTime
 #define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaMemGetInfo hipMemGetInfo
 #define cudaCheck() { \
     hipError_t err = hipGetLastError(); \
     if (hipSuccess != err) { \
diff --git a/gpu/plchain.cu b/gpu/plchain.cu
index bf97ba5d..ee545308 100644
--- a/gpu/plchain.cu
+++ b/gpu/plchain.cu
@@ -85,11 +85,11 @@ void plchain_backtracking(hostMemPtr *host_mem, chain_read_t *reads, Misc misc,
         int64_t* p;
         KMALLOC(km, p, reads[i].n);
         p_rel2idx(p_hostmem, p, reads[i].n);
-// DEBUG:print scores
+// print scores
 #if defined(DEBUG_VERBOSE) && 0
         debug_print_score(p, f, reads[i].n);
 #endif
-//DEBUG: Check score w.r.t to input (MAKE SURE INPUT SCORE EXISTS: search for SCORE CHECK)
+// Check score w.r.t. input (MAKE SURE INPUT SCORE EXISTS: search for SCORE CHECK)
 #if defined(DEBUG_CHECK) && 0
         debug_check_score(p, f, reads[i].p, reads[i].f, reads[i].n);
 #endif
@@ -514,19 +514,15 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
 #if defined(DEBUG_CHECK)
     plchain_debug_analysis(stream_setup.streams[stream_id]);
 #endif // DEBUG_VERBOSE
-    // reset values
-    cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
-                    stream_setup.streams[stream_id].cudastream);
-    cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
-                    stream_setup.streams[stream_id].cudastream);
-    cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_total_n_long, 0, sizeof(size_t),
-                    stream_setup.streams[stream_id].cudastream);
     seg_t* long_segs = stream_setup.streams[stream_id].long_mem.long_segs;
     size_t long_seg_idx = 0;
     for (int uid = 0; uid < MICRO_BATCH; uid++) {
         // regorg long to each host mem ptr
         // NOTE: this is the number of long segs till this microbatch
         size_t long_segs_num = stream_setup.streams[stream_id].host_mems[uid].long_segs_num;
+#ifdef DEBUG_PRINT
+        fprintf(stderr, "[Debug] %s (%s:%d) long seg %lu - %lu \n", __func__, __FILE__, __LINE__, long_seg_idx, long_segs_num);
+#endif
         for (; long_seg_idx < long_segs_num; long_seg_idx++) {
             // TODO: write long_segs + long_seg_idx to f/p
         }
@@ -550,6 +546,12 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
         total_n += reads[i].n;
     }  // compute total_n first
+    // reset long seg counters
+    cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
+                    stream_setup.streams[stream_id].cudastream);
+    cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_total_n_long, 0, sizeof(size_t),
+                    stream_setup.streams[stream_id].cudastream);
+
     stream_setup.streams[stream_id].reads = reads;
     int read_start = 0;
     for (int uid = 0; uid < MICRO_BATCH; uid++) {
@@ -770,19 +772,15 @@ void finish_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t*
 #endif // DEBUG_CHECK
 
     // TODO: backtrack multiple pending batches
-    // reset values
-    cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
-                    stream_setup.streams[t].cudastream);
-    cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
-                    stream_setup.streams[t].cudastream);
-    cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_total_n_long, 0, sizeof(size_t),
-                    stream_setup.streams[t].cudastream);
     seg_t* long_segs = stream_setup.streams[t].long_mem.long_segs;
     size_t long_seg_idx = 0;
     for (int uid = 0; uid < MICRO_BATCH; uid++) {
         // regorg long to each host mem ptr
         // NOTE: this is the number of long segs till this microbatch
         size_t long_segs_num = stream_setup.streams[t].host_mems[uid].long_segs_num;
+#ifdef DEBUG_PRINT
+        fprintf(stderr, "[Debug] %s (%s:%d) long seg %lu - %lu \n", __func__, __FILE__, __LINE__, long_seg_idx, long_segs_num);
+#endif
         for (; long_seg_idx < long_segs_num; long_seg_idx++) {
             // TODO: write long_segs + long_seg_idx to f/p
         }
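The counter resets removed above are re-issued just before the work that consumes them: plchain_cal_score_async now clears d_long_seg_count and d_total_n_long ahead of the microbatch loop, and plscore.cu (further down) clears all three counters right before the forward-DP kernel launches. A self-contained sketch of the stream-ordering property this relies on; the toy kernel and names are illustrative and not taken from the patch:

// Sketch: a device counter cleared with cudaMemsetAsync on a stream is seen as
// zero by a kernel enqueued later on the same stream, because work on one
// stream executes in issue order.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void count_kernel(unsigned int *counter) {
    atomicAdd(counter, 1u);   // each thread bumps the shared device counter
}

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    unsigned int *d_count, h_count = 0;
    cudaMalloc(&d_count, sizeof(unsigned int));
    cudaMemsetAsync(d_count, 0, sizeof(unsigned int), stream);   // reset before launch
    count_kernel<<<4, 64, 0, stream>>>(d_count);                  // ordered after the memset
    cudaMemcpyAsync(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    printf("counter = %u\n", h_count);   // 4 * 64 = 256
    cudaFree(d_count);
    cudaStreamDestroy(stream);
    return 0;
}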
diff --git a/gpu/plmem.cu b/gpu/plmem.cu
index 9a4c7f76..974e1784 100644
--- a/gpu/plmem.cu
+++ b/gpu/plmem.cu
@@ -7,6 +7,10 @@
 #include "plscore.cuh"
 #include
 
+#define OneK 1024
+#define OneM (OneK*1024)
+#define OneG (OneM*1024)
+
 void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch,
                            int range_grid_size, size_t buffer_size_long) {
     // data array
@@ -69,6 +73,12 @@ void plmem_malloc_device_mem(deviceMemPtr *dev_mem, size_t anchor_per_batch, int
     cudaMalloc(&dev_mem->d_mid_seg_count, sizeof(unsigned int));
     cudaMalloc(&dev_mem->d_mid_seg, num_cut/(MM_MID_SEG_CUTOFF + 1) * sizeof(seg_t));
 
+    size_t gpu_free_mem, gpu_total_mem;
+    cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
+#ifdef DEBUG_PRINT
+    fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB (before alloc long seg buffer)\n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG);
+#endif
+
     // long seg buffer
     cudaMalloc(&dev_mem->d_ax_long, dev_mem->buffer_size_long * sizeof(int32_t));
     cudaMalloc(&dev_mem->d_ay_long, dev_mem->buffer_size_long * sizeof(int32_t));
@@ -517,6 +527,11 @@ void plmem_stream_initialize(size_t *max_total_n_,
     cJSON *json = plmem_parse_gpu_config(GPU_CONFIG);
 #endif
     plmem_config_kernels(json);
+    size_t gpu_free_mem, gpu_total_mem;
+    cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
+#ifdef DEBUG_PRINT
+    fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB\n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG);
+#endif
     plmem_config_batch(json, &num_stream, min_anchors_, &max_anchors_stream,
                        max_read_, &long_seg_buffer_size);
     plmem_config_stream(&max_range_grid, &max_num_cut, max_anchors_stream,
@@ -558,6 +573,11 @@ void plmem_stream_initialize(size_t *max_total_n_,
                         stream_setup.streams[i].cudastream);
     }
 
+    cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
+#ifdef DEBUG_PRINT
+    fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB\n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG);
+#endif
+
     *max_total_n_ = max_anchors_stream;
     stream_setup.max_anchors_stream = max_anchors_stream;
diff --git a/gpu/plscore.cu b/gpu/plscore.cu
index 8414e47b..33526525 100644
--- a/gpu/plscore.cu
+++ b/gpu/plscore.cu
@@ -427,6 +427,13 @@ void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* st
 
     // Run kernel
     // printf("Grid Dim, %d\n", DimGrid.x);
+    cudaMemsetAsync(dev_mem->d_long_seg_count, 0, sizeof(unsigned int),
+                    *stream);
+    cudaMemsetAsync(dev_mem->d_mid_seg_count, 0, sizeof(unsigned int),
+                    *stream);
+    cudaMemsetAsync(dev_mem->d_total_n_long, 0, sizeof(size_t),
+                    *stream);
+
 #ifdef __SHORT_BLOCK_SIZE__
     // fprintf(stderr, "short block size: %d\n", __SHORT_BLOCK_SIZE__);
     score_generation_short<__SHORT_BLOCK_SIZE__><<>>(
@@ -490,10 +497,7 @@ void plscore_async_short_mid_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* str
     dim3 shortDimGrid(score_kernel_config.short_griddim, 1, 1);
     dim3 midDimGrid(score_kernel_config.mid_griddim, 1, 1);
 
-    // // Run kernel
-    // // printf("Grid Dim, %d\n", DimGrid.x);
-    // cudaMemsetAsync(dev_mem->d_long_seg_count, 0, sizeof(unsigned int),
-    //                 *stream);
+    // Run kernel
     cudaMemsetAsync(dev_mem->d_mid_seg_count, 0, sizeof(unsigned int),
                     *stream);
     // cudaMemsetAsync(dev_mem->d_total_n_long, 0, sizeof(size_t),
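The free/total-memory reports added to plmem.cu above could also be folded into a single helper. A small sketch in that spirit, assuming only the CUDA runtime; log_gpu_mem() and its message tag are illustrative and not part of the patch:

// Hypothetical helper mirroring the new DEBUG_PRINT messages in plmem.cu:
// query free/total device memory with cudaMemGetInfo and report it in GB.
#include <cstdio>
#include <cuda_runtime.h>

#define OneK 1024
#define OneM (OneK * 1024)
#define OneG (OneM * 1024)

static void log_gpu_mem(const char *tag) {
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    fprintf(stderr, "[Info] %s: GPU free mem: %.2f GB, total mem: %.2f GB\n",
            tag, (double)free_bytes / OneG, (double)total_bytes / OneG);
}

// e.g. log_gpu_mem("before alloc long seg buffer");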