Skip to content

Commit

Permalink
Add aac_config. Fix seg fault for long_seg_count reset
Browse files Browse the repository at this point in the history
  • Loading branch information
joydddd committed Oct 30, 2023
1 parent cb1a30e commit 1a585ab
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 20 deletions.
26 changes: 26 additions & 0 deletions aac_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"//config is for": "aac cloud. Fits one batch + 5% x 4 long buffer avg_read_n 10k",
"num_streams": 1,
"min_n": 512,
"//min_n": "queries with less anchors will be handled on cpu",
"long_seg_buffer_size": 507376000,
"max_total_n": 2536880000,
"max_read": 253688,
"avg_read_n": 20000,
"//avg_read_n": "expect average number of anchors per read, not used if max_total_n and max_read are specified",
"range_kernel": {
"blockdim": 512,
"cut_check_anchors": 10,
"//cut_check_anchors": "Number of anchors to check to attemp a cut",
"anchor_per_block": 32768,
"//anchor_per_block": "Number of anchors each block handle. Must be int * blockdim"
},
"score_kernel": {
"short_blockdim": 64,
"long_blockdim": 64,
"mid_blockdim": 64,
"short_griddim": 16128,
"long_griddim": 2016,
"mid_griddim": 16128
}
}
1 change: 1 addition & 0 deletions gpu/hipify.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#define cudaEventDestroy hipEventDestroy
#define cudaEventElapsedTime hipEventElapsedTime
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaMemGetInfo hipMemGetInfo
#define cudaCheck() { \
hipError_t err = hipGetLastError(); \
if (hipSuccess != err) { \
Expand Down
30 changes: 14 additions & 16 deletions gpu/plchain.cu
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,11 @@ void plchain_backtracking(hostMemPtr *host_mem, chain_read_t *reads, Misc misc,
int64_t* p;
KMALLOC(km, p, reads[i].n);
p_rel2idx(p_hostmem, p, reads[i].n);
// DEBUG:print scores
// print scores
#if defined(DEBUG_VERBOSE) && 0
debug_print_score(p, f, reads[i].n);
#endif
//DEBUG: Check score w.r.t to input (MAKE SURE INPUT SCORE EXISTS: search for SCORE CHECK)
// Check score w.r.t to input (MAKE SURE INPUT SCORE EXISTS: search for SCORE CHECK)
#if defined(DEBUG_CHECK) && 0
debug_check_score(p, f, reads[i].p, reads[i].f, reads[i].n);
#endif
Expand Down Expand Up @@ -514,19 +514,15 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
#if defined(DEBUG_CHECK)
plchain_debug_analysis(stream_setup.streams[stream_id]);
#endif // DEBUG_VERBOSE
// reset values
cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[stream_id].cudastream);
cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[stream_id].cudastream);
cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_total_n_long, 0, sizeof(size_t),
stream_setup.streams[stream_id].cudastream);
seg_t* long_segs = stream_setup.streams[stream_id].long_mem.long_segs;
size_t long_seg_idx = 0;
for (int uid = 0; uid < MICRO_BATCH; uid++) {
// regorg long to each host mem ptr
// NOTE: this is the number of long segs till this microbatch
size_t long_segs_num = stream_setup.streams[stream_id].host_mems[uid].long_segs_num;
#ifdef DEBUG_PRINT
fprintf(stderr, "[Debug] %s (%s:%d) long seg %lu - %lu \n", __func__, __FILE__, __LINE__, long_seg_idx, long_segs_num);
#endif
for (; long_seg_idx < long_segs_num; long_seg_idx++) {
// TODO: write long_segs + long_seg_idx to f/p
}
Expand All @@ -550,6 +546,12 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
total_n += reads[i].n;
} // compute total_n first

// reset long seg counters
cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[stream_id].cudastream);
cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_total_n_long, 0, sizeof(size_t),
stream_setup.streams[stream_id].cudastream);

stream_setup.streams[stream_id].reads = reads;
int read_start = 0;
for (int uid = 0; uid < MICRO_BATCH; uid++) {
Expand Down Expand Up @@ -770,19 +772,15 @@ void finish_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t*
#endif // DEBUG_CHECK

// TODO: backtrack multiple pending batches
// reset values
cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[t].cudastream);
cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[t].cudastream);
cudaMemsetAsync(stream_setup.streams[t].dev_mem.d_total_n_long, 0, sizeof(size_t),
stream_setup.streams[t].cudastream);
seg_t* long_segs = stream_setup.streams[t].long_mem.long_segs;
size_t long_seg_idx = 0;
for (int uid = 0; uid < MICRO_BATCH; uid++) {
// regorg long to each host mem ptr
// NOTE: this is the number of long segs till this microbatch
size_t long_segs_num = stream_setup.streams[t].host_mems[uid].long_segs_num;
#ifdef DEBUG_PRINT
fprintf(stderr, "[Debug] %s (%s:%d) long seg %lu - %lu \n", __func__, __FILE__, __LINE__, long_seg_idx, long_segs_num);
#endif
for (; long_seg_idx < long_segs_num; long_seg_idx++) {
// TODO: write long_segs + long_seg_idx to f/p
}
Expand Down
20 changes: 20 additions & 0 deletions gpu/plmem.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#include "plscore.cuh"
#include <time.h>

#define OneK 1024
#define OneM (OneK*1024)
#define OneG (OneM*1024)

void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch,
int range_grid_size, size_t buffer_size_long) {
// data array
Expand Down Expand Up @@ -69,6 +73,12 @@ void plmem_malloc_device_mem(deviceMemPtr *dev_mem, size_t anchor_per_batch, int
cudaMalloc(&dev_mem->d_mid_seg_count, sizeof(unsigned int));
cudaMalloc(&dev_mem->d_mid_seg, num_cut/(MM_MID_SEG_CUTOFF + 1) * sizeof(seg_t));

size_t gpu_free_mem, gpu_total_mem;
cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB (before alloc long seg buffer) \n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG);
#endif

// long seg buffer
cudaMalloc(&dev_mem->d_ax_long, dev_mem->buffer_size_long * sizeof(int32_t));
cudaMalloc(&dev_mem->d_ay_long, dev_mem->buffer_size_long * sizeof(int32_t));
Expand Down Expand Up @@ -517,6 +527,11 @@ void plmem_stream_initialize(size_t *max_total_n_,
cJSON *json = plmem_parse_gpu_config(GPU_CONFIG);
#endif
plmem_config_kernels(json);
size_t gpu_free_mem, gpu_total_mem;
cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB\n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG);
#endif
plmem_config_batch<false>(json, &num_stream, min_anchors_, &max_anchors_stream,
max_read_, &long_seg_buffer_size);
plmem_config_stream(&max_range_grid, &max_num_cut, max_anchors_stream,
Expand Down Expand Up @@ -558,6 +573,11 @@ void plmem_stream_initialize(size_t *max_total_n_,
stream_setup.streams[i].cudastream);
}

cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB\n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG);
#endif

*max_total_n_ = max_anchors_stream;

stream_setup.max_anchors_stream = max_anchors_stream;
Expand Down
12 changes: 8 additions & 4 deletions gpu/plscore.cu
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,13 @@ void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* st

// Run kernel
// printf("Grid Dim, %d\n", DimGrid.x);
cudaMemsetAsync(dev_mem->d_long_seg_count, 0, sizeof(unsigned int),
*stream);
cudaMemsetAsync(dev_mem->d_mid_seg_count, 0, sizeof(unsigned int),
*stream);
cudaMemsetAsync(dev_mem->d_total_n_long, 0, sizeof(size_t),
*stream);

#ifdef __SHORT_BLOCK_SIZE__
// fprintf(stderr, "short block size: %d\n", __SHORT_BLOCK_SIZE__);
score_generation_short<__SHORT_BLOCK_SIZE__><<<shortDimGrid, dim3(__SHORT_BLOCK_SIZE__, 1, 1), 0, *stream>>>(
Expand Down Expand Up @@ -490,10 +497,7 @@ void plscore_async_short_mid_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* str
dim3 shortDimGrid(score_kernel_config.short_griddim, 1, 1);
dim3 midDimGrid(score_kernel_config.mid_griddim, 1, 1);

// // Run kernel
// // printf("Grid Dim, %d\n", DimGrid.x);
// cudaMemsetAsync(dev_mem->d_long_seg_count, 0, sizeof(unsigned int),
// *stream);
// Run kernel;
cudaMemsetAsync(dev_mem->d_mid_seg_count, 0, sizeof(unsigned int),
*stream);
// cudaMemsetAsync(dev_mem->d_total_n_long, 0, sizeof(size_t),
Expand Down

0 comments on commit 1a585ab

Please sign in to comment.