Update throughput calculation, a6000 config
joydddd committed Feb 9, 2024
1 parent 216f2b2 commit e1248fb
Showing 10 changed files with 126 additions and 34 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -1,5 +1,5 @@
CFLAGS= -O2 -g -DNDEBUG
CDEBUG_FLAGS= -g -Wall -Wextra -Wno-unused-parameter -Wno-unused-variable -Wno-sign-compare -Wno-unused-function -Wno-c++17-extensions -DDEBUG_PRINT -O2 -Wno-\#warnings #-O0 -DNDEBUG
CDEBUG_FLAGS= -g -DDEBUG_PRINT -O2 #-Wall -Wextra -Wno-unused-parameter -Wno-unused-variable -Wno-sign-compare -Wno-unused-function -Wno-c++17-extensions -Wno-\#warnings #-O0 -DNDEBUG
CPPFLAGS= -DHAVE_KALLOC -D__AMD_SPLIT_KERNELS__ # -Wno-unused-but-set-variable -Wno-unused-variable
CPPFLAGS+= $(if $(MICRO_BATCH),-DMICRO_BATCH=\($(MICRO_BATCH)\))
INCLUDES= -I .
27 changes: 27 additions & 0 deletions a6000_config.json
@@ -0,0 +1,27 @@
{
"//config is for": "aac cloud. Fits one batch + 5% x 4 long buffer avg_read_n 10k",
"num_streams": 1,
"min_n": 512,
"//min_n": "queries with less anchors will be handled on cpu",
"long_seg_buffer_size": 258880000,
"max_total_n": 893440000,
"max_read": 893440,
"avg_read_n": 20000,
"//avg_read_n": "expect average number of anchors per read, not used if max_total_n and max_read are specified",
"range_kernel": {
"blockdim": 512,
"cut_check_anchors": 10,
"//cut_check_anchors": "Number of anchors to check to attemp a cut",
"anchor_per_block": 32768,
"//anchor_per_block": "Number of anchors each block handle. Must be int * blockdim"
},
"score_kernel": {
"short_blockdim": 64,
"long_blockdim": 64,
"mid_blockdim": 64,
"//blockdim config": "options are not used: static config specified at compile time (make ... LONG_BLOCK_SIZE=1024)",
"short_griddim": 2688,
"long_griddim": 1024,
"mid_griddim": 2688
}
}
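
The range_kernel comments above encode a divisibility constraint: anchor_per_block must be a whole multiple of blockdim (here 32768 = 64 x 512). A minimal, illustrative C check of that relationship (not part of this commit; the field names are taken from the JSON above):

/* check_a6000_config.c -- illustrative only, not part of the repository */
#include <assert.h>
#include <stdio.h>

int main(void) {
    const int blockdim         = 512;    /* range_kernel.blockdim */
    const int anchor_per_block = 32768;  /* range_kernel.anchor_per_block */

    /* "Must be int * blockdim": each block handles a whole number of rows */
    assert(anchor_per_block % blockdim == 0);
    printf("anchors per block = %d rows x %d threads\n",
           anchor_per_block / blockdim, blockdim);   /* 64 rows x 512 threads */
    return 0;
}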
3 changes: 2 additions & 1 deletion gpu/gpu.mk
@@ -29,7 +29,7 @@ CUDATESTFLAG = -G
HIPCC = hipcc
HIPFLAGS = -DUSEHIP
HIPTESTFLAGS = -G -Rpass-analysis=kernel-resource-usage
LIBS += -L${ROCM_PATH}/lib -lroctx64 -lroctracer64
HIPLIBS = -L${ROCM_PATH}/lib -lroctx64 -lroctracer64

###################################################
############ DEBUG Options ###################
@@ -38,6 +38,7 @@ ifeq ($(GPU), AMD)
GPU_CC = $(HIPCC)
GPU_FLAGS = $(HIPFLAGS)
GPU_TESTFL = $(HIPTESTFLAGS)
LIBS += $(HIPLIBS)
else
GPU_CC = $(NVCC)
GPU_FLAGS = $(CUDAFLAGS)
65 changes: 51 additions & 14 deletions gpu/plchain.cu
@@ -424,6 +424,7 @@ void plchain_debug_analysis(stream_ptr_t stream){
chain_read_t* reads = stream.reads;
deviceMemPtr* dev_mem = &stream.dev_mem;
hostMemPtr* host_mem = &stream.host_mems[uid];
longMemPtr* long_mem = &stream.long_mem;
size_t cut_num = stream.host_mems[uid].cut_num;

unsigned int num_mid_seg, num_long_seg;
@@ -441,9 +442,11 @@ void plchain_debug_analysis(stream_ptr_t stream){
size_t* cut = (size_t*)malloc(sizeof(size_t) * cut_num);
cudaMemcpy(cut, dev_mem->d_cut, sizeof(size_t) * cut_num,
cudaMemcpyDeviceToHost);
seg_t* long_segs_og = (seg_t*)malloc(sizeof(seg_t) * num_long_seg);
cudaMemcpy(long_segs_og, dev_mem->d_long_seg_og, sizeof(seg_t) * num_long_seg,
seg_t* long_segs = (seg_t*)malloc(sizeof(seg_t) * num_long_seg);
cudaMemcpy(long_segs, dev_mem->d_long_seg, sizeof(seg_t) * num_long_seg,
cudaMemcpyDeviceToHost);
int32_t* long_range = (int32_t*)malloc(sizeof(int32_t) * *(long_mem->total_long_segs_n));
cudaMemcpy(long_range, dev_mem->d_range_long, sizeof(int32_t) * *(long_mem->total_long_segs_n), cudaMemcpyDeviceToHost);

// Calculate total workload (sc pairs)
size_t total_sc_pairs = 0;
@@ -452,8 +455,8 @@ void plchain_debug_analysis(stream_ptr_t stream){
}
size_t long_seg_sc_pairs = 0;
for (int long_seg_id = 0; long_seg_id < num_long_seg; long_seg_id++) {
for(size_t i = long_segs_og[long_seg_id].start_idx; i < long_segs_og[long_seg_id].end_idx; i++){
long_seg_sc_pairs += range[i];
for(size_t i = long_segs[long_seg_id].start_idx; i < long_segs[long_seg_id].end_idx; i++){
long_seg_sc_pairs += long_range[i];
}
}
fprintf(stderr, "[DEBUG] Total workload (sc pairs) in batch: %lu, in long segs %lu\n", total_sc_pairs, long_seg_sc_pairs);
@@ -462,7 +465,7 @@ void plchain_debug_analysis(stream_ptr_t stream){
// calculate kernel throughput
float long_kernel_runtime_ms = 0;
cudaEventElapsedTime(&long_kernel_runtime_ms, stream.long_kernel_event, stream.stopevent);
float long_kernel_througput = long_seg_sc_pairs / long_kernel_runtime_ms;
float long_kernel_througput = long_seg_sc_pairs / long_kernel_runtime_ms / (float)1000;
fprintf(stderr, "[DEBUG] Long Seg kernel throughput: %.2f Mpairs/s\n", long_kernel_througput);

// Check range w.r.t input (MAKE SURE INPUT RANGE EXISTS)
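
The hunk above appears to be the throughput fix named in the commit title: long_seg_sc_pairs / long_kernel_runtime_ms has units of pairs per millisecond, so the added division by 1000 is what makes the printed number actually match the "Mpairs/s" label. A standalone sketch of the conversion (illustrative values, not from the repository):

/* throughput_units.c -- illustrative only */
#include <stdio.h>

int main(void) {
    size_t sc_pairs   = 250000000;  /* hypothetical: 2.5e8 score pairs in long segs */
    float  runtime_ms = 125.0f;     /* hypothetical long-kernel runtime */

    /* pairs/ms -> Mpairs/s: 1 pair/ms = 1e3 pairs/s, and 1 Mpair = 1e6 pairs */
    float mpairs_per_s = sc_pairs / runtime_ms / 1000.0f;
    printf("%.2f Mpairs/s\n", mpairs_per_s);   /* prints 2000.00 Mpairs/s */
    return 0;
}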
@@ -524,9 +527,28 @@ int plchain_finish_batch(streamSetup_t stream_setup, int stream_id, Misc misc, v
#ifdef DEBUG_PRINT
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, stream_setup.streams[stream_id].startevent, stream_setup.streams[stream_id].stopevent);
fprintf(stderr, "[Info] %s (%s:%d) last launch runtime: %f ms\n", __func__, __FILE__, __LINE__, milliseconds);
// fprintf(stderr, "[Info] %s (%s:%d) last launch runtime: %f ms\n", __func__, __FILE__, __LINE__, milliseconds);
#endif

#ifdef DEBUG_PRINT
float long_kernel_runtime_ms = 0;
float short_kernel_runtime_ms[MICRO_BATCH];
cudaEventElapsedTime(&long_kernel_runtime_ms, stream_setup.streams[stream_id].long_kernel_event, stream_setup.streams[stream_id].stopevent);
float long_kernel_througput = *stream_setup.streams[stream_id].long_mem.total_long_segs_n / long_kernel_runtime_ms / (float)1000;
for (int uid = 0; uid < MICRO_BATCH; uid++){
cudaEventElapsedTime(&short_kernel_runtime_ms[uid], stream_setup.streams[stream_id].short_kernel_event[uid],
uid + 1 < MICRO_BATCH ? stream_setup.streams[stream_id].short_kernel_event[uid+1] : stream_setup.streams[stream_id].long_kernel_event);
}
fprintf(stderr, " %9.2f %%\n", (float)(*stream_setup.streams[stream_id].long_mem.total_long_segs_n)/stream_setup.long_seg_buffer_size_stream*100);
fprintf(stderr, "Runtime(s) = ");
for (int uid = 0; uid < MICRO_BATCH; uid++) fprintf(stderr, " %11.2f", short_kernel_runtime_ms[uid] / 1000);
fprintf(stderr, " %9.2f %%\n", long_kernel_runtime_ms / 1000);
fprintf(stderr, "Throutput (Ma/s) = ");
for (int uid = 0; uid < MICRO_BATCH; uid++) fprintf(stderr, " ");
fprintf(stderr, " %11.2f\n", long_kernel_througput);
fprintf(stderr, "[Info] %s finish (%s:%d) n_read %d long seg buffer usage %.2f%%. Long seg kernel throughput %.2f Manchors/s\n", __func__, __FILE__, __LINE__, n_reads, (float)(*stream_setup.streams[stream_id].long_mem.total_long_segs_n)/stream_setup.long_seg_buffer_size_stream*100, long_kernel_througput);
#endif // DEBUG_PRINT

// DEBUG: debug analysis that involves sychronizating the whole device
#if defined(DEBUG_CHECK)
plchain_debug_analysis(stream_setup.streams[stream_id]);
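
The new DEBUG_PRINT block above times each short-kernel microbatch as the interval between its cudaEvent and the next microbatch's event, falling back to long_kernel_event for the last one. A minimal, self-contained CUDA sketch of that event-pair timing pattern (illustrative only; the kernel and stage count are stand-ins, not the project's):

// event_timing_sketch.cu -- illustrative only
#include <cstdio>
#include <cuda_runtime.h>

__global__ void stage_kernel(int stage) { /* stand-in for a chaining kernel */ }

int main() {
    const int STAGES = 4;                 // stand-in for MICRO_BATCH
    cudaEvent_t ev[STAGES + 1];
    for (int i = 0; i <= STAGES; i++) cudaEventCreate(&ev[i]);

    cudaEventRecord(ev[0]);
    for (int s = 0; s < STAGES; s++) {
        stage_kernel<<<1, 64>>>(s);
        cudaEventRecord(ev[s + 1]);       // end of stage s == start of stage s+1
    }
    cudaEventSynchronize(ev[STAGES]);

    for (int s = 0; s < STAGES; s++) {
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, ev[s], ev[s + 1]);
        printf("stage %d: %.3f ms\n", s, ms);
    }
    return 0;
}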
@@ -550,12 +572,7 @@ int plchain_finish_batch(streamSetup_t stream_setup, int stream_id, Misc misc, v
// accumulate n_reads
n_reads += stream_setup.streams[stream_id].host_mems[uid].size;
}
#ifdef DEBUG_PRINT
float long_kernel_runtime_ms = 0;
cudaEventElapsedTime(&long_kernel_runtime_ms, stream_setup.streams[stream_id].long_kernel_event, stream_setup.streams[stream_id].stopevent);
float long_kernel_througput = *stream_setup.streams[stream_id].long_mem.total_long_segs_n / long_kernel_runtime_ms;
fprintf(stderr, "[Info] %s finish (%s:%d) n_read %d long seg buffer usage %.2f%%. Long seg kernel throughput %.2f Manchors/s\n", __func__, __FILE__, __LINE__, n_reads, (float)(*stream_setup.streams[stream_id].long_mem.total_long_segs_n)/stream_setup.long_seg_buffer_size_stream*100, long_kernel_througput);
#endif // DEBUG_PRINT

return n_reads;
}

@@ -592,8 +609,19 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str

stream_setup.streams[stream_id].reads = reads;
int read_start = 0;
#ifdef DEBUG_PRINT
fprintf(stderr, "----------------------------------------------------------------\n ");
for (int uid = 0; uid < MICRO_BATCH; uid++){
fprintf(stderr, " Short%d", uid);
}
fprintf(stderr, " Long\n");
fprintf(stderr, "----------------------------------------------------------------\n");
fprintf(stderr, "Mem Usage = ");
#endif
for (int uid = 0; uid < MICRO_BATCH; uid++) {
#ifdef USEHIP
roctxRangePushA("microbatch");
#endif
// decide the size of micro batch
size_t batch_n = 0;
int read_end = 0;
@@ -611,7 +639,9 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
cut_num += (reads[read_end].n - 1) / an_p_cut + 1;
}
#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] %s (%s:%d) MICROBATCH#%d batch_n %lu, read_start %d, read_end %d usage %.2f %%\n", __func__, __FILE__, __LINE__, uid, batch_n, read_start, read_end, (float)batch_n/stream_setup.max_anchors_stream*100);
// fprintf(stderr, "[Info] %s (%s:%d) MICROBATCH#%d batch_n %lu, read_start %d, read_end %d usage %.2f %%\n", __func__, __FILE__, __LINE__, uid, batch_n, read_start, read_end, (float)batch_n/stream_setup.max_anchors_stream*100);

fprintf(stderr, " %9.2f %%", (float)batch_n/stream_setup.max_anchors_stream*100);
#endif // DEBUG_PRINT
// // sanity check
// if (stream_setup.max_anchors_stream < total_n){
@@ -628,15 +658,20 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
assert(stream_setup.max_range_grid >= griddim);
assert(stream_setup.max_num_cut >= cut_num);
// work on micro batch
#ifdef USEHIP
roctxRangePushA("reorg");
#endif
// step1: reorg input
plmem_reorg_input_arr(reads + read_start, read_end - read_start,
&stream_setup.streams[stream_id].host_mems[uid],
range_kernel_config);
// step2: copy to device
#ifdef USEHIP
roctxRangePop();
#endif


cudaEventRecord(stream_setup.streams[stream_id].short_kernel_event[uid],
stream_setup.streams[stream_id].cudastream);
plmem_async_h2d_short_memcpy(&stream_setup.streams[stream_id], uid);
// step3: range selection
plrange_async_range_selection(&stream_setup.streams[stream_id].dev_mem,
@@ -648,7 +683,9 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
plmem_async_d2h_short_memcpy(&stream_setup.streams[stream_id], uid);
// update index
read_start = read_end;
#ifdef USEHIP
roctxRangePop();
#endif
}

cudaEventRecord(stream_setup.streams[stream_id].long_kernel_event,
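
The roctxRangePushA/roctxRangePop markers added in this file are wrapped in #ifdef USEHIP so CUDA builds, which do not link roctracer, still compile; on NVIDIA the markers are simply compiled out. An alternative sketch (illustrative only; assumes the roctracer and NVTX headers are available at the paths shown) that keeps markers on both backends behind one macro:

// profile_markers.h -- illustrative only, not part of this commit
#ifdef USEHIP
#include <roctracer/roctx.h>               // assumed header location for ROC-TX
#define PROFILE_PUSH(name) roctxRangePushA(name)
#define PROFILE_POP()      roctxRangePop()
#else
#include <nvToolsExt.h>                    // NVTX, shipped with the CUDA toolkit
#define PROFILE_PUSH(name) nvtxRangePushA(name)
#define PROFILE_POP()      nvtxRangePop()
#endif

// usage, mirroring the microbatch loop above:
//   PROFILE_PUSH("microbatch");
//   ... launch range/score kernels ...
//   PROFILE_POP();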
29 changes: 23 additions & 6 deletions gpu/plmem.cu
@@ -13,6 +13,14 @@

void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch,
int range_grid_size, size_t buffer_size_long) {
#ifdef DEBUG_PRINT
size_t host_mem_size;
host_mem_size = anchor_per_batch * (sizeof(int32_t) + sizeof(int32_t) +
sizeof(int8_t) + sizeof(int32_t) + sizeof(int32_t) + sizeof(uint16_t));
host_mem_size += range_grid_size * (sizeof(size_t) + sizeof(size_t) + sizeof(size_t));
fprintf(stderr, "[Info] Host Malloc Pinned Memory Size %.2f GB\n", (float)host_mem_size / OneG);
#endif

// data array
cudaMallocHost((void**)&host_mem->ax, anchor_per_batch * sizeof(int32_t));
cudaMallocHost((void**)&host_mem->ay, anchor_per_batch * sizeof(int32_t));
@@ -34,6 +42,12 @@ void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch,
}

void plmem_malloc_long_mem(longMemPtr *long_mem, size_t buffer_size_long) {
#ifdef DEBUG_PRINT
size_t host_mem_size;
host_mem_size = buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t);
host_mem_size += buffer_size_long * (sizeof(int32_t) + sizeof(uint16_t));
fprintf(stderr, "[Info] Host Malloc Pinned Memory Size %.2f GB (long seg)\n", (float)host_mem_size / OneG);
#endif
// data array
cudaMallocHost((void**)&long_mem->long_segs, buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t));
cudaMallocHost((void**)&long_mem->f_long, buffer_size_long * sizeof(int32_t));
@@ -46,6 +60,8 @@ void plmem_malloc_long_mem(longMemPtr *long_mem, size_t buffer_size_long) {
void plmem_free_host_mem(hostMemPtr *host_mem) {
cudaFreeHost(host_mem->ax);
cudaFreeHost(host_mem->ay);
cudaFreeHost(host_mem->sid);
cudaFreeHost(host_mem->xrev);
cudaFreeHost(host_mem->f);
cudaFreeHost(host_mem->p);
cudaFreeHost(host_mem->start_idx);
@@ -576,17 +592,18 @@ void plmem_stream_initialize(size_t *max_total_n_,
for (int j = 0; j < MICRO_BATCH; j++) {
plmem_malloc_host_mem(&stream_setup.streams[i].host_mems[j], max_anchors_stream,
max_range_grid, long_seg_buffer_size);
cudaEventCreate(&stream_setup.streams[i].short_kernel_event[j]);
}
// one stream has one long mem and one device mem
plmem_malloc_long_mem(&stream_setup.streams[i].long_mem, long_seg_buffer_size);
plmem_malloc_device_mem(&stream_setup.streams[i].dev_mem, max_anchors_stream,
max_range_grid, max_num_cut);
cudaMemsetAsync(&stream_setup.streams[i].dev_mem.d_long_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[i].cudastream);
cudaMemsetAsync(&stream_setup.streams[i].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int),
stream_setup.streams[i].cudastream);
cudaMemsetAsync(&stream_setup.streams[i].dev_mem.d_total_n_long, 0, sizeof(size_t),
stream_setup.streams[i].cudastream);
cudaMemset(stream_setup.streams[i].dev_mem.d_long_seg_count, 0, sizeof(unsigned int));
cudaCheck();
cudaMemset(stream_setup.streams[i].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int));
cudaCheck();
cudaMemset(stream_setup.streams[i].dev_mem.d_total_n_long, 0, sizeof(size_t));
cudaCheck();
}

cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
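
The last plmem.cu hunk replaces cudaMemsetAsync(&...d_long_seg_count, ...) with cudaMemset(...d_long_seg_count, ...): the earlier form passed the address of the host-side pointer variable rather than the device pointer it holds, which is not a valid device address. A small illustrative sketch of the difference (not part of this commit):

// memset_counter_sketch.cu -- illustrative only
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    unsigned int *d_count = nullptr;
    cudaMalloc(&d_count, sizeof(unsigned int));

    cudaMemset(d_count, 0, sizeof(unsigned int));      // correct: d_count is a device pointer
    // cudaMemset(&d_count, 0, sizeof(unsigned int));  // wrong: &d_count is a host address

    unsigned int h_count = 42;
    cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    printf("counter after reset = %u\n", h_count);     // prints 0
    cudaFree(d_count);
    return 0;
}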
11 changes: 6 additions & 5 deletions gpu/plmem.cuh
@@ -74,11 +74,11 @@ typedef struct {

// cut
size_t *d_cut; // cut
unsigned int *d_long_seg_count;
seg_t *d_long_seg;
seg_t *d_long_seg_og;
unsigned int *d_mid_seg_count;
seg_t *d_mid_seg;
unsigned int *d_long_seg_count; // total number of long seg (aggregated accross micro batches)
seg_t *d_long_seg; // start & end idx of long segs in the long seg buffer (aggregated across micro batches)
seg_t *d_long_seg_og; // start & end idx of long seg in the micro batch. (aggregated accross micro batches)
unsigned int *d_mid_seg_count; // private to micro batch
seg_t *d_mid_seg; // private to micro batch

// long segement buffer
int32_t *d_ax_long, *d_ay_long;
@@ -97,6 +97,7 @@ typedef struct stream_ptr_t{
deviceMemPtr dev_mem;
cudaStream_t cudastream;
cudaEvent_t stopevent, startevent, long_kernel_event;
cudaEvent_t short_kernel_event[MICRO_BATCH];
bool busy = false;
} stream_ptr_t;

2 changes: 1 addition & 1 deletion gpu/plrange.cu
@@ -255,7 +255,7 @@ void plrange_async_range_selection(deviceMemPtr* dev_mem, cudaStream_t* stream)
dev_mem->d_range, dev_mem->d_cut, dev_mem->d_cut_start_idx, total_n, range_kernel_config);
cudaCheck();
#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] %s (%s:%d): Batch total_n %lu, Range Kernel Launched, grid %d cut %d\n", __func__, __FILE__, __LINE__, total_n, DimGrid.x, cut_num);
// fprintf(stderr, "[Info] %s (%s:%d): Batch total_n %lu, Range Kernel Launched, grid %d cut %d\n", __func__, __FILE__, __LINE__, total_n, DimGrid.x, cut_num);
#endif
}

16 changes: 10 additions & 6 deletions gpu/plscore.cu
@@ -290,16 +290,16 @@ __global__ void score_generation_short(
size_t long_seg_start_idx;
if (tid == 0) {
/* Allocate space in long seg buffer */
long_seg_start_idx = atomicAdd(total_n_long, end_idx - start_idx);
long_seg_start_idx = atomicAdd((unsigned long long int*)total_n_long, (unsigned long long int)end_idx - start_idx);
if (long_seg_start_idx + (end_idx - start_idx) >= buffer_size_long){ // long segement buffer is full
atomicSub(total_n_long, end_idx - start_idx); // rollback total_n_long
atomicAdd((unsigned long long int*)total_n_long, (unsigned long long int)end_idx - start_idx); // rollback total_n_long
long_seg_start_idx = SIZE_MAX;
// fallback to mid kernel
int mid_seg_idx = atomicAdd(mid_seg_count, 1);
int mid_seg_idx = atomicAdd((unsigned long long int*)mid_seg_count, 1);
mid_seg[mid_seg_idx].start_idx = start_idx;
mid_seg[mid_seg_idx].end_idx = end_idx;
} else {
int long_seg_idx = atomicAdd(long_seg_count, 1);
int long_seg_idx = atomicAdd((unsigned long long int*)long_seg_count, 1);
long_seg[long_seg_idx].start_idx = long_seg_start_idx;
long_seg[long_seg_idx].end_idx = long_seg_start_idx + (end_idx - start_idx);
long_seg_og[long_seg_idx].start_idx = start_idx;
@@ -312,7 +312,11 @@
}
}
// broadcast long_seg_start_idx to all scalar registers
#ifdef USEHIP
long_seg_start_idx = __builtin_amdgcn_readfirstlane(long_seg_start_idx);
#else
long_seg_start_idx = __shfl_sync(0xffffffff, long_seg_start_idx, 0);
#endif
if (long_seg_start_idx == SIZE_MAX)
continue; // failed to allocate long_seg buffer
for (uint64_t idx = tid; idx < end_idx - start_idx; idx += blockDim.x){
@@ -536,7 +540,7 @@ void plscore_async_short_mid_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* str
cudaCheck();

#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] %s (%s:%d) short mid score generation success\n", __func__, __FILE__, __LINE__);
// fprintf(stderr, "[Info] %s (%s:%d) short mid score kernel launched\n", __func__, __FILE__, __LINE__);
#endif

cudaCheck();
@@ -562,7 +566,7 @@ void plscore_async_long_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream)
cudaCheck();

#ifdef DEBUG_PRINT
fprintf(stderr, "[Info] %s (%s:%d) long score generation success\n", __func__, __FILE__, __LINE__);
// fprintf(stderr, "[Info] %s (%s:%d) long score generation launched\n", __func__, __FILE__, __LINE__);
#endif

cudaCheck();
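
Two portability points in the plscore.cu hunks above: the atomicAdd calls are explicitly cast to select the unsigned long long overload, and the broadcast of lane 0's long_seg_start_idx now uses __builtin_amdgcn_readfirstlane on AMD versus __shfl_sync on NVIDIA. A minimal, self-contained sketch of that lane-0 broadcast pattern (illustrative only; the value and lane choices are stand-ins):

// lane0_broadcast_sketch.cu -- illustrative only
#include <cstdio>
#include <cuda_runtime.h>

__global__ void broadcast_demo(unsigned int *out) {
    int tid = threadIdx.x;
    unsigned int idx = 0;
    if (tid == 0) idx = 12345;                      // e.g. an offset returned by atomicAdd
#ifdef USEHIP
    idx = __builtin_amdgcn_readfirstlane(idx);      // AMD: read lane 0's value into a scalar
#else
    idx = __shfl_sync(0xffffffff, idx, 0);          // NVIDIA: warp shuffle from lane 0
#endif
    if (tid == 5) *out = idx;                       // every lane now holds lane 0's value
}

int main() {
    unsigned int *d_out = nullptr, h_out = 0;
    cudaMalloc(&d_out, sizeof(unsigned int));
    broadcast_demo<<<1, 32>>>(d_out);
    cudaMemcpy(&h_out, d_out, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    printf("lane 5 sees %u\n", h_out);              // prints 12345
    cudaFree(d_out);
    return 0;
}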
Empty file modified scripts/acc_integrated.slurm
100644 → 100755
Empty file.
5 changes: 5 additions & 0 deletions scripts/mbit10_integrated.sh
@@ -0,0 +1,5 @@
#!/bin/bash

make clean
make MICRO_BATCH=4 GPU=NV GPU_CONFIG=a6000_config.json SHORT_BLOCK_SIZE=32 LONG_BLOCK_SIZE=1024 MID_BLOCK_SIZE=512 MID_CUT=1 LONG_CUT=100 DEBUG_ANALYSIS=1
./minimap2 -t 1 --max-chain-skip=2147483647 data/hg38.mmi data/random_500MBases_90kto100k.fa #--gpu-chain
