From 12c3fc0f0dea2a133e49745d4b9209ca71959daa Mon Sep 17 00:00:00 2001
From: Xueshen Liu
Date: Fri, 1 Mar 2024 22:50:35 -0600
Subject: [PATCH] config aac that maximizes memory usage

---
 aac_config.json              |  2 +-
 gpu/plchain.cu               | 15 ++++++++-------
 gpu/plscore.cu               |  4 ++++
 scripts/acc_integrated.slurm | 13 ++++++++++++-
 4 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/aac_config.json b/aac_config.json
index cac1d9e0..efcdaf2f 100644
--- a/aac_config.json
+++ b/aac_config.json
@@ -21,7 +21,7 @@
         "mid_blockdim": 64,
         "//blockdim config": "options are not used: static config specified at compile time (make ... LONG_BLOCK_SIZE=1024)",
         "short_griddim": 16128,
-        "long_griddim": 208,
+        "long_griddim": 150,
         "mid_griddim": 16128
     }
 }
\ No newline at end of file
diff --git a/gpu/plchain.cu b/gpu/plchain.cu
index 461167a8..948f56f0 100644
--- a/gpu/plchain.cu
+++ b/gpu/plchain.cu
@@ -104,6 +104,8 @@ void plchain_backtracking(hostMemPtr *host_mem, chain_read_t *reads, Misc misc,
     uint16_t* p_hostmem = host_mem->p;
     int32_t* f = host_mem->f;
 
+    // FIXME: DISABLED BACKTRACK, REMOVE THE RETURN HERE
+    return;
     for (int i = 0; i < n_read; i++) {
         int64_t* p;
         KMALLOC(km, p, reads[i].n);
@@ -803,24 +805,23 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
 #ifdef USEHIP
     roctxRangePop();
 #endif
-
+    plmem_async_h2d_short_memcpy(&stream_setup.streams[stream_id], uid);
+    // step3: range selection
 #ifdef DEBUG_PRINT
     cudaEventRecord(stream_setup.streams[stream_id].short_kernel_start_event[uid],
                     stream_setup.streams[stream_id].cudastream);
 #endif // DEBUG_PRINT
-    plmem_async_h2d_short_memcpy(&stream_setup.streams[stream_id], uid);
-    // step3: range selection
     plrange_async_range_selection(&stream_setup.streams[stream_id].dev_mem,
                                   &stream_setup.streams[stream_id].cudastream);
     // step4: score generation for short and mid segs
     plscore_async_short_mid_forward_dp(&stream_setup.streams[stream_id].dev_mem,
                                        &stream_setup.streams[stream_id].cudastream);
-    // step5: copy short and mid results back
-    plmem_async_d2h_short_memcpy(&stream_setup.streams[stream_id], uid);
 #ifdef DEBUG_PRINT
     cudaEventRecord(stream_setup.streams[stream_id].short_kernel_stop_event[uid],
                     stream_setup.streams[stream_id].cudastream);
 #endif // DEBUG_PRINT
+    // step5: copy short and mid results back
+    plmem_async_d2h_short_memcpy(&stream_setup.streams[stream_id], uid);
 
     // update index
     read_start = read_end;
@@ -872,9 +873,9 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
                     stream_setup.streams[stream_id].cudastream);
     plscore_async_long_forward_dp(&stream_setup.streams[stream_id].dev_mem,
                                   &stream_setup.streams[stream_id].cudastream);
-    plmem_async_d2h_long_memcpy(&stream_setup.streams[stream_id]);
     cudaEventRecord(stream_setup.streams[stream_id].stopevent,
                     stream_setup.streams[stream_id].cudastream);
+    plmem_async_d2h_long_memcpy(&stream_setup.streams[stream_id]);
     stream_setup.streams[stream_id].busy = true;
     cudaCheck();
 }
@@ -922,7 +923,7 @@ void chain_blocking_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t
 // void chain_stream_gpu(const input_meta_t* meta, chain_read_t**in_arr_, int *n_read_) {
 //     static int batchid = 0;
 //     Misc misc = build_misc(INT64_MAX);
-//     chain_stream_gpu(in_arr_, n_read_, misc, stream_setup, batchid);
+//     plchain_cal_score_launch(in_arr_, n_read_, misc, stream_setup, batchid);
 //     batchid++;
 //     if (in_arr_){
 //         int n_read = *n_read_;
diff --git a/gpu/plscore.cu b/gpu/plscore.cu
index 1389054e..cf8ecd31 100644
--- a/gpu/plscore.cu
+++ b/gpu/plscore.cu
@@ -599,6 +599,10 @@ void plscore_async_long_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream)
     size_t buffer_size_long = dev_mem->buffer_size_long;
     dim3 longDimGrid(score_kernel_config.long_griddim, 1, 1);
 
+#ifdef DEBUG_CHECK
+    fprintf(stderr, "[Info] %s (%s:%d) Long Grid Dim = %d\n", __func__, __FILE__, __LINE__, longDimGrid.x);
+#endif
+
 #ifdef __LONG_BLOCK_SIZE__
     // fprintf(stderr, "long block size: %d\n", __LONG_BLOCK_SIZE__);
     score_generation_long_map<__LONG_BLOCK_SIZE__><<>>(
diff --git a/scripts/acc_integrated.slurm b/scripts/acc_integrated.slurm
index 4d2b751a..985d8393 100755
--- a/scripts/acc_integrated.slurm
+++ b/scripts/acc_integrated.slurm
@@ -6,7 +6,7 @@
 #SBATCH --ntasks-per-node=1          # Number of tasks (processes) per node
 #SBATCH --cpus-per-task=16           # Number of CPU cores per task
 #SBATCH --mem=500g                   # Memory per node
-#SBATCH --time=01:40:00              # Maximum execution time (HH:MM:SS)
+#SBATCH --time=10:40:00              # Maximum execution time (HH:MM:SS)
 #SBATCH --output=slurm_output/sample_sbatch_job.%j.out   # Output file
 #SBATCH --error=slurm_output/sample_sbatch_job.%j.err    # Error file
 
@@ -39,7 +39,18 @@ cd $MM2_ROOT
 # export AMD_LOG_LEVEL=4
 make clean
 make MICRO_BATCH=5 GPU_CONFIG=aac_config.json SHORT_BLOCK_SIZE=64 LONG_BLOCK_SIZE=1024 MID_BLOCK_SIZE=512 MID_CUT=1 LONG_CUT=40 DEBUG=1 DEBUG_ANALYSIS=1
+./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/long_read_600M.fa
+echo "Exit: $?"
+./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_10kto20k.fa
+echo "Exit: $?"
+./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_40kto50k.fa
+echo "Exit: $?"
+./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_10kto300k.fa
+echo "Exit: $?"
 ./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_90kto100k.fa
+echo "Exit: $?"
+./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_100kto300k.fa
+echo "Exit: $?"
 # ./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_4GBases_10kto300k.fa
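
Note on the gpu/plchain.cu hunk at line 873 (not part of the patch itself): work submitted to a single CUDA/HIP stream executes in issue order, so recording stopevent before enqueueing plmem_async_d2h_long_memcpy means the event no longer covers the device-to-host copy. Any elapsed-time measurement against startevent now excludes the long-segment copy-back, while the copy itself still completes before the stream is synchronized. The standalone CUDA sketch below illustrates that ordering; the buffer, its size, and the memset standing in for the long-score kernel are invented for illustration, and only standard CUDA runtime calls are used.

// Sketch: event recorded before an async D2H copy on the same stream.
// The measured interval excludes the copy; the copy still runs in order.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    const size_t n = 1 << 24;                  // arbitrary buffer size
    float *d_buf, *h_buf;
    cudaMalloc(&d_buf, n * sizeof(float));
    cudaMallocHost(&h_buf, n * sizeof(float)); // pinned host memory for async copy

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, stream);
    cudaMemsetAsync(d_buf, 0, n * sizeof(float), stream); // stand-in for the long-seg kernel
    // Recording `stop` here, before the copy is enqueued, keeps the D2H
    // transfer out of the timed interval; stream order still guarantees
    // the copy runs after this point and before the synchronize below.
    cudaEventRecord(stop, stream);
    cudaMemcpyAsync(h_buf, d_buf, n * sizeof(float), cudaMemcpyDeviceToHost, stream);

    cudaStreamSynchronize(stream);             // waits for the copy as well
    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("kernel-only time (copy excluded): %.3f ms\n", ms);

    cudaFree(d_buf);
    cudaFreeHost(h_buf);
    cudaStreamDestroy(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}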