Skip to content

Commit

Permalink
Configure aac to maximize memory usage
Browse files Browse the repository at this point in the history
  • Loading branch information
xenshinu committed Mar 2, 2024
1 parent eed2640 commit 12c3fc0
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 9 deletions.
2 changes: 1 addition & 1 deletion aac_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"mid_blockdim": 64,
"//blockdim config": "options are not used: static config specified at compile time (make ... LONG_BLOCK_SIZE=1024)",
"short_griddim": 16128,
"long_griddim": 208,
"long_griddim": 150,
"mid_griddim": 16128
}
}
15 changes: 8 additions & 7 deletions gpu/plchain.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ void plchain_backtracking(hostMemPtr *host_mem, chain_read_t *reads, Misc misc,

uint16_t* p_hostmem = host_mem->p;
int32_t* f = host_mem->f;
// FIXME: DISABLED BACKTRACK, REMOVE THE RETURN HERE
return;
for (int i = 0; i < n_read; i++) {
int64_t* p;
KMALLOC(km, p, reads[i].n);
Expand Down Expand Up @@ -803,24 +805,23 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
#ifdef USEHIP
roctxRangePop();
#endif

plmem_async_h2d_short_memcpy(&stream_setup.streams[stream_id], uid);
// step3: range selection
#ifdef DEBUG_PRINT
cudaEventRecord(stream_setup.streams[stream_id].short_kernel_start_event[uid],
stream_setup.streams[stream_id].cudastream);
#endif // DEBUG_PRINT
plmem_async_h2d_short_memcpy(&stream_setup.streams[stream_id], uid);
// step3: range selection
plrange_async_range_selection(&stream_setup.streams[stream_id].dev_mem,
&stream_setup.streams[stream_id].cudastream);
// step4: score generation for short and mid segs
plscore_async_short_mid_forward_dp(&stream_setup.streams[stream_id].dev_mem,
&stream_setup.streams[stream_id].cudastream);
// step5: copy short and mid results back
plmem_async_d2h_short_memcpy(&stream_setup.streams[stream_id], uid);
#ifdef DEBUG_PRINT
cudaEventRecord(stream_setup.streams[stream_id].short_kernel_stop_event[uid],
stream_setup.streams[stream_id].cudastream);
#endif // DEBUG_PRINT
// step5: copy short and mid results back
plmem_async_d2h_short_memcpy(&stream_setup.streams[stream_id], uid);
// update index
read_start = read_end;

Expand Down Expand Up @@ -872,9 +873,9 @@ void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, str
stream_setup.streams[stream_id].cudastream);
plscore_async_long_forward_dp(&stream_setup.streams[stream_id].dev_mem,
&stream_setup.streams[stream_id].cudastream);
plmem_async_d2h_long_memcpy(&stream_setup.streams[stream_id]);
cudaEventRecord(stream_setup.streams[stream_id].stopevent,
stream_setup.streams[stream_id].cudastream);
plmem_async_d2h_long_memcpy(&stream_setup.streams[stream_id]);
stream_setup.streams[stream_id].busy = true;
cudaCheck();
}
Expand Down Expand Up @@ -922,7 +923,7 @@ void chain_blocking_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t
// void chain_stream_gpu(const input_meta_t* meta, chain_read_t**in_arr_, int *n_read_) {
// static int batchid = 0;
// Misc misc = build_misc(INT64_MAX);
// chain_stream_gpu(in_arr_, n_read_, misc, stream_setup, batchid);
// plchain_cal_score_launch(in_arr_, n_read_, misc, stream_setup, batchid);
// batchid++;
// if (in_arr_){
// int n_read = *n_read_;
Expand Down
4 changes: 4 additions & 0 deletions gpu/plscore.cu
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,10 @@ void plscore_async_long_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream)
size_t buffer_size_long = dev_mem->buffer_size_long;
dim3 longDimGrid(score_kernel_config.long_griddim, 1, 1);

#ifdef DEBUG_CHECK
fprintf(stderr, "[Info] %s (%s:%d) Long Grid Dim = %d\n", __func__, __FILE__, __LINE__, longDimGrid.x);
#endif

#ifdef __LONG_BLOCK_SIZE__
// fprintf(stderr, "long block size: %d\n", __LONG_BLOCK_SIZE__);
score_generation_long_map<__LONG_BLOCK_SIZE__><<<longDimGrid, dim3(__LONG_BLOCK_SIZE__, 1, 1), 0, *stream>>>(
Expand Down
13 changes: 12 additions & 1 deletion scripts/acc_integrated.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#SBATCH --ntasks-per-node=1 # Number of tasks (processes) per node
#SBATCH --cpus-per-task=16 # Number of CPU cores per task
#SBATCH --mem=500g # Memory per node
#SBATCH --time=01:40:00 # Maximum execution time (HH:MM:SS)
#SBATCH --time=10:40:00 # Maximum execution time (HH:MM:SS)
#SBATCH --output=slurm_output/sample_sbatch_job.%j.out # Output file
#SBATCH --error=slurm_output/sample_sbatch_job.%j.err # Error file

Expand Down Expand Up @@ -39,7 +39,18 @@ cd $MM2_ROOT
# export AMD_LOG_LEVEL=4
make clean
make MICRO_BATCH=5 GPU_CONFIG=aac_config.json SHORT_BLOCK_SIZE=64 LONG_BLOCK_SIZE=1024 MID_BLOCK_SIZE=512 MID_CUT=1 LONG_CUT=40 DEBUG=1 DEBUG_ANALYSIS=1
./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/long_read_600M.fa
echo "Exit: $?"
./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_10kto20k.fa
echo "Exit: $?"
./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_40kto50k.fa
echo "Exit: $?"
./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_10kto300k.fa
echo "Exit: $?"
./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_90kto100k.fa
echo "Exit: $?"
./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_100kto300k.fa
echo "Exit: $?"
# ./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_4GBases_10kto300k.fa


Expand Down

0 comments on commit 12c3fc0

Please sign in to comment.