Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gpu kernel break #13

Merged
merged 34 commits into from
Mar 14, 2024
Merged
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
5598718
add omnitrace scripts
xenshinu Oct 26, 2023
5e1abe8
finish minibatch, parameter is still hardcoded, debug function need f…
xenshinu Oct 27, 2023
cb1a30e
no reset long seg on each micro batch cause fault
xenshinu Oct 27, 2023
1a585ab
Add acc_config. FIX seg fault for long_seg_count reset
joydddd Oct 30, 2023
85f1cbb
finish microbatch design, TODO: add batch number to config, and use h…
xenshinu Nov 4, 2023
aa62643
use hostmalloc to avoid step1 delay
xenshinu Jan 10, 2024
15d0003
change script path
joydddd Jan 11, 2024
216f2b2
Add kernel throughput calculation
joydddd Jan 26, 2024
8ee89c1
update scripts
xenshinu Feb 6, 2024
4aeacfd
add sorting technique
xenshinu Feb 8, 2024
e1248fb
Update throughput calculation, a6000 config
joydddd Feb 9, 2024
d9e7396
add atomic runtime balancing
xenshinu Feb 9, 2024
accea33
Merge branch 'gpu_kernel-break' of github.com:Minimap2onGPU/minimap2 …
xenshinu Feb 9, 2024
cff2e27
debug info control
xenshinu Feb 9, 2024
b21c5f9
Update debug analysis
joydddd Feb 13, 2024
39f758b
Edit throughput calculation. JIT Compilat error on cuda, push to try …
joydddd Feb 16, 2024
9678399
Fix throughput analysis
joydddd Feb 16, 2024
c40bf9f
fix atomicadd -> atomicsub, TODO: add more cudaCheck
xenshinu Feb 17, 2024
66e2e54
fix atomic add in long seg, only first thread in block add the atomic
xenshinu Feb 22, 2024
474e746
Temporary fix for microbatching error (Use CPU kernel)
joydddd Feb 23, 2024
3239112
Add put long segs back to original reads, but output seems to be wrong??
joydddd Feb 23, 2024
907748f
add seg count
xenshinu Feb 24, 2024
6836cb7
Merge branch 'gpu_kernel-break' of github.com:Minimap2onGPU/minimap2 …
xenshinu Feb 24, 2024
d8ba447
Remove skip backtracking in GPU implementation. Outputs are correct
joydddd Feb 26, 2024
94f333a
update plscore
xenshinu Feb 28, 2024
c982bb5
Merge branch 'gpu_kernel-break' of github.com:Minimap2onGPU/minimap2 …
xenshinu Feb 28, 2024
eed2640
comment in kernel print to maximize tp
xenshinu Mar 1, 2024
12c3fc0
config aac that maximize memory usage
xenshinu Mar 2, 2024
5c742d9
Add data analysis script
joydddd Mar 6, 2024
f7ecde1
Add data analysis script
joydddd Mar 6, 2024
13b2eab
Add range distribution analysis
joydddd Mar 14, 2024
4d2459c
Merge branch 'gpu_kernel-break' of github.com:Minimap2onGPU/minimap2 …
joydddd Mar 14, 2024
ad70b6a
cleanup gpu code for open source. TODO: Add README
joydddd Mar 14, 2024
3218873
Update print compile time config
joydddd Mar 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
cleanup gpu code for open source. TODO: Add README
joydddd committed Mar 14, 2024

Verified

This commit was signed with the committer’s verified signature.
rouault Even Rouault
commit ad70b6a3c9675a7ea26630470407ac54a52f1083
18 changes: 12 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CFLAGS= -O2 -g -DNDEBUG
CDEBUG_FLAGS= -g -DDEBUG_PRINT -O2 #-Wall -Wextra -Wno-unused-parameter -Wno-unused-variable -Wno-sign-compare -Wno-unused-function -Wno-c++17-extensions -Wno-\#warnings #-O0 -DNDEBUG
CDEBUG_FLAGS= -g -O2 #-Wall -Wextra -Wno-unused-parameter -Wno-unused-variable -Wno-sign-compare -Wno-unused-function -Wno-c++17-extensions -Wno-\#warnings #-O0 -DNDEBUG
CPPFLAGS= -DHAVE_KALLOC -D__AMD_SPLIT_KERNELS__ # -Wno-unused-but-set-variable -Wno-unused-variable
CPPFLAGS+= $(if $(MICRO_BATCH),-DMICRO_BATCH=\($(MICRO_BATCH)\))
INCLUDES= -I .
@@ -36,12 +36,18 @@ ifneq ($(tsan),)
LIBS+=-fsanitize=thread
endif

ifneq ($(DEBUG),) # turn on debug flags
CFLAGS = $(CDEBUG_FLAGS)

# turn on debug flags
ifeq ($(DEBUG),info)
CFLAGS += -DDEBUG_PRINT
endif
ifeq ($(DEBUG), analyze)
CFLAGS += $(CDEBUG_FLAGS)
CFLAGS += -DDEBUG_CHECK -DDEBUG_PRINT
endif
ifneq ($(DEBUG_ANALYSIS),) # turn on debug flags
CFLAGS = $(CDEBUG_FLAGS)
CFLAGS += -DDEBUG_CHECK -DDEBUG_VERBOSE
ifeq ($(DEBUG), verbose)
CFLAGS += $(CDEBUG_FLAGS)
CFLAGS += -DDEBUG_CHECK -DDEBUG_PRINT -DDEBUG_VERBOSE
endif

.PHONY:all extra clean depend # profile
173 changes: 164 additions & 9 deletions gpu/debug.c
Original file line number Diff line number Diff line change
@@ -36,7 +36,7 @@ void debug_output_anchors(const char debug_folder[], chain_read_t *in) {
fprintf(f_anchors, "*%d\n", in->rep_len);

/* Read Number of Anchors */
fprintf(f_anchors, "#%d\n", in->n);
fprintf(f_anchors, "#%ld\n", in->n);

/* Read Anchors */
for (int i = 0; i < in->n; i++) {
@@ -118,7 +118,7 @@ void debug_print_successor_range(int32_t *range, int64_t n) {
fprintf(stderr, "[Info] Writing successor range to file %s\n",
fout_range_filename);
}
fprintf(fout_range, "> %ld, len: %ld ", read_idx, n);
fprintf(fout_range, "> %d, len: %ld ", read_idx, n);
for (int64_t i = 0; i < n; ++i) {
fprintf(fout_range, "#%ld: %d, ", i, range[i]);
}
@@ -171,7 +171,7 @@ void debug_print_score(const int64_t *p, const int32_t *score, int64_t n) {
fout_score_filename);
fprintf(fout_score, "@@@<qname\tqlen\n");
}
fprintf(fout_score, "<%ld\t\n", read_idx);
fprintf(fout_score, "<%d\t\n", read_idx);
fprintf(fout_score, "#%ld\n", n);
for (int i = 0; i < n; ++i) {
fprintf(fout_score, "%d,%ld\t", score[i], p[i]);
@@ -197,7 +197,7 @@ void debug_print_score_rel_p(const uint16_t *p, const int32_t *score, int64_t n)
fout_score_filename);
fprintf(fout_score, "@@@<qname\tqlen\n");
}
fprintf(fout_score, "<%ld\t\n", read_idx);
fprintf(fout_score, "<%d\t\n", read_idx);
fprintf(fout_score, "#%ld\n", n);
for (int i = 0; i < n; ++i) {
fprintf(fout_score, "%d,%u\t", score[i], (unsigned int)p[i]);
@@ -223,7 +223,7 @@ void debug_print_chain(mm128_t *a, uint64_t *u, int32_t n_u, char* qname) {
}
fprintf(fout_chain, "<%s\n", qname);
for (int i = 0, j = 0; i < n_u; i++) {
fprintf(fout_chain, "[%d] #%d: ", u[i] >> 32, (uint32_t)u[i]);
fprintf(fout_chain, "[%ld] #%d: ", u[i] >> 32, (uint32_t)u[i]);
for (int new_j = j + (uint32_t)u[i]; j < new_j; j++) {
fprintf(fout_chain, "%lx,%lx ", a[j].x, a[j].y);
}
@@ -446,7 +446,7 @@ void debug_check_range(const int32_t* range, size_t n){
static int read_idx = 0;
for (size_t i = 1; i < n; i++){
if (range[i] < range[i-1] - 1)
fprintf(stderr, "[debug]No realistic range sequence read #%d i %d %d %d\n", read_idx, i, range[i-1], range[i]);
fprintf(stderr, "[debug]No realistic range sequence read #%d i %ld %d %d\n", read_idx, i, range[i-1], range[i]);
}
read_idx++;
}
@@ -462,22 +462,22 @@ int debug_check_cut(const size_t *cut, const int32_t *range, size_t max_cut,
if (cut[cid] != 0 && range[cut[cid] - 1] != 0)
fprintf(
stderr,
"[debug] Cut Error: > %ld len %d, Cut at %zu %zu (%d)\n",
"[debug] Cut Error: > %d Cut at %zu %lu (%d)\n",
read_idx, cut[cid], offset, range[cut[cid] - 1]);
}
if (cid > 0 && cut[cid] != SIZE_MAX){
static size_t prev_cut = 0;
int cut_issue = 0;
for (size_t i = prev_cut; i < cut[cid]; i++) {
if (range[i] + i >= cut[cid]){
fprintf(stderr, "[debug] Cut Error: > %ld cid %d , Cut %zu - %zu, i %zu, range %zu\n",
fprintf(stderr, "[debug] Cut Error: > %d cid %ld , Cut %zu - %zu, i %zu, range %u\n",
read_idx, cid, prev_cut, cut[cid], i, range[i]);
cut_issue = 1;
}
}
if (cut_issue){
for (int i = prev_cut; i < cut[cid]; i++){
fprintf(stderr, "%zu[%d]\t", i, range[i]);
fprintf(stderr, "%u[%d]\t", i, range[i]);
}
fprintf(stderr, "\n");
}
@@ -489,4 +489,159 @@ int debug_check_cut(const size_t *cut, const int32_t *range, size_t max_cut,
return cid;
}




// find long seg range distribution
/*
 * Accumulates a histogram of range[0..total_n) across calls and appends one
 * cumulative row per call to long_range_dis.csv. The file is created on the
 * first call, together with a header row naming every bin 0..5000.
 *
 * total_n  number of entries read from range
 * num_cut  added to the running segment total printed in the first column
 * range    values to histogram; entries outside [0, 5000] are skipped and
 *          reported on stderr instead of indexing past the histogram
 *
 * The histogram, both totals and the output stream are function-static, so
 * every row holds running totals since the first call. The anchor total
 * counts every entry of range, including skipped ones.
 */
void debug_cal_long_seg_range_dis(size_t total_n, size_t num_cut, int32_t* range){
    enum { MAX_RANGE = 5000, NUM_BINS = MAX_RANGE + 1 };
    static uint64_t range_dis[NUM_BINS] = {0};
    static size_t seg_total = 0;
    static uint64_t anchors_total = 0;
    static FILE* fp = NULL;

    /* Checked at run time: an assert() disappears when NDEBUG is defined,
     * which would turn a bad value into an out-of-bounds write. */
    size_t skipped = 0;
    for (size_t i = 0; i < total_n; i++){
        if (range[i] < 0 || range[i] > MAX_RANGE){
            skipped++;
            continue;
        }
        range_dis[range[i]]++;
    }
    if (skipped){
        fprintf(stderr, "[Debug] long_range_dis: skipped %zu range value(s) outside [0, %d]\n",
                skipped, (int)MAX_RANGE);
    }
    anchors_total += total_n;
    seg_total += num_cut;

    if (!fp) {
        fprintf(stderr, "[Debug] Writing to long_range_dis.csv\n");
        fp = fopen("long_range_dis.csv", "w+");
        if (!fp) {
            /* Keep the counters; the next call retries the open. */
            perror("[Debug] cannot open long_range_dis.csv");
            return;
        }
        fprintf(fp, "num_segs,num_anchors");
        for (int i = 0; i < NUM_BINS; i++) fprintf(fp, ",%d", i);
        fprintf(fp, "\n");
    }

    fprintf(fp, "%zusegs,%lluanchors", seg_total, (unsigned long long)anchors_total);
    for (int i = 0; i < NUM_BINS; i++){
        fprintf(fp, ",%llu", (unsigned long long)range_dis[i]);
    }
    fprintf(fp, "\n");
    /* The stream is never closed, so flush to keep rows if the run is killed. */
    fflush(fp);
}


/*
 * Accumulates a histogram of range[0..total_n) across calls and appends one
 * cumulative row per call to mid_range_dis.csv. The file is created on the
 * first call, together with a header row naming every bin 0..5000. Also logs
 * num_cut to stderr on every call.
 *
 * total_n  number of entries read from range
 * num_cut  added to the running segment total printed in the first column
 * range    values to histogram; entries outside [0, 5000] are skipped and
 *          reported on stderr instead of indexing past the histogram
 *
 * The histogram, both totals and the output stream are function-static, so
 * every row holds running totals since the first call. The anchor total
 * counts every entry of range, including skipped ones.
 */
void debug_cal_mid_range_dis(size_t total_n, size_t num_cut, int32_t* range){
    enum { MAX_RANGE = 5000, NUM_BINS = MAX_RANGE + 1 };
    static uint64_t range_dis[NUM_BINS] = {0};
    static size_t seg_total = 0;
    static uint64_t anchors_total = 0;
    static FILE* fp = NULL;

    fprintf(stderr, "[verbose] %zu cuts generated\n", num_cut);

    /* Checked at run time: an assert() disappears when NDEBUG is defined,
     * which would turn a bad value into an out-of-bounds write. */
    size_t skipped = 0;
    for (size_t i = 0; i < total_n; i++){
        if (range[i] < 0 || range[i] > MAX_RANGE){
            skipped++;
            continue;
        }
        range_dis[range[i]]++;
    }
    if (skipped){
        fprintf(stderr, "[Debug] mid_range_dis: skipped %zu range value(s) outside [0, %d]\n",
                skipped, (int)MAX_RANGE);
    }
    anchors_total += total_n;
    seg_total += num_cut;

    if (!fp) {
        fprintf(stderr, "[Debug] Writing to mid_range_dis.csv\n");
        fp = fopen("mid_range_dis.csv", "w+");
        if (!fp) {
            /* Keep the counters; the next call retries the open. */
            perror("[Debug] cannot open mid_range_dis.csv");
            return;
        }
        fprintf(fp, "num_segs,num_anchors");
        for (int i = 0; i < NUM_BINS; i++) fprintf(fp, ",%d", i);
        fprintf(fp, "\n");
    }

    fprintf(fp, "%zusegs,%lluanchors", seg_total, (unsigned long long)anchors_total);
    for (int i = 0; i < NUM_BINS; i++){
        fprintf(fp, ",%llu", (unsigned long long)range_dis[i]);
    }
    fprintf(fp, "\n");
    /* The stream is never closed, so flush to keep rows if the run is killed. */
    fflush(fp);
}


// range distribution
/*
 * Accumulates a histogram of range[0..total_n) across calls and appends one
 * cumulative row per call to range_dis.csv. The file is created on the first
 * call, together with a header row naming every bin 0..5000. Also logs
 * num_cut to stderr on every call.
 *
 * total_n  number of entries read from range
 * num_cut  added to the running segment total printed in the first column
 * range    values to histogram; entries outside [0, 5000] are skipped and
 *          reported on stderr instead of indexing past the histogram
 *
 * The histogram, both totals and the output stream are function-static, so
 * every row holds running totals since the first call. The anchor total
 * counts every entry of range, including skipped ones.
 */
void debug_cal_range_dis(size_t total_n, size_t num_cut, int32_t* range){
    enum { MAX_RANGE = 5000, NUM_BINS = MAX_RANGE + 1 };
    static uint64_t range_dis[NUM_BINS] = {0};
    static size_t seg_total = 0;
    static uint64_t anchors_total = 0;
    static FILE* fp = NULL;

    fprintf(stderr, "[verbose] %zu cuts generated\n", num_cut);

    /* Checked at run time: an assert() disappears when NDEBUG is defined,
     * which would turn a bad value into an out-of-bounds write. */
    size_t skipped = 0;
    for (size_t i = 0; i < total_n; i++){
        if (range[i] < 0 || range[i] > MAX_RANGE){
            skipped++;
            continue;
        }
        range_dis[range[i]]++;
    }
    if (skipped){
        fprintf(stderr, "[Debug] range_dis: skipped %zu range value(s) outside [0, %d]\n",
                skipped, (int)MAX_RANGE);
    }
    anchors_total += total_n;
    seg_total += num_cut;

    if (!fp) {
        fprintf(stderr, "[Debug] Writing to range_dis.csv\n");
        fp = fopen("range_dis.csv", "w+");
        if (!fp) {
            /* Keep the counters; the next call retries the open. */
            perror("[Debug] cannot open range_dis.csv");
            return;
        }
        fprintf(fp, "num_segs,num_anchors");
        for (int i = 0; i < NUM_BINS; i++) fprintf(fp, ",%d", i);
        fprintf(fp, "\n");
    }

    fprintf(fp, "%zusegs,%lluanchors", seg_total, (unsigned long long)anchors_total);
    for (int i = 0; i < NUM_BINS; i++){
        fprintf(fp, ",%llu", (unsigned long long)range_dis[i]);
    }
    fprintf(fp, "\n");
    /* The stream is never closed, so flush to keep rows if the run is killed. */
    fflush(fp);
}

#define fine_grind 30
// sc pair vs. seg length
/*
 * Bins segments by "cut_size", the number of SIZE_MAX entries seen in cut[]
 * since the previous real cut. For each real cut it adds to that bin:
 *   - the sum of range[i] for i in [previous cut, this cut)   (sc_pairs)
 *   - the number of indices in that span                      (anchors)
 *   - one                                                     (segs)
 * It then appends the three cumulative rows to sc_pair_dis.csv, which is
 * created with a header row on the first call.
 *
 * Bin layout (528 bins, one per header column):
 *   bins 0..29    cut_size 0..29, one bin each
 *   bins 30..526  cut_size 30..4999, ten per bin, labelled 30, 40, ..., 4990
 *   bin  527      cut_size >= 5000, labelled 5000
 *
 * total_n  not used by this function
 * num_cut  number of entries in cut
 * cut      cut positions used as indices into range; SIZE_MAX entries are
 *          counted, not used as positions
 * range    values summed per segment
 *
 * The three histograms and the output stream are function-static, so rows
 * hold running totals since the first call.
 */
void debug_cal_sc_pair_density(size_t total_n, size_t num_cut, size_t* cut, int32_t* range){
    // bin width: 10 cuts, max 5000 cuts
    enum {
        COARSE_WIDTH  = 10,
        COARSE_LIMIT  = 500,
        /* Shift that places the first coarse bin (cut_size 30..39) right
         * after the last fine bin: 30/10 + 27 == 30. */
        COARSE_OFFSET = fine_grind - fine_grind / COARSE_WIDTH,
        OVERFLOW_BIN  = COARSE_LIMIT + COARSE_OFFSET,
        NUM_BINS      = OVERFLOW_BIN + 1
    };
    static uint64_t sc_pair_dis[NUM_BINS] = {0}; // number of sc pairs for each seg length
    static uint64_t anchors_dis[NUM_BINS] = {0};
    static uint64_t seg_dis[NUM_BINS] = {0};
    static FILE* f_sc_pair_dis = NULL;

    (void)total_n;

    uint64_t start_idx = 0, cut_size = 0;
    for (size_t cid = 0; cid < num_cut; cid++) {
        if (cut[cid] == SIZE_MAX) {
            ++cut_size;
            continue;
        }

        uint64_t sc_pair_num = 0;
        for (uint64_t i = start_idx; i < cut[cid]; i++){
            sc_pair_num += range[i];
        }

        size_t bin;
        if (cut_size < fine_grind){
            bin = (size_t)cut_size;
        } else if (cut_size / COARSE_WIDTH < COARSE_LIMIT) {
            bin = (size_t)(cut_size / COARSE_WIDTH) + COARSE_OFFSET;
        } else {
            bin = OVERFLOW_BIN;
        }
        sc_pair_dis[bin] += sc_pair_num;
        anchors_dis[bin] += cut[cid] - start_idx;
        seg_dis[bin]++;

        cut_size = 0;
        start_idx = cut[cid];
    }

    if (!f_sc_pair_dis){
        f_sc_pair_dis = fopen("sc_pair_dis.csv", "w+");
        if (!f_sc_pair_dis){
            /* Keep the counters; the next call retries the open. */
            perror("[Verbose] cannot open sc_pair_dis.csv");
            return;
        }
        fprintf(stderr, "[Verbose] writing to sc_pair_dis.csv");
        fprintf(f_sc_pair_dis, "seg_len");
        for (int i = 0; i < fine_grind; i++){
            fprintf(f_sc_pair_dis, ",%d", i);
        }
        for (int i = fine_grind / COARSE_WIDTH; i <= COARSE_LIMIT; i++){
            fprintf(f_sc_pair_dis, ",%d", i * COARSE_WIDTH);
        }
        fprintf(f_sc_pair_dis, "\n");
    }

    const char *row_label[3] = { "sc_pairs", "anchors", "segs" };
    const uint64_t *row_data[3] = { sc_pair_dis, anchors_dis, seg_dis };
    for (int r = 0; r < 3; r++){
        fprintf(f_sc_pair_dis, "%s", row_label[r]);
        for (int i = 0; i < NUM_BINS; i++){
            fprintf(f_sc_pair_dis, ",%llu", (unsigned long long)row_data[r][i]);
        }
        fprintf(f_sc_pair_dis, "\n");
    }
    fflush(f_sc_pair_dis);
}

#endif // DEBUG_CHECK
8 changes: 8 additions & 0 deletions gpu/debug.h
Original file line number Diff line number Diff line change
@@ -49,6 +49,14 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip,
);
#endif // DEBUG_CHECK_FORCE



// Analyze Distribution
// Helpers that accumulate statistics in function-static counters across calls
// and append cumulative rows to CSV files opened by relative name.

// Histogram range[0..total_n) into bins 0..5000; writes long_range_dis.csv.
void debug_cal_long_seg_range_dis(size_t total_n, size_t num_cut, int32_t* range);
// Same histogram, written to mid_range_dis.csv; also logs num_cut to stderr.
void debug_cal_mid_range_dis(size_t total_n, size_t num_cut, int32_t *range);
// Same histogram, written to range_dis.csv; also logs num_cut to stderr.
void debug_cal_range_dis(size_t total_n, size_t num_cut, int32_t *range);
// Per-segment sums of range[], anchor counts and segment counts, binned by the
// number of SIZE_MAX entries preceding each real cut; writes sc_pair_dis.csv.
void debug_cal_sc_pair_density(size_t total_n, size_t num_cut, size_t* cut, int32_t* range);
#endif // DEBUG_CHECK

#ifdef DEBUG_VERBOSE
7 changes: 1 addition & 6 deletions gpu/gpu.mk
Original file line number Diff line number Diff line change
@@ -45,15 +45,10 @@ else
GPU_TESTFL = $(CUDATESTFLAG)
endif

ifneq ($(DEBUG),)
ifeq ($(DEBUG),analyze)
GPU_FLAGS += $(GPU_TESTFL)
endif

ifneq ($(DEBUG_ANALYSIS),)
GPU_FLAGS += $(GPU_TESTFL)
endif


%.o: %.cu
$(GPU_CC) -c $(GPU_FLAGS) $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $(CONFIG) $< -o $@

Loading