diff --git a/.gitignore b/.gitignore index d5a6dd6b..8037a31a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,17 @@ *.dSYM minimap2 mappy.c +data +.vscode/** +test.sam +*.sam +Log/** +debug/** +verf +trace +ncu +nsys +*_output* +workloads +.cmake/** +.depend \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index a80f848d..7707bf77 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "lib/simde"] path = lib/simde url = https://github.com/nemequ/simde.git +[submodule "cJSON"] + path = cJSON + url = https://github.com/DaveGamble/cJSON.git diff --git a/LICENSE.txt b/LICENSE.txt index 1a06f649..e3089e3b 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -2,6 +2,7 @@ The MIT License Copyright (c) 2018- Dana-Farber Cancer Institute 2017-2018 Broad Institute, Inc. + 2022 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/Makefile b/Makefile index 4118616a..0f32d289 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,16 @@ -CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra -CPPFLAGS= -DHAVE_KALLOC -INCLUDES= +CFLAGS= -O2 -g -DNDEBUG +CDEBUG_FLAGS= -g -O2 #-Wall -Wextra -Wno-unused-parameter -Wno-unused-variable -Wno-sign-compare -Wno-unused-function -Wno-c++17-extensions -Wno-\#warnings #-O0 -DNDEBUG +CPPFLAGS= -DHAVE_KALLOC -D__AMD_SPLIT_KERNELS__ # -Wno-unused-but-set-variable -Wno-unused-variable +CPPFLAGS+= $(if $(MAX_MICRO_BATCH),-DMAX_MICRO_BATCH=\($(MAX_MICRO_BATCH)\)) +INCLUDES= -I . 
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \ lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ ksw2_ll_sse.o -PROG= minimap2 +# PROG= minimap2-zerobranch-debug +# PROG= minimap2-nobalance-debug +PROG= minimap2$(SUFFIX) PROG_EXTRA= sdust minimap2-lite -LIBS= -lm -lz -lpthread +LIBS= -lm -lz -lpthread ifeq ($(arm_neon),) # if arm_neon is not defined ifeq ($(sse2only),) # if sse2only is not defined @@ -34,7 +38,21 @@ ifneq ($(tsan),) LIBS+=-fsanitize=thread endif -.PHONY:all extra clean depend + +# turn on debug flags +ifeq ($(DEBUG),info) + CFLAGS += -DDEBUG_PRINT +endif +ifeq ($(DEBUG), analyze) + CFLAGS += $(CDEBUG_FLAGS) + CFLAGS += -DDEBUG_CHECK -DDEBUG_PRINT +endif +ifeq ($(DEBUG), verbose) + CFLAGS += $(CDEBUG_FLAGS) + CFLAGS += -DDEBUG_CHECK -DDEBUG_PRINT -DDEBUG_VERBOSE +endif + +.PHONY:all extra clean depend # profile .SUFFIXES:.c .o .c.o: @@ -44,14 +62,25 @@ all:$(PROG) extra:all $(PROG_EXTRA) -minimap2:main.o libminimap2.a - $(CC) $(CFLAGS) main.o -o $@ -L. -lminimap2 $(LIBS) +# build cJSON +CJSON_OBJ= cJSON/cJSON.o +INCLUDES += -I cJSON +$(CJSON_OBJ): + make -C cJSON + +# build kernel objs +include gpu/gpu.mk + + +# compile with nvcc/hipcc +$(PROG):main.o libminimap2.a + $(GPU_CC) $(CFLAGS) $(GPU_FLAGS) main.o -o $@ -L. -lminimap2 $(LIBS) minimap2-lite:example.o libminimap2.a - $(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS) + $(GPU_CC) $(CFLAGS) $(GPU_FLAGS) $< -o $@ -L. 
-lminimap2 $(LIBS) -libminimap2.a:$(OBJS) - $(AR) -csru $@ $(OBJS) +libminimap2.a:$(OBJS) $(CU_OBJS) $(CJSON_OBJ) + $(AR) -csru $@ $^ sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h $(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz @@ -97,7 +126,7 @@ ksw2_exts2_neon.o:ksw2_exts2_sse.c ksw2.h kalloc.h # other non-file targets -clean: +clean: cleangpu rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM build dist mappy*.so mappy.c python/mappy.c mappy.egg* depend: @@ -129,4 +158,4 @@ pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h sdust.o: kalloc.h kdq.h kvec.h sdust.h seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h -splitidx.o: mmpriv.h minimap.h bseq.h kseq.h +splitidx.o: mmpriv.h minimap.h bseq.h kseq.h \ No newline at end of file diff --git a/a6000_config.json b/a6000_config.json new file mode 100644 index 00000000..0c946395 --- /dev/null +++ b/a6000_config.json @@ -0,0 +1,27 @@ +{ + "//config is for": "a6000. Fits one batch + 5% x 4 long buffer avg_read_n 10k", + "num_streams": 1, + "min_n": 512, + "//min_n": "queries with less anchors will be handled on cpu", + "long_seg_buffer_size": 258880000, + "max_total_n": 893440000, + "max_read": 893440, + "avg_read_n": 20000, + "//avg_read_n": "expect average number of anchors per read, not used if max_total_n and max_read are specified", + "range_kernel": { + "blockdim": 512, + "cut_check_anchors": 10, + "//cut_check_anchors": "Number of anchors to check to attemp a cut", + "anchor_per_block": 32768, + "//anchor_per_block": "Number of anchors each block handle. Must be int * blockdim" + }, + "score_kernel": { + "short_blockdim": 64, + "long_blockdim": 64, + "mid_blockdim": 64, + "//blockdim config": "options are not used: static config specified at compile time (make ... 
LONG_BLOCK_SIZE=1024)", + "short_griddim": 2688, + "long_griddim": 1024, + "mid_griddim": 2688 + } +} \ No newline at end of file diff --git a/aac_config.json b/aac_config.json new file mode 100644 index 00000000..c28dcdb4 --- /dev/null +++ b/aac_config.json @@ -0,0 +1,27 @@ +{ + "//config is for": "aac cloud. Fits one batch + 5% x 4 long buffer avg_read_n 10k", + "num_streams": 1, + "min_n": 512, + "//min_n": "queries with less anchors will be handled on cpu", + "long_seg_buffer_size": 1117376000, + "max_total_n": 2036880000, + "max_read": 2036880, + "avg_read_n": 20000, + "//avg_read_n": "expect average number of anchors per read, not used if max_total_n and max_read are specified", + "range_kernel": { + "blockdim": 512, + "cut_check_anchors": 10, + "//cut_check_anchors": "Number of anchors to check to attemp a cut", + "anchor_per_block": 32768, + "//anchor_per_block": "Number of anchors each block handle. Must be int * blockdim" + }, + "score_kernel": { + "short_blockdim": 64, + "long_blockdim": 64, + "mid_blockdim": 64, + "//blockdim config": "options are not used: static config specified at compile time (make ... LONG_BLOCK_SIZE=1024)", + "short_griddim": 16128, + "long_griddim": 208, + "mid_griddim": 16128 + } +} \ No newline at end of file diff --git a/cJSON b/cJSON new file mode 160000 index 00000000..b45f48e6 --- /dev/null +++ b/cJSON @@ -0,0 +1 @@ +Subproject commit b45f48e600671feade0b6bd65d1c69de7899f2be diff --git a/gfx1030.json b/gfx1030.json new file mode 100644 index 00000000..19c0c0aa --- /dev/null +++ b/gfx1030.json @@ -0,0 +1,28 @@ +{ + "//config is for": "AMD Radeon RX 6800 XT on amdxfx. 
Fits one batch + 5% x 4 long buffer avg_read_n 10k", + "num_streams": 1, + "min_n": 512, + "//min_n": "queries with less anchors will be handled on cpu", + "long_seg_buffer_size": 100000000, + "max_total_n": 493440000, + "max_read": 493440, + "avg_read_n": 20000, + "//avg_read_n": "expect average number of anchors per read, not used if max_total_n and max_read are specified", + "range_kernel": { + "blockdim": 512, + "cut_check_anchors": 10, + "//cut_check_anchors": "Number of anchors to check to attemp a cut", + "anchor_per_block": 32768, + "//anchor_per_block": "Number of anchors each block handle. Must be int * blockdim" + }, + "score_kernel": { + "micro_batch": 4, + "mid_blockdim": 512, + "//blockdim config": "options are not used: static config specified at compile time (make ... LONG_BLOCK_SIZE=1024)", + "short_griddim": 2688, + "long_griddim": 144, + "mid_griddim": 2688, + "long_seg_cutoff": 20, + "mid_seg_cutoff": 3 + } +} \ No newline at end of file diff --git a/gpu/debug.c b/gpu/debug.c new file mode 100644 index 00000000..3d4ce9ba --- /dev/null +++ b/gpu/debug.c @@ -0,0 +1,647 @@ +#include "debug.h" + +#include + +#include "plchain.h" + +// FILE *f_anchors = NULL; +// FILE *range_input = NULL; +// FILE *binary = NULL; +// bool use_binary_input = true; +// FILE *range = NULL; + +#ifdef DEBUG_VERBOSE + +///////////////////////////////////////////////////////////////////// +/////////// Print Input Files ///////////////////// +///////////////////////////////////////////////////////////////////// +void debug_output_anchors(const char debug_folder[], chain_read_t *in) { + static FILE *f_anchors = NULL; + if (!f_anchors) { + char anchors_filename[50]; + strcpy(anchors_filename, debug_folder); + strcat(anchors_filename, ".anchor.out"); + if ((f_anchors = fopen(anchors_filename, "w+")) == NULL) { + fprintf(stderr, "[Error] Cannot create output file %s\n", + anchors_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing anchors to file %s\n", + 
anchors_filename); + fprintf(f_anchors, "@@@seq.name, in->seq.len); + fprintf(f_anchors, "*%d\n", in->rep_len); + + /* Read Number of Anchors */ + fprintf(f_anchors, "#%ld\n", in->n); + + /* Read Anchors */ + for (int i = 0; i < in->n; i++) { + fprintf(f_anchors, "%lx,%lx\t", in->a[i].x, in->a[i].y); + } + fprintf(f_anchors, "\n"); +} + +// DEBUG: not used +#if 0 +void debug_output_score(const char debug_folder[], chain_read_t *in) { + static FILE *f_score = NULL; + if (!f_score) { + char score_filename[50]; + strcpy(score_filename, debug_folder); + strcat(score_filename, ".goldscore.out"); + if ((f_score = fopen(score_filename, "w+")) == NULL) { + fprintf(stderr, "[Error] Cannot crea input file %s\n", + score_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing gold score to file %s\n", + score_filename); + fprintf(f_score, "@@@seq.name, in->seq.len); + + /* Write Score and Predecesser */ + fprintf(f_score, "#%ld\n", in->n); + + /* Write score */ + for (int i = 0; i < in->n; i++) { + fprintf(f_score, "%d,%ld\t", in->f[i], in->p[i]); + } + fprintf(f_score, "\n"); +} +#endif + +void debug_output_meta(const char debug_folder[], input_meta_t *meta) { + static FILE *f_metaout = NULL; + if (!f_metaout) { + char *buf = NULL; + size_t len = 0; + char meta_filename[50]; + strcpy(meta_filename, debug_folder); + strcat(meta_filename, ".meta.out"); + if ((f_metaout = fopen(meta_filename, "w+")) == NULL) { + fprintf(stderr, "[Error] Cannot create input file %s\n", + meta_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing meta data to file %s\n", meta_filename); + fprintf(f_metaout, "@@@>tname\ttlen\n"); + } + + /* Write Number of reference Sequences */ + fprintf(f_metaout, "#%d\n", meta->n_refs); + + /* Write Reference Seq metadata */ + for (int i = 0; i < meta->n_refs; i++) { + fprintf(f_metaout, ">%s\t%d\n", meta->refs[i].name, meta->refs[i].len); + } +} + +void debug_print_successor_range(int32_t *range, int64_t n) { + static FILE *fout_range = NULL; + 
static int read_idx = 0; + if (fout_range == NULL) { + char fout_range_filename[50]; + strcpy(fout_range_filename, debug_folder); + strcat(fout_range_filename, ".range.out"); + if ((fout_range = fopen(fout_range_filename, "w+")) == NULL) { + fprintf(stderr, "[Error]: Cannot create range output file: %s \n", + fout_range_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing successor range to file %s\n", + fout_range_filename); + } + fprintf(fout_range, "> %d, len: %ld ", read_idx, n); + for (int64_t i = 0; i < n; ++i) { + fprintf(fout_range, "#%ld: %d, ", i, range[i]); + } + fprintf(fout_range, "\n"); + read_idx++; +} + +int debug_print_cut(const size_t *cut, size_t max_cut, size_t n, + size_t offset, char* qname) { + static FILE *fout_cut = NULL; + static int read_idx = 0; + if (fout_cut == NULL) { + char fout_cut_filename[50]; + strcpy(fout_cut_filename, debug_folder); + strcat(fout_cut_filename, ".cut.out"); + if ((fout_cut = fopen(fout_cut_filename, "w+")) == NULL) { + fprintf(stderr, "[Error]: Cannot create cut output file: %s \n", + fout_cut_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing cut to file %s\n", fout_cut_filename); + } + fprintf(fout_cut, "> %s, len: %ld offset %ld ", qname == NULL ? 
"--" : qname, n, offset); + size_t cid = 0; + for (; cid < max_cut && (cut[cid] < n + offset || cut[cid] == SIZE_MAX); + cid++) { + if (cut[cid] != SIZE_MAX) + fprintf(fout_cut, "%zu(%zu)\t", cut[cid] - offset, cut[cid]); + else + fprintf(fout_cut, "x\t"); + } + fprintf(fout_cut, "\n"); + read_idx++; + return cid; +} + +void debug_print_score(const int64_t *p, const int32_t *score, int64_t n) { + static FILE *fout_score = NULL; + static int read_idx = 0; + if (fout_score == NULL) { + char fout_score_filename[50]; + strcpy(fout_score_filename, debug_folder); + strcat(fout_score_filename, ".score.out"); + if ((fout_score = fopen(fout_score_filename, "w+")) == NULL) { + fprintf(stderr, "[Error]: Cannot create score output file: %s \n", + fout_score_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing score to file %s\n", + fout_score_filename); + fprintf(fout_score, "@@@> 32, (uint32_t)u[i]); + for (int new_j = j + (uint32_t)u[i]; j < new_j; j++) { + fprintf(fout_chain, "%lx,%lx ", a[j].x, a[j].y); + } + fprintf(fout_chain, "\n"); + } +} + +void debug_print_regs(mm_reg1_t* regs, int n_u, char* qname){ + static FILE *fout_regs = NULL; + if (fout_regs == NULL){ + char fout_regs_filename[50]; + strcpy(fout_regs_filename, debug_folder); + strcat(fout_regs_filename, ".regs.out"); + if ((fout_regs = fopen(fout_regs_filename, "w+")) == NULL) { + fprintf(stderr, "[Error]: Cannot create print output file: %s \n", + fout_regs_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing regs to file %s\n", fout_regs_filename); + fprintf(fout_regs, "[regs] \n"); + } + fprintf(fout_regs, "<%s\n", qname); + for (int i = 0; i < n_u; i++){ + fprintf(fout_regs, + "[%d] cnt %d rid %d score %d qs %d qe %d rs %d re %d parent %d " + "subsc %d as %d mlen %d blen %d n_sub %d score0 %d\n", regs[i].id, + regs[i].cnt, regs[i].rid, regs[i].score, regs[i].qs, regs[i].qe, + regs[i].rs, regs[i].re, regs[i].parent, regs[i].subsc, + regs[i].as, regs[i].mlen, regs[i].blen, regs[i].n_sub, + 
regs[i].score0); + } +} + +FILE* fout_segs = NULL; +void debug_print_segs(seg_t* segs, chain_read_t* reads, int num_segs, int num_reads){ + if (fout_segs == NULL){ + char fout_segs_filename[50]; + strcpy(fout_segs_filename, debug_folder); + strcat(fout_segs_filename, ".long-segs.out"); + if ((fout_segs = fopen(fout_segs_filename, "w+")) == NULL) { + fprintf(stderr, "[Error]: Cannot create print output file: %s \n", + fout_segs_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing segs to file %s\n", fout_segs_filename); + fprintf(fout_segs, "[segs] \n"); + } + fprintf(fout_segs, "Num Segs: %d, Num Reads: %d\n", num_segs, num_reads); + for (int i = 0; i < num_segs; i++){ + fprintf(fout_segs, "Seg #%d, %lu - %lu\n", i, segs[i].start_idx, segs[i].end_idx); + } + fflush(fout_segs); +} + +void debug_check_anchors(seg_t* segs, int num_segs, int32_t* ax_aggregated, int32_t* ax){ + size_t buffer_idx = 0; + for (int seg_id = 0; seg_id < num_segs; seg_id++) { + fprintf(fout_segs, "checking seg %lu - %lu...\n", segs[seg_id].start_idx, segs[seg_id].end_idx); + for (size_t i = segs[seg_id].start_idx; i < segs[seg_id].end_idx; i++) { + if (ax_aggregated[buffer_idx] != ax[i]) + fprintf(fout_segs, "Anchor mismatch: %d(%lu) %d(%lu)\n", ax_aggregated[buffer_idx], buffer_idx, ax[i], i); + buffer_idx++; + } + } +} + +#endif // DEBUG_VERBOSE + +/////////////////////////////////////////////////////////////////////////// +///////////// check functions /////////////////////////////// +/////////////////////////////////////////////////////////////////////////// +#ifdef DEBUG_CHECK + + +// DEBUG: uses with gold standard input score and range. 
SCORE CHECK +#if 0 +/** + * Read Plaintxt input file for Chaining scores from .score + * Allocate and Populate chain_read_t.f, chain_read_t.p + * Return number of anchors if success, -1 if failed + */ +int debug_skip_score_check = 0; +int debug_no_score_file = 0; +int debug_read_score(const char input_filename[], chain_read_t *in, void *km) { + static FILE *f_score = NULL; + if (debug_no_score_file) return -1; + if (!f_score) { + char *buf = NULL; + size_t len = 0; + char score_filename[50]; + strcpy(score_filename, input_filename); + strcat(score_filename, ".score"); + if ((f_score = fopen(score_filename, "r")) == NULL) { + in->f = NULL; + in->p = NULL; + debug_skip_score_check = 1; + debug_no_score_file = 1; + fprintf(stderr, "[Warning] Cannot open score file %s, skip score checking! \n", + score_filename); + return -1; + } + fprintf(stderr, "[Info] Reading gold score from file %s\n", + score_filename); + if (getline(&buf, &len, f_score) <= 0) { + // discard header line. + fprintf(stderr, "[Error] wrong format: %s %s\n", score_filename, buf); + return -1; + } + if (strlen(buf) < 3 || strncmp(buf, "@@@", 3)) { + fprintf(stderr, "[Error] wrong format %s %s\n", score_filename, buf); + return -1; + }; + kfree(km, buf); + } + + /* Read Sequence Name and Length */ + char buf[100]; + int seqlen = -1; + int num_anchor = -1; + if (fscanf(f_score, "<%s %d\n", buf, &seqlen) != 2) { + return -1; + } + + if (strcmp(buf, in->seq.name) || seqlen != in->seq.len) { + fprintf(stderr, "[Error] query sequence mismatch: %s %d\n", buf, + seqlen); + return -1; + } + + /* Read Score and Predecesser */ + if (fscanf(f_score, "#%ld\n", &num_anchor) != 1) return -1; + assert(num_anchor == in->n); + + /* Read Anchors */ + KMALLOC(km, in->p, in->n); + KMALLOC(km, in->f, in->n); + for (int i = 0; i < in->n; i++) { + if (fscanf(f_score, "%d,%ld", &in->f[i], &in->p[i]) != 2) return -1; + } + fscanf(f_score, "\n"); +#ifdef DEBUG_VERBOSE + debug_output_score(debug_folder, in); +#endif // 
DEBUG_VERBOSE + return in->n; +} + +#ifdef DEBUG_CHECK_FORCE + +/** + * Build ground truth score using backward_cpu + * Allocate and Populate chain_read_t.f, chain_read_t.p + * Return number of anchors + */ +int debug_build_score(chain_read_t *in, void *km) { + if (debug_skip_score_check){ + fprintf(stderr, + "[Info] force score checking against backward_cpu! \n"); + } + debug_skip_score_check = 0; + + /* Read Anchors */ + KMALLOC(km, in->p, in->n); + KMALLOC(km, in->f, in->n); + + Misc misc = build_misc(INT64_MAX); + mg_lchain_dp(misc.max_dist_x, misc.max_dist_y, misc.bw, misc.max_skip, + misc.max_iter, misc.min_cnt, misc.min_score, 0.12, + misc.chn_pen_skip, misc.is_cdna, misc.n_seg, in->n, in->a, + &in->n_u, &in->u, in->km, in, in->f, in->p); +#ifdef DEBUG_VERBOSE + debug_output_score(debug_folder, in); +#endif // DEBUG_VERBOSE + return in->n; +} + +#endif // DEBUG_CHECK_FORCE + +/** + * Check p[], f[] array against gold standard. + * Print if there is mismatch. +*/ +int debug_check_score(const int64_t *p, const int32_t *f, const int64_t *p_gold, + const int32_t *f_gold, int64_t n, char* qname) { + for (int64_t i = 0; i < n; ++i) { + if (f[i] == 0) { +#ifdef DEBUG_VERBOSE + fprintf(stderr, + "[Debug] Score Mismatch: %s Anchor %ld, score: %d (gold " + "x), previous: %ld (gold x)\n", + qname == 0 ? "--" : qname, i, f[i], p[i]); +#endif + } + } + if (debug_skip_score_check) return -1; + static int readid = 0; + size_t score_mismatches = 0; + int rt = 0; + for (int64_t i = 0; i < n; ++i) { + if (p[i] != p_gold[i] || f[i] != f_gold[i]) { +// #ifdef DEBUG_VERBOSE + fprintf(stderr, + "[Debug] Score Mismatch: %s Anchor %ld, score: %d (gold %d), " + "previous: %ld (gold %ld)\n", + qname == 0? "--": qname, readid, i, f[i], f_gold[i], p[i], p_gold[i]); +// #endif + rt = 1; + score_mismatches++; + } + } + if (rt == 1) + fprintf(stderr, "[Debug] Score Mismatch: %s %d mismatches\n", + qname == 0 ? 
"--" : qname, score_mismatches); + readid++; + return rt; +} +#endif // uses if we have gold standard input + + +void debug_check_range(const int32_t* range, size_t n){ + static int read_idx = 0; + for (size_t i = 1; i < n; i++){ + if (range[i] < range[i-1] - 1) + fprintf(stderr, "[debug]No realistic range sequence read #%d i %ld %d %d\n", read_idx, i, range[i-1], range[i]); + } + read_idx++; +} + +int debug_check_cut(const size_t *cut, const int32_t *range, size_t max_cut, + size_t n, size_t offset) { + + static int read_idx = 0; + size_t cid = 0; + for (; cid < max_cut && (cut[cid] < n + offset || cut[cid] == SIZE_MAX); + cid++) { + if (cut[cid] != SIZE_MAX) { + if (cut[cid] != 0 && range[cut[cid] - 1] != 0) + fprintf( + stderr, + "[debug] Cut Error: > %d Cut at %zu %lu (%d)\n", + read_idx, cut[cid], offset, range[cut[cid] - 1]); + } + if (cid > 0 && cut[cid] != SIZE_MAX){ + static size_t prev_cut = 0; + int cut_issue = 0; + for (size_t i = prev_cut; i < cut[cid]; i++) { + if (range[i] + i >= cut[cid]){ + fprintf(stderr, "[debug] Cut Error: > %d cid %ld , Cut %zu - %zu, i %zu, range %u\n", + read_idx, cid, prev_cut, cut[cid], i, range[i]); + cut_issue = 1; + } + } + if (cut_issue){ + for (int i = prev_cut; i < cut[cid]; i++){ + fprintf(stderr, "%u[%d]\t", i, range[i]); + } + fprintf(stderr, "\n"); + } + + prev_cut = cut[cid]; + } + } + read_idx++; + return cid; +} + + + + +// find long seg range distribution +void debug_cal_long_seg_range_dis(size_t total_n, size_t num_cut, int32_t* range){ +static uint64_t range_dis[5001] = {0}; + static size_t seg_total = 0; + static uint64_t anchors_total = 0; + static FILE* fp = NULL; + + for (size_t i = 0; i < total_n; i++){ + assert(range[i] <= 5000); + range_dis[range[i]]++; + } + anchors_total += total_n; + seg_total += num_cut; + if (!fp) { + fprintf(stderr, "[Debug] Writing to long_range_dis.csv\n"); + fp = fopen("long_range_dis.csv", "w+"); + fprintf(fp, "num_segs,num_anchors"); + for (int i = 0; i < 5001; i++) 
fprintf(fp, ",%d", i); + fprintf(fp, "\n"); + } + fprintf(fp, "%lusegs,%luanchors", seg_total, anchors_total); + for (int i = 0; i <= 5000; i++){ + fprintf(fp, ",%lu", range_dis[i]); + } + fprintf(fp, "\n"); +} + + +void debug_cal_mid_range_dis(size_t total_n, size_t num_cut, int32_t* range){ + static uint64_t range_dis[5001] = {0}; + static size_t seg_total = 0; + static uint64_t anchors_total = 0; + static FILE* fp = NULL; + + fprintf(stderr, "[verbose] %lu cuts generated\n", num_cut); + for (size_t i = 0; i < total_n; i++){ + assert(range[i] <= 5000); + range_dis[range[i]]++; + } + anchors_total += total_n; + seg_total += num_cut; + if (!fp) { + fprintf(stderr, "[Debug] Writing to mid_range_dis.csv\n"); + fp = fopen("mid_range_dis.csv", "w+"); + fprintf(fp, "num_segs,num_anchors"); + for (int i = 0; i < 5001; i++) fprintf(fp, ",%d", i); + fprintf(fp, "\n"); + } + fprintf(fp, "%lusegs,%luanchors", seg_total, anchors_total); + for (int i = 0; i < 5001; i++){ + fprintf(fp, ",%lu", range_dis[i]); + } + fprintf(fp, "\n"); +} + + +// range distribution +void debug_cal_range_dis(size_t total_n, size_t num_cut, int32_t* range){ + static uint64_t range_dis[5001] = {0}; + static size_t seg_total = 0; + static uint64_t anchors_total = 0; + static FILE* fp = NULL; + + fprintf(stderr, "[verbose] %lu cuts generated\n", num_cut); + for (size_t i = 0; i < total_n; i++){ + assert(range[i] <= 5000); + range_dis[range[i]]++; + } + anchors_total += total_n; + seg_total += num_cut; + if (!fp) { + fprintf(stderr, "[Debug] Writing to range_dis.csv\n"); + fp = fopen("range_dis.csv", "w+"); + fprintf(fp, "num_segs,num_anchors"); + for (int i = 0; i < 5001; i++) fprintf(fp, ",%d", i); + fprintf(fp, "\n"); + } + fprintf(fp, "%lusegs,%luanchors", seg_total, anchors_total); + for (int i = 0; i < 5001; i++){ + fprintf(fp, ",%lu", range_dis[i]); + } + fprintf(fp, "\n"); +} + +#define fine_grind 30 +// sc pair vs. 
seg length +void debug_cal_sc_pair_density(size_t total_n, size_t num_cut, size_t* cut, int32_t* range){ + // bin width: 10 cuts, max 5000 cuts + static uint64_t sc_pair_dis[(500+fine_grind)] = {0}; // number of sc pairs for each seg length + static uint64_t anchors_dis[(500+fine_grind)] = {0}; + static uint64_t seg_dis[(500+fine_grind)] = {0}; + + + uint64_t start_idx = 0, cut_size = 0; + for (int cid = 0; cid < num_cut; cid++) { + if (cut[cid] != SIZE_MAX) { + uint64_t sc_pair_num = 0; + for (uint64_t i = start_idx; i < cut[cid]; i++){ + sc_pair_num += range[i]; + } + if (cut_size < fine_grind){ + sc_pair_dis[cut_size] += sc_pair_num; + anchors_dis[cut_size] += cut[cid] - start_idx; + seg_dis[cut_size]++; + } else if (cut_size / 10 < 500) { + sc_pair_dis[cut_size/10 + fine_grind/9] += sc_pair_num; + anchors_dis[cut_size/10 + fine_grind/9] += cut[cid] - start_idx; + seg_dis[cut_size / 10 + fine_grind/9]++; + } else { + sc_pair_dis[500 + fine_grind/9] += sc_pair_num; + anchors_dis[500 + fine_grind/9] += cut[cid] - start_idx; + seg_dis[500 + fine_grind/9]++; + } + cut_size = 0; + start_idx = cut[cid]; + } else { + ++cut_size; + } + } + + static FILE* f_sc_pair_dis = NULL; + if (!f_sc_pair_dis){ + f_sc_pair_dis = fopen("sc_pair_dis.csv", "w+"); + fprintf(stderr, "[Verbose] writing to sc_pair_dis.csv"); + fprintf(f_sc_pair_dis, "seg_len"); + for(int i = 0; i < fine_grind; i++){ + fprintf(f_sc_pair_dis, ",%d", i); + } + for (int i = fine_grind/10; i <= 500; i++){ + fprintf(f_sc_pair_dis, ",%d", i*10); + } + fprintf(f_sc_pair_dis, "\n"); + } + + fprintf(f_sc_pair_dis, "sc_pairs"); + for (int i = 0; i < 500 + fine_grind; i++){ + fprintf(f_sc_pair_dis, ",%lu", sc_pair_dis[i]); + } + fprintf(f_sc_pair_dis, "\n"); + fprintf(f_sc_pair_dis, "anchors"); + for (int i = 0; i < 500 + fine_grind; i++){ + fprintf(f_sc_pair_dis, ",%lu", anchors_dis[i]); + } + fprintf(f_sc_pair_dis, "\n"); + fprintf(f_sc_pair_dis, "segs"); + for (int i = 0; i < 500 + fine_grind; i++){ + 
fprintf(f_sc_pair_dis, ",%lu", seg_dis[i]); + } + fprintf(f_sc_pair_dis, "\n"); + fflush(f_sc_pair_dis); +} + +#endif // DEBUG_CHECK \ No newline at end of file diff --git a/gpu/debug.h b/gpu/debug.h new file mode 100644 index 00000000..861e8ebd --- /dev/null +++ b/gpu/debug.h @@ -0,0 +1,82 @@ +#ifndef __DEBUG_H__ +#define __DEBUG_H__ +#include "plutils.h" +#include "mmpriv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +const char debug_folder[] = "debug"; + +// #define ITER_LIMIT 10000 +// #define MAX_READ_NUM 100000 +// #define MEM_CPU (96-6) // 96 - 6 GB for possible exceed read +// #define MEM_GPU (16-4) // 16 - 4 GB as memory pool = 16760832(0xffc000) KB +// #define SATURATE_FACTOR (0.7) // NOTE: how much portion of cpu memory shall be allocated, < 1 + + +// /* Input File Names: Set by command line arguments in main.c */ +// extern char input_filename[]; // plaintxt chaining inputs & score +// extern char range_infile[]; // plaintxt range +// extern char binary_file[]; // binary chaining inputs & score +// extern char binary_range[]; // binary range + +#ifndef DEBUG_CHECK +#define ASSERT(X) +// Chaining Debug Checker: checks chaining score against input. 
+#elif DEBUG_CHECK +#define ASSERT(X) assert(X) +// Read score from file for comparison +int debug_read_score(const char input_filename[], chain_read_t *in, void *km); +int debug_build_score(chain_read_t *in, void *km); + +// Check score +int debug_check_score(const int64_t *p, const int32_t *f, const int64_t *p_gold, + const int32_t *f_gold, int64_t n, char* qname); +void debug_check_range(const int32_t* range, size_t n); +int debug_check_cut(const size_t *cut, const int32_t *range, size_t max_cut, + size_t n, size_t offset); + +#ifdef DEBUG_CHECK_FORCE +mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, + int max_iter, int min_cnt, int min_sc, float chn_pen_gap, + float chn_pen_skip, int is_cdna, int n_seg, + int64_t n, // NOTE: n is number of anchors + mm128_t *a, // NOTE: a is ptr to anchors. + int *n_u_, uint64_t **_u, void *km, chain_read_t *input, + int32_t *f_, int64_t *p_ +); +#endif // DEBUG_CHECK_FORCE + + + +// Analyze Distribution + +void debug_cal_long_seg_range_dis(size_t total_n, size_t num_cut, int32_t* range); +void debug_cal_mid_range_dis(size_t total_n, size_t num_cut, int32_t *range); +void debug_cal_range_dis(size_t total_n, size_t num_cut, int32_t *range); +void debug_cal_sc_pair_density(size_t total_n, size_t num_cut, size_t* cut, int32_t* range); +#endif // DEBUG_CHECK + +#ifdef DEBUG_VERBOSE + // Print Input Files + void debug_output_anchors(const char debug_folder[], chain_read_t *in); +void debug_output_score(const char debug_folder[], chain_read_t *in); +void debug_output_meta(const char debug_folder[], input_meta_t *meta); + +void debug_print_successor_range(int32_t *range, int64_t n); +int debug_print_cut(const size_t *cut, size_t max_cut, size_t n, size_t offset, char* qname); +void debug_print_score(const int64_t *p, const int32_t *score, int64_t n); +void debug_print_score_rel_p(const uint16_t *p, const int32_t *score, int64_t n); +void debug_print_chain(mm128_t* a, uint64_t *u, int32_t n_u, char* qname); 
+void debug_print_regs(mm_reg1_t *regs, int n_u, char *qname); +void debug_print_segs(seg_t *segs, chain_read_t *reads, int num_segs, int num_reads); +void debug_check_anchors(seg_t* segs, int num_segs, int32_t* ax_aggregated, int32_t* ax); +#endif // DEBUG_VERBOSE + +#ifdef __cplusplus +} +#endif + +#endif// __DEBUG_H__ diff --git a/gpu/gpu.mk b/gpu/gpu.mk new file mode 100644 index 00000000..8b69c99e --- /dev/null +++ b/gpu/gpu.mk @@ -0,0 +1,62 @@ +GPU ?= AMD +CONFIG += $(if $(MAX_MICRO_BATCH),-DMICRO_BATCH=\($(MAX_MICRO_BATCH)\)) + +################################################### +############ CPU Compile ################### +################################################### +CU_SRC = $(wildcard gpu/*.cu) +CU_OBJS = $(CU_SRC:%.cu=%.o) +C_SRC = $(wildcard gpu/*.c) +OBJS += $(C_SRC:%.c=%.o) +INCLUDES += -I gpu + +################################################### +############ CUDA Compile ################### +################################################### +NVCC = nvcc +CUDAFLAGS = -rdc=true -lineinfo +CUDATESTFLAG = -G + +################################################### +############ HIP Compile ################### +################################################### +HIPCC = hipcc +HIPFLAGS = -DUSEHIP +HIPTESTFLAGS = -G -Rpass-analysis=kernel-resource-usage -ggdb +HIPLIBS = -L${ROCM_PATH}/lib -lroctx64 -lroctracer64 + +################################################### +############ DEBUG Options ################### +################################################### +ifeq ($(GPU), AMD) + GPU_CC = $(HIPCC) + GPU_FLAGS = $(HIPFLAGS) + GPU_TESTFL = $(HIPTESTFLAGS) + LIBS += $(HIPLIBS) +else + GPU_CC = $(NVCC) + GPU_FLAGS = $(CUDAFLAGS) + GPU_TESTFL = $(CUDATESTFLAG) +endif + +ifeq ($(DEBUG),analyze) + GPU_FLAGS += $(GPU_TESTFL) +endif + +%.o: %.cu + $(GPU_CC) -c $(GPU_FLAGS) $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $(CONFIG) $< -o $@ + +cleangpu: + rm -f gpu/*.o + +# profile:CFLAGS += -pg -g3 +# profile:all +# perf record --call-graph=dwarf -e cycles:u 
time ./minimap2 -a test/MT-human.fa test/MT-orang.fa > test.sam + +cudep: gpu/.depend + +gpu/.depend: $(CU_SRC) + rm -f gpu/.depend + $(GPU_CC) -c $(GPU_FLAGS) $(CFLAGS) $(CPPFLAGS) $(INCLUDES) -MM $^ > $@ + +include gpu/.depend \ No newline at end of file diff --git a/gpu/hipify.cuh b/gpu/hipify.cuh new file mode 100644 index 00000000..90ee42c3 --- /dev/null +++ b/gpu/hipify.cuh @@ -0,0 +1,61 @@ +#ifndef __HIPIFY_CUH__ +#define __HIPIFY_CUH__ + +#ifdef USEHIP +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#define cudaDeviceProp hipDeviceProp_t +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaMalloc hipMalloc +#define cudaMallocAsync hipMallocAsync +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyToSymbolAsync hipMemcpyToSymbolAsync +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaFree hipFree +#define cudaFreeAsync hipFreeAsync +#define cudaMemcpyToSymbol hipMemcpyToSymbol +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaStream_t hipStream_t +#define cudaStreamCreate hipStreamCreate +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamDestroy hipStreamDestroy +#define cudaMallocHost hipHostMalloc +#define cudaFreeHost hipHostFree +#define cudaEvent_t hipEvent_t +#define cudaEventCreate hipEventCreate +#define cudaEventRecord hipEventRecord +#define cudaEventQuery hipEventQuery +#define cudaEventDestroy hipEventDestroy +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaMemGetInfo hipMemGetInfo +#define cudaCheck() { \ + hipError_t err = hipGetLastError(); \ + if (hipSuccess != err) { \ + fprintf(stderr, "Error in %s:%i %s(): %s.\n", __FILE__, __LINE__, \ + __func__, hipGetErrorString(err)); \ + fflush(stderr); \ + exit(EXIT_FAILURE); \ + } \ +} +#define 
cudaWarpSize 64 +#else +#define cudaCheck() { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Error in %s:%i %s(): %s.\n", __FILE__, __LINE__,\ + __func__, cudaGetErrorString(err)); \ + fflush(stderr); \ + exit(EXIT_FAILURE); \ + } \ +} +#include + +#endif + + +#endif // __HIPIFY_CUH__ diff --git a/gpu/planalyze.cu b/gpu/planalyze.cu new file mode 100644 index 00000000..cfa8a5c9 --- /dev/null +++ b/gpu/planalyze.cu @@ -0,0 +1,201 @@ +/* Implement kernel performance analysis that requires extra device + * synchornization. disabled unless DEBUG_LEVEL is set to analyze. + * Enable individual verbose prints in planalyze.cu + */ +#include "planalyze.cuh" + +#ifdef DEBUG_CHECK +void planalyze_short_kernel(stream_ptr_t stream, int uid, float throughput[]){ + cudaStreamSynchronize(stream.cudastream); + size_t total_n = stream.host_mems[uid].total_n; + chain_read_t* reads = stream.reads; + deviceMemPtr* dev_mem = &stream.dev_mem; + hostMemPtr* host_mem = &stream.host_mems[uid]; + size_t cut_num = stream.host_mems[uid].cut_num; + unsigned int num_mid_seg, num_long_seg; + cudaMemcpy(&num_mid_seg, dev_mem->d_mid_seg_count, sizeof(unsigned int), + cudaMemcpyDeviceToHost); + num_long_seg = host_mem->long_segs_num[0] - (uid>0 ? 
stream.host_mems[uid-1].long_segs_num[0] : 0); + cudaMemcpy(&num_long_seg, dev_mem->d_long_seg_count, sizeof(unsigned int), + cudaMemcpyDeviceToHost); +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[DEBUG](MICROBATCH# %d) total segs: %lu, short:%lu mid: %u \n", uid, cut_num, cut_num - num_mid_seg - num_long_seg, num_mid_seg); +#endif // DEBUG_VERBOSE + + int32_t* range = (int32_t*)malloc(sizeof(int32_t) * total_n); + cudaMemcpy(range, dev_mem->d_range, sizeof(int32_t) * total_n, + cudaMemcpyDeviceToHost); + size_t* cut = (size_t*)malloc(sizeof(size_t) * cut_num); + cudaMemcpy(cut, dev_mem->d_cut, sizeof(size_t) * cut_num, + cudaMemcpyDeviceToHost); + + seg_t* mid_segs = (seg_t*)malloc(sizeof(seg_t) * num_mid_seg); + cudaMemcpy(mid_segs, dev_mem->d_mid_seg, sizeof(seg_t) * num_mid_seg, + cudaMemcpyDeviceToHost); + + + longMemPtr* long_mem = &stream.long_mem; + unsigned int num_aggregated_long_segs; + cudaMemcpy(&num_aggregated_long_segs, dev_mem->d_long_seg_count, sizeof(unsigned int), + cudaMemcpyDeviceToHost); +#ifdef DEBUG_VERBOSE + fprintf(stderr, + "[DEBUG] aggreagated num of long segs %u, %u-%u belongs to this " + "minibatch\n", + num_aggregated_long_segs, + uid > 0 ? stream.host_mems[uid-1].long_segs_num[0] : 0, + num_aggregated_long_segs); +#endif // DEBUG_VERBOSE + + seg_t* long_segs = (seg_t*)malloc(sizeof(seg_t) * num_aggregated_long_segs); + cudaMemcpy(long_segs, dev_mem->d_long_seg, sizeof(seg_t) * num_aggregated_long_segs, + cudaMemcpyDeviceToHost); + + size_t long_segs_total_n; + cudaMemcpy(&long_segs_total_n, dev_mem->d_total_n_long, sizeof(size_t), cudaMemcpyDeviceToHost); + int32_t* long_range = (int32_t*)malloc(sizeof(int32_t) * long_segs_total_n); + cudaMemcpy(long_range, dev_mem->d_range_long, sizeof(int32_t) * long_segs_total_n, cudaMemcpyDeviceToHost); + +// Calculate long segs total workload (sc pairs) + size_t long_seg_sc_pairs = 0; + for(unsigned int segid = (uid>0 ? 
stream.host_mems[uid-1].long_segs_num[0] : 0); segid < num_aggregated_long_segs; segid++){ + for (size_t i = long_segs[segid].start_idx; i < long_segs[segid].end_idx; i++) + long_seg_sc_pairs += long_range[i]; + } +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[DEBUG] workload (sc pairs) in long segs: %lu\n", long_seg_sc_pairs); +#endif // DEBUG_VERBOSE + +// Calculate total workload (sc pairs) + size_t total_sc_pairs = 0; + for (size_t i = 0; i < total_n; i++) { + total_sc_pairs += range[i]; + } + +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[DEBUG] Total workload (sc pairs) in batch: %lu. %.2f%% work are in long segs.\n", total_sc_pairs, (float)long_seg_sc_pairs/total_sc_pairs*100); +#endif // DEBUG_VERBOSE + assert(long_seg_sc_pairs <= total_sc_pairs); + + // calculate short kernel throughput + float short_kernel_runtime_ms = 0; + cudaEventElapsedTime(&short_kernel_runtime_ms, stream.short_kernel_start_event[uid], stream.short_kernel_stop_event[uid]); + throughput[uid] = (total_sc_pairs - long_seg_sc_pairs) / short_kernel_runtime_ms / (float)1000; +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[DEBUG] Short Seg kernel #%d throughput: %.2f Mpairs/s\n", uid, throughput[uid]); +#endif // DEBUG_VERBOSE + +// Check range w.r.t input (MAKE SURE INPUT RANGE EXISTS) +#if 0 + int64_t read_start = 0; + for (int i = 0; i < dev_mem->size; i++) { +// DEBUG: print range +#if defined(DEBUG_VERBOSE) && 0 + debug_print_successor_range(range + read_start, reads[i].n); +#endif + debug_check_range(range + read_start, input_arr[i].range, input_arr[i].n); + read_start += reads[i].n; + } +#endif + +// DEBUG: Check voilation of cut +#if defined(DEBUG_CHECK) && 0 + for (int readid = 0, cid = 0, idx = 0; readid < dev_mem->size; readid++) { +// DEBUG: Print cuts +#if defined(DEBUG_VERBOSE) && 0 + debug_print_cut(cut + cid, cut_num - cid, reads[readid].n, idx, reads[readid].seq.name); +#endif + cid += debug_check_cut(cut + cid, range, cut_num - cid, reads[readid].n, idx); + idx += reads[readid].n; + } 
+#endif + +#if defined(DEBUG_VERBOSE) && 0 + int32_t* ax = (int32_t*) malloc(sizeof(int32_t) * dev_mem->buffer_size_long); + cudaMemcpy(ax, dev_mem->d_ax_long, sizeof(int32_t) * dev_mem->buffer_size_long, cudaMemcpyDeviceToHost); + debug_print_segs(host_mem->long_segs, reads, host_mem->long_segs_num[0], stream.host_mems[uid].size); + debug_check_anchors(host_mem->long_segs, host_mem->long_segs_num[0], ax, host_mem->ax); +#endif + +//DEBUG: Calculate range distribution +#if defined(DEBUG_VERBOSE) && 0 + debug_cal_range_dis(total_n, cut_num, range); +#endif // DEBUG_VERBOSE + +// Calculate range distribution for mid segs +#if defined(DEBUG_VERBOSE) && 0 + for (int seg_id = 0; seg_id < num_mid_seg; seg_id++){ + debug_cal_mid_range_dis(mid_segs[seg_id].end_idx - mid_segs[seg_id].start_idx, 1, range + mid_segs[seg_id].start_idx); + } +#endif // DEBUG_VERBOSE + +// DEBUG: Calculate workload distribution +#if defined(DEBUG_VERBOSE) && 0 + debug_cal_sc_pair_density(total_n, cut_num, cut, range); +#endif // DEBUG_VERBOSE + + // release every host staging copy malloc'ed above; this runs once per micro-batch, so anything left here leaks each time + free(cut); + free(range); + free(mid_segs); + free(long_segs); + free(long_range); + +} +#endif + + + + +#ifdef DEBUG_CHECK + +void planalyze_long_kernel(stream_ptr_t stream, float* throughput){ + deviceMemPtr* dev_mem = &stream.dev_mem; + longMemPtr* long_mem = &stream.long_mem; + + unsigned int num_long_seg = long_mem->total_long_segs_num[0]; +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[DEBUG]LONG Kernel: num of long segs %u\n", num_long_seg); +#endif // DEBUG_VERBOSE + + + seg_t* long_segs = (seg_t*)malloc(sizeof(seg_t) * num_long_seg); + cudaMemcpy(long_segs, dev_mem->d_long_seg, sizeof(seg_t) * num_long_seg, + cudaMemcpyDeviceToHost); + int32_t* long_range = (int32_t*)malloc(sizeof(int32_t) * *(long_mem->total_long_segs_n)); + cudaMemcpy(long_range, dev_mem->d_range_long, sizeof(int32_t) * *(long_mem->total_long_segs_n), cudaMemcpyDeviceToHost); +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[DEBUG] Total n of anchors in long segs %lu\n", *long_mem->total_long_segs_n); +#endif // DEBUG_VERBOSE + +// Calculate 
total workload (sc pairs) + size_t long_seg_sc_pairs = 0; + for(size_t i = 0; i < *long_mem->total_long_segs_n; i++){ + long_seg_sc_pairs += long_range[i]; + } +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[DEBUG] workload (sc pairs) in long kernel: %lu\n", long_seg_sc_pairs); +#endif // DEBUG_VERBOSE + + + // calculate long kernel throughput + float long_kernel_runtime_ms = 0; + cudaEventElapsedTime(&long_kernel_runtime_ms, stream.long_kernel_event, stream.stopevent); + float long_kernel_througput = long_seg_sc_pairs / long_kernel_runtime_ms / (float)1000; +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[DEBUG] Long Seg kernel throughput: %.2f Mpairs/s\n", long_kernel_througput); +#endif // DEBUG_VERBOSE + throughput[score_kernel_config.micro_batch] = long_kernel_througput; + +// Check range w.r.t input (MAKE SURE INPUT RANGE EXISTS) +#if defined(DEBUG_VERBOSE) && 0 + debug_print_successor_range(long_range, *long_mem->total_long_segs_n); +#endif + +//DEBUG: Calculate range distribution +#if defined(DEBUG_VERBOSE) && 0 + debug_cal_long_seg_range_dis(*long_mem->total_long_segs_n, num_long_seg, long_range); +#endif // DEBUG_VERBOSE + + free(long_segs); + free(long_range); + +} + +#endif // DEBUG_CHECK \ No newline at end of file diff --git a/gpu/planalyze.cuh b/gpu/planalyze.cuh new file mode 100644 index 00000000..cef1048e --- /dev/null +++ b/gpu/planalyze.cuh @@ -0,0 +1,21 @@ +#ifndef __PLANALYZE_H__ +#define __PLANALYZE_H__ + +/* Implement kernel performance analysis that requires extra device + * synchornization. disabled unless DEBUG_LEVEL is set to analyze. 
+ * Enable individual verbose prints in planalyze.cu + */ + +#include "hipify.cuh" +#include "plchain.h" +#include "plutils.h" +#include "plmem.cuh" +#include "plscore.cuh" + + +#ifdef DEBUG_CHECK +void planalyze_short_kernel(stream_ptr_t stream, int uid, float throughput[]); +void planalyze_long_kernel(stream_ptr_t stream, float* throughput); +#endif // DEBUG_CHECK + +#endif // __PLANALYZE_H__ \ No newline at end of file diff --git a/gpu/plchain.cu b/gpu/plchain.cu new file mode 100644 index 00000000..c0418d20 --- /dev/null +++ b/gpu/plchain.cu @@ -0,0 +1,560 @@ +#include +#include +#include +#include +#include + + +#include "mmpriv.h" +#include "plmem.cuh" +#include "plrange.cuh" +#include "plscore.cuh" +#include "plchain.h" +#include +#include + +#ifdef DEBUG_CHECK +#include "planalyze.cuh" +#include "debug.h" +#endif // DEBUG_CHECK + +// utils functions +struct +{ + bool operator()(std::pair a, std::pair b) const { + return a.first > b.first; + } +} +comp; + +void pairsort(seg_t *sdata, unsigned *map, unsigned N){ + std::pair elements[N]; + for (unsigned i = 0; i < N; ++i){ + elements[i].second = map[i]; + elements[i].first = sdata[i].end_idx - sdata[i].start_idx; + } + + std::sort(elements, elements+N, comp); // descent order + + for (unsigned i = 0; i < N; ++i){ + map[i] = elements[i].second; + } +} + + +/** + * translate relative predecessor index to abs index + * Input + * rel[] relative predecessor index + * Output + * p[] absolute predecessor index (of each read) + */ +void p_rel2idx(const uint16_t* rel, int64_t* p, size_t n) { + for (int i = 0; i < n; ++i) { + if (rel[i] == 0) + p[i] = -1; + else + p[i] = i - rel[i]; + } +} + +////////////////////////////////////////////////////////////////////////// +/////////// Backtracking ////////////////////////////////////// +////////////////////////////////////////////////////////////////////////// + +/** + * @brief start from end index of the chain, find the location of min score on + * the chain until anchor has 
no predecessor OR anchor is in another chain + * + * @param max_drop + * @param z [in] {sc, anchor idx}, sorted by sc + * @param f [in] score + * @param p [in] predecessor + * @param k [in] chain end index + * @param t [update] 0 for unchained anchor, 1 for chained anchor + * @return min_i minmute score location in the chain + */ + +static int64_t mg_chain_bk_end(int32_t max_drop, const mm128_t *z, + const int32_t *f, const int64_t *p, int32_t *t, + int64_t k) { + int64_t i = z[k].y, end_i = -1, max_i = i; + int32_t max_s = 0; + if (i < 0 || t[i] != 0) return i; + do { + int32_t s; + t[i] = 2; + end_i = i = p[i]; + s = i < 0 ? z[k].x : (int32_t)z[k].x - f[i]; + if (s > max_s) + max_s = s, max_i = i; + else if (max_s - s > max_drop) + break; + } while (i >= 0 && t[i] == 0); + for (i = z[k].y; i >= 0 && i != end_i; i = p[i]) // reset modified t[] + t[i] = 0; + return max_i; +} + +void plchain_backtracking(hostMemPtr *host_mem, chain_read_t *reads, Misc misc, void* km){ + int max_drop = misc.bw; + if (misc.max_dist_x < misc.bw) misc.max_dist_x = misc.bw; + if (misc.max_dist_y < misc.bw && !misc.is_cdna) misc.max_dist_y = misc.bw; + if (misc.is_cdna) max_drop = INT32_MAX; + + size_t n_read = host_mem->size; + + uint16_t* p_hostmem = host_mem->p; + int32_t* f = host_mem->f; + for (int i = 0; i < n_read; i++) { + int64_t* p; + KMALLOC(km, p, reads[i].n); + p_rel2idx(p_hostmem, p, reads[i].n); +// print scores +#if defined(DEBUG_VERBOSE) && 0 + debug_print_score(p, f, reads[i].n); +#endif +// Check score w.r.t to input (MAKE SURE INPUT SCORE EXISTS: search for SCORE CHECK) +#if defined(DEBUG_CHECK) && 0 + debug_check_score(p, f, reads[i].p, reads[i].f, reads[i].n); +#endif + + /* Backtracking */ + uint64_t* u; + int32_t *v, *t; + KMALLOC(km, v, reads[i].n); + KCALLOC(km, t, reads[i].n); + int32_t n_u, n_v; + u = mg_chain_backtrack(km, reads[i].n, f, p, v, t, misc.min_cnt, misc.min_score, max_drop, &n_u, &n_v); + reads[i].u = u; + reads[i].n_u = n_u; + kfree(km, p); + // 
here f is not managed by km memory pool + kfree(km, t); + if (n_u == 0) { + kfree(km, reads[i].a); + kfree(km, v); + reads[i].a = 0; + + f += reads[i].n; + p_hostmem += reads[i].n; + continue; + } + + mm128_t* new_a = compact_a(km, n_u, u, n_v, v, reads[i].a); + reads[i].a = new_a; + + f += reads[i].n; + p_hostmem += reads[i].n; + } +} + + +////////////////////////////////////////////////////////////////////////// +/////////// Stream Management ///////////////////////////////// +////////////////////////////////////////////////////////////////////////// + +/** + * Wait and find a free stream + * stream_setup: [in] stream table; only num_stream and each stream's stopevent are used here + * batchid: [in] batch index; the first num_stream batches map 1:1 onto streams + * RETURN + * id of the stream to schedule to: batchid itself while batchid < num_stream, otherwise + * the first stream whose stopevent query returns 0 (polled in a loop until one does) +*/ +int plchain_schedule_stream(const streamSetup_t stream_setup, const int batchid){ + /* Not every stream has been handed a batch yet: use stream #batchid directly */ + if (batchid < stream_setup.num_stream) { + return batchid; + } + + // busy-wait until one stream is free (a zero return from cudaEventQuery is treated as free) + int streamid = -1; + while(streamid == -1){ + for (int t = 0; t < stream_setup.num_stream; t++){ + if (!cudaEventQuery(stream_setup.streams[t].stopevent)) { + streamid = t; + // FIXME: unnecessary recreate? + cudaEventDestroy(stream_setup.streams[t].stopevent); + cudaEventCreate(&stream_setup.streams[t].stopevent); + cudaCheck(); + break; + } + // cudaCheck(); + } + } + return streamid; +} + + +// Global variable for debug prints. Throughput, runtime & mem usage +#ifdef DEBUG_PRINT + float kernel_mem_usage[MAX_MICRO_BATCH + 1] = {0}; + float kernel_throughput[MAX_MICRO_BATCH + 1] = {0}; +#endif // DEBUG_PRINT + +/* + * Accepts a stream that has already been synced, and finished processing a batch + * Finish and cleanup the stream, save primary chain results to unpinned CPU memory. 
+ * RETURN: number of reads in last batch +*/ +int plchain_post_gpu_helper(streamSetup_t stream_setup, int stream_id, Misc misc, void* km){ + int n_reads = 0; // Number of reads in the batch + +#if defined(DEBUG_CHECK) + planalyze_long_kernel(stream_setup.streams[stream_id], kernel_throughput); +#endif // DEBUG_CHECK + +#ifdef DEBUG_PRINT + kernel_mem_usage[score_kernel_config.micro_batch] = (float)(*stream_setup.streams[stream_id].long_mem.total_long_segs_n)/stream_setup.long_seg_buffer_size_stream*100; + + float kernel_runtime_ms[MAX_MICRO_BATCH + 1] = {0}; + float kernel_throughput_anchors[MAX_MICRO_BATCH + 1] = {0}; + cudaEventElapsedTime(&kernel_runtime_ms[score_kernel_config.micro_batch], stream_setup.streams[stream_id].long_kernel_event, stream_setup.streams[stream_id].stopevent); + kernel_throughput_anchors[score_kernel_config.micro_batch] = *stream_setup.streams[stream_id].long_mem.total_long_segs_n / kernel_runtime_ms[score_kernel_config.micro_batch] / (float)1000; +#endif + + + seg_t* long_segs = stream_setup.streams[stream_id].long_mem.long_segs_og_idx; + size_t long_seg_idx = 0; + size_t long_i = 0; + for (int uid = 0; uid < score_kernel_config.micro_batch; uid++) { + if (stream_setup.streams[stream_id].host_mems[uid].size == 0) continue; + // regorg long to each host mem ptr + // NOTE: this is the number of long segs till this microbatch + unsigned int long_segs_num = stream_setup.streams[stream_id].host_mems[uid].long_segs_num[0]; +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[Debug] %s (%s:%d) MICROBATCH$%d FINISHED: long seg %lu - %u", + __func__, __FILE__, __LINE__, uid, long_seg_idx, long_segs_num); +#endif // DEBUG_VERBOSE + size_t total_n_long_segs = 0; + for (; long_seg_idx < long_segs_num; long_seg_idx++) { + for (size_t i = long_segs[long_seg_idx].start_idx; + i < long_segs[long_seg_idx].end_idx; i++, long_i++) { + stream_setup.streams[stream_id].host_mems[uid].f[i] = + stream_setup.streams[stream_id].long_mem.f_long[long_i]; + 
stream_setup.streams[stream_id].host_mems[uid].p[i] = + stream_setup.streams[stream_id].long_mem.p_long[long_i]; + } + total_n_long_segs += long_segs[long_seg_idx].end_idx - long_segs[long_seg_idx].start_idx; + } + + // backtrack after p/f is copied + plchain_backtracking(&stream_setup.streams[stream_id].host_mems[uid], + stream_setup.streams[stream_id].reads + n_reads, misc, km); + // accumulate n_reads + n_reads += stream_setup.streams[stream_id].host_mems[uid].size; +#ifdef DEBUG_PRINT + cudaEventElapsedTime(&kernel_runtime_ms[uid], stream_setup.streams[stream_id].short_kernel_start_event[uid], stream_setup.streams[stream_id].short_kernel_stop_event[uid]); + kernel_throughput_anchors[uid] = + (stream_setup.streams[stream_id].host_mems[uid].total_n - total_n_long_segs) / + kernel_runtime_ms[uid] / (float)1000; +#ifdef DEBUG_VERBOSE + fprintf(stderr, ", %.2f%% anchors are in long segs. \n", (float)total_n_long_segs / stream_setup.streams[stream_id].host_mems[uid].total_n * 100); +#endif // DEBUG_VERBOSE +#endif // DEBUG_PRINT + } + +#ifdef DEBUG_PRINT + fprintf(stderr, "----------------------------------------------------------------------------\n "); + for (int uid = 0; uid < score_kernel_config.micro_batch; uid++) fprintf(stderr, "Short%d ", uid); + fprintf(stderr, "Long\n"); + fprintf(stderr, "----------------------------------------------------------------------------\n"); + fprintf(stderr, "Mem Usage = "); + for (int uid = 0; uid < score_kernel_config.micro_batch; uid++) fprintf(stderr, " %9.2f%%", kernel_mem_usage[uid]); + fprintf(stderr, " %9.2f %%\n", kernel_mem_usage[score_kernel_config.micro_batch]); + fprintf(stderr, "Runtime(s) = "); + for (int uid = 0; uid < score_kernel_config.micro_batch; uid++) fprintf(stderr, "%11.2f", kernel_runtime_ms[uid] / 1000); + fprintf(stderr, " %11.2f\n", kernel_runtime_ms[score_kernel_config.micro_batch] / 1000); + fprintf(stderr, "BW (Ma/s) = "); + for (int uid = 0; uid < score_kernel_config.micro_batch; uid++) 
fprintf(stderr, "%11.2f", kernel_throughput_anchors[uid]); + fprintf(stderr, " %11.2f\n", kernel_throughput_anchors[score_kernel_config.micro_batch]); + fprintf(stderr, "BW(Mpair/s)= "); + for (int uid = 0; uid < score_kernel_config.micro_batch; uid++) fprintf(stderr, "%11.2f", kernel_throughput[uid]); + fprintf(stderr, " %11.2f\n", kernel_throughput[score_kernel_config.micro_batch]); + fprintf(stderr, "----------------------------------------------------------------------------\n"); + if (kernel_mem_usage[score_kernel_config.micro_batch] > 99){ + fprintf(stderr, + "[WARNING] long segment buffer is full. Consider increase " + "long_seg_buffer_size to improve performance.\n"); + } +#endif + + return n_reads; +} + + +/* + * 1. synchronize stream and process previous batch. cleanup stream + * 2. launch kernels (asynchornizely) for the input batch + */ + +void plchain_cal_score_async(chain_read_t **reads_, int *n_read_, Misc misc, streamSetup_t stream_setup, int thread_id, void* km){ + chain_read_t* reads = *reads_; + *reads_ = NULL; + int n_read = *n_read_; + *n_read_ = 0; + + /* sync stream and process previous batch */ + int stream_id = thread_id; + if (stream_setup.streams[stream_id].busy) { + cudaStreamSynchronize(stream_setup.streams[stream_id].cudastream); + *n_read_ = plchain_post_gpu_helper(stream_setup, stream_id, misc, km); + *reads_ = stream_setup.streams[stream_id].reads; + stream_setup.streams[stream_id].busy = false; + } + +#ifdef DEBUG_PRINT + for(int uid = 0; uid < score_kernel_config.micro_batch + 1; uid++) { + kernel_mem_usage[uid] = 0; + kernel_throughput[uid] = 0; + } +#endif // DEBUG_PRINT + + + cudaEventRecord(stream_setup.streams[stream_id].startevent, + stream_setup.streams[stream_id].cudastream); + size_t total_n = 0; + for (int i = 0; i < n_read; i++) { + total_n += reads[i].n; + } // compute total_n first +#ifdef DEBUG_PRINT + fprintf(stderr, "[Info] %s (%s:%d) Launching Batch: n_read %d, total anchors %lu (mem usage: %.2f%%)\n", __func__, 
__FILE__, __LINE__, n_read, total_n, (float)total_n/stream_setup.max_anchors_stream*100); +#endif // DEBUG_PRINT + + // reset long seg counters + cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_long_seg_count, 0, sizeof(unsigned int), + stream_setup.streams[stream_id].cudastream); + cudaMemsetAsync(stream_setup.streams[stream_id].dev_mem.d_total_n_long, 0, sizeof(size_t), + stream_setup.streams[stream_id].cudastream); + stream_setup.streams[stream_id].long_mem.total_long_segs_num[0] = 0; + stream_setup.streams[stream_id].long_mem.total_long_segs_n[0] = 0; + for(int uid = 0; uid < score_kernel_config.micro_batch; uid++) { + stream_setup.streams[stream_id].host_mems[uid].long_segs_num[0] = 0; + stream_setup.streams[stream_id].host_mems[uid].index = uid; + stream_setup.streams[stream_id].host_mems[uid].griddim = 0; + stream_setup.streams[stream_id].host_mems[uid].size = 0; + stream_setup.streams[stream_id].host_mems[uid].total_n = 0; + stream_setup.streams[stream_id].host_mems[uid].cut_num = 0; + } + + stream_setup.streams[stream_id].reads = reads; + stream_setup.streams[stream_id].n_read = n_read; + int read_start = 0; + + for (int uid = 0; uid < score_kernel_config.micro_batch; uid++) { + if (read_start == n_read) continue; +#ifdef USEHIP + roctxRangePushA("microbatch"); +#endif + // decide the size of micro batch + size_t batch_n = 0; + int read_end = 0; + size_t cut_num = 0; + int griddim = 0; + for (read_end = read_start; read_end < n_read; read_end++) { + if (batch_n + reads[read_end].n > stream_setup.max_anchors_stream) { + break; + } + batch_n += reads[read_end].n; + int an_p_block = range_kernel_config.anchor_per_block; + int an_p_cut = range_kernel_config.blockdim; + int block_num = (reads[read_end].n - 1) / an_p_block + 1; + griddim += block_num; + cut_num += (reads[read_end].n - 1) / an_p_cut + 1; + } +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[Debug] %s (%s:%d) MICROBATCH#%d batch_n %lu, read_start %d, read_end %d usage %.2f %%\n", __func__, 
__FILE__, __LINE__, uid, batch_n, read_start, read_end, (float)batch_n/stream_setup.max_anchors_stream*100); +#endif // DEBUG_VERBOS + +#ifdef DEBUG_PRINT + kernel_mem_usage[uid] = (float)batch_n/stream_setup.max_anchors_stream*100; +#endif // DEBUG_PRINT + + // sanity check + assert(stream_setup.max_anchors_stream >= batch_n); + assert(stream_setup.max_range_grid >= griddim); + assert(stream_setup.max_num_cut >= cut_num); + // work on micro batch +#ifdef USEHIP + roctxRangePushA("reorg"); +#endif + // step1: reorg input + plmem_reorg_input_arr(reads + read_start, read_end - read_start, + &stream_setup.streams[stream_id].host_mems[uid], + range_kernel_config); + // step2: copy to device +#ifdef USEHIP + roctxRangePop(); +#endif + plmem_async_h2d_short_memcpy(&stream_setup.streams[stream_id], uid); + // step3: range selection +#ifdef DEBUG_PRINT + cudaEventRecord(stream_setup.streams[stream_id].short_kernel_start_event[uid], + stream_setup.streams[stream_id].cudastream); +#endif // DEBUG_PRINT + plrange_async_range_selection(&stream_setup.streams[stream_id].dev_mem, + &stream_setup.streams[stream_id].cudastream); + // step4: score generation for short and mid segs + plscore_async_short_mid_forward_dp(&stream_setup.streams[stream_id].dev_mem, + &stream_setup.streams[stream_id].cudastream); +#ifdef DEBUG_PRINT + cudaEventRecord(stream_setup.streams[stream_id].short_kernel_stop_event[uid], + stream_setup.streams[stream_id].cudastream); +#endif // DEBUG_PRINT + // step5: copy short and mid results back + plmem_async_d2h_short_memcpy(&stream_setup.streams[stream_id], uid); + // update index + read_start = read_end; + +#ifdef USEHIP + roctxRangePop(); +#endif + +#ifdef DEBUG_CHECK + planalyze_short_kernel(stream_setup.streams[stream_id], uid, kernel_throughput); +#endif // DEBUG_CHECK + } + +// FIXME: temporary solution for microbatching + if (read_start < n_read) { + fprintf(stderr, "[WARNING] Unable to fit reads %d - %d into a microbatch. 
Fall back to cpu chaining\n", read_start, n_read-1); + } + + // step6: copy back long_segs_og + cudaStreamSynchronize(stream_setup.streams[stream_id].cudastream); + unsigned int num_long_seg; + cudaMemcpy(&num_long_seg, stream_setup.streams[stream_id].dev_mem.d_long_seg_count, sizeof(unsigned int), + cudaMemcpyDeviceToHost); + seg_t* long_segs_og = (seg_t*)malloc(sizeof(seg_t) * num_long_seg); + cudaMemcpy(long_segs_og, stream_setup.streams[stream_id].dev_mem.d_long_seg_og, sizeof(seg_t) * num_long_seg, + cudaMemcpyDeviceToHost); + + // step7: sort long segs in descending order of length; map[] starts as the identity permutation + unsigned *map = new unsigned[num_long_seg]; + for (unsigned i = 0; i < num_long_seg; i++) { + map[i] = i; + } + pairsort(long_segs_og, map, num_long_seg); + #ifdef DEBUG_VERBOSE + auto last_length = num_long_seg > 0 ? long_segs_og[map[0]].end_idx - long_segs_og[map[0]].start_idx : 0; // no long segs: skip the map[0] lookup, which would read past both empty arrays + for (int i = 1; i < num_long_seg; i++){ + auto this_length = long_segs_og[map[i]].end_idx - long_segs_og[map[i]].start_idx; + if (this_length > last_length) + fprintf(stderr, "Failed sort at: %d - %u\n", i, map[i]); + } + #endif // DEBUG_VERBOSE + free(long_segs_og); + + // step8: copy map to device. NOTE(review): d_map is cudaMalloc'ed on every call and no matching cudaFree is visible in this function - confirm it is released elsewhere + cudaMalloc(&stream_setup.streams[stream_id].dev_mem.d_map, sizeof(unsigned) * num_long_seg); + cudaMemcpy(stream_setup.streams[stream_id].dev_mem.d_map, map, sizeof(unsigned) * num_long_seg, cudaMemcpyHostToDevice); + delete[] map; // allocated with new[] above, so releasing it with free() is undefined behaviour + + cudaEventRecord(stream_setup.streams[stream_id].long_kernel_event, + stream_setup.streams[stream_id].cudastream); + plscore_async_long_forward_dp(&stream_setup.streams[stream_id].dev_mem, + &stream_setup.streams[stream_id].cudastream); + cudaEventRecord(stream_setup.streams[stream_id].stopevent, + stream_setup.streams[stream_id].cudastream); + plmem_async_d2h_long_memcpy(&stream_setup.streams[stream_id]); + stream_setup.streams[stream_id].busy = true; + cudaCheck(); +} + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +void init_stream_gpu(size_t* total_n, int* max_reads, int *min_n, char 
gpu_config_file[], Misc misc) { + plmem_stream_initialize(total_n, max_reads, min_n, gpu_config_file); + plrange_upload_misc(misc); + plscore_upload_misc(misc); +#ifdef DEBUG_PRINT + fprintf(stderr, "[Info::%s] gpu initialized for chaining with config %s\n", __func__, gpu_config_file); + fprintf(stderr, "[Info::%s] Compile time config: \n", __func__); +#ifdef USEHIP + fprintf(stderr, "\t\t USE HIP\n"); +#else + fprintf(stderr, "\t\t USE CUDA\n"); +#endif // USEHIP +#ifdef MAX_MICRO_BATCH + fprintf(stderr, "\t\t MAX MICRO BATCH \t%d\n", MAX_MICRO_BATCH); +#endif // MAX_MICRO_BATCH +#endif // DEBUG_PRINT +} + +/** + * worker for launching forward chaining on gpu (streaming) + * use KMALLOC and kfree for cpu memory management + * [in/out] in_arr_: ptr to array of reads, updated to a batch launched in previous run + * (NULL if no finishing batch) + * [in/out] n_read_: ptr to num of reads in array, updated to a batch launched in previous run + * (NULL if no finishing batch) +*/ +void chain_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t **in_arr_, int *n_read_, + int thread_id, void* km) { + // assume only one seg. and qlen_sum desn't matter + assert(opt->max_frag_len <= 0); + Misc misc = build_misc(mi, opt, 0, 1); + plchain_cal_score_async(in_arr_, n_read_, misc, stream_setup, thread_id, km); + if (in_arr_) { + int n_read = *n_read_; + chain_read_t* out_arr = *in_arr_; + for (int i = 0; i < n_read; i++) { + post_chaining_helper(mi, opt, &out_arr[i], misc, km); + } + } +} + + +/** + * worker for finish all forward chaining kernenls on gpu + * use KMALLOC and kfree for cpu memory management + * [out] batches: array of batches + * [out] num_reads: array of number of reads in each batch + */ +void finish_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t** reads_, + int* n_read_, int t, void* km) { + // assume only one seg. 
and qlen_sum desn't matter + assert(opt->max_frag_len <= 0); + Misc misc = build_misc(mi, opt, 0, 1); + /* Sync all the pending batches + backtracking */ + if (!stream_setup.streams[t].busy) { + *reads_ = NULL; + *n_read_ = 0; + return; + } + + chain_read_t* reads; + int n_read = 0; + cudaStreamSynchronize(stream_setup.streams[t].cudastream); + cudaCheck(); + + n_read = plchain_post_gpu_helper(stream_setup, t, misc, km); + reads = stream_setup.streams[t].reads; + stream_setup.streams[t].busy = false; + + for (int i = 0; i < n_read; i++) { + post_chaining_helper(mi, opt, &reads[i], misc, km); + } + + *reads_ = reads; + *n_read_ = n_read; + +} + + +void free_stream_gpu(int n_threads){ + plmem_stream_cleanup(); + + size_t gpu_free_mem, gpu_total_mem; + cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem); +#ifdef DEBUG_PRINT + fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB (after cleanup) \n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG); +#endif +} + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus \ No newline at end of file diff --git a/gpu/plchain.h b/gpu/plchain.h new file mode 100644 index 00000000..254f1f42 --- /dev/null +++ b/gpu/plchain.h @@ -0,0 +1,25 @@ +#ifndef _PLCHAIN_H_ +#define _PLCHAIN_H_ + +/* Range Kernel configuaration */ +typedef struct range_kernel_config_t { + int blockdim; // number of threads in each block + int cut_check_anchors; // number of anchors to check around each cut + int anchor_per_block; // number of anchors assgined to one block = max_it * blockdim +} range_kernel_config_t; + +/* Score Generation Kernel configuration */ +typedef struct score_kernel_config_t{ + int micro_batch; + int short_blockdim; + int long_blockdim; + int mid_blockdim; + int short_griddim; + int long_griddim; + int mid_griddim; + int cut_unit; + int long_seg_cutoff; + int mid_seg_cutoff; +} score_kernel_config_t; + +#endif // _PLCHAIN_H_ \ No newline at end of file diff --git a/gpu/plmem.cu b/gpu/plmem.cu new file mode 100644 
index 00000000..7ae07c84 --- /dev/null +++ b/gpu/plmem.cu @@ -0,0 +1,642 @@ +/* GPU memory management */ + + +#include +#include +#include +#include +#include "plmem.cuh" +#include "plrange.cuh" +#include "plscore.cuh" +#include +void plmem_malloc_host_mem(hostMemPtr *host_mem, size_t anchor_per_batch, + int range_grid_size, size_t buffer_size_long) { +#ifdef DEBUG_PRINT + size_t host_mem_size; + host_mem_size = anchor_per_batch * (sizeof(int32_t) + sizeof(int32_t) + + sizeof(int8_t) + sizeof(int32_t) + sizeof(int32_t) + sizeof(uint16_t)); + host_mem_size += range_grid_size * (sizeof(size_t) + sizeof(size_t) + sizeof(size_t)); + fprintf(stderr, "[Info] Host Malloc Pinned Memory Size %.2f GB\n", (float)host_mem_size / OneG); +#endif + + // data array + cudaMallocHost((void**)&host_mem->ax, anchor_per_batch * sizeof(int32_t)); + cudaMallocHost((void**)&host_mem->ay, anchor_per_batch * sizeof(int32_t)); + cudaMallocHost((void**)&host_mem->sid, anchor_per_batch * sizeof(int8_t)); + cudaMallocHost((void**)&host_mem->xrev, anchor_per_batch * sizeof(int32_t)); + cudaMallocHost((void**)&host_mem->f, anchor_per_batch * sizeof(int32_t)); + cudaMallocHost((void**)&host_mem->p, anchor_per_batch * sizeof(uint16_t)); + + //index + cudaMallocHost((void**)&host_mem->start_idx, range_grid_size * sizeof(size_t)); + cudaMallocHost((void**)&host_mem->read_end_idx, range_grid_size * sizeof(size_t)); + cudaMallocHost((void**)&host_mem->cut_start_idx, range_grid_size * sizeof(size_t)); + + cudaMallocHost((void**)&host_mem->long_segs_num, sizeof(unsigned int)); + cudaCheck(); +} + +void plmem_malloc_long_mem(longMemPtr *long_mem, size_t buffer_size_long) { +#ifdef DEBUG_PRINT + size_t host_mem_size; + host_mem_size = buffer_size_long / (score_kernel_config.long_seg_cutoff * score_kernel_config.cut_unit) * sizeof(seg_t); + host_mem_size += buffer_size_long * (sizeof(int32_t) + sizeof(uint16_t)); + fprintf(stderr, "[Info] Host Malloc Pinned Memory Size %.2f GB (long seg)\n", 
(float)host_mem_size / OneG); +#endif + // data array + cudaMallocHost((void**)&long_mem->long_segs_og_idx, buffer_size_long / (score_kernel_config.long_seg_cutoff * score_kernel_config.cut_unit) * sizeof(seg_t)); + cudaMallocHost((void**)&long_mem->f_long, buffer_size_long * sizeof(int32_t)); + cudaMallocHost((void**)&long_mem->p_long, buffer_size_long * sizeof(uint16_t)); + cudaMallocHost((void**)&long_mem->total_long_segs_num, sizeof(unsigned int)); + cudaMallocHost((void**)&long_mem->total_long_segs_n, sizeof(size_t)); + cudaCheck(); +} + +void plmem_free_host_mem(hostMemPtr *host_mem) { + cudaFreeHost(host_mem->ax); + cudaFreeHost(host_mem->ay); + cudaFreeHost(host_mem->sid); + cudaFreeHost(host_mem->xrev); + cudaFreeHost(host_mem->f); + cudaFreeHost(host_mem->p); + cudaFreeHost(host_mem->start_idx); + cudaFreeHost(host_mem->read_end_idx); + cudaFreeHost(host_mem->cut_start_idx); + cudaFreeHost(host_mem->long_segs_num); + cudaCheck(); +} + +void plmem_free_long_mem(longMemPtr *long_mem) { + cudaFreeHost(long_mem->long_segs_og_idx); + cudaFreeHost(long_mem->f_long); + cudaFreeHost(long_mem->p_long); + cudaFreeHost(long_mem->total_long_segs_num); + cudaFreeHost(long_mem->total_long_segs_n); + cudaCheck(); +} + +void plmem_malloc_device_mem(deviceMemPtr *dev_mem, size_t anchor_per_batch, int range_grid_size, int num_cut){ + // data array + cudaMalloc(&dev_mem->d_ax, anchor_per_batch * sizeof(int32_t)); + cudaMalloc(&dev_mem->d_ay, anchor_per_batch * sizeof(int32_t)); + cudaMalloc(&dev_mem->d_sid, anchor_per_batch * sizeof(int8_t)); + cudaMalloc(&dev_mem->d_xrev, anchor_per_batch * sizeof(int32_t)); + cudaMalloc(&dev_mem->d_range, anchor_per_batch * sizeof(int32_t)); + cudaMalloc(&dev_mem->d_f, anchor_per_batch * sizeof(int32_t)); + cudaMalloc(&dev_mem->d_p, anchor_per_batch * sizeof(uint16_t)); + + //index + cudaMalloc(&dev_mem->d_start_idx, range_grid_size * sizeof(size_t)); + cudaMalloc(&dev_mem->d_read_end_idx, range_grid_size * sizeof(size_t)); + 
cudaMalloc(&dev_mem->d_cut_start_idx, range_grid_size * sizeof(size_t)); + + // cut + cudaMalloc(&dev_mem->d_cut, num_cut * sizeof(size_t)); + cudaMalloc(&dev_mem->d_long_seg_count, sizeof(unsigned int)); + cudaMalloc(&dev_mem->d_long_seg, dev_mem->buffer_size_long / (score_kernel_config.long_seg_cutoff * score_kernel_config.cut_unit) * sizeof(seg_t)); + cudaMalloc(&dev_mem->d_long_seg_og, dev_mem->buffer_size_long / (score_kernel_config.long_seg_cutoff * score_kernel_config.cut_unit) * sizeof(seg_t)); + cudaMalloc(&dev_mem->d_mid_seg_count, sizeof(unsigned int)); + cudaMalloc(&dev_mem->d_mid_seg, num_cut/(score_kernel_config.mid_seg_cutoff + 1) * sizeof(seg_t)); + + size_t gpu_free_mem, gpu_total_mem; + cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem); +#ifdef DEBUG_PRINT + fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB (before alloc long seg buffer) \n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG); +#endif + + // long seg buffer + cudaMalloc(&dev_mem->d_ax_long, dev_mem->buffer_size_long * sizeof(int32_t)); + cudaMalloc(&dev_mem->d_ay_long, dev_mem->buffer_size_long * sizeof(int32_t)); + cudaMalloc(&dev_mem->d_sid_long, dev_mem->buffer_size_long * sizeof(int8_t)); + cudaMalloc(&dev_mem->d_range_long, dev_mem->buffer_size_long * sizeof(int32_t)); + cudaMalloc(&dev_mem->d_total_n_long, sizeof(size_t)); + cudaMalloc(&dev_mem->d_f_long, sizeof(int32_t) * dev_mem->buffer_size_long); + cudaMalloc(&dev_mem->d_p_long, sizeof(uint16_t) * dev_mem->buffer_size_long); + cudaCheck(); +} + +void plmem_free_device_mem(deviceMemPtr *dev_mem) { + cudaFree(dev_mem->d_ax); + cudaFree(dev_mem->d_ay); + cudaFree(dev_mem->d_sid); + cudaFree(dev_mem->d_xrev); + cudaFree(dev_mem->d_range); + cudaFree(dev_mem->d_f); + cudaFree(dev_mem->d_p); + + cudaFree(dev_mem->d_start_idx); + cudaFree(dev_mem->d_read_end_idx); + cudaFree(dev_mem->d_cut_start_idx); + + cudaFree(dev_mem->d_cut); + cudaFree(dev_mem->d_long_seg); + cudaFree(dev_mem->d_long_seg_count); + 
cudaFree(dev_mem->d_mid_seg); + cudaFree(dev_mem->d_mid_seg_count); + + cudaFree(dev_mem->d_ax_long); + cudaFree(dev_mem->d_ay_long); + cudaFree(dev_mem->d_sid_long); + cudaFree(dev_mem->d_range_long); + cudaFree(dev_mem->d_total_n_long); + cudaCheck(); +} + + +/** + * Input + * reads[]: array + * n_reads + * config: range kernel configuartions + * Output + * *host_mem populate host_mem +*/ +void plmem_reorg_input_arr(chain_read_t *reads, int n_read, + hostMemPtr *host_mem, range_kernel_config_t config) { + size_t total_n = 0, cut_num = 0; + size_t griddim = 0; + + host_mem->size = n_read; + for (int i = 0; i < n_read; i++) { + total_n += reads[i].n; + } + host_mem->total_n = total_n; + + size_t idx = 0; + for (int i = 0; i < n_read; i++) { + int n = reads[i].n; + int block_num = (n - 1) / config.anchor_per_block + 1; + + host_mem->start_idx[griddim] = idx; + size_t end_idx = idx + config.anchor_per_block; + host_mem->read_end_idx[griddim] = idx + n; + host_mem->cut_start_idx[griddim] = cut_num; + for (int j = 1; j < block_num; j++) { + cut_num += (config.anchor_per_block / config.blockdim); + host_mem->start_idx[griddim + j] = end_idx; + end_idx = + host_mem->start_idx[griddim + j] + config.anchor_per_block; + host_mem->read_end_idx[griddim + j] = idx + n; + host_mem->cut_start_idx[griddim + j] = cut_num; + } + cut_num += (n - (block_num - 1) * config.anchor_per_block - 1) / + config.blockdim; + end_idx = idx + n; + + griddim += block_num; + + for (int j = 0; j < n; j++) { + host_mem->ax[idx] = (int32_t)reads[i].a[j].x; + host_mem->ay[idx] = (int32_t)reads[i].a[j].y; + host_mem->sid[idx] = (reads[i].a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; + host_mem->xrev[idx] = reads[i].a[j].x >> 32; + ++idx; + } + } + host_mem->cut_num = cut_num; + host_mem->griddim = griddim; +} + +void plmem_async_h2d_short_memcpy(stream_ptr_t* stream_ptrs, size_t uid) { + hostMemPtr *host_mem = &stream_ptrs->host_mems[uid]; + deviceMemPtr *dev_mem = &stream_ptrs->dev_mem; + 
cudaStream_t *stream = &stream_ptrs->cudastream; + cudaMemcpyAsync(dev_mem->d_ax, host_mem->ax, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_ay, host_mem->ay, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_sid, host_mem->sid, + sizeof(int8_t) * host_mem->total_n, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_xrev, host_mem->xrev, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_start_idx, host_mem->start_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_read_end_idx, host_mem->read_end_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_cut_start_idx, host_mem->cut_start_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice, + *stream); + cudaMemsetAsync(dev_mem->d_cut, 0xff, + sizeof(size_t) * host_mem->cut_num, *stream); + cudaMemsetAsync(dev_mem->d_f, 0, sizeof(int32_t) * host_mem->total_n, + *stream); + cudaMemsetAsync(dev_mem->d_p, 0, sizeof(uint16_t) * host_mem->total_n, + *stream); + cudaCheck(); + dev_mem->total_n = host_mem->total_n; + dev_mem->num_cut = host_mem->cut_num; + dev_mem->size = host_mem->size; + dev_mem->griddim = host_mem->griddim; +} + +void plmem_async_h2d_memcpy(stream_ptr_t* stream_ptrs) { + size_t uid = 0; + hostMemPtr *host_mem = &stream_ptrs->host_mems[uid]; + deviceMemPtr *dev_mem = &stream_ptrs->dev_mem; + cudaStream_t *stream = &stream_ptrs->cudastream; + cudaMemcpyAsync(dev_mem->d_ax, host_mem->ax, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_ay, host_mem->ay, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_sid, host_mem->sid, + sizeof(int8_t) * host_mem->total_n, cudaMemcpyHostToDevice, + *stream); + 
cudaMemcpyAsync(dev_mem->d_xrev, host_mem->xrev, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_start_idx, host_mem->start_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_read_end_idx, host_mem->read_end_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice, + *stream); + cudaMemcpyAsync(dev_mem->d_cut_start_idx, host_mem->cut_start_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice, + *stream); + cudaMemsetAsync(dev_mem->d_cut, 0xff, + sizeof(size_t) * host_mem->cut_num, *stream); + cudaMemsetAsync(dev_mem->d_f, 0, sizeof(int32_t) * host_mem->total_n, + *stream); + cudaMemsetAsync(dev_mem->d_p, 0, sizeof(uint16_t) * host_mem->total_n, + *stream); + cudaCheck(); + dev_mem->total_n = host_mem->total_n; + dev_mem->num_cut = host_mem->cut_num; + dev_mem->size = host_mem->size; + dev_mem->griddim = host_mem->griddim; +} + +void plmem_sync_h2d_memcpy(hostMemPtr *host_mem, deviceMemPtr *dev_mem) { + cudaMemcpy(dev_mem->d_ax, host_mem->ax, sizeof(int32_t) * host_mem->total_n, + cudaMemcpyHostToDevice); + cudaMemcpy(dev_mem->d_ay, host_mem->ay, sizeof(int32_t) * host_mem->total_n, + cudaMemcpyHostToDevice); + cudaMemcpy(dev_mem->d_sid, host_mem->sid, sizeof(int8_t) * host_mem->total_n, + cudaMemcpyHostToDevice); + cudaMemcpy(dev_mem->d_xrev, host_mem->xrev, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_mem->d_start_idx, host_mem->start_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice); + cudaMemcpy(dev_mem->d_read_end_idx, host_mem->read_end_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice); + cudaMemcpy(dev_mem->d_cut_start_idx, host_mem->cut_start_idx, + sizeof(size_t) * host_mem->griddim, cudaMemcpyHostToDevice); + cudaMemset(dev_mem->d_cut, 0xff, sizeof(size_t) * host_mem->cut_num); + dev_mem->total_n = host_mem->total_n; + dev_mem->num_cut = 
host_mem->cut_num; + dev_mem->size = host_mem->size; + dev_mem->griddim = host_mem->griddim; + cudaCheck(); +} + +void plmem_async_d2h_memcpy(stream_ptr_t *stream_ptrs) { + size_t uid = 0; + hostMemPtr *host_mem = &stream_ptrs->host_mems[uid]; + longMemPtr *long_mem = &stream_ptrs->long_mem; + deviceMemPtr *dev_mem = &stream_ptrs->dev_mem; + cudaStream_t *stream = &stream_ptrs->cudastream; + cudaMemcpyAsync(host_mem->f, dev_mem->d_f, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyDeviceToHost, + *stream); + cudaMemcpyAsync(host_mem->p, dev_mem->d_p, + sizeof(uint16_t) * host_mem->total_n, + cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(long_mem->long_segs_og_idx, dev_mem->d_long_seg_og, + dev_mem->buffer_size_long / (score_kernel_config.long_seg_cutoff * score_kernel_config.cut_unit) * sizeof(seg_t), + cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(host_mem->long_segs_num, dev_mem->d_long_seg_count, + sizeof(unsigned int), cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(long_mem->f_long, dev_mem->d_f_long, sizeof(int32_t)*dev_mem->buffer_size_long, + cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(long_mem->p_long, dev_mem->d_p_long, sizeof(uint16_t)*dev_mem->buffer_size_long, + cudaMemcpyDeviceToHost, *stream); + cudaCheck(); +} + +void plmem_async_d2h_short_memcpy(stream_ptr_t *stream_ptrs, size_t uid) { + hostMemPtr *host_mem = &stream_ptrs->host_mems[uid]; + deviceMemPtr *dev_mem = &stream_ptrs->dev_mem; + cudaStream_t *stream = &stream_ptrs->cudastream; + cudaMemcpyAsync(host_mem->f, dev_mem->d_f, + sizeof(int32_t) * host_mem->total_n, cudaMemcpyDeviceToHost, + *stream); + cudaMemcpyAsync(host_mem->p, dev_mem->d_p, + sizeof(uint16_t) * host_mem->total_n, + cudaMemcpyDeviceToHost, *stream); + // copy back d_long_seg_count to long_segs_num, this is an accumulative value + cudaMemcpyAsync(host_mem->long_segs_num, dev_mem->d_long_seg_count, + sizeof(unsigned int), cudaMemcpyDeviceToHost, *stream); + cudaCheck(); +} + +void 
plmem_async_d2h_long_memcpy(stream_ptr_t *stream_ptrs) { + size_t uid = 0; + longMemPtr *long_mem = &stream_ptrs->long_mem; + deviceMemPtr *dev_mem = &stream_ptrs->dev_mem; + cudaStream_t *stream = &stream_ptrs->cudastream; + cudaMemcpyAsync(long_mem->long_segs_og_idx, dev_mem->d_long_seg_og, + dev_mem->buffer_size_long / (score_kernel_config.long_seg_cutoff * score_kernel_config.cut_unit) * sizeof(seg_t), + cudaMemcpyDeviceToHost, *stream); + // cudaMemcpyAsync(&long_mem->total_long_segs_num, dev_mem->d_long_seg_count, + // sizeof(unsigned int), cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(long_mem->f_long, dev_mem->d_f_long, sizeof(int32_t)*dev_mem->buffer_size_long, + cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(long_mem->p_long, dev_mem->d_p_long, sizeof(uint16_t)*dev_mem->buffer_size_long, + cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(long_mem->total_long_segs_n, dev_mem->d_total_n_long, sizeof(size_t), + cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(long_mem->total_long_segs_num, dev_mem->d_long_seg_count, sizeof(unsigned int), + cudaMemcpyDeviceToHost, *stream); + cudaCheck(); +} + +void plmem_sync_d2h_memcpy(hostMemPtr *host_mem, deviceMemPtr *dev_mem){ + cudaMemcpy(host_mem->f, dev_mem->d_f, sizeof(int32_t) * host_mem->total_n, + cudaMemcpyDeviceToHost); + cudaMemcpy(host_mem->p, dev_mem->d_p, sizeof(uint16_t) * host_mem->total_n, + cudaMemcpyDeviceToHost); + cudaCheck(); +} + +//////////////////// Initialization and Cleanup //////////////////////// +streamSetup_t stream_setup; + +#include "cJSON.h" +cJSON *plmem_parse_gpu_config(const char filename[]){ + // read json file to cstring + char *buffer = 0; + long length; + FILE *f = fopen(filename, "rb"); + + if (f) { + fseek(f, 0, SEEK_END); + length = ftell(f); + fseek(f, 0, SEEK_SET); + buffer = (char*)malloc(length); + if (buffer) { + fread(buffer, 1, length, f); + } + fclose(f); + } + + if (!buffer) { + fprintf(stderr, "[Error] fail to open gpu config file %s\n", filename); + 
exit(1); + } + + cJSON *json = cJSON_Parse(buffer); + if (!json) { + const char *error_ptr = cJSON_GetErrorPtr(); + if (error_ptr != NULL) { + fprintf(stderr, "[Error] cJSON error before %s\n", error_ptr); + } + exit(1); + } + + return json; +} + +int get_json_int(cJSON *json, const char name[]) { + cJSON *elt = cJSON_GetObjectItem(json, name); + if (!cJSON_IsNumber(elt)) { + fprintf(stderr, "[Error] cJSON error failed to get field %s\n", name); + exit(1); + } + return elt->valueint; +} + +void plmem_config_kernels(cJSON *json) { + cJSON *range_config_json = cJSON_GetObjectItem(json, "range_kernel"); + range_kernel_config.blockdim = get_json_int(range_config_json, "blockdim"); + range_kernel_config.cut_check_anchors = + get_json_int(range_config_json, "cut_check_anchors"); + range_kernel_config.anchor_per_block = + get_json_int(range_config_json, "anchor_per_block"); + + cJSON *score_config_json = cJSON_GetObjectItem(json, "score_kernel"); + cudaDeviceProp device_prop; + cudaGetDeviceProperties(&device_prop, 0); + score_kernel_config.short_blockdim = device_prop.warpSize; + score_kernel_config.long_blockdim = device_prop.maxThreadsPerBlock; + score_kernel_config.mid_blockdim = + get_json_int(score_config_json, "mid_blockdim"); + score_kernel_config.short_griddim = + get_json_int(score_config_json, "short_griddim"); + score_kernel_config.long_griddim = + get_json_int(score_config_json, "long_griddim"); + score_kernel_config.mid_griddim = + get_json_int(score_config_json, "mid_griddim"); + score_kernel_config.long_seg_cutoff = + get_json_int(score_config_json, "long_seg_cutoff"); + score_kernel_config.mid_seg_cutoff = + get_json_int(score_config_json, "mid_seg_cutoff"); + score_kernel_config.cut_unit = range_kernel_config.blockdim; + score_kernel_config.micro_batch = + get_json_int(score_config_json, "micro_batch"); + if (score_kernel_config.micro_batch > MAX_MICRO_BATCH) { + fprintf(stderr, "[Error: gpu config] score_kernel:micro_batch should be less than %d\n" + 
"\t\t or recompile with MAX_MICRO_BATCH=%d" + , MAX_MICRO_BATCH, score_kernel_config.micro_batch); + exit(1); + } + +} + +void plmem_config_stream(size_t *max_range_grid_, size_t *max_num_cut_, size_t max_total_n, size_t max_read, size_t min_n){ + size_t max_range_grid, max_num_cut; + max_range_grid = + (max_total_n - 1) / range_kernel_config.anchor_per_block + 1 + max_read; + max_num_cut = (max_total_n - 1) / range_kernel_config.blockdim + 1 + max_read; + *max_range_grid_ = max_range_grid; + *max_num_cut_ = max_num_cut; + + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + cudaCheck(); + + if (*max_range_grid_ > prop.maxGridSize[0]) { + fprintf(stderr, "Invalid memory config!\n"); + exit(1); + } + +} + + +template +void plmem_config_batch(cJSON *json, int *num_stream_, + int *min_n_, size_t *max_total_n_, + int *max_read_, size_t *long_seg_buffer_size_) { + if (is_blooking) + *num_stream_ = 16; + else + *num_stream_ = get_json_int(json, "num_streams"); + + size_t min_anchors = get_json_int(json, "min_n"); + *min_n_ = min_anchors; + + /* If Use define max_total_n & max_read */ + // FIXME: this is limited by int32max + cJSON *max_total_n_json = cJSON_GetObjectItem(json, "max_total_n"); + cJSON *max_read_json = cJSON_GetObjectItem(json, "max_read"); + cJSON *long_seg_buffer_size_json = cJSON_GetObjectItem(json, "long_seg_buffer_size"); + if (max_total_n_json && max_read_json){ + *max_total_n_ = (size_t) max_total_n_json->valuedouble; + *max_read_ = max_read_json->valueint; + *long_seg_buffer_size_ = long_seg_buffer_size_json->valueint; + return; + } + + /* Determine configuration smartly */ + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + + size_t avail_mem_per_stream = (prop.totalGlobalMem / *num_stream_ ) * 0.9; + + // memory per anchor = (ax + ay + range + f + p) + (start_idx + read_end_idx + // + cut_start_idx) + cut + long_seg size: F1 = ax + ay + range + f + p; F2 + // = start_idx + read_end_idx + cut_start_idx; F3 = cut; F4 = long_seg + 
int F1 = 8 + 8 + 4 + 4 + 2, F2 = 8 + 8 + 8, F3 = 8, F4 = 16; + // TODO: define these data types + + // max iteration of each block, must be an integer + // int max_it = + // range_kernel_config.anchor_per_block / range_kernel_config.blockdim; + // int blockdim = range_kernel_config.blockdim; + + // g = max_grid_size + // g * F2 + g * blockdim * max_it * F1 + max_cut * F3 + max_cut/2 * F4 < + // mem_per_stream max_cut = g * max_it + /* + size_t cost_per_anchor = F1; + size_t cost_per_grid = F2; + size_t cost_per_cut = F3 + F4 / 2; + */ + size_t avg_read_n = get_json_int(json, "avg_read_n"); + + /** + * Assume max_total_n = max_read * avg_read_n + * max_grid = max_total_n / range.anchor_per_block + max_read + * = max_read *( avg_read_n / anchor_per_block + 1) + * max_cut = max_total_n / range.blockdim + max_read + * = max_read * ( avg_read_n / blockdim + 1) + * total_mem = max_grid * cost_per_grid + max_cut * cost_per_cut + max_total_n * cost_per_anchor + */ + + float grid_cost_per_read = + (avg_read_n / (float)range_kernel_config.anchor_per_block + 1) * F2; + float cut_cost_per_read = + (avg_read_n / (float)range_kernel_config.blockdim + 1) * (F3 + F4 / 2); + *max_read_ = floor(avail_mem_per_stream / + (grid_cost_per_read + cut_cost_per_read + F1 * avg_read_n)); + *max_total_n_ = *max_read_ * avg_read_n; +} + +// intialize and config kernels for gpu blocking setup +void plmem_initialize(size_t *max_total_n_, int *max_read_, + int *min_anchors_) { +#ifndef GPU_CONFIG + cJSON *json = plmem_parse_gpu_config("gpu_config.json"); +#else + cJSON *json = plmem_parse_gpu_config(GPU_CONFIG); +#endif + plmem_config_kernels(json); + int num_streams; + size_t buffer_size_long; + plmem_config_batch(json, &num_streams, min_anchors_, max_total_n_, + max_read_, &buffer_size_long); +} + +// initialize global variable stream_setup +void plmem_stream_initialize(size_t *max_total_n_, + int *max_read_, int *min_anchors_, char* gpu_config_file) { + + int num_stream; + size_t 
max_anchors_stream, max_range_grid, max_num_cut, long_seg_buffer_size; + + cJSON *json = plmem_parse_gpu_config(gpu_config_file); + + plmem_config_kernels(json); + size_t gpu_free_mem, gpu_total_mem; + cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem); + plmem_config_batch(json, &num_stream, min_anchors_, &max_anchors_stream, + max_read_, &long_seg_buffer_size); + plmem_config_stream(&max_range_grid, &max_num_cut, max_anchors_stream, + *max_read_, *min_anchors_); + + stream_setup.num_stream = num_stream; + // assert(num_stream > 1); + + stream_setup.streams = new stream_ptr_t[num_stream]; +#ifdef DEBUG_PRINT + fprintf(stderr, + "[Info] max anchors per stream: %zu, max range grid %zu " + "max_num_cut %zu long_seg_buffer_size %zu\n", + max_anchors_stream, max_range_grid, max_num_cut, + long_seg_buffer_size); +#endif // DEBUG_PRINT + + for (int i = 0; i < num_stream; i++) { + stream_setup.streams[i].busy = false; + cudaStreamCreate(&stream_setup.streams[i].cudastream); + cudaEventCreate(&stream_setup.streams[i].stopevent); + cudaEventCreate(&stream_setup.streams[i].startevent); + cudaEventCreate(&stream_setup.streams[i].long_kernel_event); + cudaCheck(); + stream_setup.streams[i].dev_mem.buffer_size_long = long_seg_buffer_size; + // one stream has multiple host mems + for (int j = 0; j < score_kernel_config.micro_batch; j++) { + plmem_malloc_host_mem(&stream_setup.streams[i].host_mems[j], max_anchors_stream, + max_range_grid, long_seg_buffer_size); + cudaEventCreate(&stream_setup.streams[i].short_kernel_start_event[j]); + cudaEventCreate(&stream_setup.streams[i].short_kernel_stop_event[j]); + } + // one stream has one long mem and one device mem + plmem_malloc_long_mem(&stream_setup.streams[i].long_mem, long_seg_buffer_size); + plmem_malloc_device_mem(&stream_setup.streams[i].dev_mem, max_anchors_stream, + max_range_grid, max_num_cut); + cudaMemset(stream_setup.streams[i].dev_mem.d_long_seg_count, 0, sizeof(unsigned int)); + cudaCheck(); + 
cudaMemset(stream_setup.streams[i].dev_mem.d_mid_seg_count, 0, sizeof(unsigned int)); + cudaCheck(); + cudaMemset(stream_setup.streams[i].dev_mem.d_total_n_long, 0, sizeof(size_t)); + cudaCheck(); + } + +cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem); +#ifdef DEBUG_PRINT + fprintf(stderr, "[Info] GPU free mem: %f GB, total mem: %f GB\n", (float)gpu_free_mem / OneG, (float)gpu_total_mem / OneG); +#endif + + *max_total_n_ = max_anchors_stream * score_kernel_config.micro_batch; + *max_read_ = *max_read_ * score_kernel_config.micro_batch; + + stream_setup.max_anchors_stream = max_anchors_stream; + stream_setup.max_range_grid = max_range_grid; + stream_setup.max_num_cut = max_num_cut; + stream_setup.long_seg_buffer_size_stream = long_seg_buffer_size; +} + +void plmem_stream_cleanup() { + for (int i = 0; i < stream_setup.num_stream; i++) { + cudaStreamDestroy(stream_setup.streams[i].cudastream); + cudaEventDestroy(stream_setup.streams[i].stopevent); + cudaEventDestroy(stream_setup.streams[i].startevent); + cudaEventDestroy(stream_setup.streams[i].long_kernel_event); + cudaCheck(); + // free multiple host mems + for (int j = 0; j < score_kernel_config.micro_batch; j++) { + plmem_free_host_mem(&stream_setup.streams[i].host_mems[j]); + } + plmem_free_long_mem(&stream_setup.streams[i].long_mem); + plmem_free_device_mem(&stream_setup.streams[i].dev_mem); + } + delete[] stream_setup.streams; +} diff --git a/gpu/plmem.cuh b/gpu/plmem.cuh new file mode 100644 index 00000000..c6500b5d --- /dev/null +++ b/gpu/plmem.cuh @@ -0,0 +1,147 @@ +#ifndef _PLMEM_CUH_ +#define _PLMEM_CUH_ +#include "hipify.cuh" +#include "plchain.h" +#include "plutils.h" + +#ifndef MAX_MICRO_BATCH +#define MAX_MICRO_BATCH 8 +#endif // MAX_MICRO_BATCH + +#define OneK 1024 +#define OneM (OneK*1024) +#define OneG (OneM*1024) + + +typedef struct { + int index; // read index / batch index + int griddim; // grid for range selection kernel. 
/* Device-side buffers and batch metadata for one stream. */
typedef struct {
    // Metadata of the batch currently on the device; copied from the host
    // descriptor by the plmem_*_h2d_* functions.
    int size;
    int griddim;
    size_t total_n;
    size_t num_cut;
    // device memory ptrs
    // data array
    int32_t *d_ax;
    int32_t *d_ay;
    int8_t *d_sid;   // a[].y >> 40 & 0xff
    int32_t *d_xrev; // a[].x >> 32
    int32_t *d_range;
    int32_t *d_f;  // score
    uint16_t *d_p; // predecessor

    // range selection index
    size_t *d_start_idx;
    size_t *d_read_end_idx;
    size_t *d_cut_start_idx;

    // cut
    size_t *d_cut;  // cut
    unsigned int *d_long_seg_count; // total number of long segs (aggregated across micro batches)
    seg_t *d_long_seg;              // start & end idx of long segs in the long seg buffer (aggregated across micro batches)
    seg_t *d_long_seg_og;           // start & end idx of long segs in the micro batch (aggregated across micro batches)
    unsigned int *d_mid_seg_count;  // private to micro batch
    seg_t *d_mid_seg;               // private to micro batch

    // long segment buffer
    unsigned *d_map; // NOTE(review): not allocated or freed in the plmem.cu code visible here — confirm it is still used
    int32_t *d_ax_long, *d_ay_long;
    int8_t *d_sid_long;
    int32_t *d_range_long;
    size_t *d_total_n_long;
    size_t buffer_size_long; // capacity of the long segment buffers, in elements
    int32_t *d_f_long;  // score, size: buffer_size_long * sizeof(int32_t)
    uint16_t *d_p_long; // predecessor, size: buffer_size_long * sizeof(uint16_t)
} deviceMemPtr;
plmem_malloc_device_mem(deviceMemPtr *dev_mem, size_t anchor_per_batch, + int range_grid_size, int num_cut); +void plmem_free_device_mem(deviceMemPtr *dev_mem); + +// data movement +void plmem_reorg_input_arr(chain_read_t *reads, int n_read, + hostMemPtr *host_mem, range_kernel_config_t config); +void plmem_async_h2d_memcpy(stream_ptr_t *stream_ptrs); +void plmem_async_h2d_short_memcpy(stream_ptr_t *stream_ptrs, size_t uid); +void plmem_sync_h2d_memcpy(hostMemPtr *host_mem, deviceMemPtr *dev_mem); +void plmem_async_d2h_memcpy(stream_ptr_t *stream_ptrs); +void plmem_async_d2h_short_memcpy(stream_ptr_t *stream_ptrs, size_t uid); +void plmem_async_d2h_long_memcpy(stream_ptr_t *stream_ptrs); +void plmem_sync_d2h_memcpy(hostMemPtr *host_mem, deviceMemPtr *dev_mem); +#endif // _PLMEM_CUH_ \ No newline at end of file diff --git a/gpu/plrange.cu b/gpu/plrange.cu new file mode 100644 index 00000000..b753883e --- /dev/null +++ b/gpu/plrange.cu @@ -0,0 +1,285 @@ +#include +#include +#include +#include +#include "plrange.cuh" +#include "hipify.cuh" + + +/* + +CUDA/HIP kernel for range selection using forward chaining + +*/ + +/* kernels begin */ +__constant__ int d_max_dist_x; +__constant__ int d_max_iter; +__constant__ int d_cut_check_anchors; + +inline __device__ int64_t range_binary_search(const int32_t* ax, const int32_t* rev, int64_t i, int64_t st_end){ + int64_t st_high = st_end, st_low=i; + while (st_high != st_low) { + int64_t mid = (st_high + st_low -1) / 2+1; + if (rev[i] != rev[mid] || ax[mid] > ax[i] + d_max_dist_x) { + st_high = mid -1; + } else { + st_low = mid; + } + } + return st_high; +} + + +/** + * Forward Range Selection Kernel using global memory and binary range search. + * cut reads into segements where successor range = 0. 
+*/ +__global__ void range_selection_kernel_binary(const int32_t* ax, const int32_t* rev, size_t *start_idx_arr, size_t *read_end_idx_arr, + int32_t *range, size_t* cut, size_t* cut_start_idx, size_t total_n, range_kernel_config_t config){ + int tid = threadIdx.x; + int bid = blockIdx.x; + + size_t start_idx = start_idx_arr[bid]; + size_t read_end_idx = read_end_idx_arr[bid]; + size_t end_idx = start_idx + config.anchor_per_block; + end_idx = end_idx > read_end_idx ? read_end_idx : end_idx; + size_t cut_idx = cut_start_idx[bid]; + if(tid == 0 && (bid == 0 || read_end_idx_arr[bid-1] != read_end_idx)){ + cut[cut_idx] = start_idx; + } + cut_idx++; + int range_op[3] = {16, 512, 5000}; // Range Options + range_op[2] = d_max_iter; + for (size_t i = start_idx + tid; i < end_idx; i += blockDim.x) { + size_t st_max = i + d_max_iter; + st_max = st_max < read_end_idx ? st_max : read_end_idx -1; + size_t st; + for (int j=0; j<3; ++j){ + st = i + range_op[j]; + st = st <= st_max ? st : st_max; + assert(st < total_n); + assert(i < total_n); + if (st > i && (rev[st] != rev[i] || ax[st] > ax[i] + d_max_dist_x)){ + break; + } + } + st = range_binary_search(ax, rev, i, st); + range[i] = st - i; + + if (tid >= blockDim.x - d_cut_check_anchors && + blockDim.x - tid + i <= end_idx) { + if (st == i) cut[cut_idx] = i+1; + } + cut_idx++; + } +} + +/** + * Forward Range Selection Kernel using global memory and linear range search. + * cut reads into segements where successor range = 0. + */ +__global__ void range_selection_kernel_naive(const int32_t* ax, const int32_t* rev, size_t *start_idx_arr, size_t *read_end_idx_arr, + int32_t *range, size_t* cut, size_t* cut_start_idx, size_t total_n, range_kernel_config_t config){ + int tid = threadIdx.x; + int bid = blockIdx.x; + + size_t start_idx = start_idx_arr[bid]; + size_t read_end_idx = read_end_idx_arr[bid]; + size_t end_idx = start_idx + config.anchor_per_block; + end_idx = end_idx > read_end_idx ? 
read_end_idx : end_idx; + assert(end_idx == (bid +1 < gridDim.x) ? start_idx_arr[bid+1]: total_n); + // if(end_idx_ref != end_idx){ + // if (tid == 0){ + // int grimdim = gridDim.x; + // printf("start idx %d anchor_per_block %d read_end_idx %d, next start idx %d gridDim %d bid %d\n", + // start_idx, config.anchor_per_block, read_end_idx, start_idx_arr[bid+1], grimdim, bid); + // } + // } + // __syncthreads(); + + size_t cut_idx = cut_start_idx[bid]; + if(tid == 0 && (bid == 0 || read_end_idx_arr[bid-1] != read_end_idx)){ + cut[cut_idx] = start_idx; + } + cut_idx++; + for (size_t i = start_idx + tid; i < end_idx; i += blockDim.x){ + size_t st = i + d_max_iter; + st = i + d_max_iter < read_end_idx ? st : read_end_idx -1; + assert(st < total_n); + assert(i < total_n); + while (st > i && + (rev[i] != rev[st] // NOTE: different prefix cannot become predecessor + || ax[st] > ax[i] + d_max_dist_x)) { // NOTE: same prefix compare the value + --st; + } + range[i] = st - i; + + if (tid >= blockDim.x - d_cut_check_anchors && blockDim.x - tid + i <= end_idx) { + if (st == i) cut[cut_idx] = i+1; + } + cut_idx++; + } +} + +// __global__ void range_selection_kernel(const int64_t* ax, size_t *start_idx_arr, size_t *read_end_idx_arr, int32_t *range){ +// int tid = threadIdx.x; +// int bid = blockIdx.x; + +// size_t start_idx = start_idx_arr[bid]; +// size_t read_end_idx = read_end_idx_arr[bid]; +// size_t end_idx = start_idx + MAX_ANCHOR_PER_BLOCK; +// end_idx = end_idx > read_end_idx ? 
read_end_idx : end_idx; + +// size_t load_anchor_idx = 100; +// size_t load_smem_idx; +// size_t cal_idx = start_idx + threadIdx.x; +// int32_t cal_smem = tid; +// __shared__ int64_t smem[NUM_ANCHOR_IN_SMEM]; + +// /* prefetch anchors */ +// load_smem_idx = tid; +// load_anchor_idx = start_idx + tid; +// // if (tid == 20) printf("load_smem_idx %d, load_anchor_idx %lu\n", load_smem_idx, load_anchor_idx); +// for (int i = 0; i < PREFETCH_ANCHORS_RANGE/NUM_THREADS_RANGE && load_anchor_idx < read_end_idx; ++i){ +// // if (tid == 20) printf("load_smem_idx %d, load_anchor_idx %lu\n", load_smem_idx, load_anchor_idx); +// smem[load_smem_idx] = ax[load_anchor_idx]; +// load_smem_idx += NUM_THREADS_RANGE; +// load_anchor_idx += NUM_THREADS_RANGE; +// } + +// int iter = (NUM_ANCHOR_IN_SMEM - PREFETCH_ANCHORS_RANGE)/NUM_THREADS_RANGE; // iterations before another load is needed +// while (cal_idx < end_idx) { // tail threads may skip this loop +// /* load anchors */ +// load_smem_idx = load_smem_idx >= NUM_ANCHOR_IN_SMEM ? load_smem_idx - NUM_ANCHOR_IN_SMEM : load_smem_idx; +// for (int i = 0; i < iter && load_anchor_idx < end_idx + PREFETCH_ANCHORS_RANGE; ++i){ +// // if (tid == 20) printf("load it load_smem_idx %d, load_anchor_idx %lu\n", load_smem_idx, load_anchor_idx); +// smem[load_smem_idx] = ax[load_anchor_idx]; +// load_smem_idx += NUM_THREADS_RANGE; +// load_anchor_idx += NUM_THREADS_RANGE; +// load_smem_idx = load_smem_idx >= NUM_ANCHOR_IN_SMEM ? load_smem_idx - NUM_ANCHOR_IN_SMEM : load_smem_idx; +// } + +// __syncthreads(); + +// /* calculate sucessor range */ +// for (int i = 0; i < iter && cal_idx < end_idx; ++i){ +// int64_t anchor = smem[cal_smem]; + +// size_t st = cal_idx + PREFETCH_ANCHORS_RANGE < read_end_idx ? cal_idx + PREFETCH_ANCHORS_RANGE : read_end_idx-1; +// int32_t st_smem = cal_smem + st - cal_idx; +// st_smem = st_smem >= NUM_ANCHOR_IN_SMEM ? 
st_smem - NUM_ANCHOR_IN_SMEM : st_smem; +// // if (tid == 20) printf("cal idx %lu, cal_mem %d, st %lu, st_smem %d\n", cal_idx, cal_smem, st,st_smem); + +// // if (tid == 20) printf("anchor.x %d, smem[st_smem] %d, anchor.x+MAX_DIST_X%d\n", anchor, smem[st_smem], anchor+MAX_DIST_X); + +// while (st > cal_idx && +// (anchor>> 32 != smem[st_smem] >> 32 || +// smem[st_smem] > anchor + d_max_dist_x +// ) +// ){ +// // if (bid == 25) +// // printf("while 0 bid %d tid %d cal_idx %d\n", bid, tid, cal_idx); +// --st; +// if (st_smem == 0) st_smem = NUM_ANCHOR_IN_SMEM-1; +// else --st_smem; +// } + +// /* NOTE: fallback: succussor is not prefetched */ +// if (st >= PREFETCH_ANCHORS_RANGE + cal_idx){ +// st = cal_idx + MAX_ITER < read_end_idx ? i + MAX_ITER : read_end_idx-1; +// while( +// anchor >> 32 != ax[st] >> 32 || +// ax[st] > anchor + d_max_dist_x // check from global memory +// ){ +// --st; +// // if (bid == 25) +// // printf("while 1 bid %d tid %d\n", bid, tid); +// } + +// } +// range[cal_idx] = st - cal_idx; +// cal_smem += NUM_THREADS_RANGE; +// cal_smem = cal_smem >= NUM_ANCHOR_IN_SMEM ? 
cal_smem - NUM_ANCHOR_IN_SMEM : cal_smem;
+//             cal_idx += NUM_THREADS_RANGE;
+//             // if (bid == 25)
+//             //     printf("for loop i %d bid %d tid %d\n", i, bid, tid);
+//         }
+//         // if (bid == 25)
+//         //     printf("outer while bid %d tid %d\n", bid, tid);
+//         __syncthreads();
+
+//     }
+
+// }
+
+/* kernels end */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* host functions begin */
+range_kernel_config_t range_kernel_config;
+
+/**
+ * Copy the range-kernel parameters into device symbols:
+ * misc.max_dist_x -> d_max_dist_x, misc.max_iter -> d_max_iter and
+ * range_kernel_config.cut_check_anchors -> d_cut_check_anchors.
+ */
+void plrange_upload_misc(Misc misc){
+#ifdef USEHIP
+    hipMemcpyToSymbol(HIP_SYMBOL(d_max_dist_x), &misc.max_dist_x, sizeof(int));
+    hipMemcpyToSymbol(HIP_SYMBOL(d_max_iter), &misc.max_iter, sizeof(int));
+    hipMemcpyToSymbol(HIP_SYMBOL(d_cut_check_anchors),
+                      &range_kernel_config.cut_check_anchors, sizeof(int));
+#else
+    cudaCheck();
+    cudaMemcpyToSymbol(d_max_dist_x, &misc.max_dist_x, sizeof(int));
+    cudaMemcpyToSymbol(d_max_iter, &misc.max_iter, sizeof(int));
+    cudaMemcpyToSymbol(d_cut_check_anchors,
+                       &range_kernel_config.cut_check_anchors, sizeof(int));
+#endif // USEHIP
+    cudaCheck();
+}
+
+/**
+ * Launch range_selection_kernel_binary on `*stream` with dev_mem->griddim blocks of
+ * range_kernel_config.blockdim threads. Returns right after the launch; this function
+ * neither synchronizes nor uploads the device symbols.
+ * NOTE(review): presumably the caller has already run plrange_upload_misc() - confirm.
+ */
+void plrange_async_range_selection(deviceMemPtr* dev_mem, cudaStream_t* stream) {
+    size_t total_n = dev_mem->total_n, cut_num = dev_mem->num_cut;
+    int griddim = dev_mem->griddim;
+    dim3 DimBlock(range_kernel_config.blockdim, 1, 1);
+    dim3 DimGrid(griddim, 1, 1);
+
+    // Run kernel (no dynamic shared memory, enqueued on the caller's stream)
+    range_selection_kernel_binary<<<DimGrid, DimBlock, 0, *stream>>>(
+        dev_mem->d_ax, dev_mem->d_xrev, dev_mem->d_start_idx, dev_mem->d_read_end_idx,
+        dev_mem->d_range, dev_mem->d_cut, dev_mem->d_cut_start_idx, total_n, range_kernel_config);
+    cudaCheck();
+#ifdef DEBUG_PRINT
+    // fprintf(stderr, "[Info] %s (%s:%d): Batch total_n %lu, Range Kernel Launched, grid %d cut %d\n", __func__, __FILE__, __LINE__, total_n, DimGrid.x, cut_num);
+#endif
+}
+
+/**
+ * Blocking variant: uploads `misc` via plrange_upload_misc(), launches
+ * range_selection_kernel_binary on the default stream with the same grid/block
+ * shape as the async variant, then waits with cudaDeviceSynchronize().
+ */
+void plrange_sync_range_selection(deviceMemPtr *dev_mem, Misc misc) {
+    size_t total_n = dev_mem->total_n, cut_num = dev_mem->num_cut;
+    int griddim = dev_mem->griddim;
+    dim3 DimBlock(range_kernel_config.blockdim, 1, 1);
+    dim3 DimGrid(griddim,1,1);
+
+    plrange_upload_misc(misc);
+
+    // Run kernel
+#ifdef DEBUG_PRINT
+    fprintf(stderr, "[Info] %s (%s:%d): Grim Dim: %d Cut: %zu Anchors: %zu\n", __func__, __FILE__, __LINE__, DimGrid.x,
+            cut_num, total_n);
+#endif
+    range_selection_kernel_binary<<<DimGrid, DimBlock>>>(
+        dev_mem->d_ax, dev_mem->d_xrev, dev_mem->d_start_idx, dev_mem->d_read_end_idx,
+        dev_mem->d_range, dev_mem->d_cut, dev_mem->d_cut_start_idx, total_n, range_kernel_config);
+    cudaCheck();
+    cudaDeviceSynchronize();
+    cudaCheck();
+#ifdef DEBUG_PRINT
+    fprintf(stderr, "[Info] %s: range calculation success\n", __func__);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+/* host functions end */
diff --git a/gpu/plrange.cuh b/gpu/plrange.cuh
new file mode 100644
index 00000000..dcfb7426
--- /dev/null
+++ b/gpu/plrange.cuh
@@ -0,0 +1,25 @@
+#ifndef _PLRANGE_CUH_
+#define _PLRANGE_CUH_
+
+#include "plmem.cuh"
+#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef __int32_t int32_t;
+
+/* functions declaration */
+void plrange_upload_misc(Misc misc);
+void plrange_async_range_selection(deviceMemPtr* device_mem_ptr, cudaStream_t* stream);
+void plrange_sync_range_selection(deviceMemPtr* dev_mem, Misc misc);
+
+extern range_kernel_config_t range_kernel_config;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _PLRANGE_CUH_
diff --git a/gpu/plscore.cu b/gpu/plscore.cu
new file mode 100644
index 00000000..58e62364
--- /dev/null
+++ b/gpu/plscore.cu
@@ -0,0 +1,635 @@
+#include 
+#include 
+#include 
+#include 
+#include "plscore.cuh"
+#include "hipify.cuh"
+
+/*
+
+Parallel chaining helper functions with CUDA
+
+*/
+
+__constant__ Misc misc;
+__constant__ int long_seg_cutoff;
+__constant__ int mid_seg_cutoff;
+__device__ unsigned curr_long_segid;
+
+/* arithmetic functions begin */
+
+// __device__ static inline float cuda_mg_log2(float x) // NB: this doesn't work when x<2
+// {
+//     union { float f; uint32_t i; } z = { x };
+//     float log_2 = ((z.i >> 23) & 255) - 128;
+//     z.i &= ~(255 << 23);
+//     z.i += 127 << 23;
+//     log_2 += 
(-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f; +// return log_2; +// } + +__device__ static inline float cuda_mg_log2(int32_t x) // NB: this doesn't work when x<2 +{ + return 31 - __clz(x); +} + +__device__ int32_t original_comput_sc(const int32_t ai_x, const int32_t ai_y, const int32_t aj_x, const int32_t aj_y, + const int8_t sidi, const int8_t sidj, + int32_t max_dist_x, int32_t max_dist_y, + int32_t bw, float chn_pen_gap, + float chn_pen_skip, int is_cdna, int n_seg) { + int32_t dq = ai_y - aj_y, dr, dd, dg, q_span, sc; + if (dq <= 0 || dq > max_dist_x) return INT32_MIN; + dr = ai_x - aj_x; + if (sidi == sidj && (dr == 0 || dq > max_dist_y)) return INT32_MIN; + dd = dr > dq ? dr - dq : dq - dr; + if (sidi == sidj && dd > bw) return INT32_MIN; + if (n_seg > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) + return INT32_MIN; // nseg = 1 by default + dg = dr < dq ? dr : dq; + q_span = MM_QSPAN; + sc = q_span < dg ? q_span : dg; + if (dd || dg > q_span) { + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; + log_pen = + dd >= 1 ? cuda_mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2 + if (is_cdna || sidi != sidj) { + if (sidi != sidj && dr == 0) + ++sc; // possibly due to overlapping paired ends; give a minor + // bonus + else if (dr > dq || sidi != sidj) + sc -= + (int)(lin_pen < log_pen ? 
lin_pen + : log_pen); // deletion or jump + // between paired ends + else + sc -= (int)(lin_pen + .5f * log_pen); + } else + sc -= (int)(lin_pen + .5f * log_pen); + } + return sc; +} + +__device__ int32_t comput_sc(const int32_t ai_x, const int32_t ai_y, const int32_t aj_x, const int32_t aj_y, + const int8_t sidi, const int8_t sidj, + int32_t max_dist_x, int32_t max_dist_y, + int32_t bw, float chn_pen_gap, + float chn_pen_skip, int is_cdna, int n_seg) { + int32_t dq = ai_y - aj_y, dr, dd, dg, sc; + dr = ai_x - aj_x; + dd = __sad(dr, dq, 0); + + if (dq <= 0 || dq > max_dist_x || + (sidi == sidj && (dr == 0 || + dq > max_dist_y || + dd > bw || + (n_seg > 1 && !is_cdna && dr > max_dist_y)))) + return INT32_MIN; + + dg = dr < dq ? dr : dq; // dg = min(dr, dq) + sc = MM_QSPAN < dg ? MM_QSPAN : dg; // sc = min(q_span, dr, dq) + if (dd || dg > MM_QSPAN) { + int32_t log_pen = dd >= 1 ? (31 - __clz(dd+1)) : 0; + int32_t lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; + // Initial conditions for modifying score based on penalties + bool minorBonus = is_cdna && sidi != sidj && dr == 0; + bool majorAdjustment = (is_cdna && dg == dq) || sidi != sidj; + sc += minorBonus; + sc -= (!minorBonus && majorAdjustment) * (int)(lin_pen < log_pen ? 
lin_pen : log_pen); + sc -= (!minorBonus && !majorAdjustment) * (int)(lin_pen + 0.5f * log_pen); + } + return sc; +} + + +/* arithmetic functions end */ + +inline __device__ void compute_sc_seg_one_wf(int32_t* anchors_x, int32_t* anchors_y, int8_t* sid, int32_t* range, + size_t start_idx, size_t end_idx, + int32_t* f, uint16_t* p +){ + Misc blk_misc = misc; + int tid = threadIdx.x; + // int bid = blockIdx.x; + // init f and p + for (size_t i=start_idx+tid; i < end_idx; i += blockDim.x) { + f[i] = MM_QSPAN; + p[i] = 0; + } + // __syncthreads(); + // assert(range[end_idx-1] == 0); + for (size_t i=start_idx; i < end_idx; i++) { + int32_t range_i = range[i]; + // if (range_i + i >= end_idx) + // printf("range_i %d i %lu start_idx %lu, end_idx %lu\n", range_i, i, start_idx, end_idx); + // assert(range_i + i < end_idx); + for (int32_t j = tid; j < range_i; j += blockDim.x) { + int32_t sc = comput_sc( + anchors_x[i+j+1], + anchors_y[i+j+1], + anchors_x[i], + anchors_y[i], + sid [i+j+1], + sid [i], + blk_misc.max_dist_x, blk_misc.max_dist_y, blk_misc.bw, blk_misc.chn_pen_gap, + blk_misc.chn_pen_skip, blk_misc.is_cdna, blk_misc.n_seg); + if (sc == INT32_MIN) continue; + sc += f[i]; + if (sc >= f[i+j+1] && sc != MM_QSPAN) { + f[i+j+1] = sc; + p[i+j+1] = j+1; + + } + } +#ifndef USEHIP + __syncwarps(); // NOTE: single warp, no need to sync +#endif // USEHIP + } + +} + + +inline __device__ void compute_sc_seg_multi_wf(const int32_t* anchors_x, const int32_t* anchors_y, const int8_t* sid, const int32_t* range, + size_t start_idx, size_t end_idx, + int32_t* f, uint16_t* p +){ + Misc blk_misc = misc; + int tid = threadIdx.x; + int bid = blockIdx.x; + // init f and p + for (size_t i=start_idx+tid; i < end_idx; i += blockDim.x) { + f[i] = MM_QSPAN; + p[i] = 0; + } + __syncthreads(); + // assert(range[end_idx-1] == 0); + for (size_t i=start_idx; i < end_idx; i++) { + int32_t range_i = range[i]; + // if (range_i + i >= end_idx) + // printf("range_i %d i %lu start_idx %lu, end_idx 
%lu\n", range_i, i, start_idx, end_idx); + // assert(range_i + i < end_idx); + for (int32_t j = tid; j < range_i; j += blockDim.x) { + int32_t sc = comput_sc( + anchors_x[i+j+1], + anchors_y[i+j+1], + anchors_x[i], + anchors_y[i], + sid [i+j+1], + sid [i], + blk_misc.max_dist_x, blk_misc.max_dist_y, blk_misc.bw, blk_misc.chn_pen_gap, + blk_misc.chn_pen_skip, blk_misc.is_cdna, blk_misc.n_seg); + if (sc == INT32_MIN) continue; + sc += f[i]; + if (sc >= f[i+j+1] && sc != MM_QSPAN) { + f[i+j+1] = sc; + p[i+j+1] = j+1; + + } + } + __syncthreads(); + } + +} + +#define NUM_ANCHORS_PREFETCH 1024 + +// inline __device__ void compute_sc_seg_shared(const int64_t* anchors_x, const int64_t* anchors_y, int32_t* range, +// size_t start_idx, size_t end_idx, +// int32_t* f, uint16_t* p +// ){ +// Misc blk_misc = misc; +// int tid = threadIdx.x; +// int bid = blockIdx.x; +// // init f and p +// for (size_t i=start_idx+tid; i < end_idx; i += blockDim.x) { +// f[i] = anchors_y[i] >> 32 & 0xff; +// p[i] = 0; +// } +// __syncthreads(); +// // assert(range[end_idx-1] == 0); +// __shared__ int64_t anchors_x_shared[NUM_ANCHORS_PREFETCH]; + +// __shared__ int64_t anchors_y_shared[NUM_ANCHORS_PREFETCH]; +// size_t prefetch_end_idx = 0; +// unsigned int prefetch_smem_offset = 0; +// for (size_t i = start_idx; i < end_idx; i++) { +// int32_t range_i = range[i]; +// // if (range_i + i >= end_idx) +// // printf("range_i %d i %lu start_idx %lu, end_idx %lu\n", range_i, i, start_idx, end_idx); +// // assert(range_i + i < end_idx); +// for (int32_t j = tid; j < range_i; j += blockDim.x) { +// int32_t sc = comput_sc( +// anchors_x[i+j+1], +// anchors_y[i+j+1], +// anchors_x[i], +// anchors_y[i], +// blk_misc.max_dist_x, blk_misc.max_dist_y, blk_misc.bw, blk_misc.chn_pen_gap, +// blk_misc.chn_pen_skip, blk_misc.is_cdna, blk_misc.n_seg); +// if (sc == INT32_MIN) continue; +// sc += f[i]; +// if (sc >= f[i+j+1] && sc != (anchors_y[i+j+1]>>32 & 0xff)) { +// f[i+j+1] = sc; +// p[i+j+1] = j+1; + +// } +// 
} +// __syncthreads(); +// } +// } + +// inline __device__ void compute_sc_long_seg_one_wf(const int64_t* anchors_x, const int64_t* anchors_y, int32_t* range, +// size_t start_idx, size_t end_idx, +// int32_t* f, uint16_t* p +// ){ +// Misc blk_misc = misc; +// int tid = threadIdx.x; +// // int bid = blockIdx.x; +// // NOTE: smallest alignd offset that is greater than start_idx +// // anchor_offset = tid; +// // while (anchor_offset <= start_idx) anchor_offset += blockDim.x; +// int anchor_offset = tid + (start_idx - tid + blockDim.x) / blockDim.x * blockDim.x; +// // init f and p +// for (size_t i=anchor_offset; i < end_idx; i += blockDim.x) { +// f[i] = anchors_y[i] >> 32 & 0xff; +// p[i] = 0; +// } +// // int64_t local_anchors[10]; +// int64_t anchor_x = anchors_x[anchor_offset]; +// int64_t anchor_y = anchors_y[anchor_offset]; +// __syncthreads(); +// // assert(range[end_idx-1] == 0); +// for (size_t i=start_idx; i < end_idx; i++) { +// int32_t range_i = range[i]; +// // if (range_i + i >= end_idx) +// // printf("range_i %d i %lu start_idx %lu, end_idx %lu\n", range_i, i, start_idx, end_idx); +// // assert(range_i + i < end_idx); +// // for (int32_t j = tid; j < range_i; j += blockDim.x) { +// for (unsigned j = anchor_offset; j < i+range_i+1; j += blockDim.x) { +// anchor_x = anchors_x[j]; +// anchor_y = anchors_y[j]; +// int32_t sc = comput_sc( +// anchor_x, +// anchor_y, +// anchors_x[i], +// anchors_y[i], +// blk_misc.max_dist_x, blk_misc.max_dist_y, blk_misc.bw, blk_misc.chn_pen_gap, +// blk_misc.chn_pen_skip, blk_misc.is_cdna, blk_misc.n_seg); +// if (sc == INT32_MIN) continue; +// sc += f[i]; +// if (sc >= f[j] && sc != (anchors_y[j]>>32 & 0xff)) { +// f[j] = sc; +// p[j] = j+1; +// } +// } +// anchor_offset += (anchor_offset <= i+1) * blockDim.x; // update anchor offset +// __syncthreads(); +// } + +// } + + + +/* kernels begin */ + + +template +__launch_bounds__(short_block_size) +__global__ void score_generation_short( + /* Input: Anchor & Range Inputs 
*/
+    int32_t* anchors_x, int32_t* anchors_y, int8_t* sid, int32_t *range,
+    /* Input: Segmentations */
+    size_t *seg_start_arr,
+    /* Output: Score and Previous Anchor */
+    int32_t* f, uint16_t* p,
+    /* Sizes*/
+    size_t total_n, size_t seg_count,
+    /* Output: Long segs */
+    int32_t* a_x_long, int32_t* a_y_long, int8_t* sid_long, int32_t* range_long, /* aggregated memory space for long seg */
+    size_t* total_n_long, size_t buffer_size_long
+    , seg_t* long_seg, seg_t* long_seg_og, unsigned int *long_seg_count
+    ,seg_t *mid_seg, unsigned int *mid_seg_count){
+    // Each block walks segids bid, bid + gridDim.x, ... and, per segment, either
+    //  - copies it into the long-segment buffer and records it in long_seg / long_seg_og,
+    //  - queues it in mid_seg, or
+    //  - scores it in place with compute_sc_seg_one_wf(),
+    // depending on how many cut slots the segment spans (long_seg_cutoff / mid_seg_cutoff).
+    int tid = threadIdx.x;
+    int bid = blockIdx.x;
+    // init f and p
+    for(int segid = bid; segid < seg_count; segid += gridDim.x){
+        size_t start_idx = seg_start_arr[segid];
+        if (start_idx == SIZE_MAX) continue; // start at a failed cut: continue to next iteration
+        // the segment ends at the next cut slot that is not SIZE_MAX, or at total_n
+        size_t end_idx = SIZE_MAX;
+        int end_segid = segid + 1;
+        while (true) {
+            if (end_segid >= seg_count) {
+                end_idx = total_n;
+                break;
+            }
+            if (seg_start_arr[end_segid] != SIZE_MAX) {
+                end_idx = seg_start_arr[end_segid];
+                break;
+            }
+            ++end_segid;
+        }
+        if (end_segid > segid + long_seg_cutoff) {
+            size_t long_seg_start_idx;
+            if (tid == 0) {
+                /* Allocate space in long seg buffer */
+                long_seg_start_idx = atomicAdd((unsigned long long int*)total_n_long, (unsigned long long int)end_idx - start_idx);
+                if (long_seg_start_idx + (end_idx - start_idx) >= buffer_size_long){ // long segment buffer is full
+                    // rollback total_n_long. The negated length is added (unsigned wrap-around)
+                    // because CUDA provides atomicSub only for 32-bit int / unsigned int.
+                    atomicAdd((unsigned long long int*)total_n_long, 0ULL - ((unsigned long long int)end_idx - start_idx));
+                    long_seg_start_idx = SIZE_MAX;
+                    // fallback to mid kernel
+                    // mid_seg_count is a 32-bit counter: use the 32-bit atomic on its real type
+                    int mid_seg_idx = atomicAdd(mid_seg_count, 1u);
+                    mid_seg[mid_seg_idx].start_idx = start_idx;
+                    mid_seg[mid_seg_idx].end_idx = end_idx;
+                } else {
+                    // long_seg_count is a 32-bit counter: use the 32-bit atomic on its real type
+                    int long_seg_idx = atomicAdd(long_seg_count, 1u);
+                    // long_seg holds indices into the long buffer, long_seg_og the original indices
+                    long_seg[long_seg_idx].start_idx = long_seg_start_idx;
+                    long_seg[long_seg_idx].end_idx = long_seg_start_idx + (end_idx - start_idx);
+                    long_seg_og[long_seg_idx].start_idx = start_idx;
+                    long_seg_og[long_seg_idx].end_idx = end_idx;
+                    //DEBUG: used for debug plchain_cal_long_seg_range_dis LONG_SEG_RANGE_DIS
+                    #ifdef DEBUG_VERBOSE
+                    long_seg_og[long_seg_idx].start_segid = segid;
+                    long_seg_og[long_seg_idx].end_segid = end_segid;
+                    #endif // DEBUG_VERBOSE
+                }
+            }
+            // broadcast long_seg_start_idx to all scalar registers
+#ifdef USEHIP
+            long_seg_start_idx = __builtin_amdgcn_readfirstlane(long_seg_start_idx);
+#else
+            long_seg_start_idx = __shfl_sync(0xffffffff, long_seg_start_idx, 0);
+#endif
+            if (long_seg_start_idx == SIZE_MAX)
+                continue; // failed to allocate long_seg buffer
+            // copy the segment's anchors, segment ids and ranges into the long buffer
+            for (uint64_t idx = tid; idx < end_idx - start_idx; idx += blockDim.x){
+                a_x_long[long_seg_start_idx + idx] = anchors_x[start_idx + idx];
+                a_y_long[long_seg_start_idx + idx] = anchors_y[start_idx + idx];
+                sid_long[long_seg_start_idx + idx] = sid[start_idx + idx];
+                range_long[long_seg_start_idx + idx] = range[start_idx + idx];
+                // assert(long_seg_start_idx + idx < buffer_size_long);
+                // assert(start_idx + idx < total_n);
+            }
+            continue;
+        } else if (end_segid > segid + mid_seg_cutoff) {
+            if (tid == 0) {
+                int mid_seg_idx = atomicAdd(mid_seg_count, 1);
+                mid_seg[mid_seg_idx].start_idx = start_idx;
+                mid_seg[mid_seg_idx].end_idx = end_idx;
+            }
+            continue;
+        }
+        // assert(end_idx <= total_n);
+        compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, start_idx, end_idx, f, p);
+    }
+}
+
+
+// NOTE(review): the template parameter is assumed to be a single integral non-type
+// parameter (the host code instantiates score_generation_mid<128> etc.) - confirm its type.
+template <int mid_block_size>
+__launch_bounds__(mid_block_size)
+__global__ void score_generation_mid(int32_t* anchors_x, int32_t* anchors_y, int8_t* sid, int32_t *range,
+                        seg_t *long_seg, unsigned int* long_seg_count,
+                        int32_t* f, uint16_t* p){
+    int tid = threadIdx.x;
+    int bid = blockIdx.x;
+
+    for(int segid = bid; segid < *long_seg_count; segid += gridDim.x){
+        seg_t seg = long_seg[segid];
+        // compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p);
+        compute_sc_seg_multi_wf(anchors_x, anchors_y, sid, 
range, seg.start_idx, seg.end_idx, f, p); + } +} + +template +__launch_bounds__(long_block_size) +__global__ void score_generation_long(int32_t* anchors_x, int32_t* anchors_y, int8_t* sid, int32_t *range, + seg_t *long_seg, unsigned int* long_seg_count, + int32_t* f, uint16_t* p){ + int tid = threadIdx.x; + int bid = blockIdx.x; + + for(int segid = bid; segid < *long_seg_count; segid += gridDim.x){ + seg_t seg = long_seg[segid]; + // compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p); + compute_sc_seg_multi_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p); + } +} + +// FIXME: merge together +template +__launch_bounds__(long_block_size) +__global__ void score_generation_long_map(int32_t* anchors_x, int32_t* anchors_y, int8_t* sid, int32_t *range, + seg_t *long_seg, unsigned int* long_seg_count, + int32_t* f, uint16_t* p, unsigned int* map){ + int tid = threadIdx.x; + int bid = blockIdx.x; + unsigned int seg_count = 0; + + // #ifdef DEBUG_CHECK + // auto start = clock64(); + // #endif + + __shared__ unsigned int segid; + if (tid == 0 && bid == 0) { + // init the first batch as the size of the grid + curr_long_segid = gridDim.x; + } + if (tid == 0) { + segid = bid; + } + + __syncthreads(); + while (segid < *long_seg_count) { + seg_t seg = long_seg[map[segid]]; // sorted + // seg_t seg = long_seg[segid]; // unsorted + compute_sc_seg_multi_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p); + seg_count++; + if (tid == 0) segid = atomicAdd(&curr_long_segid, 1); + __syncthreads(); + } + + // for(int segid = bid; segid < *long_seg_count; segid += gridDim.x){ + // // seg_t seg = long_seg[map[segid]]; // sorted + // seg_t seg = long_seg[segid]; // unsorted + // compute_sc_seg_multi_wf(anchors_x, anchors_y, sid, range, seg.start_idx, seg.end_idx, f, p); + // seg_count++; + // } + // #ifdef DEBUG_CHECK + // auto end = clock64(); + // if (threadIdx.x == 0) { + // printf("bid: %d, long kernel time: 
%lu, process %u segs\n", bid, end - start, seg_count);
+    // }
+    // #endif
+}
+
+// Scores every segment in place with compute_sc_seg_one_wf(); unlike
+// score_generation_short it never diverts segments to the mid or long queues.
+__global__ void score_generation_naive(int32_t* anchors_x, int32_t* anchors_y, int8_t* sid, int32_t *range,
+                        size_t *seg_start_arr,
+                        int32_t* f, uint16_t* p, size_t total_n, size_t seg_count) {
+
+    // NOTE: each block deal with one batch
+    // the number of threads in a block is fixed, so we need to calculate iter
+    // n = end_idx_arr - start_idx_arr
+    // iter = (range[i] - 1) / num_threads + 1
+
+    int tid = threadIdx.x;
+    int bid = blockIdx.x;
+    for (int segid = bid; segid < seg_count; segid += gridDim.x){
+        /* calculate the segment for current block */
+        size_t start_idx = seg_start_arr[segid];
+        if (start_idx == SIZE_MAX) continue; // start at a failed cut: continue to next iteration
+        // the segment ends at the next cut slot that is not SIZE_MAX, or at total_n
+        size_t end_idx = SIZE_MAX;
+        int end_segid = segid + 1;
+        while (true) {
+            if (end_segid >= seg_count) {
+                end_idx = total_n;
+                break;
+            }
+            if (seg_start_arr[end_segid] != SIZE_MAX) {
+                end_idx = seg_start_arr[end_segid];
+                break;
+            }
+            ++end_segid;
+        }
+        // assert(end_idx <= total_n);
+        compute_sc_seg_one_wf(anchors_x, anchors_y, sid, range, start_idx, end_idx, f, p);
+    }
+}
+
+/* kernels end */
+
+/* host functions begin */
+score_kernel_config_t score_kernel_config;
+
+/**
+ * Copy the chaining parameters into device constant memory: `input_misc` -> misc,
+ * score_kernel_config.long_seg_cutoff -> long_seg_cutoff and
+ * score_kernel_config.mid_seg_cutoff -> mid_seg_cutoff.
+ * Both cutoffs are uploaded on the CUDA path as well; otherwise they would keep their
+ * zero initial value and score_generation_short would treat every segment as long.
+ */
+void plscore_upload_misc(Misc input_misc) {
+#ifdef USEHIP
+    hipMemcpyToSymbol(HIP_SYMBOL(misc), &input_misc, sizeof(Misc));
+    hipMemcpyToSymbol(HIP_SYMBOL(long_seg_cutoff), &score_kernel_config.long_seg_cutoff, sizeof(int));
+    hipMemcpyToSymbol(HIP_SYMBOL(mid_seg_cutoff), &score_kernel_config.mid_seg_cutoff, sizeof(int));
+#else
+    cudaMemcpyToSymbol(misc, &input_misc, sizeof(Misc));
+    cudaMemcpyToSymbol(long_seg_cutoff, &score_kernel_config.long_seg_cutoff, sizeof(int));
+    cudaMemcpyToSymbol(mid_seg_cutoff, &score_kernel_config.mid_seg_cutoff, sizeof(int));
+#endif
+    cudaCheck();
+}
+
+void plscore_async_short_mid_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream) {
+    size_t total_n = dev_mem->total_n;
+    size_t cut_num = dev_mem->num_cut;
+    size_t buffer_size_long = dev_mem->buffer_size_long;
+    dim3 shortDimGrid(score_kernel_config.short_griddim, 1, 1);
+    dim3 
midDimGrid(score_kernel_config.mid_griddim, 1, 1); + dim3 shortDimBlock(score_kernel_config.short_blockdim, 1, 1); + + // Run kernel; + cudaMemsetAsync(dev_mem->d_mid_seg_count, 0, sizeof(unsigned int), + *stream); + + if (score_kernel_config.short_blockdim == 32 ){ + score_generation_short<32><<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, + dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num, + dev_mem->d_ax_long, dev_mem->d_ay_long, dev_mem->d_sid_long, dev_mem->d_range_long, + dev_mem->d_total_n_long, buffer_size_long, + dev_mem->d_long_seg, dev_mem->d_long_seg_og, dev_mem->d_long_seg_count, + dev_mem->d_mid_seg, dev_mem->d_mid_seg_count); + } else if (score_kernel_config.short_blockdim == 64) { + score_generation_short<64><<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, + dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num, + dev_mem->d_ax_long, dev_mem->d_ay_long, dev_mem->d_sid_long, dev_mem->d_range_long, + dev_mem->d_total_n_long, buffer_size_long, + dev_mem->d_long_seg, dev_mem->d_long_seg_og, dev_mem->d_long_seg_count, + dev_mem->d_mid_seg, dev_mem->d_mid_seg_count); + } else { + fprintf(stderr, + "[ERROR] Unsupported warpsize: %d. mm2-gb only supports device " + "with a warpsize of 32 / 64. 
", + score_kernel_config.short_blockdim); + exit(1); + } + cudaCheck(); + + + if (score_kernel_config.mid_blockdim == 128){ + score_generation_mid<128><<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, dev_mem->d_mid_seg, + dev_mem->d_mid_seg_count, dev_mem->d_f, dev_mem->d_p); + } else if (score_kernel_config.mid_blockdim == 256){ + score_generation_mid<256><<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, dev_mem->d_mid_seg, + dev_mem->d_mid_seg_count, dev_mem->d_f, dev_mem->d_p); + } else if (score_kernel_config.mid_blockdim == 512){ + score_generation_mid<512><<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, dev_mem->d_mid_seg, + dev_mem->d_mid_seg_count, dev_mem->d_f, dev_mem->d_p); + } else if (score_kernel_config.mid_blockdim == 1024){ + score_generation_mid<1024><<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, dev_mem->d_mid_seg, + dev_mem->d_mid_seg_count, dev_mem->d_f, dev_mem->d_p); + } else { + fprintf(stderr, + "[ERROR] Unsupported mid_blockdim: %d. mm2-gb only supports a " + "blockdim of 128/256/512/1024 for mid kernel \n\n" + "Please adjust score_kernel:mid_blockdim in gpu config file. 
", + score_kernel_config.mid_blockdim); + exit(1); + } + cudaCheck(); + +#ifdef DEBUG_PRINT + // fprintf(stderr, "[Info] %s (%s:%d) short mid score kernel launched\n", __func__, __FILE__, __LINE__); +#endif + + cudaCheck(); +} + +void plscore_async_long_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream) { + size_t total_n = dev_mem->total_n; + size_t cut_num = dev_mem->num_cut; + size_t buffer_size_long = dev_mem->buffer_size_long; + dim3 longDimGrid(score_kernel_config.long_griddim, 1, 1); + +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[Debug] %s (%s:%d) Long Grid Dim = %d\n", __func__, __FILE__, __LINE__, longDimGrid.x); +#endif // DEBUG_VERBOSE + + + if (score_kernel_config.long_blockdim == 1024){ + score_generation_long_map<1024><<>>( + dev_mem->d_ax_long, dev_mem->d_ay_long, dev_mem->d_sid_long, dev_mem->d_range_long, dev_mem->d_long_seg, + dev_mem->d_long_seg_count, dev_mem->d_f_long, dev_mem->d_p_long, dev_mem->d_map); + } else { + fprintf(stderr, + "[ERROR] Unsupported MaxThreadsPerBlock: %d. 
mm2-gb only supports a blockdim of 1024 for long kernel ", + score_kernel_config.long_blockdim); + exit(1); + } + + cudaCheck(); + +#ifdef DEBUG_PRINT + // fprintf(stderr, "[Info] %s (%s:%d) long score generation launched\n", __func__, __FILE__, __LINE__); +#endif + + cudaCheck(); +} + +void plscore_async_naive_forward_dp(deviceMemPtr* dev_mem, + cudaStream_t* stream) { + size_t total_n = dev_mem->total_n; + size_t cut_num = dev_mem->num_cut; + dim3 DimBlock(score_kernel_config.long_blockdim, 1, 1); + dim3 longDimGrid(score_kernel_config.long_griddim, 1, 1); + dim3 shortDimGrid(score_kernel_config.short_griddim, 1, 1); + + // Run kernel + // printf("Grid Dim, %d\n", DimGrid.x); + score_generation_naive<<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_sid, dev_mem->d_range, dev_mem->d_cut, + dev_mem->d_f, dev_mem->d_p, total_n, cut_num); + cudaCheck(); +#ifdef DEBUG_VERBOSE + fprintf(stderr, "[M::%s] score generation kernel launch success\n", __func__); +#endif + + cudaCheck(); +} + diff --git a/gpu/plscore.cuh b/gpu/plscore.cuh new file mode 100644 index 00000000..56d57df0 --- /dev/null +++ b/gpu/plscore.cuh @@ -0,0 +1,24 @@ +#ifndef _PLSCORE_CUH_ +#define _PLSCORE_CUH_ + +#include "plmem.cuh" +#include "mmpriv.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#define MM_QSPAN 15 + +void plscore_upload_misc(Misc misc); +void plscore_async_naive_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream); +void plscore_async_short_mid_forward_dp(deviceMemPtr* dev_mem,cudaStream_t* stream); +void plscore_async_long_forward_dp(deviceMemPtr* dev_mem,cudaStream_t* stream); + +extern score_kernel_config_t score_kernel_config; + +#ifdef __cplusplus +} +#endif + +#endif // _PLSCORE_CUH_ \ No newline at end of file diff --git a/gpu/plutils.h b/gpu/plutils.h new file mode 100644 index 00000000..a5cc6a8d --- /dev/null +++ b/gpu/plutils.h @@ -0,0 +1,144 @@ +#ifndef _PLUTILS_H_ +#define _PLUTILS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include 
"kalloc.h" +#include "minimap.h" + +/* Chaining Options */ + +/* structure for metadata and hits */ +// Sequence meta data +typedef struct { + long i; // read id + int seg_id; // seg id + char name[200]; // name of the sequence + uint32_t len; // name of the sequence + + // mi data + int n_alt; + int is_alt; // reference sequences only + + // sequence info + int qlen_sum; +} mm_seq_meta_t; + +typedef struct { + int max_iter, max_dist_x, max_dist_y, max_skip, bw, min_cnt, min_score, + is_cdna, n_seg; + float chn_pen_gap, chn_pen_skip; +} Misc; + +typedef struct { + mm_seq_meta_t *refs; + int n_refs; + Misc misc; +} input_meta_t; + +typedef struct { + mm_seq_meta_t seq; + + // minimap2 input data for reads + const char **qseqs; // sequences for each segment <- allocated in worker_for, freed in free_read after seeding + int *qlens; // query length for each segment <- allocated in worker_for, freed in free_read after seeding + int n_seg; // number of segs + +//DEBUG: for SCORE CHECK after chaining +#if defined(DEBUG_CHECK) && 0 + int32_t *f; + int64_t *p; +#endif // DEBUG_CHECK + int rep_len; + int frag_gap; + + // seeding outputs + uint64_t *mini_pos; // minimizer positions <- allocated in + int n_mini_pos; + + // seeding output, updated in chaining + mm128_t *a; // array of anchors + int64_t n; // number of anchors = n_a + + // chaining outputs + uint64_t *u; // scores for chains + int n_u; // number of chains formed from anchors == n_reg0 + +} chain_read_t; + +typedef struct seg_t { + size_t start_idx; + size_t end_idx; +//DEBUG: used for debug plchain_cal_long_seg_range_dis LONG_SEG_RANGE_DIS +#ifdef DEBUG_VERBOSE + size_t start_segid; + size_t end_segid; +#endif // DEBUG_VERBOSE +} seg_t; + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus +/* GPU chaining methods */ +// // backward, original chaining methods +// void chain_backword_cpu(const input_meta_t *meta, chain_read_t *read_arr, +// int n_read); +// // forward chaining methods +// void 
chain_forward_cpu(const input_meta_t *meta, chain_read_t *read_arr, +// int n_read); + +// gpu chaining methods +// initialization and cleanup +void init_stream_gpu(size_t *max_total_n, int *max_reads, + int *min_n, char gpu_config_file[], Misc misc); // for stream_gpu +void finish_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t **batches, + int *num_reads, int num_batch, void *km); // for stream_gpu +void free_stream_gpu(int n_threads); // for stream_gpu free pinned memory +// chaining method +void chain_stream_gpu(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t **in_arr_ptr, int *n_read_ptr, int thread_id, void* km); + +/* Chaining backtracking methods */ +uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, + const int64_t *p, int32_t *v, int32_t *t, + int32_t min_cnt, int32_t min_sc, int32_t max_drop, + int32_t *n_u_, int32_t *n_v_); +mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a); + + +/* Post Chaining helpers */ +Misc build_misc(const mm_idx_t *mi, const mm_mapopt_t *opt, const int64_t qlen_sum, const int n_seg); +void post_chaining_helper(const mm_idx_t *mi, const mm_mapopt_t *opt, + chain_read_t *read, Misc misc, void *km); + +#ifdef __cplusplus +} +#endif // __cplusplus + +///////////////////////////////////////////////////// +/////////// Free Input Struct ///////////// +///////////////////////////////////////////////////// +// free input_iter pointers except a, because it is freed seperately. 
+static inline void free_read(chain_read_t *in, void* km) { + if (in->qseqs) kfree(km, in->qseqs); + if (in->qlens) kfree(km, in->qlens); + +//DEBUG: for SCORE CHECK after chaining +#if defined(DEBUG_CHECK) && 0 + if (in->f) kfree(km, in->f); + if (in->p) kfree(km, in->p); + in->f = 0, in->p = 0; +#endif + in->qseqs = 0, in->qlens = 0; + in->a = 0, in->u = 0; +} + +static inline void free_meta_struct(input_meta_t *meta, void *km) { + if (meta->refs) kfree(km, meta->refs); +} +#endif // _PLUTILS_H_ diff --git a/gpu_config.json b/gpu_config.json new file mode 100644 index 00000000..70cce79f --- /dev/null +++ b/gpu_config.json @@ -0,0 +1,25 @@ +{ + "num_streams": 1, + "min_n": 512, + "//min_n": "queries with less anchors will be handled on cpu", + "long_seg_buffer_size": 7888000, + "max_total_n": 4934400, + "max_read": 49344, + "avg_read_n": 20000, + "//avg_read_n": "expect average number of anchors per read", + "range_kernel": { + "blockdim": 512, + "cut_check_anchors": 10, + "//cut_check_anchors": "Number of anchors to check to attemp a cut", + "anchor_per_block": 32768, + "//anchor_per_block": "Number of anchors each block handle. Must be int * blockdim" + }, + "score_kernel": { + "short_blockdim": 64, + "long_blockdim": 64, + "mid_blockdim": 64, + "short_griddim": 16128, + "long_griddim": 2016, + "mid_griddim": 16128 + } +} \ No newline at end of file diff --git a/kalloc.c b/kalloc.c index 84995529..f379ff30 100644 --- a/kalloc.c +++ b/kalloc.c @@ -202,4 +202,6 @@ void km_stat(const void *_km, km_stat_t *s) s->capacity += size; s->largest = s->largest > size? 
s->largest : size; } + + s->meta_size = s->n_cores * sizeof(header_t); } diff --git a/kalloc.h b/kalloc.h index 93bff5e2..7ea6bae9 100644 --- a/kalloc.h +++ b/kalloc.h @@ -9,6 +9,7 @@ extern "C" { typedef struct { size_t capacity, available, n_blocks, n_cores, largest; + size_t meta_size; } km_stat_t; void *kmalloc(void *km, size_t size); diff --git a/kthread.c b/kthread.c index ffdf9408..8cf18d78 100644 --- a/kthread.c +++ b/kthread.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "kthread.h" #if (defined(WIN32) || defined(_WIN32)) && defined(_MSC_VER) @@ -48,6 +49,10 @@ static void *ktf_worker(void *data) } while ((i = steal_work(w->t)) >= 0) w->t->func(w->t->data, i, w - w->t->w); +#if defined(__AMD_SPLIT_KERNELS__) + // call func one last time for this thread to signal end of all reads + w->t->func(w->t->data, -1, w - w->t->w); +#endif pthread_exit(0); } @@ -68,6 +73,11 @@ void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) } else { long j; for (j = 0; j < n; ++j) func(data, j, 0); +#if defined(__AMD_SPLIT_KERNELS__) + // call once at the end to signal end of all reads + func(data, -1, 0); +#endif + } } diff --git a/lchain.c b/lchain.c index d0041578..f789d4d6 100644 --- a/lchain.c +++ b/lchain.c @@ -75,7 +75,7 @@ uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_ return u; } -static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a) +mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a) { mm128_t *b, *w; uint64_t *u2; @@ -167,8 +167,8 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int // fill the score and backtrack arrays for (i = 0, max_ii = -1; i < n; ++i) { - int64_t max_j = -1, end_j; - int32_t max_f = a[i].y>>32&0xff, n_skip = 0; + int64_t max_j = -1, end_j; + int32_t max_f = a[i].y>>32&0xff, n_skip = 0; while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + 
max_dist_x)) ++st; if (i - st > max_iter) st = i - max_iter; for (j = i - 1; j >= st; --j) { diff --git a/main.c b/main.c index 0be99335..78d3ccee 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,12 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.24-r1122" +#if defined(__AMD_SPLIT_KERNELS__) + +#include "plutils.h" +#endif // (__AMD_SPLIT_KERNELS__) + +#define MM_VERSION "2.24-mm2-gb-biosys" #ifdef __linux__ #include @@ -86,6 +91,8 @@ static ko_longopt_t long_options[] = { { "mask-level", ko_required_argument, 'M' }, { "min-dp-score", ko_required_argument, 's' }, { "sam", ko_no_argument, 'a' }, + { "gpu-chain", ko_no_argument, 360 }, // use gpu for chaining + { "gpu-cfg", ko_required_argument, 361 }, { 0, 0, 0 } }; @@ -300,9 +307,17 @@ int main(int argc, char *argv[]) } else if (c == 'E') { opt.e = opt.e2 = strtol(o.arg, &s, 10); if (*s == ',') opt.e2 = strtol(s + 1, &s, 10); - } - } - if ((opt.flag & MM_F_SPLICE) && (opt.flag & MM_F_FRAG_MODE)) { + } else if (c == 360) { + opt.flag |= MM_F_GPU_CHAIN; // use gpu for chaining + } else if (c == 361) { + strcpy(opt.gpu_config_file, o.arg); + } + } + if (opt.flag & MM_F_SR) { + opt.max_chain_skip = INT32_MAX; + } + + if ((opt.flag & MM_F_SPLICE) && (opt.flag & MM_F_FRAG_MODE)) { fprintf(stderr, "[ERROR]\033[1;31m --splice and --frag should not be specified at the same time.\033[0m\n"); return 1; } @@ -356,7 +371,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n"); fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads); fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n"); -// fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose); + fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose); fprintf(fp_help, " --version show version number\n"); fprintf(fp_help, " Preset:\n"); fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n"); @@ -422,6 +437,16 @@ int main(int argc, 
char *argv[]) mm_idx_destroy(mi); continue; // no query files } +#if defined(__AMD_SPLIT_KERNELS__) + // initialize gpu + if (opt.flag & MM_F_GPU_CHAIN) { + // TODO: make misc different for each read + Misc misc = build_misc(mi, &opt, 0, 1); + init_stream_gpu(&opt.gpu_chain_max_anchors, + &opt.gpu_chain_max_reads, &opt.gpu_chain_min_n, + opt.gpu_config_file, misc); + } +#endif // (__AMD_SPLIT_KERNELS__) ret = 0; if (!(opt.flag & MM_F_FRAG_MODE)) { for (i = o.ind + 1; i < argc; ++i) { @@ -436,7 +461,10 @@ int main(int argc, char *argv[]) fprintf(stderr, "ERROR: failed to map the query file\n"); exit(EXIT_FAILURE); } - } +#if defined(__AMD_SPLIT_KERNELS__) + free_stream_gpu(n_threads); +#endif // (__AMD_SPLIT_KERNELS__) + } n_parts = idx_rdr->n_parts; mm_idx_reader_close(idx_rdr); @@ -453,7 +481,21 @@ int main(int argc, char *argv[]) fprintf(stderr, "[M::%s] CMD:", __func__); for (i = 0; i < argc; ++i) fprintf(stderr, " %s", argv[i]); - fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec; Peak RSS: %.3f GB\n", __func__, realtime() - mm_realtime0, cputime(), peakrss() / 1024.0 / 1024.0 / 1024.0); + // TODO: disabled because timer is not updated. 
+ // fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec; Peak RSS: %.3f GB\n", __func__, realtime() - mm_realtime0, cputime(), peakrss() / 1024.0 / 1024.0 / 1024.0); + // fprintf(stderr, "----------------------------------------------------\n"); + // fprintf(stderr, " Sum (sec) Avg (sec) \n"); + // fprintf(stderr, "----------------------------------------------------\n"); + // fprintf(stderr, "Seed = %11.3f %11.3f\n", mm_time_seed_sum, + // mm_time_seed_sum / (double)n_threads); + // fprintf(stderr, "Chain = %11.3f %11.3f\n", mm_time_chain_sum, + // mm_time_chain_sum / (double)n_threads); + // fprintf(stderr, "Align = %11.3f %11.3f\n", mm_time_align_sum, + // mm_time_align_sum / (double)n_threads); + // fprintf(stderr, + // "----------------------------------------------------\n"); + // // fprintf(stderr, "Avg (seed + chain + align) per thread = %.3f secs\n", (mm_time_seed_sum + mm_time_chain_sum + mm_time_align_sum)/(double)n_threads); + // fprintf(stderr, "Total (seed + chain + align) for %d thread(s) = %.3f secs\n", n_threads, (mm_time_seed_sum + mm_time_chain_sum + mm_time_align_sum)); } return 0; } diff --git a/map.c b/map.c index 53114680..0cb285c1 100644 --- a/map.c +++ b/map.c @@ -12,8 +12,38 @@ struct mm_tbuf_s { void *km; - int rep_len, frag_gap; -}; + int rep_len, frag_gap; // updated per read. 
#if defined(__AMD_SPLIT_KERNELS__)
/*
 * Initialise an empty batch: a zero-filled array with room for
 * `batch_max_reads` read records and a private memory pool.
 * Terminates the program if the array cannot be allocated, because every
 * later stage indexes batch_->reads without checking it.
 */
void mm_trbuf_batch_init(mm_batch_trbuf_t *batch_, int batch_max_reads) {
    size_t n_reads = batch_max_reads > 0 ? (size_t)batch_max_reads : 0;

    batch_->count = 0;
    batch_->total_n = 0;
    batch_->batchid = -1;
    /* calloc zero-fills and checks the count * size multiplication;
     * ask for at least one element so a NULL return always means failure */
    batch_->reads = (chain_read_t *)calloc(n_reads ? n_reads : 1, sizeof(chain_read_t));
    if (batch_->reads == 0) {
        fprintf(stderr, "[E::%s] failed to allocate a batch of %d reads\n", __func__, batch_max_reads);
        exit(EXIT_FAILURE);
    }
    batch_->km = km_init();
}

/*
 * Empty a batch so it can accumulate new reads: free the per-read input
 * arrays, verify that the pool holds no live blocks, and recreate the pool
 * when it has grown beyond the configured cap.
 *
 * batch_max_reads is unused; it is kept for interface compatibility.
 */
void mm_trbuf_batch_reset(mm_batch_trbuf_t *batch_, int batch_max_reads, const mm_mapopt_t *opt) {
    int i;
    (void)batch_max_reads;

    // free all the reads in the batch
    for (i = 0; i < batch_->count; i++)
        free_read(&batch_->reads[i], batch_->km);

    /* reset memory pool km */
    if (batch_->km) {
        km_stat_t kmst;
        /* last read actually stored in the batch; reads[count] would be one
         * past it and out of bounds when the batch is full */
        const chain_read_t *last_read = batch_->count > 0 ? &batch_->reads[batch_->count - 1] : 0;
        const char *last_name = last_read ? last_read->seq.name : "*";
        int last_qlen = last_read ? last_read->seq.qlen_sum : 0;

        km_stat(batch_->km, &kmst);
        if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
            fprintf(stderr, "QM\t%s\t%d\tBid=%d\tcap=%ld,avail=%ld,nCore=%ld,largest=%ld\n",
                    last_name, last_qlen, batch_->batchid,
                    (long)kmst.capacity, (long)kmst.available, (long)kmst.n_cores, (long)kmst.largest);
        assert(kmst.n_blocks == kmst.n_cores); // otherwise, there is a memory leak
        assert(kmst.capacity == kmst.meta_size + kmst.available);
        if (kmst.largest > 1U<<28 || (opt->cap_kalloc > 0 && kmst.capacity > opt->cap_kalloc)) {
            if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
                fprintf(stderr, "[W::%s] reset thread-local memory after read %s\n", __func__, last_name);
            km_destroy(batch_->km);
            batch_->km = km_init();
        }
    }

    batch_->count = 0;
    batch_->total_n = 0;
    batch_->batchid = -1;
}

/*
 * Tear a batch down completely: free the per-read input arrays, verify the
 * pool is leak free, destroy the pool and release the read array.
 */
void mm_trbuf_batch_destroy(mm_batch_trbuf_t *batch_){
    int i;

    // free reads in the batch
    for (i = 0; i < batch_->count; i++)
        free_read(&batch_->reads[i], batch_->km);
    batch_->batchid = -1;
    batch_->total_n = 0;
    batch_->count = 0;

    /* clean memory pool */
    if (batch_->km) {
        km_stat_t kmst;
        km_stat(batch_->km, &kmst);
        if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
            fprintf(stderr, "Destroy memory pool cap=%ld,avail=%ld,nCore=%ld,largest=%ld\n",
                    (long)kmst.capacity, (long)kmst.available, (long)kmst.n_cores, (long)kmst.largest);
        assert(kmst.n_blocks == kmst.n_cores); // otherwise, there is a memory leak
        assert(kmst.capacity == kmst.meta_size + kmst.available);
        km_destroy(batch_->km);
        batch_->km = 0;
    }
    free(batch_->reads);
    batch_->reads = 0;
}

/*
 * Allocate the per-thread triple of batches (accumulating, pending and
 * launched), numbered 0, 1 and 2. Terminates the program on allocation
 * failure. `opt` is unused; it is kept for interface compatibility.
 */
mm_trbuf_t *mm_trbuf_init(const int batch_max_reads, const mm_mapopt_t *opt)
{
    mm_trbuf_t *tr;
    (void)opt;

    tr = (mm_trbuf_t *)calloc(1, sizeof(mm_trbuf_t));
    if (tr == 0) {
        fprintf(stderr, "[E::%s] failed to allocate the per-thread read buffer\n", __func__);
        exit(EXIT_FAILURE);
    }
    tr->is_full = 0;
    tr->is_pending = 0;
    tr->has_launched = 0;
    mm_trbuf_batch_init(&tr->acc_batch, batch_max_reads);
    tr->acc_batch.batchid = 0;
    mm_trbuf_batch_init(&tr->pending_batch, batch_max_reads);
    tr->pending_batch.batchid = 1;
    mm_trbuf_batch_init(&tr->launched_batch, batch_max_reads);
    tr->launched_batch.batchid = 2;
    return tr;
}

/* Destroy all three batches of a per-thread buffer; a null pointer is ignored. */
void mm_trbuf_destroy(mm_trbuf_t *tr)
{
    if (tr == 0) return;
    mm_trbuf_batch_destroy(&tr->acc_batch);
    mm_trbuf_batch_destroy(&tr->pending_batch);
    mm_trbuf_batch_destroy(&tr->launched_batch);
    free(tr);
}
#endif
defined(__AMD_SPLIT_KERNELS__) +void mm_map_seed(const mm_idx_t *mi, const mm_mapopt_t *opt, + chain_read_t *read_, mm_tbuf_t *b, void *km) { + int n_segs = read_->n_seg; + const int *qlens = read_->qlens; + const char **seqs = read_->qseqs; + const char *qname = read_->seq.name; + int *rep_len = &read_->rep_len; + int *qlen_sum = &read_->seq.qlen_sum; + int *n_mini_pos = &read_->n_mini_pos; + uint64_t **mini_pos = &read_->mini_pos; + int64_t *n_a = &read_->n; + mm128_t **a = &read_->a; + + int i; + mm128_v mv = {0,0,0}; + double *timers = b->timers; + double t1 = realtime(); + + for (i = 0, *qlen_sum = 0; i < n_segs; ++i) *qlen_sum += qlens[i]; + + if (*qlen_sum == 0 || n_segs <= 0 || n_segs > MM_MAX_SEG) return; + if (opt->max_qlen > 0 && *qlen_sum > opt->max_qlen) return; + + collect_minimizers(km, opt, mi, n_segs, qlens, seqs, &mv); + if (opt->q_occ_frac > 0.0f) mm_seed_mz_flt(km, &mv, opt->mid_occ, opt->q_occ_frac); + if (opt->flag & MM_F_HEAP_SORT) *a = collect_seed_hits_heap(km, opt, opt->mid_occ, mi, qname, &mv, *qlen_sum, n_a, rep_len, n_mini_pos, mini_pos); + else *a = collect_seed_hits(km, opt, opt->mid_occ, mi, qname, &mv, *qlen_sum, n_a, rep_len, n_mini_pos, mini_pos); + + if (mm_dbg_flag & MM_DBG_PRINT_SEED) { + fprintf(stderr, "RS\t%d\n", *rep_len); + for (i = 0; i < *n_a; ++i) + fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[(*a)[i].x<<1>>33].name, (int32_t)(*a)[i].x, "+-"[(*a)[i].x>>63], (int32_t)(*a)[i].y, (int32_t)((*a)[i].y>>32&0xff), + i == 0? 
0 : ((int32_t)(*a)[i].y - (int32_t)(*a)[i-1].y) - ((int32_t)(*a)[i].x - (int32_t)(*a)[i-1].x)); + } + kfree(km, mv.a); + timers[MM_TIME_SEED] += realtime() - t1; +} + +Misc build_misc(const mm_idx_t *mi, const mm_mapopt_t *opt, const int64_t qlen_sum, const int n_seg) { + int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR); + float chn_pen_gap, chn_pen_skip; + + // set max chaining gap on the query and the reference sequence + if (is_sr) + max_chain_gap_qry = qlen_sum > opt->max_gap? qlen_sum : opt->max_gap; + else max_chain_gap_qry = opt->max_gap; + if (opt->max_gap_ref > 0) { + max_chain_gap_ref = opt->max_gap_ref; // always honor mm_mapopt_t::max_gap_ref if set + } else if (opt->max_frag_len > 0) { + max_chain_gap_ref = opt->max_frag_len - qlen_sum; + if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; + } else max_chain_gap_ref = opt->max_gap; + + chn_pen_gap = opt->chain_gap_scale * 0.01 * mi->k; + chn_pen_skip = opt->chain_skip_scale * 0.01 * mi->k; + + Misc misc; + misc.max_iter = opt->max_chain_iter; // always set up MAX_UINT + misc.max_dist_y = max_chain_gap_qry; + misc.max_dist_x = max_chain_gap_ref; + misc.max_skip = opt->max_chain_skip; + misc.bw = opt->bw; + misc.min_cnt = opt->min_cnt; + misc.min_score = opt->min_chain_score; + misc.is_cdna = is_splice; + misc.n_seg = n_seg; + + misc.chn_pen_gap = chn_pen_gap; + misc.chn_pen_skip = chn_pen_skip; + + return misc; +} + +void post_chaining_helper(const mm_idx_t *mi, const mm_mapopt_t *opt, chain_read_t* read, Misc misc, void *km) { + int n_segs = read->n_seg; + const char *qname = read->seq.name; + int *rep_len = &read->rep_len; + int *frag_gap = &read->frag_gap; + int *qlen_sum = &read->seq.qlen_sum; + int *n_regs0 = &read->n_u; + int *n_mini_pos = &read->n_mini_pos; + uint64_t **mini_pos = &read->mini_pos; + int64_t *n_a = &read->n; + uint64_t **u = &read->u; + mm128_t **a = &read->a; + + int i; + mm128_v mv = {0, 0, 0}; 
+ + if (opt->bw_long > opt->bw && + (opt->flag & (MM_F_SPLICE | MM_F_SR | MM_F_NO_LJOIN)) == 0 && + n_segs == 1 && *n_regs0 > 1) { // re-chain/long-join for long sequences + int32_t st = (int32_t)(*a)[0].y, en = (int32_t)(*a)[(int32_t)(*u)[0] - 1].y; + if (*qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > *qlen_sum * opt->rmq_rescue_ratio) { + int32_t i; + for (i = 0, *n_a = 0; i < *n_regs0; ++i) *n_a += (int32_t)(*u)[i]; + kfree(km, *u); + radix_sort_128x(*a, (*a) + *n_a); + *a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + misc.chn_pen_gap, misc.chn_pen_skip, *n_a, *a, n_regs0, u, km); + } + } + else if (opt->max_occ > opt->mid_occ && *rep_len > 0 && + !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads + int rechain = 0; + if (*n_regs0 > 0) { // test if the best chain has all the segments + int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0; + for (i = 0; i < *n_regs0; ++i) { // find the best chain + if (max < (int)((*u)[i]>>32)) max = (*u)[i]>>32, max_i = i, max_off = off; + off += (uint32_t)(*u)[i]; + } + for (i = 1; i < (int32_t)(*u)[max_i]; ++i) // count the number of segments in the best chain + if (((*a)[max_off+i].y&MM_SEED_SEG_MASK) != ((*a)[max_off+i-1].y&MM_SEED_SEG_MASK)) + ++n_chained_segs; + if (n_chained_segs < n_segs) + rechain = 1; + } else rechain = 1; + if (rechain) { // redo chaining with a higher max_occ threshold + kfree(km, *a); + kfree(km, *u); + kfree(km, *mini_pos); + if (opt->flag & MM_F_HEAP_SORT) *a = collect_seed_hits_heap(km, opt, opt->max_occ, mi, qname, &mv, *qlen_sum, n_a, rep_len, n_mini_pos, mini_pos); + else *a = collect_seed_hits(km, opt, opt->max_occ, mi, qname, &mv, *qlen_sum, n_a, rep_len, n_mini_pos, mini_pos); + *a = mg_lchain_dp(misc.max_dist_x, misc.max_dist_y, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + misc.chn_pen_gap, misc.chn_pen_skip, 
misc.is_cdna, n_segs, *n_a, *a, n_regs0, u, km); + kfree(km, mv.a); + } + } + *frag_gap = misc.max_dist_x; +} + +void mm_map_chain(const mm_idx_t *mi, const mm_mapopt_t *opt, + chain_read_t *read_, mm_tbuf_t *b, void *km) { + int n_segs = read_->n_seg; + const char *qname = read_->seq.name; + int *rep_len = &read_->rep_len; + int *frag_gap = &read_->frag_gap; + int *qlen_sum = &read_->seq.qlen_sum; + int *n_regs0 = &read_->n_u; + int *n_mini_pos = &read_->n_mini_pos; + uint64_t **mini_pos = &read_->mini_pos; + int64_t *n_a = &read_->n; + uint64_t **u = &read_->u; + mm128_t **a = &read_->a; + + int i; + int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR); + mm128_v mv = {0,0,0}; + float chn_pen_gap, chn_pen_skip; + double *timers = b->timers; + + // set max chaining gap on the query and the reference sequence + if (is_sr) + max_chain_gap_qry = *qlen_sum > opt->max_gap? *qlen_sum : opt->max_gap; + else max_chain_gap_qry = opt->max_gap; + if (opt->max_gap_ref > 0) { + max_chain_gap_ref = opt->max_gap_ref; // always honor mm_mapopt_t::max_gap_ref if set + } else if (opt->max_frag_len > 0) { + max_chain_gap_ref = opt->max_frag_len - *qlen_sum; + if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; + } else max_chain_gap_ref = opt->max_gap; + + chn_pen_gap = opt->chain_gap_scale * 0.01 * mi->k; + chn_pen_skip = opt->chain_skip_scale * 0.01 * mi->k; + if (opt->flag & MM_F_RMQ) { + *a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + chn_pen_gap, chn_pen_skip, *n_a, *a, n_regs0, u, km); + } else { + *a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + chn_pen_gap, chn_pen_skip, is_splice, n_segs, *n_a, *a, n_regs0, u, km); + } + + if (opt->bw_long > opt->bw && (opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN)) == 0 
&& n_segs == 1 && *n_regs0 > 1) { // re-chain/long-join for long sequences + int32_t st = (int32_t)(*a)[0].y, en = (int32_t)(*a)[(int32_t)(*u)[0] - 1].y; + if (*qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > *qlen_sum * opt->rmq_rescue_ratio) { + int32_t i; + for (i = 0, *n_a = 0; i < *n_regs0; ++i) *n_a += (int32_t)(*u)[i]; + kfree(km, *u); + radix_sort_128x(*a, (*a) + *n_a); + *a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + chn_pen_gap, chn_pen_skip, *n_a, *a, n_regs0, u, km); + } + } else if (opt->max_occ > opt->mid_occ && *rep_len > 0 && !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads + int rechain = 0; + if (*n_regs0 > 0) { // test if the best chain has all the segments + int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0; + for (i = 0; i < *n_regs0; ++i) { // find the best chain + if (max < (int)((*u)[i]>>32)) max = (*u)[i]>>32, max_i = i, max_off = off; + off += (uint32_t)(*u)[i]; + } + for (i = 1; i < (int32_t)(*u)[max_i]; ++i) // count the number of segments in the best chain + if (((*a)[max_off+i].y&MM_SEED_SEG_MASK) != ((*a)[max_off+i-1].y&MM_SEED_SEG_MASK)) + ++n_chained_segs; + if (n_chained_segs < n_segs) + rechain = 1; + } else rechain = 1; + if (rechain) { // redo chaining with a higher max_occ threshold + kfree(km, *a); + kfree(km, *u); + kfree(km, *mini_pos); + if (opt->flag & MM_F_HEAP_SORT) *a = collect_seed_hits_heap(km, opt, opt->max_occ, mi, qname, &mv, *qlen_sum, n_a, rep_len, n_mini_pos, mini_pos); + else *a = collect_seed_hits(km, opt, opt->max_occ, mi, qname, &mv, *qlen_sum, n_a, rep_len, n_mini_pos, mini_pos); + *a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + chn_pen_gap, chn_pen_skip, is_splice, n_segs, *n_a, *a, n_regs0, u, km); + kfree(km, mv.a); + } + } + *frag_gap = max_chain_gap_ref; +} + +void 
mm_map_align(const mm_idx_t *mi, const mm_mapopt_t *opt, + chain_read_t *read_, mm_reg1_t **regs, int *n_regs, mm_tbuf_t *b, void *km) { + int n_segs = read_->n_seg; + const int *qlens = read_->qlens; + const char **seqs = read_->qseqs; + const char *qname = read_->seq.name; + int rep_len = read_->rep_len; + int frag_gap = read_->frag_gap; + int qlen_sum = read_->seq.qlen_sum; + int *n_regs0 = &read_->n_u; + int n_mini_pos = read_->n_mini_pos; + uint64_t **mini_pos = &read_->mini_pos; + uint64_t *u = read_->u; + mm128_t *a = read_->a; + + int i, j; + int max_chain_gap_ref = frag_gap; + int is_sr = !!(opt->flag & MM_F_SR); + uint32_t hash; + mm_reg1_t *regs0; + double *timers = b->timers; + double t1 = realtime(); + + for (i = 0; i < n_segs; ++i) n_regs[i] = 0, regs[i] = 0; // initialize regs + + hash = qname && !(opt->flag & MM_F_NO_HASH_NAME)? __ac_X31_hash_string(qname) : 0; + hash ^= __ac_Wang_hash(qlen_sum) + __ac_Wang_hash(opt->seed); + hash = __ac_Wang_hash(hash); + + regs0 = mm_gen_regs(km, hash, qlen_sum, *n_regs0, u, a, !!(opt->flag&MM_F_QSTRAND)); + if (mi->n_alt) { + mm_mark_alt(mi, *n_regs0, regs0); + mm_hit_sort(km, n_regs0, regs0, opt->alt_drop); // this step can be merged into mm_gen_regs(); will do if this shows up in profile + } + + if (mm_dbg_flag & (MM_DBG_PRINT_SEED|MM_DBG_PRINT_CHAIN)) + for (j = 0; j < *n_regs0; ++j) + for (i = regs0[j].as; i < regs0[j].as + regs0[j].cnt; ++i) + fprintf(stderr, "CN\t%d\t%s\t%d\t%c\t%d\t%d\t%d\n", j, mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff), + i == regs0[j].as? 
0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); + + chain_post(opt, max_chain_gap_ref, mi, km, qlen_sum, n_segs, qlens, n_regs0, regs0, a); + if (!is_sr && !(opt->flag&MM_F_QSTRAND)) { + mm_est_err(mi, qlen_sum, *n_regs0, regs0, a, n_mini_pos, *mini_pos); + *n_regs0 = mm_filter_strand_retained(*n_regs0, regs0); + } + + if (n_segs == 1) { // uni-segment + regs0 = align_regs(opt, mi, km, qlens[0], seqs[0], n_regs0, regs0, a); + regs0 = (mm_reg1_t*)realloc(regs0, sizeof(*regs0) * *n_regs0); + mm_set_mapq(km, *n_regs0, regs0, opt->min_chain_score, opt->a, rep_len, is_sr); + n_regs[0] = *n_regs0, regs[0] = regs0; + } else { // multi-segment + mm_seg_t *seg; + seg = mm_seg_gen(km, hash, n_segs, qlens, *n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains + free(regs0); + for (i = 0; i < n_segs; ++i) { + mm_set_parent(km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent + regs[i] = align_regs(opt, mi, km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a); + mm_set_mapq(km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr); + } + mm_seg_free(km, n_segs, seg); + if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR)) + mm_pair(km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing + } + timers[MM_TIME_ALIGN] += realtime() - t1; + + kfree(km, a); + kfree(km, u); + kfree(km, *mini_pos); +} +#endif + void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { int i, j, rep_len, qlen_sum, n_regs0, n_mini_pos; @@ -241,6 +647,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** mm_reg1_t *regs0; km_stat_t kmst; float chn_pen_gap, chn_pen_skip; + double *timers = b->timers; + double t1 = realtime(); for (i 
= 0, qlen_sum = 0; i < n_segs; ++i) qlen_sum += qlens[i], n_regs[i] = 0, regs[i] = 0; @@ -263,7 +671,9 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff), i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); } + timers[MM_TIME_SEED] += realtime() - t1; + t1 = realtime(); // set max chaining gap on the query and the reference sequence if (is_sr) max_chain_gap_qry = qlen_sum > opt->max_gap? qlen_sum : opt->max_gap; @@ -321,7 +731,9 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** } b->frag_gap = max_chain_gap_ref; b->rep_len = rep_len; + timers[MM_TIME_CHAIN] += realtime() - t1; + t1 = realtime(); regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a, !!(opt->flag&MM_F_QSTRAND)); if (mi->n_alt) { mm_mark_alt(mi, n_regs0, regs0); @@ -358,6 +770,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR)) mm_pair(b->km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing } + timers[MM_TIME_ALIGN] += realtime() - t1; kfree(b->km, mv.a); kfree(b->km, a); @@ -409,8 +822,338 @@ typedef struct { int *n_reg, *seg_off, *n_seg, *rep_len, *frag_gap; mm_reg1_t **reg; mm_tbuf_t **buf; +#if defined(__AMD_SPLIT_KERNELS__) + mm_trbuf_t **trbuf; + int batch_max_reads; + size_t batch_max_anchors; + int gpu_min_n; +#endif } step_t; +#define MIN(a, b) ((a)<(b)?(a):(b)) +#define MAX(a, b) ((a)>(b)?(a):(b)) + +// consolidate timers from worker threads +void mm_consolidate_timers(step_t *s, pipeline_t *p) +{ + // TODO: disabled for sysbio submission + return; + mm_time_seed_min = 0; + mm_time_chain_min = 0; + mm_time_align_min = 0; + mm_time_seed_max = 0; + mm_time_chain_max = 0; + mm_time_align_max 
= 0; + mm_time_seed_avg = 0; + mm_time_chain_avg = 0; + mm_time_align_avg = 0; + for (int i = 0; i < p->n_threads; ++i) { + mm_time_seed_min = MIN(mm_time_seed_min, s->buf[i]->timers[MM_TIME_SEED]); + mm_time_chain_min = MIN(mm_time_chain_min, s->buf[i]->timers[MM_TIME_CHAIN]); + mm_time_align_min = MIN(mm_time_align_min, s->buf[i]->timers[MM_TIME_ALIGN]); + mm_time_seed_max = MAX(mm_time_seed_max, s->buf[i]->timers[MM_TIME_SEED]); + mm_time_chain_max = MAX(mm_time_chain_max, s->buf[i]->timers[MM_TIME_CHAIN]); + mm_time_align_max = MAX(mm_time_align_max, s->buf[i]->timers[MM_TIME_ALIGN]); + mm_time_seed_avg += s->buf[i]->timers[MM_TIME_SEED]; + mm_time_chain_avg += s->buf[i]->timers[MM_TIME_CHAIN]; + mm_time_align_avg += s->buf[i]->timers[MM_TIME_ALIGN]; + mm_time_seed_sum += s->buf[i]->timers[MM_TIME_SEED]; + mm_time_chain_sum += s->buf[i]->timers[MM_TIME_CHAIN]; + mm_time_align_sum += s->buf[i]->timers[MM_TIME_ALIGN]; + } + mm_time_seed_avg /= p->n_threads; + mm_time_chain_avg /= p->n_threads; + mm_time_align_avg /= p->n_threads; + + fprintf(stderr, "----------------------------------------------------\n"); + fprintf(stderr, " Min (sec) Max (sec) Avg (sec) \n"); + fprintf(stderr, "----------------------------------------------------\n"); + fprintf(stderr, "Seed = %11.3f %11.3f %11.3f\n", + mm_time_seed_min, mm_time_seed_max, mm_time_seed_avg); + fprintf(stderr, "Chain = %11.3f %11.3f %11.3f\n", + mm_time_chain_min, mm_time_chain_max, mm_time_chain_avg); + fprintf(stderr, "Align = %11.3f %11.3f %11.3f\n", + mm_time_align_min, mm_time_align_max, mm_time_align_avg); + fprintf(stderr, "----------------------------------------------------\n"); + fprintf(stderr, "Avg (seed + chain + align) per thread = %.3f secs\n", (mm_time_seed_avg + mm_time_chain_avg + mm_time_align_avg)); + fprintf(stderr, "Total (seed + chain + align) (all batches) for %d thread(s) = %.3f secs\n", p->n_threads, (mm_time_seed_sum + mm_time_chain_sum + mm_time_align_sum)); + +} + + +#if 
defined(__AMD_SPLIT_KERNELS__) + +void mm_trbuf_is_full(mm_trbuf_t* tr, step_t *s){ + while (tr->acc_batch.total_n > s->batch_max_anchors) { // if the batch is full + tr->is_full = 1; + tr->is_pending = 1; + // move last read from acc_batch to pending batch (another memory poll) + chain_read_t *read_ptr_acc_batch = &tr->acc_batch.reads[tr->acc_batch.count - 1]; + chain_read_t *read_ptr_pending_batch = &tr->pending_batch.reads[tr->pending_batch.count]; + /* deep copy, with memory pool transaction*/ + *read_ptr_pending_batch = *read_ptr_acc_batch; + if (s->p->opt->flag & MM_F_INDEPEND_SEG) { + read_ptr_pending_batch->qlens = (int *)kmalloc(tr->pending_batch.km, sizeof(int)); + read_ptr_pending_batch->qseqs = (const char **)kmalloc(tr->pending_batch.km, sizeof(const char*)); + read_ptr_pending_batch->qlens[0] = read_ptr_acc_batch->qlens[0]; + read_ptr_pending_batch->qseqs[0] = read_ptr_acc_batch->qseqs[0]; + } else { + read_ptr_pending_batch->qlens = (int *)kmalloc(tr->pending_batch.km, sizeof(int)*read_ptr_acc_batch->n_seg); + read_ptr_pending_batch->qseqs = (const char **)kmalloc(tr->pending_batch.km, sizeof(const char*)*read_ptr_acc_batch->n_seg); + memcpy(read_ptr_pending_batch->qlens, read_ptr_acc_batch->qlens, sizeof(int)*read_ptr_acc_batch->n_seg); + memcpy(read_ptr_pending_batch->qseqs, read_ptr_acc_batch->qseqs, sizeof(const char*)*read_ptr_acc_batch->n_seg); + } + read_ptr_pending_batch->mini_pos = (uint64_t*)kmalloc(tr->pending_batch.km, read_ptr_acc_batch->n_mini_pos * sizeof(uint64_t)); + read_ptr_pending_batch->a = (mm128_t*)kmalloc(tr->pending_batch.km, read_ptr_acc_batch->n * sizeof(mm128_t)); + memcpy(read_ptr_pending_batch->mini_pos, read_ptr_acc_batch->mini_pos, read_ptr_acc_batch->n_mini_pos * sizeof(uint64_t)); + memcpy(read_ptr_pending_batch->a, read_ptr_acc_batch->a, read_ptr_acc_batch->n * sizeof(mm128_t)); + strcpy(read_ptr_pending_batch->seq.name, read_ptr_acc_batch->seq.name); + tr->pending_batch.count++; + tr->pending_batch.total_n += 
read_ptr_acc_batch->n; + + // remove read from acc_batch + tr->acc_batch.count--; + tr->acc_batch.total_n -= read_ptr_acc_batch->n; + kfree(tr->acc_batch.km, read_ptr_acc_batch->mini_pos); + kfree(tr->acc_batch.km, read_ptr_acc_batch->a); + kfree(tr->acc_batch.km, read_ptr_acc_batch->qlens); + kfree(tr->acc_batch.km, read_ptr_acc_batch->qseqs); + } +} + +static void worker_for(void *_data, long i_in, int tid) // kt_for() callback +{ + step_t *s = (step_t *)_data; + long i = i_in; + int j, iread, off, pe_ori = s->p->opt->pe_ori; + double t = 0.0; + mm_tbuf_t *b = s->buf[tid]; + mm_trbuf_t *tr = s->trbuf[tid]; + + // Check if this is a valid read + if (i != -1) { + off = s->seg_off[i]; + + assert(s->n_seg[i] <= MM_MAX_SEG); + if (mm_dbg_flag & MM_DBG_PRINT_QNAME) { + fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq); + t = realtime(); + } + + int n_indep_reads = (s->p->opt->flag & MM_F_INDEPEND_SEG) ? s->n_seg[i] : 1; + void *km; + chain_read_t *read_ptr = 0; + if ( n_indep_reads + tr->acc_batch.count <= s->batch_max_reads){ + read_ptr = tr->acc_batch.reads + tr->acc_batch.count; + tr->acc_batch.count += n_indep_reads; + km = tr->acc_batch.km; + } else { + read_ptr = tr->pending_batch.reads + tr->pending_batch.count; + tr->pending_batch.count += n_indep_reads; + tr->is_full = 1; + tr->is_pending = 1; + km = tr->pending_batch.km; + } + + if (s->p->opt->flag & MM_F_INDEPEND_SEG) { // assign to different chain_read_t if segments are indepent + for (j = 0; j < s->n_seg[i]; ++j) { + read_ptr->qlens = (int *)kmalloc(km, sizeof(int)); + read_ptr->qseqs = (const char **)kmalloc(km, sizeof(const char *)); + if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) + mm_revcomp_bseq(&s->seq[off + j]); + read_ptr->qlens[0] = s->seq[off + j].l_seq; + read_ptr->qseqs[0] = s->seq[off + j].seq; + read_ptr->n_seg = 1; + + read_ptr->seq.i = i; + read_ptr->seq.seg_id = j; + strcpy(read_ptr->seq.name, s->seq[off + j].name); + 
read_ptr->seq.n_alt = s->p->mi->n_alt; + read_ptr->seq.is_alt = 0; + + read_ptr++; + } + + } else { + read_ptr->qlens = (int *)kmalloc(km, s->n_seg[i] * sizeof(int)); + read_ptr->qseqs = (const char **)kmalloc(km, s->n_seg[i] * sizeof(const char *)); + read_ptr->n_seg = s->n_seg[i]; + for (j = 0; j < s->n_seg[i]; ++j) { + if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) + mm_revcomp_bseq(&s->seq[off + j]); + read_ptr->qlens[j] = s->seq[off + j].l_seq; + read_ptr->qseqs[j] = s->seq[off + j].seq; + } + + read_ptr->seq.i = i; + read_ptr->seq.seg_id = 0; + strcpy(read_ptr->seq.name, s->seq[off].name); + read_ptr->seq.n_alt = s->p->mi->n_alt; + read_ptr->seq.is_alt = 0; + + read_ptr++; + } + + + // Seed + for (j = 0; j < n_indep_reads; j++) { + read_ptr--; + mm_map_seed(s->p->mi, s->p->opt, read_ptr, b, km); + if + (tr->is_pending) tr->pending_batch.total_n += read_ptr->n; + else + tr->acc_batch.total_n += read_ptr->n; + assert(read_ptr->n_mini_pos >= 0); + } + + + // move reads from acc_batch to pending_batch if necessary + mm_trbuf_is_full(tr, s); + } else { + tr->is_full = 1; // set acc_batch to ready to launch if this is the last read + } + + // Did we accumulate enough reads or get to the last batch of reads? + while (tr->is_full || (i_in == -1 && tr->has_launched)) { + // Chain + double t1 = realtime(); + /* perform chaining on acc_batch and move to launched_batch. move current pending batch to acc_batch. Store results in pending batch. */ + if (tr->is_full) { + + if (s->p->opt->flag & MM_F_GPU_CHAIN){ + // chaining on GPU + mm_batch_trbuf_t kernel_batch = tr->acc_batch; + chain_stream_gpu(s->p->mi, s->p->opt, &kernel_batch.reads, &kernel_batch.count, tid, tr->launched_batch.km); + // check if the returned batch exists, and/or is the launched_batch.
+ if (kernel_batch.reads) { // if chain_stream_gpu returns non-NULL reads + assert(tr->has_launched); + // FIXME: temporary solution for reads that fail to fit in microbatch + // cpu kernel + for (kernel_batch.count; kernel_batch.count < tr->launched_batch.count; kernel_batch.count++) { + fprintf(stderr, "[WARNING] Run CPU kernel for read %d\n", kernel_batch.count); + mm_map_chain(s->p->mi, s->p->opt, &kernel_batch.reads[kernel_batch.count], b, tr->launched_batch.km); + } + assert(kernel_batch.count == tr->launched_batch.count); + kernel_batch = tr->launched_batch; + tr->launched_batch = tr->acc_batch; + tr->acc_batch = tr->pending_batch; + tr->pending_batch = kernel_batch; + tr->is_pending = 1; + } else { + assert(!tr->has_launched); // no launched batch + kernel_batch = tr->launched_batch; + tr->launched_batch = tr->acc_batch; + tr->acc_batch = tr->pending_batch; + tr->pending_batch = kernel_batch; + tr->is_pending = 0; + } + tr->is_full = 0; + tr->has_launched = 1; + } else { + // cpu kernel + for (iread=0; iread < tr->acc_batch.count; iread++) { + mm_map_chain(s->p->mi, s->p->opt, &tr->acc_batch.reads[iread], b, tr->acc_batch.km); + } + mm_batch_trbuf_t kernel_batch = tr->acc_batch; + tr->acc_batch = tr->pending_batch; + tr->pending_batch = kernel_batch; + tr->has_launched = 0; + tr->is_full = 0; + tr->is_pending = 1; + // end of cpu kernel + } + + } else if (s->p->opt->flag & MM_F_GPU_CHAIN){ // clean up if all the pending kernels have been launched + assert(i_in == -1 && tr->has_launched); + mm_batch_trbuf_t kernel_batch; + finish_stream_gpu(s->p->mi, s->p->opt, &kernel_batch.reads, &kernel_batch.count, tid, tr->launched_batch.km); + // FIXME: temporary solution for reads that fail to fit in microbatch + // cpu kernel + for (kernel_batch.count; kernel_batch.count < tr->launched_batch.count; kernel_batch.count++) { + fprintf(stderr, "[WARNING] Run CPU kernel for read %d\n", iread); + mm_map_chain(s->p->mi, s->p->opt, &kernel_batch.reads[kernel_batch.count], b, kernel_batch.km); + } +
assert(kernel_batch.count == tr->launched_batch.count); + kernel_batch = tr->launched_batch; + tr->is_full = 0; + tr->is_pending = 1; + tr->has_launched = 0; + tr->launched_batch = tr->acc_batch; + tr->acc_batch = tr->pending_batch; + tr->pending_batch = kernel_batch; + } + + // NOTE: Because it just submit the GPU task, this timer doesn't make sense for GPU + b->timers[MM_TIME_CHAIN] += realtime() - t1; + + mm_batch_trbuf_t *batch = &tr->pending_batch; + + if (tr->is_pending){ + + /* Copy rep_len & frag_gap to step_t */ + for (iread = 0; iread < batch->count; iread++) { + i = batch->reads[iread].seq.i; + j = batch->reads[iread].seq.seg_id; + off = s->seg_off[i] + j; + for (int k = 0; k < batch->reads[iread].n_seg; k++) { + s->rep_len[off + k] = batch->reads[iread].rep_len; + s->frag_gap[off + k] = batch->reads[iread].frag_gap; + } + } + // Align + for (iread = 0; iread < batch->count; iread++) { + i = batch->reads[iread].seq.i; + off = s->seg_off[i]; + j = batch->reads[iread].seq.seg_id; + mm_map_align(s->p->mi, s->p->opt, &batch->reads[iread], &s->reg[off + j], &s->n_reg[off + j], b, batch->km) ; + if (s->p->opt->flag & MM_F_INDEPEND_SEG) { + if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori >> 1 & 1)) || + (j == 1 && (pe_ori & 1)))) { + int k, t; + mm_revcomp_bseq(&s->seq[off + j]); + for (k = 0; k < s->n_reg[off + j]; ++k) { + mm_reg1_t *r = &s->reg[off + j][k]; + t = r->qs; + r->qs = batch->reads[iread].qlens[j] - r->qe; + r->qe = batch->reads[iread].qlens[j] - t; + r->rev = !r->rev; + } + } + } else { + for (j = 0; j < batch->reads[iread].n_seg; + ++j) { // flip the query strand and coordinate to the + // original read strand + if (s->n_seg[i] == 2 && + ((j == 0 && (pe_ori >> 1 & 1)) || + (j == 1 && (pe_ori & 1)))) { + int k, t; + mm_revcomp_bseq(&s->seq[off + j]); + for (k = 0; k < s->n_reg[off + j]; ++k) { + mm_reg1_t *r = &s->reg[off + j][k]; + t = r->qs; + r->qs = batch->reads[iread].qlens[j] - r->qe; + r->qe = batch->reads[iread].qlens[j] - t; + r->rev = 
!r->rev; + } + } + } + } + if (mm_dbg_flag & MM_DBG_PRINT_QNAME) + fprintf(stderr, "QT\t%s\t%d\t%.6f\n", s->seq[off].name, tid, realtime() - t); + } + + // reset pending batch + mm_trbuf_batch_reset(batch, s->batch_max_reads, s->p->opt); + tr->pending_batch = *batch; + tr->is_pending = 0; + tr->pending_batch.batchid = tr->acc_batch.batchid + 1; + + } // if tr->is_pending; + } +} +#endif + +#ifndef __AMD_SPLIT_KERNELS__ static void worker_for(void *_data, long i, int tid) // kt_for() callback { step_t *s = (step_t*)_data; @@ -457,6 +1200,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback if (mm_dbg_flag & MM_DBG_PRINT_QNAME) fprintf(stderr, "QT\t%s\t%d\t%.6f\n", s->seq[off].name, tid, realtime() - t); } +#endif static void merge_hits(step_t *s) { @@ -542,6 +1286,10 @@ static void *worker_pipeline(void *shared, int step, void *in) s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*)); for (i = 0; i < p->n_threads; ++i) s->buf[i] = mm_tbuf_init(); +#if defined(__AMD_SPLIT_KERNELS__) + s->trbuf = (mm_trbuf_t**)calloc(p->n_threads, sizeof(mm_trbuf_t*)); +#endif + s->n_reg = (int*)calloc(5 * s->n_seq, sizeof(int)); s->seg_off = s->n_reg + s->n_seq; // seg_off, n_seg, rep_len and frag_gap are allocated together with n_reg s->n_seg = s->seg_off + s->n_seq; @@ -557,15 +1305,36 @@ static void *worker_pipeline(void *shared, int step, void *in) return s; } else free(s); } else if (step == 1) { // step 1: map +#if defined(__AMD_SPLIT_KERNELS__) + step_t *s = (step_t *)in; + if (p->opt->flag & MM_F_GPU_CHAIN) { + s->batch_max_anchors = p->opt->gpu_chain_max_anchors; + // fprintf(stderr, "s->batch_max_anchors = %lu, p->opt->gpu_chain_max_anchors = %lu\n", s->batch_max_anchors, p->opt->gpu_chain_max_anchors); + s->batch_max_reads = p->opt->gpu_chain_max_reads; + s->gpu_min_n = p->opt->gpu_chain_min_n; + } else { + s->batch_max_anchors = SIZE_MAX; + s->batch_max_reads = N_ACCUM; + } + for (i = 0; i < p->n_threads; ++i) + s->trbuf[i] = 
mm_trbuf_init(s->batch_max_reads, p->opt); +#endif if (p->n_parts > 0) merge_hits((step_t*)in); else kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag); return in; } else if (step == 2) { // step 2: output void *km = 0; - step_t *s = (step_t*)in; + step_t *s = (step_t*)in; const mm_idx_t *mi = p->mi; + // consolidate timers from threads + mm_consolidate_timers (s, p); for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); +#if defined(__AMD_SPLIT_KERNELS__) + for (i = 0; i < p->n_threads; ++i) mm_trbuf_destroy(s->trbuf[i]); + free(s->trbuf); +#endif + if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (k = 0; k < s->n_frag; ++k) { int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k]; diff --git a/mi210_below50k.json b/mi210_below50k.json new file mode 100644 index 00000000..7fce048c --- /dev/null +++ b/mi210_below50k.json @@ -0,0 +1,29 @@ +{ + "//config is for": "aac cloud. Fits one batch + 5% x 4 long buffer avg_read_n 10k", + "num_streams": 1, + "min_n": 512, + "//min_n": "queries with less anchors will be handled on cpu", + "long_seg_buffer_size": 1117376000, + "max_total_n": 2036880000, + "max_read": 2036880, + "avg_read_n": 20000, + "//avg_read_n": "expect average number of anchors per read, not used if max_total_n and max_read are specified", + "range_kernel": { + "blockdim": 512, + "cut_check_anchors": 10, + "//cut_check_anchors": "Number of anchors to check to attemp a cut", + "anchor_per_block": 32768, + "//anchor_per_block": "Number of anchors each block handle. 
Must be int * blockdim" + }, + "score_kernel": { + "micro_batch": 6, + "mid_blockdim": 512, + "//static options for mid_blockdim": "128/256/512/1024", + "short_griddim": 3328, + "mid_griddim": 3328, + "long_griddim": 208, + "//normal reads benefit from more blocks": "long_griddim = 2 * num of CUs", + "long_seg_cutoff": 20, + "mid_seg_cutoff": 3 + } +} diff --git a/mi210_over50k.json b/mi210_over50k.json new file mode 100644 index 00000000..ec4dfdce --- /dev/null +++ b/mi210_over50k.json @@ -0,0 +1,28 @@ +{ + "//config is for": "aac cloud. Fits one batch + 5% x 4 long buffer avg_read_n 10k", + "num_streams": 1, + "min_n": 512, + "//min_n": "queries with less anchors will be handled on cpu", + "long_seg_buffer_size": 1117376000, + "max_total_n": 2036880000, + "max_read": 2036880, + "avg_read_n": 20000, + "//avg_read_n": "expect average number of anchors per read, not used if max_total_n and max_read are specified", + "range_kernel": { + "blockdim": 512, + "cut_check_anchors": 10, + "//cut_check_anchors": "Number of anchors to check to attemp a cut", + "anchor_per_block": 32768, + "//anchor_per_block": "Number of anchors each block handle. 
Must be int * blockdim" + }, + "score_kernel": { + "micro_batch": 6, + "mid_blockdim": 512, + "short_griddim": 3328, + "mid_griddim": 3328, + "long_griddim": 104, + "//long reads benefit from less blocks": "long_griddim = num of CUs", + "long_seg_cutoff": 20, + "mid_seg_cutoff": 3 + } +} diff --git a/minimap.h b/minimap.h index 13e12e03..def59118 100644 --- a/minimap.h +++ b/minimap.h @@ -40,6 +40,7 @@ #define MM_F_QSTRAND (0x100000000LL) #define MM_F_NO_INV (0x200000000LL) #define MM_F_NO_HASH_NAME (0x400000000LL) +#define MM_F_GPU_CHAIN (0x800000000LL) // use gpu for chaining #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 @@ -61,6 +62,8 @@ #define MM_CIGAR_STR "MIDNSHP=XB" +#define MM_N_THR_TIMERS 8 + #ifdef __cplusplus extern "C" { #endif @@ -174,6 +177,12 @@ typedef struct { int64_t cap_kalloc; const char *split_prefix; + + /* gpu batch parameters */ + int gpu_chain_max_reads; + size_t gpu_chain_max_anchors; + int gpu_chain_min_n; + char gpu_config_file[1024]; } mm_mapopt_t; // index reader @@ -195,6 +204,25 @@ typedef struct mm_tbuf_s mm_tbuf_t; extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag extern double mm_realtime0; // wall-clock timer +typedef enum { + MM_TIME_SEED=0, + MM_TIME_CHAIN, + MM_TIME_ALIGN +} mm_thr_timer_t; + +extern double mm_time_seed_min; // timer for seeding (min for all threads) +extern double mm_time_seed_max; // timer for seeding (max for all threads) +extern double mm_time_seed_avg; // timer for seeding (avg of all threads) +extern double mm_time_seed_sum; // timer for seeding (sum of all threads) +extern double mm_time_chain_min; // timer for chaining (min for all threads) +extern double mm_time_chain_max; // timer for chaining (max for all threads) +extern double mm_time_chain_avg; // timer for chaining (avg of all threads) +extern double mm_time_chain_sum; // timer for chaining (sum of all threads) +extern double mm_time_align_min; // timer for 
alignment (min for all threads) +extern double mm_time_align_max; // timer for alignment (max for all threads) +extern double mm_time_align_avg; // timer for alignment (avg of all threads) +extern double mm_time_align_sum; // timer for alignment (sum of all threads) + /** * Set default or preset parameters * diff --git a/misc.c b/misc.c index f3a29f86..39b37a31 100644 --- a/misc.c +++ b/misc.c @@ -4,6 +4,18 @@ int mm_verbose = 1; int mm_dbg_flag = 0; double mm_realtime0; +double mm_time_seed_min = 0; +double mm_time_seed_max = 0; +double mm_time_seed_avg = 0; +double mm_time_seed_sum = 0; +double mm_time_chain_min = 0; +double mm_time_chain_max = 0; +double mm_time_chain_avg = 0; +double mm_time_chain_sum = 0; +double mm_time_align_min = 0; +double mm_time_align_max = 0; +double mm_time_align_avg = 0; +double mm_time_align_sum = 0; #if defined(WIN32) || defined(_WIN32) #include diff --git a/options.c b/options.c index 235b6dd8..858e7a50 100644 --- a/options.c +++ b/options.c @@ -61,6 +61,8 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->pe_ori = 0; // FF opt->pe_bonus = 33; + + strcpy(opt->gpu_config_file, "gpu_config.json"); } void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) diff --git a/scripts/aac_integrated.sh b/scripts/aac_integrated.sh new file mode 100644 index 00000000..3db6304d --- /dev/null +++ b/scripts/aac_integrated.sh @@ -0,0 +1,38 @@ +#!/bin/bash +WORKSPACE_DIR=/shared/prod/home/liuxs/bioinfo/minimap2 +EXE_PATH=$WORKSPACE_DIR +CONFIG_PATH=$WORKSPACE_DIR +DATA_PATH=/shared/prod/home/liuxs/bioinfo/Profile_mm2/data +# NOTE: currently `-t n_thread`, the n_thread must be equal to the num_streams in gpu_config.json +N_THREAD=$(sed -n 's/.*"num_streams": \([0-9]*\).*/\1/p' ${CONFIG_PATH}/gpu_config.json) +# Array of LONG_BLOCK_SIZE values +MID_BLOCK_SIZES=( 64 ) +MID_CUTS=( 1 ) +LONG_CUTS=( 50 100 ) +GPU_CONFIGS=( gpu_config.json ) +DATA_SETS=( 1kto300k 200kto300k ) + +# Iterate over LONG_BLOCK_SIZES array +for DATA_SET in "${DATA_SETS[@]}" +do + 
QUERY_FILE=$DATA_PATH/random_500MBases_${DATA_SET}.fa + for MID_BLOCK_SIZE in "${MID_BLOCK_SIZES[@]}" + do + for MID_CUT in "${MID_CUTS[@]}" + do + for LONG_CUT in "${LONG_CUTS[@]}" + do + for GPU_CONFIG in "${GPU_CONFIGS[@]}" + do + echo "Executing with MID_BLOCK_SIZE=${MID_BLOCK_SIZE} MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT}" + + make clean + + make GPU_CONFIG=${GPU_CONFIG} SHORT_BLOCK_SIZE=64 MID_BLOCK_SIZE=${MID_BLOCK_SIZE} LONG_BLOCK_SIZE=1024 MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT} + + omniperf profile -n integrated_${MID_BLOCK_SIZE}_${LONG_CUT}_${DATA_SET}_report --device 0 -- ${EXE_PATH}/minimap2 ${DATA_PATH}/hg38.mmi -t ${N_THREAD} --max-chain-skip=2147483647 --gpu-chain ${QUERY_FILE} > test_logs.out + done + done + done + done +done \ No newline at end of file diff --git a/scripts/aac_omniperf.slurm b/scripts/aac_omniperf.slurm new file mode 100644 index 00000000..ecc55ef8 --- /dev/null +++ b/scripts/aac_omniperf.slurm @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=minimap2 # Job name +#SBATCH --partition=1CN128C8G2H_2IB_MI210_RHEL8 # Specify the MI210 GPU partition or queue +#SBATCH --gres=gpu:1 # Request 1 GPU +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks (processes) per node +#SBATCH --cpus-per-task=16 # Number of CPU cores per task +#SBATCH --mem=500g # Memory per node +#SBATCH --time=20:00:00 # Maximum execution time (HH:MM:SS) +#SBATCH --output=slurm_output/sample_sbatch_job.%j.out # Output file +#SBATCH --error=slurm_output/sample_sbatch_job.%j.err # Error file + +export MM2_ROOT=$HOME/minimap2 + +# Load necessary modules (if required) +source /etc/profile.d/modules.sh +scl enable gcc-toolset-11 bash +module unuse /shared/apps/modules/ubuntu/modulefiles +module use /shared/apps/modules/rhel8/modulefiles +module unuse /shared/apps/modules/rhel9/modulefiles +module unuse /shared/apps/modules/sles15sp4/modulefiles +module unuse /shared/apps/modules/centos8/modulefiles +module unuse 
/shared/apps/modules/rocky9/modulefiles + +module purge +module load rocm-5.7.1 +# module load rocm/6.0.0 +# export AMD_LOG_LEVEL=4 +# Replace the following line with the actual command(s) you want to run +cd $MM2_ROOT +make clean +export SUFFIX=-omniperf-1 +make MICRO_BATCH=1 GPU_CONFIG=aac_config_half.json SHORT_BLOCK_SIZE=64 LONG_BLOCK_SIZE=1024 MID_BLOCK_SIZE=512 MID_CUT=1 LONG_CUT=20 SUFFIX=$SUFFIX +# ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/reads_4f452f4a-d82a-4580-981b-32d14b997217.fa +# ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_500MBases_200kto300k.fa +omniperf profile -p omniperf_output -n long_seg_${SLURM_JOB_ID} --device 0 -- ./minimap2$SUFFIX -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_90kto100k.fa +# omniperf profile -p omniperf_output -n long_seg_${SLURM_JOB_ID} --device 0 -- ./minimap2$SUFFIX -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_500MBases_90kto100k.fa +# omniperf profile -p omniperf_output -n long_seg_${SLURM_JOB_ID} --device 0 -- ./minimap2$SUFFIX -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/long_read_600M.fa +# omniperf profile -p omniperf_output n long_seg_${SLURM_JOB_ID} --device 0 -- ./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/reads_4f452f4a-d82a-4580-981b-32d14b997217.fa +# omniperf profile -p omniperf_output -n long_seg_${SLURM_JOB_ID} --device 0 -- ./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain 
/shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_500MBases_200kto300k.fa +# Optional: You can add post-processing commands here + +# End of the script diff --git a/scripts/aac_omnitrace.slurm b/scripts/aac_omnitrace.slurm new file mode 100644 index 00000000..f26f86fa --- /dev/null +++ b/scripts/aac_omnitrace.slurm @@ -0,0 +1,46 @@ +#!/bin/bash +#SBATCH --job-name=minimap2 # Job name +#SBATCH --partition=1CN128C8G2H_2IB_MI210_RHEL8 # Specify the MI210 GPU partition or queue +#SBATCH --gres=gpu:1 # Request 1 GPU +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks (processes) per node +#SBATCH --cpus-per-task=16 # Number of CPU cores per task +#SBATCH --mem=500g # Memory per node +#SBATCH --time=01:40:00 # Maximum execution time (HH:MM:SS) +#SBATCH --output=slurm_output/sample_sbatch_job.%j.out # Output file +#SBATCH --error=slurm_output/sample_sbatch_job.%j.err # Error file + +export MM2_ROOT=$HOME/minimap2 + +# Load necessary modules (if required) +source /etc/profile.d/modules.sh +scl enable gcc-toolset-11 bash +module unuse /shared/apps/modules/ubuntu/modulefiles +module use /shared/apps/modules/rhel8/modulefiles +module unuse /shared/apps/modules/rhel9/modulefiles +module unuse /shared/apps/modules/sles15sp4/modulefiles +module unuse /shared/apps/modules/centos8/modulefiles +module unuse /shared/apps/modules/rocky9/modulefiles + +module purge +module load rocm-5.7.1 +# module load rocm/6.0.0 +# export AMD_LOG_LEVEL=4 +# Replace the following line with the actual command(s) you want to run +cd $MM2_ROOT +make clean +make MICRO_BATCH=5 GPU_CONFIG=aac_config.json SHORT_BLOCK_SIZE=64 LONG_BLOCK_SIZE=1024 MID_BLOCK_SIZE=512 MID_CUT=1 LONG_CUT=100 DEBUG=1 DEBUG_ANALYSIS=1 +# omnitrace-sample -PTDH -E all -I rocm-smi -I roctracer -I rocprofiler -I roctx -o omni_output -- ./minimap2 -K 2500000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi 
/shareddata/umich_folder/data/ONT/random_1GBases_100kto300k.fa +omnitrace-sample -PTDH -E all -I rocm-smi -I roctracer -I rocprofiler -I roctx -o omni_output -- ./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_90kto100k.fa + +# ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/reads_4f452f4a-d82a-4580-981b-32d14b997217.fa +# ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_500MBases_200kto300k.fa +# rocprof --stats -o rocprof_output/long_seg.${SLURM_JOB_ID}.csv ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/short_seg_reads_from_1kto10k_distri.fa +# rocprof --stats -o rocprof_output/long_seg.${SLURM_JOB_ID}.csv ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/reads_4f452f4a-d82a-4580-981b-32d14b997217.fa +# omnitrace-sample -PTDH -E all -I rocm-smi -I roctracer -I rocprofiler -I roctx -o omni_output -- ./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_500MBases_200kto300k.fa +# omnitrace-sample -PTDH -E all -I rocm-smi -I roctracer -I rocprofiler -I roctx -o omni_output -- ./minimap2 -K 2000000000 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_7GBases_10kto300k.fa +# Optional: You can add post-processing commands here + +echo "Exit: $?" 
+ +# End of the script diff --git a/scripts/aac_rocprof.slurm b/scripts/aac_rocprof.slurm new file mode 100644 index 00000000..45839094 --- /dev/null +++ b/scripts/aac_rocprof.slurm @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH --job-name=minimap2 # Job name +#SBATCH --partition=1CN128C8G2H_2IB_MI210_RHEL8 # Specify the MI210 GPU partition or queue +#SBATCH --gres=gpu:1 # Request 1 GPU +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks (processes) per node +#SBATCH --cpus-per-task=16 # Number of CPU cores per task +#SBATCH --mem=0 # Memory per node +#SBATCH --time=00:40:00 # Maximum execution time (HH:MM:SS) +#SBATCH --output=slurm_output/sample_sbatch_job.%j.out # Output file +#SBATCH --error=slurm_output/sample_sbatch_job.%j.err # Error file + +# Load necessary modules (if required) +source /etc/profile.d/modules.sh +scl enable gcc-toolset-11 bash +module unuse /shared/apps/modules/ubuntu/modulefiles +module use /shared/apps/modules/rhel8/modulefiles +module unuse /shared/apps/modules/rhel9/modulefiles +module unuse /shared/apps/modules/sles15sp4/modulefiles +module unuse /shared/apps/modules/centos8/modulefiles +module unuse /shared/apps/modules/rocky9/modulefiles + +module load rocm-5.4.3 +# export AMD_LOG_LEVEL=4 +# Replace the following line with the actual command(s) you want to run +cd /shared/prod/home/liuxs/bioinfo/minimap2/ +# make clean +# make MICRO_BATCH=4 GPU_CONFIG=gpu_config.json SHORT_BLOCK_SIZE=64 LONG_BLOCK_SIZE=1024 MID_BLOCK_SIZE=512 MID_CUT=1 LONG_CUT=100 +# ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/reads_4f452f4a-d82a-4580-981b-32d14b997217.fa +# ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_500MBases_200kto300k.fa +rocprof --hip-trace --roctx-trace rocprof_output/long_seg.${SLURM_JOB_ID}.csv ./minimap2 -K 2000000000 -t 1 
--max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_2GBases_10kto300k.fa +# rocprof --stats -o rocprof_output/long_seg.${SLURM_JOB_ID}.csv ./minimap2 -t 1 --max-chain-skip=2147483647 --gpu-chain /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/reads_4f452f4a-d82a-4580-981b-32d14b997217.fa + +# Optional: You can add post-processing commands here + +# End of the script diff --git a/scripts/aac_timing_all.sh b/scripts/aac_timing_all.sh new file mode 100644 index 00000000..7ee82e52 --- /dev/null +++ b/scripts/aac_timing_all.sh @@ -0,0 +1,42 @@ +#!/bin/bash +WORKSPACE_DIR=/shared/prod/home/liuxs/bioinfo/minimap2 +EXE_PATH=$WORKSPACE_DIR +CONFIG_PATH=$WORKSPACE_DIR +# DATA_PATH=/shared/prod/home/liuxs/bioinfo/Profile_mm2/data +DATA_PATH=/shareddata/umich_folder/data/ONT +# NOTE: currently `-t n_thread`, the n_thread must be equal to the num_streams in gpu_config.json +N_THREAD=$(sed -n 's/.*"num_streams": \([0-9]*\).*/\1/p' ${CONFIG_PATH}/gpu_config.json) +# Array of LONG_BLOCK_SIZE values +MID_BLOCK_SIZES=( 512 ) +MID_CUTS=( 1 ) +LONG_CUTS=( 100 ) +GPU_CONFIGS=( gpu_config.json ) +# DATA_SETS=( 50kto100k ) +# DATA_SETS=( 1kto5k 1kto10k 1kto50k 1kto300k 200kto300k 1kto20k 1kto30k 1kto70k 1kto200k 10kto50k 10kto100k 50kto100k) +# DATA_SETS=( 1kto300k 50kto300k 100kto300k 150kto300k 200kto300k 250kto300k 1kto200k 20kto200k 50kto200k 70kto200k 100kto200k 130kto200k 150kto200k 170kto200k) +DATA_SETS=( 1kto5k 9kto10k 10kto20k 20kto30k 40kto50k 90kto100k 110kto120k 140kto150k 180kto200k 200kto250k 200kto300k ) + +# Iterate over LONG_BLOCK_SIZES array +for MID_BLOCK_SIZE in "${MID_BLOCK_SIZES[@]}" +do + for MID_CUT in "${MID_CUTS[@]}" + do + for LONG_CUT in "${LONG_CUTS[@]}" + do + echo "Executing with MID_BLOCK_SIZE=${MID_BLOCK_SIZE} MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT}" + make clean + make GPU_CONFIG=${GPU_CONFIG} SHORT_BLOCK_SIZE=64 MID_BLOCK_SIZE=${MID_BLOCK_SIZE} 
LONG_BLOCK_SIZE=1024 MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT} + + for GPU_CONFIG in "${GPU_CONFIGS[@]}" + do + for DATA_SET in "${DATA_SETS[@]}" + do + QUERY_FILE=$DATA_PATH/random_500MBases_${DATA_SET}.fa + echo "Executing on dataset ${DATA_SET}" + filename="profile_output/data-${DATA_SET}_profile_${N_THREAD}_midblk-${MID_BLOCK_SIZE}_cut-${LONG_CUT}" + ${EXE_PATH}/minimap2 ${DATA_PATH}/hg38.mmi -t ${N_THREAD} --max-chain-skip=2147483647 --gpu-chain ${QUERY_FILE} > test.out 2> $filename + done + done + done + done +done \ No newline at end of file diff --git a/scripts/acc_integrated.slurm b/scripts/acc_integrated.slurm new file mode 100755 index 00000000..cefcd2b5 --- /dev/null +++ b/scripts/acc_integrated.slurm @@ -0,0 +1,52 @@ +#!/bin/bash +#SBATCH --job-name=minimap2 # Job name +#SBATCH --partition=1CN128C8G2H_2IB_MI210_RHEL8 # Specify the MI210 GPU partition or queue +#SBATCH --gres=gpu:1 # Request 1 GPU +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks (processes) per node +#SBATCH --cpus-per-task=16 # Number of CPU cores per task +#SBATCH --mem=500g # Memory per node +#SBATCH --time=10:40:00 # Maximum execution time (HH:MM:SS) +#SBATCH --output=slurm_output/sample_sbatch_job.%j.out # Output file +#SBATCH --error=slurm_output/sample_sbatch_job.%j.err # Error file + +#salloc --partition=1CN128C8G2H_2IB_MI210_RHEL8 --gres=gpu:1 --nodes=1 --cpus-per-task=8 --time=01:00:00 + +export MM2_ROOT=$HOME/minimap2 + +# Load necessary modules (if required) +source /etc/profile.d/modules.sh +scl enable gcc-toolset-11 bash +module unuse /shared/apps/modules/ubuntu/modulefiles +module use /shared/apps/modules/rhel8/modulefiles +module unuse /shared/apps/modules/rhel9/modulefiles +module unuse /shared/apps/modules/sles15sp4/modulefiles +module unuse /shared/apps/modules/centos8/modulefiles +module unuse /shared/apps/modules/rocky9/modulefiles + +module purge +module load rocm-5.7.1 +# module load rocm-6.0.0 +# Replace the following line with 
the actual command(s) you want to run +cd $MM2_ROOT +# SOL exp + +# random data test +# export AMD_LOG_LEVEL=4 +#make clean +# debug level: N/A info analyze verbose +#make GPU=AMD DEBUG=analyze +./minimap2 -K 2000000000 -t 1 --gpu-chain --gpu-cfg /shared/prod/home/liuxs/bioinfo/minimap2/mi210_over50k.json /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_90kto100k.fa > out.paf +echo "Exit: $?" +./minimap2 -K 2000000000 -t 1 --gpu-chain --gpu-cfg /shared/prod/home/liuxs/bioinfo/minimap2/mi210_below50k.json /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_10kto20k.fa > out.paf +echo "Exit: $?" +./minimap2 -K 2000000000 -t 1 --gpu-chain --gpu-cfg /shared/prod/home/liuxs/bioinfo/minimap2/mi210_below50k.json /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_1kto10k.fa > out.paf +echo "Exit: $?" +./minimap2 -K 2000000000 -t 1 --gpu-chain --gpu-cfg /shared/prod/home/liuxs/bioinfo/minimap2/mi210_below50k.json /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_40kto50k.fa > out.paf +echo "Exit: $?" +./minimap2 -K 2000000000 -t 1 --gpu-chain --gpu-cfg /shared/prod/home/liuxs/bioinfo/minimap2/mi210_over50k.json /shareddata/umich_folder/data/ONT/hg38.mmi /shareddata/umich_folder/data/ONT/random_5GBases_100kto300k.fa > out.paf +echo "Exit: $?" 
+ +# Optional: You can add post-processing commands here + +# End of the script diff --git a/scripts/amdxfx_integrated.sh b/scripts/amdxfx_integrated.sh new file mode 100755 index 00000000..670ef9a3 --- /dev/null +++ b/scripts/amdxfx_integrated.sh @@ -0,0 +1,3 @@ +#!/bin/bash +make GPU=AMD DEBUG=analyze +./minimap2 -t 1 --gpu-chain --gpu-cfg gfx1030.json data/hg38.mmi data/ONT/random_500MBases_90kto100k.fa > out.paf diff --git a/scripts/compare_mem.py b/scripts/compare_mem.py new file mode 100644 index 00000000..2212a113 --- /dev/null +++ b/scripts/compare_mem.py @@ -0,0 +1,118 @@ +import re +import argparse +import matplotlib.pyplot as plt + +# Define a function to extract segments from a line +def extract_segments(line): + # Define a regular expression pattern to match the numbers + pattern = r'total anchors: (\d+), total segs: (\d+),.*long: (\d+) : (\d+)' + + # Use re.search to find the match in the text + match = re.search(pattern, line) + + if match: + total_anchors = int(match.group(1)) + total_segs = int(match.group(2)) + long_segs = int(match.group(3)) + long_anchors = int(match.group(4)) + + # print("Total anchors:", total_anchors) + # print("Total segs:", total_segs) + # print("Long segs:", long_segs) + # print("Long anchors:", long_anchors) + else: + print("Pattern not found in the text.") + + return total_anchors, total_segs, long_segs, long_anchors + +def extract_dataset(line): + # Define a regular expression pattern to match the values + pattern = r'(\d+k)to(\d+k)' + + # Use re.search to find the match in the text + match = re.search(pattern, line) + + if match: + # Extract the matched values + start_value = match.group(1) + end_value = match.group(2) + + # print("Start value:", start_value) + # print("End value:", end_value) + else: + print("Pattern not found in the text.") + return start_value, end_value + +# Create an argument parser to get the output file name from the command line +parser = argparse.ArgumentParser(description='Compute and plot a 
histogram of segments from an output file.') +parser.add_argument('output_file', help='Path to the output file containing segment data') +# parser.add_argument('profile_file', help='Path to the profile file containing kernel runtime') +args = parser.parse_args() + +# Read the output file specified in the command line argument +tasks = {} +with open(args.output_file, 'r') as file: + batches = [] + for line in file: + if line.startswith("[M::main] CMD: "): + # finish last batch + start_value, end_value = extract_dataset(line) + print(f"Finish dataset {start_value} to {end_value}") + if len(batches) > 0: + batch_anchors = 0 + batch_long_anchors = 0 + batch_segs = 0 + batch_long_segs = 0 + for batch in batches: + batch_anchors += batch[0] + batch_segs += batch[1] + batch_long_segs += batch[2] + batch_long_anchors += batch[3] + print(f"Long anchor pct: {batch[3] / batch[0] * 100:.2f}%") + print(f"Long seg pct: {batch[2] / batch[1] * 100:.5f}%") + print(f"Batch long anchor pct: {batch_long_anchors / batch_anchors * 100:.2f}%") + print(f"Batch long seg pct: {batch_long_segs / batch_segs * 100:.5f}%") + tasks[(start_value, end_value)] = batches + if line.startswith("[M::main::"): + print("Start new dataset...") + batches = [] + if line.startswith("[DEBUG] total anchors"): + total_anchors, total_segs, long_segs, long_anchors = extract_segments(line) + batches.append((total_anchors, total_segs, long_segs, long_anchors)) + +# visualize the data +# plot the bar chart of the long anchor percentage and long segment percentage with respect to the dataset name +ax, fig = plt.subplots() +x = [] +y = [] +ticks = [] + +values = [] +for key, value in tasks.items(): + values.append((int(key[0][:-1]), f"{key[0]}to{key[1]}", value[0][3] / value[0][0] * 100)) +# sort ticks by the first element of the tuple +ticks.sort(key=lambda tup: tup[0]) +for value in values: + ticks.append(value[1]) + x.append(value[0]) + y.append(value[2]) + +plt.bar(x, y) +plt.xticks(x, ticks, rotation=90) 
+plt.xlabel("Dataset") +plt.ylabel("Long anchor percentage [%]") +plt.title("Long anchor percentage vs. dataset") +plt.tight_layout() +# save the figure to a file with the same name as the input file +output_filename = args.output_file.split('.')[0] +print(f"Saving figure to {output_filename}_long_anchor_percentage.png") +plt.savefig(f'{output_filename}_long_anchor_percentage.png') + + + +# # Save the figure with an appropriate name based on the input file name +# output_filename = args.output_file #.split('.')[0] # Remove the file extension +# plt.savefig(f'{output_filename}_segment_histogram.png') + +# Display the histogram +# plt.show() diff --git a/scripts/data_analysis.ipynb b/scripts/data_analysis.ipynb new file mode 100644 index 00000000..98df4441 --- /dev/null +++ b/scripts/data_analysis.ipynb @@ -0,0 +1,1001 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: matplotlib in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (3.7.5)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (1.1.1)\n", + "Requirement already satisfied: cycler>=0.10 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (4.49.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (1.4.5)\n", + "Requirement already satisfied: numpy<2,>=1.20 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (1.24.4)\n", + "Requirement already satisfied: packaging>=20.0 in 
/shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (23.2)\n", + "Requirement already satisfied: pillow>=6.2.0 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (10.2.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (3.1.1)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (2.9.0)\n", + "Requirement already satisfied: importlib-resources>=3.2.0 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from matplotlib) (6.1.2)\n", + "Requirement already satisfied: zipp>=3.1.0 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.17.0)\n", + "Requirement already satisfied: six>=1.5 in /shared/prod/home/joydong/minimap2/.conda/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n" + ] + } + ], + "source": [ + "! 
pip install matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "oneK = 1024\n", + "oneM = 1024 * oneK\n", + "oneG = 1024 * oneM\n", + "from matplotlib import pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'total_anchors'\n", + "{'SOL': {}, '1kto5k': {'total_bases': 524288000, 'total_anchors': 2157742584, 'total_work': 17570719239, 'work_long': 5048145451, 'anchor_per_base': 4.1155673675537106, 'work_per_base': 33.51348731803894, 'avg_range': 8.143102596801695}, '10kto20k': {'total_bases': 524288000, 'total_anchors': 2372284362, 'total_work': 76583377636, 'work_long': 39767976508, 'anchor_per_base': 4.524773334503174, 'work_per_base': 146.071200630188, 'avg_range': 32.28254540759815}, '40kto50k': {'total_bases': 524288000, 'total_anchors': 2405098826, 'total_work': 165050618103, 'work_long': 131147302342, 'anchor_per_base': 4.587361957550049, 'work_per_base': 314.8090707836151, 'avg_range': 68.62529569211141}, '90kto100k': {'total_bases': 524288000, 'total_anchors': 2447974572, 'total_work': 257392983872, 'work_long': 222873697153, 'anchor_per_base': 4.669140953063965, 'work_per_base': 490.9381558837891, 'avg_range': 105.14528492904623}, '140kto150k': {'total_bases': 524288000, 'total_anchors': 2493238323, 'total_work': 343017810611, 'work_long': 311615099689, 'anchor_per_base': 4.75547470664978, 'work_per_base': 654.2545520992279, 'avg_range': 137.579230772557}}\n" + ] + } + ], + "source": [ + "dataset = {}\n", + "dataset_header = {\"total_bases\":0, \"total_anchors\":1, \"achors_long\":2, \"total_work\":3, \"work_long\":4, \"avg_range\":5, \"avg_range_long\":6, \"total_segs\":0, \"long_segs\":0, \"anchor_per_base\":7, \"work_per_base\":8}\n", + "\n", + "dataset[\"SOL\"] = {}\n", + "dataset[\"1kto5k\"] = { \n", + " \"total_bases\":500*oneM, \n", + " 
\"total_anchors\":2157742584, \n", + " \"total_work\":17570719239, \n", + " \"work_long\":5048145451\n", + "}\n", + "dataset[\"10kto20k\"] = { \n", + " \"total_bases\":500*oneM,\n", + " \"total_anchors\":2372284362,\n", + " \"total_work\":76583377636,\n", + " \"work_long\":39767976508\n", + "}\n", + "dataset[\"40kto50k\"] = {\n", + " \"total_bases\":500*oneM,\n", + " \"total_anchors\":2405098826,\n", + " \"total_work\":165050618103,\n", + " \"work_long\":131147302342,\n", + "} \n", + "dataset[\"90kto100k\"] = {\n", + " \"total_bases\":500*oneM,\n", + " \"total_anchors\":2447974572,\n", + " \"total_work\":257392983872,\n", + " \"work_long\":222873697153\n", + "}\n", + "dataset[\"140kto150k\"] = {\n", + " \"total_bases\":500*oneM,\n", + " \"total_anchors\":2493238323,\n", + " \"total_work\":343017810611,\n", + " \"work_long\":311615099689\n", + "}\n", + "\n", + "for key, item in dataset.items():\n", + " \n", + " try: \n", + " item[\"anchor_per_base\"] = item[\"total_anchors\"] / item[\"total_bases\"]\n", + " item[\"work_per_base\"] = item[\"total_work\"] / item[\"total_bases\"]\n", + " item[\"avg_range\"] = item[\"total_work\"] / item[\"total_anchors\"]\n", + " except Exception as e:\n", + " print(e)\n", + " \n", + "print(dataset) \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Extracting the keys and corresponding values\n", + "keys = list(dataset.keys())[1:] # excluding 'SOL'\n", + "anchor_per_base_values = [dataset[key]['anchor_per_base'] for key in keys]\n", + "work_per_base_values = [dataset[key]['work_per_base'] for key in keys]\n", + "\n", + "# Plotting\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "\n", + "color = 'tab:blue'\n", + "ax1.set_xlabel('Dataset Keys')\n", + "ax1.set_ylabel('Anchor per Base', color=color)\n", + "ax1.plot(keys, anchor_per_base_values, marker='o', color=color, label='Anchor per Base')\n", + "ax1.tick_params(axis='y', labelcolor=color)\n", + "ax1.set_ylim(bottom=0) # setting lower limit to 0 for primary y-axis\n", + "\n", + "ax2 = ax1.twinx() \n", + "color = 'tab:green'\n", + "ax2.set_ylabel('Work per Base', color=color)\n", + "ax2.plot(keys, work_per_base_values, marker='s', color=color, label='Work per Base')\n", + "ax2.tick_params(axis='y', labelcolor=color)\n", + "ax2.set_ylim(bottom=0) # setting lower limit to 0 for secondary y-axis\n", + "\n", + "fig.tight_layout() \n", + "plt.title('Anchor and Work per Base w.r.t Dataset Keys')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0: 1, \n", + "1: 0, \n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with open (\"../data/read_4f452f4a-d82a-4580-981b-32d14b997217.long_seg.range\", \"r\") as f:\n", + " text = f.read()\n", + " text_s = text.split(\"#\")\n", + "\n", + " text_s.pop(0)\n", + " print(text_s[0])\n", + " print(text_s[1])\n", + " \n", + " range_list = []\n", + " for item in text_s:\n", + " item = item.strip('\\n')\n", + " stripped_number = item.split(':')[1].strip(', ')\n", + " range_list.append(int(stripped_number)) \n", + " plt.hist(range_list, bins=100)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Range Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "for i in [40, 50, 60, 70, 80, 90, 100]:\n", + " with open(\"../mid_range_dis_l%d.csv\" % i, \"r\") as file:\n", + " lines = file.readlines() # Read all lines into a list\n", + "\n", + " with open(\"../last_mid_range_dis_l%d.csv\" % i, \"w+\") as file:\n", + " file.write(lines[0])\n", + " file.write(lines[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1647504372, 380090416, 92695059, 32601466, 16370200, 10385177, 7392392, 5679214, 4560064, 3785437]\n", + "[41909634, 11448864, 4485518, 2709970, 2070510, 1783081, 1575215, 1443760, 1338571, 1250311, 1174174, 1114907, 1051527, 1002328, 953713, 910297, 869125, 833108, 797669, 765226, 736393, 708454, 682753, 659905, 636737, 616486, 597261, 581347, 564090, 547151, 531283, 515540, 500663, 486728, 472705, 459859, 448150, 437260, 426333, 415405, 404493, 393476, 382675, 372297, 363457, 354355, 346335, 338912, 331773, 324710, 318087, 311742, 305397, 299543, 293507, 288783, 283586, 277722, 273054, 268531, 262763, 257072, 252209, 247583, 243414, 239682, 235681, 231623, 227933, 224192, 219739, 216698, 213165, 210636, 207412, 204392, 202120, 
200002, 197568, 195719, 193888, 191925, 190921, 189496, 187216, 185677, 183492, 182352, 180826, 179255, 177594, 176383, 174684, 173635, 172765, 171128, 169884, 168875, 167752, 166342]\n", + "[401446, 115445, 67434, 51379, 44628, 41106, 38168, 36967, 36278, 35770, 35748, 35063, 34702, 34579, 34028, 34064, 34025, 34379, 34742, 35388, 36137, 36333, 36988, 37623, 38324, 38893, 39528, 39724, 40281, 40745, 41343, 41754, 42093, 42407, 42675, 43138, 43605, 43731, 44036, 44442, 44913, 45518, 46023, 46559, 47012, 47430, 47665, 48193, 48483, 48718, 49049, 49454, 49814, 50338, 50753, 50925, 51155, 51438, 51656, 51966, 52033, 51973, 52113, 52073, 51875, 51629, 51556, 51570, 51516, 51665, 51995, 51997, 52221, 52557, 52988, 53365, 53244, 53033, 53035, 53007, 52985, 53001, 53345, 53449, 53693, 53701, 53792, 53927, 54003, 53890, 53902, 53704, 53550, 53602, 53352, 53545, 53462, 53474, 53589, 53797]\n", + "Precent of Anchors > 16 bases: 0.07880844352464043\n", + "Precent of Anchors > 16 bases in short kernels 0.005714174534554116\n", + "Precent of Anchors > 64 bases in short kernels 0.0009200053704311544\n", + "Precent of Anchors > 16 bases in mid kernels 0.5466267298249484\n", + "Precent of Anchors > 64 bases in mid kernels 0.41671160802516294\n", + "Precent of Anchors in Long 0.03593610834850576\n", + "Precent of Anchors in Mid 0.0699026543868098\n" + ] + } + ], + "source": [ + "import csv\n", + "range_file = \"../data/random_500MBases_40kto50k.range_dis.csv\"\n", + "long_range_file = \"../data/random_500MBases_40kto50k.long_range_dis.csv\"\n", + "long_range_file = \"../data/random_500MBases_40kto50k.long_range_dis.csv\"\n", + "\n", + "# range_file = \"../range_dis_5.csv\"\n", + "# long_range_file = \"../long_range_dis_5.csv\"\n", + "# long_range_file = \"../last_long_range_dis_5.csv\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "short_kernel_it = []\n", + "avg_range_short = []\n", + "avg_range_mid = []\n", + 
"mid_anchors_total = []\n", + "mid_larger_than_64 = []\n", + "mid_larger_than_1024 = []\n", + "long_anchors_total = []\n", + "long_larger_than_64 = []\n", + "long_larger_than_1024 = []\n", + "short_anchors_total = []\n", + "short_larger_than_64 = []\n", + "short_larger_than_1024 = []\n", + "\n", + "def read_csv_files_range_dis(range_file, mid_range_file, long_range_file):\n", + " with open (range_file, newline='') as csvfile:\n", + " reader = csv.reader(csvfile, delimiter=',')\n", + " header = next(reader)\n", + " range_row = next(reader)\n", + " \n", + " # Convert numerical values to integers (excluding headers)\n", + " ranges_numeric = [int(x) for x in range_row[2:]]\n", + " # print(ranges_numeric[0:10])\n", + "\n", + " with open(mid_range_file, newline='') as csvfile:\n", + " reader = csv.reader(csvfile, delimiter=',')\n", + " header = next(reader)\n", + " mid_range_row = next(reader)\n", + " \n", + " # Convert numerical values to integers (excluding headers)\n", + " mid_ranges_numeric = [int(x) for x in mid_range_row[2:]]\n", + " # print(mid_ranges_numeric[0:100])\n", + "\n", + " with open(long_range_file, newline='') as csvfile:\n", + " reader = csv.reader(csvfile, delimiter=',')\n", + " header = next(reader)\n", + " long_range_row = next(reader)\n", + " \n", + " # Convert numerical values to integers (excluding headers)\n", + " long_ranges_numeric = [int(x) for x in long_range_row[2:]]\n", + " # print(long_ranges_numeric[0:100])\n", + "\n", + " short_ranges_numeric = [ x - y - z for x, y, z in zip(ranges_numeric, mid_ranges_numeric, long_ranges_numeric)]\n", + " \n", + " sum_short_kernel_it = 0\n", + " for i in range(0, len(short_ranges_numeric)):\n", + " sum_short_kernel_it += short_ranges_numeric[i]* (-(-i // 64))\n", + " \n", + " short_kernel_it.append(sum_short_kernel_it/sum(short_ranges_numeric))\n", + " # print(\"Precent of Anchors > 16 bases: \", sum(ranges_numeric[16:]) / sum(ranges_numeric))\n", + " # print(\"Precent of Anchors > 16 bases in short 
kernels\", sum(short_ranges_numeric[16:]) / sum(short_ranges_numeric))\n", + " # print(\"Precent of Anchors > 64 bases in short kernels\", sum(short_ranges_numeric[64:]) / sum(short_ranges_numeric))\n", + " # print(\"Precent of Anchors > 16 bases in mid kernels\", sum(mid_ranges_numeric[16:]) / sum(mid_ranges_numeric))\n", + " # print(\"Precent of Anchors > 64 bases in mid kernels\", sum(mid_ranges_numeric[64:]) / sum(mid_ranges_numeric))\n", + " # print(\"Precent of Anchors in Long \", sum(long_ranges_numeric) / sum(ranges_numeric))\n", + " # print(\"Precent of Anchors in Mid \", sum(mid_ranges_numeric) / sum(ranges_numeric))\n", + " \n", + " \n", + " \n", + " workload_range_dis = [value * index for index, value in enumerate(ranges_numeric)]\n", + " workload_long_range_dis = [value * index for index, value in enumerate(long_ranges_numeric)]\n", + " workload_mid_range_dis = [value * index for index, value in enumerate(mid_ranges_numeric)]\n", + " workload_short_range_dis = [x - y - z for x, y, z in zip(workload_range_dis, workload_mid_range_dis, workload_long_range_dis)]\n", + " \n", + " avg_range_short.append(sum(workload_short_range_dis) / sum(short_ranges_numeric))\n", + " avg_range_mid.append(sum(workload_mid_range_dis) / sum(mid_ranges_numeric))\n", + " \n", + " mid_anchors_total.append(sum(mid_ranges_numeric))\n", + " mid_larger_than_64.append(sum(mid_ranges_numeric[64:]))\n", + " mid_larger_than_1024.append(sum(mid_ranges_numeric[1024:]))\n", + " long_anchors_total.append(sum(long_ranges_numeric))\n", + " long_larger_than_64.append(sum(long_ranges_numeric[64:]))\n", + " long_larger_than_1024.append(sum(long_ranges_numeric[1024:]))\n", + " short_anchors_total.append(sum(short_ranges_numeric))\n", + " short_larger_than_64.append(sum(short_ranges_numeric[64:]))\n", + " short_larger_than_1024.append(sum(short_ranges_numeric[1024:]))" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Percentage of Anchors >64 0.06455149797657421\n" + ] + } + ], + "source": [ + "\n", + "MID_CUT_list = [1,2,3,4,5,6,7,8,9,10,12,15,18]\n", + "for i in MID_CUT_list:\n", + " range_file = \"../range_dis_%d.csv\" % i;\n", + " long_range_file = \"../long_range_dis_%d.csv\" % i;\n", + " mid_range_file = \"../last_mid_range_dis_%d.csv\" % i;\n", + " read_csv_files_range_dis(range_file, mid_range_file, long_range_file)\n", + " \n", + "# print(short_kernel_it)\n", + "# print(avg_range_mid)\n", + "# print(avg_range_short)\n", + "# print(mid_larger_than_1024)\n", + "# print(mid_larger_than_64)\n", + "print(\"Percentage of Anchors >64 \", sum(ranges_numeric[64:]) / sum(ranges_numeric))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Define x-axis values from 2 to 10\n", + "x_values = MID_CUT_list\n", + "\n", + "# Plotting\n", + "plt.plot(x_values, short_kernel_it, marker='o', color='blue')\n", + "plt.xlabel('MID_CUT')\n", + "plt.ylabel('Extra iterations in short kernel for range>64')\n", + "plt.title('Short Kernel Iteration Values')\n", + "plt.grid(True)\n", + "\n", + "# Set the lower limit of y-axis to 0\n", + "plt.ylim(0.2, max(short_kernel_it)*1.1)\n", + "\n", + "plt.show()\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Calculate the percentage of elements for each range\n", + "short_less_than_64 = [x - y for x, y in zip(short_anchors_total, short_larger_than_64)]\n", + "short_less_than_1024 = [x - y for x, y in zip(short_anchors_total, short_larger_than_1024)]\n", + "short_between_64_and_1024 = [x - y for x, y in zip(short_larger_than_64, short_larger_than_1024)]\n", + "\n", + "# Plotting\n", + "plt.bar(MID_CUT_list, short_less_than_64, color='b', label='<64')\n", + "plt.bar(MID_CUT_list, short_between_64_and_1024, bottom=short_less_than_64, color='g', label='64-1024')\n", + "plt.bar(MID_CUT_list, short_larger_than_1024, bottom=short_less_than_1024, color='r', label='>1024')\n", + "\n", + "plt.xlabel('MID_CUT_list')\n", + "plt.ylabel('Percentage of Anchors in Short Kernel')\n", + "plt.title('Percentage of Anchors in Different Ranges vs MID_CUT_list')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Calculate the percentage of elements for each range\n", + "mid_less_than_64 = [x - y for x, y in zip(mid_anchors_total, mid_larger_than_64)]\n", + "mid_less_than_1024 = [x - y for x, y in zip(mid_anchors_total, mid_larger_than_1024)]\n", + "mid_between_64_and_1024 = [x - y for x, y in zip(mid_larger_than_64, mid_larger_than_1024)]\n", + "\n", + "# Plotting\n", + "plt.bar(MID_CUT_list, mid_less_than_64, color='b', label='<64')\n", + "plt.bar(MID_CUT_list, mid_between_64_and_1024, bottom=mid_less_than_64, color='g', label='64-1024')\n", + "plt.bar(MID_CUT_list, mid_larger_than_1024, bottom=mid_less_than_1024, color='r', label='>1024')\n", + "\n", + "plt.xlabel('MID_CUT_list')\n", + "plt.ylabel('Percentage of Anchors in Mid Kernel')\n", + "plt.title('Percentage of Elements in Different Ranges vs MID_CUT_list')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Calculate the percentage of elements for each range\n", + "long_ratio_less_than_64 = [1 - x for x in long_ratio_larger_than_64]\n", + "long_ratio_less_than_1024 = [1 - y for y in long_ratio_larger_than_1024]\n", + "long_ratio_between_64_and_1024 = [x - y for x, y in zip(long_ratio_larger_than_64, long_ratio_larger_than_1024)]\n", + "\n", + "# Plotting\n", + "plt.bar(MID_CUT_list, long_ratio_less_than_64, color='b', label='<64')\n", + "plt.bar(MID_CUT_list, long_ratio_between_64_and_1024, bottom=long_ratio_less_than_64, color='g', label='64-1024')\n", + "plt.bar(MID_CUT_list, long_ratio_larger_than_1024, bottom=long_ratio_less_than_1024, color='r', label='>1024')\n", + "\n", + "plt.xlabel('MID_CUT_list')\n", + "plt.ylabel('Percentage of Elements')\n", + "plt.title('Percentage of Elements in Different Ranges vs MID_CUT_list')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "# Plot avg_range_mid\n", + "color = 'tab:blue'\n", + "ax1.set_xlabel('MID_CUT_list')\n", + "ax1.set_ylabel('avg_range_mid', color=color)\n", + "ax1.plot(MID_CUT_list, avg_range_mid, marker='o', color=color, linestyle='-', label='avg_range_mid')\n", + "ax1.tick_params(axis='y', labelcolor=color)\n", + "\n", + "# Create a twin axis for avg_range_short\n", + "ax2 = ax1.twinx()\n", + "color = 'tab:red'\n", + "ax2.set_ylabel('avg_range_short', color=color)\n", + "ax2.plot(MID_CUT_list, avg_range_short, marker='s', color=color, linestyle='-', label='avg_range_short')\n", + "ax2.tick_params(axis='y', labelcolor=color)\n", + "\n", + "fig.tight_layout() # Adjust layout to prevent clipping of labels\n", + "plt.title('Average Range vs MID_CUT_list')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.27324480968881387, 0.2732439499409883, 0.2732441043488688, 0.27324472765034524, 0.2732448385061006, 0.2732448079539508, 0.2732448385061006]\n", + "[337.4520629293166, 377.98481955068604, 414.66843947960416, 452.74455885586076, 484.31032239941436, 511.4723277400877, 530.4147211089912]\n", + "[1.4844603719646354, 1.484417987583865, 1.484424329880518, 1.484456630712539, 1.4844610114824295, 1.4844606340443962, 1.4844610114824295]\n", + "[4490384, 6213671, 7792645, 9398573, 10859087, 12234518, 13268558]\n", + "[61704426, 67802586, 73509943, 78080017, 82088873, 85566675, 88477861]\n" + ] + } + ], + "source": [ + "import csv\n", + "LONG_CUT_list = [40,50,60,70,80,90,100]\n", + "\n", + "short_kernel_it = []\n", + "avg_range_short = []\n", + "avg_range_mid = []\n", + "mid_anchors_total = []\n", + "mid_larger_than_64 = []\n", + "mid_larger_than_1024 = []\n", + "long_anchors_total = []\n", + 
"long_larger_than_64 = []\n", + "long_larger_than_1024 = []\n", + "short_anchors_total = []\n", + "short_larger_than_64 = []\n", + "short_larger_than_1024 = []\n", + "\n", + "for i in LONG_CUT_list:\n", + " range_file = \"../range_dis_l%d.csv\" % i;\n", + " long_range_file = \"../long_range_dis_l%d.csv\" % i;\n", + " mid_range_file = \"../last_mid_range_dis_l%d.csv\" % i;\n", + " read_csv_files_range_dis(range_file, mid_range_file, long_range_file)\n", + "\n", + "print(short_kernel_it)\n", + "print(avg_range_mid)\n", + "print(avg_range_short)\n", + "print(mid_larger_than_1024)\n", + "print(mid_larger_than_64)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "bin_size = 50\n", + "# Determine the number of bins (each representing 100 elements)\n", + "num_bins = len(ranges_numeric) // bin_size\n", + "\n", + "# Calculate the cumulative sum for each bin\n", + "cumulative_sum = [sum(ranges_numeric[i*bin_size:(i+1)*bin_size]) for i in range(num_bins)]\n", + "cumulative_long_sum = [sum(long_ranges_numeric[i*bin_size:(i+1)*bin_size]) for i in range(num_bins)]\n", + "cumulative_mid_sum = [sum(mid_ranges_numeric[i*bin_size:(i+1)*bin_size]) for i in range(num_bins)]\n", + "cumulative_mid_sum = [x + y for x, y in zip(cumulative_mid_sum, cumulative_long_sum)]\n", + "\n", + "# Plotting\n", + "plt.bar(range(num_bins), cumulative_sum, color='skyblue')\n", + "\n", + "# # Plot the subset (mid) with a different color\n", + "# for i, value in enumerate(cumulative_mid_sum):\n", + "# plt.bar(i, value, color='green')\n", + "\n", + "# # Plot the subset (long) with a different color\n", + "# for i, value in enumerate(cumulative_long_sum):\n", + "# plt.bar(i, value, color='orange')\n", + " \n", + " \n", + "# Add vertical line at x=64 and color it red\n", + "plt.axvline(x=64.0/bin_size, color='red')\n", + "\n", + "# Add vertical line at x=1000 and color it red\n", + "plt.axvline(x=1000/bin_size, color='blue')\n", + "\n", + "# Set upper limit of y-axis to 20m\n", + "plt.ylim(0, 20000000)\n", + "\n", + "# Label the first bar\n", + "# plt.text(0, cumulative_sum[0], str(cumulative_sum[0]), ha='right')\n", + "plt.xlabel('Bins (Each bin represents %d elements)' % bin_size)\n", + "plt.ylabel('Cumulative Sum of Elements')\n", + "plt.title('Histogram of Range Distribution (Each bin represents %d elements)' % bin_size)\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 380090416, 185390118, 97804398, 65480800, 
51925885, 44354352, 39754498, 36480512, 34068933]\n", + "[0, 238157, 266756, 309327, 362744, 426585, 496920, 578872, 664400, 749259]\n", + "[0, 32730, 40474, 47427, 57552, 67300, 74724, 85841, 97984, 111186]\n", + "Precent of Workload > 16 bases: 0.9932645365174807\n", + "Precent of Workload > 64 bases: 0.9863263540183071\n", + "Precent of Workload > 64 bases in short: 0.8996161386626471\n", + "Precent of Workload in Long 0.596239090529109\n", + "Precent of Workload in Mid 0.2807532365863811\n", + "Precent of Workload in Short 0.12300767288450998\n", + "Avg range: 8.842975553696157 899.5018304893736 1705.7212686890798\n" + ] + } + ], + "source": [ + "\n", + "print(workload_range_dis[0:10])\n", + "print(workload_mid_range_dis[0:10])\n", + "print(workload_long_range_dis[0:10])\n", + "\n", + "\n", + "print(\"Precent of Workload > 16 bases: \", sum(workload_range_dis[16:]) / sum(workload_range_dis))\n", + "print(\"Precent of Workload > 64 bases: \", sum(workload_range_dis[64:]) / sum(workload_range_dis))\n", + "print(\"Precent of Workload > 64 bases in short: \", sum(workload_short_range_dis[64:]) / sum(workload_short_range_dis))\n", + "print(\"Precent of Workload in Long \", sum(workload_long_range_dis) / sum(workload_range_dis))\n", + "print(\"Precent of Workload in Mid \", sum(workload_mid_range_dis) / sum(workload_range_dis))\n", + "print(\"Precent of Workload in Short \", 1 - (sum(workload_long_range_dis) + sum(workload_mid_range_dis)) / sum(workload_range_dis))\n", + "print(\"Avg range: \", (sum(workload_range_dis) - sum(workload_long_range_dis) - sum(workload_mid_range_dis))/(sum(ranges_numeric) - sum(long_ranges_numeric) - sum(mid_ranges_numeric))\n", + " ,sum(workload_mid_range_dis) / sum(mid_ranges_numeric), sum(workload_long_range_dis) / sum(long_ranges_numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'workload_range_dis' is not defined", + "output_type": 
"error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m bin_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m50\u001b[39m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Determine the number of bins (each representing 100 elements)\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m num_bins \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[43mworkload_range_dis\u001b[49m) \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m bin_size\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Calculate the cumulative sum for each bin\u001b[39;00m\n\u001b[1;32m 6\u001b[0m cul_wl_range_dis \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28msum\u001b[39m(workload_range_dis[i\u001b[38;5;241m*\u001b[39mbin_size:(i\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m)\u001b[38;5;241m*\u001b[39mbin_size]) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(num_bins)]\n", + "\u001b[0;31mNameError\u001b[0m: name 'workload_range_dis' is not defined" + ] + } + ], + "source": [ + "bin_size = 50\n", + "# Determine the number of bins (each representing 100 elements)\n", + "num_bins = len(workload_range_dis) // bin_size\n", + "\n", + "# Calculate the cumulative sum for each bin\n", + "cul_wl_range_dis = [sum(workload_range_dis[i*bin_size:(i+1)*bin_size]) for i in range(num_bins)]\n", + "cul_wl_long_range_dis = [sum(workload_long_range_dis[i*bin_size:(i+1)*bin_size]) for i in range(num_bins)]\n", + "cul_wl_mid_range_dis = [sum(workload_mid_range_dis[i*bin_size:(i+1)*bin_size]) for i in range(num_bins)]\n", + "cul_wl_mid_range_dis = [x + y for x, y in zip(cul_wl_mid_range_dis, cul_wl_long_range_dis)]\n", + "\n", + "\n", + "# Plotting\n", + "plt.bar(range(num_bins), cul_wl_range_dis, color='skyblue')\n", + "\n", + "# Plot the subset (mid) with 
a different color\n", + "# for i, value in enumerate(cul_wl_mid_range_dis):\n", + "# plt.bar(i, value, color='green')\n", + "\n", + "# # Plot the subset (long) with a different color\n", + "# for i, value in enumerate(cul_wl_long_range_dis):\n", + "# plt.bar(i, value, color='orange')\n", + "\n", + "# Add vertical line at x=64 and color it red\n", + "plt.axvline(x=64.0/50, color='red')\n", + "\n", + "# Add vertical line at x=1000 and color it red\n", + "plt.axvline(x=1000/50, color='blue')\n", + "\n", + "# Set upper limit of y-axis to 20m\n", + "plt.ylim(0, 4000000000)\n", + "\n", + "# Label the first bar\n", + "# plt.text(0, cumulative_sum[0], str(cumulative_sum[0]), ha='right')\n", + "plt.xlabel('Range (Each bin represents %d in range)' % bin_size)\n", + "plt.ylabel('Cumulative Workload')\n", + "plt.title('Workload Distribution w.r.t. Anchor Range(Each bin represents %d elements)' % bin_size)\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'seg_len': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960, 970, 980, 990, 1000, 1010, 1020, 1030, 1040, 1050, 1060, 1070, 1080, 1090, 1100, 1110, 1120, 1130, 1140, 1150, 1160, 1170, 1180, 1190, 1200, 1210, 1220, 1230, 1240, 1250, 1260, 1270, 1280, 1290, 1300, 1310, 1320, 1330, 1340, 1350, 1360, 1370, 1380, 1390, 1400, 1410, 1420, 1430, 1440, 1450, 1460, 1470, 1480, 1490, 1500, 1510, 1520, 1530, 
1540, 1550, 1560, 1570, 1580, 1590, 1600, 1610, 1620, 1630, 1640, 1650, 1660, 1670, 1680, 1690, 1700, 1710, 1720, 1730, 1740, 1750, 1760, 1770, 1780, 1790, 1800, 1810, 1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030, 2040, 2050, 2060, 2070, 2080, 2090, 2100, 2110, 2120, 2130, 2140, 2150, 2160, 2170, 2180, 2190, 2200, 2210, 2220, 2230, 2240, 2250, 2260, 2270, 2280, 2290, 2300, 2310, 2320, 2330, 2340, 2350, 2360, 2370, 2380, 2390, 2400, 2410, 2420, 2430, 2440, 2450, 2460, 2470, 2480, 2490, 2500, 2510, 2520, 2530, 2540, 2550, 2560, 2570, 2580, 2590, 2600, 2610, 2620, 2630, 2640, 2650, 2660, 2670, 2680, 2690, 2700, 2710, 2720, 2730, 2740, 2750, 2760, 2770, 2780, 2790, 2800, 2810, 2820, 2830, 2840, 2850, 2860, 2870, 2880, 2890, 2900, 2910, 2920, 2930, 2940, 2950, 2960, 2970, 2980, 2990, 3000, 3010, 3020, 3030, 3040, 3050, 3060, 3070, 3080, 3090, 3100, 3110, 3120, 3130, 3140, 3150, 3160, 3170, 3180, 3190, 3200, 3210, 3220, 3230, 3240, 3250, 3260, 3270, 3280, 3290, 3300, 3310, 3320, 3330, 3340, 3350, 3360, 3370, 3380, 3390, 3400, 3410, 3420, 3430, 3440, 3450, 3460, 3470, 3480, 3490, 3500, 3510, 3520, 3530, 3540, 3550, 3560, 3570, 3580, 3590, 3600, 3610, 3620, 3630, 3640, 3650, 3660, 3670, 3680, 3690, 3700, 3710, 3720, 3730, 3740, 3750, 3760, 3770, 3780, 3790, 3800, 3810, 3820, 3830, 3840, 3850, 3860, 3870, 3880, 3890, 3900, 3910, 3920, 3930, 3940, 3950, 3960, 3970, 3980, 3990, 4000, 4010, 4020, 4030, 4040, 4050, 4060, 4070, 4080, 4090, 4100, 4110, 4120, 4130, 4140, 4150, 4160, 4170, 4180, 4190, 4200, 4210, 4220, 4230, 4240, 4250, 4260, 4270, 4280, 4290, 4300, 4310, 4320, 4330, 4340, 4350, 4360, 4370, 4380, 4390, 4400, 4410, 4420, 4430, 4440, 4450, 4460, 4470, 4480, 4490, 4500, 4510, 4520, 4530, 4540, 4550, 4560, 4570, 4580, 4590, 4600, 4610, 4620, 4630, 4640, 4650, 4660, 4670, 4680, 4690, 4700, 4710, 4720, 4730, 4740, 4750, 4760, 4770, 4780, 4790, 4800, 4810, 4820, 4830, 4840, 4850, 4860, 
4870, 4880, 4890, 4900, 4910, 4920, 4930, 4940, 4950, 4960, 4970, 4980, 4990, 5000, 0], 'sc_pairs': [1451670689, 613181299, 568356128, 674102799, 932969857, 1521042310, 2303006315, 2864751658, 2546465706, 1661846771, 924058335, 694517700, 616875762, 557122231, 688599394, 543951867, 556802567, 583184395, 670911519, 599102236, 6405597539, 5925198166, 6168152676, 6096282938, 6165475981, 5488171150, 4962916075, 3856673093, 3260455805, 3058574087, 1782436146, 3015530241, 1298968520, 755362931, 584932112, 1333851187, 1088231240, 883704715, 441135000, 1039773429, 822065952, 2024091506, 347627661, 144466384, 167105010, 158689277, 490529429, 271235480, 1289346260, 295940672, 392852811, 249756261, 154679974, 425611728, 78505257, 244114103, 178221174, 211959258, 0, 0, 336770524, 120983751, 0, 0, 1295674869, 154421842, 0, 266719583, 1577314857, 0, 265146192, 770824544, 0, 0, 651454342, 0, 325083871, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1029457004, 960363363, 551135677, 0, 349882748, 0, 381374599, 0, 0, 1282631145, 0, 0, 658163034, 1293831795, 0, 790207861, 439746117, 451695263, 1237012484, 0, 0, 0, 786405279, 569040198, 0, 0, 1758267248, 1825384859, 1507163659, 0, 1933830031, 1680695255, 2014499839, 0, 721037322, 0, 0, 3345348809, 727303963, 1922426868, 771150439, 1205069222, 0, 1402244004, 0, 0, 0, 0, 2900623330, 0, 0, 3038426068, 0, 0, 984562116, 0, 0, 0, 0, 0, 0, 0, 5196154950, 0, 0, 0, 0, 0, 0, 3439894017, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2465188264, 0, 3206858364, 0, 2629782562, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3040384243, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7658240445, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'anchors': [2150545312, 53131104, 15035832, 9288032, 8024632, 8710427, 9978669, 10345426, 8393974, 5531815, 3156567, 2453959, 2225756, 1894193, 2007045, 1789045, 1715866, 1661201, 1479503, 1394168, 11392634, 8513016, 6594519, 6096911, 4926205, 4274565, 3726120, 3117720, 2411938, 2235322, 1227392, 1661821, 1266791, 1190940, 688279, 1086356, 847387, 897036, 738855, 329723, 808042, 974746, 501756, 521208, 542727, 419837, 583202, 757249, 627776, 807944, 999940, 513029, 354302, 729094, 186879, 387074, 395770, 404472, 0, 0, 437768, 221184, 0, 0, 948727, 245250, 0, 251904, 1034233, 0, 271360, 272889, 0, 0, 577532, 0, 301568, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1047047, 712695, 359424, 0, 371200, 0, 383487, 0, 0, 795139, 0, 0, 413184, 835069, 0, 429055, 434688, 440322, 885247, 0, 0, 0, 462334, 467457, 0, 0, 482304, 491520, 985087, 0, 502784, 1019390, 517119, 0, 525827, 0, 0, 1078271, 544256, 1100788, 555521, 559104, 0, 573441, 0, 0, 0, 0, 1189886, 0, 0, 1222152, 0, 0, 627199, 0, 0, 0, 0, 0, 0, 0, 2008574, 0, 0, 0, 0, 0, 0, 705025, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 799234, 0, 813052, 0, 824320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 884224, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1989124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'segs': [4194747, 51844, 9779, 4530, 3132, 2833, 2782, 2524, 1821, 1080, 560, 399, 334, 264, 261, 218, 197, 180, 152, 136, 895, 475, 284, 215, 148, 111, 85, 64, 45, 38, 19, 24, 17, 15, 8, 12, 9, 9, 7, 3, 7, 8, 4, 4, 4, 3, 4, 5, 4, 5, 6, 3, 2, 4, 1, 2, 2, 2, 0, 0, 2, 1, 0, 0, 4, 1, 0, 1, 4, 0, 1, 1, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 1, 0, 1, 0, 1, 0, 0, 2, 0, 0, 1, 2, 0, 1, 1, 1, 2, 0, 0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 2, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 139867549361536, 0]}\n" + ] + } + ], + "source": [ + "import csv\n", + "sc_pair_dis = {}\n", + "with open(\"../sc_pair_dis.csv\") as csvfile:\n", + " reader = csv.reader(csvfile, delimiter=',')\n", + " for row in reader:\n", + " sc_pair_dis[row[0]] = [int(x) if x.isdigit() else 0 for x in row[1:]]\n", + " print(sc_pair_dis)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 40, 50, 60, 70, 80, 90, 100]\n", + "[1451670689, 613181299, 568356128, 674102799, 932969857, 1521042310, 2303006315, 2864751658, 2546465706, 1661846771, 924058335, 694517700, 616875762, 557122231, 688599394, 543951867, 556802567, 583184395, 670911519, 599102236, 6405597539, 5925198166, 6168152676, 6096282938, 6165475981, 5488171150, 4962916075, 3856673093, 3260455805]\n", + "[2150545312, 53131104, 15035832, 9288032, 8024632, 8710427, 9978669, 10345426, 8393974, 5531815, 3156567, 2453959, 2225756, 1894193, 2007045, 1789045, 1715866, 1661201, 1479503, 1394168, 11392634, 8513016, 6594519, 6096911, 4926205, 4274565, 3726120, 3117720, 2411938]\n", + "[4194747, 51844, 9779, 4530, 3132, 2833, 2782, 2524, 1821, 1080, 560, 399, 334, 264, 261, 218, 197, 180, 152, 136, 895, 475, 284, 215, 148, 111, 85, 64, 45]\n" + ] + } + ], + "source": [ + "print(sc_pair_dis['seg_len'][:29])\n", + "print(sc_pair_dis['sc_pairs'][:29])\n", + "print(sc_pair_dis['anchors'][:29])\n", + "print(sc_pair_dis['segs'][:29])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "percentage of anchors\n", + "[0.675024460493674, 11.540910179468508, 37.800111626679524, 72.57757068451099, 116.26325755498819, 174.62316256137615, 230.79293591159302, 276.90997528762955, 303.3683099328161, 300.416187273074, 292.74155593719377, 283.0192761981761, 277.1533636211696, 294.12115396899895, 343.0911583945552, 304.0459390345128, 324.5023603241745, 351.06190942577086, 453.4708743409104, 429.7202603990337, 562.2578184289954, 696.0163314623161, 935.3453490694318, 999.8969868512104, 1251.5670746548305, 1283.9133689626897, 1331.925991379773, 1237.0171449007607, 1351.7991776737213, 1368.2923923264746, 1452.2142445119407, 1814.5938948900032, 1025.4008119729301, 634.2577552185669, 849.8473903751241, 1227.821438828524, 1284.2198900856397, 985.1385172947352, 597.0521956270175, 3153.475581018006, 1017.3554740966435, 2076.532251478847, 692.8221306770622, 277.1760679037927, 307.89883311499153, 377.9783034844475, 841.0969595440345, 358.18532609485123, 2053.8317170455703, 366.28859425900805, 392.87638358301496, 486.82678951872117, 436.57663236448, 583.7542593959078, 420.0860289278089, 630.665203552809, 450.3150162973444, 524.0393846792856, 769.2899526689936, 546.9823811848959, 1365.6983189052278, 629.6507319062182, 1058.8144015180387, 1525.1059065026932, 977.1012382075472, 2824.6816251296314, 1127.9969629388502, 1077.978668161078, 983.2003759143572, 1347.5096121061604, 1533.385853476674, 942.5720581896552, 994.4915968468292, 1613.0904722318992, 1592.9054222815985, 1549.3711238233009, 1841.740245423081, 1011.6362011373675, 1025.8294225589455, 1397.3642203814302, 1700.9462401640374, 1217.3102509963483, 3645.5580878450105, 3713.7550028483074, 1529.980254535894, 3846.244174436736, 1648.726449150963, 3895.6213927548592, 1371.2443864617069, 3102.512085551777, 1336.3269545948965, 1746.4097246699637, 1388.1571335737083, 2155.357897636218, 2445.3152181305486, 2437.73212727942, 
2486.1278040701973, 1569.7762847198417, 2586.9870614674887, 4879.109275557605, 3084.4386800361344, 3944.222957449216, 3190.2447617430125, 3438.477402784815]\n", + "[0.9799908606544164, 0.012111969131813567, 0.0022846027725485084, 0.0010583137907398245, 0.0007317083427366733, 0.0006618549600807776, 0.0006499401690592034, 0.0005896653438912399, 0.00042542812647620754, 0.00025231322163333565, 0.00013082907788395181, 9.321571799231566e-05, 7.803020002364268e-05, 6.167656528814871e-05, 6.0975695228056105e-05, 5.0929891033395525e-05, 4.602380061274733e-05, 4.2052203605555935e-05, 3.5510749711358347e-05, 3.177277605753115e-05, 2.090929012609587e-05, 1.1097109284799482e-05, 6.634903235543271e-06, 5.022902097330292e-06, 3.457625629790155e-06, 2.593219222342616e-06, 1.985798503595697e-06, 1.4951894615308778e-06, 1.0513050901388984e-06, 8.877687427839586e-07, 4.438843713919793e-07, 5.606960480740792e-07, 3.971597007191394e-07, 3.504350300462995e-07, 1.8689868269135972e-07, 2.803480240370396e-07, 2.1026101802777968e-07, 2.1026101802777968e-07, 1.6353634735493976e-07, 7.00870060092599e-08, 1.6353634735493976e-07, 1.8689868269135972e-07, 9.344934134567986e-08, 9.344934134567986e-08, 9.344934134567986e-08, 7.00870060092599e-08, 9.344934134567986e-08, 1.1681167668209982e-07, 9.344934134567986e-08, 1.1681167668209982e-07, 1.401740120185198e-07, 7.00870060092599e-08, 4.672467067283993e-08, 9.344934134567986e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 4.672467067283993e-08, 4.672467067283993e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 9.344934134567986e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 9.344934134567986e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 7.00870060092599e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 
2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 7.00870060092599e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from matplotlib import pyplot as plt\n", + "# Data\n", + "seg_len = sc_pair_dis['seg_len'][0:200]\n", + "sc_pairs = sc_pair_dis['sc_pairs'][0:200]\n", + "anchors = sc_pair_dis['anchors'][0:200]\n", + "segs = sc_pair_dis['segs'][0:200]\n", + "avg_range = [ (x/y) if (y!=0) else 0 for x,y in zip(sc_pairs, anchors) ]\n", + "\n", + "\n", + "filtered_indices = [i for i in range(len(sc_pairs)) if sc_pairs[i] != 0 and anchors[i] != 0 and segs[i] != 0]\n", + "seg_len_filtered = [seg_len[i]*512 for i in filtered_indices]\n", + "avg_range_filtered = [avg_range[i] for i in filtered_indices]\n", + "segs_filtered = [segs[i] for i in filtered_indices]\n", + "anchors_filtered = [anchors[i] for i in filtered_indices]\n", + "segs_filtered = [x / sum(segs_filtered) for x in segs_filtered]\n", + "segs_filtered[20:] = [x/10 for x in segs_filtered[20:]]\n", + "\n", + "\n", + "# Plotting\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "# Plot sc_pairs on the first y-axis\n", + "ax1.scatter(seg_len_filtered, avg_range_filtered, label='avg_range', color='blue')\n", + "ax1.set_xlabel('seg_len')\n", + "ax1.set_ylabel('avg_range', color='blue')\n", + "ax1.tick_params(axis='y', labelcolor='blue')\n", + "ax1.set_ylim(0, 5000)\n", + "# ax1.set_xscale('log') # Set logarithmic scale for avg_range\n", + "\n", + "print(\"percentage of anchors\")\n", + "anchors_filtered = [x / sum(anchors_filtered) for x in anchors_filtered]\n", + "print(avg_range_filtered)\n", + "\n", + "# # Plot sc_pairs on the first y-axis\n", + "# ax1.plot(seg_len, sc_pairs, label='sc_pairs', color='blue')\n", + "# ax1.set_xlabel('seg_len')\n", + "# ax1.set_ylabel('sc_pairs', color='blue')\n", + "# ax1.tick_params(axis='y', labelcolor='blue')\n", + "\n", + "# Create a second y-axis for anchors\n", + "ax2 = ax1.twinx()\n", + "ax2.plot(seg_len_filtered, anchors_filtered, label='anchors', color='green')\n", + "ax2.set_ylabel('anchors', 
color='green')\n", + "ax2.tick_params(axis='y', labelcolor='green')\n", + "ax2.set_yscale('log') # Set logarithmic scale for segs\n", + "ax2.set_ylim(0.00000001, 5)\n", + "\n", + "# Create a third y-axis for segs\n", + "ax3 = ax1.twinx()\n", + "ax3.plot(seg_len_filtered, segs_filtered, label='segs', color='red')\n", + "ax3.set_ylabel('segs', color='red')\n", + "ax3.tick_params(axis='y', labelcolor='red')\n", + "ax3.set_yscale('log') # Set logarithmic scale for segs\n", + "ax3.set_ylim(0.00000001, 5)\n", + "print(segs_filtered)\n", + "# print([x / sum(segs_filtered) for x in segs_filtered])\n", + "\n", + "# Add legend\n", + "lines = ax1.get_lines() + ax3.get_lines()\n", + "ax1.legend(lines, [line.get_label() for line in lines], loc='upper left')\n", + "\n", + "# Add horizontal lines\n", + "line_64 = ax1.axhline(y=64, color='c', linestyle='--')\n", + "line_1024 = ax1.axhline(y=1024, color='c', linestyle='--')\n", + "# # Calculate midpoints of horizontal lines\n", + "# midpoint_64 = (line_64.get_ydata()[0] + line_64.get_ydata()[1]) / 2\n", + "# midpoint_1024 = (line_1024.get_ydata()[0] + line_1024.get_ydata()[1]) / 2\n", + "# # # Add labels at midpoints\n", + "# ax1.text(0, midpoint_64, 'avg_range = 64', ha='right', va='center', color='c')\n", + "# ax1.text(0, midpoint_1024, 'avg_range = 1024', ha='right', va='center', color='c')\n", + "\n", + "plt.title('avg_range and number of segs with respect to seg_len')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "percentage of anchors\n", + "[0.675024460493674, 11.540910179468508, 37.800111626679524, 72.57757068451099, 116.26325755498819, 174.62316256137615, 230.79293591159302, 276.90997528762955, 303.3683099328161, 300.416187273074, 292.74155593719377, 283.0192761981761, 277.1533636211696, 294.12115396899895, 343.0911583945552, 304.0459390345128, 324.5023603241745, 
351.06190942577086, 453.4708743409104, 429.7202603990337, 562.2578184289954, 696.0163314623161, 935.3453490694318, 999.8969868512104, 1251.5670746548305, 1283.9133689626897, 1331.925991379773, 1237.0171449007607, 1351.7991776737213, 1368.2923923264746, 1452.2142445119407, 1814.5938948900032, 1025.4008119729301, 634.2577552185669, 849.8473903751241, 1227.821438828524, 1284.2198900856397, 985.1385172947352, 597.0521956270175, 3153.475581018006, 1017.3554740966435, 2076.532251478847, 692.8221306770622, 277.1760679037927, 307.89883311499153, 377.9783034844475, 841.0969595440345, 358.18532609485123, 2053.8317170455703, 366.28859425900805, 392.87638358301496, 486.82678951872117, 436.57663236448, 583.7542593959078, 420.0860289278089, 630.665203552809, 450.3150162973444, 524.0393846792856, 769.2899526689936, 546.9823811848959, 1365.6983189052278, 629.6507319062182, 1058.8144015180387, 1525.1059065026932, 977.1012382075472, 2824.6816251296314, 1127.9969629388502, 1077.978668161078, 983.2003759143572, 1347.5096121061604, 1533.385853476674, 942.5720581896552, 994.4915968468292, 1613.0904722318992, 1592.9054222815985, 1549.3711238233009, 1841.740245423081, 1011.6362011373675, 1025.8294225589455, 1397.3642203814302, 1700.9462401640374, 1217.3102509963483, 3645.5580878450105, 3713.7550028483074, 1529.980254535894, 3846.244174436736, 1648.726449150963, 3895.6213927548592, 1371.2443864617069, 3102.512085551777, 1336.3269545948965, 1746.4097246699637, 1388.1571335737083, 2155.357897636218, 2445.3152181305486, 2437.73212727942, 2486.1278040701973, 1569.7762847198417, 2586.9870614674887, 4879.109275557605, 3084.4386800361344, 3944.222957449216, 3190.2447617430125, 3438.477402784815]\n", + "[0.9799908606544164, 0.012111969131813567, 0.0022846027725485084, 0.0010583137907398245, 0.0007317083427366733, 0.0006618549600807776, 0.0006499401690592034, 0.0005896653438912399, 0.00042542812647620754, 0.00025231322163333565, 0.00013082907788395181, 9.321571799231566e-05, 7.803020002364268e-05, 
6.167656528814871e-05, 6.0975695228056105e-05, 5.0929891033395525e-05, 4.602380061274733e-05, 4.2052203605555935e-05, 3.5510749711358347e-05, 3.177277605753115e-05, 2.090929012609587e-05, 1.1097109284799482e-05, 6.634903235543271e-06, 5.022902097330292e-06, 3.457625629790155e-06, 2.593219222342616e-06, 1.985798503595697e-06, 1.4951894615308778e-06, 1.0513050901388984e-06, 8.877687427839586e-07, 4.438843713919793e-07, 5.606960480740792e-07, 3.971597007191394e-07, 3.504350300462995e-07, 1.8689868269135972e-07, 2.803480240370396e-07, 2.1026101802777968e-07, 2.1026101802777968e-07, 1.6353634735493976e-07, 7.00870060092599e-08, 1.6353634735493976e-07, 1.8689868269135972e-07, 9.344934134567986e-08, 9.344934134567986e-08, 9.344934134567986e-08, 7.00870060092599e-08, 9.344934134567986e-08, 1.1681167668209982e-07, 9.344934134567986e-08, 1.1681167668209982e-07, 1.401740120185198e-07, 7.00870060092599e-08, 4.672467067283993e-08, 9.344934134567986e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 4.672467067283993e-08, 4.672467067283993e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 9.344934134567986e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 9.344934134567986e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 7.00870060092599e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 4.672467067283993e-08, 4.672467067283993e-08, 2.3362335336419965e-08, 
7.00870060092599e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08, 2.3362335336419965e-08]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "# Plot sc_pairs on the first y-axis\n", + "ax1.scatter(seg_len_filtered, avg_range_filtered, label='avg_range', color='blue')\n", + "ax1.set_xlabel('seg_len')\n", + "ax1.set_ylabel('avg_range', color='blue')\n", + "ax1.tick_params(axis='y', labelcolor='blue')\n", + "ax1.set_ylim(0, 600)\n", + "ax1.set_xlim(0, 10000)\n", + "# ax1.set_xscale('log') # Set logarithmic scale for avg_range\n", + "\n", + "print(\"percentage of anchors\")\n", + "anchors_filtered = [x / sum(anchors_filtered) for x in anchors_filtered]\n", + "print(avg_range_filtered)\n", + "\n", + "# # Plot sc_pairs on the first y-axis\n", + "# ax1.plot(seg_len, sc_pairs, label='sc_pairs', color='blue')\n", + "# ax1.set_xlabel('seg_len')\n", + "# ax1.set_ylabel('sc_pairs', color='blue')\n", + "# ax1.tick_params(axis='y', labelcolor='blue')\n", + "\n", + "# Create a second y-axis for anchors\n", + "# ax2 = ax1.twinx()\n", + "# ax2.plot(seg_len_filtered, anchors_filtered, label='anchors', color='green')\n", + "# ax2.set_ylabel('anchors', color='green')\n", + "# ax2.tick_params(axis='y', labelcolor='green')\n", + "# ax2.set_yscale('log') # Set logarithmic scale for segs\n", + "# ax2.set_ylim(0.00000001, 5)\n", + "\n", + "# Create a third y-axis for segs\n", + "ax3 = ax1.twinx()\n", + "ax3.plot(seg_len_filtered, segs_filtered, label='segs', color='red')\n", + "ax3.set_ylabel('segs', color='red')\n", + "ax3.tick_params(axis='y', labelcolor='red')\n", + "ax3.set_yscale('log') # Set logarithmic scale for segs\n", + "ax3.set_ylim(0.00000001, 5)\n", + "print(segs_filtered)\n", + "# print([x / sum(segs_filtered) for x in segs_filtered])\n", + "\n", + "# Add legend\n", + "lines = ax1.get_lines() + ax3.get_lines()\n", + "ax1.legend(lines, [line.get_label() for line in lines], loc='upper left')\n", + "\n", + "# Add horizontal lines\n", + "line_64 = ax1.axhline(y=64, color='c', 
#!/bin/bash
#
# Sweep Nsight Compute (ncu) profiles of the score_generation_mid kernel.
#
# For every combination of data set, MID_BLOCK_SIZE, MID_CUT, LONG_CUT and GPU
# config the project is rebuilt from scratch and one launch of the kernel is
# profiled. Reports are exported to ncu/ and can be read back with, e.g.:
#   ncu --import ncu/<report>.ncu-rep --section SpeedOfLight \
#       --kernel-name score_generation_mid -c 1 --csv > test.csv

# Parameter values to sweep.
MID_BLOCK_SIZES=( 512 256 )
MID_CUTS=( 1 )
LONG_CUTS=( 50 100 )
GPU_CONFIGS=( gpu_config1.json )
DATA_SETS=( 1kto300k )

# Profiler options shared by every run (everything except the export path).
NCU_FLAGS=(
    --force-overwrite
    --target-processes all
    --kernel-name-base function
    --kernel-name regex:score_generation_mid
    --launch-count 1
    --launch-skip-before-match 0
    --section ComputeWorkloadAnalysis
    --section InstructionStats
    --section LaunchStats
    --section MemoryWorkloadAnalysis
    --section MemoryWorkloadAnalysis_Chart
    --section MemoryWorkloadAnalysis_Tables
    --section Occupancy
    --section SchedulerStats
    --section SourceCounters
    --section SpeedOfLight
    --section SpeedOfLight_RooflineChart
    --section WarpStateStats
    --sampling-interval auto
    --sampling-max-passes 5
    --sampling-buffer-size 33554432
    --profile-from-start 1
    --cache-control all
    --clock-control base
    --apply-rules yes
    --import-source yes
    --source-folders gpu
    --check-exit-code yes
)

# The report directory is git-ignored, so it may not exist on a fresh checkout.
mkdir -p ncu

# Iterate over every parameter combination.
for DATA_SET in "${DATA_SETS[@]}"
do
    for MID_BLOCK_SIZE in "${MID_BLOCK_SIZES[@]}"
    do
        for MID_CUT in "${MID_CUTS[@]}"
        do
            for LONG_CUT in "${LONG_CUTS[@]}"
            do
                for GPU_CONFIG in "${GPU_CONFIGS[@]}"
                do
                    echo "Executing with MID_BLOCK_SIZE=${MID_BLOCK_SIZE} MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT}"

                    # Clean the project
                    make GPU=NVCC clean

                    # Build with specific configurations; skip this combination
                    # if the build fails, since there is no binary to profile.
                    if ! make GPU=NVCC GPU_CONFIG="${GPU_CONFIG}" SHORT_BLOCK_SIZE=64 MID_BLOCK_SIZE="${MID_BLOCK_SIZE}" LONG_BLOCK_SIZE=512 MID_CUT="${MID_CUT}" LONG_CUT="${LONG_CUT}"
                    then
                        echo "Build failed for MID_BLOCK_SIZE=${MID_BLOCK_SIZE} MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT}; skipping" >&2
                        continue
                    fi

                    # Profile one launch of the kernel.
                    # Alternative input: data/random_500MBases_200kto300k.fa
                    # To also keep the runtime log, append:
                    #   2> ncu/runtime_report_mid${MID_BLOCK_SIZE}_${MID_CUT}_${LONG_CUT}_${DATA_SET}.txt
                    ncu --export "ncu/ncu32_mid${MID_BLOCK_SIZE}_${MID_CUT}_${LONG_CUT}_${GPU_CONFIG}_${DATA_SET}_report" \
                        "${NCU_FLAGS[@]}" \
                        ./minimap2 data/hg38.mmi "data/random_500MBases_${DATA_SET}.fa" -t 1 --max-chain-skip=2147483647 --gpu-chain
                    echo "Done with MID_BLOCK_SIZE=${MID_BLOCK_SIZE} MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT}"
                done
            done
        done
    done
done
mode 100644 index 00000000..2906989b --- /dev/null +++ b/scripts/ncu_integrated.sh @@ -0,0 +1 @@ +ncu --export "ncu/report" --force-overwrite --target-processes all --kernel-name-base function --kernel-name score_generation_long --launch-skip-before-match 0 --section ComputeWorkloadAnalysis --section InstructionStats --section LaunchStats --section MemoryWorkloadAnalysis --section MemoryWorkloadAnalysis_Chart --section MemoryWorkloadAnalysis_Tables --section Occupancy --section SchedulerStats --section SourceCounters --section SpeedOfLight --section SpeedOfLight_RooflineChart --section WarpStateStats --sampling-interval auto --sampling-max-passes 5 --sampling-buffer-size 33554432 --profile-from-start 1 --cache-control all --clock-control base --apply-rules yes --import-source yes --source-folders gpu --check-exit-code yes --replay-mode application ./minimap2 data/hg38.mmi data/random_100MBases_1kto300k.fa -t 4 --max-chain-skip=2147483647 --gpu-chain diff --git a/scripts/nvprof_all.sh b/scripts/nvprof_all.sh new file mode 100644 index 00000000..9c5ed4a9 --- /dev/null +++ b/scripts/nvprof_all.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Array of LONG_BLOCK_SIZE values +MID_BLOCK_SIZES=( 128 256 512 768 1024 ) +MID_CUTS=( 1 ) +LONG_CUTS=( 50 60 70 ) + +# Iterate over LONG_BLOCK_SIZES array +for MID_BLOCK_SIZE in "${MID_BLOCK_SIZES[@]}" +do + for MID_CUT in "${MID_CUTS[@]}" + do + for LONG_CUT in "${LONG_CUTS[@]}" + do + echo "Executing with MID_BLOCK_SIZE=${MID_BLOCK_SIZE} MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT}" + + # Clean the project + make clean + + # Build with specific configurations + make GPU=NVCC GPU_CONFIG=gpu_config.json SHORT_BLOCK_SIZE=64 MID_BLOCK_SIZE=${MID_BLOCK_SIZE} LONG_BLOCK_SIZE=1024 MID_CUT=${MID_CUT} LONG_CUT=${LONG_CUT} + + # # Run nsys nvprof with the given command # data/random_500MBases_200kto300k.fa + # read by `nsys stats --report cuda_gpu_kern_sum nsys/nvprofxxx_xxx_xxx.nsys-rep` + nsys nvprof -o 
def extract_runtime(line):
    """Return the runtime in milliseconds reported on ``line``, or ``None``.

    Searches ``line`` for ``last launch runtime: <number> ms``. The number may
    be written with or without a fractional part (``12.5`` or ``12``); a
    runtime printed as a whole number is still a valid sample and must not be
    dropped from the total.

    Args:
        line: One line of text from the runtime log.

    Returns:
        The runtime as a ``float``, or ``None`` when the line carries no
        runtime report.
    """
    match = re.search(r'last launch runtime: (\d+(?:\.\d+)?) ms', line)
    if match:
        return float(match.group(1))
    return None
#include <cstdlib>
#include <iostream>
#include <vector>

/**
 * Scratch driver that prints, for each anchor index i, which indices j every
 * thread of a block would visit when the threads stride over the window
 * (i, i + range] with step `blockdim`.
 *
 * Usage: test_index <blockdim> <range> <i_range> <buffer_size>
 *   blockdim    number of simulated threads (stride of the inner loop)
 *   range       window length after anchor i
 *   i_range     exclusive upper bound of the anchor index i
 *   buffer_size only used by the disabled ring-buffer experiment below
 *
 * Returns 0 on success, 1 when too few arguments are given.
 */
int main(int argc, const char** argv) {
    if (argc < 5) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "test_index")
                  << " <blockdim> <range> <i_range> <buffer_size>\n";
        return 1;
    }

    const unsigned blockdim = static_cast<unsigned>(std::atoi(argv[1]));
    const unsigned range = static_cast<unsigned>(std::atoi(argv[2]));
    const unsigned i_range = static_cast<unsigned>(std::atoi(argv[3]));
    const unsigned buffer_size = static_cast<unsigned>(std::atoi(argv[4]));
    (void)buffer_size;  // referenced only by the commented-out experiment

    const unsigned start_idx = 3;

    // Per-thread first index to visit: the smallest value congruent to tid
    // modulo blockdim that is strictly greater than start_idx. A vector
    // replaces the original variable-length array, which is not standard C++.
    std::vector<unsigned> anchor_offset(blockdim, 0);
    for (unsigned tid = 0; tid < blockdim; ++tid) {
        anchor_offset[tid] = tid + (start_idx - tid + blockdim) / blockdim * blockdim;
        std::cout << "tid: " << tid << " " << anchor_offset[tid] << '\n';
    }

    // unsigned anchors[blockdim][buffer_size] = {0};
    // unsigned i = 0;
    // for (unsigned tid = 0; tid < blockdim; ++tid) {
    //     // update when drop becomes 0
    //     // TODO: think about how to update the array of anchors
    //     for (unsigned j = tid+1, index=0; j < i+range+1; j += blockdim, index++) {
    //         anchors[tid][index] = j;
    //     }
    // }
    // // print the anchors
    // for (unsigned tid = 0; tid < blockdim; ++tid) {
    //     std::cout << "tid: " << tid << " [";
    //     for (unsigned j = 0; j < buffer_size; ++j) {
    //         std::cout << anchors[tid][j] << " ";
    //     }
    //     std::cout << "]" << std::endl;
    // }

    for (unsigned i = start_idx; i < i_range; ++i) {
        std::cout << "anchor i: " << i << '\n';
        for (unsigned tid = 0; tid < blockdim; ++tid) {
            std::cout << "tid: " << tid << ": ";
            // Indices this thread visits for anchor i.
            for (unsigned j = anchor_offset[tid]; j < i + range + 1; j += blockdim) {
                std::cout << j << " ";
            }
            // Once the thread's first index is no longer ahead of i + 1,
            // move it forward by one stride for the next anchor.
            if (anchor_offset[tid] <= i + 1) {
                anchor_offset[tid] += blockdim;
            }
            std::cout << '\n';
            // unsigned drop = (i+blockdim-tid-1) % blockdim;
            // unsigned offset = (i+blockdim-tid-1) / blockdim;
            // // remove old when drop becomes 0
            // // add new when drop becomes (blockdim - range%blockdim)
            // // TODO: think about how to update the array of anchors efficiently
            // // easiest, generate a mask with ballot and use cooperative groups
            // std::cout << "tid: " << tid << " offset: " << offset << " drop: " << drop << " (";
            // if (!drop) {
            //     anchors[tid][(offset+buffer_size-1)%buffer_size] = 0;
            // }
            // for (unsigned j = offset*blockdim+tid+1, index=0; j < i+range+1; j += blockdim, index++) {
            //     if (drop == blockdim - range%blockdim && index == buffer_size-1) {
            //         anchors[tid][(index+offset)%buffer_size] = j;
            //     }
            //     std::cout << j << " : " << anchors[tid][(index+offset)%buffer_size] << ", ";
            //     // anchors[tid][index] = j;
            // }
            // std::cout << ") " << std::endl;
        }
        std::cout << std::endl;
        // for (unsigned tid = 0; tid < blockdim; ++tid) {
        //     std::cout << "tid: " << tid << " [";
        //     for (unsigned j = 0; j < buffer_size; ++j) {
        //         std::cout << anchors[tid][j] << " ";
        //     }
        //     std::cout << "]" << std::endl;
        // }
    }

    return 0;
}