diff --git a/gpu/debug.c b/gpu/debug.c index f3a4e41e..d8b33b7d 100644 --- a/gpu/debug.c +++ b/gpu/debug.c @@ -180,6 +180,32 @@ void debug_print_score(const int64_t *p, const int32_t *score, int64_t n) { read_idx++; } + +void debug_print_score_rel_p(const uint16_t *p, const int32_t *score, int64_t n) { + static FILE *fout_score = NULL; + static int read_idx = 0; + if (fout_score == NULL) { + char fout_score_filename[50]; + strcpy(fout_score_filename, debug_folder); + strcat(fout_score_filename, ".score.out"); + if ((fout_score = fopen(fout_score_filename, "w+")) == NULL) { + fprintf(stderr, "[Error]: Cannot create score output file: %s \n", + fout_score_filename); + exit(1); + } + fprintf(stderr, "[Info] Writing score to file %s\n", + fout_score_filename); + fprintf(fout_score, "@@@long_segs, buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t)); + cudaMallocHost((void**)&long_mem->long_segs_og_idx, buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t)); cudaMallocHost((void**)&long_mem->f_long, buffer_size_long * sizeof(int32_t)); cudaMallocHost((void**)&long_mem->p_long, buffer_size_long * sizeof(uint16_t)); cudaMallocHost((void**)&long_mem->total_long_segs_num, sizeof(unsigned int)); @@ -72,7 +72,7 @@ void plmem_free_host_mem(hostMemPtr *host_mem) { } void plmem_free_long_mem(longMemPtr *long_mem) { - cudaFreeHost(long_mem->long_segs); + cudaFreeHost(long_mem->long_segs_og_idx); cudaFreeHost(long_mem->f_long); cudaFreeHost(long_mem->p_long); cudaFreeHost(long_mem->total_long_segs_num); @@ -314,7 +314,7 @@ void plmem_async_d2h_memcpy(stream_ptr_t *stream_ptrs) { cudaMemcpyAsync(host_mem->p, dev_mem->d_p, sizeof(uint16_t) * host_mem->total_n, cudaMemcpyDeviceToHost, *stream); - cudaMemcpyAsync(long_mem->long_segs, dev_mem->d_long_seg_og, + cudaMemcpyAsync(long_mem->long_segs_og_idx, dev_mem->d_long_seg_og, dev_mem->buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t), cudaMemcpyDeviceToHost, *stream); cudaMemcpyAsync(host_mem->long_segs_num, dev_mem->d_long_seg_count, @@ -348,7 +348,7 @@ void plmem_async_d2h_long_memcpy(stream_ptr_t *stream_ptrs) { longMemPtr *long_mem = &stream_ptrs->long_mem; deviceMemPtr *dev_mem = &stream_ptrs->dev_mem; cudaStream_t *stream = &stream_ptrs->cudastream; - cudaMemcpyAsync(long_mem->long_segs, dev_mem->d_long_seg_og, + cudaMemcpyAsync(long_mem->long_segs_og_idx, dev_mem->d_long_seg_og, dev_mem->buffer_size_long / (MM_LONG_SEG_CUTOFF * MM_CUT_SIZE) * sizeof(seg_t), cudaMemcpyDeviceToHost, *stream); // cudaMemcpyAsync(&long_mem->total_long_segs_num, dev_mem->d_long_seg_count, diff --git a/gpu/plmem.cuh b/gpu/plmem.cuh index a1a267c7..7b0ec513 100644 --- a/gpu/plmem.cuh +++ b/gpu/plmem.cuh @@ -45,7 +45,7 @@ typedef struct { typedef struct { // array size: number of cuts in the batch / long_seg_cut - seg_t *long_segs; + seg_t *long_segs_og_idx; // start & end idx of long segs in the original micro batch unsigned int *total_long_segs_num; // sum of mini batch long_segs_num size_t *total_long_segs_n; // number of anchors in all the long segs int32_t *f_long; // score for long segs