diff --git a/CommandLines.cpp b/CommandLines.cpp index 85c2f3d..beac8f2 100644 --- a/CommandLines.cpp +++ b/CommandLines.cpp @@ -175,7 +175,6 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->purge_level_trio = 0; asm_opt->purge_simi_rate_l2 = 0.75; asm_opt->purge_simi_rate_l3 = 0.55; - ///asm_opt->purge_simi_rate_hic = 0.85; asm_opt->purge_overlap_len = 1; ///asm_opt->purge_overlap_len_hic = 50; asm_opt->recover_atg_cov_min = -1024; diff --git a/CommandLines.h b/CommandLines.h index 52107ab..718051e 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -4,7 +4,7 @@ #include #include -#define HA_VERSION "0.15.1-r333" +#define HA_VERSION "0.15.1-r334" #define VERBOSE 0 @@ -86,6 +86,7 @@ typedef struct { float purge_simi_rate_l2; float purge_simi_rate_l3; float purge_simi_thres; + ///float purge_simi_rate_hic; long long small_pop_bubble_size; diff --git a/Overlaps.cpp b/Overlaps.cpp index 1d4d3da..0c03a76 100644 --- a/Overlaps.cpp +++ b/Overlaps.cpp @@ -11200,7 +11200,7 @@ uint32_t only_len) break; } } - if(k == nv) fprintf(stderr, "ERROR\n"); + if(k == nv) fprintf(stderr, "ERROR-set_utg_offset\n"); } p_v = v; len += l; @@ -11272,8 +11272,8 @@ void refine_u_trans_t(u_trans_hit_t *q, kv_ca_buf_t* cb) if(si != cb->n && ei != cb->n) break; } - if(si == 0 || ei == 0) fprintf(stderr, "ERROR\n"); - if(si >= cb->n || ei >= cb->n) fprintf(stderr, "ERROR\n"); + if(si == 0 || ei == 0) fprintf(stderr, "ERROR-si-ei-0\n"); + if(si >= cb->n || ei >= cb->n) fprintf(stderr, "ERROR-si-ei-1\n"); si--; ei--; if(s < cb->a[si].c_x_p || ((si + 1) < cb->n && s >= cb->a[si + 1].c_x_p)) @@ -11452,7 +11452,7 @@ uint32_t get_u_trans_hit(u_trans_hit_idx *t, u_trans_hit_t *hit) break; } } - if(k == nv) fprintf(stderr, "ERROR\n"); + if(k == nv) fprintf(stderr, "ERROR-nv\n"); } ///[t->cBeg, t->cEnd) @@ -12762,12 +12762,6 @@ long long gap_fuzz, bub_label_t* b_mask_t) if((asm_opt.flag & HA_F_VERBOSE_GFA)) write_trans_chain(cov->t_ch, output_file_name); } - hic_analysis(ug, sg, cov?cov->t_ch:t_ch); - - - - - char* gfa_name = (char*)malloc(strlen(output_file_name)+25); sprintf(gfa_name, "%s.clean_d_utg.noseq.gfa", output_file_name); FILE* output_file = fopen(gfa_name, "w"); @@ -12775,9 +12769,7 @@ long long gap_fuzz, bub_label_t* b_mask_t) fclose(output_file); free(gfa_name); - - - + hic_analysis(ug, sg, cov?cov->t_ch:t_ch); if(cov) destory_hap_cov_t(&cov); if(t_ch) destory_trans_chain(&t_ch); @@ -16593,13 +16585,13 @@ void chain_origin_trans_uid_s_bubble(buf_t *pri, buf_t* aux, uint32_t beg, uint3 for (i = 0; i < nv; ++i) { if(av[i].del) continue; - if(av[i].v == pri_v) priEnd = pri_len - av[i].ol - 1; - if(av[i].v == aux_v) auxEnd = aux_len - av[i].ol - 1; + if(av[i].v == pri_v) priEnd = ((pri_len > av[i].ol)? (pri_len - av[i].ol - 1) : 0); + if(av[i].v == aux_v) auxEnd = ((aux_len > av[i].ol)? (aux_len - av[i].ol - 1) : 0); } if(priBeg == (uint32_t)-1 || priEnd == (uint32_t)-1 || auxBeg == (uint32_t)-1 || auxEnd == (uint32_t)-1) { - fprintf(stderr, "ERROR\n"); + fprintf(stderr, "ERROR-s_bubble\n"); } cov->u_buffer.a.n = cov->tailIndex.a.n = 0; @@ -27630,11 +27622,6 @@ bub_label_t* b_mask_t) ma_ug_t *ug = NULL; ug = ma_ug_gen_primary(sg, PRIMARY_LABLE); - // FILE* output_file = fopen("straw-debug.noseq.gfa", "w"); - // ma_ug_print_simple(ug, &R_INF, sg, coverage_cut, sources, ruIndex, "utg", output_file); - // fclose(output_file); - - hap_cov_t *cov = NULL; asg_t *copy_sg = copy_read_graph(sg); ma_ug_t *copy_ug = copy_untig_graph(ug); diff --git a/Purge_Dups.cpp b/Purge_Dups.cpp index 410c55b..2e81c19 100644 --- a/Purge_Dups.cpp +++ b/Purge_Dups.cpp @@ -11,6 +11,7 @@ #include "rcut.h" KDQ_INIT(uint64_t) +KSORT_INIT_GENERIC(uint64_t) uint8_t debug_enable = 0; @@ -2423,7 +2424,7 @@ uint64_t* position_index, uint32_t xUid, uint32_t yUid, ma_utg_t* xReads, ma_utg ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, asg_t *read_g, R_to_U* ruIndex, ma_sub_t *coverage_cut, float Hap_rate, int is_local, int max_hang, int min_ovlp, uint64_t cov_threshold, kvec_asg_arc_t_offset* u_buffer, kvec_t_i32_warp* tailIndex, kvec_t_i32_warp* prevIndex, hap_cov_t *cov, long long* r_x_pos_beg, long long* r_x_pos_end, -long long* r_y_pos_beg, long long* r_y_pos_end) +long long* r_y_pos_beg, long long* r_y_pos_end, float *sim) { uint32_t max_count = 0, min_count = 0, flag; uint32_t xLen = xReads->n, xIndex; @@ -2537,6 +2538,7 @@ long long* r_y_pos_beg, long long* r_y_pos_end) get_pair_hap_similarity_by_base(xReads, read_g, yUid, reverse_sources, ruIndex, *r_x_pos_beg, *r_x_pos_end, &xLeftMatch, &xLeftTotal); + (*sim) = ((double)xLeftMatch)/((double)xLeftTotal); if(xLeftMatch == 0 || xLeftTotal == 0 || xLeftMatch <= xLeftTotal*Hap_rate) { return NON_PLOID; @@ -2842,6 +2844,63 @@ int filter_secondary_chain(long long max_score, long long cur_score, double rate return 1; } + +void filter_secondary_ovlp(kvec_hap_overlaps *x, kvec_t_u64_warp *a, float sim_flt, float ovlp_flt) +{ + if(sim_flt == 0 || ovlp_flt == 0 || x->a.n == 0) return; + #define f_ovlp(s_0, e_0, s_1, e_1) ((MIN((e_0), (e_1)) > MAX((s_0), (s_1)))? MIN((e_0), (e_1)) - MAX((s_0), (s_1)):0) + uint32_t i, m, k; + uint64_t t, ovlp; + hap_overlaps *p = NULL; + a->a.n = 0; + for (i = 0; i < x->a.n; i++) + { + if(x->a.a[i].s < sim_flt) continue; + t = x->a.a[i].x_beg_pos; t<<=32; t |= x->a.a[i].x_end_pos; + kv_push(uint64_t, a->a, t); + } + + if(a->a.n == 0) return; + ks_introsort_uint64_t(a->a.n, a->a.a); + + for (i = m = 1; i < a->a.n; ++i) + { + t = a->a.a[m-1]; + ovlp = f_ovlp(t>>32, (uint32_t)t, a->a.a[i]>>32, (uint32_t)a->a.a[i]); + if(ovlp == 0) + { + a->a.a[m] = a->a.a[i]; + m++; + } + else + { + t = MIN(a->a.a[m-1]>>32, a->a.a[i]>>32); + t<<=32; + t |= MAX((uint32_t)a->a.a[m-1], (uint32_t)a->a.a[i]); + a->a.a[m-1] = t; + } + } + a->a.n = m; + + for (i = m = 0; i < x->a.n; i++) + { + p = &(x->a.a[i]); + if(p->s < sim_flt) + { + for (k = ovlp = 0; k < a->a.n; k++) + { + ovlp += f_ovlp(p->x_beg_pos, p->x_end_pos, a->a.a[k]>>32, (uint32_t)a->a.a[k]); + if(ovlp >= ovlp_flt*(p->x_end_pos-p->x_beg_pos)) break; + } + if(k < a->a.n) continue; + if(ovlp >= ovlp_flt*(p->x_end_pos-p->x_beg_pos)) continue; + } + x->a.a[m] = x->a.a[i]; + m++; + } + x->a.n = m; +} + static void hap_alignment_advance_worker(void *_data, long eid, int tid) { hap_alignment_struct_pip* hap_buf = (hap_alignment_struct_pip*)_data; @@ -2852,7 +2911,7 @@ static void hap_alignment_advance_worker(void *_data, long eid, int tid) R_to_U* ruIndex = hap_buf->ruIndex; ma_sub_t *coverage_cut = hap_buf->coverage_cut; uint64_t* position_index = hap_buf->position_index; - float Hap_rate = hap_buf->Hap_rate; + float Hap_rate = hap_buf->Hap_rate/**MIN(hap_buf->Hap_rate, 0.2)**/, sim; int max_hang = hap_buf->max_hang; int min_ovlp = hap_buf->min_ovlp; float chain_rate = hap_buf->chain_rate; @@ -3017,7 +3076,7 @@ static void hap_alignment_advance_worker(void *_data, long eid, int tid) if(calculate_pair_hap_similarity_advance(&(u_can->a.a[k]), position_index, xUid, yUid, xReads, yReads, sources, reverse_sources, read_g, ruIndex, coverage_cut, Hap_rate, (asm_opt.purge_level_primary<=2? 0:1), max_hang, min_ovlp, cov_threshold, u_buffer, - score_vc, prevIndex_vec, cov, &r_x_pos_beg, &r_x_pos_end, &r_y_pos_beg, &r_y_pos_end)!=PLOID) + score_vc, prevIndex_vec, cov, &r_x_pos_beg, &r_x_pos_end, &r_y_pos_beg, &r_y_pos_end, &sim)!=PLOID) { continue; } @@ -3050,6 +3109,7 @@ static void hap_alignment_advance_worker(void *_data, long eid, int tid) hap_align.xUid = xUid; hap_align.yUid = yUid; hap_align.status = SELF_EXIST; + hap_align.s = sim; kv_push(hap_overlaps, all_ovlp->x[hap_align.xUid].a, hap_align); } /** @@ -3091,7 +3151,7 @@ static void hap_alignment_advance_worker(void *_data, long eid, int tid) all_ovlp->x[xUid].a.n = m + 1; } } - + // filter_secondary_ovlp(&all_ovlp->x[xUid], u_vecs, hap_buf->Hap_rate, 0.7); } int get_specific_hap_overlap(kvec_hap_overlaps* x, uint32_t qn, uint32_t tn) @@ -4513,7 +4573,6 @@ void print_all_purge_ovlp(ma_ug_t *ug, hap_overlaps_list* all_ovlp, const char* for (v = 0; v < all_ovlp->num; v++) { uId = v; - ///if(uId != 96 && uId != 272) continue; for (i = 0; i < all_ovlp->x[uId].a.n; i++) { print_hap_paf(ug, &(all_ovlp->x[uId].a.a[i])); @@ -5242,7 +5301,6 @@ uint32_t just_coverage, hap_cov_t *cov, uint32_t collect_p_trans, uint32_t colle if(asm_opt.polyploidy <= 2) { mc_solve(&all_ovlp, cov->t_ch, NULL, ug, read_g, 0.8, R_INF.trio_flag, 1, NULL, 1, NULL, NULL); - ///pt_solve(&all_ovlp, cov->t_ch, ug, read_g, 0.8, R_INF.trio_flag); } if(collect_p_trans && collect_p_trans_f == 1) diff --git a/Purge_Dups.h b/Purge_Dups.h index 337f590..5c4f273 100644 --- a/Purge_Dups.h +++ b/Purge_Dups.h @@ -52,6 +52,7 @@ typedef struct { uint32_t yUid; uint32_t weight; long long score; + float s; }hap_overlaps; typedef struct { diff --git a/hic.cpp b/hic.cpp index 99fa1c6..097e7cc 100644 --- a/hic.cpp +++ b/hic.cpp @@ -15410,6 +15410,20 @@ void resolve_tangles_hic(ha_ug_index *idx, bubble_type *bub, kvec_pe_hit *hits, ta->idx.n = ta->n = 0; } + +void print_kv_u_trans_t(kv_u_trans_t *ta) +{ + uint32_t i; + u_trans_t *p = NULL; + for (i = 0; i < ta->n; i++) + { + p = &(ta->a[i]); + fprintf(stderr, "q-utg%.6ul\tqs(%u)\tqe(%u)\tt-utg%.6ul\tts(%u)\tte(%u)\trev(%u)\tw(%f)\tf(%u)\n", + p->qn+1, p->qs, p->qe, p->tn+1, p->ts, p->te, p->rev, p->nw, p->f); + } + fprintf(stderr, "[M::%s::] \n", __func__); +} + int hic_short_align(const enzyme *fn1, const enzyme *fn2, ha_ug_index* idx) { double index_time = yak_realtime(); @@ -15434,6 +15448,7 @@ int hic_short_align(const enzyme *fn1, const enzyme *fn2, ha_ug_index* idx) ///debug_hc_hits_v14(&sl.hits, asm_opt.output_file_name, sl.idx); ////dedup_hits(&(sl.hits), sl.idx); ///write_hc_hits_v14(&sl.hits, asm_opt.output_file_name); + // print_kv_u_trans_t(&(idx->t_ch->k_trans)); hc_links link; init_hc_links(&link, idx->ug->g->n_seq, idx->t_ch);