diff --git a/LICENSE b/LICENSE index 75aeb6c0c..4e05874f6 100644 --- a/LICENSE +++ b/LICENSE @@ -746,3 +746,28 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +----------------------------------------------------------------------------- + +LICENSE for utlist.h + +Copyright (c) 2007-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile index cd6203319..71c6b739e 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ OBJS = main.o vcfindex.o tabix.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o abuf.o \ - ccall.o em.o prob1.o kmin.o + ccall.o em.o prob1.o kmin.o str_finder.o PLUGIN_OBJS = vcfplugin.o prefix = /usr/local @@ -233,6 +233,7 @@ abuf_h = abuf.h $(htslib_vcf_h) bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h) bam_sample_h = bam_sample.h $(htslib_sam_h) +str_finder.o: str_finder.h utlist.h main.o: main.c $(htslib_hts_h) config.h version.h $(bcftools_h) vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h) $(convert_h) $(smpl_ilist_h) regidx.h $(htslib_khash_h) vcfplugin.o: vcfplugin.c config.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h) @@ -275,7 +276,7 @@ regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h) mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) regidx.h $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h) bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h -bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) +bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) str_finder.h bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) 
$(bcftools_h) version.o: version.h version.c hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h diff --git a/bam2bcf.c b/bam2bcf.c index 9087b99f5..1962c979d 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -40,7 +40,8 @@ extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); #define CAP_DIST 25 -bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) +bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ, + int delta_baseQ) { bcf_callaux_t *bca; if (theta <= 0.) theta = CALL_DEFTHETA; @@ -48,6 +49,8 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->capQ = 60; bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; bca->min_baseQ = min_baseQ; + bca->max_baseQ = max_baseQ; + bca->delta_baseQ = delta_baseQ; bca->e = errmod_init(1. - theta); bca->min_frac = 0.002; bca->min_support = 1; @@ -55,9 +58,13 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->npos = 100; bca->ref_pos = (int*) malloc(bca->npos*sizeof(int)); bca->alt_pos = (int*) malloc(bca->npos*sizeof(int)); + bca->iref_pos= (int*) malloc(bca->npos*sizeof(int)); + bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int)); bca->nqual = 60; bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int)); bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int)); bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int)); bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int)); bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int)); @@ -69,47 +76,68 @@ void bcf_call_destroy(bcf_callaux_t *bca) { if (bca == 0) return; errmod_destroy(bca->e); - if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } - free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq); + if (bca->npos) { + free(bca->ref_pos); free(bca->alt_pos); + free(bca->iref_pos); free(bca->ialt_pos); + bca->npos = 0; + } + free(bca->ref_mq); free(bca->alt_mq); + free(bca->iref_mq); free(bca->ialt_mq); + free(bca->ref_bq); free(bca->alt_bq); free(bca->fwd_mqs); free(bca->rev_mqs); bca->nqual = 0; free(bca->bases); free(bca->inscns); free(bca); } // position in the sequence with respect to the aligned part of the read -static int get_position(const bam_pileup1_t *p, int *len) -{ - int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; - for (icig=0; icig<p->b->core.n_cigar; icig++) - { - int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK; - int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; - if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) - { - n_tot_bases += ncig; - iread += ncig; +static int get_position(const bam_pileup1_t *p, int *len, + int *sc_len, int *sc_dist) { + int i, j, edist = p->qpos + 1; + int sc_left = 0, sc_right = 0; + int sc_left_dist = -1, sc_right_dist = -1; + + // left end + for (i = 0; i < p->b->core.n_cigar; i++) { + int cig = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK; + if (cig == BAM_CHARD_CLIP) continue; - } - if ( cig==BAM_CINS ) - { - n_tot_bases += ncig; - iread += ncig; - continue; - } - if ( cig==BAM_CSOFT_CLIP ) - { - iread += ncig; - if ( iread<=p->qpos ) edist -= ncig; + else if (cig == BAM_CSOFT_CLIP) + sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT; + else + break; + } + if (sc_left) + sc_left_dist = p->qpos+1 - sc_left; + edist -= sc_left; + + // right end + for (j = p->b->core.n_cigar-1; j >= i; j--) { + int cig = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK; + if (cig == BAM_CHARD_CLIP) continue; + else if (cig == BAM_CSOFT_CLIP) + sc_right
+= bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT; + else + break; + } + if (sc_right) + sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos; + + // Distance to nearest soft-clips and length of that clip. + if (sc_left_dist >= 0) { + if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) { + *sc_len = sc_left; + *sc_dist = sc_left_dist; } - if ( cig==BAM_CDEL ) continue; - if ( cig==BAM_CHARD_CLIP ) continue; - if ( cig==BAM_CPAD ) continue; - if ( cig==BAM_CREF_SKIP ) continue; - fprintf(stderr,"todo: cigar %d\n", cig); - assert(0); - } - *len = n_tot_bases; + } else if (sc_right_dist >= 0) { + *sc_len = sc_right; + *sc_dist = sc_right_dist; + } else { + *sc_len = 0; + *sc_dist = 0; + } + + *len = p->b->core.l_qseq - sc_left - sc_right; return edist; } @@ -117,8 +145,12 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) { memset(bca->ref_pos,0,sizeof(int)*bca->npos); memset(bca->alt_pos,0,sizeof(int)*bca->npos); + memset(bca->iref_pos,0,sizeof(int)*bca->npos); + memset(bca->ialt_pos,0,sizeof(int)*bca->npos); memset(bca->ref_mq,0,sizeof(int)*bca->nqual); memset(bca->alt_mq,0,sizeof(int)*bca->nqual); + memset(bca->iref_mq,0,sizeof(int)*bca->nqual); + memset(bca->ialt_mq,0,sizeof(int)*bca->nqual); memset(bca->ref_bq,0,sizeof(int)*bca->nqual); memset(bca->alt_bq,0,sizeof(int)*bca->nqual); memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual); @@ -127,6 +159,10 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES); + memset(bca->ref_scl, 0, 100*sizeof(int)); + memset(bca->alt_scl, 0, 100*sizeof(int)); + memset(bca->iref_scl, 0, 100*sizeof(int)); + memset(bca->ialt_scl, 0, 100*sizeof(int)); } /* @@ -166,7 +202,9 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t kroundup32(bca->max_bases); bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } + // fill the bases array + double nqual_over_60 = bca->nqual / 60.0; for (i = n = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; @@ -175,21 +213,41 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t ++ori_depth; if (is_indel) { - b = p->aux>>16&0x3f; - baseQ = q = p->aux&0xff; - // This read is not counted as indel. Instead of skipping it, treat it as ref. It is - // still only an approximation, but gives more accurate AD counts and calls correctly - // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam) - if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos]; - seqQ = p->aux>>8&0xff; + b = p->aux>>16&0x3f; + seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias + if (q < bca->min_baseQ) continue; + if (p->indel == 0 && (q < _n/2 || _n > 20)) { + // high quality indel calls without p->indel set aren't + // particularly indicative of being a good REF match either, + // at least not in low coverage. So require solid coverage + // before we start utilising such quals. + b = 0; + q = (int)bam_get_qual(p->b)[p->qpos]; + seqQ = (3*seqQ + 2*q)/8; + } + if (_n > 20 && seqQ > 40) seqQ = 40; + baseQ = p->aux>>8&0xff; + is_diff = (b != 0); } else { b = bam_seqi(bam_get_seq(p->b), p->qpos); // base b = seq_nt16_int[b? 
b : ref_base]; // b is the 2-bit base - baseQ = q = (int)bam_get_qual(p->b)[p->qpos]; + + // Lowest of this and neighbour quality values + uint8_t *qual = bam_get_qual(p->b); + q = qual[p->qpos]; + if (p->qpos > 0 && + q > qual[p->qpos-1]+bca->delta_baseQ) + q = qual[p->qpos-1]+bca->delta_baseQ; + if (p->qpos+1 < p->b->core.l_qseq && + q > qual[p->qpos+1]+bca->delta_baseQ) + q = qual[p->qpos+1]+bca->delta_baseQ; + if (q < bca->min_baseQ) continue; + if (q > bca->max_baseQ) q = bca->max_baseQ; + baseQ = q; seqQ = 99; is_diff = (ref4 < 4 && b == ref4)? 0 : 1; } @@ -228,27 +286,39 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // collect for bias tests if ( baseQ > 59 ) baseQ = 59; if ( mapQ > 59 ) mapQ = 59; - int len, epos = 0; - if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) ) + int len, epos = 0, sc_len = 0, sc_dist = 0; + if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) ) { - int pos = get_position(p, &len); + int pos = get_position(p, &len, &sc_len, &sc_dist); epos = (double)pos/(len+1) * bca->npos; + + if (sc_len) { + sc_len = 15.0*sc_len / sc_dist; + if (sc_len > 99) sc_len = 99; + } } - int ibq = baseQ/60. * bca->nqual; - int imq = mapQ/60. * bca->nqual; - if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; - else bca->fwd_mqs[imq]++; + + int imq = mapQ * nqual_over_60; + int ibq = baseQ * nqual_over_60; + + if ( bam_is_rev(p->b) ) + bca->rev_mqs[imq]++; + else + bca->fwd_mqs[imq]++; + if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) { bca->ref_pos[epos]++; bca->ref_bq[ibq]++; bca->ref_mq[imq]++; + bca->ref_scl[sc_len]++; } else { bca->alt_pos[epos]++; bca->alt_bq[ibq]++; bca->alt_mq[imq]++; + bca->alt_scl[sc_len]++; } } r->ori_depth = ori_depth; @@ -437,7 +507,7 @@ double calc_mwu_bias_cdf(int *a, int *b, int n) return pval>1 ? 1 : pval; } -double calc_mwu_bias(int *a, int *b, int n) +double calc_mwu_bias(int *a, int *b, int n, int left) { int na = 0, nb = 0, i; double U = 0, ties = 0; @@ -461,6 +531,7 @@ double calc_mwu_bias(int *a, int *b, int n) if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely double mean = ((double)na*nb)*0.5; + if (left && U > mean) return 1; // for MQB which is asymmetrical if ( na==2 || nb==2 ) { // Linear approximation @@ -483,6 +554,85 @@ double calc_mwu_bias(int *a, int *b, int n) return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2); } +// A Z-score version of the above function. +// +// See "Normal approximation and tie correction" at +// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test +// +// The Z score is the number of standard deviations above or below the mean +// with 0 being equality of the two distributions and +ve/-ve from there. +// +// This is a more robust score to filter on. +double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) { + int i; + int64_t t; + + // Optimisation + for (i = 0; i < n; i++) + if (b[i]) + break; + int b_empty = (i == n); + + // Count equal (e), less-than (l) and greater-than (g) permutations. + int e = 0, l = 0, na = 0, nb = 0; + if (b_empty) { + for (t = 0, i = n-1; i >= 0; i--) { + na += a[i]; + t += (a[i]*a[i]-1)*a[i]; // adjustment score for ties + } + } else { + for (t = 0, i = n-1; i >= 0; i--) { + // Combinations of a[i] and b[j] for i==j + e += a[i]*b[i]; + + // nb is running total of b[i+1]..b[n-1]. + // Therefore a[i]*nb is the number of combinations of a[i] and b[j] + // for all i < j. + l += a[i]*nb; // a<b + + na += a[i]; + nb += b[i]; + t += ((a[i]+b[i])*(a[i]+b[i])-1)*(a[i]+b[i]); // adjustment score for ties + } + } + + if (!na || !nb) + return do_Z ? 0 : HUGE_VAL; + + double U = l + e*0.5; // Mann-Whitney U score + double m = na*(double)nb/2.0; + + // Normal approximation variance with tie adjustment; see wikipedia link. + double var2 = ((double)na*nb/12.0) * + (na+nb+1 - (double)t/((na+nb)*((double)na+nb-1))); + + // Z normalised as +/- number of standard deviations from mean. + if (do_Z) { + //return (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
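+ // (The 0.5 above is a continuity correction; the plain form below + // suffices for filtering purposes.)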
+ return (U - m) / sqrt(var2); + } + + // Else U score, which can be asymmetric for some data types. + if (left_only && U > m) + return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not. + + if (na >= 8 || nb >= 8) { + // Normal approximation, very good for na>=8 && nb>=8 and + // reasonable if na<8 or nb<8 + return exp(-0.5*(U-m)*(U-m)/var2); + } + + // Exact calculation + if (na==1 || nb == 1) + return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2); + else + return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2); +} + static inline double logsumexp2(double a, double b) { if ( a>b ) @@ -732,11 +882,43 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - if ( bca->fmt_flag & B2B_INFO_RPB ) - call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); + if (bca->fmt_flag & B2B_INFO_ZSCORE) { + // U z-normalised as +/- number of standard deviations from mean. + if (call->ori_ref < 0) { + if (bca->fmt_flag & B2B_INFO_RPB) + call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, + bca->npos, 0, 1); + call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, + bca->nqual,1,1); + if ( bca->fmt_flag & B2B_INFO_SCB ) + call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, + 100, 0,1); + } else { + if (bca->fmt_flag & B2B_INFO_RPB) + call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, + bca->npos, 0, 1); + call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, + bca->nqual,1,1); + call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, + bca->nqual,0,1); + call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, + bca->nqual,0,1); + if ( bca->fmt_flag & B2B_INFO_SCB ) + call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, + 100, 0,1); + } + } else { + // Old method; U as probability between 0 and 1 + if ( bca->fmt_flag & B2B_INFO_RPB ) + call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, + bca->npos, 0, 0); + call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, + bca->nqual, 1, 0); + call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, + bca->nqual, 0, 0); + call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, + bca->nqual, 0, 0); + } #if CDF_MWU_TESTS // CDF version of MWU tests is not calculated by default @@ -747,7 +929,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); #endif - if ( bca->fmt_flag & B2B_INFO_VDB ) + if ( bca->fmt_flag & B2B_INFO_VDB ) call->vdb = calc_vdb(bca->alt_pos, bca->npos); return 0; @@ -834,10 +1016,32 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - 
if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + + if (bca->fmt_flag & B2B_INFO_ZSCORE) { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); + if ( bc->mwu_sc != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); + } else { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + } + + if ( bc->strand_bias != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); + #if CDF_MWU_TESTS if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); diff --git a/bam2bcf.h b/bam2bcf.h index 4019ca74a..e1f21c9e2 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -60,21 +60,31 @@ DEALINGS IN THE SOFTWARE. */ #define B2B_INFO_VDB (1<<14) #define B2B_INFO_RPB (1<<15) #define B2B_FMT_QS (1<<16) +#define B2B_INFO_SCB (1<<17) +#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised #define B2B_MAX_ALLELES 5 #define PLP_HAS_SOFT_CLIP(i) ((i)&1) -#define PLP_SAMPLE_ID(i) ((i)>>1) +#define PLP_HAS_INDEL(i) ((i)&2) +#define PLP_SAMPLE_ID(i) ((i)>>2) + +#define PLP_SET_SOFT_CLIP(i) ((i)|=1) +#define PLP_SET_INDEL(i) ((i)|=2) +#define PLP_SET_SAMPLE_ID(i,n) ((i)|=(n)<<2) typedef struct __bcf_callaux_t { int fmt_flag; - int capQ, min_baseQ; + int capQ, min_baseQ, max_baseQ, delta_baseQ; int openQ, extQ, tandemQ; // for indels uint32_t min_support, max_support; // for collecting indel candidates double min_frac; // for collecting indel candidates float max_frac; // for collecting indel candidates int per_sample_flt; // indel filtering strategy int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests + int *iref_pos, *ialt_pos, *iref_mq, *ialt_mq; // for indels + int ref_scl[100], alt_scl[100]; // soft-clip length bias; SNP + int iref_scl[100], ialt_scl[100]; // soft-clip length bias; INDEL // for internal uses int max_bases; int indel_types[4]; // indel lengths @@ -84,6 +94,7 @@ typedef struct __bcf_callaux_t { uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types) errmod_t *e; void *rghash; + float indel_bias; // adjusts indel score threshold; lower => call more. 
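+ // (applied in bcf_cgp_align_score as a multiplier on the + // length-normalised alignment score that later reduces indelQ)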
} bcf_callaux_t; // per-sample values @@ -120,11 +131,12 @@ typedef struct { int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS; uint8_t *fmt_arr; float vdb; // variant distance bias - float mwu_pos, mwu_mq, mwu_bq, mwu_mqs; + float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc; #if CDF_MWU_TESTS float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf; #endif float seg_bias; + float strand_bias; // phred-scaled fisher-exact test kstring_t tmp; } bcf_call_t; @@ -132,7 +144,8 @@ typedef struct { extern "C" { #endif - bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); + bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ, + int delta_baseQ); void bcf_call_destroy(bcf_callaux_t *bca); int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 6c367da73..5b0cb4fde 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -26,19 +26,29 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include #include "bam2bcf.h" +#include "str_finder.h" #include KSORT_INIT_GENERIC(uint32_t) #define MINUS_CONST 0x10000000 -#define INDEL_WINDOW_SIZE 50 +#define INDEL_WINDOW_SIZE 110 +#define MAX_TYPES 64 + +// Take a reference position tpos and convert to a query position (returned). +// This uses the CIGAR string plus alignment c->pos to do the mapping. +// +// *_tpos is returned as tpos if query overlaps tpos, but for deletions +// it'll be either the start (is_left) or end (!is_left) ref position. static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) { + // x = pos in ref, y = pos in query seq int k, x = c->pos, y = 0, last_y = 0; *_tpos = c->pos; for (k = 0; k < c->n_cigar; ++k) { @@ -64,6 +74,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, *_tpos = x; return last_y; } + // FIXME: check if the inserted sequence is consistent with the homopolymer run // l is the relative gap length and l_run is the length of the homopolymer on the reference static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) @@ -87,21 +98,609 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) return max_i - pos; } +// Identify soft-clip length, position in seq, and clipped seq len +static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p, + int *sc_len_r, int *slen_r, int *epos_r, int *end) { + bam1_t *b = p->b; + int sc_len = 0, sc_dist = -1, at_left = 1; + int epos = p->qpos, slen = b->core.l_qseq; + int k; + uint32_t *cigar = bam_get_cigar(b); + *end = -1; + for (k = 0; k < b->core.n_cigar; k++) { + int op = bam_cigar_op(cigar[k]); + if (op == BAM_CSOFT_CLIP) { + slen -= bam_cigar_oplen(cigar[k]); + if (at_left) { + // left end + sc_len += bam_cigar_oplen(cigar[k]); + epos -= sc_len; // don't count SC in seq pos + sc_dist = epos; + *end = 0; + } else { + // right end + int srlen = bam_cigar_oplen(cigar[k]); + int rd = b->core.l_qseq - srlen - p->qpos; + if (sc_dist < 0 || sc_dist > rd) { + // closer to right end than left + // FIXME: compensate for indel length too?
+ sc_dist = rd; + sc_len = srlen; + *end = 1; + } + } + } else if (op != BAM_CHARD_CLIP) { + at_left = 0; + } + } + + if (p->indel > 0 && slen - (epos+p->indel) < epos) + epos += p->indel-1; // end of insertion, if near end of seq + + // slen is now length of sequence minus soft-clips and + // epos is position of indel in seq minus left-clip. + *epos_r = (double)epos / (slen+1) * bca->npos; + + if (sc_len) { + // scale importance of clip by distance to closest end + *sc_len_r = 15.0*sc_len / (sc_dist+1); + if (*sc_len_r > 99) *sc_len_r = 99; + } else { + *sc_len_r = 0; + } + + *slen_r = slen; +} + +// Part of bcf_call_gap_prep. +// +// Scans the pileup to identify all the different sizes of indels +// present. +// +// Returns types and fills out n_types_r, max_rd_len_r and ref_type_r, +// or NULL on error. +static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int *max_rd_len_r, int *n_types_r, + int *ref_type_r, int *N_r) { + int i, j, t, s, N, m, max_rd_len, n_types; + int n_alt = 0, n_tot = 0, indel_support_ok = 0; + uint32_t *aux; + int *types; + + // N is the total number of reads + for (s = N = 0; s < n; ++s) + N += n_plp[s]; + + bca->max_support = bca->max_frac = 0; + aux = (uint32_t*) calloc(N + 1, 4); + if (!aux) + return NULL; + + m = max_rd_len = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type (REF) + + // Fill out aux[] array with all the non-zero indel sizes. + // Also tally number with indels (n_alt) and total (n_tot). + for (s = 0; s < n; ++s) { + int na = 0, nt = 0; + for (i = 0; i < n_plp[s]; ++i) { + const bam_pileup1_t *p = plp[s] + i; + ++nt; + if (p->indel != 0) { + ++na; + aux[m++] = MINUS_CONST + p->indel; + } + + // FIXME: cache me in pileup struct. + j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + double frac = (double)na/nt; + if ( !indel_support_ok && na >= bca->min_support + && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) + bca->max_support = na, bca->max_frac = frac; + + n_alt += na; + n_tot += nt; + } + + // Sort aux[] and dedup + ks_introsort(uint32_t, m, aux); + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + + // Taking totals makes it hard to call rare indels (IMF filter) + if ( !bca->per_sample_flt ) + indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac + || n_alt < bca->min_support ) + ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); + return NULL; + } + + // Bail out if we have far too many types of indel + if (n_types >= MAX_TYPES) { + free(aux); + // TODO revisit how/whether to control printing this warning + if (hts_verbose >= 2) + fprintf(stderr, "[%s] excessive INDEL alleles at position %d. " + "Skip the position.\n", __func__, pos + 1); + return NULL; + } + + // To prevent long stretches of N's from being mistaken for indels + // (sometimes thousands of bases), check the number of N's in the + // sequence and skip places where half or more reference bases are Ns. + int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len + ?2*INDEL_WINDOW_SIZE : max_rd_len); + for (i=pos; i<i_end && ref[i]; i++) + nN += ref[i] == 'N'; + if ( nN*2>(i-pos) ) { + free(aux); + return NULL; + } + + // Finally fill out the types[] array detailing the size of insertion + // or deletion.
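+ // E.g. reads showing a 2bp deletion and a 1bp insertion produce + // types = {-2, 0, +1} (ascending), with ref_type the index of the + // zero (REF) entry.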
+ types = (int*)calloc(n_types, sizeof(int)); + if (!types) { + free(aux); + return NULL; + } + t = 0; + types[t++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) + if (aux[i] != aux[i-1]) + types[t++] = aux[i] - MINUS_CONST; + free(aux); + + // Find reference type; types[?] == 0) + for (t = 0; t < n_types; ++t) + if (types[t] == 0) break; + + *ref_type_r = t; + *n_types_r = n_types; + *max_rd_len_r = max_rd_len; + *N_r = N; + + return types; +} + +// Part of bcf_call_gap_prep. +// +// Construct per-sample consensus. +// +// Returns an array of consensus seqs, +// or NULL on failure. +static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int left, int right) { + int i, k, s, L = right - left + 1, max_i, max2_i; + char **ref_sample; // returned + uint32_t *cns = NULL, max, max2; + char *ref0 = NULL, *r; + ref_sample = (char**) calloc(n, sizeof(char*)); + cns = (uint32_t*) calloc(L, 4); + ref0 = (char*) calloc(L, 1); + if (!ref_sample || !cns || !ref0) { + n = 0; + goto err; + } + + // Convert ref ASCII to 0-15. + for (i = 0; i < right - left; ++i) + ref0[i] = seq_nt16_table[(int)ref[i+left]]; + + // NB: one consensus per sample 'n', not per indel type. + // FIXME: consider fixing this. We should compute alignments vs + // types, not vs samples? Or types/sample combined? + for (s = 0; s < n; ++s) { + r = ref_sample[s] = (char*) calloc(L, 1); + if (!r) { + n = s-1; + goto err; + } + + memset(cns, 0, sizeof(int) * L); + + // collect ref and non-ref counts in cns + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + int x = b->core.pos, y = 0; + + // TODO: pileup exposes pileup_ind, but we also need e.g. + // pileup_len to know how much of the current CIGAR op-len + // we've used (or have remaining). If we had that, we + // could start at p->qpos without having to scan through + // the entire CIGAR string until we find it. + // + // Without it about all we could do is have a side channel + // to cache the last known coords. Messy, so punt for now. + // This is no longer the bottle neck until we get to 1000s of + // CIGAR ops. + + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k]&0xf; + int j, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if (x + l >= left) { + j = left - x > 0 ? left - x : 0; + int j_end = right - x < l ? right - x : l; + for (; j < j_end; j++) + // Append to cns. Note this is ref coords, + // so insertions aren't in cns and deletions + // will have lower coverage. + + // FIXME: want true consensus (with ins) per + // type, so we can independently compare each + // seq to each consensus and see which it + // matches best, so we get proper GT analysis. + cns[x+j-left] += + (bam_seqi(seq, y+j) == ref0[x+j-left]) + ? 1 // REF + : (1<<16); // ALT + } + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + x += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + } + + if (x > right) + break; + } + } + + // Determine a sample specific reference. + for (i = 0; i < right - left; ++i) + r[i] = ref0[i]; + + // Find deepest and 2nd deepest ALT region (max & max2). 
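+ // cns[i] packs two counts: REF depth in the low 16 bits and + // mismatch (ALT) depth in the high 16 bits, so cns[i]>>16 ranks + // positions by mismatch depth.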
+ max = max2 = 0; max_i = max2_i = -1; + for (i = 0; i < right - left; ++i) { + if (cns[i]>>16 >= max>>16) + max2 = max, max2_i = max_i, max = cns[i], max_i = i; + else if (cns[i]>>16 >= max2>>16) + max2 = cns[i], max2_i = i; + } + + // Masks mismatches present in at least 70% of the reads with 'N'. + // This code is nREF/(nREF+n_ALT) >= 70% for deepest region. + // The effect is that at least 30% of bases differing to REF will + // use "N" in consensus, so we don't penalise ALT or REF when + // aligning against it. (A poor man IUPAC code) + // + // Why is it only done in two loci at most? + if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) + max_i = -1; + if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) + max2_i = -1; + if (max_i >= 0) r[max_i] = 15; + if (max2_i >= 0) r[max2_i] = 15; + + //for (i = 0; i < right - left; ++i) + // fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); + //fputc('\n', stderr); + } + + free(ref0); + free(cns); + + return ref_sample; + + err: + free(ref0); + free(cns); + if (ref_sample) { + for (s = 0; s < n; s++) + free(ref_sample[s]); + free(ref_sample); + } + + return NULL; +} + +// The length of the homopolymer run around the current position +static int bcf_cgp_l_run(const char *ref, int pos) { + int i, l_run; + + int c = seq_nt16_table[(int)ref[pos + 1]]; + if (c == 15) { + l_run = 1; + } else { + for (i = pos + 2; ref[i]; ++i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run = i; + for (i = pos; i >= 0; --i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run -= i + 1; + } + + return l_run; +} + + +// Compute the consensus for this sample 's', minus indels which +// get added later. +static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, + int pos, int *types, int n_types, + int max_ins, int s) { + int i, j, t, k; + int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int)); + if (!inscns_aux) + return NULL; + + // Count the number of occurrences of each base at each position for + // each type of insertion. + for (t = 0; t < n_types; ++t) { + if (types[t] > 0) { + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + if (p->indel == types[t]) { + uint8_t *seq = bam_get_seq(p->b); + for (k = 1; k <= p->indel; ++k) { + int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; + } + } + } + } + } + } + + // Use the majority rule to construct the consensus + char *inscns = (char *)calloc(n_types * max_ins, 1); + for (t = 0; t < n_types; ++t) { + for (j = 0; j < types[t]; ++j) { + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) + if (ia[k] > max) + max = ia[k], max_k = k; + inscns[t*max_ins + j] = max ? max_k : 4; + if (max_k == 4) { + // discard insertions which contain N's + types[t] = 0; + break; + } + } + } + free(inscns_aux); + + return inscns; +} + +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +// Part of bcf_call_gap_prep. +// +// Realign using BAQ to get an alignment score of a single read vs +// a haplotype consensus. +// +// Fills out score +// Returns 0 on success, +// <0 on error +static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, + int type, uint8_t *ref2, uint8_t *query, + int r_start, int r_end, int long_read, + int tbeg, int tend, + int left, int right, + int qbeg, int qend, + int qpos, int max_deletion, + int *score) { + // Illumina + probaln_par_t apf = { 1e-4, 1e-2, 10 }; + + // Parameters that work better on PacBio CCS 15k. 
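+ // (htslib's probaln_par_t fields are d = gap-open probability, + // e = gap-extension probability, bw = band width; raising d and e + // makes gaps cheaper for long-read data.)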
+ // We should consider querying the header and RG PU field. + // See also htslib/realn.c:sam_prob_realn() + if (long_read) { + apf.d = 1e-3; + apf.e = 1e-1; + } + + type = abs(type); + apf.bw = type + 3; + int l, sc; + const uint8_t *qual = bam_get_qual(p->b), *bq; + uint8_t *qq; + + // Get segment of quality, either ZQ tag or if absent QUAL. + if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) + return -1; + bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); + if (bq) ++bq; // skip type + for (l = qbeg; l < qend; ++l) { + int qval = bq? qual[l] + (bq[l] - 64) : qual[l]; + if (qval > 30) + qval = 30; + if (qval < 7) + qval = 7; + qq[l - qbeg] = qval; + } + + // The bottom 8 bits are length-normalised score while + // the top bits are unnormalised. + sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, + query, qend - qbeg, qq, &apf, 0, 0); + if (sc < 0) { + *score = 0xffffff; + free(qq); + return 0; + } + + // used for adjusting indelQ below + l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias; + *score = sc<<8 | MIN(255, l); + + rep_ele *reps, *elt, *tmp; + uint8_t *seg = ref2 + tbeg - left; + int seg_len = tend - tbeg + type; + + // Note: although seg moves (tbeg varies), ref2 is reused many times + // so we could factor out some find_STR calls. However it's not the + // bottleneck for now. + + // FIXME: need to make this work on IUPAC. + reps = find_STR((char *)seg, seg_len, 0); + int iscore = 0; + + // Identify STRs in ref covering the indel up to + // (or close to) the end of the sequence. + // Those having an indel and right at the sequence + // end do not confirm the total length of indel + // size. Specifically a *lack* of indel at the + // end, where we know indels occur in other + // sequences, is a possible reference bias. + // + // This is emphasised further if the sequence ends with + // soft clipping. + DL_FOREACH_SAFE(reps, elt, tmp) { + if (elt->start <= qpos && elt->end >= qpos) { + iscore += (elt->end-elt->start) / elt->rep_len; // c + if (elt->start+tbeg <= r_start || + elt->end+tbeg >= r_end) + iscore += 2*(elt->end-elt->start); + } + + DL_DELETE(reps, elt); + free(elt); + } + + // Apply STR score to existing indelQ + l = (*score&0xff)*.8 + iscore*2; + *score = (*score & ~0xff) | MIN(255, l); + + free(qq); + + return 0; +} + +// Part of bcf_call_gap_prep. +// +// Returns n_alt on success +// -1 on failure +static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, + bcf_callaux_t *bca, char *inscns, + int l_run, int max_ins, + int ref_type, int *types, int n_types, + int *score) { + // FIXME: n_types has a maximum; no need to alloc - use a #define? + int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp; + memset(sumq, 0, n_types * sizeof(int)); + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int *sct = &score[K*n_types], seqQ, indelQ; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + + /* errmod_cal() assumes that if the call is wrong, the + * likelihoods of other events are equal. This is about + * right for substitutions, but is not desired for + * indels. To reuse errmod_cal(), I have to make + * compromise for multi-allelic indels. 
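+ * In effect indelQ is the alignment-score gap between the read's + * best type and the reference type, later scaled down by the + * length-normalised score.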
+ */ + if ((sc[0]&0x3f) == ref_type) { + indelQ = (sc[1]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ = (sc[t]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); + } + tmp = sc[0]>>6 & 0xff; + // reduce indelQ + indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); + + // Doesn't really help accuracy, but permits -h to take + // affect still. + if (indelQ > seqQ) indelQ = seqQ; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total + sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; + // fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + } + } + // determine bca->indel_types[] and bca->inscns + bca->maxins = max_ins; + bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); + if (bca->maxins && !bca->inscns) + return -1; + for (t = 0; t < n_types; ++t) + sumq[t] = sumq[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first + tmp = sumq[t]; + for (; t > 0; --t) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; + for (t = 0; t < 4 && t < n_types; ++t) { + bca->indel_types[t] = types[sumq[t]&0x3f]; + if (bca->maxins) + memcpy(&bca->inscns[t * bca->maxins], + &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); + } + // update p->aux + for (s = n_alt = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + int x = types[p->aux>>16&0x3f]; + for (j = 0; j < 4; ++j) + if (x == bca->indel_types[j]) break; + p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); + if ((p->aux>>16&0x3f) > 0) ++n_alt; + //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + } + } + + return n_alt; +} + +/* +FIXME: with high number of samples, do we handle IMF correctly? Is it +fraction of indels across entire data set, or just fraction for this +specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt. + */ + /* notes: - - n .. number of samples - - the routine sets bam_pileup1_t.aux of each read as follows: - - 6: unused - - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f - - 8: estimated sequence quality .. (aux>>8)&0xff - - 8: indel quality .. aux&0xff + - n .. number of samples + - the routine sets bam_pileup1_t.aux of each read as follows: + - 6: unused + - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f + - 8: estimated sequence quality .. (aux>>8)&0xff + - 8: indel quality .. 
aux&0xff */ -int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) +int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, + bcf_callaux_t *bca, const char *ref) { - int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; + if (ref == 0 || bca == 0) return -1; + + int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins; + int *score, max_ref2; int N, K, l_run, ref_type, n_alt; char *inscns = 0, *ref2, *query, **ref_sample; - if (ref == 0 || bca == 0) return -1; // determine if there is a gap for (s = N = 0; s < n; ++s) { @@ -109,77 +708,29 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (plp[s][i].indel != 0) break; if (i < n_plp[s]) break; } - if (s == n) return -1; // there is no indel at this position. - for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads - { // find out how many types of indels are present - bca->max_support = bca->max_frac = 0; - int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; - uint32_t *aux; - aux = (uint32_t*) calloc(N + 1, 4); - m = max_rd_len = 0; - aux[m++] = MINUS_CONST; // zero indel is always a type - for (s = 0; s < n; ++s) { - int na = 0, nt = 0; - for (i = 0; i < n_plp[s]; ++i) { - const bam_pileup1_t *p = plp[s] + i; - ++nt; - if (p->indel != 0) { - ++na; - aux[m++] = MINUS_CONST + p->indel; - } - j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); - if (j > max_rd_len) max_rd_len = j; - } - double frac = (double)na/nt; - if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) - indel_support_ok = 1; - if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; - n_alt += na; - n_tot += nt; - } - // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), - // check the number of N's in the sequence and skip places where half or more reference bases are Ns. - int nN=0; for (i=pos; i-pos<INDEL_WINDOW_SIZE && ref[i]; i++) if ( ref[i]=='N' ) nN++; - if ( nN*2>(i-pos) ) { free(aux); return -1; } - - ks_introsort(uint32_t, m, aux); - // squeeze out identical types - for (i = 1, n_types = 1; i < m; ++i) - if (aux[i] != aux[i-1]) ++n_types; - // Taking totals makes it hard to call rare indels - if ( !bca->per_sample_flt ) - indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; - if ( n_types == 1 || !indel_support_ok ) { // then skip - free(aux); return -1; - } - if (n_types >= 64) { - free(aux); - // TODO revisit how/whether to control printing this warning - if (hts_verbose >= 2) - fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); - return -1; - } - types = (int*)calloc(n_types, sizeof(int)); - t = 0; - types[t++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) - if (aux[i] != aux[i-1]) - types[t++] = aux[i] - MINUS_CONST; - free(aux); - for (t = 0; t < n_types; ++t) - if (types[t] == 0) break; - ref_type = t; // the index of the reference type (0) - } - { // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; - if (types[0] < 0) right -= types[0]; - // in case the alignments stand out the reference - for (i = pos; i < right; ++i) - if (ref[i] == 0) break; - right = i; - } - /* The following block fixes a long-existing flaw in the INDEL + if (s == n) + // there is no indel at this position.
+ return -1; + + // find out how many types of indels are present + types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref, + &max_rd_len, &n_types, &ref_type, &N); + if (!types) + return -1; + + + // calculate left and right boundary + left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; + if (types[0] < 0) right -= types[0]; + + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + + + /* The following call fixes a long-existing flaw in the INDEL * calling model: the interference of nearby SNPs. However, it also * reduces the power because sometimes, substitutions caused by * indels are not distinguishable from true mutations. Multiple @@ -187,284 +738,211 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla * * Masks mismatches present in at least 70% of the reads with 'N'. */ - { // construct per-sample consensus - int L = right - left + 1, max_i, max2_i; - uint32_t *cns, max, max2; - char *ref0, *r; - ref_sample = (char**) calloc(n, sizeof(char*)); - cns = (uint32_t*) calloc(L, 4); - ref0 = (char*) calloc(L, 1); - for (i = 0; i < right - left; ++i) - ref0[i] = seq_nt16_table[(int)ref[i+left]]; - for (s = 0; s < n; ++s) { - r = ref_sample[s] = (char*) calloc(L, 1); - memset(cns, 0, sizeof(int) * L); - // collect ref and non-ref counts - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); - int x = b->core.pos, y = 0; - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) - if (x + j >= left && x + j < right) - cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 
1 : 0x10000; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - } - } - // determine the consensus - for (i = 0; i < right - left; ++i) r[i] = ref0[i]; - max = max2 = 0; max_i = max2_i = -1; - for (i = 0; i < right - left; ++i) { - if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; - else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; - } - if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; - if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; - if (max_i >= 0) r[max_i] = 15; - if (max2_i >= 0) r[max2_i] = 15; - //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); - } - free(ref0); free(cns); - } - { // the length of the homopolymer run around the current position - int c = seq_nt16_table[(int)ref[pos + 1]]; - if (c == 15) l_run = 1; - else { - for (i = pos + 2; ref[i]; ++i) - if (seq_nt16_table[(int)ref[i]] != c) break; - l_run = i; - for (i = pos; i >= 0; --i) - if (seq_nt16_table[(int)ref[i]] != c) break; - l_run -= i + 1; - } - } - // construct the consensus sequence + ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right); + + // The length of the homopolymer run around the current position + l_run = bcf_cgp_l_run(ref, pos); + + // construct the consensus sequence (minus indels, which are added later) max_ins = types[n_types - 1]; // max_ins is at least 0 if (max_ins > 0) { - int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int)); - // count the number of occurrences of each base at each position for each type of insertion - for (t = 0; t < n_types; ++t) { - if (types[t] > 0) { - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - if (p->indel == types[t]) { - uint8_t *seq = bam_get_seq(p->b); - for (k = 1; k <= p->indel; ++k) { - int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; - assert(c<5); - ++inscns_aux[(t*max_ins+(k-1))*5 + c]; - } - } - } - } - } - } - // use the majority rule to construct the consensus - inscns = (char*) calloc(n_types * max_ins, 1); - for (t = 0; t < n_types; ++t) { - for (j = 0; j < types[t]; ++j) { - int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; - for (k = 0; k < 5; ++k) - if (ia[k] > max) - max = ia[k], max_k = k; - inscns[t*max_ins + j] = max? max_k : 4; - if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's - } - } - free(inscns_aux); + inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos, + types, n_types, max_ins, s); + if (!inscns) + return -1; } + // compute the likelihood given each type of indel for each read max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? 
max_ins : -types[0]); ref2 = (char*) calloc(max_ref2, 1); query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); - score1 = (int*) calloc(N * n_types, sizeof(int)); - score2 = (int*) calloc(N * n_types, sizeof(int)); + score = (int*) calloc(N * n_types, sizeof(int)); bca->indelreg = 0; + double nqual_over_60 = bca->nqual / 60.0; + for (t = 0; t < n_types; ++t) { int l, ir; - probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; - apf1.bw = apf2.bw = abs(types[t]) + 3; + // compute indelreg - if (types[t] == 0) ir = 0; - else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); - else ir = est_indelreg(pos, ref, -types[t], 0); - if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir); - // realignment + if (types[t] == 0) + ir = 0; + else if (types[t] > 0) + ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); + else + ir = est_indelreg(pos, ref, -types[t], 0); + + if (ir > bca->indelreg) + bca->indelreg = ir; + + // Identify max deletion length + int max_deletion = 0; + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + if (max_deletion < -p->indel) + max_deletion = -p->indel; + } + } + + // Realignment score, computed via BAQ for (s = K = 0; s < n; ++s) { - // write ref2 + // Construct ref2 from ref_sample, inscns and indels. + // This is now the true sample consensus (possibly prepended + // and appended with reference if sample data doesn't span + // the full length). for (k = 0, j = left; j <= pos; ++j) ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - if (types[t] <= 0) j += -types[t]; - else for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; + + if (types[t] <= 0) + j += -types[t]; + else + for (l = 0; l < types[t]; ++l) + ref2[k++] = inscns[t*max_ins + l]; + for (; j < right && ref[j]; ++j) ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - for (; k < max_ref2; ++k) ref2[k] = 4; - if (j < right) right = j; + for (; k < max_ref2; ++k) + ref2[k] = 4; + + if (right > j) + right = j; + // align each read to ref2 for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; - int qbeg, qend, tbeg, tend, sc, kk; + + // Some basic ref vs alt stats. + int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual; + imq *= nqual_over_60; + + int sc_len, slen, epos, sc_end; + + // Only need to gather stats on one type, as it's + // identical calculation for all the subsequent ones + // and we're sharing the same stats array + if (t == 0) { + // Gather stats for INFO field to aid filtering. + // mq and sc_len not very helpful for filtering, but could + // help in assigning a better QUAL value. + // + // Pos is slightly useful. + // Base qual can be useful, but need qual prior to BAQ? + // May need to cache orig quals in aux tag so we can fetch + // them even after mpileup step. + get_pos(bca, p, &sc_len, &slen, &epos, &sc_end); + + assert(imq >= 0 && imq < bca->nqual); + assert(epos >= 0 && epos < bca->npos); + assert(sc_len >= 0 && sc_len < 100); + if (p->indel) { + bca->ialt_mq[imq]++; + bca->ialt_scl[sc_len]++; + bca->ialt_pos[epos]++; + } else { + bca->iref_mq[imq]++; + bca->iref_scl[sc_len]++; + bca->iref_pos[epos]++; + } + } + + int qbeg, qpos, qend, tbeg, tend, kk; uint8_t *seq = bam_get_seq(p->b); uint32_t *cigar = bam_get_cigar(p->b); - if (p->b->core.flag&4) continue; // unmapped reads - // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. 
+ if (p->b->core.flag & BAM_FUNMAP) continue; + + // FIXME: the following loop should be better moved outside; + // nonetheless, realignment should be much slower anyway. for (kk = 0; kk < p->b->core.n_cigar; ++kk) - if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; - if (kk < p->b->core.n_cigar) continue; - // FIXME: the following skips soft clips, but using them may be more sensitive. + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) + break; + if (kk < p->b->core.n_cigar) + continue; + // determine the start and end of sequences for alignment - qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); - qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); + // FIXME: loops over CIGAR multiple times + int left2 = left, right2 = right; + if (p->b->core.l_qseq > 1000) { + // long read data needs less context. It also tends to + // have many more candidate indels to investigate so + // speed here matters more. + if (pos - left >= INDEL_WINDOW_SIZE) + left2 += INDEL_WINDOW_SIZE/2; + if (right-pos >= INDEL_WINDOW_SIZE) + right2 -= INDEL_WINDOW_SIZE/2; + } + + int r_start = p->b->core.pos; + int r_end = bam_cigar2rlen(p->b->core.n_cigar, + bam_get_cigar(p->b)) + -1 + r_start; + + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2, + 0, &tbeg); + qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos, + 0, &tend) - qbeg; + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2, + 1, &tend); + if (types[t] < 0) { int l = -types[t]; tbeg = tbeg - l > left? tbeg - l : left; } + // write the query sequence for (l = qbeg; l < qend; ++l) query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; - { // do realignment; this is the bottleneck - const uint8_t *qual = bam_get_qual(p->b), *bq; - uint8_t *qq; - qq = (uint8_t*) calloc(qend - qbeg, 1); - bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); - if (bq) ++bq; // skip type - for (l = qbeg; l < qend; ++l) { - qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; - if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; - if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; - } - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below - if (l > 255) l = 255; - score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; - if (sc > 5) { - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); - if (l > 255) l = 255; - score2[K*n_types + t] = sc<<8 | l; + + // A fudge for now. Consider checking SAM header for + // RG platform field. + int long_read = p->b->core.l_qseq > 1000; + + // do realignment; this is the bottleneck + if (tend > tbeg) { + if (bcf_cgp_align_score(p, bca, types[t], + (uint8_t *)ref2 + left2-left, + (uint8_t *)query, + r_start, r_end, long_read, + tbeg, tend, left2, right2, + qbeg, qend, qpos, max_deletion, + &score[K*n_types + t]) < 0) { + score[K*n_types + t] = 0xffffff; + return -1; } - free(qq); + } else { + // place holder large cost for reads that cover the + // region entirely within a deletion (thus tend < tbeg). 
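+ // (0xffffff is the same sentinel used on alignment failure in + // bcf_cgp_align_score)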
+ score[K*n_types + t] = 0xffffff; } #if 0 for (l = 0; l < tend - tbeg + abs(types[t]); ++l) fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr); + for (l = 0; l < qend - qbeg; ++l) + fputc("ACGTN"[(int)query[l]], stderr); fputc('\n', stderr); - fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc); + fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s " + "qbeg=%d tbeg=%d score=%d\n", + pos, types[t], s, i, bam_get_qname(p->b), + qbeg, tbeg, sc); #endif } } } - free(ref2); free(query); - { // compute indelQ - int sc_a[16], sumq_a[16]; - int tmp, *sc = sc_a, *sumq = sumq_a; - if (n_types > 16) { - sc = (int *)malloc(n_types * sizeof(int)); - sumq = (int *)malloc(n_types * sizeof(int)); - } - memset(sumq, 0, n_types * sizeof(int)); - for (s = K = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - /* errmod_cal() assumes that if the call is wrong, the - * likelihoods of other events are equal. This is about - * right for substitutions, but is not desired for - * indels. To reuse errmod_cal(), I have to make - * compromise for multi-allelic indels. - */ - if ((sc[0]&0x3f) == ref_type) { - indelQ1 = (sc[1]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ1 = (sc[t]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); - } - tmp = sc[0]>>6 & 0xff; - indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ - sct = &score2[K*n_types]; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - if ((sc[0]&0x3f) == ref_type) { - indelQ2 = (sc[1]>>14) - (sc[0]>>14); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ2 = (sc[t]>>14) - (sc[0]>>14); - } - tmp = sc[0]>>6 & 0xff; - indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); - // pick the smaller between indelQ1 and indelQ2 - indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; - if (indelQ > 255) indelQ = 255; - if (seqQ > 255) seqQ = 255; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? 
indelQ : seqQ; -// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); - } - } - // determine bca->indel_types[] and bca->inscns - bca->maxins = max_ins; - bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); - for (t = 0; t < n_types; ++t) - sumq[t] = sumq[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) - tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sumq[t]&0x3f) == ref_type) break; - if (t) { // then move the reference type to the first - tmp = sumq[t]; - for (; t > 0; --t) sumq[t] = sumq[t-1]; - sumq[0] = tmp; - } - for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; - for (t = 0; t < 4 && t < n_types; ++t) { - bca->indel_types[t] = types[sumq[t]&0x3f]; - memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); - } - // update p->aux - for (s = n_alt = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - int x = types[p->aux>>16&0x3f]; - for (j = 0; j < 4; ++j) - if (x == bca->indel_types[j]) break; - p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); - if ((p->aux>>16&0x3f) > 0) ++n_alt; - //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); - } - } - if (sc != sc_a) free(sc); - if (sumq != sumq_a) free(sumq); - } - free(score1); free(score2); + // compute indelQ + n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins, + ref_type, types, n_types, score); + // free - for (i = 0; i < n; ++i) free(ref_sample[i]); + free(ref2); + free(query); + free(score); + + for (i = 0; i < n; ++i) + free(ref_sample[i]); + free(ref_sample); free(types); free(inscns); + return n_alt > 0? 0 : -1; } diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 61166fd19..1b18b9c13 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1786,6 +1786,19 @@ multiple regions and many alignment files are processed. being generated from the mapped position, the new mapping quality is about sqrt((INT-q)/INT)*INT. A zero value (the default) disables this functionality. +*-D, --full-BAQ*:: + Run the BAQ algorithm on all reads, not just those in problematic + regions. This matches the behaviour for Bcftools 1.12 and earlier. + + By default mpileup uses heuristics to decide when to apply the BAQ + algorithm. Most sequences will not be BAQ adjusted, giving a CPU + time closer to --no-BAQ, but it will still be applied in regions + with suspected problematic alignments. This has been tested to + work well on single sample data with even allele frequency, but + the reliability is unknown for multi-sample calling and for low + allele frequency variants so full BAQ is still recommended in + those scenarios. + *-d, --max-depth* 'INT':: At a position, read maximally 'INT' reads per input file. Note that the original *samtools mpileup* command had a minimum value of '8000/n' @@ -1836,6 +1849,12 @@ multiple regions and many alignment files are processed. *-Q, --min-BQ* 'INT':: Minimum base quality for a base to be considered [13] +* --max-BQ* 'INT':: + Caps the base quality to a maximum value [60]. This can be + particularly useful on technologies that produce overly optimistic + high qualities, leading to too many false positives or incorrect + genotype assignments. 
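As a minimal illustrative sketch (not the mpileup source), the cap that --max-BQ describes behaves like a simple clamp applied to each base quality before it is used in the likelihood computation:

    #include <stdio.h>

    /* Hypothetical helper: clamp a Phred base quality to max_baseQ. */
    static inline int cap_baseQ(int q, int max_baseQ) {
        return q > max_baseQ ? max_baseQ : q;
    }

    int main(void) {
        int quals[] = { 13, 37, 70, 93 };  /* example Phred values */
        int i;
        for (i = 0; i < 4; i++)
            printf("%d -> %d\n", quals[i], cap_baseQ(quals[i], 60));
        return 0;  /* prints 13, 37, 60, 60 with the default cap of 60 */
    }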
+ *-r, --regions* 'CHR'|'CHR:POS'|'CHR:FROM-TO'|'CHR:FROM-'[,...]:: Only generate mpileup output in given regions. Requires the alignment files to be indexed. If used in conjunction with -l then considers the intersection; @@ -1927,8 +1946,31 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for *--threads* 'INT':: see *<>* +*-U, --mwu-u*:: + Use the previous Mann-Whitney U test score from version 1.12 and + earlier. This is a probability score, but importantly it folds + probabilities above or below the desired score into the same P. + The new Mann-Whitney U test score is a "Z score", expressing the + score as the number of standard deviations away from the mean (with + zero matching the mean). It keeps both positive and + negative values. This can be important for some tests where + errors are asymmetric. + + This option changes the INFO field names produced back to the ones + used by the earlier Bcftools releases. For example, BQBZ becomes + BQB. ==== Options for SNP/INDEL genotype likelihood computation +*-X, --config* 'STR':: + Specify a platform-specific configuration profile. The profile + should be one of '1.12', 'illumina', 'ont' or 'pacbio-ccs'. + Settings applied are as follows: + + 1.12 -Q13 -h100 -m1 + illumina [ default values ] + ont -B -Q5 --max-BQ 30 -I + pacbio-ccs -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999 + *-e, --ext-prob* 'INT':: Phred-scaled gap extension sequencing error probability. Reducing 'INT' leads to longer indels [20] @@ -1938,7 +1980,25 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for *-h, --tandem-qual* 'INT':: Coefficient for modeling homopolymer errors. Given an 'l'-long homopolymer - run, the sequencing error of an indel of size s is modeled as 'INT'*s/l [100] + run, the sequencing error of an indel of size s is modeled as 'INT'*s/l [500] + Increasing this informs the caller that indels in long + homopolymers are more likely genuine and less likely to be + sequencing artifacts. Hence increasing tandem-qual gives + higher recall and lower precision. Bcftools 1.12 and earlier had + a default of 100, which was tuned around more error-prone instruments. + Note that changing this may have a minor impact on SNP calling too. + For maximum SNP calling accuracy, it may be preferable to adjust + this lower again, although this will adversely affect indels. + +*--indel-bias* 'FLOAT':: + Skews the indel scores up or down, trading recall (low + false-negative) vs precision (low false-positive) [1.0]. In Bcftools + 1.12 and earlier this parameter didn't exist, but had an implied + value of 1.0. If you are planning to do heavy filtering of + variants, selecting the best quality ones only (favouring + precision over recall), it is advisable to set this lower (such as + 0.75). Higher depth samples, or cases where you favour recall + over precision, may work better with a higher value such as 2.0. *-I, --skip-indels*:: Do not perform INDEL calling @@ -1949,6 +2009,14 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for *-m, --min-ireads* 'INT':: Minimum number gapped reads for indel candidates 'INT' [1] +*-M, --max-read-len* 'INT':: + The maximum read length permitted by the BAQ algorithm [500]. + Variants are still called on longer reads, but they will not be + passed through the BAQ method. This limit exists to prevent + excessively long BAQ times and high memory usage.
Note that unless + full BAQ is enabled with '-D', raising this parameter is unlikely + to have a significant CPU cost. + *-o, --open-prob* 'INT':: Phred-scaled gap open sequencing error probability. Reducing 'INT' leads to more indel calls. (The same short option is used for both *--open-prob* diff --git a/filter.c b/filter.c index ea0fb99d6..9bce19f1c 100644 --- a/filter.c +++ b/filter.c @@ -1917,7 +1917,8 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) for (i=0; i<rtok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i]; for (i=0; i<rtok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i]; } - memset(rtok->pass_samples, 0, rtok->nsamples); + if (rtok->nsamples) + memset(rtok->pass_samples, 0, rtok->nsamples); } #define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \ diff --git a/mpileup.c b/mpileup.c index 0bfb72495..4e80a399c 100644 --- a/mpileup.c +++ b/mpileup.c @@ -59,16 +59,19 @@ DEALINGS IN THE SOFTWARE. */ #define MPLP_PRINT_MAPQ (1<<10) #define MPLP_PER_SAMPLE (1<<11) #define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_REALN_PARTIAL (1<<13) typedef struct _mplp_aux_t mplp_aux_t; typedef struct _mplp_pileup_t mplp_pileup_t; // Data shared by all bam files typedef struct { - int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, + max_indel_depth, max_read_len, fmt_flag; int rflag_require, rflag_filter, output_type; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels + double indel_bias; char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads; faidx_t *fai; @@ -231,7 +234,46 @@ static int mplp_func(void *data, bam1_t *b) has_ref = 0; } - if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); + // Allow sufficient room for bam_aux_append of ZQ tag without + // a realloc and consequent breakage of pileup's cached pointers. + if (has_ref && (ma->conf->flag & MPLP_REALN) && !bam_aux_get(b, "ZQ")) { + // Doing sam_prob_realn later is problematic as it adds to + // the tag list (ZQ or BQ), which causes a realloc of b->data. + // This happens after pileup has built a hash table on the + // read name. It's a deficiency in pileup IMO. + + // We could implement a new sam_prob_realn that returns ZQ + // somewhere else and cache it ourselves (pileup clientdata), + // but for now we simply use a workaround. + // + // We create a fake tag of the correct length, which we remove + // just prior to calling sam_prob_realn so we can guarantee there is + // room. (We can't just make room now as bam_copy1 removes it + // again).
+ if (b->core.l_qseq > 500) { + uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1); + memset(ZQ, '@', b->core.l_qseq); + ZQ[b->core.l_qseq] = 0; + bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ); + free(ZQ); + } else { + static uint8_t ZQ[501] = + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"; + ZQ[b->core.l_qseq] = 0; + bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ); + ZQ[b->core.l_qseq] = '@'; + } + } + if (has_ref && ma->conf->capQ_thres > 10) { int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres); if (q < 0) continue; // skip @@ -257,18 +299,46 @@ static int mplp_func(void *data, bam1_t *b) static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { mplp_aux_t *ma = (mplp_aux_t *)data; - cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1; - if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) ) - { - int i; - for (i=0; i<b->core.n_cigar; i++) - { - int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; - if ( cig!=BAM_CSOFT_CLIP ) continue; - cd->i |= 1; + int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); + cd->i = 0; + PLP_SET_SAMPLE_ID(cd->i, n); + // Whether read has a soft-clip is used in mplp_realn's heuristics. + // TODO: consider whether clip length is beneficial to use? + int i; + for (i=0; i<b->core.n_cigar; i++) { + int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; + if (cig == BAM_CSOFT_CLIP) { + PLP_SET_SOFT_CLIP(cd->i); break; } } + + if (ma->conf->flag & MPLP_REALN) { + int i, tot_ins = 0; + uint32_t *cigar = bam_get_cigar(b); + int p = 0; + for (i=0; i<b->core.n_cigar; i++) { + int cig = cigar[i] & BAM_CIGAR_MASK; + if (bam_cigar_type(cig) & 2) + p += cigar[i] >> BAM_CIGAR_SHIFT; + if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) { + tot_ins += cigar[i] >> BAM_CIGAR_SHIFT; + // Possible further optimisation: check tot_ins==1 later + // (and remove break) so we can detect single bp indels. + // We may want to focus BAQ on more complex regions only. + PLP_SET_INDEL(cd->i); + break; + } + + // TODO: proper p->cd struct and have cd->i as a size rather + // than a flag. + + // Then aggregate together the sizes and if just 1 size for all + // reads or 2 sizes for approx 50/50 split in all reads, then + // treat this as a well-aligned variant and don't run BAQ. + } + } + return 0; } @@ -317,6 +387,150 @@ static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bc if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); } +/* + * Looks for an indel at this position. + * + * Only reads that overlap an indel locus get realigned. This considerably + * reduces the cost of running BAQ while keeping the main benefits. + * + * TODO: also consider only realigning reads that don't span the indel + * by more than a certain amount either side, i.e. focus BAQ only on reads + * ending adjacent to the indel, where the alignment is most likely to + * be wrong.
(2nd TODO: do this based on sequence context; STRs bad, unique + * data good.) + * + * NB: this may sadly realign after we've already used the data. Hmm... + */ +static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, + int flag, int max_read_len, + char *ref, int ref_len, int pos) { + int i, j, has_indel = 0, has_clip = 0, nt = 0; + int min_indel = INT_MAX, max_indel = INT_MIN; + + // Is an indel present? + // NB: don't bother even checking if very long as almost guaranteed + // to have indel (and likely soft-clips too). + for (i = 0; i < n; i++) { // iterate over bams + nt += n_plp[i]; + for (j = 0; j < n_plp[i]; j++) { // iterate over reads + bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j; + has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0; + // has_clip is almost always true for very long reads + // (eg PacBio CCS), but these rarely matter as the clip + // is likely a long way from this indel. + has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0; + if (max_indel < p->indel) + max_indel = p->indel; + if (min_indel > p->indel) + min_indel = p->indel; + } + } + + if (flag & MPLP_REALN_PARTIAL) { + if (has_indel == 0 || + (has_clip < 0.2*nt && max_indel == min_indel && + (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1))) + return; + } + + // Realign + for (i = 0; i < n; i++) { // iterate over bams + for (j = 0; j < n_plp[i]; j++) { // iterate over reads + const bam_pileup1_t *p = plp[i] + j; + bam1_t *b = p->b; + + // Avoid doing this multiple times. + // + // Note we cannot modify p->cd.i here with a PLP_SET macro + // because the cd item is held by mpileup in an lbnode_t + // struct and copied over to the pileup struct for each + // iteration, essentially making p->cd.i read only. + // + // We could use our own structure (p->cd.p), allocated during + // the constructor, but for simplicity we play dirty and + // abuse an unused flag bit instead. + if (b->core.flag & 32768) + continue; + b->core.flag |= 32768; + + if (b->core.l_qseq > max_read_len) + continue; + + // Check p->cigar_ind and see what cigar elements are before + // and after. How close is this location to the end of the + // read? Only realign if we don't span by more than X bases. + // + // Again, best only done on deeper data as BAQ helps + // disproportionately more on shallow data sets. + // + // This rescues some of the false negatives that are caused by + // systematic reduction in quality due to sample vs ref alignment. + +// At deep coverage we skip realigning more reads as we have sufficient depth. +// This rescues some false negatives. At shallow depth we pay for this with +// more FP, so we are more stringent on spanning size. +#define REALN_DIST (40+10*(nt<40)+10*(nt<20)) + uint32_t *cig = bam_get_cigar(b); + int ncig = b->core.n_cigar; + + // Don't realign reads where indel is in middle? + // On long read data we don't care about soft-clips at the ends. + // For short read data, we always calc BAQ on these as they're + // a common source of false positives. + if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) { + // Left & right cigar op match.
+ int lr = b->core.l_qseq > 500; + int lm = 0, rm = 0, k; + for (k = 0; k < ncig; k++) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + lm += bam_cigar_oplen(cig[k]); + else + break; + } + + for (k = ncig-1; k >= 0; k--) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + rm += bam_cigar_oplen(cig[k]); + else + break; + } + + if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) + continue; + + if (lm >= REALN_DIST && rm >= REALN_DIST && + has_clip < (0.15+0.05*(nt>20))*nt) + continue; + } + + if (b->core.l_qseq > 500) { + // don't do BAQ on long-read data if it's going to + // cause a large band-width, which is costly in CPU + int rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + if (abs(rl - b->core.l_qseq) * b->core.l_qseq >= 500000) + continue; + } + + // Fudge: make room for ZQ tag. + uint8_t *_Q = bam_aux_get(b, "_Q"); + if (_Q) bam_aux_del(b, _Q); + sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3); + } + } + + return; +} + static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) { bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list @@ -333,7 +547,10 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) if ( !conf->bed_logic ) overlap = overlap ? 0 : 1; if ( !overlap ) continue; } - mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); + int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); + if (has_ref && (conf->flag & MPLP_REALN)) + mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, + conf->max_read_len, ref, ref_len, pos); int total_depth, _ref0, ref16; for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i]; @@ -346,15 +563,16 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) conf->bc.tid = tid; conf->bc.pos = pos; bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc); bcf_clear1(conf->bcf_rec); - bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, + conf->bca, 0); flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
// check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth - && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0) + && (bcf_callaux_clean(conf->bca, &conf->bc), + bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)) { - bcf_callaux_clean(conf->bca, &conf->bc); for (i = 0; i < conf->gplp->n; ++i) bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) @@ -546,11 +764,24 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##INFO="); if ( conf->fmt_flag&B2B_INFO_VDB ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_RPB ) - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + + if (conf->fmt_flag & B2B_INFO_ZSCORE) { + if ( conf->fmt_flag&B2B_INFO_RPB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_SCB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + } else { + if ( conf->fmt_flag&B2B_INFO_RPB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + } + + bcf_hdr_append(conf->bcf_hdr,"##INFO="); #if CDF_MWU_TESTS bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); @@ -601,9 +832,11 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]); if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); - conf->bca = bcf_call_init(-1., conf->min_baseQ); + conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ, + conf->delta_baseQ); conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; + conf->bca->indel_bias = conf->indel_bias; conf->bca->min_frac = conf->min_frac; conf->bca->min_support = conf->min_support; conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; @@ -808,6 +1041,7 @@ int parse_format_flag(const char *str) else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR; + else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB; else { fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str); @@ -864,6 +1098,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -b, --bam-list FILE list of input BAM filenames, one per line\n" " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" " -C, --adjust-MQ INT adjust mapping quality [0]\n" +" -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); fprintf(fp, " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" @@ -874,6 +1109,10 @@ static void print_usage(FILE *fp, const mplp_conf_t 
*mplp) fprintf(fp, " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); fprintf(fp, +" --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); + fprintf(fp, +" --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ); + fprintf(fp, " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" " -R, --regions-file FILE restrict to regions listed in a file\n" " --ignore-RG ignore RG tags (one BAM = one sample)\n" @@ -896,9 +1135,11 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -o, --output FILE write output to FILE [standard output]\n" " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" +" -U, --mwu-u use older probability scale for Mann-Whitney U test\n" " --threads INT use multithreading with INT worker threads [0]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" +" -X, --config STR Specify platform specific configuration profiles\n" " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); fprintf(fp, " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); @@ -910,12 +1151,22 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) fprintf(fp, " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); fprintf(fp, +" -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len); + fprintf(fp, " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); fprintf(fp, " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" -" -P, --platforms STR comma separated list of platforms for indels [all]\n" -"\n" +" -P, --platforms STR comma separated list of platforms for indels [all]\n"); + fprintf(fp, +" --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n" +"\n", mplp->indel_bias); + fprintf(fp, "Notes: Assuming diploid individuals.\n" +"--config STR values are equivalent to the following option combinations:\n" +" 1.12: -Q13 -h100 -m1\n" +" illumina: [ default values ]\n" +" ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" +" pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n" "\n" "Example:\n" " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" @@ -934,12 +1185,15 @@ int main_mpileup(int argc, char *argv[]) int nfiles = 0, use_orphan = 0, noref = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); - mplp.min_baseQ = 13; + mplp.min_baseQ = 1; + mplp.max_baseQ = 60; + mplp.delta_baseQ = 30; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; - mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; - mplp.min_frac = 0.002; mplp.min_support = 1; - mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500; + mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL + | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; @@ -947,7 +1201,9 @@ int main_mpileup(int argc, char *argv[]) mplp.record_cmd_line = 1; mplp.n_threads = 0; mplp.bsmpl = bam_smpl_init(); - mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag() + // the 
default to be changed in future, see also parse_format_flag() + mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; + mplp.max_read_len = 500; static const struct option lopts[] = { @@ -968,6 +1224,8 @@ int main_mpileup(int argc, char *argv[]) {"bam-list", required_argument, NULL, 'b'}, {"no-BAQ", no_argument, NULL, 'B'}, {"no-baq", no_argument, NULL, 'B'}, + {"full-BAQ", no_argument, NULL, 'D'}, + {"full-baq", no_argument, NULL, 'D'}, {"adjust-MQ", required_argument, NULL, 'C'}, {"adjust-mq", required_argument, NULL, 'C'}, {"max-depth", required_argument, NULL, 'd'}, @@ -984,6 +1242,9 @@ int main_mpileup(int argc, char *argv[]) {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, + {"max-bq", required_argument, NULL, 11}, + {"max-BQ", required_argument, NULL, 11}, + {"delta-BQ", required_argument, NULL, 12}, {"ignore-overlaps", no_argument, NULL, 'x'}, {"output-type", required_argument, NULL, 'O'}, {"samples", required_argument, NULL, 's'}, @@ -991,16 +1252,20 @@ int main_mpileup(int argc, char *argv[]) {"annotate", required_argument, NULL, 'a'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, + {"indel-bias", required_argument, NULL, 10}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, - {"min-ireads ", required_argument, NULL, 'm'}, + {"min-ireads", required_argument, NULL, 'm'}, {"per-sample-mF", no_argument, NULL, 'p'}, {"per-sample-mf", no_argument, NULL, 'p'}, {"platforms", required_argument, NULL, 'P'}, + {"max-read-len", required_argument, NULL, 'M'}, + {"config", required_argument, NULL, 'X'}, + {"mwu-u", no_argument, NULL, 'U'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -1052,6 +1317,7 @@ int main_mpileup(int argc, char *argv[]) case 'P': mplp.pl_list = strdup(optarg); break; case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'B': mplp.flag &= ~MPLP_REALN; break; + case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; @@ -1069,6 +1335,8 @@ int main_mpileup(int argc, char *argv[]) case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; + case 11: mplp.max_baseQ = atoi(optarg); break; + case 12: mplp.delta_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': { char *end; @@ -1080,6 +1348,12 @@ int main_mpileup(int argc, char *argv[]) break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; + case 10: // --indel-bias (inverted so higher => more indels called) + if (atof(optarg) < 1e-2) + mplp.indel_bias = 1/1e2; + else + mplp.indel_bias = 1/atof(optarg); + break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1092,6 +1366,40 @@ int main_mpileup(int argc, char *argv[]) } mplp.fmt_flag |= parse_format_flag(optarg); break; + case 'M': mplp.max_read_len = atoi(optarg); break; + case 'U': mplp.fmt_flag &= 
~B2B_INFO_ZSCORE; break; + case 'X': + if (strcasecmp(optarg, "pacbio-ccs") == 0) { + mplp.min_frac = 0.1; + mplp.min_baseQ = 5; + mplp.max_baseQ = 50; + mplp.delta_baseQ = 10; + mplp.openQ = 25; + mplp.extQ = 1; + mplp.flag &= ~MPLP_REALN_PARTIAL; // -D, full BAQ + mplp.max_read_len = 99999; + } else if (strcasecmp(optarg, "ont") == 0) { + fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " + "a higher -P, eg -P0.01 or -P 0.1\n"); + mplp.min_baseQ = 5; + mplp.max_baseQ = 30; + mplp.flag &= ~MPLP_REALN; + mplp.flag |= MPLP_NO_INDEL; + } else if (strcasecmp(optarg, "1.12") == 0) { + // 1.12 and earlier + mplp.min_frac = 0.05; + mplp.min_support = 1; + mplp.min_baseQ = 13; + mplp.tandemQ = 100; + } else if (strcasecmp(optarg, "illumina") == 0) { + mplp.flag |= MPLP_REALN_PARTIAL; + } else { + fprintf(stderr, "Unknown configuration name '%s'\n" + "Please choose from 1.12, illumina, pacbio-ccs or ont\n", + optarg); + return 1; + } + break; default: fprintf(stderr,"Invalid option: '%c'\n", c); return 1; @@ -1154,5 +1462,6 @@ int main_mpileup(int argc, char *argv[]) if (mplp.bed_itr) regitr_destroy(mplp.bed_itr); if (mplp.reg) regidx_destroy(mplp.reg); bam_smpl_destroy(mplp.bsmpl); + return ret; } diff --git a/str_finder.c b/str_finder.c new file mode 100644 index 000000000..800cbfef9 --- /dev/null +++ b/str_finder.c @@ -0,0 +1,270 @@ +/* str_finder.c -- Short Tandem Repeat finder. + Originally from Crumble (https://github.com/jkbonfield/crumble) + + Copyright (C) 2015-2016, 2021 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <ctype.h> + +#include "str_finder.h" +#include "utlist.h" + +#define MAX(a,b) ((a)>(b)?(a):(b)) +#define MIN(a,b) ((a)<(b)?(a):(b)) + +typedef unsigned char uc; + +static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen, + int lower_only, unsigned int w) { + rep_ele *el, *tmp, *prev; + char *cp1, *cp2, *cp_end; + int i; + + // Already handled this in previous overlap? + if (*list) { + tmp = DL_TAIL(*list); + if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos) + return; + } + + // Find current and last occurrence of repeated word. + + cp2 = &cons[pos+1]; + // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen]; + + for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads + if (*cp1 == '*') + continue; + else + i++; + while (*cp1 == '*') + cp1--; + + + // Scan ahead to see how much further it goes.
+ cp_end = &cons[clen]; + while (cp2 < cp_end) { + if (*cp1 != *cp2) + break; + + w<<=2; + w|=*cp2; + cp1++; + cp2++; + } + + if (!(el = malloc(sizeof(*el)))) + return; + + el->end = pos + cp2-&cons[pos+1]; + el->rep_len = rlen; + pos++; + while (rlen--) { + while (cons[--pos] == '*'); + while (cons[--pos] == '*'); + } + //pos++; + while (pos > 1 && cons[pos-1] == '*') pos--; + el->start = pos; + + // Check it meets the lower-case only criteria + if (lower_only) { + int lc = 0; + for (i = el->start; i <= el->end; i++) { + if (islower(cons[i])) { + lc = 1; + break; + } + } + + if (!lc) { + free(el); + return; + } + } + + // Remove any older items on the list that are entirely contained within el + if (*list) { + tmp = DL_TAIL(*list); + do { + prev = tmp->prev; + if (tmp->end < el->start) + break; + + if (tmp->start >= el->start) { + DL_DELETE(*list, tmp); + free(tmp); + } + + if (tmp == DL_HEAD(*list)) + break; + tmp = prev; + } while (*list); + } + + DL_APPEND(*list, el); + + return; +} + +/* + * Finds short tandem repeats with repeat units of up to 8-mers. + * Note this assumes cons is 0-3, so N of 4 may rarely give false hits. + * + * Returns a list of rep_ele structs holding the start,end tuples of repeats; + * NULL on failure. + */ +rep_ele *find_STR(char *cons, int len, int lower_only) { + int i, j; + uint32_t w = 0; + rep_ele *reps = NULL; + + for (i = j = 0; i < len && j < 15; i++) { + if (cons[i] == '*') continue; + + w <<= 2; + w |= cons[i]; + //printf("%3d %c w=%08x\n", i, cons[i], w); + if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003)) + add_rep(&reps, cons, len, i, 1, lower_only, w); + if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f)) + add_rep(&reps, cons, len, i, 2, lower_only, w); + if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f)) + add_rep(&reps, cons, len, i, 3, lower_only, w); + if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff)) + add_rep(&reps, cons, len, i, 4, lower_only, w); + if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff)) + add_rep(&reps, cons, len, i, 5, lower_only, w); + if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff)) + add_rep(&reps, cons, len, i, 6, lower_only, w); + if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff)) + add_rep(&reps, cons, len, i, 7, lower_only, w); + + j++; + } + + for (; i < len; i++) { + if (cons[i] == '*') continue; + + w <<= 2; + w |= cons[i]; + //printf("%3d %c w=%08x\n", i, cons[i], w); + if ((w&0xffff) == ((w>>16)&0xffff)) + add_rep(&reps, cons, len, i, 8, lower_only, w); + else if ((w&0x3fff) == ((w>>14)&0x3fff)) + add_rep(&reps, cons, len, i, 7, lower_only, w); + else if ((w&0x0fff) == ((w>>12)&0x0fff)) + add_rep(&reps, cons, len, i, 6, lower_only, w); + else if ((w&0x03ff) == ((w>>10)&0x03ff)) + add_rep(&reps, cons, len, i, 5, lower_only, w); + else if ((w&0x00ff) == ((w>> 8)&0x00ff)) + add_rep(&reps, cons, len, i, 4, lower_only, w); + else if ((w&0x003f) == ((w>> 6)&0x003f)) + add_rep(&reps, cons, len, i, 3, lower_only, w); + else if ((w&0x000f) == ((w>> 4)&0x000f)) + add_rep(&reps, cons, len, i, 2, lower_only, w); + else if ((w&0x0003) == ((w>> 2)&0x0003)) + add_rep(&reps, cons, len, i, 1, lower_only, w); + } + + return reps; +} + +/* ----------------------------------------------------------------------------- + * Computes repeat regions in the consensus and then provides a bit mask + * indicating the extent of the STRs. + * + * The purpose of this is to identify where a read needs to span the entire + * region in order to validate how many copies of a repeat word are present. + * This only really has a major impact when indels are involved.
+ * + * For example, given this multiple alignment: + * + * S1 GATCGGACGAGAG + * S2 GATCGGACGAGAGAGAGAGAGT + * S3 GATCGGACGAGAGAGAGAG**TCGGAC + * S4 GGACGAGAGAGAGAGAGTCGGAC + * S5 CGAGAGAGAGAG**TCGGAC + * S6 AGAGAGAGTCGGAC + * + * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last + * (S1 and S6) sequences do not span and so we do not know which allele they + * match. Specifically as the pad is at the right hand end, the alignment of + * S6 gives incorrect weight to the consensus as it is stating AG when it + * may actually be ** at that point. + * + * By identifying the repeats we can soft clip as follows: + * + * S1 GATCGGACgagag + * S2 GATCGGACGAGAGAGAGAGAGT + * S3 GATCGGACGAGAGAGAGAG**TCGGAC + * S4 GGACGAGAGAGAGAGAGTCGGAC + * S5 CGAGAGAGAGAG**TCGGAC + * S6 agagagagTCGGAC + * + * Returns an array of STR vs no-STR values. + * 0 => non repetitive. + * 1+ => repeat with consecutive bit-number for repeat size. + * + * Eg: AGGGGAGGAGAAGAC + * 1111 1111 + * 2222222 + * 444444 + * => 011331137754440 + */ +char *cons_mark_STR(char *cons, int len, int lower_only) { + rep_ele *reps, *elt, *tmp; + char *str; + + str = calloc(1, len); + reps = find_STR(cons, len, lower_only); + + DL_FOREACH_SAFE(reps, elt, tmp) { + int i, v = 0; + + //printf("%2d .. %2d %.*s\n", elt->start, elt->end, + // elt->end - elt->start+1, &cons[elt->start]); + + // What is there? + for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++) + v |= str[i]; + + for (i = 0; i < 8; i++) { + if (!(v&(1<<i))) + break; + } + v = 1<<i; + + for (i = elt->start; i <= elt->end; i++) + str[i] |= v; + + DL_DELETE(reps, elt); + free(elt); + } + + return str; +} diff --git a/str_finder.h b/str_finder.h new file mode 100644 index 000000000..242f59ec1 --- /dev/null +++ b/str_finder.h @@ -0,0 +1,64 @@ +/* str_finder.h -- Short Tandem Repeat finder. + Originally from Crumble (https://github.com/jkbonfield/crumble) + + Copyright (C) 2015-2016, 2021 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef _STR_FINDER_H_ +#define _STR_FINDER_H_ + +#include "utlist.h" + +typedef struct rep_ele { + int start, end, rep_len; + struct rep_ele *prev; + struct rep_ele *next; +} rep_ele; + +/* + * Finds short tandem repeats with repeat units of up to 8-mers. + * + * If lower_only is true then it only adds STRs for regions that + * contain at least one lower-case base. This can be used as a marker + * for looking for specific types of repeats. + * (One use for this is to only mark STRs that overlap a heterozygous + * indel region.)
+ * + * Returns a list of rep_ele structs holding the start,end tuples of repeats; + * NULL on failure. + */ +rep_ele *find_STR(char *cons, int len, int lower_only); + +/* + * Returns an array of STR vs no-STR values. + * 0 => non repetitive. + * 1+ => repeat with consecutive bit-number for repeat size. + * + * Eg: AGGGGAGGAGAAGAC + * 1111 1111 + * 2222222 + * 444444 + * => 011331137754440 + */ +char *cons_mark_STR(char *cons, int len, int lower_only); + +#endif /* _STR_FINDER_H_ */ diff --git a/utlist.h b/utlist.h new file mode 100644 index 000000000..28cf8a381 --- /dev/null +++ b/utlist.h @@ -0,0 +1,761 @@ +/* +Copyright (c) 2007-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef UTLIST_H +#define UTLIST_H + +#define UTLIST_VERSION 1.9.9 + +#include <assert.h> + +/* + * This file contains macros to manipulate singly and doubly-linked lists. + * + * 1. LL_ macros: singly-linked lists. + * 2. DL_ macros: doubly-linked lists. + * 3. CDL_ macros: circular doubly-linked lists. + * + * To use singly-linked lists, your structure must have a "next" pointer. + * To use doubly-linked lists, your structure must have "prev" and "next" pointers. + * Either way, the pointer to the head of the list must be initialized to NULL. + * + * ----------------.EXAMPLE ------------------------- + * struct item { + * int id; + * struct item *prev, *next; + * } + * + * struct item *list = NULL; + * + * int main() { + * struct item *item; + * ... allocate and populate item ... + * DL_APPEND(list, item); + * } + * -------------------------------------------------- + * + * For doubly-linked lists, the append and delete macros are O(1) + * For singly-linked lists, append and delete are O(n) but prepend is O(1) + * The sort macro is O(n log(n)) for all types of single/double/circular lists. + */ + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ code), this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds.
*/ +#ifdef _MSC_VER /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define LDECLTYPE(x) decltype(x) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#define LDECLTYPE(x) char* +#endif +#elif defined(__ICCARM__) +#define NO_DECLTYPE +#define LDECLTYPE(x) char* +#else /* GNU, Sun and other compilers */ +#define LDECLTYPE(x) __typeof(x) +#endif + +/* for VS2008 we use some workarounds to get around the lack of decltype, + * namely, we always reassign our tmp variable to the list head if we need + * to dereference its prev/next pointers, and save/restore the real head.*/ +#ifdef NO_DECLTYPE +#define _SV(elt,list) _tmp = (char*)(list); {char **_alias = (char**)&(list); *_alias = (elt); } +#define _NEXT(elt,list,next) ((char*)((list)->next)) +#define _NEXTASGN(elt,list,to,next) { char **_alias = (char**)&((list)->next); *_alias=(char*)(to); } +/* #define _PREV(elt,list,prev) ((char*)((list)->prev)) */ +#define _PREVASGN(elt,list,to,prev) { char **_alias = (char**)&((list)->prev); *_alias=(char*)(to); } +#define _RS(list) { char **_alias = (char**)&(list); *_alias=_tmp; } +#define _CASTASGN(a,b) { char **_alias = (char**)&(a); *_alias=(char*)(b); } +#else +#define _SV(elt,list) +#define _NEXT(elt,list,next) ((elt)->next) +#define _NEXTASGN(elt,list,to,next) ((elt)->next)=(to) +/* #define _PREV(elt,list,prev) ((elt)->prev) */ +#define _PREVASGN(elt,list,to,prev) ((elt)->prev)=(to) +#define _RS(list) +#define _CASTASGN(a,b) (a)=(b) +#endif + +/****************************************************************************** + * The sort macro is an adaptation of Simon Tatham's O(n log(n)) mergesort * + * Unwieldy variable names used here to avoid shadowing passed-in variables. * + *****************************************************************************/ +#define LL_SORT(list, cmp) \ + LL_SORT2(list, cmp, next) + +#define LL_SORT2(list, cmp, next) \ +do { \ + LDECLTYPE(list) _ls_p; \ + LDECLTYPE(list) _ls_q; \ + LDECLTYPE(list) _ls_e; \ + LDECLTYPE(list) _ls_tail; \ + int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ + if (list) { \ + _ls_insize = 1; \ + _ls_looping = 1; \ + while (_ls_looping) { \ + _CASTASGN(_ls_p,list); \ + list = NULL; \ + _ls_tail = NULL; \ + _ls_nmerges = 0; \ + while (_ls_p) { \ + _ls_nmerges++; \ + _ls_q = _ls_p; \ + _ls_psize = 0; \ + for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ + _ls_psize++; \ + _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \ + if (!_ls_q) break; \ + } \ + _ls_qsize = _ls_insize; \ + while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \ + if (_ls_psize == 0) { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + } else if (_ls_qsize == 0 || !_ls_q) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + } else if (cmp(_ls_p,_ls_q) <= 0) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + } else { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + } \ + if (_ls_tail) { \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ + } else { \ + _CASTASGN(list,_ls_e); \ + } \ + _ls_tail = _ls_e; \ + } \ + _ls_p = _ls_q; \ + } \ + if (_ls_tail) { \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \ + } \ + if (_ls_nmerges <= 1) { \ + _ls_looping=0; \ + } \ + _ls_insize *= 2; \ + } \ + } \ +} while (0) + + +#define 
DL_SORT(list, cmp) \ + DL_SORT2(list, cmp, prev, next) + +#define DL_SORT2(list, cmp, prev, next) \ +do { \ + LDECLTYPE(list) _ls_p; \ + LDECLTYPE(list) _ls_q; \ + LDECLTYPE(list) _ls_e; \ + LDECLTYPE(list) _ls_tail; \ + int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ + if (list) { \ + _ls_insize = 1; \ + _ls_looping = 1; \ + while (_ls_looping) { \ + _CASTASGN(_ls_p,list); \ + list = NULL; \ + _ls_tail = NULL; \ + _ls_nmerges = 0; \ + while (_ls_p) { \ + _ls_nmerges++; \ + _ls_q = _ls_p; \ + _ls_psize = 0; \ + for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ + _ls_psize++; \ + _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \ + if (!_ls_q) break; \ + } \ + _ls_qsize = _ls_insize; \ + while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \ + if (_ls_psize == 0) { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + } else if (_ls_qsize == 0 || !_ls_q) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + } else if (cmp(_ls_p,_ls_q) <= 0) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + } else { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + } \ + if (_ls_tail) { \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ + } else { \ + _CASTASGN(list,_ls_e); \ + } \ + _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \ + _ls_tail = _ls_e; \ + } \ + _ls_p = _ls_q; \ + } \ + _CASTASGN(list->prev, _ls_tail); \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \ + if (_ls_nmerges <= 1) { \ + _ls_looping=0; \ + } \ + _ls_insize *= 2; \ + } \ + } \ +} while (0) + + +#define DL_HEAD(list) (list) +#define DL_TAIL(list) ((list) ? 
(list)->prev : NULL) + +#define CDL_SORT(list, cmp) \ + CDL_SORT2(list, cmp, prev, next) + +#define CDL_SORT2(list, cmp, prev, next) \ +do { \ + LDECLTYPE(list) _ls_p; \ + LDECLTYPE(list) _ls_q; \ + LDECLTYPE(list) _ls_e; \ + LDECLTYPE(list) _ls_tail; \ + LDECLTYPE(list) _ls_oldhead; \ + LDECLTYPE(list) _tmp; \ + int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ + if (list) { \ + _ls_insize = 1; \ + _ls_looping = 1; \ + while (_ls_looping) { \ + _CASTASGN(_ls_p,list); \ + _CASTASGN(_ls_oldhead,list); \ + list = NULL; \ + _ls_tail = NULL; \ + _ls_nmerges = 0; \ + while (_ls_p) { \ + _ls_nmerges++; \ + _ls_q = _ls_p; \ + _ls_psize = 0; \ + for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ + _ls_psize++; \ + _SV(_ls_q,list); \ + if (_NEXT(_ls_q,list,next) == _ls_oldhead) { \ + _ls_q = NULL; \ + } else { \ + _ls_q = _NEXT(_ls_q,list,next); \ + } \ + _RS(list); \ + if (!_ls_q) break; \ + } \ + _ls_qsize = _ls_insize; \ + while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \ + if (_ls_psize == 0) { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \ + } else if (_ls_qsize == 0 || !_ls_q) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \ + } else if (cmp(_ls_p,_ls_q) <= 0) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \ + } else { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \ + } \ + if (_ls_tail) { \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ + } else { \ + _CASTASGN(list,_ls_e); \ + } \ + _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \ + _ls_tail = _ls_e; \ + } \ + _ls_p = _ls_q; \ + } \ + _CASTASGN(list->prev,_ls_tail); \ + _CASTASGN(_tmp,list); \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_tmp,next); _RS(list); \ + if (_ls_nmerges <= 1) { \ + _ls_looping=0; \ + } \ + _ls_insize *= 2; \ + } \ + } \ +} while (0) + +/****************************************************************************** + * singly linked list macros (non-circular) * + *****************************************************************************/ +#define LL_PREPEND(head,add) \ + LL_PREPEND2(head,add,next) + +#define LL_PREPEND2(head,add,next) \ +do { \ + (add)->next = head; \ + head = add; \ +} while (0) + +#define LL_CONCAT(head1,head2) \ + LL_CONCAT2(head1,head2,next) + +#define LL_CONCAT2(head1,head2,next) \ +do { \ + LDECLTYPE(head1) _tmp; \ + if (head1) { \ + _tmp = head1; \ + while (_tmp->next) { _tmp = _tmp->next; } \ + _tmp->next=(head2); \ + } else { \ + (head1)=(head2); \ + } \ +} while (0) + +#define LL_APPEND(head,add) \ + LL_APPEND2(head,add,next) + +#define LL_APPEND2(head,add,next) \ +do { \ + LDECLTYPE(head) _tmp; \ + (add)->next=NULL; \ + if (head) { \ + _tmp = head; \ + while (_tmp->next) { _tmp = _tmp->next; } \ + _tmp->next=(add); \ + } else { \ + (head)=(add); \ + } \ +} while (0) + +#define LL_DELETE(head,del) \ + LL_DELETE2(head,del,next) + +#define LL_DELETE2(head,del,next) \ +do { \ + LDECLTYPE(head) _tmp; \ + if ((head) == (del)) { \ + (head)=(head)->next; \ + } else { \ + _tmp = head; \ + while (_tmp->next && (_tmp->next != (del))) { \ + _tmp = _tmp->next; \ + } \ + if (_tmp->next) { \ + _tmp->next = ((del)->next); \ + } \ + } \ +} while 
(0) + +/* Here are VS2008 replacements for LL_APPEND and LL_DELETE */ +#define LL_APPEND_VS2008(head,add) \ + LL_APPEND2_VS2008(head,add,next) + +#define LL_APPEND2_VS2008(head,add,next) \ +do { \ + if (head) { \ + (add)->next = head; /* use add->next as a temp variable */ \ + while ((add)->next->next) { (add)->next = (add)->next->next; } \ + (add)->next->next=(add); \ + } else { \ + (head)=(add); \ + } \ + (add)->next=NULL; \ +} while (0) + +#define LL_DELETE_VS2008(head,del) \ + LL_DELETE2_VS2008(head,del,next) + +#define LL_DELETE2_VS2008(head,del,next) \ +do { \ + if ((head) == (del)) { \ + (head)=(head)->next; \ + } else { \ + char *_tmp = (char*)(head); \ + while ((head)->next && ((head)->next != (del))) { \ + head = (head)->next; \ + } \ + if ((head)->next) { \ + (head)->next = ((del)->next); \ + } \ + { \ + char **_head_alias = (char**)&(head); \ + *_head_alias = _tmp; \ + } \ + } \ +} while (0) +#ifdef NO_DECLTYPE +#undef LL_APPEND +#define LL_APPEND LL_APPEND_VS2008 +#undef LL_DELETE +#define LL_DELETE LL_DELETE_VS2008 +#undef LL_DELETE2 +#define LL_DELETE2 LL_DELETE2_VS2008 +#undef LL_APPEND2 +#define LL_APPEND2 LL_APPEND2_VS2008 +#undef LL_CONCAT /* no LL_CONCAT_VS2008 */ +#undef DL_CONCAT /* no DL_CONCAT_VS2008 */ +#endif +/* end VS2008 replacements */ + +#define LL_COUNT(head,el,counter) \ + LL_COUNT2(head,el,counter,next) \ + +#define LL_COUNT2(head,el,counter,next) \ +{ \ + counter = 0; \ + LL_FOREACH2(head,el,next){ ++counter; } \ +} + +#define LL_FOREACH(head,el) \ + LL_FOREACH2(head,el,next) + +#define LL_FOREACH2(head,el,next) \ + for(el=head;el;el=(el)->next) + +#define LL_FOREACH_SAFE(head,el,tmp) \ + LL_FOREACH_SAFE2(head,el,tmp,next) + +#define LL_FOREACH_SAFE2(head,el,tmp,next) \ + for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp) + +#define LL_SEARCH_SCALAR(head,out,field,val) \ + LL_SEARCH_SCALAR2(head,out,field,val,next) + +#define LL_SEARCH_SCALAR2(head,out,field,val,next) \ +do { \ + LL_FOREACH2(head,out,next) { \ + if ((out)->field == (val)) break; \ + } \ +} while(0) + +#define LL_SEARCH(head,out,elt,cmp) \ + LL_SEARCH2(head,out,elt,cmp,next) + +#define LL_SEARCH2(head,out,elt,cmp,next) \ +do { \ + LL_FOREACH2(head,out,next) { \ + if ((cmp(out,elt))==0) break; \ + } \ +} while(0) + +#define LL_REPLACE_ELEM(head, el, add) \ +do { \ + LDECLTYPE(head) _tmp; \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + (add)->next = (el)->next; \ + if ((head) == (el)) { \ + (head) = (add); \ + } else { \ + _tmp = head; \ + while (_tmp->next && (_tmp->next != (el))) { \ + _tmp = _tmp->next; \ + } \ + if (_tmp->next) { \ + _tmp->next = (add); \ + } \ + } \ +} while (0) + +#define LL_PREPEND_ELEM(head, el, add) \ +do { \ + LDECLTYPE(head) _tmp; \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + (add)->next = (el); \ + if ((head) == (el)) { \ + (head) = (add); \ + } else { \ + _tmp = head; \ + while (_tmp->next && (_tmp->next != (el))) { \ + _tmp = _tmp->next; \ + } \ + if (_tmp->next) { \ + _tmp->next = (add); \ + } \ + } \ +} while (0) \ + + +/****************************************************************************** + * doubly linked list macros (non-circular) * + *****************************************************************************/ +#define DL_PREPEND(head,add) \ + DL_PREPEND2(head,add,prev,next) + +#define DL_PREPEND2(head,add,prev,next) \ +do { \ + (add)->next = head; \ + if (head) { \ + (add)->prev = (head)->prev; \ + (head)->prev = (add); \ + } else { \ + (add)->prev = (add); \ + } \ + (head) 
= (add); \ +} while (0) + +#define DL_APPEND(head,add) \ + DL_APPEND2(head,add,prev,next) + +#define DL_APPEND2(head,add,prev,next) \ +do { \ + if (head) { \ + (add)->prev = (head)->prev; \ + (head)->prev->next = (add); \ + (head)->prev = (add); \ + (add)->next = NULL; \ + } else { \ + (head)=(add); \ + (head)->prev = (head); \ + (head)->next = NULL; \ + } \ +} while (0) + +#define DL_CONCAT(head1,head2) \ + DL_CONCAT2(head1,head2,prev,next) + +#define DL_CONCAT2(head1,head2,prev,next) \ +do { \ + LDECLTYPE(head1) _tmp; \ + if (head2) { \ + if (head1) { \ + _tmp = (head2)->prev; \ + (head2)->prev = (head1)->prev; \ + (head1)->prev->next = (head2); \ + (head1)->prev = _tmp; \ + } else { \ + (head1)=(head2); \ + } \ + } \ +} while (0) + +#define DL_DELETE(head,del) \ + DL_DELETE2(head,del,prev,next) + +#define DL_DELETE2(head,del,prev,next) \ +do { \ + assert((del)->prev != NULL); \ + if ((del)->prev == (del)) { \ + (head)=NULL; \ + } else if ((del)==(head)) { \ + (del)->next->prev = (del)->prev; \ + (head) = (del)->next; \ + } else { \ + (del)->prev->next = (del)->next; \ + if ((del)->next) { \ + (del)->next->prev = (del)->prev; \ + } else { \ + (head)->prev = (del)->prev; \ + } \ + } \ +} while (0) + +#define DL_COUNT(head,el,counter) \ + DL_COUNT2(head,el,counter,next) \ + +#define DL_COUNT2(head,el,counter,next) \ +{ \ + counter = 0; \ + DL_FOREACH2(head,el,next){ ++counter; } \ +} + +#define DL_FOREACH(head,el) \ + DL_FOREACH2(head,el,next) + +#define DL_FOREACH2(head,el,next) \ + for(el=head;el;el=(el)->next) + +/* this version is safe for deleting the elements during iteration */ +#define DL_FOREACH_SAFE(head,el,tmp) \ + DL_FOREACH_SAFE2(head,el,tmp,next) + +#define DL_FOREACH_SAFE2(head,el,tmp,next) \ + for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp) + +/* these are identical to their singly-linked list counterparts */ +#define DL_SEARCH_SCALAR LL_SEARCH_SCALAR +#define DL_SEARCH LL_SEARCH +#define DL_SEARCH_SCALAR2 LL_SEARCH_SCALAR2 +#define DL_SEARCH2 LL_SEARCH2 + +#define DL_REPLACE_ELEM(head, el, add) \ +do { \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + if ((head) == (el)) { \ + (head) = (add); \ + (add)->next = (el)->next; \ + if ((el)->next == NULL) { \ + (add)->prev = (add); \ + } else { \ + (add)->prev = (el)->prev; \ + (add)->next->prev = (add); \ + } \ + } else { \ + (add)->next = (el)->next; \ + (add)->prev = (el)->prev; \ + (add)->prev->next = (add); \ + if ((el)->next == NULL) { \ + (head)->prev = (add); \ + } else { \ + (add)->next->prev = (add); \ + } \ + } \ +} while (0) + +#define DL_PREPEND_ELEM(head, el, add) \ +do { \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + (add)->next = (el); \ + (add)->prev = (el)->prev; \ + (el)->prev = (add); \ + if ((head) == (el)) { \ + (head) = (add); \ + } else { \ + (add)->prev->next = (add); \ + } \ +} while (0) \ + + +/****************************************************************************** + * circular doubly linked list macros * + *****************************************************************************/ +#define CDL_PREPEND(head,add) \ + CDL_PREPEND2(head,add,prev,next) + +#define CDL_PREPEND2(head,add,prev,next) \ +do { \ + if (head) { \ + (add)->prev = (head)->prev; \ + (add)->next = (head); \ + (head)->prev = (add); \ + (add)->prev->next = (add); \ + } else { \ + (add)->prev = (add); \ + (add)->next = (add); \ + } \ +(head)=(add); \ +} while (0) + +#define CDL_DELETE(head,del) \ + CDL_DELETE2(head,del,prev,next) + +#define 
CDL_DELETE2(head,del,prev,next) \ +do { \ + if ( ((head)==(del)) && ((head)->next == (head))) { \ + (head) = 0L; \ + } else { \ + (del)->next->prev = (del)->prev; \ + (del)->prev->next = (del)->next; \ + if ((del) == (head)) (head)=(del)->next; \ + } \ +} while (0) + +#define CDL_COUNT(head,el,counter) \ + CDL_COUNT2(head,el,counter,next) \ + +#define CDL_COUNT2(head, el, counter,next) \ +{ \ + counter = 0; \ + CDL_FOREACH2(head,el,next){ ++counter; } \ +} + +#define CDL_FOREACH(head,el) \ + CDL_FOREACH2(head,el,next) + +#define CDL_FOREACH2(head,el,next) \ + for(el=head;el;el=((el)->next==head ? 0L : (el)->next)) + +#define CDL_FOREACH_SAFE(head,el,tmp1,tmp2) \ + CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) + +#define CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) \ + for((el)=(head), ((tmp1)=(head)?((head)->prev):NULL); \ + (el) && ((tmp2)=(el)->next, 1); \ + ((el) = (((el)==(tmp1)) ? 0L : (tmp2)))) + +#define CDL_SEARCH_SCALAR(head,out,field,val) \ + CDL_SEARCH_SCALAR2(head,out,field,val,next) + +#define CDL_SEARCH_SCALAR2(head,out,field,val,next) \ +do { \ + CDL_FOREACH2(head,out,next) { \ + if ((out)->field == (val)) break; \ + } \ +} while(0) + +#define CDL_SEARCH(head,out,elt,cmp) \ + CDL_SEARCH2(head,out,elt,cmp,next) + +#define CDL_SEARCH2(head,out,elt,cmp,next) \ +do { \ + CDL_FOREACH2(head,out,next) { \ + if ((cmp(out,elt))==0) break; \ + } \ +} while(0) + +#define CDL_REPLACE_ELEM(head, el, add) \ +do { \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + if ((el)->next == (el)) { \ + (add)->next = (add); \ + (add)->prev = (add); \ + (head) = (add); \ + } else { \ + (add)->next = (el)->next; \ + (add)->prev = (el)->prev; \ + (add)->next->prev = (add); \ + (add)->prev->next = (add); \ + if ((head) == (el)) { \ + (head) = (add); \ + } \ + } \ +} while (0) + +#define CDL_PREPEND_ELEM(head, el, add) \ +do { \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + (add)->next = (el); \ + (add)->prev = (el)->prev; \ + (el)->prev = (add); \ + (add)->prev->next = (add); \ + if ((head) == (el)) { \ + (head) = (add); \ + } \ +} while (0) \ + +#endif /* UTLIST_H */ +
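For orientation, the DL_ macros above are exactly what str_finder.c drives. The following is a minimal self-contained sketch (not part of the patch) showing DL_APPEND's O(1) tail handling via head->prev, the DL_TAIL accessor defined in this copy of utlist.h, and deletion-safe iteration with DL_FOREACH_SAFE; the struct mirrors rep_ele from str_finder.h:

    #include <stdio.h>
    #include <stdlib.h>
    #include "utlist.h"

    /* Same shape as the rep_ele list built by str_finder.c. */
    typedef struct rep_ele {
        int start, end, rep_len;
        struct rep_ele *prev, *next;
    } rep_ele;

    int main(void) {
        rep_ele *reps = NULL, *el, *tmp;
        int i;
        for (i = 0; i < 3; i++) {
            el = malloc(sizeof(*el));
            if (!el) return 1;
            el->start = i; el->end = i + 4; el->rep_len = 2;
            DL_APPEND(reps, el);     /* O(1): head->prev tracks the tail */
        }
        printf("tail starts at %d\n", DL_TAIL(reps)->start);  /* prints 2 */
        DL_FOREACH_SAFE(reps, el, tmp) {  /* safe to free el mid-iteration */
            DL_DELETE(reps, el);
            free(el);
        }
        return 0;
    }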