From a43d170b0e27308f0924ba35b849adae1c50a6b5 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 7 Sep 2021 11:03:44 +0100 Subject: [PATCH] Finer control of --regions vs --targets overlap This is to address a long-standing design flaw in handling regions and targets, as described in these BCFtools issues: https://github.com/samtools/bcftools/issues/1420 https://github.com/samtools/bcftools/issues/1421 HTSlib (and BCFtools) recognize two sets of behaviors / options for resctricting VCF/BCF files by region, one is for streaming (`-t/-T`) and one for index-gumping (`-r/-R`). They behave differently, the first includes only records with POS coordinate within the regions, the other includes overlapping regions. This allows to modify the default behavior and provides three options: - Include only records with POS starting in the regions/targets - Include VCF records that overlap regions/targets, even if POS itself is outside the regions - Include only VCF records where the true variation overlaps regions/targets, e.g. consider the difference between `TC>T-` and `C>-` Most importantly, this allows to make the regions and targets behave the same way. Note that the default behavior remains unchanged. --- htslib/synced_bcf_reader.h | 9 ++- synced_bcf_reader.c | 122 ++++++++++++++++++++++++++++++------- 2 files changed, 107 insertions(+), 24 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index f262327ff8..c49b2eca1d 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019-2020 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2021 Genome Research Ltd. Author: Petr Danecek @@ -96,7 +96,9 @@ typedef enum { BCF_SR_REQUIRE_IDX, BCF_SR_PAIR_LOGIC, // combination of the PAIR_* values above - BCF_SR_ALLOW_NO_IDX // allow to proceed even if required index is not present (at the user's risk) + BCF_SR_ALLOW_NO_IDX, // allow to proceed even if required index is not present (at the user's risk) + BCF_SR_REGIONS_OVERLAP, // include overlapping records with POS outside the regions: 0=no, 1=VCF line overlap, 2=true variant overlap [1] + BCF_SR_TARGETS_OVERLAP // include overlapping records with POS outside the targets: 0=no, 1=VCF line overlap, 2=true variant overlap [0] } bcf_sr_opt_t; @@ -110,7 +112,8 @@ typedef struct bcf_sr_regions_t kstring_t line; // holder of the current line, set only when reading from tabix-indexed files htsFile *file; char *fname; - int is_bin; // is open in binary mode (tabix access) + int is_bin:30, // is open in binary mode (tabix access) + overlap:2; // see BCF_SR_REGIONS_OVERLAP/BCF_SR_TARGETS_OVERLAP char **als; // parsed alleles if targets_als set and _regions_match_alleles called kstring_t als_str; // block of parsed alleles int nals, mals; // number of set alleles and the size of allocated array diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 3dd6c2066c..0bb66c8989 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -67,6 +67,7 @@ region_t; typedef struct { sr_sort_t sort; + int regions_overlap, targets_overlap; } aux_t; @@ -123,6 +124,18 @@ int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...) BCF_SR_AUX(readers)->sort.pair = va_arg(args, int); return 0; + case BCF_SR_REGIONS_OVERLAP: + va_start(args, opt); + BCF_SR_AUX(readers)->regions_overlap = va_arg(args, int); + if ( readers->regions ) readers->regions->overlap = BCF_SR_AUX(readers)->regions_overlap; + return 0; + + case BCF_SR_TARGETS_OVERLAP: + va_start(args, opt); + BCF_SR_AUX(readers)->targets_overlap = va_arg(args, int); + if ( readers->targets ) readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap; + return 0; + default: break; } @@ -181,6 +194,7 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) if ( !readers->regions ) return -1; readers->explicit_regs = 1; readers->require_index = REQUIRE_IDX_; + readers->regions->overlap = BCF_SR_AUX(readers)->regions_overlap; return 0; } @@ -199,6 +213,7 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2); if ( !readers->targets ) return -1; readers->targets_als = alleles; + readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap; return 0; } @@ -391,6 +406,8 @@ bcf_srs_t *bcf_sr_init(void) bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t)); files->aux = (aux_t*) calloc(1,sizeof(aux_t)); bcf_sr_sort_init(&BCF_SR_AUX(files)->sort); + bcf_sr_set_opt(files,BCF_SR_REGIONS_OVERLAP,1); + bcf_sr_set_opt(files,BCF_SR_TARGETS_OVERLAP,0); return files; } @@ -545,6 +562,35 @@ static int _readers_next_region(bcf_srs_t *files) return 0; } +static void _set_variant_boundaries(bcf1_t *rec, hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t off; + if ( rec->n_allele ) + { + off = rec->rlen; + bcf_unpack(rec, BCF_UN_STR); + int i; + for (i=1; in_allele; i++) + { + // Make symbolic alleles start at POS, although this is not strictly true for + // , where POS should be the position BEFORE the deletion/insertion. + // However, since arbitrary symbolic alleles can be defined by the user, we + // will simplify the interpretation of --targets-overlap and --region-overlap. + int j = 0; + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[i]; + while ( ref[j] && alt[j] && ref[j]==alt[j] ) j++; + if ( off > j ) off = j; + if ( !off ) break; + } + } + else + off = 0; + + *beg = rec->pos + off; + *end = rec->pos + rec->rlen - 1; +} + /* * _reader_fill_buffer() - buffers all records with the same coordinate */ @@ -606,8 +652,28 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } - // prevent creation of duplicates from records overlapping multiple regions - if ( files->regions && reader->buffer[reader->nbuffer+1]->pos <= files->regions->prev_end ) continue; + // Prevent creation of duplicates from records overlapping multiple regions + // and recognize true variant overlaps vs record overlaps (e.g. TA>T vs A>-) + if ( files->regions ) + { + hts_pos_t beg, end; + if ( BCF_SR_AUX(files)->regions_overlap==0 ) + beg = end = reader->buffer[reader->nbuffer+1]->pos; + else if ( BCF_SR_AUX(files)->regions_overlap==1 ) + { + beg = reader->buffer[reader->nbuffer+1]->pos; + end = reader->buffer[reader->nbuffer+1]->pos + reader->buffer[reader->nbuffer+1]->rlen - 1; + } + else if ( BCF_SR_AUX(files)->regions_overlap==2 ) + _set_variant_boundaries(reader->buffer[reader->nbuffer+1], &beg,&end); + else + { + hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); + exit(1); + } + + if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue; + } // apply filter if ( !reader->nfilter_ids ) @@ -637,23 +703,18 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } /* - * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position + * _readers_shift_buffer() - removes the first line */ static void _reader_shift_buffer(bcf_sr_t *reader) { + if ( !reader->nbuffer ) return; int i; + bcf1_t *tmp = reader->buffer[1]; for (i=2; i<=reader->nbuffer; i++) - if ( reader->buffer[i]->rid!=reader->buffer[1]->rid || reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; - if ( i<=reader->nbuffer ) - { - // A record with a different position follows, swap it. Because of the reader's logic, - // only one such line can be present. - assert( i==reader->nbuffer ); - bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp; - reader->nbuffer = 1; - } - else - reader->nbuffer = 0; // no other line + reader->buffer[i-1] = reader->buffer[i]; + if ( reader->nbuffer > 1 ) + reader->buffer[reader->nbuffer] = tmp; + reader->nbuffer--; } static int next_line(bcf_srs_t *files) @@ -704,19 +765,38 @@ static int next_line(bcf_srs_t *files) // Skip this position if not present in targets if ( files->targets ) { - int ret = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos); - if ( (!files->targets_exclude && ret<0) || (files->targets_exclude && !ret) ) + int match = 0; + for (i=0; inreaders; i++) + { + if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue; + hts_pos_t beg, end; + if ( BCF_SR_AUX(files)->targets_overlap==0 ) + beg = end = min_pos; + else if ( BCF_SR_AUX(files)->targets_overlap==1 ) + { + beg = min_pos; + end = min_pos + files->readers[i].buffer[1]->rlen - 1; + } + else if ( BCF_SR_AUX(files)->targets_overlap==2 ) + _set_variant_boundaries(files->readers[i].buffer[1], &beg,&end); + else + { + hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); + exit(1); + } + int overlap = bcf_sr_regions_overlap(files->targets, chr, beg, end)==0 ? 1 : 0; + if ( (!files->targets_exclude && !overlap) || (files->targets_exclude && overlap) ) + _reader_shift_buffer(&files->readers[i]); + else + match = 1; + } + if ( !match ) { - // Remove all lines with this position from the buffer - for (i=0; inreaders; i++) - if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos ) - _reader_shift_buffer(&files->readers[i]); min_pos = HTS_POS_MAX; chr = NULL; continue; } } - break; // done: chr and min_pos are set } if ( !chr ) return 0;