From de19711b8596b3293d2343954dccedcc9ace2446 Mon Sep 17 00:00:00 2001 From: pd3 Date: Mon, 5 Jun 2023 15:19:04 +0200 Subject: [PATCH] Add support for non-standard chromosome names containing [:-] characters Note hts_parse_region() cannot be used because it requires the header and without the header the caller does not learn the contig name. Resolves #1620 --- htslib/synced_bcf_reader.h | 2 ++ synced_bcf_reader.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index bbe5ea2ba..9a6b48438 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -338,6 +338,8 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); * supply 'from' in place of 'to'. When 'to' is negative, first * abs(to) will be attempted and if that fails, 'from' will be used * instead. + * If chromosome name contains the characters ':' or '-', it should + * be put in curly brackets, for example as "{weird-chr-name:1-2}:1000-2000" * * The bcf_sr_regions_t struct returned by a successful call should be freed * via bcf_sr_regions_destroy() when it is no longer needed. diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 702f260ee..d24e5f444 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1032,6 +1032,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) } // File name or a list of genomic locations. If file name, NULL is returned. +// Recognises regions in the form chr, chr:pos, chr:beg-end, chr:beg-, {weird-chr-name}:pos. +// Cannot use hts_parse_region() as that requires the header and if header is not present, +// wouldn't learn the chromosome name. static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); @@ -1043,9 +1046,23 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) hts_pos_t from, to; while ( 1 ) { - while ( *ep && *ep!=',' && *ep!=':' ) ep++; tmp.l = 0; - kputsn(sp,ep-sp,&tmp); + if ( *ep=='{' ) + { + while ( *ep && *ep!='}' ) ep++; + if ( !*ep ) + { + hts_log_error("Could not parse the region, mismatching braces in: \"%s\"", str); + goto exit_nicely; + } + ep++; + kputsn(sp+1,ep-sp-2,&tmp); + } + else + { + while ( *ep && *ep!=',' && *ep!=':' ) ep++; + kputsn(sp,ep-sp,&tmp); + } if ( *ep==':' ) { sp = ep+1; @@ -1053,7 +1070,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( sp==ep ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } if ( !*ep || *ep==',' ) { @@ -1064,7 +1081,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep!='-' ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } ep++; sp = ep; @@ -1072,7 +1089,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep && *ep!=',' ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } if ( sp==ep ) to = MAX_CSI_COOR-1; _regions_add(reg, tmp.s, from, to); @@ -1088,6 +1105,11 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } free(tmp.s); return reg; + +exit_nicely: + bcf_sr_regions_destroy(reg); + free(tmp.s); + return NULL; } // ichr,ifrom,ito are 0-based;