From 326e3d239dc6e3379eeffdc82486d5c97ed49a49 Mon Sep 17 00:00:00 2001
From: John Marshall
Date: Thu, 4 Nov 2021 20:33:17 +0000
Subject: [PATCH] Add bam_aux_first()/bam_aux_next() tagged aux field iterator API

Add new API functions for iterating through a BAM record's aux fields,
inline accessor methods for field tag and type (or code can continue to
use s-2 and *s), and a variant of bam_aux_del() that returns the (updated)
iterator to the following field (for use in iterator-based loops that
delete fields).
---
 htslib/sam.h | 50 +++++++++++++++++++++++++++++++++-
 sam.c        | 77 +++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 113 insertions(+), 14 deletions(-)

diff --git a/htslib/sam.h b/htslib/sam.h
index d37877e486..867940dc5a 100644
--- a/htslib/sam.h
+++ b/htslib/sam.h
@@ -1428,7 +1428,6 @@ int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b,
 
 /// Converts a BAM aux tag to SAM format
 /*
- * @param b    Pointer to the bam record
  * @param key  Two letter tag key
  * @param type Single letter type code: ACcSsIifHZB.
  * @param tag  Tag data pointer, in BAM format
@@ -1616,6 +1615,29 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
     return NULL;
 }
 
+/// Return a pointer to a BAM record's first aux field
+/** @param b   Pointer to the BAM record
+    @return    Aux field pointer, or NULL if the record has none
+
+When NULL is returned, errno will also be set to ENOENT. ("Aux field pointers"
+point to the TYPE byte within the auxiliary data for that field; but in general
+it is unnecessary for user code to be aware of this.)
+ */
+HTSLIB_EXPORT
+uint8_t *bam_aux_first(const bam1_t *b);
+
+/// Return a pointer to a BAM record's next aux field
+/** @param b   Pointer to the BAM record
+    @param s   Aux field pointer, as returned by bam_aux_first()/_next()/_get()
+    @return    Pointer to the next aux field, or NULL if already last or error
+
+Whenever NULL is returned, errno will also be set: ENOENT if @p s was the
+record's last aux field; otherwise EINVAL, indicating that the BAM record's
+aux data is corrupt.
+ */
+HTSLIB_EXPORT
+uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s);
+
 /// Return a pointer to an aux record
 /** @param b   Pointer to the bam record
     @param tag Desired aux tag
@@ -1628,6 +1650,19 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
 HTSLIB_EXPORT
 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
 
+/// Return the aux field's 2-character tag
+/** @param s   Aux field pointer, as returned by bam_aux_first()/_next()/_get()
+    @return    Pointer to the tag characters, NOT NUL-terminated
+ */
+static inline
+const char *bam_aux_tag(const uint8_t *s) { return (const char *) (s-2); }
+
+/// Return the aux field's type character
+/** @param s   Aux field pointer, as returned by bam_aux_first()/_next()/_get()
+    @return    The type character: one of cCsSiI/fd/A/Z/H/B
+ */
+static inline char bam_aux_type(const uint8_t *s) { return *s; }
+
 /// Return a SAM formatting string containing a BAM tag
 /** @param b   Pointer to the bam record
     @param tag Desired aux tag
@@ -1739,6 +1774,19 @@ int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8
 HTSLIB_EXPORT
 int bam_aux_del(bam1_t *b, uint8_t *s);
 
+/// Delete an aux field from a BAM record, returning the following field
+/** @param b   The bam record to update
+    @param s   Pointer to the aux field to delete, as returned by
+               bam_aux_first()/_next()/_get()
+    @return    Pointer to the following aux field, or NULL if none or on error
+
+Whenever NULL is returned, errno will also be set: ENOENT if the aux field
+deleted was the record's last one; otherwise EINVAL, indicating that the
+BAM record's aux data is corrupt.
+ */
+HTSLIB_EXPORT
+uint8_t *bam_aux_erase(bam1_t *b, uint8_t *s);
+
 /// Update or add a string-type tag
 /* @param b    The bam record to update
    @param tag  Tag identifier
diff --git a/sam.c b/sam.c
index 393a3b22e2..0a4b90594a 100644
--- a/sam.c
+++ b/sam.c
@@ -4508,7 +4508,52 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
     }
 }
 
+uint8_t *bam_aux_first(const bam1_t *b)
+{
+    uint8_t *s = bam_get_aux(b);
+    uint8_t *end = b->data + b->l_data;
+    if (end - s <= 2) { errno = ENOENT; return NULL; } // no room for tag+type
+    return s+2;
+}
+
+uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
+{
+    uint8_t *end = b->data + b->l_data;
+    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
+    if (next == NULL) goto bad_aux;
+    if (end - next <= 2) { errno = ENOENT; return NULL; } // no full tag+type
+    return next+2;
+
+bad_aux:
+    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
+    errno = EINVAL;
+    return NULL;
+}
+
 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
+{
+    uint8_t *s;
+    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
+        if (s[-2] == tag[0] && s[-1] == tag[1]) {
+            // Check the tag value is valid and complete
+            uint8_t *e = skip_aux(s, b->data + b->l_data);
+            if (e == NULL) goto bad_aux;
+            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
+
+            return s;
+        }
+
+    // errno now as set by bam_aux_first()/bam_aux_next()
+    return NULL;
+
+bad_aux:
+    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
+    errno = EINVAL;
+    return NULL;
+}
+
+#if 0
+uint8_t *bam_aux_get_old(const bam1_t *b, const char tag[2])
 {
     uint8_t *s, *end, *t = (uint8_t *) tag;
     uint16_t y = (uint16_t) t[0]<<8 | t[1];
@@ -4540,24 +4585,30 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
     errno = EINVAL;
     return NULL;
 }
+#endif
 
-// s MUST BE returned by bam_aux_get()
-int bam_aux_del(bam1_t *b, uint8_t *s)
+uint8_t *bam_aux_erase(bam1_t *b, uint8_t *s)
 {
-    uint8_t *p, *aux;
-    int l_aux = bam_get_l_aux(b);
-    aux = bam_get_aux(b);
-    p = s - 2;
-    s = skip_aux(s, aux + l_aux);
-    if (s == NULL) goto bad_aux;
-    memmove(p, s, l_aux - (s - aux));
-    b->l_data -= s - p;
-    return 0;
+    uint8_t *end = b->data + b->l_data;
+    uint8_t *next = skip_aux(s, end);
+    if (next == NULL) goto bad_aux;
 
- bad_aux:
+    memmove(s-2, next, end - next);
+    b->l_data -= next - (s-2);
+
+    if (end - next <= 2) { errno = ENOENT; return NULL; } // nothing follows
+    return s;
+
+bad_aux:
     hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
     errno = EINVAL;
-    return -1;
+    return NULL;
+}
+
+int bam_aux_del(bam1_t *b, uint8_t *s)
+{
+    s = bam_aux_erase(b, s);
+    return (s || errno == ENOENT)? 0 : -1;
 }
 
 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)