From a5376a1e2cdbff22dea049163254a1ccf8a2c0ae Mon Sep 17 00:00:00 2001 From: Oliver Voogd Date: Mon, 3 Jul 2023 13:09:14 +1000 Subject: [PATCH] added BamRecord, Gene parsing and devtools::check fixes --- src/classes/BamRecord.cpp | 99 +++++++++++++++ src/classes/BamRecord.h | 94 ++++++++++++++ src/classes/GeneAnnotationParser.cpp | 183 +++++++++++++++++++++++++++ src/classes/GeneAnnotationParser.h | 10 ++ 4 files changed, 386 insertions(+) create mode 100644 src/classes/BamRecord.cpp create mode 100644 src/classes/BamRecord.h create mode 100644 src/classes/GeneAnnotationParser.cpp create mode 100644 src/classes/GeneAnnotationParser.h diff --git a/src/classes/BamRecord.cpp b/src/classes/BamRecord.cpp new file mode 100644 index 0000000..a47aed6 --- /dev/null +++ b/src/classes/BamRecord.cpp @@ -0,0 +1,99 @@ +#include "BamRecord.h" + +#include +#include +#include +#include + +#include "../utility/bam.h" +#include "../utility/cigars.h" + + +int calculate_length_from_cigar(const std::vector &cigartuples, bool queryLength) { + // calculate the length of either query or reference using a cigar string + // query consuming operations are: MIS=X (0, 1, 4, 7, 8) + // reference consuming operations are: MDN=X (0, 2, 3, 7, 8) + // if queryLength =True, calculate query length, otherwise calculate read length + int len = 0; + for (const auto &pair : cigartuples) { + if (queryLength) { + if (pair.op == 0 || pair.op == 1 || pair.op == 4 || pair.op == 7 || pair.op == 8) { + len += pair.len; + } + } else { + if (pair.op == 0 || pair.op == 2 || pair.op == 3 || pair.op == 7 || pair.op == 8) { + len += pair.len; + } + } + } + + return len; +} + +int calculate_reference_end(const bam1_t *b, const std::vector &cigartuples) { + // calculate the length of the reference based on reference consuming operations: + // MDN=X (0, 2, 3, 7, 8) + int ref_start = bam_reference_start(b); + ref_start += calculate_length_from_cigar(cigartuples, false); + return ref_start; +} + +int calculate_query_alignment_length(const std::vector &cigartuples) { + int q_len = calculate_length_from_cigar(cigartuples, true); + + // remove regions of soft clipping from the ends of the query length + if (cigartuples.size() > 0) { + if (cigartuples[0].op == 4) { + q_len -= cigartuples[0].len; + } + if (cigartuples.size() > 1) { + if (cigartuples.back().op == 4) { + q_len -= cigartuples.back().len; + } + } + } + + return q_len; +} + +/* takes a flag int, converts it to all of the properties it encodes for +*/ +Flag +read_flag(int n) +{ + Flag flag; + + flag.read_paired = (n & (1<<0)); + flag.read_mapped_in_proper_pair = (n & (1<<1)); + flag.read_unmapped = (n & (1<<2)); + flag.mate_unmapped = (n & (1<<3)); + flag.read_reverse_strand = (n & (1<<4)); + flag.mate_reverse_strand = (n & (1<<5)); + flag.first_in_pair = (n & (1<<6)); + flag.second_in_pair = (n & (1<<7)); + flag.not_primary_alignment = (n & (1<<8)); + flag.read_fails_platform_vendor_quality_checks = (n & (1<<9)); + flag.read_is_PCR_or_optical_duplicate = (n & (1<<10)); + flag.supplementary_alignment = (n & (1<<11)); + + return flag; +} + +/* takes a single bam entry, converts it into a BAMRecord struct +*/ +BAMRecord +read_record(const bam1_t * b, const bam_header_t * header) +{ + BAMRecord rec; + rec.reference_start = bam_reference_start(b); + rec.reference_name = std::string(header->target_name[b->core.tid]); + rec.AS_tag = bam_aux2i(bam_aux_get(b, "AS")); + rec.read_name = std::string(bam1_qname(b)); + rec.mapping_quality = (int)bam_mapping_qual(b); + rec.cigar = generate_cigar_pairs(b); + rec.cigar_string = generate_cigar(rec.cigar); + rec.reference_end = calculate_reference_end(b, rec.cigar); + rec.query_alignment_length = calculate_query_alignment_length(rec.cigar); + rec.flag = read_flag(b->core.flag); + return rec; +} diff --git a/src/classes/BamRecord.h b/src/classes/BamRecord.h new file mode 100644 index 0000000..e2e3973 --- /dev/null +++ b/src/classes/BamRecord.h @@ -0,0 +1,94 @@ +#ifndef BAM_RECORD_H +#define BAM_RECORD_H + +#include +#include + +#include "../utility/bam.h" +#include "../utility/cigars.h" + +/* + holds all of the flags in a bam record +*/ +struct Flag { + bool + read_paired; + bool + read_mapped_in_proper_pair; + bool + read_unmapped; + bool + mate_unmapped; + bool + read_reverse_strand; + bool + mate_reverse_strand; + bool + first_in_pair; + bool + second_in_pair; + bool + not_primary_alignment; + bool + read_fails_platform_vendor_quality_checks; + bool + read_is_PCR_or_optical_duplicate; + bool + supplementary_alignment; + + inline bool operator==(const Flag &o) const{ + return + read_paired == o.read_paired && + read_mapped_in_proper_pair == o.read_mapped_in_proper_pair && + read_unmapped == o.read_unmapped && + mate_unmapped == o.mate_unmapped && + read_reverse_strand == o.read_reverse_strand && + mate_reverse_strand == o.mate_reverse_strand && + first_in_pair == o.first_in_pair && + second_in_pair == o.second_in_pair && + not_primary_alignment == o.not_primary_alignment && + read_fails_platform_vendor_quality_checks == o.read_fails_platform_vendor_quality_checks && + read_is_PCR_or_optical_duplicate == o.read_is_PCR_or_optical_duplicate && + supplementary_alignment == o.supplementary_alignment; + } +}; + +/* + a struct for handling each record in a bam file +*/ +struct BAMRecord { + std::vector + cigar; + std::string + cigar_string; + + std::string + reference_name; + int + reference_start; + int + reference_end; + + std::string + read_name; + + float + AS_tag; + int + query_alignment_length; + int + mapping_quality; + Flag + flag; +}; + + +Flag +read_flag(int); + +BAMRecord +read_record(const bam1_t*, const bam_header_t*); + +int calculate_length_from_cigar(const std::vector &cigartuples, bool queryLength); + +#endif // BAM_RECORD_H diff --git a/src/classes/GeneAnnotationParser.cpp b/src/classes/GeneAnnotationParser.cpp new file mode 100644 index 0000000..9a46e51 --- /dev/null +++ b/src/classes/GeneAnnotationParser.cpp @@ -0,0 +1,183 @@ +#include "GeneAnnotationParser.h" + +#include +#include +#include +#include +#include "zlib.h" + +#include "GFFData.h" +#include "GFFRecord.h" +#include "types.h" +#include "StartEndPair.h" +#include "Pos.h" +#include "../utility/utility.h" + +enum AnnotationSource { gencode, ensembl }; +AnnotationSource guessAnnotationSource(const std::string &filename) +{ + gzFile annotation = gzopen(filename.c_str(), "r"); + + const int MAXLEN = 256; + char buf[MAXLEN]; + AnnotationSource returnSource = ensembl; + // Only look through the first 1000 lines + for (int i = 0; i < 1000; i++) { + if (gzgets(annotation, buf, MAXLEN) == NULL) return ensembl; + std::string line(buf); + if (line.find("GENCODE") != std::string::npos) { + returnSource = gencode; + break; + } else if (line.find("1\tEnsembl") != std::string::npos) { + returnSource = ensembl; + break; + } + } + + gzclose(annotation); + return returnSource; +} + +void +removeTranscriptDuplicates(GFFData * const data, bool updateTranscriptDict=true) +{ + for (auto &[transcript, exons] : data->transcript_to_exon) { + ranges::sort(exons); + if (exons.size() > 1 && exons[0].start == exons[1].start) { + std::vector new_exons = {exons[0]}; + for (const auto &ex : exons) { + if (ex.start != new_exons.back().start && ex.end != new_exons.back().start) + new_exons.push_back(ex); + } + + data->transcript_to_exon[transcript] = new_exons; + } + + if (updateTranscriptDict) { + data->transcript_dict[transcript] = + Pos(data->transcript_dict[transcript].chr, data->transcript_to_exon[transcript][0].start, + data->transcript_to_exon[transcript].back().end, data->transcript_dict[transcript].strand, + data->transcript_dict[transcript].parent_id); + } + } + + for (const auto &[gene, transcripts] : data->gene_to_transcript) { + // remove duplicates from transcript + // sort transcripts + std::set transcript_set(transcripts.begin(), transcripts.end()); + data->gene_to_transcript[gene] = std::vector (transcript_set.begin(), transcript_set.end()); + } +} + +GFFData parse_gtf(const std::string &filename) +{ + GFFData data; + auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename); + + gzFile annotationFile = gzopen(filename.c_str(), "r"); + const int MAXLEN = 65535; // maximum char array size + char buf[MAXLEN]; + while(gzgets(annotationFile, buf, MAXLEN) != NULL) { + GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction); + + if (rec.feature == "gene") { + data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]); + } else if (rec.feature == "transcript") { + if (rec.attributes.count("gene_id") == 0) continue; + + std::string gene_id = rec.attributes["gene_id"]; + if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id)) + data.chr_to_gene[rec.seqname].push_back(gene_id); + data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]); + data.transcript_dict[rec.attributes["transcript_id"]] = + Pos(rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id); + } else if (rec.feature == "exon") { + if (rec.attributes.count("gene_id") == 0) continue; + + if (data.chr_to_gene.count(rec.seqname) == 0 + || ranges::doesNotContain(data.chr_to_gene[rec.seqname], rec.attributes["gene_id"])) { + data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]); + } + if (rec.attributes.count("transcript_id")) { + if (data.transcript_dict.count(rec.attributes["transcript_id"]) == 0) { + data.gene_to_transcript[rec.attributes["gene_id"]].push_back(rec.attributes["transcript_id"]); + data.transcript_dict[rec.attributes["transcript_id"]] = + Pos(rec.seqname, -1, 1, rec.strand, rec.attributes["gene_id"]); + } + data.transcript_to_exon[rec.attributes["transcript_id"]].push_back( + StartEndPair(rec.start - 1, rec.end) + ); + } + } + + } + gzclose(annotationFile); + + removeTranscriptDuplicates(&data, true); + + return data; +} + +GFFData parse_gff(const std::string &filename) +{ + GFFData data; + + AnnotationSource annotation_source = guessAnnotationSource(filename); + auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename); + + gzFile annotationFile = gzopen(filename.c_str(), "r"); + const int MAXLEN = 1000; + char buf[MAXLEN]; + while(gzgets(annotationFile, buf, MAXLEN) != NULL) { + GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction); + if (annotation_source == ensembl){ + if (rec.attributes.count("gene_id")) + data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]); + if (rec.attributes.count("Parent") + && rec.attributes["Parent"].find("gene:") != std::string::npos) { + std::string gene_id = rec.attributes["Parent"].substr( + rec.attributes["Parent"].find_first_of(':') + 1); + data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]); + data.transcript_dict[rec.attributes["transcript_id"]] = + Pos (rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id); + } else if (rec.feature == "exon") { + if (rec.attributes["Parent"].find("transcript:") == std::string::npos) + throw std::invalid_argument("format error"); + std::string gene_id = rec.attributes["Parent"].substr( + rec.attributes["Parent"].find_first_of(':') + 1); + data.transcript_to_exon[gene_id].push_back( StartEndPair(rec.start - 1, rec.end) ); + } + } else if (annotation_source == gencode) { + if (rec.feature == "gene") { + data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]); + } else if (rec.feature == "transcript") { + std::string gene_id = rec.attributes["Parent"]; + if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id)) + data.chr_to_gene[rec.seqname].push_back(gene_id); + + data.gene_to_transcript["gene_id"].push_back(rec.attributes["transcript_id"]); + data.transcript_dict[rec.attributes["transcript_id"]] = + Pos( rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id); + } else if (rec.feature == "exon") { + data.transcript_to_exon[rec.attributes["Parent"]].push_back( + StartEndPair(rec.start - 1, rec.end) + ); + } + } + } + gzclose(annotationFile); + + removeTranscriptDuplicates(&data, false); + + return data; +} + + +GFFData parse_gff_file(const std::string &gff_filename) +{ + if (gff_filename.find(".gtf") != std::string::npos) { + return parse_gtf(gff_filename); + } + + return parse_gff(gff_filename); +} diff --git a/src/classes/GeneAnnotationParser.h b/src/classes/GeneAnnotationParser.h new file mode 100644 index 0000000..963db0e --- /dev/null +++ b/src/classes/GeneAnnotationParser.h @@ -0,0 +1,10 @@ +#ifndef GENE_ANNOTATION_PARSER_H +#define GENE_ANNOTATION_PARSER_H + +#include + +#include "GFFData.h" + +GFFData parse_gff_file(const std::string &gff_filename); + +#endif // GENE_ANNOTATION_PARSER_H \ No newline at end of file