added BamRecord, Gene parsing and devtools::check fixes

mritchielab · Jul 3, 2023 · a5376a1 · a5376a1
1 parent eb33725
commit a5376a1
Show file tree

Hide file tree

Showing 4 changed files with 386 additions and 0 deletions.
diff --git a/src/classes/BamRecord.cpp b/src/classes/BamRecord.cpp
@@ -0,0 +1,99 @@
+#include "BamRecord.h"
+
+#include <vector>
+#include <string>
+#include <codecvt>
+#include <Rcpp.h>
+
+#include "../utility/bam.h"
+#include "../utility/cigars.h"
+
+
+int calculate_length_from_cigar(const std::vector<CigarPair> &cigartuples, bool queryLength) {
+ // calculate the length of either query or reference using a cigar string
+ // query consuming operations are: MIS=X (0, 1, 4, 7, 8)
+ // reference consuming operations are: MDN=X (0, 2, 3, 7, 8)
+ // if queryLength =True, calculate query length, otherwise calculate read length
+ int len = 0;
+ for (const auto &pair : cigartuples) {
+ if (queryLength) {
+ if (pair.op == 0 || pair.op == 1 || pair.op == 4 || pair.op == 7 || pair.op == 8) {
+ len += pair.len;
+ }
+ } else {
+ if (pair.op == 0 || pair.op == 2 || pair.op == 3 || pair.op == 7 || pair.op == 8) {
+ len += pair.len;
+ }
+ }
+ }
+
+ return len;
+}
+
+int calculate_reference_end(const bam1_t *b, const std::vector<CigarPair> &cigartuples) {
+ // calculate the length of the reference based on reference consuming operations:
+ // MDN=X (0, 2, 3, 7, 8)
+ int ref_start = bam_reference_start(b);
+ ref_start += calculate_length_from_cigar(cigartuples, false);
+ return ref_start;
+}
+
+int calculate_query_alignment_length(const std::vector<CigarPair> &cigartuples) {
+ int q_len = calculate_length_from_cigar(cigartuples, true);
+
+ // remove regions of soft clipping from the ends of the query length 
+ if (cigartuples.size() > 0) {
+ if (cigartuples[0].op == 4) {
+ q_len -= cigartuples[0].len;
+ }
+ if (cigartuples.size() > 1) {
+ if (cigartuples.back().op == 4) {
+ q_len -= cigartuples.back().len;
+ }
+ }
+ }
+
+ return q_len;
+}
+
+/* takes a flag int, converts it to all of the properties it encodes for 
+*/
+Flag
+read_flag(int n)
+{
+ Flag flag;
+
+ flag.read_paired = (n & (1<<0));
+ flag.read_mapped_in_proper_pair = (n & (1<<1));
+ flag.read_unmapped = (n & (1<<2));
+ flag.mate_unmapped = (n & (1<<3));
+ flag.read_reverse_strand = (n & (1<<4));
+ flag.mate_reverse_strand = (n & (1<<5));
+ flag.first_in_pair = (n & (1<<6));
+ flag.second_in_pair = (n & (1<<7));
+ flag.not_primary_alignment = (n & (1<<8));
+ flag.read_fails_platform_vendor_quality_checks = (n & (1<<9));
+ flag.read_is_PCR_or_optical_duplicate = (n & (1<<10));
+ flag.supplementary_alignment = (n & (1<<11));
+
+ return flag;
+}
+
+/* takes a single bam entry, converts it into a BAMRecord struct
+*/
+BAMRecord
+read_record(const bam1_t * b, const bam_header_t * header)
+{
+ BAMRecord rec;
+ rec.reference_start = bam_reference_start(b);
+ rec.reference_name = std::string(header->target_name[b->core.tid]);
+ rec.AS_tag = bam_aux2i(bam_aux_get(b, "AS"));
+ rec.read_name = std::string(bam1_qname(b));
+ rec.mapping_quality = (int)bam_mapping_qual(b);
+ rec.cigar = generate_cigar_pairs(b);
+ rec.cigar_string = generate_cigar(rec.cigar);
+ rec.reference_end = calculate_reference_end(b, rec.cigar);
+ rec.query_alignment_length = calculate_query_alignment_length(rec.cigar);
+ rec.flag = read_flag(b->core.flag);
+ return rec;
+}
diff --git a/src/classes/BamRecord.h b/src/classes/BamRecord.h
@@ -0,0 +1,94 @@
+#ifndef BAM_RECORD_H
+#define BAM_RECORD_H
+
+#include <vector>
+#include <string>
+
+#include "../utility/bam.h"
+#include "../utility/cigars.h"
+
+/* 
+ holds all of the flags in a bam record
+*/
+struct Flag {
+ bool
+ read_paired;
+ bool
+ read_mapped_in_proper_pair;
+ bool
+ read_unmapped;
+ bool
+ mate_unmapped;
+ bool
+ read_reverse_strand;
+ bool
+ mate_reverse_strand;
+ bool
+ first_in_pair;
+ bool
+ second_in_pair;
+ bool
+ not_primary_alignment;
+ bool
+ read_fails_platform_vendor_quality_checks;
+ bool
+ read_is_PCR_or_optical_duplicate;
+ bool
+ supplementary_alignment;
+
+ inline bool operator==(const Flag &o) const{
+ return
+ read_paired == o.read_paired &&
+ read_mapped_in_proper_pair == o.read_mapped_in_proper_pair &&
+ read_unmapped == o.read_unmapped &&
+ mate_unmapped == o.mate_unmapped &&
+ read_reverse_strand == o.read_reverse_strand &&
+ mate_reverse_strand == o.mate_reverse_strand &&
+ first_in_pair == o.first_in_pair &&
+ second_in_pair == o.second_in_pair &&
+ not_primary_alignment == o.not_primary_alignment &&
+ read_fails_platform_vendor_quality_checks == o.read_fails_platform_vendor_quality_checks &&
+ read_is_PCR_or_optical_duplicate == o.read_is_PCR_or_optical_duplicate &&
+ supplementary_alignment == o.supplementary_alignment;
+ }
+};
+
+/* 
+ a struct for handling each record in a bam file
+*/
+struct BAMRecord {
+ std::vector<CigarPair>
+ cigar;
+ std::string
+ cigar_string;
+
+ std::string
+ reference_name;
+ int
+ reference_start;
+ int
+ reference_end;
+
+ std::string
+ read_name;
+
+ float
+ AS_tag;
+ int
+ query_alignment_length;
+ int
+ mapping_quality;
+ Flag
+ flag;
+};
+
+
+Flag
+read_flag(int);
+
+BAMRecord
+read_record(const bam1_t*, const bam_header_t*);
+
+int calculate_length_from_cigar(const std::vector<CigarPair> &cigartuples, bool queryLength);
+
+#endif // BAM_RECORD_H
diff --git a/src/classes/GeneAnnotationParser.cpp b/src/classes/GeneAnnotationParser.cpp
@@ -0,0 +1,183 @@
+#include "GeneAnnotationParser.h"
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <set>
+#include "zlib.h"
+
+#include "GFFData.h"
+#include "GFFRecord.h" 
+#include "types.h"
+#include "StartEndPair.h"
+#include "Pos.h"
+#include "../utility/utility.h"
+
+enum AnnotationSource { gencode, ensembl };
+AnnotationSource guessAnnotationSource(const std::string &filename) 
+{
+ gzFile annotation = gzopen(filename.c_str(), "r");
+
+ const int MAXLEN = 256;
+ char buf[MAXLEN];
+ AnnotationSource returnSource = ensembl;
+ // Only look through the first 1000 lines
+ for (int i = 0; i < 1000; i++) {
+ if (gzgets(annotation, buf, MAXLEN) == NULL) return ensembl;
+ std::string line(buf);
+ if (line.find("GENCODE") != std::string::npos) {
+ returnSource = gencode;
+ break;
+ } else if (line.find("1\tEnsembl") != std::string::npos) {
+ returnSource = ensembl;
+ break;
+ }
+ }
+
+ gzclose(annotation);
+ return returnSource;
+}
+
+void
+removeTranscriptDuplicates(GFFData * const data, bool updateTranscriptDict=true)
+{
+ for (auto &[transcript, exons] : data->transcript_to_exon) {
+ ranges::sort<StartEndPair>(exons);
+ if (exons.size() > 1 && exons[0].start == exons[1].start) {
+ std::vector<exon> new_exons = {exons[0]};
+ for (const auto &ex : exons) {
+ if (ex.start != new_exons.back().start && ex.end != new_exons.back().start)
+ new_exons.push_back(ex);
+ }
+
+ data->transcript_to_exon[transcript] = new_exons;
+ }
+
+ if (updateTranscriptDict) {
+ data->transcript_dict[transcript] = 
+ Pos(data->transcript_dict[transcript].chr, data->transcript_to_exon[transcript][0].start,
+ data->transcript_to_exon[transcript].back().end, data->transcript_dict[transcript].strand,
+ data->transcript_dict[transcript].parent_id);
+ }
+ }
+
+ for (const auto &[gene, transcripts] : data->gene_to_transcript) {
+ // remove duplicates from transcript
+ // sort transcripts
+ std::set<std::string> transcript_set(transcripts.begin(), transcripts.end());
+ data->gene_to_transcript[gene] = std::vector<std::string> (transcript_set.begin(), transcript_set.end());
+ }
+}
+
+GFFData parse_gtf(const std::string &filename)
+{
+ GFFData data;
+ auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename);
+
+ gzFile annotationFile = gzopen(filename.c_str(), "r");
+ const int MAXLEN = 65535; // maximum char array size
+ char buf[MAXLEN];
+ while(gzgets(annotationFile, buf, MAXLEN) != NULL) {
+ GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction);
+
+ if (rec.feature == "gene") {
+ data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
+ } else if (rec.feature == "transcript") {
+ if (rec.attributes.count("gene_id") == 0) continue;
+
+ std::string gene_id = rec.attributes["gene_id"];
+ if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id))
+ data.chr_to_gene[rec.seqname].push_back(gene_id);
+ data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]);
+ data.transcript_dict[rec.attributes["transcript_id"]] = 
+ Pos(rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
+ } else if (rec.feature == "exon") {
+ if (rec.attributes.count("gene_id") == 0) continue;
+
+ if (data.chr_to_gene.count(rec.seqname) == 0 
+ || ranges::doesNotContain(data.chr_to_gene[rec.seqname], rec.attributes["gene_id"])) {
+ data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
+ }
+ if (rec.attributes.count("transcript_id")) {
+ if (data.transcript_dict.count(rec.attributes["transcript_id"]) == 0) {
+ data.gene_to_transcript[rec.attributes["gene_id"]].push_back(rec.attributes["transcript_id"]);
+ data.transcript_dict[rec.attributes["transcript_id"]] = 
+ Pos(rec.seqname, -1, 1, rec.strand, rec.attributes["gene_id"]);
+ }
+ data.transcript_to_exon[rec.attributes["transcript_id"]].push_back(
+ StartEndPair(rec.start - 1, rec.end)
+ );
+ }
+ }
+
+ }
+ gzclose(annotationFile);
+
+ removeTranscriptDuplicates(&data, true);
+
+ return data;
+}
+
+GFFData parse_gff(const std::string &filename)
+{
+ GFFData data;
+
+ AnnotationSource annotation_source = guessAnnotationSource(filename);
+ auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename);
+
+ gzFile annotationFile = gzopen(filename.c_str(), "r");
+ const int MAXLEN = 1000;
+ char buf[MAXLEN];
+ while(gzgets(annotationFile, buf, MAXLEN) != NULL) {
+ GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction);
+ if (annotation_source == ensembl){
+ if (rec.attributes.count("gene_id")) 
+ data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
+ if (rec.attributes.count("Parent") 
+ && rec.attributes["Parent"].find("gene:") != std::string::npos) {
+ std::string gene_id = rec.attributes["Parent"].substr(
+ rec.attributes["Parent"].find_first_of(':') + 1);
+ data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]);
+ data.transcript_dict[rec.attributes["transcript_id"]] = 
+ Pos (rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
+ } else if (rec.feature == "exon") {
+ if (rec.attributes["Parent"].find("transcript:") == std::string::npos)
+ throw std::invalid_argument("format error");
+ std::string gene_id = rec.attributes["Parent"].substr(
+ rec.attributes["Parent"].find_first_of(':') + 1);
+ data.transcript_to_exon[gene_id].push_back( StartEndPair(rec.start - 1, rec.end) );
+ }
+ } else if (annotation_source == gencode) {
+ if (rec.feature == "gene") {
+ data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
+ } else if (rec.feature == "transcript") {
+ std::string gene_id = rec.attributes["Parent"];
+ if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id)) 
+ data.chr_to_gene[rec.seqname].push_back(gene_id);
+
+ data.gene_to_transcript["gene_id"].push_back(rec.attributes["transcript_id"]);
+ data.transcript_dict[rec.attributes["transcript_id"]] = 
+ Pos( rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
+ } else if (rec.feature == "exon") {
+ data.transcript_to_exon[rec.attributes["Parent"]].push_back(
+ StartEndPair(rec.start - 1, rec.end)
+ );
+ }
+ }
+ }
+ gzclose(annotationFile);
+
+ removeTranscriptDuplicates(&data, false);
+
+ return data;
+}
+
+
+GFFData parse_gff_file(const std::string &gff_filename)
+{
+ if (gff_filename.find(".gtf") != std::string::npos) {
+ return parse_gtf(gff_filename);
+ } 
+
+ return parse_gff(gff_filename);
+}
diff --git a/src/classes/GeneAnnotationParser.h b/src/classes/GeneAnnotationParser.h
@@ -0,0 +1,10 @@
+#ifndef GENE_ANNOTATION_PARSER_H
+#define GENE_ANNOTATION_PARSER_H
+
+#include <string>
+
+#include "GFFData.h"
+
+GFFData parse_gff_file(const std::string &gff_filename);
+
+#endif // GENE_ANNOTATION_PARSER_H