From a5376a1e2cdbff22dea049163254a1ccf8a2c0ae Mon Sep 17 00:00:00 2001
From: Oliver Voogd <olivervoogd@gmail.com>
Date: Mon, 3 Jul 2023 13:09:14 +1000
Subject: [PATCH] added BamRecord, Gene parsing and devtools::check fixes

---
 src/classes/BamRecord.cpp            |  99 +++++++++++++++
 src/classes/BamRecord.h              |  94 ++++++++++++++
 src/classes/GeneAnnotationParser.cpp | 183 +++++++++++++++++++++++++++
 src/classes/GeneAnnotationParser.h   |  10 ++
 4 files changed, 386 insertions(+)
 create mode 100644 src/classes/BamRecord.cpp
 create mode 100644 src/classes/BamRecord.h
 create mode 100644 src/classes/GeneAnnotationParser.cpp
 create mode 100644 src/classes/GeneAnnotationParser.h
diff --git a/src/classes/BamRecord.cpp b/src/classes/BamRecord.cpp
new file mode 100644
index 0000000..a47aed6
--- /dev/null
+++ b/src/classes/BamRecord.cpp
@@ -0,0 +1,99 @@
+#include "BamRecord.h"
+
+#include <vector>
+#include <string>
+#include <codecvt>
+#include <Rcpp.h>
+
+#include "../utility/bam.h"
+#include "../utility/cigars.h"
+
+
+int calculate_length_from_cigar(const std::vector<CigarPair> &cigartuples, bool queryLength) {
+	// calculate the length of either query or reference using a cigar string
+	// query consuming operations are: MIS=X (0, 1, 4, 7, 8)
+	// reference consuming operations are: MDN=X (0, 2, 3, 7, 8)
+	// if queryLength =True, calculate query length, otherwise calculate read length
+	int len = 0;
+	for (const auto &pair : cigartuples) {
+		if (queryLength) {
+			if (pair.op == 0 || pair.op == 1 || pair.op == 4 || pair.op == 7 || pair.op == 8) {
+				len += pair.len;
+			}
+		} else {
+			if (pair.op == 0 || pair.op == 2 || pair.op == 3 || pair.op == 7 || pair.op == 8) {
+				len += pair.len;
+			}
+		}
+	}
+
+	return len;
+}
+
+int calculate_reference_end(const bam1_t *b, const std::vector<CigarPair> &cigartuples) {
+	// calculate the length of the reference based on reference consuming operations:
+	// MDN=X (0, 2, 3, 7, 8)
+	int ref_start = bam_reference_start(b);
+	ref_start += calculate_length_from_cigar(cigartuples, false);
+	return ref_start;
+}
+
+int calculate_query_alignment_length(const std::vector<CigarPair> &cigartuples) {
+	int q_len = calculate_length_from_cigar(cigartuples, true);
+
+	// remove regions of soft clipping from the ends of the query length 
+	if (cigartuples.size() > 0) {
+		if (cigartuples[0].op == 4) {
+			q_len -= cigartuples[0].len;
+		}
+		if (cigartuples.size() > 1) {
+			if (cigartuples.back().op == 4) {
+				q_len -= cigartuples.back().len;
+			}
+		}
+	}
+
+	return q_len;
+}
+
+/*  takes a flag int, converts it to all of the properties it encodes for 
+*/
+Flag
+read_flag(int n)
+{
+    Flag flag;
+
+    flag.read_paired                               = (n & (1<<0));
+    flag.read_mapped_in_proper_pair                = (n & (1<<1));
+    flag.read_unmapped                             = (n & (1<<2));
+    flag.mate_unmapped                             = (n & (1<<3));
+    flag.read_reverse_strand                       = (n & (1<<4));
+    flag.mate_reverse_strand                       = (n & (1<<5));
+    flag.first_in_pair                             = (n & (1<<6));
+    flag.second_in_pair                            = (n & (1<<7));
+    flag.not_primary_alignment                     = (n & (1<<8));
+    flag.read_fails_platform_vendor_quality_checks = (n & (1<<9));
+    flag.read_is_PCR_or_optical_duplicate          = (n & (1<<10));
+    flag.supplementary_alignment                   = (n & (1<<11));
+
+    return flag;
+}
+
+/*  takes a single bam entry, converts it into a BAMRecord struct
+*/
+BAMRecord
+read_record(const bam1_t * b, const bam_header_t * header)
+{
+    BAMRecord rec;
+    rec.reference_start = bam_reference_start(b);
+    rec.reference_name = std::string(header->target_name[b->core.tid]);
+    rec.AS_tag = bam_aux2i(bam_aux_get(b, "AS"));
+    rec.read_name = std::string(bam1_qname(b));
+    rec.mapping_quality = (int)bam_mapping_qual(b);
+    rec.cigar = generate_cigar_pairs(b);
+	rec.cigar_string = generate_cigar(rec.cigar);
+	rec.reference_end = calculate_reference_end(b, rec.cigar);
+	rec.query_alignment_length = calculate_query_alignment_length(rec.cigar);
+    rec.flag = read_flag(b->core.flag);
+    return rec;
+}
diff --git a/src/classes/BamRecord.h b/src/classes/BamRecord.h
new file mode 100644
index 0000000..e2e3973
--- /dev/null
+++ b/src/classes/BamRecord.h
@@ -0,0 +1,94 @@
+#ifndef BAM_RECORD_H
+#define BAM_RECORD_H
+
+#include <vector>
+#include <string>
+
+#include "../utility/bam.h"
+#include "../utility/cigars.h"
+
+/* 
+    holds all of the flags in a bam record
+*/
+struct Flag {
+    bool
+    read_paired;
+    bool
+    read_mapped_in_proper_pair;
+    bool
+    read_unmapped;
+    bool
+    mate_unmapped;
+    bool
+    read_reverse_strand;
+    bool
+    mate_reverse_strand;
+    bool
+    first_in_pair;
+    bool
+    second_in_pair;
+    bool
+    not_primary_alignment;
+    bool
+    read_fails_platform_vendor_quality_checks;
+    bool
+    read_is_PCR_or_optical_duplicate;
+    bool
+    supplementary_alignment;
+
+	inline bool operator==(const Flag &o) const{
+		return
+			read_paired == o.read_paired &&
+			read_mapped_in_proper_pair == o.read_mapped_in_proper_pair &&
+			read_unmapped == o.read_unmapped &&
+			mate_unmapped == o.mate_unmapped &&
+			read_reverse_strand == o.read_reverse_strand &&
+			mate_reverse_strand == o.mate_reverse_strand  &&
+			first_in_pair == o.first_in_pair &&
+			second_in_pair == o.second_in_pair &&
+			not_primary_alignment == o.not_primary_alignment &&
+			read_fails_platform_vendor_quality_checks == o.read_fails_platform_vendor_quality_checks &&
+			read_is_PCR_or_optical_duplicate == o.read_is_PCR_or_optical_duplicate &&
+			supplementary_alignment == o.supplementary_alignment;
+	}
+};
+
+/* 
+    a struct for handling each record in a bam file
+*/
+struct BAMRecord {
+    std::vector<CigarPair>
+    cigar;
+    std::string
+    cigar_string;
+
+    std::string
+    reference_name;
+    int
+    reference_start;
+    int
+    reference_end;
+    
+    std::string
+    read_name;
+
+    float
+    AS_tag;
+    int
+    query_alignment_length;
+    int
+    mapping_quality;
+    Flag
+    flag;
+};
+
+
+Flag
+read_flag(int);
+
+BAMRecord
+read_record(const bam1_t*, const bam_header_t*);
+
+int calculate_length_from_cigar(const std::vector<CigarPair> &cigartuples, bool queryLength);
+
+#endif // BAM_RECORD_H
diff --git a/src/classes/GeneAnnotationParser.cpp b/src/classes/GeneAnnotationParser.cpp
new file mode 100644
index 0000000..9a46e51
--- /dev/null
+++ b/src/classes/GeneAnnotationParser.cpp
@@ -0,0 +1,183 @@
+#include "GeneAnnotationParser.h"
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <set>
+#include "zlib.h"
+
+#include "GFFData.h"
+#include "GFFRecord.h"  
+#include "types.h"
+#include "StartEndPair.h"
+#include "Pos.h"
+#include "../utility/utility.h"
+
+enum AnnotationSource { gencode, ensembl };
+AnnotationSource guessAnnotationSource(const std::string &filename) 
+{
+    gzFile annotation = gzopen(filename.c_str(), "r");
+
+    const int MAXLEN = 256;
+    char buf[MAXLEN];
+    AnnotationSource returnSource = ensembl;
+    // Only look through the first 1000 lines
+    for (int i = 0; i < 1000; i++) {
+        if (gzgets(annotation, buf, MAXLEN) == NULL) return ensembl;
+        std::string line(buf);
+        if (line.find("GENCODE") != std::string::npos) {
+            returnSource = gencode;
+            break;
+        } else if (line.find("1\tEnsembl") != std::string::npos) {
+            returnSource = ensembl;
+            break;
+        }
+    }
+
+    gzclose(annotation);
+    return returnSource;
+}
+
+void
+removeTranscriptDuplicates(GFFData * const data, bool updateTranscriptDict=true)
+{
+    for (auto &[transcript, exons] : data->transcript_to_exon) {
+        ranges::sort<StartEndPair>(exons);
+        if (exons.size() > 1 && exons[0].start == exons[1].start) {
+            std::vector<exon> new_exons = {exons[0]};
+            for (const auto &ex : exons) {
+                if (ex.start != new_exons.back().start && ex.end != new_exons.back().start)
+                    new_exons.push_back(ex);
+            }
+
+            data->transcript_to_exon[transcript] = new_exons;
+        }
+
+        if (updateTranscriptDict) {
+            data->transcript_dict[transcript] = 
+                Pos(data->transcript_dict[transcript].chr, data->transcript_to_exon[transcript][0].start,
+                    data->transcript_to_exon[transcript].back().end, data->transcript_dict[transcript].strand,
+                    data->transcript_dict[transcript].parent_id);
+        }
+    }
+
+    for (const auto &[gene, transcripts] : data->gene_to_transcript) {
+        // remove duplicates from transcript
+        // sort transcripts
+        std::set<std::string> transcript_set(transcripts.begin(), transcripts.end());
+        data->gene_to_transcript[gene] = std::vector<std::string> (transcript_set.begin(), transcript_set.end());
+    }
+}
+
+GFFData parse_gtf(const std::string &filename)
+{
+    GFFData data;
+    auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename);
+
+    gzFile annotationFile = gzopen(filename.c_str(), "r");
+    const int MAXLEN = 65535; // maximum char array size
+    char buf[MAXLEN];
+    while(gzgets(annotationFile, buf, MAXLEN) != NULL) {
+        GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction);
+        
+        if (rec.feature == "gene") {
+            data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
+        } else if (rec.feature == "transcript") {
+            if (rec.attributes.count("gene_id") == 0) continue;
+
+            std::string gene_id = rec.attributes["gene_id"];
+            if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id))
+                data.chr_to_gene[rec.seqname].push_back(gene_id);
+            data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]);
+            data.transcript_dict[rec.attributes["transcript_id"]] = 
+                Pos(rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
+        } else if (rec.feature == "exon") {
+            if (rec.attributes.count("gene_id") == 0) continue;
+
+            if (data.chr_to_gene.count(rec.seqname) == 0 
+                    || ranges::doesNotContain(data.chr_to_gene[rec.seqname], rec.attributes["gene_id"])) {
+                data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
+            }
+            if (rec.attributes.count("transcript_id")) {
+                if (data.transcript_dict.count(rec.attributes["transcript_id"]) == 0) {
+                    data.gene_to_transcript[rec.attributes["gene_id"]].push_back(rec.attributes["transcript_id"]);
+                    data.transcript_dict[rec.attributes["transcript_id"]] = 
+                        Pos(rec.seqname, -1, 1, rec.strand, rec.attributes["gene_id"]);
+                }
+                data.transcript_to_exon[rec.attributes["transcript_id"]].push_back(
+                    StartEndPair(rec.start - 1, rec.end)
+                );
+            }
+        }
+        
+    }
+    gzclose(annotationFile);
+
+    removeTranscriptDuplicates(&data, true);
+
+    return data;
+}
+
+GFFData parse_gff(const std::string &filename)
+{
+    GFFData data;
+
+    AnnotationSource annotation_source = guessAnnotationSource(filename);
+    auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename);
+    
+    gzFile annotationFile = gzopen(filename.c_str(), "r");
+    const int MAXLEN = 1000;
+    char buf[MAXLEN];
+    while(gzgets(annotationFile, buf, MAXLEN) != NULL) {
+        GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction);
+        if (annotation_source == ensembl){
+            if (rec.attributes.count("gene_id")) 
+                data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
+            if (rec.attributes.count("Parent") 
+                    && rec.attributes["Parent"].find("gene:") != std::string::npos) {
+                std::string gene_id = rec.attributes["Parent"].substr(
+                    rec.attributes["Parent"].find_first_of(':') + 1);
+                data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]);
+                data.transcript_dict[rec.attributes["transcript_id"]] = 
+                    Pos (rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
+            } else if (rec.feature == "exon") {
+                if (rec.attributes["Parent"].find("transcript:") == std::string::npos)
+                    throw std::invalid_argument("format error");
+                std::string gene_id = rec.attributes["Parent"].substr(
+                    rec.attributes["Parent"].find_first_of(':') + 1);
+                data.transcript_to_exon[gene_id].push_back( StartEndPair(rec.start - 1, rec.end) );
+            }
+        } else if (annotation_source == gencode) {
+            if (rec.feature == "gene") {
+                data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
+            } else if (rec.feature == "transcript") {
+                std::string gene_id = rec.attributes["Parent"];
+                if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id)) 
+                    data.chr_to_gene[rec.seqname].push_back(gene_id);
+                
+                data.gene_to_transcript["gene_id"].push_back(rec.attributes["transcript_id"]);
+                data.transcript_dict[rec.attributes["transcript_id"]] = 
+                    Pos( rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
+            } else if (rec.feature == "exon") {
+                data.transcript_to_exon[rec.attributes["Parent"]].push_back(
+                    StartEndPair(rec.start - 1, rec.end)
+                );
+            }
+        }
+    }
+    gzclose(annotationFile);
+
+    removeTranscriptDuplicates(&data, false);
+
+    return data;
+}
+
+
+GFFData parse_gff_file(const std::string &gff_filename)
+{
+    if (gff_filename.find(".gtf") != std::string::npos) {
+        return parse_gtf(gff_filename);
+    } 
+
+    return parse_gff(gff_filename);
+}
diff --git a/src/classes/GeneAnnotationParser.h b/src/classes/GeneAnnotationParser.h
new file mode 100644
index 0000000..963db0e
--- /dev/null
+++ b/src/classes/GeneAnnotationParser.h
@@ -0,0 +1,10 @@
+#ifndef GENE_ANNOTATION_PARSER_H
+#define GENE_ANNOTATION_PARSER_H
+
+#include <string>
+
+#include "GFFData.h"
+
+GFFData parse_gff_file(const std::string &gff_filename);
+
+#endif // GENE_ANNOTATION_PARSER_H
\ No newline at end of file