Skip to content

Commit

Permalink
added BamRecord, Gene parsing and devtools::check fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
OliverVoogd committed Jul 3, 2023
1 parent eb33725 commit a5376a1
Show file tree
Hide file tree
Showing 4 changed files with 386 additions and 0 deletions.
99 changes: 99 additions & 0 deletions src/classes/BamRecord.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#include "BamRecord.h"

#include <vector>
#include <string>
#include <codecvt>
#include <Rcpp.h>

#include "../utility/bam.h"
#include "../utility/cigars.h"


int calculate_length_from_cigar(const std::vector<CigarPair> &cigartuples, bool queryLength) {
// calculate the length of either query or reference using a cigar string
// query consuming operations are: MIS=X (0, 1, 4, 7, 8)
// reference consuming operations are: MDN=X (0, 2, 3, 7, 8)
// if queryLength =True, calculate query length, otherwise calculate read length
int len = 0;
for (const auto &pair : cigartuples) {
if (queryLength) {
if (pair.op == 0 || pair.op == 1 || pair.op == 4 || pair.op == 7 || pair.op == 8) {
len += pair.len;
}
} else {
if (pair.op == 0 || pair.op == 2 || pair.op == 3 || pair.op == 7 || pair.op == 8) {
len += pair.len;
}
}
}

return len;
}

int calculate_reference_end(const bam1_t *b, const std::vector<CigarPair> &cigartuples) {
// calculate the length of the reference based on reference consuming operations:
// MDN=X (0, 2, 3, 7, 8)
int ref_start = bam_reference_start(b);
ref_start += calculate_length_from_cigar(cigartuples, false);
return ref_start;
}

int calculate_query_alignment_length(const std::vector<CigarPair> &cigartuples) {
int q_len = calculate_length_from_cigar(cigartuples, true);

// remove regions of soft clipping from the ends of the query length
if (cigartuples.size() > 0) {
if (cigartuples[0].op == 4) {
q_len -= cigartuples[0].len;
}
if (cigartuples.size() > 1) {
if (cigartuples.back().op == 4) {
q_len -= cigartuples.back().len;
}
}
}

return q_len;
}

/* takes a flag int, converts it to all of the properties it encodes for
*/
Flag
read_flag(int n)
{
Flag flag;

flag.read_paired = (n & (1<<0));
flag.read_mapped_in_proper_pair = (n & (1<<1));
flag.read_unmapped = (n & (1<<2));
flag.mate_unmapped = (n & (1<<3));
flag.read_reverse_strand = (n & (1<<4));
flag.mate_reverse_strand = (n & (1<<5));
flag.first_in_pair = (n & (1<<6));
flag.second_in_pair = (n & (1<<7));
flag.not_primary_alignment = (n & (1<<8));
flag.read_fails_platform_vendor_quality_checks = (n & (1<<9));
flag.read_is_PCR_or_optical_duplicate = (n & (1<<10));
flag.supplementary_alignment = (n & (1<<11));

return flag;
}

/* takes a single bam entry, converts it into a BAMRecord struct
*/
BAMRecord
read_record(const bam1_t * b, const bam_header_t * header)
{
BAMRecord rec;
rec.reference_start = bam_reference_start(b);
rec.reference_name = std::string(header->target_name[b->core.tid]);
rec.AS_tag = bam_aux2i(bam_aux_get(b, "AS"));
rec.read_name = std::string(bam1_qname(b));
rec.mapping_quality = (int)bam_mapping_qual(b);
rec.cigar = generate_cigar_pairs(b);
rec.cigar_string = generate_cigar(rec.cigar);
rec.reference_end = calculate_reference_end(b, rec.cigar);
rec.query_alignment_length = calculate_query_alignment_length(rec.cigar);
rec.flag = read_flag(b->core.flag);
return rec;
}
94 changes: 94 additions & 0 deletions src/classes/BamRecord.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#ifndef BAM_RECORD_H
#define BAM_RECORD_H

#include <vector>
#include <string>

#include "../utility/bam.h"
#include "../utility/cigars.h"

/*
holds all of the flags in a bam record
*/
struct Flag {
bool
read_paired;
bool
read_mapped_in_proper_pair;
bool
read_unmapped;
bool
mate_unmapped;
bool
read_reverse_strand;
bool
mate_reverse_strand;
bool
first_in_pair;
bool
second_in_pair;
bool
not_primary_alignment;
bool
read_fails_platform_vendor_quality_checks;
bool
read_is_PCR_or_optical_duplicate;
bool
supplementary_alignment;

inline bool operator==(const Flag &o) const{
return
read_paired == o.read_paired &&
read_mapped_in_proper_pair == o.read_mapped_in_proper_pair &&
read_unmapped == o.read_unmapped &&
mate_unmapped == o.mate_unmapped &&
read_reverse_strand == o.read_reverse_strand &&
mate_reverse_strand == o.mate_reverse_strand &&
first_in_pair == o.first_in_pair &&
second_in_pair == o.second_in_pair &&
not_primary_alignment == o.not_primary_alignment &&
read_fails_platform_vendor_quality_checks == o.read_fails_platform_vendor_quality_checks &&
read_is_PCR_or_optical_duplicate == o.read_is_PCR_or_optical_duplicate &&
supplementary_alignment == o.supplementary_alignment;
}
};

/*
a struct for handling each record in a bam file
*/
struct BAMRecord {
std::vector<CigarPair>
cigar;
std::string
cigar_string;

std::string
reference_name;
int
reference_start;
int
reference_end;

std::string
read_name;

float
AS_tag;
int
query_alignment_length;
int
mapping_quality;
Flag
flag;
};


Flag
read_flag(int);

BAMRecord
read_record(const bam1_t*, const bam_header_t*);

int calculate_length_from_cigar(const std::vector<CigarPair> &cigartuples, bool queryLength);

#endif // BAM_RECORD_H
183 changes: 183 additions & 0 deletions src/classes/GeneAnnotationParser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
#include "GeneAnnotationParser.h"

#include <unordered_map>
#include <string>
#include <vector>
#include <set>
#include "zlib.h"

#include "GFFData.h"
#include "GFFRecord.h"
#include "types.h"
#include "StartEndPair.h"
#include "Pos.h"
#include "../utility/utility.h"

enum AnnotationSource { gencode, ensembl };
AnnotationSource guessAnnotationSource(const std::string &filename)
{
gzFile annotation = gzopen(filename.c_str(), "r");

const int MAXLEN = 256;
char buf[MAXLEN];
AnnotationSource returnSource = ensembl;
// Only look through the first 1000 lines
for (int i = 0; i < 1000; i++) {
if (gzgets(annotation, buf, MAXLEN) == NULL) return ensembl;
std::string line(buf);
if (line.find("GENCODE") != std::string::npos) {
returnSource = gencode;
break;
} else if (line.find("1\tEnsembl") != std::string::npos) {
returnSource = ensembl;
break;
}
}

gzclose(annotation);
return returnSource;
}

void
removeTranscriptDuplicates(GFFData * const data, bool updateTranscriptDict=true)
{
for (auto &[transcript, exons] : data->transcript_to_exon) {
ranges::sort<StartEndPair>(exons);
if (exons.size() > 1 && exons[0].start == exons[1].start) {
std::vector<exon> new_exons = {exons[0]};
for (const auto &ex : exons) {
if (ex.start != new_exons.back().start && ex.end != new_exons.back().start)
new_exons.push_back(ex);
}

data->transcript_to_exon[transcript] = new_exons;
}

if (updateTranscriptDict) {
data->transcript_dict[transcript] =
Pos(data->transcript_dict[transcript].chr, data->transcript_to_exon[transcript][0].start,
data->transcript_to_exon[transcript].back().end, data->transcript_dict[transcript].strand,
data->transcript_dict[transcript].parent_id);
}
}

for (const auto &[gene, transcripts] : data->gene_to_transcript) {
// remove duplicates from transcript
// sort transcripts
std::set<std::string> transcript_set(transcripts.begin(), transcripts.end());
data->gene_to_transcript[gene] = std::vector<std::string> (transcript_set.begin(), transcript_set.end());
}
}

GFFData parse_gtf(const std::string &filename)
{
GFFData data;
auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename);

gzFile annotationFile = gzopen(filename.c_str(), "r");
const int MAXLEN = 65535; // maximum char array size
char buf[MAXLEN];
while(gzgets(annotationFile, buf, MAXLEN) != NULL) {
GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction);

if (rec.feature == "gene") {
data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
} else if (rec.feature == "transcript") {
if (rec.attributes.count("gene_id") == 0) continue;

std::string gene_id = rec.attributes["gene_id"];
if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id))
data.chr_to_gene[rec.seqname].push_back(gene_id);
data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]);
data.transcript_dict[rec.attributes["transcript_id"]] =
Pos(rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
} else if (rec.feature == "exon") {
if (rec.attributes.count("gene_id") == 0) continue;

if (data.chr_to_gene.count(rec.seqname) == 0
|| ranges::doesNotContain(data.chr_to_gene[rec.seqname], rec.attributes["gene_id"])) {
data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
}
if (rec.attributes.count("transcript_id")) {
if (data.transcript_dict.count(rec.attributes["transcript_id"]) == 0) {
data.gene_to_transcript[rec.attributes["gene_id"]].push_back(rec.attributes["transcript_id"]);
data.transcript_dict[rec.attributes["transcript_id"]] =
Pos(rec.seqname, -1, 1, rec.strand, rec.attributes["gene_id"]);
}
data.transcript_to_exon[rec.attributes["transcript_id"]].push_back(
StartEndPair(rec.start - 1, rec.end)
);
}
}

}
gzclose(annotationFile);

removeTranscriptDuplicates(&data, true);

return data;
}

GFFData parse_gff(const std::string &filename)
{
GFFData data;

AnnotationSource annotation_source = guessAnnotationSource(filename);
auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename);

gzFile annotationFile = gzopen(filename.c_str(), "r");
const int MAXLEN = 1000;
char buf[MAXLEN];
while(gzgets(annotationFile, buf, MAXLEN) != NULL) {
GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction);
if (annotation_source == ensembl){
if (rec.attributes.count("gene_id"))
data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
if (rec.attributes.count("Parent")
&& rec.attributes["Parent"].find("gene:") != std::string::npos) {
std::string gene_id = rec.attributes["Parent"].substr(
rec.attributes["Parent"].find_first_of(':') + 1);
data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]);
data.transcript_dict[rec.attributes["transcript_id"]] =
Pos (rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
} else if (rec.feature == "exon") {
if (rec.attributes["Parent"].find("transcript:") == std::string::npos)
throw std::invalid_argument("format error");
std::string gene_id = rec.attributes["Parent"].substr(
rec.attributes["Parent"].find_first_of(':') + 1);
data.transcript_to_exon[gene_id].push_back( StartEndPair(rec.start - 1, rec.end) );
}
} else if (annotation_source == gencode) {
if (rec.feature == "gene") {
data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]);
} else if (rec.feature == "transcript") {
std::string gene_id = rec.attributes["Parent"];
if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id))
data.chr_to_gene[rec.seqname].push_back(gene_id);

data.gene_to_transcript["gene_id"].push_back(rec.attributes["transcript_id"]);
data.transcript_dict[rec.attributes["transcript_id"]] =
Pos( rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id);
} else if (rec.feature == "exon") {
data.transcript_to_exon[rec.attributes["Parent"]].push_back(
StartEndPair(rec.start - 1, rec.end)
);
}
}
}
gzclose(annotationFile);

removeTranscriptDuplicates(&data, false);

return data;
}


GFFData parse_gff_file(const std::string &gff_filename)
{
if (gff_filename.find(".gtf") != std::string::npos) {
return parse_gtf(gff_filename);
}

return parse_gff(gff_filename);
}
10 changes: 10 additions & 0 deletions src/classes/GeneAnnotationParser.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#ifndef GENE_ANNOTATION_PARSER_H
#define GENE_ANNOTATION_PARSER_H

#include <string>

#include "GFFData.h"

GFFData parse_gff_file(const std::string &gff_filename);

#endif // GENE_ANNOTATION_PARSER_H

0 comments on commit a5376a1

Please sign in to comment.