-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added BamRecord, Gene parsing and devtools::check fixes
- Loading branch information
1 parent
eb33725
commit a5376a1
Showing
4 changed files
with
386 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#include "BamRecord.h" | ||
|
||
#include <vector> | ||
#include <string> | ||
#include <codecvt> | ||
#include <Rcpp.h> | ||
|
||
#include "../utility/bam.h" | ||
#include "../utility/cigars.h" | ||
|
||
|
||
int calculate_length_from_cigar(const std::vector<CigarPair> &cigartuples, bool queryLength) { | ||
// calculate the length of either query or reference using a cigar string | ||
// query consuming operations are: MIS=X (0, 1, 4, 7, 8) | ||
// reference consuming operations are: MDN=X (0, 2, 3, 7, 8) | ||
// if queryLength =True, calculate query length, otherwise calculate read length | ||
int len = 0; | ||
for (const auto &pair : cigartuples) { | ||
if (queryLength) { | ||
if (pair.op == 0 || pair.op == 1 || pair.op == 4 || pair.op == 7 || pair.op == 8) { | ||
len += pair.len; | ||
} | ||
} else { | ||
if (pair.op == 0 || pair.op == 2 || pair.op == 3 || pair.op == 7 || pair.op == 8) { | ||
len += pair.len; | ||
} | ||
} | ||
} | ||
|
||
return len; | ||
} | ||
|
||
int calculate_reference_end(const bam1_t *b, const std::vector<CigarPair> &cigartuples) { | ||
// calculate the length of the reference based on reference consuming operations: | ||
// MDN=X (0, 2, 3, 7, 8) | ||
int ref_start = bam_reference_start(b); | ||
ref_start += calculate_length_from_cigar(cigartuples, false); | ||
return ref_start; | ||
} | ||
|
||
int calculate_query_alignment_length(const std::vector<CigarPair> &cigartuples) { | ||
int q_len = calculate_length_from_cigar(cigartuples, true); | ||
|
||
// remove regions of soft clipping from the ends of the query length | ||
if (cigartuples.size() > 0) { | ||
if (cigartuples[0].op == 4) { | ||
q_len -= cigartuples[0].len; | ||
} | ||
if (cigartuples.size() > 1) { | ||
if (cigartuples.back().op == 4) { | ||
q_len -= cigartuples.back().len; | ||
} | ||
} | ||
} | ||
|
||
return q_len; | ||
} | ||
|
||
/* takes a flag int, converts it to all of the properties it encodes for | ||
*/ | ||
Flag | ||
read_flag(int n) | ||
{ | ||
Flag flag; | ||
|
||
flag.read_paired = (n & (1<<0)); | ||
flag.read_mapped_in_proper_pair = (n & (1<<1)); | ||
flag.read_unmapped = (n & (1<<2)); | ||
flag.mate_unmapped = (n & (1<<3)); | ||
flag.read_reverse_strand = (n & (1<<4)); | ||
flag.mate_reverse_strand = (n & (1<<5)); | ||
flag.first_in_pair = (n & (1<<6)); | ||
flag.second_in_pair = (n & (1<<7)); | ||
flag.not_primary_alignment = (n & (1<<8)); | ||
flag.read_fails_platform_vendor_quality_checks = (n & (1<<9)); | ||
flag.read_is_PCR_or_optical_duplicate = (n & (1<<10)); | ||
flag.supplementary_alignment = (n & (1<<11)); | ||
|
||
return flag; | ||
} | ||
|
||
/* takes a single bam entry, converts it into a BAMRecord struct | ||
*/ | ||
BAMRecord | ||
read_record(const bam1_t * b, const bam_header_t * header) | ||
{ | ||
BAMRecord rec; | ||
rec.reference_start = bam_reference_start(b); | ||
rec.reference_name = std::string(header->target_name[b->core.tid]); | ||
rec.AS_tag = bam_aux2i(bam_aux_get(b, "AS")); | ||
rec.read_name = std::string(bam1_qname(b)); | ||
rec.mapping_quality = (int)bam_mapping_qual(b); | ||
rec.cigar = generate_cigar_pairs(b); | ||
rec.cigar_string = generate_cigar(rec.cigar); | ||
rec.reference_end = calculate_reference_end(b, rec.cigar); | ||
rec.query_alignment_length = calculate_query_alignment_length(rec.cigar); | ||
rec.flag = read_flag(b->core.flag); | ||
return rec; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#ifndef BAM_RECORD_H | ||
#define BAM_RECORD_H | ||
|
||
#include <vector> | ||
#include <string> | ||
|
||
#include "../utility/bam.h" | ||
#include "../utility/cigars.h" | ||
|
||
/* | ||
holds all of the flags in a bam record | ||
*/ | ||
struct Flag { | ||
bool | ||
read_paired; | ||
bool | ||
read_mapped_in_proper_pair; | ||
bool | ||
read_unmapped; | ||
bool | ||
mate_unmapped; | ||
bool | ||
read_reverse_strand; | ||
bool | ||
mate_reverse_strand; | ||
bool | ||
first_in_pair; | ||
bool | ||
second_in_pair; | ||
bool | ||
not_primary_alignment; | ||
bool | ||
read_fails_platform_vendor_quality_checks; | ||
bool | ||
read_is_PCR_or_optical_duplicate; | ||
bool | ||
supplementary_alignment; | ||
|
||
inline bool operator==(const Flag &o) const{ | ||
return | ||
read_paired == o.read_paired && | ||
read_mapped_in_proper_pair == o.read_mapped_in_proper_pair && | ||
read_unmapped == o.read_unmapped && | ||
mate_unmapped == o.mate_unmapped && | ||
read_reverse_strand == o.read_reverse_strand && | ||
mate_reverse_strand == o.mate_reverse_strand && | ||
first_in_pair == o.first_in_pair && | ||
second_in_pair == o.second_in_pair && | ||
not_primary_alignment == o.not_primary_alignment && | ||
read_fails_platform_vendor_quality_checks == o.read_fails_platform_vendor_quality_checks && | ||
read_is_PCR_or_optical_duplicate == o.read_is_PCR_or_optical_duplicate && | ||
supplementary_alignment == o.supplementary_alignment; | ||
} | ||
}; | ||
|
||
/* | ||
a struct for handling each record in a bam file | ||
*/ | ||
struct BAMRecord { | ||
std::vector<CigarPair> | ||
cigar; | ||
std::string | ||
cigar_string; | ||
|
||
std::string | ||
reference_name; | ||
int | ||
reference_start; | ||
int | ||
reference_end; | ||
|
||
std::string | ||
read_name; | ||
|
||
float | ||
AS_tag; | ||
int | ||
query_alignment_length; | ||
int | ||
mapping_quality; | ||
Flag | ||
flag; | ||
}; | ||
|
||
|
||
Flag | ||
read_flag(int); | ||
|
||
BAMRecord | ||
read_record(const bam1_t*, const bam_header_t*); | ||
|
||
int calculate_length_from_cigar(const std::vector<CigarPair> &cigartuples, bool queryLength); | ||
|
||
#endif // BAM_RECORD_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
#include "GeneAnnotationParser.h" | ||
|
||
#include <unordered_map> | ||
#include <string> | ||
#include <vector> | ||
#include <set> | ||
#include "zlib.h" | ||
|
||
#include "GFFData.h" | ||
#include "GFFRecord.h" | ||
#include "types.h" | ||
#include "StartEndPair.h" | ||
#include "Pos.h" | ||
#include "../utility/utility.h" | ||
|
||
enum AnnotationSource { gencode, ensembl }; | ||
AnnotationSource guessAnnotationSource(const std::string &filename) | ||
{ | ||
gzFile annotation = gzopen(filename.c_str(), "r"); | ||
|
||
const int MAXLEN = 256; | ||
char buf[MAXLEN]; | ||
AnnotationSource returnSource = ensembl; | ||
// Only look through the first 1000 lines | ||
for (int i = 0; i < 1000; i++) { | ||
if (gzgets(annotation, buf, MAXLEN) == NULL) return ensembl; | ||
std::string line(buf); | ||
if (line.find("GENCODE") != std::string::npos) { | ||
returnSource = gencode; | ||
break; | ||
} else if (line.find("1\tEnsembl") != std::string::npos) { | ||
returnSource = ensembl; | ||
break; | ||
} | ||
} | ||
|
||
gzclose(annotation); | ||
return returnSource; | ||
} | ||
|
||
void | ||
removeTranscriptDuplicates(GFFData * const data, bool updateTranscriptDict=true) | ||
{ | ||
for (auto &[transcript, exons] : data->transcript_to_exon) { | ||
ranges::sort<StartEndPair>(exons); | ||
if (exons.size() > 1 && exons[0].start == exons[1].start) { | ||
std::vector<exon> new_exons = {exons[0]}; | ||
for (const auto &ex : exons) { | ||
if (ex.start != new_exons.back().start && ex.end != new_exons.back().start) | ||
new_exons.push_back(ex); | ||
} | ||
|
||
data->transcript_to_exon[transcript] = new_exons; | ||
} | ||
|
||
if (updateTranscriptDict) { | ||
data->transcript_dict[transcript] = | ||
Pos(data->transcript_dict[transcript].chr, data->transcript_to_exon[transcript][0].start, | ||
data->transcript_to_exon[transcript].back().end, data->transcript_dict[transcript].strand, | ||
data->transcript_dict[transcript].parent_id); | ||
} | ||
} | ||
|
||
for (const auto &[gene, transcripts] : data->gene_to_transcript) { | ||
// remove duplicates from transcript | ||
// sort transcripts | ||
std::set<std::string> transcript_set(transcripts.begin(), transcripts.end()); | ||
data->gene_to_transcript[gene] = std::vector<std::string> (transcript_set.begin(), transcript_set.end()); | ||
} | ||
} | ||
|
||
GFFData parse_gtf(const std::string &filename) | ||
{ | ||
GFFData data; | ||
auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename); | ||
|
||
gzFile annotationFile = gzopen(filename.c_str(), "r"); | ||
const int MAXLEN = 65535; // maximum char array size | ||
char buf[MAXLEN]; | ||
while(gzgets(annotationFile, buf, MAXLEN) != NULL) { | ||
GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction); | ||
|
||
if (rec.feature == "gene") { | ||
data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]); | ||
} else if (rec.feature == "transcript") { | ||
if (rec.attributes.count("gene_id") == 0) continue; | ||
|
||
std::string gene_id = rec.attributes["gene_id"]; | ||
if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id)) | ||
data.chr_to_gene[rec.seqname].push_back(gene_id); | ||
data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]); | ||
data.transcript_dict[rec.attributes["transcript_id"]] = | ||
Pos(rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id); | ||
} else if (rec.feature == "exon") { | ||
if (rec.attributes.count("gene_id") == 0) continue; | ||
|
||
if (data.chr_to_gene.count(rec.seqname) == 0 | ||
|| ranges::doesNotContain(data.chr_to_gene[rec.seqname], rec.attributes["gene_id"])) { | ||
data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]); | ||
} | ||
if (rec.attributes.count("transcript_id")) { | ||
if (data.transcript_dict.count(rec.attributes["transcript_id"]) == 0) { | ||
data.gene_to_transcript[rec.attributes["gene_id"]].push_back(rec.attributes["transcript_id"]); | ||
data.transcript_dict[rec.attributes["transcript_id"]] = | ||
Pos(rec.seqname, -1, 1, rec.strand, rec.attributes["gene_id"]); | ||
} | ||
data.transcript_to_exon[rec.attributes["transcript_id"]].push_back( | ||
StartEndPair(rec.start - 1, rec.end) | ||
); | ||
} | ||
} | ||
|
||
} | ||
gzclose(annotationFile); | ||
|
||
removeTranscriptDuplicates(&data, true); | ||
|
||
return data; | ||
} | ||
|
||
GFFData parse_gff(const std::string &filename) | ||
{ | ||
GFFData data; | ||
|
||
AnnotationSource annotation_source = guessAnnotationSource(filename); | ||
auto attributeParsingFunction = GFFRecord::chooseAttributesFunc(filename); | ||
|
||
gzFile annotationFile = gzopen(filename.c_str(), "r"); | ||
const int MAXLEN = 1000; | ||
char buf[MAXLEN]; | ||
while(gzgets(annotationFile, buf, MAXLEN) != NULL) { | ||
GFFRecord rec = GFFRecord::parseGFFRecord(std::string(buf), attributeParsingFunction); | ||
if (annotation_source == ensembl){ | ||
if (rec.attributes.count("gene_id")) | ||
data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]); | ||
if (rec.attributes.count("Parent") | ||
&& rec.attributes["Parent"].find("gene:") != std::string::npos) { | ||
std::string gene_id = rec.attributes["Parent"].substr( | ||
rec.attributes["Parent"].find_first_of(':') + 1); | ||
data.gene_to_transcript[gene_id].push_back(rec.attributes["transcript_id"]); | ||
data.transcript_dict[rec.attributes["transcript_id"]] = | ||
Pos (rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id); | ||
} else if (rec.feature == "exon") { | ||
if (rec.attributes["Parent"].find("transcript:") == std::string::npos) | ||
throw std::invalid_argument("format error"); | ||
std::string gene_id = rec.attributes["Parent"].substr( | ||
rec.attributes["Parent"].find_first_of(':') + 1); | ||
data.transcript_to_exon[gene_id].push_back( StartEndPair(rec.start - 1, rec.end) ); | ||
} | ||
} else if (annotation_source == gencode) { | ||
if (rec.feature == "gene") { | ||
data.chr_to_gene[rec.seqname].push_back(rec.attributes["gene_id"]); | ||
} else if (rec.feature == "transcript") { | ||
std::string gene_id = rec.attributes["Parent"]; | ||
if (ranges::doesNotContain(data.chr_to_gene[rec.seqname], gene_id)) | ||
data.chr_to_gene[rec.seqname].push_back(gene_id); | ||
|
||
data.gene_to_transcript["gene_id"].push_back(rec.attributes["transcript_id"]); | ||
data.transcript_dict[rec.attributes["transcript_id"]] = | ||
Pos( rec.seqname, rec.start - 1, rec.end, rec.strand, gene_id); | ||
} else if (rec.feature == "exon") { | ||
data.transcript_to_exon[rec.attributes["Parent"]].push_back( | ||
StartEndPair(rec.start - 1, rec.end) | ||
); | ||
} | ||
} | ||
} | ||
gzclose(annotationFile); | ||
|
||
removeTranscriptDuplicates(&data, false); | ||
|
||
return data; | ||
} | ||
|
||
|
||
GFFData parse_gff_file(const std::string &gff_filename) | ||
{ | ||
if (gff_filename.find(".gtf") != std::string::npos) { | ||
return parse_gtf(gff_filename); | ||
} | ||
|
||
return parse_gff(gff_filename); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#ifndef GENE_ANNOTATION_PARSER_H | ||
#define GENE_ANNOTATION_PARSER_H | ||
|
||
#include <string> | ||
|
||
#include "GFFData.h" | ||
|
||
GFFData parse_gff_file(const std::string &gff_filename); | ||
|
||
#endif // GENE_ANNOTATION_PARSER_H |