-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6882b28
commit eb33725
Showing
13 changed files
with
1,155 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#ifndef GFFDATA_H | ||
#define GFFDATA_H | ||
|
||
#include <unordered_map> | ||
#include <vector> | ||
#include <string> | ||
#include <Rcpp.h> | ||
|
||
#include "Pos.h" // Pos | ||
#include "types.h" // transcriptvector, exon | ||
|
||
struct GFFData | ||
{ | ||
std::unordered_map<std::string, std::vector<std::string>> | ||
chr_to_gene; | ||
|
||
std::unordered_map<std::string, Pos> | ||
transcript_dict; | ||
|
||
std::unordered_map<std::string, transcriptvector> | ||
gene_to_transcript; | ||
|
||
std::unordered_map<std::string, std::vector<exon>> | ||
transcript_to_exon; | ||
|
||
// Rcpp::List to_R(); | ||
|
||
// void | ||
// from_R(Rcpp::List list); | ||
|
||
// void | ||
// log(std::string filename); | ||
|
||
bool is_empty() const { | ||
return (chr_to_gene.size() + transcript_dict.size() + gene_to_transcript.size() + transcript_to_exon.size()) > 0; | ||
} | ||
}; | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#include "GFFRecord.h" | ||
|
||
#include <string> | ||
#include <vector> | ||
#include <functional> | ||
|
||
#include "../utility/parsing.h" | ||
#include "../utility/utility.h" | ||
|
||
// Selects the attribute-column parser from the annotation file name:
// any name containing ".gtf" gets the GTF parser, everything else the GFF one.
std::function<std::unordered_map<std::string, std::string>(const std::string &)> GFFRecord::chooseAttributesFunc(const std::string &filename) {
    const bool nameMentionsGTF = filename.find(".gtf") != std::string::npos;
    if (!nameMentionsGTF) {
        return GFFRecord::parseGFFAttributes;
    }
    return GFFRecord::parseGTFAttributes;
}
|
||
std::unordered_map<std::string, std::string> GFFRecord::parseGFFAttributes(const std::string &attributeString) { | ||
if (attributeString == ".") return {}; | ||
|
||
std::unordered_map<std::string, std::string> attributes; | ||
std::vector<std::string> tokens = splitStringToVector(attributeString, ';'); | ||
for (const auto &attribute : tokens) { | ||
size_t commaPos = attribute.find(','); | ||
std::string key = attribute.substr(0, commaPos); | ||
std::string value = attribute.substr(commaPos); | ||
attributes[key] = value; | ||
} | ||
|
||
return attributes; | ||
} | ||
|
||
std::unordered_map<std::string, std::string> GFFRecord::parseGTFAttributes(const std::string &attributeString) { | ||
if (attributeString == ".") return {}; | ||
|
||
std::unordered_map<std::string, std::string> attributes; | ||
std::vector<std::string> tokens = splitStringToVector(attributeString, ';'); | ||
|
||
for (const auto &attribute : tokens) { | ||
if (attribute.size() == 0) continue; | ||
std::vector<std::string> items = splitStringToVector(attribute, '\"'); | ||
if (items.size() < 2) { | ||
items = splitStringToVector(leftStrip(attribute), ' '); | ||
if (items.size() < 2) continue; | ||
} | ||
std::string key = strip(items[0]); | ||
std::string value = strip(items[1]); | ||
attributes[key] = value; | ||
} | ||
return attributes; | ||
} | ||
|
||
|
||
|
||
// Parses one tab-separated annotation line into a GFFRecord.
//
// @param line                 a single line of the file.
// @param parseAttributesFunc  parser applied to the ninth (attribute) column.
// @return the populated record, or a default-constructed GFFRecord when the
//         line does not split into exactly nine columns.
// @throws std::invalid_argument / std::out_of_range (from std::stoi and
//         std::stof) when start, end, score or frame is neither "." nor a
//         parsable number.
//
// A column holding "." is treated as missing: text columns become "", numeric
// columns become -1. The frame column previously substituted "" for ".", which
// made std::stoi throw on every record without a frame; it now uses -1 like
// the other numeric columns.
GFFRecord GFFRecord::parseGFFRecord(
    const std::string &line,
    std::function<std::unordered_map<std::string, std::string>(const std::string &)> parseAttributesFunc) {

    const std::vector<std::string> tokens = splitStringToVector(line, '\t');
    if (tokens.size() != 9) return {};

    // Substitute `fallback` for the "." missing-value marker.
    const auto valueOr = [](const std::string &field, const char *fallback) -> std::string {
        return field == "." ? std::string(fallback) : field;
    };

    GFFRecord rec;
    rec.seqname = valueOr(tokens[0], "");
    rec.source = valueOr(tokens[1], "");
    rec.feature = valueOr(tokens[2], "");
    rec.start = std::stoi(valueOr(tokens[3], "-1"));
    rec.end = std::stoi(valueOr(tokens[4], "-1"));
    rec.score = std::stof(valueOr(tokens[5], "-1"));
    rec.strand = tokens[6][0]; // '\0' when the column is empty
    rec.frame = std::stoi(valueOr(tokens[7], "-1"));

    rec.attributes = parseAttributesFunc(tokens[8]);

    return rec;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#ifndef GFFRECORD_H | ||
#define GFFRECORD_H | ||
|
||
#include <unordered_map> | ||
#include <string> | ||
#include <vector> | ||
#include <functional> | ||
|
||
// One parsed line of a GFF/GTF annotation file.
//
// Every scalar member carries a default so that a default-constructed record
// (which parseGFFRecord returns for lines that do not have nine columns) has
// well-defined contents instead of indeterminate ints/floats. -1 mirrors the
// sentinel parseGFFRecord stores for a missing ("." ) start, end or score.
struct GFFRecord {
    std::string seqname;
    std::string source;
    std::string feature;
    int start = -1;
    int end = -1;
    float score = -1.0f;
    char strand = '.';
    int frame = -1;
    // key -> value pairs from the attribute column
    std::unordered_map<std::string, std::string> attributes;

    // Kept user-provided so the type's aggregate status is unchanged.
    GFFRecord() {}

    // Parses one tab-separated line, using parseAttributesFunc for the
    // attribute column.
    static GFFRecord parseGFFRecord(
        const std::string &line,
        std::function<std::unordered_map<std::string, std::string>(const std::string &)> parseAttributesFunc);

    // Attribute-column parsers for the two file flavours.
    static std::unordered_map<std::string, std::string> parseGFFAttributes(const std::string &attributes);
    static std::unordered_map<std::string, std::string> parseGTFAttributes(const std::string &attributes);

    // Chooses one of the two parsers above from the file name.
    static std::function<std::unordered_map<std::string, std::string>(const std::string &)> chooseAttributesFunc(const std::string &filename);
};
|
||
#endif // GFFRECORD_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#ifndef GENEBLOCKS_H | ||
#define GENEBLOCKS_H | ||
|
||
#include <vector> | ||
#include <string> | ||
#include <map> | ||
#include <algorithm> | ||
|
||
#include "types.h" | ||
|
||
struct GeneBlocks | ||
{ | ||
int start, end; | ||
transcriptvector transcript_list; | ||
std::map<std::string, transcriptvector> gene_to_transcript; | ||
|
||
GeneBlocks(int _start, int _end, const transcriptvector &_transcript_list, const std::string &a_gene) | ||
: start{_start}, end{_end}, transcript_list{_transcript_list}, gene_to_transcript{{a_gene, _transcript_list}} | ||
{} | ||
|
||
void add_gene(int _start, int _end, const transcriptvector &_transcript_list, const std::string &a_gene) | ||
{ | ||
end = std::max(end, _end); | ||
transcript_list.insert(transcript_list.end(), _transcript_list.begin(), _transcript_list.end()); | ||
gene_to_transcript[a_gene] = transcriptvector(transcript_list); | ||
} | ||
}; | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#ifndef POS_H | ||
#define POS_H | ||
|
||
#include <string> | ||
#include <Rcpp.h> | ||
|
||
// A stranded interval on a chromosome plus the identifier of its parent
// feature.
//
// The scalar members are zero-initialised so that a default-constructed Pos
// (e.g. one created by unordered_map::operator[]) can be read or compared
// without touching indeterminate values.
struct Pos
{
    std::string chr;
    int start = 0;
    int end = 0;
    char strand = '\0';
    std::string parent_id;

    Pos() {}
    Pos(std::string _chr, int _start, int _end, char _strand, std::string _parent_id)
        : chr(_chr), start(_start), end(_end), strand(_strand), parent_id(_parent_id) {}
};
|
||
inline bool comparePos(const Pos &a, const Pos &b) { | ||
return a.chr == b.chr | ||
&& a.start == b.start | ||
&& a.end == b.end | ||
&& a.strand == b.strand | ||
&& a.parent_id == b.parent_id; | ||
} | ||
|
||
// Wraps a Pos into a named Rcpp list with the entries
// "chr", "start", "end", "strand" and "parent_id".
// `pos` is dereferenced unconditionally, so it must not be null.
inline Rcpp::List pos_to_R(Pos * pos) {
    const Pos &p = *pos;
    return Rcpp::List::create(
        Rcpp::Named("chr") = p.chr,
        Rcpp::Named("start") = p.start,
        Rcpp::Named("end") = p.end,
        Rcpp::Named("strand") = p.strand,
        Rcpp::Named("parent_id") = p.parent_id
    );
}
|
||
// Builds a Pos from an Rcpp list by reading its "chr", "start", "end",
// "strand" and "parent_id" entries (the same names pos_to_R writes).
inline Pos pos_from_R(Rcpp::List list)
{
    return Pos(
        (Rcpp::String)(list["chr"]),
        list["start"],
        list["end"],
        list["strand"],
        (Rcpp::String)list["parent_id"]
    );
}
|
||
#endif // POS_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#ifndef START_END_PAIR_H | ||
#define START_END_PAIR_H | ||
|
||
#include <vector> | ||
#include <sstream> | ||
#include <string> | ||
|
||
// An integer (start, end) pair, ordered lexicographically: first by start,
// then by end.
struct StartEndPair {
    int start;
    int end;

    StartEndPair(int _start, int _end): start{_start}, end{_end} {}
    StartEndPair(const StartEndPair &sep): start(sep.start), end(sep.end) {}

    // Equality: both coordinates match.
    bool operator==(const StartEndPair &other) const
    {
        return !(start != other.start || end != other.end);
    }
    bool operator!=(const StartEndPair &other) const
    {
        return !(start == other.start && end == other.end);
    }

    // Strict ordering: start decides; end breaks ties.
    bool operator<(const StartEndPair &other) const
    {
        if (start != other.start) {
            return start < other.start;
        }
        return end < other.end;
    }

    bool operator>(const StartEndPair &other) const
    {
        if (start != other.start) {
            return start > other.start;
        }
        return end > other.end;
    }

    bool operator>=(const StartEndPair &other) const
    {
        if ((*this) == other) {
            return true;
        }
        return (*this) > other;
    }

    bool operator<=(const StartEndPair &other) const
    {
        if ((*this) == other) {
            return true;
        }
        return (*this) < other;
    }

    // Renders the pair as "(start, end)".
    std::string getString() const {
        std::ostringstream text;
        text << "(" << start;
        text << ", " << end << ")";
        return text.str();
    }
};
|
||
/* | ||
next, we will need hashing functions | ||
this is so that we can use a std::vector<StartEndPair> as a key in a dictionary | ||
*/ | ||
namespace std { | ||
template <> struct hash<StartEndPair> | ||
{ | ||
std::size_t operator()(const StartEndPair& k) const | ||
{ | ||
using std::size_t; | ||
using std::hash; | ||
|
||
return ((hash<int>()(k.start) | ||
^ (hash<int>()(k.end) << 1)) >> 1); | ||
} | ||
}; | ||
|
||
template<> struct hash<vector<StartEndPair>> | ||
{ | ||
std::size_t operator()(const vector<StartEndPair>& vec) const | ||
{ | ||
using std::size_t; | ||
using std::hash; | ||
|
||
std::size_t seed = vec.size(); | ||
|
||
for (auto& pair : vec) { | ||
seed ^= ((hash<int>()(pair.start) ^ (hash<int>()(pair.end))) >> 1); | ||
} | ||
return seed; | ||
} | ||
}; | ||
|
||
template <> struct hash<vector<int>> | ||
{ | ||
size_t operator()(vector<int> const& vec) const | ||
{ | ||
size_t seed = vec.size(); | ||
for(auto& i : vec) { | ||
seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2); | ||
} | ||
return seed; | ||
} | ||
}; | ||
} | ||
|
||
inline bool StartEndPairCompare(const StartEndPair &a, const StartEndPair &b) { | ||
// compare a and b, return true if a is 'less than' b | ||
// in this case, 'less than' is defined if a.start is less than b.start | ||
return a.start < b.start; | ||
} | ||
|
||
#endif // START_END_PAIR_H |
Oops, something went wrong.