Skip to content

Commit

Permalink
get_transcript_seq c++
Browse files Browse the repository at this point in the history
  • Loading branch information
OliverVoogd committed Jun 29, 2023
1 parent 6882b28 commit eb33725
Show file tree
Hide file tree
Showing 13 changed files with 1,155 additions and 18 deletions.
39 changes: 39 additions & 0 deletions src/classes/GFFData.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#ifndef GFFDATA_H
#define GFFDATA_H

#include <unordered_map>
#include <vector>
#include <string>
#include <Rcpp.h>

#include "Pos.h" // Pos
#include "types.h" // transcriptvector, exon

struct GFFData
{
std::unordered_map<std::string, std::vector<std::string>>
chr_to_gene;

std::unordered_map<std::string, Pos>
transcript_dict;

std::unordered_map<std::string, transcriptvector>
gene_to_transcript;

std::unordered_map<std::string, std::vector<exon>>
transcript_to_exon;

// Rcpp::List to_R();

// void
// from_R(Rcpp::List list);

// void
// log(std::string filename);

bool is_empty() const {
return (chr_to_gene.size() + transcript_dict.size() + gene_to_transcript.size() + transcript_to_exon.size()) > 0;
}
};

#endif
75 changes: 75 additions & 0 deletions src/classes/GFFRecord.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include "GFFRecord.h"

#include <string>
#include <vector>
#include <functional>

#include "../utility/parsing.h"
#include "../utility/utility.h"

/**
 * Select the attribute-column parser that matches an annotation file name.
 *
 * @param filename name (or path) of the annotation file
 * @return parseGTFAttributes when the name contains the substring ".gtf"
 *         anywhere, parseGFFAttributes otherwise
 */
std::function<std::unordered_map<std::string, std::string>(const std::string &)> GFFRecord::chooseAttributesFunc(const std::string &filename) {
    const bool nameMentionsGTF = filename.find(".gtf") != std::string::npos;
    if (!nameMentionsGTF) {
        return GFFRecord::parseGFFAttributes;
    }
    return GFFRecord::parseGTFAttributes;
}

/**
 * Parse the attribute column of a GFF3 record.
 *
 * The column is a ';'-separated list of `tag=value` pairs. The previous
 * implementation searched for ',' instead of '=', kept the separator as the
 * first character of the value, and called substr(npos) -- which throws
 * std::out_of_range -- whenever the separator was missing.
 *
 * @param attributeString raw attribute column; "." means "no attributes"
 * @return map from tag to value; tokens without a '=' (including empty
 *         tokens produced by a trailing ';') are skipped
 */
std::unordered_map<std::string, std::string> GFFRecord::parseGFFAttributes(const std::string &attributeString) {
    if (attributeString == ".") return {};

    std::unordered_map<std::string, std::string> attributes;
    const std::vector<std::string> tokens = splitStringToVector(attributeString, ';');
    for (const auto &attribute : tokens) {
        const size_t equalsPos = attribute.find('=');
        // No '=' means this token is not a tag=value pair; skip it rather
        // than letting substr() throw.
        if (equalsPos == std::string::npos) continue;

        const std::string key = attribute.substr(0, equalsPos);
        // +1 so the '=' itself is not part of the value.
        const std::string value = attribute.substr(equalsPos + 1);
        attributes[key] = value;
    }

    return attributes;
}

/**
 * Parse the attribute column of a GTF record.
 *
 * The column is split on ';'. Each non-empty piece is split on the double
 * quote character; if that yields fewer than two parts, the piece is
 * left-stripped and split on a single space instead. Pieces that still have
 * fewer than two parts are ignored. The first two parts, stripped, become
 * the key and the value.
 *
 * @param attributeString raw attribute column; "." means "no attributes"
 * @return map from attribute key to attribute value
 */
std::unordered_map<std::string, std::string> GFFRecord::parseGTFAttributes(const std::string &attributeString) {
    std::unordered_map<std::string, std::string> parsed;
    if (attributeString == ".") {
        return parsed;
    }

    const std::vector<std::string> fields = splitStringToVector(attributeString, ';');
    for (const auto &field : fields) {
        if (field.empty()) {
            continue;
        }

        // Preferred form: key "value"
        std::vector<std::string> parts = splitStringToVector(field, '\"');
        if (parts.size() < 2) {
            // Fallback form: key value (unquoted)
            parts = splitStringToVector(leftStrip(field), ' ');
        }
        if (parts.size() < 2) {
            continue;
        }

        const std::string key = strip(parts[0]);
        const std::string value = strip(parts[1]);
        parsed[key] = value;
    }
    return parsed;
}



/**
 * Parse one tab-separated GFF/GTF line into a GFFRecord.
 *
 * A column containing only "." is treated as missing: text columns become
 * the empty string and numeric columns (start, end, score, frame) become -1.
 * The frame column previously fell back to the empty string, which made
 * stoi throw std::invalid_argument for every record whose frame was ".".
 *
 * @param line                one line of the annotation file
 * @param parseAttributesFunc parser applied to the ninth (attributes) column
 * @return the parsed record, or a default-constructed GFFRecord when the
 *         line does not have exactly nine tab-separated columns
 * @throws std::invalid_argument / std::out_of_range (from std::stoi /
 *         std::stof) when a numeric column holds non-numeric or
 *         out-of-range text
 */
GFFRecord GFFRecord::parseGFFRecord(
    const std::string &line,
    std::function<std::unordered_map<std::string, std::string>(const std::string &)> parseAttributesFunc) {

    const std::vector<std::string> tokens = splitStringToVector(line, '\t');
    if (tokens.size() != 9) return {};

    // Substitute `fallback` for a column that is just the "." placeholder.
    const auto orDefault = [](const std::string &field, const char *fallback) {
        return field == "." ? std::string(fallback) : field;
    };

    GFFRecord rec;
    rec.seqname = orDefault(tokens[0], "");
    rec.source  = orDefault(tokens[1], "");
    rec.feature = orDefault(tokens[2], "");
    rec.start   = std::stoi(orDefault(tokens[3], "-1"));
    rec.end     = std::stoi(orDefault(tokens[4], "-1"));
    rec.score   = std::stof(orDefault(tokens[5], "-1"));
    // operator[] at index 0 is valid even for an empty string (yields '\0').
    rec.strand  = tokens[6][0];
    rec.frame   = std::stoi(orDefault(tokens[7], "-1"));

    rec.attributes = parseAttributesFunc(tokens[8]);

    return rec;
}
28 changes: 28 additions & 0 deletions src/classes/GFFRecord.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef GFFRECORD_H
#define GFFRECORD_H

#include <unordered_map>
#include <string>
#include <vector>
#include <functional>

/**
 * One parsed line of a GFF/GTF annotation file.
 *
 * Every scalar member has a default so that a default-constructed record
 * (which parseGFFRecord returns for malformed lines) never exposes
 * indeterminate values. The defaults mirror what the parser stores for a
 * column containing only ".": -1 for numeric columns, '.' for the strand.
 */
struct GFFRecord {
    std::string seqname, source, feature;   // columns 1-3
    int start = -1, end = -1;               // columns 4-5
    float score = -1.0f;                    // column 6
    char strand = '.';                      // column 7 (first character)
    int frame = -1;                         // column 8
    std::unordered_map<std::string, std::string> attributes; // column 9, parsed

    GFFRecord() = default;

    /**
     * Parse one tab-separated line, using parseAttributesFunc for column 9.
     */
    static GFFRecord parseGFFRecord(
        const std::string &line,
        std::function<std::unordered_map<std::string, std::string>(const std::string &)> parseAttributesFunc);

    /** Parse a GFF-style attribute column into a key -> value map. */
    static std::unordered_map<std::string, std::string> parseGFFAttributes(const std::string &attributes);
    /** Parse a GTF-style attribute column into a key -> value map. */
    static std::unordered_map<std::string, std::string> parseGTFAttributes(const std::string &attributes);
    /** Pick one of the two attribute parsers above based on the file name. */
    static std::function<std::unordered_map<std::string, std::string>(const std::string &)> chooseAttributesFunc(const std::string &filename);
};

#endif // GFFRECORD_H
28 changes: 28 additions & 0 deletions src/classes/GeneBlocks.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef GENEBLOCKS_H
#define GENEBLOCKS_H

#include <vector>
#include <string>
#include <map>
#include <algorithm>

#include "types.h"

/**
 * A genomic interval [start, end] together with the transcripts of every
 * gene that has been added to it.
 */
struct GeneBlocks
{
    int start, end;
    // transcripts of all genes in the block, in insertion order
    transcriptvector transcript_list;
    // gene name -> the transcripts that were supplied with that gene
    std::map<std::string, transcriptvector> gene_to_transcript;

    /**
     * Create a block holding a single gene.
     *
     * @param _start           block start
     * @param _end             block end
     * @param _transcript_list transcripts of a_gene
     * @param a_gene           name of the gene
     */
    GeneBlocks(int _start, int _end, const transcriptvector &_transcript_list, const std::string &a_gene)
        : start{_start}, end{_end}, transcript_list{_transcript_list}, gene_to_transcript{{a_gene, _transcript_list}}
    {}

    /**
     * Add another gene to the block.
     *
     * Extends `end` if needed, appends the gene's transcripts to the block's
     * transcript list, and records the gene's own transcripts under its name.
     *
     * The previous implementation stored the block's accumulated
     * `transcript_list` member under `a_gene`, so every added gene was mapped
     * to the transcripts of all genes added so far -- inconsistent with the
     * constructor, which maps the gene to its own transcripts only.
     *
     * @param _start           currently unused; `start` is left unchanged
     *                         (presumably genes are added in ascending start
     *                         order -- TODO confirm against callers)
     * @param _end             end of the gene being added
     * @param _transcript_list transcripts of a_gene
     * @param a_gene           name of the gene
     */
    void add_gene(int _start, int _end, const transcriptvector &_transcript_list, const std::string &a_gene)
    {
        (void)_start;
        end = std::max(end, _end);
        transcript_list.insert(transcript_list.end(), _transcript_list.begin(), _transcript_list.end());
        gene_to_transcript[a_gene] = _transcript_list;
    }
};
#endif
53 changes: 53 additions & 0 deletions src/classes/Pos.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#ifndef POS_H
#define POS_H

#include <string>
#include <Rcpp.h>

/**
 * A stranded interval on a chromosome plus the id of its parent feature.
 *
 * The scalar members carry default initializers so that a default-constructed
 * Pos (for example one created implicitly by operator[] on a map) holds
 * well-defined values instead of indeterminate ones.
 */
struct Pos
{
    std::string chr;
    int start = 0;
    int end = 0;
    char strand = '\0';
    std::string parent_id;

    /** Empty position: empty strings, start == end == 0, strand == '\0'. */
    Pos() {}

    /** Construct a position from its five components. */
    Pos(std::string _chr, int _start, int _end, char _strand, std::string _parent_id)
        : chr(_chr), start(_start), end(_end), strand(_strand), parent_id(_parent_id) {}
};

/**
 * Member-wise equality of two Pos values.
 *
 * @return true when chr, start, end, strand and parent_id all match
 */
inline bool comparePos(const Pos &a, const Pos &b) {
    if (!(a.chr == b.chr)) return false;
    if (a.start != b.start) return false;
    if (a.end != b.end) return false;
    if (a.strand != b.strand) return false;
    return a.parent_id == b.parent_id;
}

/**
 * Wrap a Pos into a named Rcpp list with the elements
 * "chr", "start", "end", "strand" and "parent_id".
 *
 * @param pos position to convert; dereferenced without a null check
 */
inline Rcpp::List pos_to_R(Pos * pos) {
    const Pos &p = *pos;
    return Rcpp::List::create(
        Rcpp::Named("chr") = p.chr,
        Rcpp::Named("start") = p.start,
        Rcpp::Named("end") = p.end,
        Rcpp::Named("strand") = p.strand,
        Rcpp::Named("parent_id") = p.parent_id
    );
}

/**
 * Build a Pos from an Rcpp list holding the elements
 * "chr", "start", "end", "strand" and "parent_id".
 */
inline Pos pos_from_R(Rcpp::List list)
{
    // The string elements go through Rcpp::String before becoming std::string.
    const std::string chr = static_cast<Rcpp::String>(list["chr"]);
    const int start = list["start"];
    const int end = list["end"];
    const char strand = list["strand"];
    const std::string parent_id = static_cast<Rcpp::String>(list["parent_id"]);

    return Pos(chr, start, end, strand, parent_id);
}

#endif // POS_H
116 changes: 116 additions & 0 deletions src/classes/StartEndPair.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#ifndef START_END_PAIR_H
#define START_END_PAIR_H

#include <vector>
#include <sstream>
#include <string>

/**
 * A (start, end) pair of integers with value equality and a lexicographic
 * ordering: pairs are ordered by start first, then by end.
 */
struct StartEndPair {
    int start;
    int end;

    StartEndPair(int _start, int _end): start{_start}, end{_end} {}
    StartEndPair(const StartEndPair &sep): start(sep.start), end(sep.end) {}

    // Two pairs are equal when both coordinates match.
    bool operator==(const StartEndPair &other) const
    {
        return start == other.start && end == other.end;
    }
    bool operator!=(const StartEndPair &other) const
    {
        return !(*this == other);
    }

    // Lexicographic "less than": start decides, end breaks ties.
    bool operator<(const StartEndPair &other) const
    {
        if (start != other.start) {
            return start < other.start;
        }
        return end < other.end;
    }

    // Lexicographic "greater than": start decides, end breaks ties.
    bool operator>(const StartEndPair &other) const
    {
        if (start != other.start) {
            return start > other.start;
        }
        return end > other.end;
    }

    // The ordering is total, so ">=" is exactly "not <".
    bool operator>=(const StartEndPair &other) const
    {
        return !(*this < other);
    }

    // The ordering is total, so "<=" is exactly "not >".
    bool operator<=(const StartEndPair &other) const
    {
        return !(*this > other);
    }

    // Render the pair as "(start, end)".
    std::string getString() const {
        std::ostringstream out;
        out << "(" << start;
        out << ", " << end << ")";
        return out.str();
    }
};

/*
    Hash specializations follow so that StartEndPair, std::vector<StartEndPair>
    and std::vector<int> can be used as keys in unordered containers
    (e.g. std::unordered_map).
*/
namespace std {
    // Hash of a single pair: the hash of `end` is shifted left by one bit
    // before being XOR-ed with the hash of `start`; the result is then
    // shifted right by one bit.
    template <> struct hash<StartEndPair>
    {
        std::size_t operator()(const StartEndPair& pair) const
        {
            const std::size_t startHash = std::hash<int>{}(pair.start);
            const std::size_t endHash = std::hash<int>{}(pair.end);
            return (startHash ^ (endHash << 1)) >> 1;
        }
    };

    // Hash of a vector of pairs: starts from the vector length and XORs in
    // one term per element. XOR is commutative, so the result does not
    // depend on the order of the elements.
    template<> struct hash<vector<StartEndPair>>
    {
        std::size_t operator()(const vector<StartEndPair>& pairs) const
        {
            std::size_t seed = pairs.size();
            for (const StartEndPair& item : pairs) {
                const std::size_t startHash = std::hash<int>{}(item.start);
                const std::size_t endHash = std::hash<int>{}(item.end);
                seed ^= (startHash ^ endHash) >> 1;
            }
            return seed;
        }
    };

    // Hash of a vector of ints: starts from the vector length and folds each
    // element in, together with the constant 0x9e3779b9 and two shifted
    // copies of the running seed, so the result depends on element order.
    template <> struct hash<vector<int>>
    {
        size_t operator()(vector<int> const& values) const
        {
            size_t seed = values.size();
            for (const int& value : values) {
                // `value + 0x9e3779b9` is deliberately evaluated first, in
                // unsigned int arithmetic, before widening to size_t.
                const size_t mixed = value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
                seed ^= mixed;
            }
            return seed;
        }
    };
}

// Ordering predicate that looks at `start` only: returns true when a begins
// strictly before b. The `end` members are ignored.
inline bool StartEndPairCompare(const StartEndPair &a, const StartEndPair &b) {
    const bool aStartsFirst = b.start > a.start;
    return aStartsFirst;
}

#endif // START_END_PAIR_H
Loading

0 comments on commit eb33725

Please sign in to comment.