Skip to content

Commit

Permalink
fixed stack limit error when running group_bam2isoform
Browse files Browse the repository at this point in the history
  • Loading branch information
OliverVoogd committed Jul 20, 2023
1 parent d3da049 commit f262bf2
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 32 deletions.
17 changes: 17 additions & 0 deletions inst/python/find_isoform.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,20 @@ def find_isoform_multisample(gff3, genome_bams, isoform_gff3, tss_tes_stat, geno

sys.stdout.flush()
return {"transcript_dict": transcript_dict, "transcript_dict_i": transcript_dict_i}


if __name__=="__main__":
data = "/Users/voogd.o/Documents/FLAMESintermediate/SIRV/"
genome_bam = data + "FLAMESout/align2genome.bam"
gff3 = data + "SIRV_isoforms_multi-fasta-annotation_C_170612a.gtf"
genomefa = data +"SIRV_isoforms_multi-fasta_170612a.fasta"
config_file = data + "SIRV_config_default.json"

out = "/Users/voogd.o/Documents/FLAMESintermediate/SIRV/flamesc++out/py/"
isoform_gff3 = out + "isoform_annotated.gff3"
tss_tes_stat = out +"tss_tes.bedgraph"
transcript_fa = out + "transcript_assembly.fa"

config = {'comment': 'this is the default config for SIRV spike-in data. use splice annotation on alignment.', 'pipeline_parameters': {'seed': 2022, 'do_genome_alignment': True, 'do_isoform_identification': True, 'bambu_isoform_identification': False, 'do_read_realignment': True, 'do_transcript_quantification': True}, 'barcode_parameters': {'max_edit_distance': 2, 'has_UMI': False}, 'isoform_parameters': {'generate_raw_isoform': True, 'max_dist': 10, 'max_ts_dist': 100, 'max_splice_match_dist': 10, 'min_fl_exon_len': 40, 'max_site_per_splice': 3, 'min_sup_cnt': 10, 'min_cnt_pct': 0.01, 'min_sup_pct': 0.2, 'bambu_trust_reference': True, 'strand_specific': 1, 'remove_incomp_reads': 5, 'downsample_ratio': 1}, 'alignment_parameters': {'use_junctions': True, 'no_flank': True}, 'realign_parameters': {'use_annotation': True}, 'transcript_counting': {'min_tr_coverage': 0.75, 'min_read_coverage': 0.75}}

find_isoform(gff3, genome_bam, isoform_gff3, tss_tes_stat, genomefa, transcript_fa, 1, config, "")
12 changes: 10 additions & 2 deletions inst/python/sc_longread.py
Original file line number Diff line number Diff line change
Expand Up @@ -1175,6 +1175,8 @@ def group_bam2isoform(bam_in, out_gff3, out_stat, summary_csv, chr_to_blocks, ge
tss_tes_stat = open(out_stat, "w")
isoform_dict = {}
fa_dict = {}

debuglog = open("/Users/voogd.o/Documents/FLAMESintermediate/SIRV/flamesc++out/py/debug.log", "w")
for c in get_fa(fa_f):
fa_dict[c[0]] = c[1]
for ch in sorted(chr_to_blocks.keys()):
Expand All @@ -1198,14 +1200,20 @@ def group_bam2isoform(bam_in, out_gff3, out_stat, summary_csv, chr_to_blocks, ge
blocks = get_blocks(rec)
junctions = blocks_to_junctions(blocks)
tmp_isoform.add_isoform(junctions, rec.is_reverse)

debuglog.write(rec.query_name + ", " + rec.cigarstring + ", " + str(blocks) + ", " + str(junctions) + "\n")
debuglog.write("tmp_isoform length: " + str(len(tmp_isoform)) + "\n")

if len(tmp_isoform) > 0:
tmp_isoform.update_all_splice()
debuglog.write("splice: " + str(len(tmp_isoform)) + "\n")
tmp_isoform.filter_TSS_TES(
tss_tes_stat, known_site=TSS_TES_site, fdr_cutoff=0.1)
# tmp_isoform.site_stat(tss_tes_stat)
# issue
# tmp_isoform.match_known_annotation(
# transcript_to_junctions, transcript_dict, gene_dict, bl, fa_dict)
debuglog.write("filter: " + str(len(tmp_isoform)) + "\ncnt_pct: " + str(config["isoform_parameters"]["min_cnt_pct"]) + "\n")
if tmp_isoform.match_known_annotation(
transcript_to_junctions, transcript_dict, gene_dict, bl, fa_dict) == 0:
tmp_isoform.match_known_seg(
Expand All @@ -1215,8 +1223,8 @@ def group_bam2isoform(bam_in, out_gff3, out_stat, summary_csv, chr_to_blocks, ge
splice_raw.write(tmp_isoform.raw_splice_to_gff3())
iso_annotated.write(tmp_isoform.isoform_to_gff3(
isoform_pct=config["isoform_parameters"]["min_cnt_pct"]))
# with open(iso_exact,"w") as out_f:
# out_f.write("##gff-version 3\n")
break
debuglog.close()
tss_tes_stat.close()
iso_annotated.close()
bamfile.close()
Expand Down
35 changes: 17 additions & 18 deletions src/main-functions/group_bam2isoform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,20 +91,23 @@ get_blocks(const BAMRecord &record) {

void
merge_files_and_delete(std::ofstream &outfile, const std::vector<std::string> &infiles) {
if (!outfile.is_open()) return;

for (const auto &file : infiles) {
std::ifstream infile(file);

if (!infile.is_open()) continue;

std::string line;
while (std::getline(infile, line)) {
outfile << line;
}
infile.close();
foreachLineinFile(file, [&outfile](const std::string &line) {
outfile << line << "\n";
});

// std::remove(file.c_str());
std::remove(file.c_str());
}
}
void merge_files_and_delete(const std::string &filename, const std::vector<std::string> &infiles) {
std::ofstream outfile(filename);
merge_files_and_delete(outfile, infiles);
outfile.close();
}

static int
bam2isoform_fetch_function(const bam1_t *b, void *data)
Expand All @@ -125,6 +128,7 @@ bam2isoform_fetch_function(const bam1_t *b, void *data)

data_struct->isoform->add_isoform(junctions, rec.flag.read_reverse_strand);
data_struct->i = data_struct->i + 1;

return 0;
}

Expand All @@ -151,10 +155,9 @@ void group_bam2isoform(
// import all the values of fa_f
const std::unordered_map<std::string, std::string> fa_dict = get_fa_dict(fa_file);

Rcpp::Rcout << "starting main group_bam2isoform\n";
std::string iso_annotated_output_prefix = splitStringToVector(out_gff3, '.').front();
std::string tss_tes_output_prefix = splitStringToVector(out_stat, '.').front();
std::string raw_gff3_prefix = raw_gff3 != "" ? splitStringToVector(raw_gff3, '.').front() : "";
std::string iso_annotated_output_prefix = getFilenameBeforeExt(out_gff3, '.');
std::string tss_tes_output_prefix = getFilenameBeforeExt(out_stat, '.');
std::string raw_gff3_prefix = raw_gff3 != "" ? getFilenameBeforeExt(raw_gff3, '.') : "";
std::vector<std::thread> pool;
std::vector<std::string> isoform_annotated_files;
std::vector<std::string> tss_tes_files;
Expand Down Expand Up @@ -228,6 +231,7 @@ void group_bam2isoform(

tmp_isoform.update_all_splice();
tmp_isoform.filter_TSS_TES(tss_tes_stream, TSS_TES_site, (float)0.1);

tmp_isoform.match_known_annotation(
transcript_to_junctions,
transcript_dict,
Expand Down Expand Up @@ -266,10 +270,7 @@ void group_bam2isoform(
merge_files_and_delete(iso_annotated, isoform_annotated_files);
iso_annotated.close();

std::ofstream tss_tes_stat (out_stat);
merge_files_and_delete(tss_tes_stat, tss_tes_files);
tss_tes_stat.close();

merge_files_and_delete(out_stat, tss_tes_files);
// add to splice_raw if we are planning on outputting raw_gff3
std::ofstream splice_raw;
if (raw_gff3 != "") {
Expand All @@ -278,6 +279,4 @@ void group_bam2isoform(
merge_files_and_delete(splice_raw, raw_gff3_files);
splice_raw.close();
}

Rcpp::Rcout << "Finished!\n";
}
15 changes: 15 additions & 0 deletions src/utility/parsing.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <vector>
#include <cctype>
#include <sstream>
#include <fstream>

inline std::vector<std::string> splitStringToVector(const std::string &str, const char sep) {
std::vector<std::string> tokens;
Expand All @@ -15,6 +16,11 @@ inline std::vector<std::string> splitStringToVector(const std::string &str, cons
}
return tokens;
}
inline std::string getFilenameBeforeExt(const std::string &str, const char sep='.') {
int i = str.size() - 1;
while (str[i] != sep) i--;
return str.substr(0, i);
}

inline std::string leftStrip(const std::string &str) {
size_t firstCharPos = 0;
Expand All @@ -32,4 +38,13 @@ inline std::string strip(const std::string &str) {
std::string intermediate = leftStrip(str);
return rightStrip(intermediate);
}

inline void foreachLineinFile(const std::string &filename, const std::function<void(const std::string &)> func) {
std::ifstream infile(filename.c_str());
if (!infile.is_open()) return;

std::string line;
while (std::getline(infile, line)) func(line);
infile.close();
}
#endif // PARSING_H
27 changes: 15 additions & 12 deletions src/utility/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,22 +109,25 @@ inline std::string PosToStr(const Pos &p) {
return ss.str();
}

inline std::string VecToStr(const std::vector<std::string> &v) {
std::stringstream ss;
for (auto it : v) {
ss << it << "\t";
}
return ss.str();
}
// inline std::string VecToStr(const std::vector<std::string> &v) {
// std::stringstream ss;
// for (auto it : v) {
// ss << it << "\t";
// }
// return ss.str();
// }

template <typename T>
inline std::string VecToStr(const std::vector<T> &v) {
inline std::string VecToStr(const std::vector<T> &v, char open='(', char close=')') {
std::stringstream ss;
ss << "{";
for (const auto &it : v) {
ss << it << ",";
ss << open;
if (v.size() > 0) {
ss << v.at(0);
for (size_t i = 1; i < v.size(); i++) {
ss << ", " << v.at(i);
}
}
ss << "}";
ss << close;
return ss.str();
}
/*
Expand Down

0 comments on commit f262bf2

Please sign in to comment.