Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add all reads to merged fusion #1

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .vscode/c_cpp_properties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Win32",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [
"_DEBUG",
"UNICODE",
"_UNICODE"
],
"configurationProvider": "ms-vscode.makefile-tools",
"compilerPath": "C:/msys64/mingw64/bin/gcc.exe"
}
],
"version": 4
}
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ bioconda:

# make arriba executable
arriba: $(SOURCE)/arriba.cpp $(SOURCE)/annotation.o $(SOURCE)/assembly.o $(SOURCE)/options.o $(SOURCE)/read_chimeric_alignments.o $(SOURCE)/filter_duplicates.o $(SOURCE)/filter_uninteresting_contigs.o $(SOURCE)/filter_viral_contigs.o $(SOURCE)/filter_top_expressed_viral_contigs.o $(SOURCE)/filter_low_coverage_viral_contigs.o $(SOURCE)/filter_inconsistently_clipped.o $(SOURCE)/filter_homopolymer.o $(SOURCE)/read_stats.o $(SOURCE)/fusions.o $(SOURCE)/filter_proximal_read_through.o $(SOURCE)/filter_same_gene.o $(SOURCE)/filter_small_insert_size.o $(SOURCE)/filter_long_gap.o $(SOURCE)/filter_hairpin.o $(SOURCE)/filter_multimappers.o $(SOURCE)/filter_mismatches.o $(SOURCE)/filter_low_entropy.o $(SOURCE)/filter_relative_support.o $(SOURCE)/filter_both_intronic.o $(SOURCE)/filter_non_coding_neighbors.o $(SOURCE)/filter_intragenic_both_exonic.o $(SOURCE)/recover_internal_tandem_duplication.o $(SOURCE)/filter_min_support.o $(SOURCE)/recover_known_fusions.o $(SOURCE)/recover_both_spliced.o $(SOURCE)/filter_blacklisted_ranges.o $(SOURCE)/filter_end_to_end.o $(SOURCE)/filter_in_vitro.o $(SOURCE)/merge_adjacent_fusions.o $(SOURCE)/select_best.o $(SOURCE)/filter_marginal_read_through.o $(SOURCE)/filter_short_anchor.o $(SOURCE)/filter_no_coverage.o $(SOURCE)/filter_homologs.o $(SOURCE)/filter_mismappers.o $(SOURCE)/recover_many_spliced.o $(SOURCE)/filter_genomic_support.o $(SOURCE)/recover_isoforms.o $(SOURCE)/annotate_tags.o $(SOURCE)/annotate_protein_domains.o $(SOURCE)/output_fusions.o $(SOURCE)/read_compressed_file.o
$(CXX) $(CXXFLAGS) $(CPPFLAGS) -I$(SOURCE) -I$(STATIC_LIBS)/htslib -I$(STATIC_LIBS)/tsl -o arriba $^ $(LDFLAGS) $(LIBS_A) $(LIBS_SO)
$(CXX) $(CXXFLAGS) $(CPPFLAGS) -I$(SOURCE) -I$(STATIC_LIBS)/htslib -I$(STATIC_LIBS)/tsl -g -o arriba $^ $(LDFLAGS) $(LIBS_A) $(LIBS_SO)
%.o: %.cpp $(wildcard $(SOURCE)/*.hpp) $(LIBS_A) $(STATIC_LIBS)/tsl/htrie_map.h
$(CXX) -c $(CXXFLAGS) $(CPPFLAGS) -I$(STATIC_LIBS)/htslib -I$(STATIC_LIBS)/tsl -o $@ $<
$(CXX) -g -c $(CXXFLAGS) $(CPPFLAGS) -I$(STATIC_LIBS)/htslib -I$(STATIC_LIBS)/tsl -o $@ $<

# download and compile dependencies for a static build
WGET := $(shell (which wget && echo "--no-check-certificate -O -") || echo "curl -k -L")
Expand Down
Binary file added arriba
Binary file not shown.
14 changes: 14 additions & 0 deletions dd_arriba.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
sample=$1


/data008/users/yshen/arriba_debug/arriba -x /data008/users/yshen/benchmark_results/CTAT_FUSIONTRANS/STARfusion_results/$sample/Aligned.out.bam \
-o /data008/users/yshen/arriba_debug/output3/$sample/fusions.tsv \
-a /data008/users/yshen/arriba_v2.4.0/hg19/hs37d5viral.fa \
-g /data008/users/yshen/arriba_v2.4.0/hg19/GENCODE19.gtf \
-b /data008/users/yshen/arriba_v2.4.0/database/blacklist_hg19_hs37d5_GRCh37_v2.4.0.tsv.gz \
-k /data008/users/yshen/arriba_v2.4.0/database/known_fusions_hg19_hs37d5_GRCh37_v2.4.0.tsv.gz \
-t /data008/users/yshen/arriba_v2.4.0/database/known_fusions_hg19_hs37d5_GRCh37_v2.4.0.tsv.gz \
-p /data008/users/yshen/arriba_v2.4.0/database/protein_domains_hg19_hs37d5_GRCh37_v2.4.0.gff3 \
#-f homologs
#-X
#-O /data008/users/yshen/arriba_debug/test_output/fusions.discarded.tsv \
15 changes: 8 additions & 7 deletions run_arriba.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,16 @@ tee Aligned.out.bam |
"$BASE_DIR/arriba" \
-x /dev/stdin \
-o fusions.tsv -O fusions.discarded.tsv \
-X \
-a "$ASSEMBLY_FA" -g "$ANNOTATION_GTF" -b "$BLACKLIST_TSV" -k "$KNOWN_FUSIONS_TSV" -t "$TAGS_TSV" -p "$PROTEIN_DOMAINS_GFF3" \
# -d structural_variants_from_WGS.tsv

# sorting and indexing is only required for visualization
if [[ $(samtools --version-only 2> /dev/null) =~ ^1\. ]]; then
samtools sort -@ "$THREADS" -m $((40000/THREADS))M -T tmp -O bam Aligned.out.bam > Aligned.sortedByCoord.out.bam
rm -f Aligned.out.bam
samtools index Aligned.sortedByCoord.out.bam
else
echo "samtools >= 1.0 required for sorting of alignments" 1>&2
fi
#if [[ $(samtools --version-only 2> /dev/null) =~ ^1\. ]]; then
# samtools sort -@ "$THREADS" -m $((40000/THREADS))M -T tmp -O bam Aligned.out.bam > Aligned.sortedByCoord.out.bam
# rm -f Aligned.out.bam
# samtools index Aligned.sortedByCoord.out.bam
#else
# echo "samtools >= 1.0 required for sorting of alignments" 1>&2
#fi

Binary file added source/annotate_protein_domains.o
Binary file not shown.
Binary file added source/annotate_tags.o
Binary file not shown.
Binary file added source/annotation.o
Binary file not shown.
11 changes: 7 additions & 4 deletions source/arriba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ int main(int argc, char **argv) {
}
}


// if the alignment maps neither to an exon nor to a gene, make a dummy gene which subsumes all alignments with a distance of 10kb
gene_annotation_t unmapped_alignments;
for (chimeric_alignments_t::iterator chimeric_alignment = chimeric_alignments.begin(); chimeric_alignment != chimeric_alignments.end(); ++chimeric_alignment) {
Expand Down Expand Up @@ -422,10 +423,12 @@ int main(int argc, char **argv) {
cout << "(remaining=" << merge_adjacent_fusions(fusions, 5, options.max_itd_length) << ")" << endl;
}

disjoint_set homolog_union;

// this step must come before the e-value calculation, or else multi-mapping reads are counted redundantly
if (options.filters.at("multimappers")) {
cout << get_time_string() << " Filtering multi-mapping fusions by alignment score and read support " << flush;
cout << "(remaining=" << filter_multimappers(chimeric_alignments, fusions, exon_annotation_index, assembly) << ")" << endl;
cout << "(remaining=" << filter_multimappers(chimeric_alignments, fusions, exon_annotation_index, assembly,homolog_union) << ")" << endl;
}

// this step must come after the 'merge_adjacent' filter,
Expand Down Expand Up @@ -555,7 +558,7 @@ int main(int argc, char **argv) {
// this step must come near the end, because it is expensive in terms of memory consumption
if (options.filters.at("homologs")) {
cout << get_time_string() << " Filtering genes with >=" << (options.max_homolog_identity*100) << "% identity " << flush;
cout << "(remaining=" << filter_homologs(fusions, kmer_indices, kmer_length, assembly, options.max_homolog_identity) << ")" << endl;
cout << "(remaining=" << filter_homologs(fusions, kmer_indices, kmer_length, assembly, options.max_homolog_identity,homolog_union) << ")" << endl;
}

// this step must come near the end, because it is expensive in terms of memory and CPU consumption
Expand Down Expand Up @@ -602,11 +605,11 @@ int main(int argc, char **argv) {
}

cout << get_time_string() << " Writing fusions to file '" << options.output_file << "' " << endl;
write_fusions_to_file(fusions, options.output_file, coverage, assembly, gene_annotation_index, exon_annotation_index, original_contig_names, tags, protein_domain_annotation_index, max_mate_gap, options.max_itd_length, true, options.fill_sequence_gaps, false);
write_fusions_to_file(fusions, options.output_file, coverage, assembly, gene_annotation_index, exon_annotation_index, original_contig_names, tags, protein_domain_annotation_index, max_mate_gap, options.max_itd_length, true, options.fill_sequence_gaps, false,homolog_union);

if (options.discarded_output_file != "") {
cout << get_time_string() << " Writing discarded fusions to file '" << options.discarded_output_file << "'" << endl;
write_fusions_to_file(fusions, options.discarded_output_file, coverage, assembly, gene_annotation_index, exon_annotation_index, original_contig_names, tags, protein_domain_annotation_index, max_mate_gap, options.max_itd_length, options.print_extra_info_for_discarded_fusions, options.fill_sequence_gaps, true);
write_fusions_to_file(fusions, options.discarded_output_file, coverage, assembly, gene_annotation_index, exon_annotation_index, original_contig_names, tags, protein_domain_annotation_index, max_mate_gap, options.max_itd_length, options.print_extra_info_for_discarded_fusions, options.fill_sequence_gaps,true,homolog_union);
}

cout << get_time_string() << " Freeing resources" << endl;
Expand Down
Binary file added source/assembly.o
Binary file not shown.
71 changes: 71 additions & 0 deletions source/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
#include <sstream>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <unordered_map>
#include <vector>
#include "sam.h"
#include <iostream>

using namespace std;

Expand Down Expand Up @@ -284,12 +286,81 @@ struct fusion_t {
};
typedef unordered_map< tuple<unsigned int /*gene1 id*/, unsigned int /*gene2 id*/, contig_t /*contig1*/, contig_t /*contig2*/, position_t /*breakpoint1*/, position_t /*breakpoint2*/, direction_t /*direction1*/, direction_t /*direction2*/>,fusion_t > fusions_t;

// supplementary functions for printing newly-implemented data structures
inline std::ostream& operator<<(std::ostream& os, const fusion_t& f) {
return os << "{fusion " << f.gene1->name << '|' << f.gene2->name << '}';
}

template<typename K, typename V>
inline std::ostream& operator<<(std::ostream& os, const std::unordered_map<K, V>& m) {
os << "map {";
for(const auto& pair : m) {
os << ' ' << pair.first << ',' << pair.second;
}
return os << " }";
}

template<typename K>
inline std::ostream& operator<<(std::ostream& os, const std::vector<K>& m) {
os << "vec {";
for(const auto& pair : m) {
os << ' ' << pair;
}
return os << " }";
}

typedef char strandedness_t;
const strandedness_t STRANDEDNESS_NO = 0;
const strandedness_t STRANDEDNESS_YES = 1;
const strandedness_t STRANDEDNESS_REVERSE = 2;
const strandedness_t STRANDEDNESS_AUTO = 3;

struct disjoint_set {

unordered_map<fusion_t*,fusion_t*> parent;
unordered_map<fusion_t*,int> rank;

fusion_t* root(fusion_t* ft){
if(ft != parent[ft]) parent[ft]=root(parent[ft]);
return parent[ft];
}

void make_set(fusion_t* ft){
if(parent[ft] == NULL || parent[ft] == 0)
parent[ft] = ft;
}

void union_set(fusion_t* a, fusion_t* b) {
fusion_t* root_a = root(a);
fusion_t* root_b = root(b);
if (root_a != root_b) {
if (rank[root_a] > rank[root_b]){
parent[root_b] = root_a;
}else{
parent[root_a] = root_b;
if(rank[root_a] == rank[root_b])
rank[root_b]++;
}
}
}

vector<fusion_t*> get_all_members(fusion_t* ft){
vector<fusion_t*> members;
if(parent[ft]==0){
return members;
}
fusion_t* ft_root = root(ft);
for(const auto& elem:parent){
if(root(elem.first) == ft_root){
members.push_back(elem.first);
}
}
return members;
}

};


// implement hash() function for tuples so they can be used as keys in unordered_maps
namespace std {

Expand Down
Binary file added source/filter_blacklisted_ranges.o
Binary file not shown.
Binary file added source/filter_both_intronic.o
Binary file not shown.
27 changes: 20 additions & 7 deletions source/filter_duplicates.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <iostream>
#include <unordered_map>
#include <tuple>
#include "common.hpp"
Expand All @@ -7,7 +8,7 @@ using namespace std;

unsigned int filter_duplicates(chimeric_alignments_t& chimeric_alignments, const bool external_duplicate_marking) {
unsigned int remaining = 0;
unordered_map< tuple<contig_t,contig_t,position_t,position_t> , unsigned int> duplicate_count;
unordered_map< tuple<contig_t,contig_t,contig_t,position_t,position_t,position_t> , unsigned int> duplicate_count;
for (chimeric_alignments_t::iterator chimeric_alignment = chimeric_alignments.begin(); chimeric_alignment != chimeric_alignments.end(); ++chimeric_alignment) {
if (chimeric_alignment->second.filter != FILTER_none)
continue; // read has already been filtered
Expand All @@ -28,14 +29,25 @@ unsigned int filter_duplicates(chimeric_alignments_t& chimeric_alignments, const
chimeric_alignment->second[MATE1].start - chimeric_alignment->second[MATE1].preclipping() :
chimeric_alignment->second[MATE1].end + chimeric_alignment->second[MATE1].postclipping()
);
unsigned int mate2 = (chimeric_alignment->second.size() == 2) ? MATE2 : SUPPLEMENTARY;
//unsigned int mate2 = (chimeric_alignment->second.size() == 2) ? MATE2 : SUPPLEMENTARY;
position_t position2 = static_cast<position_t>(
(chimeric_alignment->second[mate2].strand == FORWARD) ?
chimeric_alignment->second[mate2].start - chimeric_alignment->second[mate2].preclipping() :
chimeric_alignment->second[mate2].end + chimeric_alignment->second[mate2].postclipping()
(chimeric_alignment->second[MATE2].strand == FORWARD) ?
chimeric_alignment->second[MATE2].start - chimeric_alignment->second[MATE2].preclipping() :
chimeric_alignment->second[MATE2].end + chimeric_alignment->second[MATE2].postclipping()
);
contig_t contig1 = chimeric_alignment->second[MATE1].contig;
contig_t contig2 = chimeric_alignment->second[mate2].contig;
contig_t contig2 = chimeric_alignment->second[MATE2].contig;
contig_t contig3 = static_cast<position_t>(0);
position_t position3 = static_cast<position_t>(0);

if(chimeric_alignment->second.size() > 2){
position3 = static_cast<position_t>(
(chimeric_alignment->second[SUPPLEMENTARY].strand == FORWARD) ?
chimeric_alignment->second[SUPPLEMENTARY].start - chimeric_alignment->second[SUPPLEMENTARY].preclipping() :
chimeric_alignment->second[SUPPLEMENTARY].end + chimeric_alignment->second[SUPPLEMENTARY].postclipping()
);
contig3 = chimeric_alignment->second[SUPPLEMENTARY].contig;
}

// always put the mate with the lower coordinate in first position
// or else we might not recognize the duplicate
Expand All @@ -44,7 +56,8 @@ unsigned int filter_duplicates(chimeric_alignments_t& chimeric_alignments, const
swap(contig1, contig2);
}

if (duplicate_count[make_tuple(contig1, contig2, position1, position2)]++ > 0)

if (duplicate_count[make_tuple(contig1, contig2, contig3, position1, position2, position3)]++ > 0)
chimeric_alignment->second.filter = FILTER_duplicates;
else
++remaining;
Expand Down
Binary file added source/filter_duplicates.o
Binary file not shown.
Binary file added source/filter_end_to_end.o
Binary file not shown.
Binary file added source/filter_genomic_support.o
Binary file not shown.
Binary file added source/filter_hairpin.o
Binary file not shown.
22 changes: 13 additions & 9 deletions source/filter_homologs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ bool is_homolog(const gene_t gene1, const gene_t gene2, const kmer_indices_t& km
return false;
}

unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indices, const char kmer_length, const assembly_t& assembly, const float max_identity_fraction) {
unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indices, const char kmer_length, const assembly_t& assembly, const float max_identity_fraction,disjoint_set& homolog_union) {

// select non-discarded fusions for better speed,
// we need to iterate over them many times
Expand Down Expand Up @@ -118,14 +118,18 @@ unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indi
if (is_homolog(homolog1, homolog2, kmer_indices, kmer_length, assembly, max_identity_fraction)) {

// other event must have poorer alignments or fewer reads or a worse e-value for us to consider its supporting reads to be mismappers
if (anchor1 > anchor2 ||
anchor1 == anchor2 && (**fusion).supporting_reads() > (**other_fusion).supporting_reads() ||
anchor1 == anchor2 && (**fusion).supporting_reads() == (**other_fusion).supporting_reads() && (**fusion).evalue <= (**other_fusion).evalue) {
(**other_fusion).filter = FILTER_homologs;
} else {
(**fusion).filter = FILTER_homologs;
break;
}
// if (anchor1 > anchor2 ||
// anchor1 == anchor2 && (**fusion).supporting_reads() > (**other_fusion).supporting_reads() ||
// anchor1 == anchor2 && (**fusion).supporting_reads() == (**other_fusion).supporting_reads() && (**fusion).evalue <= (**other_fusion).evalue) {
// (**other_fusion).filter = FILTER_homologs;
// } else {
// (**fusion).filter = FILTER_homologs;
// break;
// }
// the above commented code is the original version in Arriba, which would filter out the homologous fusions. Here I use a disjoint_set to keep them recorded and reported.
homolog_union.make_set(*other_fusion);
homolog_union.make_set(*fusion);
homolog_union.union_set(*other_fusion,*fusion);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion source/filter_homologs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@

using namespace std;

unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indices, const char kmer_length, const assembly_t& assembly, const float max_identity_fraction);
unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indices, const char kmer_length, const assembly_t& assembly, const float max_identity_fraction,disjoint_set& homolog_union);

#endif /* FILTER_HOMOLOGS_H */
Binary file added source/filter_homologs.o
Binary file not shown.
Binary file added source/filter_homopolymer.o
Binary file not shown.
Binary file added source/filter_in_vitro.o
Binary file not shown.
Binary file added source/filter_inconsistently_clipped.o
Binary file not shown.
Binary file added source/filter_intragenic_both_exonic.o
Binary file not shown.
Binary file added source/filter_long_gap.o
Binary file not shown.
Binary file added source/filter_low_coverage_viral_contigs.o
Binary file not shown.
Binary file added source/filter_low_entropy.o
Binary file not shown.
Binary file added source/filter_marginal_read_through.o
Binary file not shown.
Binary file added source/filter_min_support.o
Binary file not shown.
Binary file added source/filter_mismappers.o
Binary file not shown.
Binary file added source/filter_mismatches.o
Binary file not shown.
Loading