OceanGenomics · acloop62hz · Jul 28, 2023 · Sep 5, 2023
diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
@@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Win32",
+            "includePath": [
+                "${workspaceFolder}/**"
+            ],
+            "defines": [
+                "_DEBUG",
+                "UNICODE",
+                "_UNICODE"
+            ],
+            "configurationProvider": "ms-vscode.makefile-tools",
+            "compilerPath": "C:/msys64/mingw64/bin/gcc.exe"
+        }
+    ],
+    "version": 4
+}
diff --git a/Makefile b/Makefile
@@ -14,9 +14,9 @@ bioconda:
 
 # make arriba executable
 arriba: $(SOURCE)/arriba.cpp $(SOURCE)/annotation.o $(SOURCE)/assembly.o $(SOURCE)/options.o $(SOURCE)/read_chimeric_alignments.o $(SOURCE)/filter_duplicates.o $(SOURCE)/filter_uninteresting_contigs.o $(SOURCE)/filter_viral_contigs.o $(SOURCE)/filter_top_expressed_viral_contigs.o $(SOURCE)/filter_low_coverage_viral_contigs.o $(SOURCE)/filter_inconsistently_clipped.o $(SOURCE)/filter_homopolymer.o $(SOURCE)/read_stats.o $(SOURCE)/fusions.o $(SOURCE)/filter_proximal_read_through.o $(SOURCE)/filter_same_gene.o $(SOURCE)/filter_small_insert_size.o $(SOURCE)/filter_long_gap.o $(SOURCE)/filter_hairpin.o $(SOURCE)/filter_multimappers.o $(SOURCE)/filter_mismatches.o $(SOURCE)/filter_low_entropy.o $(SOURCE)/filter_relative_support.o $(SOURCE)/filter_both_intronic.o $(SOURCE)/filter_non_coding_neighbors.o $(SOURCE)/filter_intragenic_both_exonic.o $(SOURCE)/recover_internal_tandem_duplication.o $(SOURCE)/filter_min_support.o $(SOURCE)/recover_known_fusions.o $(SOURCE)/recover_both_spliced.o $(SOURCE)/filter_blacklisted_ranges.o $(SOURCE)/filter_end_to_end.o $(SOURCE)/filter_in_vitro.o $(SOURCE)/merge_adjacent_fusions.o $(SOURCE)/select_best.o $(SOURCE)/filter_marginal_read_through.o $(SOURCE)/filter_short_anchor.o $(SOURCE)/filter_no_coverage.o $(SOURCE)/filter_homologs.o $(SOURCE)/filter_mismappers.o $(SOURCE)/recover_many_spliced.o $(SOURCE)/filter_genomic_support.o $(SOURCE)/recover_isoforms.o $(SOURCE)/annotate_tags.o $(SOURCE)/annotate_protein_domains.o $(SOURCE)/output_fusions.o $(SOURCE)/read_compressed_file.o
-	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -I$(SOURCE) -I$(STATIC_LIBS)/htslib -I$(STATIC_LIBS)/tsl -o arriba $^ $(LDFLAGS) $(LIBS_A) $(LIBS_SO)
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -I$(SOURCE) -I$(STATIC_LIBS)/htslib -I$(STATIC_LIBS)/tsl -g -o arriba $^ $(LDFLAGS) $(LIBS_A) $(LIBS_SO)
 %.o: %.cpp $(wildcard $(SOURCE)/*.hpp) $(LIBS_A) $(STATIC_LIBS)/tsl/htrie_map.h
-	$(CXX) -c $(CXXFLAGS) $(CPPFLAGS) -I$(STATIC_LIBS)/htslib -I$(STATIC_LIBS)/tsl -o $@ $<
+	$(CXX) -g -c $(CXXFLAGS) $(CPPFLAGS) -I$(STATIC_LIBS)/htslib -I$(STATIC_LIBS)/tsl -o $@ $<
 
 # download and compile dependencies for a static build
 WGET := $(shell (which wget && echo "--no-check-certificate -O -") || echo "curl -k -L")

diff --git a/arriba b/arriba
diff --git a/dd_arriba.sh b/dd_arriba.sh
@@ -0,0 +1,14 @@
+sample=$1
+
+
+/data008/users/yshen/arriba_debug/arriba -x /data008/users/yshen/benchmark_results/CTAT_FUSIONTRANS/STARfusion_results/$sample/Aligned.out.bam \
+       -o /data008/users/yshen/arriba_debug/output3/$sample/fusions.tsv \
+       -a /data008/users/yshen/arriba_v2.4.0/hg19/hs37d5viral.fa \
+       -g /data008/users/yshen/arriba_v2.4.0/hg19/GENCODE19.gtf \
+       -b /data008/users/yshen/arriba_v2.4.0/database/blacklist_hg19_hs37d5_GRCh37_v2.4.0.tsv.gz \
+       -k /data008/users/yshen/arriba_v2.4.0/database/known_fusions_hg19_hs37d5_GRCh37_v2.4.0.tsv.gz \
+       -t /data008/users/yshen/arriba_v2.4.0/database/known_fusions_hg19_hs37d5_GRCh37_v2.4.0.tsv.gz \
+       -p /data008/users/yshen/arriba_v2.4.0/database/protein_domains_hg19_hs37d5_GRCh37_v2.4.0.gff3 \
+       #-f homologs
+       #-X 
+       #-O /data008/users/yshen/arriba_debug/test_output/fusions.discarded.tsv \
diff --git a/run_arriba.sh b/run_arriba.sh
@@ -41,15 +41,16 @@ tee Aligned.out.bam |
 "$BASE_DIR/arriba" \
 	-x /dev/stdin \
 	-o fusions.tsv -O fusions.discarded.tsv \
+	-X \
 	-a "$ASSEMBLY_FA" -g "$ANNOTATION_GTF" -b "$BLACKLIST_TSV" -k "$KNOWN_FUSIONS_TSV" -t "$TAGS_TSV" -p "$PROTEIN_DOMAINS_GFF3" \
 #	-d structural_variants_from_WGS.tsv
 
 # sorting and indexing is only required for visualization
-if [[ $(samtools --version-only 2> /dev/null) =~ ^1\. ]]; then
-	samtools sort -@ "$THREADS" -m $((40000/THREADS))M -T tmp -O bam Aligned.out.bam > Aligned.sortedByCoord.out.bam
-	rm -f Aligned.out.bam
-	samtools index Aligned.sortedByCoord.out.bam
-else
-	echo "samtools >= 1.0 required for sorting of alignments" 1>&2
-fi
+#if [[ $(samtools --version-only 2> /dev/null) =~ ^1\. ]]; then
+#	samtools sort -@ "$THREADS" -m $((40000/THREADS))M -T tmp -O bam Aligned.out.bam > Aligned.sortedByCoord.out.bam
+#	rm -f Aligned.out.bam
+#	samtools index Aligned.sortedByCoord.out.bam
+#else
+#	echo "samtools >= 1.0 required for sorting of alignments" 1>&2
+#fi
 
diff --git a/source/annotate_protein_domains.o b/source/annotate_protein_domains.o
diff --git a/source/annotate_tags.o b/source/annotate_tags.o
diff --git a/source/annotation.o b/source/annotation.o
diff --git a/source/arriba.cpp b/source/arriba.cpp
@@ -204,6 +204,7 @@ int main(int argc, char **argv) {
 		}
 	}
 
+
 	// if the alignment maps neither to an exon nor to a gene, make a dummy gene which subsumes all alignments with a distance of 10kb
 	gene_annotation_t unmapped_alignments;
 	for (chimeric_alignments_t::iterator chimeric_alignment = chimeric_alignments.begin(); chimeric_alignment != chimeric_alignments.end(); ++chimeric_alignment) {
@@ -422,10 +423,12 @@ int main(int argc, char **argv) {
 		cout << "(remaining=" << merge_adjacent_fusions(fusions, 5, options.max_itd_length) << ")" << endl;
 	}
 
+	disjoint_set homolog_union;
+
 	// this step must come before the e-value calculation, or else multi-mapping reads are counted redundantly
 	if (options.filters.at("multimappers")) {
 		cout << get_time_string() << " Filtering multi-mapping fusions by alignment score and read support " << flush;
-		cout << "(remaining=" << filter_multimappers(chimeric_alignments, fusions, exon_annotation_index, assembly) << ")" << endl;
+		cout << "(remaining=" << filter_multimappers(chimeric_alignments, fusions, exon_annotation_index, assembly,homolog_union) << ")" << endl;
 	}
 
 	// this step must come after the 'merge_adjacent' filter,
@@ -555,7 +558,7 @@ int main(int argc, char **argv) {
 	// this step must come near the end, because it is expensive in terms of memory consumption
 	if (options.filters.at("homologs")) {
 		cout << get_time_string() << " Filtering genes with >=" << (options.max_homolog_identity*100) << "% identity " << flush;
-		cout << "(remaining=" << filter_homologs(fusions, kmer_indices, kmer_length, assembly, options.max_homolog_identity) << ")" << endl;
+		cout << "(remaining=" << filter_homologs(fusions, kmer_indices, kmer_length, assembly, options.max_homolog_identity,homolog_union) << ")" << endl;
 	}
 
 	// this step must come near the end, because it is expensive in terms of memory and CPU consumption
@@ -602,11 +605,11 @@ int main(int argc, char **argv) {
 	}
 
 	cout << get_time_string() << " Writing fusions to file '" << options.output_file << "' " << endl;
-	write_fusions_to_file(fusions, options.output_file, coverage, assembly, gene_annotation_index, exon_annotation_index, original_contig_names, tags, protein_domain_annotation_index, max_mate_gap, options.max_itd_length, true, options.fill_sequence_gaps, false);
+	write_fusions_to_file(fusions, options.output_file, coverage, assembly, gene_annotation_index, exon_annotation_index, original_contig_names, tags, protein_domain_annotation_index, max_mate_gap, options.max_itd_length, true, options.fill_sequence_gaps, false,homolog_union);
 
 	if (options.discarded_output_file != "") {
 		cout << get_time_string() << " Writing discarded fusions to file '" << options.discarded_output_file << "'" << endl;
-		write_fusions_to_file(fusions, options.discarded_output_file, coverage, assembly, gene_annotation_index, exon_annotation_index, original_contig_names, tags, protein_domain_annotation_index, max_mate_gap, options.max_itd_length, options.print_extra_info_for_discarded_fusions, options.fill_sequence_gaps, true);
+		write_fusions_to_file(fusions, options.discarded_output_file, coverage, assembly, gene_annotation_index, exon_annotation_index, original_contig_names, tags, protein_domain_annotation_index, max_mate_gap, options.max_itd_length, options.print_extra_info_for_discarded_fusions, options.fill_sequence_gaps,true,homolog_union);
 	}
 
 	cout << get_time_string() << " Freeing resources" << endl;

diff --git a/source/assembly.o b/source/assembly.o
diff --git a/source/common.hpp b/source/common.hpp
@@ -12,9 +12,11 @@
 #include <sstream>
 #include <tuple>
 #include <type_traits>
+#include <unordered_set>
 #include <unordered_map>
 #include <vector>
 #include "sam.h"
+#include <iostream>
 
 using namespace std;
 
@@ -284,12 +286,81 @@ struct fusion_t {
 };
 typedef unordered_map< tuple<unsigned int /*gene1 id*/, unsigned int /*gene2 id*/, contig_t /*contig1*/, contig_t /*contig2*/, position_t /*breakpoint1*/, position_t /*breakpoint2*/, direction_t /*direction1*/, direction_t /*direction2*/>,fusion_t > fusions_t;
 
+// supplementary functions for printing newly-implemented data structures
+inline std::ostream& operator<<(std::ostream& os, const fusion_t& f) {
+	return os << "{fusion " << f.gene1->name << '|' << f.gene2->name << '}';
+}
+
+template<typename K, typename V>
+inline std::ostream& operator<<(std::ostream& os, const std::unordered_map<K, V>& m) {
+	os << "map {";
+	for(const auto& pair : m) {
+		os << ' ' << pair.first << ',' << pair.second;
+	}
+	return os << " }";
+} 
+
+template<typename K>
+inline std::ostream& operator<<(std::ostream& os, const std::vector<K>& m) {
+	os << "vec {";
+	for(const auto& pair : m) {
+		os << ' ' << pair;
+	}
+	return os << " }";
+} 
+
 typedef char strandedness_t;
 const strandedness_t STRANDEDNESS_NO = 0;
 const strandedness_t STRANDEDNESS_YES = 1;
 const strandedness_t STRANDEDNESS_REVERSE = 2;
 const strandedness_t STRANDEDNESS_AUTO = 3;
 
+struct disjoint_set {
+
+	unordered_map<fusion_t*,fusion_t*> parent;
+	unordered_map<fusion_t*,int> rank;
+
+	fusion_t* root(fusion_t* ft){
+		if(ft != parent[ft]) parent[ft]=root(parent[ft]);
+		return parent[ft];
+	}
+
+	void make_set(fusion_t* ft){
+		if(parent[ft] == NULL || parent[ft] == 0)
+			parent[ft] = ft;
+	}
+
+	void union_set(fusion_t* a, fusion_t* b) {
+		fusion_t* root_a = root(a);
+		fusion_t* root_b = root(b);
+		if (root_a != root_b) {
+			if (rank[root_a] > rank[root_b]){
+				parent[root_b] = root_a;
+			}else{
+				parent[root_a] = root_b;
+				if(rank[root_a] == rank[root_b])
+					rank[root_b]++;
+			}
+		}
+	}
+
+	vector<fusion_t*> get_all_members(fusion_t* ft){
+		vector<fusion_t*> members;
+		if(parent[ft]==0){
+			return members;
+		}
+		fusion_t* ft_root = root(ft);
+		for(const auto& elem:parent){
+			if(root(elem.first) == ft_root){
+				members.push_back(elem.first);
+			}
+		}
+		return members;
+	}
+
+};
+
+
 // implement hash() function for tuples so they can be used as keys in unordered_maps
 namespace std {
 

diff --git a/source/filter_blacklisted_ranges.o b/source/filter_blacklisted_ranges.o
diff --git a/source/filter_both_intronic.o b/source/filter_both_intronic.o
diff --git a/source/filter_duplicates.cpp b/source/filter_duplicates.cpp
@@ -1,3 +1,4 @@
+#include <iostream>
 #include <unordered_map>
 #include <tuple>
 #include "common.hpp"
@@ -7,7 +8,7 @@ using namespace std;
 
 unsigned int filter_duplicates(chimeric_alignments_t& chimeric_alignments, const bool external_duplicate_marking) {
 	unsigned int remaining = 0;
-	unordered_map< tuple<contig_t,contig_t,position_t,position_t> , unsigned int> duplicate_count;
+	unordered_map< tuple<contig_t,contig_t,contig_t,position_t,position_t,position_t> , unsigned int> duplicate_count;
 	for (chimeric_alignments_t::iterator chimeric_alignment = chimeric_alignments.begin(); chimeric_alignment != chimeric_alignments.end(); ++chimeric_alignment) {
 		if (chimeric_alignment->second.filter != FILTER_none)
 			continue; // read has already been filtered
@@ -28,14 +29,25 @@ unsigned int filter_duplicates(chimeric_alignments_t& chimeric_alignments, const
 				chimeric_alignment->second[MATE1].start - chimeric_alignment->second[MATE1].preclipping() :
 				chimeric_alignment->second[MATE1].end   + chimeric_alignment->second[MATE1].postclipping()
 			);
-			unsigned int mate2 = (chimeric_alignment->second.size() == 2) ? MATE2 : SUPPLEMENTARY;
+			//unsigned int mate2 = (chimeric_alignment->second.size() == 2) ? MATE2 : SUPPLEMENTARY;
 			position_t position2 = static_cast<position_t>(
-				(chimeric_alignment->second[mate2].strand == FORWARD) ?
-				chimeric_alignment->second[mate2].start - chimeric_alignment->second[mate2].preclipping() :
-				chimeric_alignment->second[mate2].end   + chimeric_alignment->second[mate2].postclipping()
+				(chimeric_alignment->second[MATE2].strand == FORWARD) ?
+				chimeric_alignment->second[MATE2].start - chimeric_alignment->second[MATE2].preclipping() :
+				chimeric_alignment->second[MATE2].end   + chimeric_alignment->second[MATE2].postclipping()
 			);
 			contig_t contig1 = chimeric_alignment->second[MATE1].contig;
-			contig_t contig2 = chimeric_alignment->second[mate2].contig;
+			contig_t contig2 = chimeric_alignment->second[MATE2].contig;
+			contig_t contig3 = static_cast<position_t>(0);
+			position_t position3 = static_cast<position_t>(0);
+
+			if(chimeric_alignment->second.size() > 2){
+				position3 = static_cast<position_t>(
+					(chimeric_alignment->second[SUPPLEMENTARY].strand == FORWARD) ?
+					chimeric_alignment->second[SUPPLEMENTARY].start - chimeric_alignment->second[SUPPLEMENTARY].preclipping() :
+					chimeric_alignment->second[SUPPLEMENTARY].end   + chimeric_alignment->second[SUPPLEMENTARY].postclipping()
+				);
+				contig3 = chimeric_alignment->second[SUPPLEMENTARY].contig;
+			}
 
 			// always put the mate with the lower coordinate in first position
 			// or else we might not recognize the duplicate
@@ -44,7 +56,8 @@ unsigned int filter_duplicates(chimeric_alignments_t& chimeric_alignments, const
 				swap(contig1, contig2);
 			}
 
-			if (duplicate_count[make_tuple(contig1, contig2, position1, position2)]++ > 0)
+
+			if (duplicate_count[make_tuple(contig1, contig2, contig3, position1, position2, position3)]++ > 0)
 				chimeric_alignment->second.filter = FILTER_duplicates;
 			else
 				++remaining;

diff --git a/source/filter_duplicates.o b/source/filter_duplicates.o
diff --git a/source/filter_end_to_end.o b/source/filter_end_to_end.o
diff --git a/source/filter_genomic_support.o b/source/filter_genomic_support.o
diff --git a/source/filter_hairpin.o b/source/filter_hairpin.o
diff --git a/source/filter_homologs.cpp b/source/filter_homologs.cpp
@@ -62,7 +62,7 @@ bool is_homolog(const gene_t gene1, const gene_t gene2, const kmer_indices_t& km
 	return false;
 }
 
-unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indices, const char kmer_length, const assembly_t& assembly, const float max_identity_fraction) {
+unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indices, const char kmer_length, const assembly_t& assembly, const float max_identity_fraction,disjoint_set& homolog_union) {
 
 	// select non-discarded fusions for better speed,
 	// we need to iterate over them many times
@@ -118,14 +118,18 @@ unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indi
 				if (is_homolog(homolog1, homolog2, kmer_indices, kmer_length, assembly, max_identity_fraction)) {
 
 					// other event must have poorer alignments or fewer reads or a worse e-value for us to consider its supporting reads to be mismappers
-					if (anchor1 > anchor2 ||
-					    anchor1 == anchor2 && (**fusion).supporting_reads() > (**other_fusion).supporting_reads() ||
-					    anchor1 == anchor2 && (**fusion).supporting_reads() == (**other_fusion).supporting_reads() && (**fusion).evalue <= (**other_fusion).evalue) {
-						(**other_fusion).filter = FILTER_homologs;
-					} else {
-						(**fusion).filter = FILTER_homologs;
-						break;
-					}
+					// if (anchor1 > anchor2 ||
+					//     anchor1 == anchor2 && (**fusion).supporting_reads() > (**other_fusion).supporting_reads() ||
+					//     anchor1 == anchor2 && (**fusion).supporting_reads() == (**other_fusion).supporting_reads() && (**fusion).evalue <= (**other_fusion).evalue) {
+					// 	(**other_fusion).filter = FILTER_homologs;
+					// } else {
+					// 	(**fusion).filter = FILTER_homologs;
+					// 	break;
+					// }
+					// the above commented code is the original version in Arriba, which would filter out the homologous fusions. Here I use a disjoint_set to keep them recorded and reported.
+					homolog_union.make_set(*other_fusion);
+					homolog_union.make_set(*fusion);
+					homolog_union.union_set(*other_fusion,*fusion);
 				}
 			}
 		}

diff --git a/source/filter_homologs.hpp b/source/filter_homologs.hpp
@@ -7,6 +7,6 @@
 
 using namespace std;
 
-unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indices, const char kmer_length, const assembly_t& assembly, const float max_identity_fraction);
+unsigned int filter_homologs(fusions_t& fusions, const kmer_indices_t& kmer_indices, const char kmer_length, const assembly_t& assembly, const float max_identity_fraction,disjoint_set& homolog_union);
 
 #endif /* FILTER_HOMOLOGS_H */
diff --git a/source/filter_homologs.o b/source/filter_homologs.o
diff --git a/source/filter_homopolymer.o b/source/filter_homopolymer.o
diff --git a/source/filter_in_vitro.o b/source/filter_in_vitro.o
diff --git a/source/filter_inconsistently_clipped.o b/source/filter_inconsistently_clipped.o
diff --git a/source/filter_intragenic_both_exonic.o b/source/filter_intragenic_both_exonic.o
diff --git a/source/filter_long_gap.o b/source/filter_long_gap.o
diff --git a/source/filter_low_coverage_viral_contigs.o b/source/filter_low_coverage_viral_contigs.o
diff --git a/source/filter_low_entropy.o b/source/filter_low_entropy.o
diff --git a/source/filter_marginal_read_through.o b/source/filter_marginal_read_through.o
diff --git a/source/filter_min_support.o b/source/filter_min_support.o
diff --git a/source/filter_mismappers.o b/source/filter_mismappers.o
diff --git a/source/filter_mismatches.o b/source/filter_mismatches.o