From 96506b396cb63c990a32b74eea3b58451db88a79 Mon Sep 17 00:00:00 2001 From: Felix Schlesinger Date: Mon, 16 Jan 2017 11:03:57 -0800 Subject: [PATCH 1/8] Manta-445: Select contigs based on read-count --- .../SVCandidateAssemblyRefiner.cpp | 79 +++++++++---------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp index 7b657e32..46a2a7a0 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp @@ -1629,19 +1629,52 @@ getJumpAssembly( log_os << __FUNCTION__ << "\tbp2seq_fwd: " << bp2Seq << "\n"; #endif +#ifdef DEBUG_REFINER + log_os << __FUNCTION__ << ": Checking contig aln: " << contigIndex << "\n"; +#endif // QC the alignment to make sure it spans the two breakend locations: static const unsigned minAlignRefSpan(20); const bool isAlignment1Good(alignment.align1.isAligned() && (apath_ref_length(alignment.align1.apath) >= minAlignRefSpan)); const bool isAlignment2Good(alignment.align2.isAligned() && (apath_ref_length(alignment.align2.apath) >= minAlignRefSpan)); - const bool isAlignmentGood(isAlignment1Good && isAlignment2Good); + bool isAlignmentGood(isAlignment1Good && isAlignment2Good); #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": Checking contig aln: " << contigIndex << "\n"; + log_os << __FUNCTION__ << ": contig alignment initial okay: " << contigIndex << "\n"; #endif + { + // check the min size and fraction of optimal alignment score + // for each of the two sub-alignments on each side of the bp + // + // note this is done for multiple values -- the lower value is + // motivated by cases where a second breakpoint exists near to + // the target breakpoint -- the higher value is motivated by + // cases with some alignment 'messiness' near the breakpoint + // that stabilizes as we move farther away + // + /// TODO change iterative refspan to a single consistent alignment criteria + bool isFilterAlign1(true); + bool isFilterAlign2(true); + static const unsigned spanSet[] = { 75, 100, 200 }; + for (const unsigned maxQCRefSpan : spanSet) + { + const unsigned qcSpan1 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align1.apath) : 0); + if (!isFilterSpanningAlignment(qcSpan1, _spanningAligner, true, isRNA, alignment.align1.apath)) + { + isFilterAlign1 = false; + } + const unsigned qcSpan2 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align2.apath) : 0); + if (!isFilterSpanningAlignment(qcSpan2, _spanningAligner, false, isRNA, alignment.align2.apath)) + { + isFilterAlign2 = false; + } + } + if (isFilterAlign1 || isFilterAlign2) isAlignmentGood = false; + } if (! isAlignmentGood) continue; #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": contig okay: " << contigIndex << "\n"; + log_os << __FUNCTION__ << ": contig alignment okay: " << contigIndex << "\n"; #endif - if ((! isHighScore) || (alignment.score > assemblyData.spanningAlignments[highScoreIndex].score)) + //if ((! isHighScore) || (alignment.score > assemblyData.spanningAlignments[highScoreIndex].score)) + if ((! isHighScore) || (contig.supportReads > assemblyData.contigs[highScoreIndex].supportReads)) { isHighScore = true; highScoreIndex=contigIndex; @@ -1653,44 +1686,6 @@ getJumpAssembly( log_os << __FUNCTION__ << ": high scoring contig: " << highScoreIndex << "\n"; #endif - // set any additional QC steps before deciding an alignment is - // usable: - - { - // check the min size and fraction of optimal alignment score - // for each of the two sub-alignments on each side of the bp - // - // note this is done for multiple values -- the lower value is - // motivated by cases where a second breakpoint exists near to - // the target breakpoint -- the higher value is motivated by - // cases with some alignment 'messiness' near the breakpoint - // that stabilizes as we move farther away - // - /// TODO change iterative refspan to a single consistent alignment criteria - /// TODO should this be moved into the candidate selection loop? - // - const SVCandidateAssemblyData::JumpAlignmentResultType& hsAlign(assemblyData.spanningAlignments[highScoreIndex]); - - bool isFilterAlign1(true); - bool isFilterAlign2(true); - static const unsigned spanSet[] = {75, 100, 200}; - for (const unsigned maxQCRefSpan : spanSet) - { - const unsigned qcSpan1 = maxQCRefSpan + (isRNA ? apath_spliced_length(hsAlign.align1.apath) : 0); - if (! isFilterSpanningAlignment(qcSpan1, _spanningAligner, true, isRNA, hsAlign.align1.apath)) - { - isFilterAlign1 = false; - } - const unsigned qcSpan2 = maxQCRefSpan + (isRNA ? apath_spliced_length(hsAlign.align2.apath) : 0); - if (! isFilterSpanningAlignment(qcSpan2, _spanningAligner, false, isRNA, hsAlign.align2.apath)) - { - isFilterAlign2 = false; - } - } - if (isFilterAlign1 || isFilterAlign2) return; - } - - // ok, passed QC -- mark the high-scoring alignment as usable for // hypothesis refinement: { From c49c1b12e1a59cf685fb93548a967a1d887217dd Mon Sep 17 00:00:00 2001 From: Felix Schlesinger Date: Tue, 24 Jan 2017 10:26:44 -0800 Subject: [PATCH 2/8] Manta-445: Pick contig from highest align-score ones --- .../SVCandidateAssemblyRefiner.cpp | 123 ++++++++++-------- src/c++/lib/assembly/AssembledContig.cpp | 1 + 2 files changed, 70 insertions(+), 54 deletions(-) diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp index 46a2a7a0..8c0aec2f 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp @@ -1519,10 +1519,8 @@ getJumpAssembly( // it's empty: assemblyData.spanningAlignments.resize(contigCount); - bool isHighScore(false); - unsigned highScoreIndex(0); - - for (unsigned contigIndex(0); contigIndex goodContigIndicies; + for (unsigned contigIndex(0); contigIndex < contigCount; ++contigIndex) { const AssembledContig& contig(assemblyData.contigs[contigIndex]); @@ -1619,79 +1617,96 @@ getJumpAssembly( #ifdef DEBUG_REFINER log_os << __FUNCTION__ << ": contigIndex: " << contigIndex << " alignment: " << alignment; - - std::string bp1Seq,bp2Seq,insertSeq; - getFwdStrandQuerySegments(alignment, contig.seq, - bporient.isBp2AlignedFirst, bporient.isBp1Reversed, bporient.isBp2Reversed, - bp1Seq, bp2Seq, insertSeq); - log_os << __FUNCTION__ << "\tbp1seq_fwd: " << bp1Seq << "\n"; - log_os << __FUNCTION__ << "\tinsseq_fwd: " << insertSeq << "\n"; - log_os << __FUNCTION__ << "\tbp2seq_fwd: " << bp2Seq << "\n"; + if (alignment.align1.isAligned()) + { + std::string bp1Seq, bp2Seq, insertSeq; + getFwdStrandQuerySegments(alignment, contig.seq, + bporient.isBp2AlignedFirst, bporient.isBp1Reversed, bporient.isBp2Reversed, + bp1Seq, bp2Seq, insertSeq); + log_os << __FUNCTION__ << "\tbp1seq_fwd: " << bp1Seq << "\n"; + log_os << __FUNCTION__ << "\tinsseq_fwd: " << insertSeq << "\n"; + log_os << __FUNCTION__ << "\tbp2seq_fwd: " << bp2Seq << "\n"; + } #endif #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": Checking contig aln: " << contigIndex << "\n"; + log_os << __FUNCTION__ << ": Checking contig aln: " << contigIndex << "\n"; #endif // QC the alignment to make sure it spans the two breakend locations: static const unsigned minAlignRefSpan(20); const bool isAlignment1Good(alignment.align1.isAligned() && (apath_ref_length(alignment.align1.apath) >= minAlignRefSpan)); const bool isAlignment2Good(alignment.align2.isAligned() && (apath_ref_length(alignment.align2.apath) >= minAlignRefSpan)); bool isAlignmentGood(isAlignment1Good && isAlignment2Good); + if (!isAlignmentGood) continue; #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": contig alignment initial okay: " << contigIndex << "\n"; + log_os << __FUNCTION__ << ": contig alignment initial okay: " << contigIndex << "\n"; #endif - { - // check the min size and fraction of optimal alignment score - // for each of the two sub-alignments on each side of the bp - // - // note this is done for multiple values -- the lower value is - // motivated by cases where a second breakpoint exists near to - // the target breakpoint -- the higher value is motivated by - // cases with some alignment 'messiness' near the breakpoint - // that stabilizes as we move farther away - // - /// TODO change iterative refspan to a single consistent alignment criteria - bool isFilterAlign1(true); - bool isFilterAlign2(true); - static const unsigned spanSet[] = { 75, 100, 200 }; - for (const unsigned maxQCRefSpan : spanSet) - { - const unsigned qcSpan1 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align1.apath) : 0); - if (!isFilterSpanningAlignment(qcSpan1, _spanningAligner, true, isRNA, alignment.align1.apath)) - { - isFilterAlign1 = false; - } - const unsigned qcSpan2 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align2.apath) : 0); - if (!isFilterSpanningAlignment(qcSpan2, _spanningAligner, false, isRNA, alignment.align2.apath)) - { - isFilterAlign2 = false; - } - } - if (isFilterAlign1 || isFilterAlign2) isAlignmentGood = false; - } - if (! isAlignmentGood) continue; + { + // check the min size and fraction of optimal alignment score + // for each of the two sub-alignments on each side of the bp + // + // note this is done for multiple values -- the lower value is + // motivated by cases where a second breakpoint exists near to + // the target breakpoint -- the higher value is motivated by + // cases with some alignment 'messiness' near the breakpoint + // that stabilizes as we move farther away + // + /// TODO change iterative refspan to a single consistent alignment criteria + bool isFilterAlign1(true); + bool isFilterAlign2(true); + static const unsigned spanSet[] = { 75, 100, 200 }; + for (const unsigned maxQCRefSpan : spanSet) + { + const unsigned qcSpan1 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align1.apath) : 0); + if (!isFilterSpanningAlignment(qcSpan1, _spanningAligner, true, isRNA, alignment.align1.apath)) + { + isFilterAlign1 = false; + } + const unsigned qcSpan2 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align2.apath) : 0); + if (!isFilterSpanningAlignment(qcSpan2, _spanningAligner, false, isRNA, alignment.align2.apath)) + { + isFilterAlign2 = false; + } + } + if (isFilterAlign1 || isFilterAlign2) isAlignmentGood = false; + } + if (isAlignmentGood) + { + goodContigIndicies.push_back(contigIndex); #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": contig alignment okay: " << contigIndex << "\n"; + log_os << __FUNCTION__ << ": contig alignment okay: " << contigIndex << "\n"; #endif - //if ((! isHighScore) || (alignment.score > assemblyData.spanningAlignments[highScoreIndex].score)) - if ((! isHighScore) || (contig.supportReads > assemblyData.contigs[highScoreIndex].supportReads)) + } + } + if (goodContigIndicies.empty()) return; + unsigned maxAlignContigIndex = goodContigIndicies.front(); + for (unsigned index : goodContigIndicies) + { + if (assemblyData.spanningAlignments[index].score > assemblyData.spanningAlignments[maxAlignContigIndex].score) + { + maxAlignContigIndex = index; + } + } + unsigned selectedContigIndex(maxAlignContigIndex); + for (unsigned index : goodContigIndicies) + { + if ((assemblyData.spanningAlignments[index].score * 2 > assemblyData.spanningAlignments[maxAlignContigIndex].score) && + (assemblyData.contigs[index].supportReads.size() > assemblyData.contigs[selectedContigIndex].supportReads.size())) { - isHighScore = true; - highScoreIndex=contigIndex; + selectedContigIndex = index; } } - if (! isHighScore) return; #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": high scoring contig: " << highScoreIndex << "\n"; + log_os << __FUNCTION__ << ": high scoring contig: " << selectedContigIndex << "\n"; #endif // ok, passed QC -- mark the high-scoring alignment as usable for // hypothesis refinement: { - assemblyData.bestAlignmentIndex = highScoreIndex; + assemblyData.bestAlignmentIndex = selectedContigIndex; #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": highscoreid: " << highScoreIndex << " alignment: " << assemblyData.spanningAlignments[highScoreIndex]; + log_os << __FUNCTION__ << ": highscoreid: " << selectedContigIndex << " alignment: " << assemblyData.spanningAlignments[selectedContigIndex]; #endif // process the alignment into information that's easily usable @@ -1702,7 +1717,7 @@ getJumpAssembly( assemblyData.svs.push_back(sv); SVCandidate& newSV(assemblyData.svs.back()); - generateRefinedVCFSVCandidateFromJumpAlignment(bporient, assemblyData, highScoreIndex, newSV); + generateRefinedVCFSVCandidateFromJumpAlignment(bporient, assemblyData, selectedContigIndex, newSV); #ifdef DEBUG_REFINER log_os << __FUNCTION__ << ": highscore refined sv: " << newSV; diff --git a/src/c++/lib/assembly/AssembledContig.cpp b/src/c++/lib/assembly/AssembledContig.cpp index d3ee30b2..f3f6077c 100644 --- a/src/c++/lib/assembly/AssembledContig.cpp +++ b/src/c++/lib/assembly/AssembledContig.cpp @@ -33,6 +33,7 @@ operator<<(std::ostream& os, const AssembledContig& contig) { os << "CONTIG size: " << contig.seq.size() << " seedCount: " << contig.seedReadCount + << " supportReads: " << contig.supportReads.size() << " seq:\n"; printSeq(contig.seq,os); os << "\n"; From 2c169e5e3f82a8baced8b24554aa94ca6811df14 Mon Sep 17 00:00:00 2001 From: Felix Schlesinger Date: Tue, 24 Jan 2017 10:26:58 -0800 Subject: [PATCH 3/8] Fixups from code review --- .../SVCandidateAssemblyRefiner.cpp | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp index 8c0aec2f..ac849a8e 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp @@ -1538,12 +1538,12 @@ getJumpAssembly( static const int nSpacer(25); std::vector exclBlocks1; const std::string cutRef1 = kmerMaskReference(align1RefStrPtr->begin() + align1LeadingCut, - align1RefStrPtr->end() - align1TrailingCut, - contig.seq, nSpacer, exclBlocks1); + align1RefStrPtr->end() - align1TrailingCut, + contig.seq, nSpacer, exclBlocks1); std::vector exclBlocks2; const std::string cutRef2 = kmerMaskReference(align2RefStrPtr->begin() + align2LeadingCut, - align2RefStrPtr->end() - align2TrailingCut, - contig.seq, nSpacer, exclBlocks2); + align2RefStrPtr->end() - align2TrailingCut, + contig.seq, nSpacer, exclBlocks2); #ifdef DEBUG_REFINER log_os << __FUNCTION__ << " Kmer-masked references\n"; log_os << "\t ref Lengths " << align1RefStrPtr->size() << " " << align2RefStrPtr->size() << "\n"; @@ -1572,16 +1572,16 @@ getJumpAssembly( log_os << __FUNCTION__ << " isTranscriptStrandKnown: " << bporient.isTranscriptStrandKnown << "; bp1Fw: " << bp1Fw << " ; bp2Fw: " << bp2Fw << '\n'; #endif _RNASpanningAligner.align(contig.seq.begin(), contig.seq.end(), - cutRef1.begin(), cutRef1.end(), cutRef2.begin(), cutRef2.end(), - bp1Fw, bp2Fw, bporient.isTranscriptStrandKnown, - alignment); + cutRef1.begin(), cutRef1.end(), cutRef2.begin(), cutRef2.end(), + bp1Fw, bp2Fw, bporient.isTranscriptStrandKnown, + alignment); #ifdef DEBUG_REFINER log_os << __FUNCTION__ << " Masked 1: " << alignment.align1 << '\n'; log_os << __FUNCTION__ << " Masked 2: " << alignment.align2 << '\n'; #endif if (!(translateMaskedAlignment(alignment.align1, exclBlocks1) && - translateMaskedAlignment(alignment.align2, exclBlocks2))) + translateMaskedAlignment(alignment.align2, exclBlocks2))) { #ifdef DEBUG_REFINER log_os << __FUNCTION__ << " Failed to fix kmer-masked alignment\n"; @@ -1598,14 +1598,14 @@ getJumpAssembly( { #ifdef DEBUG_REFINER log_os << __FUNCTION__ << " Ref1 for alignment: " - << bp1refSeq.substr(align1LeadingCut, bp1refSeq.size()-align1LeadingCut-align1TrailingCut) << '\n'; + << bp1refSeq.substr(align1LeadingCut, bp1refSeq.size() - align1LeadingCut - align1TrailingCut) << '\n'; log_os << __FUNCTION__ << " Ref2 for alignment: " - << bp2refSeq.substr(align2LeadingCut, bp2refSeq.size()-align2LeadingCut-align2TrailingCut) << '\n'; + << bp2refSeq.substr(align2LeadingCut, bp2refSeq.size() - align2LeadingCut - align2TrailingCut) << '\n'; #endif _spanningAligner.align(contig.seq.begin(), contig.seq.end(), - align1RefStrPtr->begin() + align1LeadingCut, align1RefStrPtr->end() - align1TrailingCut, - align2RefStrPtr->begin() + align2LeadingCut, align2RefStrPtr->end() - align2TrailingCut, - alignment); + align1RefStrPtr->begin() + align1LeadingCut, align1RefStrPtr->end() - align1TrailingCut, + align2RefStrPtr->begin() + align2LeadingCut, align2RefStrPtr->end() - align2TrailingCut, + alignment); } alignment.align1.beginPos += align1LeadingCut; @@ -1679,6 +1679,7 @@ getJumpAssembly( } } if (goodContigIndicies.empty()) return; + // Find the contig with the highest alignment score unsigned maxAlignContigIndex = goodContigIndicies.front(); for (unsigned index : goodContigIndicies) { @@ -1688,17 +1689,19 @@ getJumpAssembly( } } unsigned selectedContigIndex(maxAlignContigIndex); + // Pick the contig with the most supporting reads that has an alignment score at least half as high as the highest-scoring contig for (unsigned index : goodContigIndicies) { - if ((assemblyData.spanningAlignments[index].score * 2 > assemblyData.spanningAlignments[maxAlignContigIndex].score) && - (assemblyData.contigs[index].supportReads.size() > assemblyData.contigs[selectedContigIndex].supportReads.size())) + const bool sufficientScore(assemblyData.spanningAlignments[index].score * 2 > assemblyData.spanningAlignments[maxAlignContigIndex].score); + const bool moreReads(assemblyData.contigs[index].supportReads.size() > assemblyData.contigs[selectedContigIndex].supportReads.size()); + if (sufficientScore && moreReads) { selectedContigIndex = index; } } #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": high scoring contig: " << selectedContigIndex << "\n"; + log_os << __FUNCTION__ << ": selected contig: " << selectedContigIndex << "\n"; #endif // ok, passed QC -- mark the high-scoring alignment as usable for From 7b8399ee4f42179671aed480436c5827670118a9 Mon Sep 17 00:00:00 2001 From: Felix Schlesinger Date: Fri, 12 May 2017 10:30:00 -0700 Subject: [PATCH 4/8] Update ChangeLog --- ChangeLog.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ChangeLog.txt b/ChangeLog.txt index c79e3a1b..616fedff 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -1,5 +1,6 @@ - MANTA-1211/[#90] Fix issue with empty regions input to EstimateSVLoci - MANTA-1043 Add more read statistics when 'too few read pair observations' exception +- MANTA-445 RNA: Select contigs for refinement based on alignment score and read count - MANTA-696 Improve the iterative assembler and set it to the default assembler - MANTA-1175 Fix a bug of duplicate RG & PG in evidence bams - MANTA-886 Verify extension of alignment input file @@ -9,6 +10,7 @@ - MANTA-1116 Sort duplicate vcf entries together to allow merging - MANTA-443 Improve the check of running into adapter for data with small insert size - MANTA-1038 Fix infrequent graph edge lookup failure + v1.1.1 - MANTA-691 Add an option to specify call regions in a bed file - MANTA-977/[#83] Improve SA tag error reporting From 787a3a4dff5d0e79c75a032d82e32cc977f68aca Mon Sep 17 00:00:00 2001 From: Felix Schlesinger Date: Wed, 30 Aug 2017 08:34:40 -0700 Subject: [PATCH 5/8] Only use reads counts for RNA contig selection --- .../SVCandidateAssemblyRefiner.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp index ac849a8e..5098241b 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp @@ -1688,18 +1688,21 @@ getJumpAssembly( maxAlignContigIndex = index; } } + unsigned selectedContigIndex(maxAlignContigIndex); - // Pick the contig with the most supporting reads that has an alignment score at least half as high as the highest-scoring contig - for (unsigned index : goodContigIndicies) + if (isRNA) { - const bool sufficientScore(assemblyData.spanningAlignments[index].score * 2 > assemblyData.spanningAlignments[maxAlignContigIndex].score); - const bool moreReads(assemblyData.contigs[index].supportReads.size() > assemblyData.contigs[selectedContigIndex].supportReads.size()); - if (sufficientScore && moreReads) + // Pick the contig with the most supporting reads that has an alignment score at least half as high as the highest-scoring contig + for (unsigned index : goodContigIndicies) { - selectedContigIndex = index; + const bool sufficientScore(assemblyData.spanningAlignments[index].score * 2 > assemblyData.spanningAlignments[maxAlignContigIndex].score); + const bool moreReads(assemblyData.contigs[index].supportReads.size() > assemblyData.contigs[selectedContigIndex].supportReads.size()); + if (sufficientScore && moreReads) + { + selectedContigIndex = index; + } } } - #ifdef DEBUG_REFINER log_os << __FUNCTION__ << ": selected contig: " << selectedContigIndex << "\n"; #endif From 4cc2da4ddc8921f2239d212f57d6ce8a0ba6df66 Mon Sep 17 00:00:00 2001 From: Felix Schlesinger Date: Wed, 30 Aug 2017 15:25:02 -0700 Subject: [PATCH 6/8] Restore old checkSubAlignment order for DNA --- .../SVCandidateAssemblyRefiner.cpp | 74 ++++++++++++------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp index 5098241b..5d3c6452 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp @@ -1311,7 +1311,42 @@ generateRefinedVCFSVCandidateFromJumpAlignment( addCigarToSpanningAlignment(sv); } - +// check the min size and fraction of optimal alignment score +// for each of the two sub-alignments on each side of the bp +// +// note this is done for multiple values -- the lower value is +// motivated by cases where a second breakpoint exists near to +// the target breakpoint -- the higher value is motivated by +// cases with some alignment 'messiness' near the breakpoint +// that stabilizes as we move farther away +// +/// TODO change iterative refspan to a single consistent alignment criteria +static +bool +checkFilterSubAlignments( + const JumpAlignmentResult& alignment, + const GlobalJumpAligner& spanningAligner, + const bool isRNA) +{ + using namespace ALIGNPATH; + bool isFilterAlign1(true); + bool isFilterAlign2(true); + static const unsigned spanSet[] = { 75, 100, 200 }; + for (const unsigned maxQCRefSpan : spanSet) + { + const unsigned qcSpan1 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align1.apath) : 0); + if (!isFilterSpanningAlignment(qcSpan1, spanningAligner, true, isRNA, alignment.align1.apath)) + { + isFilterAlign1 = false; + } + const unsigned qcSpan2 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align2.apath) : 0); + if (!isFilterSpanningAlignment(qcSpan2, spanningAligner, false, isRNA, alignment.align2.apath)) + { + isFilterAlign2 = false; + } + } + return (isFilterAlign1 || isFilterAlign2); +} void SVCandidateAssemblyRefiner:: @@ -1641,34 +1676,10 @@ getJumpAssembly( #ifdef DEBUG_REFINER log_os << __FUNCTION__ << ": contig alignment initial okay: " << contigIndex << "\n"; #endif + if (isRNA && checkFilterSubAlignments(alignment, _spanningAligner, isRNA)) { - // check the min size and fraction of optimal alignment score - // for each of the two sub-alignments on each side of the bp - // - // note this is done for multiple values -- the lower value is - // motivated by cases where a second breakpoint exists near to - // the target breakpoint -- the higher value is motivated by - // cases with some alignment 'messiness' near the breakpoint - // that stabilizes as we move farther away - // - /// TODO change iterative refspan to a single consistent alignment criteria - bool isFilterAlign1(true); - bool isFilterAlign2(true); - static const unsigned spanSet[] = { 75, 100, 200 }; - for (const unsigned maxQCRefSpan : spanSet) - { - const unsigned qcSpan1 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align1.apath) : 0); - if (!isFilterSpanningAlignment(qcSpan1, _spanningAligner, true, isRNA, alignment.align1.apath)) - { - isFilterAlign1 = false; - } - const unsigned qcSpan2 = maxQCRefSpan + (isRNA ? apath_spliced_length(alignment.align2.apath) : 0); - if (!isFilterSpanningAlignment(qcSpan2, _spanningAligner, false, isRNA, alignment.align2.apath)) - { - isFilterAlign2 = false; - } - } - if (isFilterAlign1 || isFilterAlign2) isAlignmentGood = false; + // For RNA pre-filter all contigs to only consider those passing the SubAlignment filter + isAlignmentGood = false; } if (isAlignmentGood) { @@ -1706,6 +1717,13 @@ getJumpAssembly( #ifdef DEBUG_REFINER log_os << __FUNCTION__ << ": selected contig: " << selectedContigIndex << "\n"; #endif + if (checkFilterSubAlignments(assemblyData.spanningAlignments[selectedContigIndex], _spanningAligner, isRNA)) + { + // Check the selected contig with the SubAlignments filter. + // Only relevant for DNA, since the RNA contigs were pre-filtered with this filter before selection + // TODO (Maybe) switch DNA to the RNA logic? + return; + } // ok, passed QC -- mark the high-scoring alignment as usable for // hypothesis refinement: From e556f8e309f159ba77061e308edc71944cb16730 Mon Sep 17 00:00:00 2001 From: Felix Schlesinger Date: Tue, 5 Sep 2017 14:34:15 -0700 Subject: [PATCH 7/8] Better separate DNA & RNA contig selection. Plus code review cleanups --- ChangeLog.txt | 2 +- .../SVCandidateAssemblyRefiner.cpp | 203 +++++++++++------- 2 files changed, 126 insertions(+), 79 deletions(-) diff --git a/ChangeLog.txt b/ChangeLog.txt index 616fedff..0a3617e9 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -1,6 +1,6 @@ +- MANTA-445 RNA: Select contigs for refinement based on alignment score and read count - MANTA-1211/[#90] Fix issue with empty regions input to EstimateSVLoci - MANTA-1043 Add more read statistics when 'too few read pair observations' exception -- MANTA-445 RNA: Select contigs for refinement based on alignment score and read count - MANTA-696 Improve the iterative assembler and set it to the default assembler - MANTA-1175 Fix a bug of duplicate RG & PG in evidence bams - MANTA-886 Verify extension of alignment input file diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp index 5d3c6452..992c7c1b 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp @@ -1260,10 +1260,9 @@ void generateRefinedSVCandidateFromJumpAlignment( const BPOrientation& bporient, const SVCandidateAssemblyData& assemblyData, - const unsigned contigIndex, SVCandidate& sv) { - const SVCandidateAssemblyData::JumpAlignmentResultType& align(assemblyData.spanningAlignments[contigIndex]); + const SVCandidateAssemblyData::JumpAlignmentResultType& align(assemblyData.spanningAlignments[assemblyData.bestAlignmentIndex]); // first get each alignment associated with the correct breakend: const Alignment* bp1AlignPtr(&align.align1); @@ -1273,7 +1272,7 @@ generateRefinedSVCandidateFromJumpAlignment( // summarize usable output information in a second SVBreakend // object -- this is the 'refined' sv: - sv.assemblyAlignIndex = contigIndex; + sv.assemblyAlignIndex = assemblyData.bestAlignmentIndex; sv.assemblySegmentIndex = 0; sv.setPrecise(); @@ -1292,13 +1291,12 @@ void generateRefinedVCFSVCandidateFromJumpAlignment( const BPOrientation& bporient, const SVCandidateAssemblyData& assemblyData, - const unsigned contigIndex, SVCandidate& sv) { - generateRefinedSVCandidateFromJumpAlignment(bporient, assemblyData, contigIndex, sv); + generateRefinedSVCandidateFromJumpAlignment(bporient, assemblyData, sv); - const AssembledContig& contig(assemblyData.contigs[contigIndex]); - const SVCandidateAssemblyData::JumpAlignmentResultType& align(assemblyData.spanningAlignments[contigIndex]); + const AssembledContig& contig(assemblyData.contigs[assemblyData.bestAlignmentIndex]); + const SVCandidateAssemblyData::JumpAlignmentResultType& align(assemblyData.spanningAlignments[assemblyData.bestAlignmentIndex]); // fill in insertSeq: sv.insertSeq.clear(); @@ -1311,6 +1309,19 @@ generateRefinedVCFSVCandidateFromJumpAlignment( addCigarToSpanningAlignment(sv); } +// QC the alignment to make sure it spans the two breakend locations: +static +bool +basicContigAlignmentCheck( + const JumpAlignmentResult& alignment +) +{ + static const unsigned minAlignRefSpan(20); + const bool isAlignment1Good(alignment.align1.isAligned() && (apath_ref_length(alignment.align1.apath) >= minAlignRefSpan)); + const bool isAlignment2Good(alignment.align2.isAligned() && (apath_ref_length(alignment.align2.apath) >= minAlignRefSpan)); + return (isAlignment1Good && isAlignment2Good); +} + // check the min size and fraction of optimal alignment score // for each of the two sub-alignments on each side of the bp // @@ -1348,6 +1359,99 @@ checkFilterSubAlignments( return (isFilterAlign1 || isFilterAlign2); } +// Filter fusion contigs and select the 'best' one, based on alignment score and supporting read count +static +bool +selectContigRNA( + SVCandidateAssemblyData& assemblyData, + const GlobalJumpAligner& spanningAligner +) +{ + std::vector goodContigIndicies; + for (unsigned contigIndex = 0; contigIndex < assemblyData.contigs.size(); contigIndex++) + { + const JumpAlignmentResult& alignment(assemblyData.spanningAlignments[contigIndex]); +#ifdef DEBUG_REFINER + log_os << __FUNCTION__ << ": Checking contig alignment: " << contigIndex << "\n"; +#endif + if (!basicContigAlignmentCheck(alignment)) continue; +#ifdef DEBUG_REFINER + log_os << __FUNCTION__ << ": contig alignment initial okay: " << contigIndex << "\n"; +#endif + if (checkFilterSubAlignments(alignment, spanningAligner, true)) continue; +#ifdef DEBUG_REFINER + if (isRNA) log_os << __FUNCTION__ << ": contig alignment okay: " << contigIndex << "\n"; +#endif + goodContigIndicies.push_back(contigIndex); + } + if (goodContigIndicies.empty()) return false; + // Find the highest alignment score + int maxAlnScore = 0; + unsigned selectedContigIndex(goodContigIndicies.front()); + for (unsigned index : goodContigIndicies) + { + if (assemblyData.spanningAlignments[index].score > maxAlnScore) + { + maxAlnScore = assemblyData.spanningAlignments[index].score; + selectedContigIndex = index; + } + } + // Pick the contig with the most supporting reads that has an alignment score at least half as high as the highest-scoring contig + for (unsigned index : goodContigIndicies) + { + const bool sufficientScore(assemblyData.spanningAlignments[index].score * 2 > maxAlnScore); + const bool moreReads(assemblyData.contigs[index].supportReads.size() > assemblyData.contigs[selectedContigIndex].supportReads.size()); + if (sufficientScore && moreReads) selectedContigIndex = index; + } +#ifdef DEBUG_REFINER + log_os << __FUNCTION__ << ": selected contig: " << selectedContigIndex << "\n"; +#endif + assemblyData.bestAlignmentIndex = selectedContigIndex; + return true; +} + +// Filter breakpoint contigs, select the 'best' one based on alignment score and check alignment on selected contig +// TODO Consider making this more like the RNA case, e.g. all alignment checks before selection and pick the best passing one. +static +bool +selectContigDNA( + SVCandidateAssemblyData& assemblyData, + const GlobalJumpAligner& spanningAligner +) +{ + int maxAlignContigIndex(-1); + for (unsigned index = 0; index < assemblyData.contigs.size(); index++) + { + const JumpAlignmentResult& alignment(assemblyData.spanningAlignments[index]); +#ifdef DEBUG_REFINER + log_os << __FUNCTION__ << ": Checking contig alignment: " << contigIndex << "\n"; +#endif + if (!basicContigAlignmentCheck(alignment)) continue; +#ifdef DEBUG_REFINER + log_os << __FUNCTION__ << ": contig alignment initial okay: " << contigIndex << "\n"; +#endif + // Find the contig with the highest alignment score + if ((maxAlignContigIndex == -1) || + (assemblyData.spanningAlignments[index].score > assemblyData.spanningAlignments[maxAlignContigIndex].score)) + { + maxAlignContigIndex = index; + } + } +#ifdef DEBUG_REFINER + log_os << __FUNCTION__ << ": selected contig: " << selectedContigIndex << "\n"; +#endif + if ((maxAlignContigIndex == -1) || + (checkFilterSubAlignments(assemblyData.spanningAlignments[maxAlignContigIndex], spanningAligner, false))) + { + return false; + } + // ok, passed QC -- mark the high-scoring alignment as usable for + // hypothesis refinement: + assemblyData.bestAlignmentIndex = maxAlignContigIndex; + return true; +} + + void SVCandidateAssemblyRefiner:: getJumpAssembly( @@ -1554,7 +1658,6 @@ getJumpAssembly( // it's empty: assemblyData.spanningAlignments.resize(contigCount); - std::vector goodContigIndicies; for (unsigned contigIndex(0); contigIndex < contigCount; ++contigIndex) { const AssembledContig& contig(assemblyData.contigs[contigIndex]); @@ -1663,90 +1766,34 @@ getJumpAssembly( log_os << __FUNCTION__ << "\tbp2seq_fwd: " << bp2Seq << "\n"; } #endif - -#ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": Checking contig aln: " << contigIndex << "\n"; -#endif - // QC the alignment to make sure it spans the two breakend locations: - static const unsigned minAlignRefSpan(20); - const bool isAlignment1Good(alignment.align1.isAligned() && (apath_ref_length(alignment.align1.apath) >= minAlignRefSpan)); - const bool isAlignment2Good(alignment.align2.isAligned() && (apath_ref_length(alignment.align2.apath) >= minAlignRefSpan)); - bool isAlignmentGood(isAlignment1Good && isAlignment2Good); - if (!isAlignmentGood) continue; -#ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": contig alignment initial okay: " << contigIndex << "\n"; -#endif - if (isRNA && checkFilterSubAlignments(alignment, _spanningAligner, isRNA)) - { - // For RNA pre-filter all contigs to only consider those passing the SubAlignment filter - isAlignmentGood = false; - } - if (isAlignmentGood) - { - goodContigIndicies.push_back(contigIndex); -#ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": contig alignment okay: " << contigIndex << "\n"; -#endif - } } - if (goodContigIndicies.empty()) return; // Find the contig with the highest alignment score - unsigned maxAlignContigIndex = goodContigIndicies.front(); - for (unsigned index : goodContigIndicies) - { - if (assemblyData.spanningAlignments[index].score > assemblyData.spanningAlignments[maxAlignContigIndex].score) - { - maxAlignContigIndex = index; - } - } - - unsigned selectedContigIndex(maxAlignContigIndex); + bool foundContig(false); if (isRNA) { - // Pick the contig with the most supporting reads that has an alignment score at least half as high as the highest-scoring contig - for (unsigned index : goodContigIndicies) - { - const bool sufficientScore(assemblyData.spanningAlignments[index].score * 2 > assemblyData.spanningAlignments[maxAlignContigIndex].score); - const bool moreReads(assemblyData.contigs[index].supportReads.size() > assemblyData.contigs[selectedContigIndex].supportReads.size()); - if (sufficientScore && moreReads) - { - selectedContigIndex = index; - } - } + foundContig = selectContigRNA(assemblyData, _spanningAligner); } -#ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": selected contig: " << selectedContigIndex << "\n"; -#endif - if (checkFilterSubAlignments(assemblyData.spanningAlignments[selectedContigIndex], _spanningAligner, isRNA)) + else { - // Check the selected contig with the SubAlignments filter. - // Only relevant for DNA, since the RNA contigs were pre-filtered with this filter before selection - // TODO (Maybe) switch DNA to the RNA logic? - return; + foundContig = selectContigDNA(assemblyData, _spanningAligner); } - - // ok, passed QC -- mark the high-scoring alignment as usable for - // hypothesis refinement: - { - assemblyData.bestAlignmentIndex = selectedContigIndex; + if (!foundContig) return; #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": highscoreid: " << selectedContigIndex << " alignment: " << assemblyData.spanningAlignments[selectedContigIndex]; + log_os << __FUNCTION__ << ": highscoreid: " << selectedContigIndex << " alignment: " << assemblyData.spanningAlignments[selectedContigIndex]; #endif + // process the alignment into information that's easily usable + // in the vcf output (ie. breakends in reference coordinates) - // process the alignment into information that's easily usable - // in the vcf output (ie. breakends in reference coordinates) - - // summarize usable output information in a second SVBreakend - // object -- this is the 'refined' sv: - assemblyData.svs.push_back(sv); - SVCandidate& newSV(assemblyData.svs.back()); + // summarize usable output information in a second SVBreakend + // object -- this is the 'refined' sv: + assemblyData.svs.push_back(sv); + SVCandidate& newSV(assemblyData.svs.back()); - generateRefinedVCFSVCandidateFromJumpAlignment(bporient, assemblyData, selectedContigIndex, newSV); + generateRefinedVCFSVCandidateFromJumpAlignment(bporient, assemblyData, newSV); #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": highscore refined sv: " << newSV; + log_os << __FUNCTION__ << ": highscore refined sv: " << newSV; #endif - } } From e14b6e1a45b4ec9d4d2e17dc3175e629cb2d4c00 Mon Sep 17 00:00:00 2001 From: Felix Schlesinger Date: Wed, 6 Sep 2017 16:36:53 -0700 Subject: [PATCH 8/8] Code review fixups --- .../SVCandidateAssemblyRefiner.cpp | 10 +++++----- src/c++/lib/assembly/AssembledContig.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp index 992c7c1b..3d8d7cc0 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp @@ -1387,7 +1387,7 @@ selectContigRNA( if (goodContigIndicies.empty()) return false; // Find the highest alignment score int maxAlnScore = 0; - unsigned selectedContigIndex(goodContigIndicies.front()); + unsigned selectedContigIndex(goodContigIndicies.front()); for (unsigned index : goodContigIndicies) { if (assemblyData.spanningAlignments[index].score > maxAlnScore) @@ -1741,9 +1741,9 @@ getJumpAssembly( << bp2refSeq.substr(align2LeadingCut, bp2refSeq.size() - align2LeadingCut - align2TrailingCut) << '\n'; #endif _spanningAligner.align(contig.seq.begin(), contig.seq.end(), - align1RefStrPtr->begin() + align1LeadingCut, align1RefStrPtr->end() - align1TrailingCut, - align2RefStrPtr->begin() + align2LeadingCut, align2RefStrPtr->end() - align2TrailingCut, - alignment); + align1RefStrPtr->begin() + align1LeadingCut, align1RefStrPtr->end() - align1TrailingCut, + align2RefStrPtr->begin() + align2LeadingCut, align2RefStrPtr->end() - align2TrailingCut, + alignment); } alignment.align1.beginPos += align1LeadingCut; @@ -1779,7 +1779,7 @@ getJumpAssembly( } if (!foundContig) return; #ifdef DEBUG_REFINER - log_os << __FUNCTION__ << ": highscoreid: " << selectedContigIndex << " alignment: " << assemblyData.spanningAlignments[selectedContigIndex]; + log_os << __FUNCTION__ << ": highscoreid: " << assemblyData.bestAlignmentIndex << " alignment: " << assemblyData.spanningAlignments[assemblyData.bestAlignmentIndex]; #endif // process the alignment into information that's easily usable // in the vcf output (ie. breakends in reference coordinates) diff --git a/src/c++/lib/assembly/AssembledContig.cpp b/src/c++/lib/assembly/AssembledContig.cpp index f3f6077c..9c239bab 100644 --- a/src/c++/lib/assembly/AssembledContig.cpp +++ b/src/c++/lib/assembly/AssembledContig.cpp @@ -33,7 +33,7 @@ operator<<(std::ostream& os, const AssembledContig& contig) { os << "CONTIG size: " << contig.seq.size() << " seedCount: " << contig.seedReadCount - << " supportReads: " << contig.supportReads.size() + << " supportReads: " << contig.supportReads.size() << " seq:\n"; printSeq(contig.seq,os); os << "\n";