Skip to content

Commit

Permalink
Fixed a bug where overlapping reads in subsequent regions can have in…
Browse files Browse the repository at this point in the history
…valid base qualities (#6943)

This fixes the test that was accidentally disabled (when we added a flag for overlapping read adjustment we forgot to set it correctly for the test) and reinstated something like the code from #4926. It should now necessarily be the case that the finalize() method calls read.copy() at least once for every read (though many multiples of once are still quite possible).

Fixes #6882
  • Loading branch information
jamesemery authored Nov 24, 2020
1 parent 2e78008 commit 3be2f38
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -102,20 +102,31 @@ public static void finalizeRegion(final AssemblyRegion region,
}

final byte minTailQualityToUse = errorCorrectReads ? HaplotypeCallerEngine.MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION : minTailQuality;
final List<GATKRead> readsToUse = region.getReads().stream()
// TODO unclipping soft clips may introduce bases that aren't in the extended region if the unclipped bases
// TODO include a deletion w.r.t. the reference. We must remove kmers that occur before the reference haplotype start
.map(read -> dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(read) ?
ReadClipper.hardClipSoftClippedBases(read) : ReadClipper.revertSoftClippedBases(read))
.map(read -> softClipLowQualityEnds ? ReadClipper.softClipLowQualEnds(read, minTailQualityToUse) :
ReadClipper.hardClipLowQualEnds(read, minTailQualityToUse))
.filter(read -> read.getStart() <= read.getEnd())
.map(read -> read.isUnmapped() ? read : ReadClipper.hardClipAdaptorSequence(read))
.filter(read -> !read.isEmpty() && read.getCigar().getReadLength() > 0)
.map(read -> ReadClipper.hardClipToRegion(read, region.getPaddedSpan().getStart(), region.getPaddedSpan().getEnd() ))
.filter(read -> read.getStart() <= read.getEnd() && read.getLength() > 0 && read.overlaps(region.getPaddedSpan()))
.sorted(new ReadCoordinateComparator(readsHeader)) // TODO: sort may be unnecessary here
.collect(Collectors.toList());

final List<GATKRead> readsToUse = new ArrayList<>();
for (GATKRead originalRead : region.getReads()) {
// TODO unclipping soft clips may introduce bases that aren't in the extended region if the unclipped bases
// TODO include a deletion w.r.t. the reference. We must remove kmers that occur before the reference haplotype start
GATKRead read = (dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(originalRead) ?
ReadClipper.hardClipSoftClippedBases(originalRead) : ReadClipper.revertSoftClippedBases(originalRead));
read = (softClipLowQualityEnds ? ReadClipper.softClipLowQualEnds(read, minTailQualityToUse) :
ReadClipper.hardClipLowQualEnds(read, minTailQualityToUse));

if (read.getStart() <= read.getEnd()) {
read = (read.isUnmapped() ? read : ReadClipper.hardClipAdaptorSequence(read));

if (!read.isEmpty() && read.getCigar().getReadLength() > 0) {
read = ReadClipper.hardClipToRegion(read, region.getPaddedSpan().getStart(), region.getPaddedSpan().getEnd() );

if (read.getStart() <= read.getEnd() && read.getLength() > 0 && read.overlaps(region.getPaddedSpan())) {
// NOTE: here we make a defensive copy of the read if it has not been modified by the above operations
// which might only make copies in the case that the read is actually clipped
readsToUse.add(read == originalRead? read.copy() : read);
}
}
}
}
readsToUse.sort(new ReadCoordinateComparator(readsHeader));

// handle overlapping read pairs from the same fragment
if (correctOverlappingBaseQualities) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ public void testfinalizeRegion() {

SAMLineParser parser = new SAMLineParser(header);
List<GATKRead> reads = new LinkedList<GATKRead>();
// NOTE: These reads are mates that overlap one-another without agreement which means they should have modified base qualities after calling finalize()
// Read2 has a clean cigar, and thus will not be copied by the clipping code before being fed to the overlapping bases code. This test asserts that its still copied.
SAMRecord orgRead0 = parser.parseLine("HWI-ST807:461:C2P0JACXX:4:2204:18080:5857\t83\t1\t42596803\t39\t1S95M5S\t=\t42596891\t-7\tGAATCATCATCAAATGGAATCTAATGGAATCATTGAACAGAATTGAATGGAATCGTCATCGAATGAATTGAATGCAATCATCGAATGGTCTCGAATAGAAT\tDAAAEDCFCCGEEDDBEDDDGCCDEDECDDFDCEECCFEECDCEDBCDBDBCC>DCECC>DBCDDBCBDDBCDDEBCCECC>DBCDBDBGC?FCCBDB>>?\tRG:Z:tumor");
SAMRecord orgRead1 = parser.parseLine("HWI-ST807:461:C2P0JACXX:4:2204:18080:5857\t163\t1\t42596891\t39\t101M\t=\t42596803\t7\tCTCGAATGGAATCATTTTCTACTGGAAAGGAATGGAATCATCGCATAGAATCGAATGGAATTAACATGGAATGGAATCGAATGTAATCATCATCAAATGGA\t>@>:ABCDECCCEDCBBBDDBDDEBCCBEBBCBEBCBCDDCD>DECBGCDCF>CCCFCDDCBABDEDFCDCDFFDDDG?DDEGDDFDHFEGDDGECB@BAA\tRG:Z:tumor");

Expand All @@ -64,9 +66,10 @@ public void testfinalizeRegion() {
activeRegion.addAll(reads);
SampleList sampleList = SampleList.singletonSampleList("tumor");
Byte minbq = 9;
AssemblyBasedCallerUtils.finalizeRegion(activeRegion, false, false, minbq, header, sampleList, false, false);
// NOTE: this test MUST be run with correctOverlappingBaseQualities enabled otherwise this test can succeed even with unsafe code
AssemblyBasedCallerUtils.finalizeRegion(activeRegion, false, false, minbq, header, sampleList, true, false);

// make sure reads are not changed due to finalizeRegion()
// make sure that the original reads are not changed due to finalizeRegion()
Assert.assertTrue(reads.get(0).convertToSAMRecord(header).equals(orgRead0));
Assert.assertTrue(reads.get(1).convertToSAMRecord(header).equals(orgRead1));
}
Expand Down

0 comments on commit 3be2f38

Please sign in to comment.