-
Notifications
You must be signed in to change notification settings - Fork 597
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixed MarkDuplicatesSpark handling of unsorted bams #4732
Changes from 4 commits
ae33725
8742a80
942b290
dc5dfaf
b338293
e8dcae0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,6 +60,11 @@ public int getIndex() { | |
this.value = value; | ||
this.index = index; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "indexpair["+index+","+value.toString()+"]"; | ||
} | ||
} | ||
|
||
/** | ||
|
@@ -167,9 +172,7 @@ private static JavaPairRDD<String, Iterable<IndexPair<GATKRead>>> getReadsGroupe | |
keyedReads = spanReadsByKey(indexedReads); | ||
} else { | ||
// sort by group and name (incurs a shuffle) | ||
JavaPairRDD<String, IndexPair<GATKRead>> keyReadPairs = indexedReads.mapToPair(read -> new Tuple2<>(ReadsKey.keyForRead( | ||
read.getValue()), read)); | ||
keyedReads = keyReadPairs.groupByKey(numReducers); | ||
throw new GATKException("MarkDuplicatesSparkUtils.mark() requires input reads to be queryname sorted, yet the header indicated otherwise"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you have it print the sort order it thinks its in? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
} | ||
return keyedReads; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
package org.broadinstitute.hellbender.utils.read; | ||
|
||
import htsjdk.samtools.SAMRecordQueryNameComparator; | ||
import htsjdk.samtools.SAMTag; | ||
|
||
import java.io.Serializable; | ||
import java.util.Comparator; | ||
|
||
/** | ||
* compare {@link GATKRead} by queryname | ||
* duplicates the exact ordering of {@link SAMRecordQueryNameComparator} | ||
*/ | ||
public class ReadQueryNameComparator implements Comparator<GATKRead>, Serializable { | ||
private static final long serialVersionUID = 1L; | ||
|
||
@Override | ||
public int compare(final GATKRead read1, final GATKRead read2) { | ||
int cmp = compareReadNames(read1, read2); | ||
if (cmp != 0) { | ||
return cmp; | ||
} | ||
|
||
final boolean r1Paired = read1.isPaired(); | ||
final boolean r2Paired = read2.isPaired(); | ||
|
||
if (r1Paired || r2Paired) { | ||
if (!r1Paired) return 1; | ||
else if (!r2Paired) return -1; | ||
else if (read1.isFirstOfPair() && read2.isSecondOfPair()) return -1; | ||
else if (read1.isSecondOfPair() && read2.isFirstOfPair()) return 1; | ||
} | ||
|
||
if (read1.isReverseStrand() != read2.isReverseStrand()) { | ||
return (read1.isReverseStrand()? 1: -1); | ||
} | ||
if (read1.isSecondaryAlignment() != read2.isSecondaryAlignment()) { | ||
return read2.isSecondaryAlignment()? -1: 1; | ||
} | ||
if (read1.isSupplementaryAlignment() != read2.isSupplementaryAlignment()) { | ||
return read2.isSupplementaryAlignment() ? -1 : 1; | ||
} | ||
final Integer hitIndex1 = read1.getAttributeAsInteger(SAMTag.HI.name()); | ||
final Integer hitIndex2 = read2.getAttributeAsInteger(SAMTag.HI.name()); | ||
if (hitIndex1 != null) { | ||
if (hitIndex2 == null) return 1; | ||
else { | ||
cmp = hitIndex1.compareTo(hitIndex2); | ||
if (cmp != 0) return cmp; | ||
} | ||
} else if (hitIndex2 != null) return -1; | ||
return 0; | ||
} | ||
|
||
/** | ||
* compare read names lexicographically without any additional tie breakers | ||
*/ | ||
public int compareReadNames(final GATKRead read1, final GATKRead read2) { | ||
return read1.getName().compareTo(read2.getName()); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -144,6 +144,28 @@ public static JavaRDD<GATKRead> coordinateSortReads(final JavaRDD<GATKRead> read | |
return readVoidPairs.keys(); | ||
} | ||
|
||
/** | ||
* Sorts the given reads in queryname sort order. | ||
* @param reads the reads to sort | ||
* @param numReducers the number of reducers to use; a value of 0 means use the default number of reducers | ||
* @return a sorted RDD of reads | ||
*/ | ||
public static JavaRDD<GATKRead> querynameSortReads(final JavaRDD<GATKRead> reads, final int numReducers) { | ||
// Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount | ||
// of data going through the shuffle. | ||
final JavaPairRDD<GATKRead, Void> rddReadPairs = reads.mapToPair(read -> new Tuple2<>(read, (Void) null)); | ||
|
||
// do a total sort so that all the reads in partition i are less than those in partition i+1 | ||
final Comparator<GATKRead> comparator = new ReadQueryNameComparator(); | ||
final JavaPairRDD<GATKRead, Void> readVoidPairs; | ||
if (numReducers > 0) { | ||
readVoidPairs = rddReadPairs.sortByKey(comparator, true, numReducers); | ||
} else { | ||
readVoidPairs = rddReadPairs.sortByKey(comparator); | ||
} | ||
return readVoidPairs.keys(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This method should call the edge fixing method. We don't want to give people the option to do it wrong. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
} | ||
|
||
/** | ||
* Sorts the given reads according to the sort order in the header. | ||
* @param reads the reads to sort | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink; | ||
import org.broadinstitute.hellbender.utils.read.*; | ||
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy; | ||
import org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder; | ||
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey; | ||
import org.broadinstitute.hellbender.GATKBaseTest; | ||
import org.broadinstitute.hellbender.utils.test.SamAssertionUtils; | ||
|
@@ -66,4 +67,58 @@ private static Tuple2<String, Iterable<GATKRead>> pairIterable(String key, GATKR | |
return new Tuple2<>(key, ImmutableList.copyOf(reads)); | ||
} | ||
|
||
@Test | ||
// Test that asserts the duplicate marking is sorting agnostic, specifically this is testing that when reads are scrambled across | ||
// partitions in the input that all reads in a group are getting properly duplicate marked together as they are for queryname sorted bams | ||
public void testSortOrderParitioningCorrectness() throws IOException { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo paritioning -> partitioning There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
|
||
JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); | ||
JavaRDD<GATKRead> unsortedReads = generateUnsortedReads(10000,3, ctx, 100, true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. stupid nitpick: spaces here are wonky, and on the next line |
||
JavaRDD<GATKRead> pariedEndsQueryGrouped = generateUnsortedReads(10000,3, ctx,1, false); | ||
|
||
SAMFileHeader unsortedHeader = hg19Header.clone(); | ||
unsortedHeader.setSortOrder(SAMFileHeader.SortOrder.unsorted); | ||
SAMFileHeader sortedHeader = hg19Header.clone(); | ||
sortedHeader.setSortOrder(SAMFileHeader.SortOrder.queryname); | ||
|
||
// Using the header flagged as unsorted will result in the reads being sorted again | ||
JavaRDD<GATKRead> unsortedReadsMarked = MarkDuplicatesSpark.mark(unsortedReads,unsortedHeader, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES,new OpticalDuplicateFinder(),100,true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is called unsorted, but isn't it actually coordinate sorted? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the different num reducers? Is that to find issues with edge fixing? If it is, I think you'd be better off with a specific (and possibly similar) test for that. Since we're always generating pairs, it seems to me that they might never get split across partitions if we're creating an even number of partitions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the reason for different numbers of partitions was that when I first wrote this test this test there was no exposed way to do the edge fixing for a queryname sorted bam. I didn't want to deal with the problems of having a mispartitioned bam so I let the queryname sorted reads reside on one partition so the spanning couldn't be wrong. Since this is a test of the coordinate sorted bam marking across partitions and not the edge fixing i'm not worried. |
||
JavaRDD<GATKRead> sortedReadsMarked = MarkDuplicatesSpark.mark(pariedEndsQueryGrouped,sortedHeader, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES,new OpticalDuplicateFinder(),1,true); | ||
|
||
Iterator<GATKRead> sortedReadsFinal = sortedReadsMarked.sortBy(GATKRead::commonToString, false, 1).collect().iterator(); | ||
Iterator<GATKRead> unsortedReadsFinal = unsortedReadsMarked.sortBy(GATKRead::commonToString, false, 1).collect().iterator(); | ||
|
||
// Comparing the output reads to ensure they are all duplicate marked correctly | ||
while (sortedReadsFinal.hasNext()) { | ||
GATKRead read1 = sortedReadsFinal.next(); | ||
GATKRead read2 = unsortedReadsFinal.next(); | ||
Assert.assertEquals(read1.getName(), read2.getName()); | ||
Assert.assertEquals(read1.isDuplicate(), read2.isDuplicate()); | ||
} | ||
} | ||
|
||
private JavaRDD<GATKRead> generateUnsortedReads(int numReadGroups, int numDuplicatesPerGroup, JavaSparkContext ctx, int numPartitions, boolean coordinate) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these are sorted... rename to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could you add a bit of javadoc to this method explaining what it does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
int readNameCounter = 0; | ||
SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate, | ||
true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, SAMRecordSetBuilder.DEFAULT_DUPLICATE_SCORING_STRATEGY); | ||
|
||
Random rand = new Random(10); | ||
for (int i = 0; i < numReadGroups; i++ ) { | ||
int start1 = rand.nextInt(SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH); | ||
int start2 = rand.nextInt(SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH); | ||
for (int j = 0; j < numDuplicatesPerGroup; j++) { | ||
samRecordSetBuilder.addPair("READ" + readNameCounter++, 0, start1, start2); | ||
} | ||
} | ||
final ReadCoordinateComparator coordinateComparitor = new ReadCoordinateComparator(hg19Header); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. coordinateComparitor is unused, and misspelled. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, it really does have a poor lot in life doesn't it |
||
List<SAMRecord> records = Lists.newArrayList(samRecordSetBuilder.getRecords()); | ||
if (coordinate) { | ||
records.sort(new SAMRecordCoordinateComparator()); | ||
} else { | ||
records.sort(new SAMRecordQueryNameComparator()); | ||
} | ||
|
||
return ctx.parallelize(records, numPartitions).map(SAMRecordToGATKReadAdapter::new); | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull the sort onto its own line. It's not a great idea to hide really expensive operations inline with other calls.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I might extract this whole sorting operation into a function, "queryNameSortReadsIfNecessary"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done