broadinstitute · yfarjoun · Dec 13, 2019 · Feb 13, 2020 · Apr 6, 2020 · jamesemery
diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java
@@ -483,14 +483,15 @@ private void buildSortedReadEndLists(final boolean useBarcodes) {
         log.info("Will retain up to " + maxInMemory + " data points before spilling to disk.");
 
         final ReadEndsForMarkDuplicatesCodec fragCodec, pairCodec, diskCodec;
+        final double scale = OPTICAL_DUPLICATE_PIXEL_DISTANCE/(double) OpticalDuplicateFinder.DEFAULT_OPTICAL_DUPLICATE_DISTANCE;
         if (useBarcodes) {
-            fragCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
-            pairCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
-            diskCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
+            fragCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec(scale);
+            pairCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec(scale);
+            diskCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec(scale);
         } else {
-            fragCodec = new ReadEndsForMarkDuplicatesCodec();
-            pairCodec = new ReadEndsForMarkDuplicatesCodec();
-            diskCodec = new ReadEndsForMarkDuplicatesCodec();
+            fragCodec = new ReadEndsForMarkDuplicatesCodec(scale);
+            pairCodec = new ReadEndsForMarkDuplicatesCodec(scale);
+            diskCodec = new ReadEndsForMarkDuplicatesCodec(scale);
         }
 
         this.pairSort = SortingCollection.newInstance(ReadEndsForMarkDuplicates.class,

diff --git a/src/main/java/picard/sam/markduplicates/util/OpticalDuplicateFinder.java b/src/main/java/picard/sam/markduplicates/util/OpticalDuplicateFinder.java
@@ -31,7 +31,10 @@
 import picard.util.GraphUtils;
 
 import java.io.Serializable;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
 /**
  * Contains methods for finding optical/co-localized/sequencing duplicates.
@@ -327,16 +330,19 @@ private PhysicalLocation keeperOrNull(final List<? extends PhysicalLocation> lis
     /** Simple method to test whether two physical locations are close enough to each other to be deemed optical dupes. */
     private boolean closeEnough(final PhysicalLocation lhs, final PhysicalLocation rhs, final int distance) {
         return lhs != rhs &&                                    // no comparing an object to itself (checked using object identity)!
-               lhs.hasLocation() && rhs.hasLocation() &&        // no comparing objects without locations
-               lhs.getReadGroup() == rhs.getReadGroup() &&      // must be in the same RG to be optical duplicates
-               lhs.getTile()      == rhs.getTile()      &&      // and the same tile
-               Math.abs(lhs.getX() - rhs.getX()) <= distance &&
-               Math.abs(lhs.getY() - rhs.getY()) <= distance;
+                lhs.hasLocation() && rhs.hasLocation() &&        // no comparing objects without locations
+                lhs.getReadGroup() == rhs.getReadGroup() &&      // must be in the same RG to be optical duplicates
+                lhs.getTile() == rhs.getTile() &&                // and the same tile
+                closeEnoughShort(lhs, rhs, distance);            // X/Y proximity is delegated (the helper repeats the identity check)
     }
 
     private boolean closeEnoughShort(final PhysicalLocation lhs, final PhysicalLocation rhs, final int distance) {
         return lhs != rhs &&
-                Math.abs(lhs.getX() - rhs.getX()) <= distance &&
-                Math.abs(lhs.getY() - rhs.getY()) <= distance;
+                // Compare the raw coordinate differences. Reducing them modulo Short.MAX_VALUE would alias locations
+                // whose separation is close to a multiple of 32767 and report them as neighbours, and it does not
+                // reliably recover a short wrap-around either (that has period 65536, not 32767). The mirrored
+                // operand added nothing: Java's % takes the sign of the dividend, so |(a - b) % m| == |(b - a) % m|.
+                Math.abs(lhs.getX() - rhs.getX()) <= distance &&   // within 'distance' along X
+                Math.abs(lhs.getY() - rhs.getY()) <= distance;     // and within 'distance' along Y
     }
 }
diff --git a/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java b/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java
@@ -26,15 +26,26 @@
 import htsjdk.samtools.util.SortingCollection;
 import picard.PicardException;
 
-import java.io.*;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
 
 /** Codec for ReadEnds that just outputs the primitive fields and reads them back. */
 public class ReadEndsForMarkDuplicatesCodec implements SortingCollection.Codec<ReadEndsForMarkDuplicates> {
     protected DataInputStream in;
     protected DataOutputStream out;
 
+    final protected double scaleFactor;
+
+    public ReadEndsForMarkDuplicatesCodec(final double coordinateAccuracy) { // divisor applied to x/y before they are stored as shorts
+        this.scaleFactor = coordinateAccuracy >= 1.0 ? coordinateAccuracy : 1.0; // a factor < 1 overflows sooner, 0/NaN wipes the coordinates
+    }
+
     public SortingCollection.Codec<ReadEndsForMarkDuplicates> clone() {
-        return new ReadEndsForMarkDuplicatesCodec();
+        return new ReadEndsForMarkDuplicatesCodec(this.scaleFactor); // the copy is built with this codec's scale factor
     }
 
     public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); }
@@ -66,8 +77,12 @@ public void encode(final ReadEndsForMarkDuplicates read) {
 
             this.out.writeShort(read.readGroup);
             this.out.writeShort(read.tile);
-            this.out.writeShort((short)read.x);
-            this.out.writeShort((short)read.y);
+
+            // Scale the coordinates down before narrowing them to short: where they might otherwise overflow and
+            // pixel-level accuracy is not essential, the low-order bits are dropped instead of the value wrapping.
+
+            this.out.writeShort((short) (read.x / this.scaleFactor));
+            this.out.writeShort((short) (read.y / this.scaleFactor));
             this.out.writeByte(read.orientationForOpticalDuplicates);
             this.out.writeInt(read.duplicateSetSize);
         } catch (final IOException ioe) {
@@ -99,8 +114,10 @@ public ReadEndsForMarkDuplicates decode() {
 
             read.readGroup = this.in.readShort();
             read.tile = this.in.readShort();
-            read.x = this.in.readShort();
-            read.y = this.in.readShort();
+
+            // Reverse the scaling applied when the value was written; the bits dropped by encode() are not recovered.
+            read.x = (int) (this.scaleFactor * this.in.readShort());
+            read.y = (int) (this.scaleFactor * this.in.readShort());
 
             read.orientationForOpticalDuplicates = this.in.readByte();
             read.duplicateSetSize = this.in.readInt();

diff --git a/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesWithBarcodesCodec.java b/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesWithBarcodesCodec.java
@@ -1,26 +1,26 @@
 /*
-  * The MIT License
-  *
-  * Copyright (c) 2015 The Broad Institute
-  *
-  * Permission is hereby granted, free of charge, to any person obtaining a copy
-  * of this software and associated documentation files (the "Software"), to deal
-  * in the Software without restriction, including without limitation the rights
-  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  * copies of the Software, and to permit persons to whom the Software is
-  * furnished to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in
-  * all copies or substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  * THE SOFTWARE.
-  */
+ * The MIT License
+ *
+ * Copyright (c) 2015 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
 
 package picard.sam.markduplicates.util;
 
@@ -34,9 +34,13 @@
  */
 public class ReadEndsForMarkDuplicatesWithBarcodesCodec extends ReadEndsForMarkDuplicatesCodec {
 
+    public ReadEndsForMarkDuplicatesWithBarcodesCodec(final double coordinateAccuracy) {
+        super(coordinateAccuracy); // forwarded to the ReadEndsForMarkDuplicatesCodec constructor
+    }
+
     @Override
     public SortingCollection.Codec<ReadEndsForMarkDuplicates> clone() {
-        return new ReadEndsForMarkDuplicatesWithBarcodesCodec();
+        return new ReadEndsForMarkDuplicatesWithBarcodesCodec(this.scaleFactor); // the copy is built with this codec's scale factor
     }
 
     @Override
@@ -47,7 +51,7 @@ public void encode(final ReadEndsForMarkDuplicates read) {
         super.encode(read);
 
         try {
-            final ReadEndsForMarkDuplicatesWithBarcodes val = (ReadEndsForMarkDuplicatesWithBarcodes)read;
+            final ReadEndsForMarkDuplicatesWithBarcodes val = (ReadEndsForMarkDuplicatesWithBarcodes) read;
             out.writeInt(val.barcode);
             out.writeInt(val.readOneBarcode);
             out.writeInt(val.readTwoBarcode);
@@ -59,7 +63,9 @@ public void encode(final ReadEndsForMarkDuplicates read) {
     @Override
     public ReadEndsForMarkDuplicates decode() {
         final ReadEndsForMarkDuplicates parentRead = super.decode();
-        if (null == parentRead) return null; // EOF
+        if (null == parentRead) {
+            return null; // EOF
+        }
         final ReadEndsForMarkDuplicatesWithBarcodes read = new ReadEndsForMarkDuplicatesWithBarcodes(parentRead);
         try {
             read.barcode = in.readInt();
@@ -70,5 +76,4 @@ public ReadEndsForMarkDuplicates decode() {
             throw new PicardException("Exception writing ReadEnds to file.", ioe);
         }
     }
-
 }
diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesTest.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesTest.java
@@ -193,6 +193,7 @@ public void testOpticalDuplicateDetection(final File sam, final long expectedNum
         markDuplicates.TMP_DIR = CollectionUtil.makeList(outputDir);
         // Needed to suppress calling CommandLineProgram.getVersion(), which doesn't work for code not in a jar
         markDuplicates.PROGRAM_RECORD_ID = null;
+        markDuplicates.OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500;
         Assert.assertEquals(markDuplicates.doWork(), 0);
         Assert.assertEquals(markDuplicates.numOpticalDuplicates(), expectedNumOpticalDuplicates);
         IOUtil.recursiveDelete(outputDir.toPath());
@@ -204,6 +205,7 @@ public Object[][] testOpticalDuplicateDetectionDataProvider() {
         return new Object[][] {
                 {new File(TEST_DATA_DIR, "optical_dupes.sam"), 1L},
                 {new File(TEST_DATA_DIR, "optical_dupes_casava.sam"), 1L},
+                {new File(TEST_DATA_DIR, "GH1141.optical_dups.sam"), 1L},
         };
     }
 

diff --git a/testdata/picard/sam/MarkDuplicates/GH1141.optical_dups.sam b/testdata/picard/sam/MarkDuplicates/GH1141.optical_dups.sam
@@ -0,0 +1,11 @@
+@HD	VN:1.6	SO:coordinate
+@SQ	SN:1	LN:200000000
+@SQ	SN:2	LN:200000000
+@RG	ID:00001_1#1	LB:22222222 PL:ILLUMINA	SM:AW99
+@PG	ID:MarkDuplicates	VN:2.21.4-2-ga3021a7-SNAPSHOT	CL:MarkDuplicates TAG_DUPLICATE_SET_MEMBERS=true TAGGING_POLICY=All INPUT=[short_test.sam] OUTPUT=short_test_out.sam METRICS_FILE=short_test_stat.txt OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500    MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 REMOVE_SEQUENCING_DUPLICATES=false CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true REMOVE_DUPLICATES=false ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false	PN:MarkDuplicates
+AW7_00001:1:2101:29376:32760	99	1	2239203	60	151M	=	2239280	228	TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT	AAFFFJJJJJJJJJJJJJJJJJJJJFJJJJJJFFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJAFJJFJF	MC:Z:151M	PG:Z:MarkDuplicates	RG:Z:00001_1#1	DI:i:0	DS:i:2
+AW7_00001:1:2101:29346:33023	1123	1	2239203	60	151M	=	2239280	228	TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT	AAFFFJJJJJJJJJJJJJJJJJJJAJFJJJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJAJJJJJF	MC:Z:151M	PG:Z:MarkDuplicates	RG:Z:00001_1#1	DI:i:0	DS:i:2	DT:Z:LB
+AW7_00001:1:2101:29346:98296	1123	1	2239203	60	151M	=	2239280	228	TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT	AAFFFJJJJJJJJJJJJJJJJJJJAJFJJJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJAJJJJJF	MC:Z:151M	PG:Z:MarkDuplicates	RG:Z:00001_1#1	DI:i:0	DS:i:2	DT:Z:LB
+AW7_00001:1:2101:29376:32760	147	1	2239280	60	151M	=	2239203	-228	TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT	JJFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJFJJFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJFFFAA	MC:Z:151M	PG:Z:MarkDuplicates	RG:Z:00001_1#1	DI:i:0	DS:i:2
+AW7_00001:1:2101:29346:33023	1171	1	2239280	60	151M	=	2239203	-228	TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT	JJJJJFJJJJJJJJJJJJJJJJJJJJJFJJF7JJJFJJJFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJJJJJJJJJJJJJJJJJFFAAA	MC:Z:151M	PG:Z:MarkDuplicates	RG:Z:00001_1#1	DI:i:0	DS:i:2	DT:Z:LB
+AW7_00001:1:2101:29346:98296	1171	1	2239280	60	151M	=	2239203	-228	TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT	JJJJJFJJJJJJJJJJJJJJJJJJJJJFJJF7JJJFJJJFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJJJJJJJJJJJJJJJJJFFAAA	MC:Z:151M	PG:Z:MarkDuplicates	RG:Z:00001_1#1	DI:i:0	DS:i:2	DT:Z:LB