diff --git a/build.sbt b/build.sbt index 192233aa58..abb411cc40 100644 --- a/build.sbt +++ b/build.sbt @@ -1,57 +1,41 @@ organization := "org.hammerlab.adam" -name := ParentPlugin.sparkName("adam-core") +name := sparkName("adam-core") version := "0.20.4-SNAPSHOT" -sonatypeProfileName := "org.hammerlab" - -val utilsVersion = "0.2.9" - hadoopVersion := "2.7.3" scalatestVersion := "2.2.1" -testDeps ++= Seq( - "org.bdgenomics.utils" %% "utils-misc" % utilsVersion classifier("tests") exclude("org.apache.spark", "*"), +addSparkDeps +publishTestJar +enableScalariform + +// Using ":=" here to clobber the usual default hammerlab-test-libs that are added by parent-plugin, which use +// Scalatest 3.0.0. +testDeps := Seq( "org.mockito" % "mockito-core" % "1.10.19" ) -providedDeps ++= Seq( - libraries.value('hadoop), - libraries.value('spark) -) +testJarTestDeps += (libraries.value('bdg_utils_misc) exclude("org.apache.spark", "*")) libraryDependencies ++= Seq( - "org.bdgenomics.utils" %% "utils-metrics" % utilsVersion, - "org.bdgenomics.utils" %% "utils-misc" % utilsVersion, - "org.bdgenomics.utils" %% "utils-io" % utilsVersion exclude("com.fasterxml.jackson.core", "*"), - "org.bdgenomics.utils" %% "utils-cli" % utilsVersion, - "org.bdgenomics.utils" %% "utils-intervalrdd" % utilsVersion, - "com.esotericsoftware.kryo" % "kryo" % "2.24.0", - "org.bdgenomics.bdg-formats" % "bdg-formats" % "0.10.0", - "commons-io" % "commons-io" % "2.4", + libraries.value('bdg_formats), + libraries.value('bdg_utils_cli), + libraries.value('bdg_utils_intervalrdd), + libraries.value('bdg_utils_io), + libraries.value('bdg_utils_metrics), + libraries.value('bdg_utils_misc), + libraries.value('commons_io), + libraries.value('hadoop_bam) exclude("com.github.samtools", "htsjdk"), + libraries.value('log4j), + "com.github.samtools" % "htsjdk" % "2.5.0", + "com.netflix.servo" % "servo-core" % "0.10.0", "it.unimi.dsi" % "fastutil" % "6.6.5", "org.apache.avro" % "avro" % "1.8.0", - "org.slf4j" % "slf4j-log4j12" % "1.7.21", + "org.apache.httpcomponents" % "httpclient" % "4.5.2", "org.apache.parquet" % "parquet-avro" % "1.8.1", "org.apache.parquet" % "parquet-scala_2.10" % "1.8.1" exclude("org.scala-lang", "scala-library"), - libraries.value('hadoop_bam), - "com.github.samtools" % "htsjdk" % "2.5.0", - "org.apache.httpcomponents" % "httpclient" % "4.5.2", - "com.netflix.servo" % "servo-core" % "0.10.0", - "org.hammerlab" %% "genomic-loci" % "1.4.2" exclude("com.github.samtools", "htsjdk") + "org.hammerlab" %% "genomic-loci" % "1.4.4" exclude("com.github.samtools", "htsjdk") ) - -import scalariform.formatter.preferences._ -import com.typesafe.sbt.SbtScalariform -import com.typesafe.sbt.SbtScalariform.ScalariformKeys - -SbtScalariform.defaultScalariformSettings - -ScalariformKeys.preferences := ScalariformKeys.preferences.value - .setPreference(AlignParameters, true) - .setPreference(CompactStringConcatenation, false) - .setPreference(AlignSingleLineCaseStatements, true) - .setPreference(DoubleIndentClassDeclaration, true) - .setPreference(PreserveDanglingCloseParenthesis, true) diff --git a/project/plugins.sbt b/project/plugins.sbt index c7a556c033..37f576a434 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,2 +1 @@ -addSbtPlugin("org.hammerlab" % "sbt-parent" % "1.2.3") -addSbtPlugin("com.typesafe.sbt" % "sbt-scalariform" % "1.3.0") +addSbtPlugin("org.hammerlab" % "sbt-parent" % "1.4.0") diff --git a/src/main/java/org/bdgenomics/adam/io/FastqRecordReader.java b/src/main/java/org/bdgenomics/adam/io/FastqRecordReader.java 
index ba79724427..37fdc1a705 100644 --- a/src/main/java/org/bdgenomics/adam/io/FastqRecordReader.java +++ b/src/main/java/org/bdgenomics/adam/io/FastqRecordReader.java @@ -40,7 +40,7 @@ * a single Text output. This is then fed into the FastqConverter, which * converts the single Text instance into two AlignmentRecords. */ -abstract class FastqRecordReader extends RecordReader { +abstract class FastqRecordReader extends RecordReader { /* * fastq format: * := + @@ -57,53 +57,53 @@ abstract class FastqRecordReader extends RecordReader { * For now I'm going to assume single-line sequences. This works for our sequencing * application. We'll see if someone complains in other applications. */ - + /** * First valid data index in the stream. */ private long start; - + /** * First index value beyond the slice, i.e. slice is in range [start,end). */ protected long end; - + /** * Current position in file. */ protected long pos; - + /** * Path of the file being parsed. */ private Path file; - + /** * The line reader we are using to read the file. */ private LineReader lineReader; - + /** * The input stream we are using to read the file. */ private InputStream inputStream; - + /** * The text for a single record pair we have parsed out. * Hadoop's RecordReader contract requires us to save this as state. */ private Text currentValue; - + /** * Newline string for matching on. */ private static final byte[] newline = "\n".getBytes(); - + /** * Maximum length for a read string. */ private static final int MAX_LINE_LENGTH = 10000; - + /** * Builds a new record reader given a config file and an input split. * @@ -115,13 +115,13 @@ protected FastqRecordReader(final Configuration conf, final FileSplit split) thr file = split.getPath(); start = split.getStart(); end = start + split.getLength(); - + FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); - + CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); - + if (codec == null) { // no codec. Uncompressed file. positionAtFirstRecord(fileIn); inputStream = fileIn; @@ -130,14 +130,12 @@ protected FastqRecordReader(final Configuration conf, final FileSplit split) thr if (start != 0) { throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); } - inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } - lineReader = new LineReader(inputStream); } - + /** * Checks to see whether the buffer is positioned at a valid record. * @@ -154,19 +152,19 @@ protected FastqRecordReader(final Configuration conf, final FileSplit split) thr * * @param stream The stream to reposition. */ - protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException { + protected final void positionAtFirstRecord(final FSDataInputStream stream) throws IOException { Text buffer = new Text(); - + if (true) { // (start > 0) // use start>0 to assume that files start with valid data // Advance to the start of the first record that ends with /1 // We use a temporary LineReader to read lines until we find the // position of the right one. We then seek the file to that position. 
stream.seek(start); LineReader reader = new LineReader(stream); - + int bytesRead = 0; do { - bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); + bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); int bufferLength = buffer.getLength(); if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) { start += bytesRead; @@ -175,9 +173,9 @@ protected void positionAtFirstRecord(FSDataInputStream stream) throws IOExceptio // // If this isn't the start of a record, we want to backtrack to its end long backtrackPosition = start + bytesRead; - - bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); - bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); + + bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); + bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') { break; // all good! } else { @@ -188,39 +186,39 @@ protected void positionAtFirstRecord(FSDataInputStream stream) throws IOExceptio } } } while (bytesRead > 0); - + stream.seek(start); } - pos = start; } - + /** * Method is a no-op. * * @param split The input split that we will parse. * @param context The Hadoop task context. */ - public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {} - + public final void initialize(final InputSplit split, final TaskAttemptContext context) + throws IOException, InterruptedException {} + /** * FASTQ has no keys, so we return null. * * @return Always returns null. */ - public Void getCurrentKey() { + public final Void getCurrentKey() { return null; } - + /** * Returns the last interleaved FASTQ record. * * @return The text corresponding to the last read pair. */ - public Text getCurrentValue() { + public final Text getCurrentValue() { return currentValue; } - + /** * Seeks ahead in our split to the next key-value pair. * @@ -229,42 +227,41 @@ public Text getCurrentValue() { * * @return True if reading the next read pair succeeded. */ - public boolean nextKeyValue() throws IOException, InterruptedException { + public final boolean nextKeyValue() throws IOException, InterruptedException { currentValue = new Text(); - return next(currentValue); } - + /** * Close this RecordReader to future operations. */ - public void close() throws IOException { + public final void close() throws IOException { inputStream.close(); } - + /** * How much of the input has the RecordReader consumed? * * @return Returns a value on [0.0, 1.0] that notes how many bytes we * have read so far out of the total bytes to read. */ - public float getProgress() { + public final float getProgress() { if (start == end) { return 1.0f; } else { return Math.min(1.0f, (pos - start) / (float)(end - start)); } } - + /** * Produces a debugging message with the file position. * * @return Returns a string containing {filename}:{index}. */ - protected String makePositionMessage() { + protected final String makePositionMessage() { return file.toString() + ":" + pos; } - + /** * Parses a read from an interleaved FASTQ file. * @@ -277,7 +274,9 @@ protected String makePositionMessage() { * @throws RuntimeException Throws exception if FASTQ record doesn't * have proper formatting (e.g., record doesn't start with @). 
*/ - protected boolean lowLevelFastqRead(Text readName, Text value) throws IOException { + protected final boolean lowLevelFastqRead(final Text readName, final Text value) + throws IOException { + // ID line readName.clear(); long skipped = appendLineInto(readName, true); @@ -292,31 +291,30 @@ protected boolean lowLevelFastqRead(Text readName, Text value) throws IOExceptio ". Line: " + readName + ". \n"); } - value.append(readName.getBytes(), 0, readName.getLength()); - + // sequence appendLineInto(value, false); - + // separator line appendLineInto(value, false); - + // quality appendLineInto(value, false); - + return true; } - + /** * Reads from the input split. * * @param value Text record to write input value into. * @return Returns whether this read was successful or not. * - * @see lowLevelFastqRead + * @see #lowLevelFastqRead(Text, Text) */ abstract protected boolean next(Text value) throws IOException; - + /** * Reads a newline into a text record from the underlying line reader. * @@ -330,14 +328,15 @@ protected boolean lowLevelFastqRead(Text readName, Text value) throws IOExceptio private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException { Text buf = new Text(); int bytesRead = lineReader.readLine(buf, MAX_LINE_LENGTH); - - if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) + + if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) { throw new EOFException(); - + } + dest.append(buf.getBytes(), 0, buf.getLength()); dest.append(newline, 0, 1); pos += bytesRead; - + return bytesRead; } } \ No newline at end of file diff --git a/src/main/java/org/bdgenomics/adam/io/InterleavedFastqInputFormat.java b/src/main/java/org/bdgenomics/adam/io/InterleavedFastqInputFormat.java index 05206cd084..b9c726ef72 100755 --- a/src/main/java/org/bdgenomics/adam/io/InterleavedFastqInputFormat.java +++ b/src/main/java/org/bdgenomics/adam/io/InterleavedFastqInputFormat.java @@ -41,7 +41,7 @@ * This reader is based on the FastqInputFormat that's part of Hadoop-BAM, * found at https://github.com/HadoopGenomics/Hadoop-BAM/blob/master/src/main/java/org/seqdoop/hadoop_bam/FastqInputFormat.java */ -public class InterleavedFastqInputFormat extends FileInputFormat { +public final class InterleavedFastqInputFormat extends FileInputFormat { /** * A record reader for the interleaved FASTQ format. @@ -70,7 +70,7 @@ private static class InterleavedFastqRecordReader extends FastqRecordReader { * @return Returns true if the buffer contains the first line of a properly * formatted pair of FASTQ records. */ - protected boolean checkBuffer(int bufferLength, Text buffer) { + protected boolean checkBuffer(final int bufferLength, final Text buffer) { return (bufferLength >= 2 && buffer.getBytes()[0] == '@' && buffer.getBytes()[bufferLength - 2] == '/' && @@ -87,28 +87,22 @@ protected boolean checkBuffer(int bufferLength, Text buffer) { * middle of a read, or if we have a read that is incorrectly * formatted (missing readname delimiters). 
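The `lowLevelFastqRead` helper above assembles one FASTQ record by appending four lines (ID, sequence, separator, quality) into the output `Text`, after checking that the ID line starts with `@`. A minimal Scala sketch of that per-record shape, using a plain `Iterator[String]` instead of Hadoop's `LineReader`; the `FastqRecord` case class and `readRecord` helper are illustrative only, not part of this patch:

```scala
// Illustrative only: mirrors the four appendLineInto calls in lowLevelFastqRead.
case class FastqRecord(id: String, sequence: String, separator: String, quality: String) {
  // Re-join the record newline-terminated, as the reader does when building its Text value.
  def asText: String = Seq(id, sequence, separator, quality).mkString("", "\n", "\n")
}

def readRecord(lines: Iterator[String]): Option[FastqRecord] =
  if (!lines.hasNext) {
    None
  } else {
    val id = lines.next()
    require(id.startsWith("@"), s"unexpected fastq record didn't start with '@': $id")
    Some(FastqRecord(id, lines.next(), lines.next(), lines.next()))
  }
```

In the reader itself each appended line also advances `pos`, which is what the `pos >= end` split check and `getProgress` rely on.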
*/ - protected boolean next(Text value) throws IOException { + protected boolean next(final Text value) throws IOException { if (pos >= end) return false; // past end of slice try { Text readName1 = new Text(); Text readName2 = new Text(); - value.clear(); // first read of the pair - boolean gotData = lowLevelFastqRead(readName1, value); - - if (!gotData) + if (!lowLevelFastqRead(readName1, value)) { return false; + } // second read of the pair - gotData = lowLevelFastqRead(readName2, value); - - if (!gotData) - return false; + return lowLevelFastqRead(readName2, value); - return true; } catch (EOFException e) { throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage()); } @@ -123,12 +117,11 @@ protected boolean next(Text value) throws IOException { * @return Returns the interleaved FASTQ record reader. */ public RecordReader createRecordReader( - InputSplit genericSplit, - TaskAttemptContext context) throws IOException, InterruptedException { + final InputSplit genericSplit, + final TaskAttemptContext context) throws IOException, InterruptedException { context.setStatus(genericSplit.toString()); // cast as per example in TextInputFormat - return new InterleavedFastqRecordReader(context.getConfiguration(), - (FileSplit)genericSplit); + return new InterleavedFastqRecordReader(context.getConfiguration(), (FileSplit) genericSplit); } } diff --git a/src/main/java/org/bdgenomics/adam/io/SingleFastqInputFormat.java b/src/main/java/org/bdgenomics/adam/io/SingleFastqInputFormat.java index 762e91b57f..aa6a754b23 100644 --- a/src/main/java/org/bdgenomics/adam/io/SingleFastqInputFormat.java +++ b/src/main/java/org/bdgenomics/adam/io/SingleFastqInputFormat.java @@ -33,8 +33,8 @@ * This reader is based on the FastqInputFormat that's part of Hadoop-BAM, * found at https://github.com/HadoopGenomics/Hadoop-BAM/blob/master/src/main/java/org/seqdoop/hadoop_bam/FastqInputFormat.java */ -public class SingleFastqInputFormat extends FileInputFormat { - +public final class SingleFastqInputFormat extends FileInputFormat { + /** * A record reader for the standard FASTQ format. * @@ -61,7 +61,7 @@ private static class SingleFastqRecordReader extends FastqRecordReader { * @return Returns true if the buffer contains the first line of a properly * formatted pair of FASTQ records. */ - protected boolean checkBuffer(int bufferLength, Text buffer) { + protected boolean checkBuffer(final int bufferLength, final Text buffer) { return (bufferLength >= 0 && buffer.getBytes()[0] == '@'); } @@ -76,18 +76,15 @@ protected boolean checkBuffer(int bufferLength, Text buffer) { * middle of a read, or if we have a read that is incorrectly * formatted (missing readname delimiters). */ - protected boolean next(Text value) throws IOException { + protected boolean next(final Text value) throws IOException { if (pos >= end) return false; // past end of slice try { Text readName = new Text(); - value.clear(); // first read of the pair - boolean gotData = lowLevelFastqRead(readName, value); - - return gotData; + return lowLevelFastqRead(readName, value); } catch (EOFException e) { throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage()); } @@ -102,12 +99,11 @@ protected boolean next(Text value) throws IOException { * @return Returns the interleaved FASTQ record reader. 
*/ public RecordReader createRecordReader( - InputSplit genericSplit, - TaskAttemptContext context) throws IOException, InterruptedException { + final InputSplit genericSplit, + final TaskAttemptContext context) throws IOException, InterruptedException { context.setStatus(genericSplit.toString()); // cast as per example in TextInputFormat - return new SingleFastqRecordReader(context.getConfiguration(), - (FileSplit)genericSplit); + return new SingleFastqRecordReader(context.getConfiguration(), (FileSplit) genericSplit); } } diff --git a/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala b/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala index 79683c8826..aec39fc3e3 100644 --- a/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala +++ b/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala @@ -17,7 +17,6 @@ */ package org.bdgenomics.adam.converters -import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.formats.avro.{ Contig, NucleotideContigFragment } import scala.collection.mutable diff --git a/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala b/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala index 0defab2f36..44ac65a0ea 100644 --- a/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala +++ b/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala @@ -17,7 +17,6 @@ */ package org.bdgenomics.adam.converters -import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.formats.avro._ @@ -92,7 +91,7 @@ private[adam] object FragmentConverter extends Serializable { // are our fragments adjacent? val newLastFragment = if (lastRegion.isAdjacent(thisRegion)) { // if they are, merge the fragments - // we have sorted these fragments before we started, so the orientation is already known + // we have sorted these fragments before we started, so the strand is already known (lastRegion.hull(thisRegion), lastString + thisString) } else { // if they aren't, prepend the last fragment to the list diff --git a/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala b/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala index d54d3636f3..9e72c8d000 100644 --- a/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala +++ b/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala @@ -18,13 +18,12 @@ package org.bdgenomics.adam.converters import htsjdk.samtools.{ - CigarElement, SAMReadGroupRecord, SAMRecord, SAMUtils } import org.bdgenomics.utils.misc.Logging -import org.bdgenomics.adam.models.{ Attribute, TagType } +import org.bdgenomics.adam.models.Attribute import org.bdgenomics.adam.util.AttributeUtils import org.bdgenomics.formats.avro.AlignmentRecord import scala.collection.JavaConverters._ @@ -107,7 +106,7 @@ private[adam] class SAMRecordConverter extends Serializable with Logging { // set read alignment flag val start: Int = samRecord.getAlignmentStart assert(start != 0, "Start cannot equal 0 if contig is set.") - builder.setStart((start - 1)) + builder.setStart(start - 1L) // set OP and OC flags, if applicable if (samRecord.getAttribute("OP") != null) { @@ -152,7 +151,7 @@ private[adam] class SAMRecordConverter extends Serializable with Logging { val mateStart = samRecord.getMateAlignmentStart if (mateStart > 0) { // We subtract one here to be 0-based offset - 
builder.setMateAlignmentStart(mateStart - 1) + builder.setMateAlignmentStart(mateStart - 1L) } } @@ -188,7 +187,7 @@ private[adam] class SAMRecordConverter extends Serializable with Logging { var tags = List[Attribute]() val tlen = samRecord.getInferredInsertSize if (tlen != 0) { - builder.setInferredInsertSize(tlen) + builder.setInferredInsertSize(tlen.toLong) } if (samRecord.getAttributes != null) { samRecord.getAttributes.asScala.foreach { diff --git a/src/main/scala/org/bdgenomics/adam/converters/TranscriptEffectConverter.scala b/src/main/scala/org/bdgenomics/adam/converters/TranscriptEffectConverter.scala index 5e6eba7bd0..ec4101c84b 100644 --- a/src/main/scala/org/bdgenomics/adam/converters/TranscriptEffectConverter.scala +++ b/src/main/scala/org/bdgenomics/adam/converters/TranscriptEffectConverter.scala @@ -24,7 +24,6 @@ import org.bdgenomics.formats.avro.VariantAnnotationMessage._ import org.bdgenomics.formats.avro.{ TranscriptEffect, Variant, - VariantAnnotation, VariantAnnotationMessage } import org.bdgenomics.utils.misc.Logging @@ -165,7 +164,7 @@ private[adam] object TranscriptEffectConverter extends Serializable with Logging proteinPosition.foreach(te.setProteinPosition(_)) proteinLength.foreach(te.setProteinLength(_)) setIfNotEmpty(distance, (s: String) => te.setDistance(Integer.parseInt(s))) - if (!messages.isEmpty) te.setMessages(messages.asJava) + if (messages.nonEmpty) te.setMessages(messages.asJava) Seq(te.build()) } diff --git a/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala b/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala index 4b34b5c977..b613dc7f7c 100644 --- a/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala +++ b/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala @@ -80,4 +80,11 @@ object Timers extends Metrics { val WriteBAMRecord = timer("Write BAM Record") val WriteSAMRecord = timer("Write SAM Record") val WriteCRAMRecord = timer("Write CRAM Record") + + // org.bdgenomics.adam.rdd.TreeRegionJoin + val TreeJoin = timer("Running broadcast join with interval tree") + val BuildingTrees = timer("Building interval tree") + val SortingRightSide = timer("Sorting right side of join") + val GrowingTrees = timer("Growing forest of trees") + val RunningMapSideJoin = timer("Running map-side join") } diff --git a/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala b/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala index 2b91fcaac2..242195329d 100644 --- a/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala +++ b/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala @@ -18,7 +18,6 @@ package org.bdgenomics.adam.models import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.algorithms.consensus.Consensus import org.bdgenomics.adam.rdd.ADAMContext._ diff --git a/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala b/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala index c1a6523aef..693fab7968 100644 --- a/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala +++ b/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala @@ -40,7 +40,7 @@ private[adam] class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) ex // checks to all the methods below to make sure that 'endpoints' isn't empty. Also, it shouldn't // make any sense to have a set of non-overlapping regions for ... no regions. 
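The `setStart(start - 1L)`, `setMateAlignmentStart(mateStart - 1L)`, and `setInferredInsertSize(tlen.toLong)` changes in the SAMRecordConverter hunks above make the int-to-long widening explicit: htsjdk returns `Int` coordinates while the corresponding Avro fields are longs, so a `Long` literal (or `toLong`) performs the arithmetic in `Long` rather than leaving the conversion implicit at the call site. A small illustration with made-up values:

```scala
// Hypothetical values, only to show the widening behaviour.
val start: Int = 101          // 1-based SAM alignment start
val asInt: Int = start - 1    // Int arithmetic, stays Int
val asLong: Long = start - 1L // Long literal widens the whole expression to Long
assert(asInt.toLong == asLong)
```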
// Also, without this check, we can't tell which chromosome this NonoverlappingRegions object is for. - assert(regions.size > 0, "regions list must be non-empty") + assert(regions.nonEmpty, "regions list must be non-empty") assert(regions.head != null, "regions must have at least one non-null entry") /** @@ -235,7 +235,7 @@ private[adam] class MultiContigNonoverlappingRegions( * nonoverlapping-regions. Basically, reject the input value if its corresponding region is * completely outside the hull of all the input-set regions. * - * @param regionable The input value + * @param value The input value * @tparam U * @return a boolean -- the input value should only participate in the regionJoin if the return value * here is 'true'. diff --git a/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala b/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala index 029ab6e736..15d1bc750b 100644 --- a/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala +++ b/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala @@ -18,7 +18,6 @@ package org.bdgenomics.adam.models import htsjdk.samtools.SAMProgramRecord -import org.bdgenomics.adam.rdd.ADAMContext._ private[models] object ProgramRecord { diff --git a/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala b/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala index 080f30213a..9ccf15b409 100644 --- a/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala +++ b/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala @@ -116,10 +116,10 @@ object ReferencePosition extends Serializable { * * @param referenceName The name of the reference contig this locus exists on. * @param pos The position of this locus. - * @param orientation The strand that this locus is on. + * @param strand The strand that this locus is on. */ - def apply(referenceName: String, pos: Long, orientation: Strand): ReferencePosition = { - new ReferencePosition(referenceName, pos, orientation) + def apply(referenceName: String, pos: Long, strand: Strand): ReferencePosition = { + new ReferencePosition(referenceName, pos, strand) } } @@ -128,13 +128,13 @@ object ReferencePosition extends Serializable { * * @param referenceName The name of the reference contig this locus exists on. * @param pos The position of this locus. - * @param orientation The strand that this locus is on. + * @param strand The strand that this locus is on. 
*/ class ReferencePosition( override val referenceName: String, val pos: Long, - override val orientation: Strand = Strand.INDEPENDENT) - extends ReferenceRegion(referenceName, pos, pos + 1, orientation) + override val strand: Strand = Strand.INDEPENDENT) + extends ReferenceRegion(referenceName, pos, pos + 1, strand) class ReferencePositionSerializer extends Serializer[ReferencePosition] { private val enumValues = Strand.values() @@ -142,13 +142,13 @@ class ReferencePositionSerializer extends Serializer[ReferencePosition] { def write(kryo: Kryo, output: Output, obj: ReferencePosition) = { output.writeString(obj.referenceName) output.writeLong(obj.pos) - output.writeInt(obj.orientation.ordinal) + output.writeInt(obj.strand.ordinal) } def read(kryo: Kryo, input: Input, klazz: Class[ReferencePosition]): ReferencePosition = { val refName = input.readString() val pos = input.readLong() - val orientation = input.readInt() - new ReferencePosition(refName, pos, enumValues(orientation)) + val strand = input.readInt() + new ReferencePosition(refName, pos, enumValues(strand)) } } diff --git a/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala index 454cb6b5e8..ec57de1e41 100644 --- a/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala +++ b/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala @@ -20,7 +20,7 @@ package org.bdgenomics.adam.models import com.esotericsoftware.kryo.io.{ Input, Output } import com.esotericsoftware.kryo.{ Kryo, Serializer } import org.bdgenomics.formats.avro._ -import org.bdgenomics.utils.intervaltree.Interval +import org.bdgenomics.utils.intervalarray.Interval import org.hammerlab.genomics.reference.Region import scala.math.{ max, min } @@ -43,7 +43,7 @@ trait ReferenceOrdering[T <: ReferenceRegion] extends Ordering[T] { b: T): Int = { val rc = regionCompare(a, b) if (rc == 0) { - a.orientation.ordinal compare b.orientation.ordinal + a.strand.ordinal compare b.strand.ordinal } else { rc } @@ -99,12 +99,14 @@ object ReferenceRegion { * @param referenceName The name of the reference contig that this region is * on. * @param end The end position for this region. + * @param strand The strand of the genome that this region exists on. * @return Returns a reference region that goes from the start of a contig to * a user provided end point. */ def fromStart(referenceName: String, - end: Long): ReferenceRegion = { - ReferenceRegion(referenceName, 0L, end) + end: Long, + strand: Strand = Strand.INDEPENDENT): ReferenceRegion = { + ReferenceRegion(referenceName, 0L, end, strand = strand) } /** @@ -113,22 +115,26 @@ object ReferenceRegion { * @param referenceName The name of the reference contig that this region is * on. * @param start The start position for this region. + * @param strand The strand of the genome that this region exists on. * @return Returns a reference region that goes from a user provided starting * point to the end of a contig. */ def toEnd(referenceName: String, - start: Long): ReferenceRegion = { - ReferenceRegion(referenceName, start, Long.MaxValue) + start: Long, + strand: Strand = Strand.INDEPENDENT): ReferenceRegion = { + ReferenceRegion(referenceName, start, Long.MaxValue, strand = strand) } /** * Creates a reference region that covers the entirety of a contig. * * @param referenceName The name of the reference contig to cover. + * @param strand The strand of the genome that this region exists on. 
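The `ReferencePositionSerializer` above persists the strand as its enum ordinal (`output.writeInt(obj.strand.ordinal)`) and recovers it by indexing into the cached `Strand.values()` array. A hedged round-trip sketch of just that encoding step, outside of kryo:

```scala
import org.bdgenomics.formats.avro.Strand

// Strand is stored as an Int ordinal and restored by indexing Strand.values().
val enumValues = Strand.values()
val written = Strand.REVERSE.ordinal // what output.writeInt(...) persists
val restored = enumValues(written)   // what read(...) reconstructs
assert(restored == Strand.REVERSE)
```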
* @return Returns a reference region that covers the entirety of a contig. */ - def all(referenceName: String): ReferenceRegion = { - ReferenceRegion(referenceName, 0L, Long.MaxValue) + def all(referenceName: String, + strand: Strand = Strand.INDEPENDENT): ReferenceRegion = { + ReferenceRegion(referenceName, 0L, Long.MaxValue, strand = strand) } /** @@ -140,7 +146,7 @@ object ReferenceRegion { */ def opt(record: AlignmentRecord): Option[ReferenceRegion] = { if (record.getReadMapped) { - Some(apply(record)) + Some(unstranded(record)) } else { None } @@ -153,7 +159,9 @@ object ReferenceRegion { * @return The site where this genotype lives. */ def apply(genotype: Genotype): ReferenceRegion = { - ReferenceRegion(genotype.getVariant) + ReferenceRegion(genotype.getContigName, + genotype.getStart, + genotype.getEnd) } /** @@ -176,22 +184,28 @@ object ReferenceRegion { ReferenceRegion(annotation.getVariant) } + private def checkRead(record: AlignmentRecord) { + require(record.getReadMapped, + "Cannot build reference region for unmapped read %s.".format(record)) + require(record.getContigName != null && + record.getStart != null && + record.getEnd != null, + "Read %s contains required fields that are null.".format(record)) + } + /** - * Builds a reference region for an aligned read. + * Builds a reference region with independent strand for an aligned read. * * @throws IllegalArgumentException If this read is not aligned or alignment * data is null. * * @param record The read to extract the reference region from. * @return Returns the reference region covered by this read's alignment. + * + * @see stranded */ - def apply(record: AlignmentRecord): ReferenceRegion = { - require(record.getReadMapped, - "Cannot build reference region for unmapped read %s.".format(record)) - require(record.getContigName != null && - record.getStart != null && - record.getEnd != null, - "Read %s contains required fields that are null.".format(record)) + def unstranded(record: AlignmentRecord): ReferenceRegion = { + checkRead(record) ReferenceRegion(record.getContigName, record.getStart, record.getEnd) } @@ -215,13 +229,44 @@ object ReferenceRegion { else None + /** + * Builds a reference region for an aligned read with strand set. + * + * @throws IllegalArgumentException If this read is not aligned, alignment + * data is null, or strand is not set. + * + * @param record The read to extract the reference region from. + * @return Returns the reference region covered by this read's alignment. + * + * @see unstranded + */ + def stranded(record: AlignmentRecord): ReferenceRegion = { + checkRead(record) + + val strand = Option(record.getReadNegativeStrand) + .map(b => b: Boolean) match { + case None => throw new IllegalArgumentException( + "Alignment strand not set for %s".format(record)) + case Some(true) => Strand.REVERSE + case Some(false) => Strand.FORWARD + } + new ReferenceRegion(record.getContigName, + record.getStart, + record.getEnd, + strand = strand) + } + /** * Generates a region from a given position -- the region will have a length of 1. * @param pos The position to convert * @return A 1-wide region at the same location as pos */ - def apply(pos: ReferencePosition): ReferenceRegion = - ReferenceRegion(pos.referenceName, pos.pos, pos.pos + 1) + def apply(pos: ReferencePosition): ReferenceRegion = { + ReferenceRegion(pos.referenceName, + pos.pos, + pos.pos + 1, + strand = pos.strand) + } /** * Generates a reference region from assembly data. 
Returns None if the assembly does not @@ -245,16 +290,47 @@ object ReferenceRegion { } } + private def checkFeature(record: Feature) { + require(record.getContigName != null && + record.getStart != null && + record.getEnd != null, + "Feature %s contains required fields that are null.".format(record)) + } + /** - * Builds a reference region for a feature. + * Builds a reference region for a feature without strand information. * - * @param feature Feature to extract ReferenceRegion from + * @param feature Feature to extract ReferenceRegion from. * @return Extracted ReferenceRegion + * + * @see stranded */ - def apply(feature: Feature): ReferenceRegion = { + def unstranded(feature: Feature): ReferenceRegion = { + checkFeature(feature) new ReferenceRegion(feature.getContigName, feature.getStart, feature.getEnd) } + /** + * Builds a reference region for a feature with strand set. + * + * @param feature Feature to extract ReferenceRegion from. + * @return Extracted ReferenceRegion + * + * @throws IllegalArgumentException Throws an exception if the strand is null + * in the provided feature. + * + * @see unstranded + */ + def stranded(feature: Feature): ReferenceRegion = { + checkFeature(feature) + require(feature.getStrand != null, + "Strand is not defined in feature %s.".format(feature)) + new ReferenceRegion(feature.getContigName, + feature.getStart, + feature.getEnd, + strand = feature.getStrand) + } + /** * Builds a reference region for a coverage site. * @@ -274,18 +350,19 @@ object ReferenceRegion { * @param end The 0-based residue-coordinate for the first residue after the start * which is not in the region -- i.e. [start, end) define a 0-based * half-open interval. + * @param strand The strand of the genome that this region exists on. */ case class ReferenceRegion( referenceName: String, start: Long, end: Long, - orientation: Strand = Strand.INDEPENDENT) + strand: Strand = Strand.INDEPENDENT) extends Comparable[ReferenceRegion] - with Interval { + with Interval[ReferenceRegion] { assert(start >= 0 && end >= start, "Failed when trying to create region %s %d %d on %s strand.".format( - referenceName, start, end, orientation)) + referenceName, start, end, strand)) /** * @return Returns a copy of this reference region that is on the independent @@ -331,7 +408,7 @@ case class ReferenceRegion( * @see merge */ def hull(region: ReferenceRegion): ReferenceRegion = { - assert(orientation == region.orientation, "Cannot compute convex hull of differently oriented regions.") + assert(strand == region.strand, "Cannot compute convex hull of differently oriented regions.") assert(referenceName == region.referenceName, "Cannot compute convex hull of regions on different references.") ReferenceRegion(referenceName, min(start, region.start), max(end, region.end)) } @@ -361,7 +438,7 @@ case class ReferenceRegion( * the point is not in our reference space, we return an empty option. */ def distance(other: ReferenceRegion): Option[Long] = { - if (referenceName == other.referenceName && orientation == other.orientation) { + if (referenceName == other.referenceName && strand == other.strand) { if (overlaps(other)) { Some(0) } else if (other.start >= end) { @@ -399,7 +476,7 @@ case class ReferenceRegion( new ReferenceRegion(referenceName, start - byStart, end + byEnd, - orientation) + strand) } /** @@ -409,7 +486,7 @@ case class ReferenceRegion( * @return True if the position is within our region. 
*/ def contains(other: ReferencePosition): Boolean = { - orientation == other.orientation && + strand == other.strand && referenceName == other.referenceName && start <= other.pos && end > other.pos } @@ -421,11 +498,23 @@ case class ReferenceRegion( * @return True if the region is wholly contained within our region. */ def contains(other: ReferenceRegion): Boolean = { - orientation == other.orientation && + strand == other.strand && referenceName == other.referenceName && start <= other.start && end >= other.end } + /** + * Checks if our region overlaps (wholly or partially) another region, + * independent of strand. + * + * @param other The region to compare against. + * @return True if any section of the two regions overlap. + */ + def covers(other: ReferenceRegion): Boolean = { + referenceName == other.referenceName && + end > other.start && start < other.end + } + /** * Checks if our region overlaps (wholly or partially) another region. * @@ -433,7 +522,7 @@ case class ReferenceRegion( * @return True if any section of the two regions overlap. */ def overlaps(other: ReferenceRegion): Boolean = { - orientation == other.orientation && + strand == other.strand && referenceName == other.referenceName && end > other.start && start < other.end } @@ -460,7 +549,7 @@ case class ReferenceRegion( result = 41 * result + (if (referenceName != null) referenceName.hashCode else 0) result = 41 * result + start.hashCode result = 41 * result + end.hashCode - result = 41 * result + (if (orientation != null) orientation.ordinal() else 0) + result = 41 * result + (if (strand != null) strand.ordinal() else 0) result } } @@ -472,14 +561,14 @@ class ReferenceRegionSerializer extends Serializer[ReferenceRegion] { output.writeString(obj.referenceName) output.writeLong(obj.start) output.writeLong(obj.end) - output.writeInt(obj.orientation.ordinal()) + output.writeInt(obj.strand.ordinal()) } def read(kryo: Kryo, input: Input, klazz: Class[ReferenceRegion]): ReferenceRegion = { val referenceName = input.readString() val start = input.readLong() val end = input.readLong() - val orientation = input.readInt() - new ReferenceRegion(referenceName, start, end, enumValues(orientation)) + val strand = input.readInt() + new ReferenceRegion(referenceName, start, end, enumValues(strand)) } } diff --git a/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala b/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala index ecf9d7bfc0..5aea05443b 100644 --- a/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala +++ b/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala @@ -17,7 +17,7 @@ */ package org.bdgenomics.adam.models -import htsjdk.samtools.{ SAMFileHeader, SAMProgramRecord } +import htsjdk.samtools.SAMFileHeader import scala.collection.JavaConversions._ private[adam] object SAMFileHeaderWritable { diff --git a/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala b/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala index 79d4d321f0..41eb4d8b95 100644 --- a/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala +++ b/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala @@ -23,6 +23,7 @@ import org.bdgenomics.formats.avro.{ Contig, NucleotideContigFragment } import org.hammerlab.genomics.reference.ContigLengths import scala.collection.JavaConversions._ +import scala.collection._ /** * Singleton object for creating SequenceDictionaries. 
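The new `covers` method added to `ReferenceRegion` above is the strand-independent counterpart of `overlaps`: both compare contig name and coordinates, but `overlaps` (like `contains`) also requires the strands to match. A small example of the difference, assuming `ReferenceRegion` and the Avro `Strand` enum are in scope (example coordinates only):

```scala
import org.bdgenomics.formats.avro.Strand

val fwd = ReferenceRegion("chr1", 100L, 200L, strand = Strand.FORWARD)
val rev = ReferenceRegion("chr1", 150L, 250L, strand = Strand.REVERSE)

fwd.covers(rev)   // true: the coordinate ranges intersect, strand is ignored
fwd.overlaps(rev) // false: the ranges intersect, but the strands differ
```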
@@ -119,7 +120,7 @@ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializab private val hasSequenceOrdering = records.forall(_.referenceIndex.isDefined) - def contigLengths: ContigLengths = byName.mapValues(_.length) + def contigLengths: ContigLengths = byName.mapValues(_.length).toMap /** * @param that Sequence dictionary to compare against. diff --git a/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala index ff2de50935..4c7312e8a4 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala @@ -144,7 +144,7 @@ private class FileFilter(private val name: String) extends PathFilter { * * @param sc The SparkContext to wrap. */ -class ADAMContext private (@transient val sc: SparkContext) extends Serializable with Logging { +class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging { /** * @param samHeader The header to extract a sequence dictionary from. diff --git a/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala b/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala index 7c79894bb9..81f5783950 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala @@ -17,20 +17,17 @@ */ package org.bdgenomics.adam.rdd -import java.util.logging.Level import java.io.OutputStream import org.apache.avro.Schema import org.apache.avro.file.DataFileWriter import org.apache.avro.generic.IndexedRecord import org.apache.avro.specific.{ SpecificDatumWriter, SpecificRecordBase } -import org.apache.hadoop.fs.{ FileSystem, Path } +import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{ OutputFormat => NewOutputFormat } import org.apache.spark.SparkContext import org.apache.spark.rdd.MetricsContext._ import org.apache.spark.rdd.{ InstrumentedOutputFormat, RDD } import org.bdgenomics.adam.instrumentation.Timers._ -import org.bdgenomics.adam.models._ -import org.bdgenomics.adam.util.ParquetLogger import org.bdgenomics.utils.cli.SaveArgs import org.bdgenomics.utils.misc.HadoopUtil import org.bdgenomics.utils.misc.Logging diff --git a/src/main/scala/org/bdgenomics/adam/rdd/BroadcastRegionJoin.scala b/src/main/scala/org/bdgenomics/adam/rdd/BroadcastRegionJoin.scala deleted file mode 100644 index 28ae7174af..0000000000 --- a/src/main/scala/org/bdgenomics/adam/rdd/BroadcastRegionJoin.scala +++ /dev/null @@ -1,190 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.bdgenomics.adam.rdd - -import org.apache.spark.SparkContext._ -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.models.{ - MultiContigNonoverlappingRegions, - ReferenceRegion -} -import scala.Predef._ -import scala.reflect.ClassTag - -/** - * Contains multiple implementations of a 'region join', an operation that joins two sets of - * regions based on the spatial overlap between the regions. - * - * Different implementations will have different performance characteristics -- and new implementations - * will likely be added in the future, see the notes to each individual method for more details. - */ -sealed trait BroadcastRegionJoin[T, U, RT] extends RegionJoin[T, U, RT, U] { - - /** - * Performs a region join between two RDDs (broadcast join). - * - * This implementation first _collects_ the left-side RDD; therefore, if the left-side RDD is large - * or otherwise idiosyncratic in a spatial sense (i.e. contains a set of regions whose unions overlap - * a significant fraction of the genome) then the performance of this implementation will likely be - * quite bad. - * - * Once the left-side RDD is collected, its elements are reduced to their distinct unions; - * these can then be used to define the partitions over which the region-join will be computed. - * - * The regions in the left-side are keyed by their corresponding partition (each such region should have - * exactly one partition). The regions in the right-side are also keyed by their corresponding partitions - * (here there can be more than one partition for a region, since a region may cross the boundaries of - * the partitions defined by the left-side). - * - * Finally, within each separate partition, we essentially perform a cartesian-product-and-filter - * operation. The result is the region-join. - * - * @param baseRDD The 'left' side of the join - * @param joinedRDD The 'right' side of the join - * @param tManifest implicit type of baseRDD - * @param uManifest implicit type of joinedRDD - * @tparam T type of baseRDD - * @tparam U type of joinedRDD - * @return An RDD of pairs (x, y), where x is from baseRDD, y is from joinedRDD, and the region - * corresponding to x overlaps the region corresponding to y. - */ - def partitionAndJoin( - baseRDD: RDD[(ReferenceRegion, T)], - joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], - uManifest: ClassTag[U]): RDD[(RT, U)] = { - - val sc = baseRDD.context - - /** - * Original Join Design: - * - * Parameters: - * (1) f : (Range, Range) => T // an aggregation function - * (2) a : RDD[Range] - * (3) b : RDD[Range] - * - * Return type: RDD[(Range,T)] - * - * Algorithm: - * 1. a.collect() (where a is smaller than b) - * 2. build a non-overlapping partition on a - * 3. ak = a.map( v => (partition(v), v) ) - * 4. bk = b.flatMap( v => partitions(v).map( i=>(i,v) ) ) - * 5. joined = ak.join(bk).filter( (i, (r1, r2)) => r1.overlaps(r2) ).map( (i, (r1,r2))=>(r1, r2) ) - * 6. return: joined.reduceByKey(f) - * - * Ways in which we've generalized this plan: - * - removed the aggregation step altogether - * - carry a sequence dictionary through the computation. - */ - - // First, we group the regions in the left side of the join by their referenceName, - // and collect them. 
- val collectedLeft: Seq[(String, Iterable[ReferenceRegion])] = - baseRDD - .map(_._1) // RDD[ReferenceRegion] - .keyBy(_.referenceName) // RDD[(String,ReferenceRegion)] - .groupByKey() // RDD[(String,Seq[ReferenceRegion])] - .collect() // Iterable[(String,Seq[ReferenceRegion])] - .toSeq // Seq[(String,Seq[ReferenceRegion])] - - // Next, we turn that into a data structure that reduces those regions to their non-overlapping - // pieces, which we will use as a partition. - val multiNonOverlapping = new MultiContigNonoverlappingRegions(collectedLeft) - - // Then, we broadcast those partitions -- this will be the function that allows us to - // partition all the regions on the right side of the join. - val regions = sc.broadcast(multiNonOverlapping) - - // each element of the left-side RDD should have exactly one partition. - val smallerKeyed: RDD[(ReferenceRegion, (ReferenceRegion, T))] = - baseRDD.map(t => (regions.value.regionsFor(t).head, t)) - - // each element of the right-side RDD may have 0, 1, or more than 1 corresponding partition. - val largerKeyed: RDD[(ReferenceRegion, (ReferenceRegion, U))] = - joinedRDD.flatMap(t => regionsFor(t, regions).map((r: ReferenceRegion) => (r, t))) - - joinAndFilterFn(smallerKeyed, largerKeyed) - } - - protected def regionsFor(u: (ReferenceRegion, U), - regions: Broadcast[MultiContigNonoverlappingRegions]): Iterable[ReferenceRegion] - - protected def joinAndFilterFn(tRdd: RDD[(ReferenceRegion, (ReferenceRegion, T))], - uRdd: RDD[(ReferenceRegion, (ReferenceRegion, U))]): RDD[(RT, U)] -} - -/** - * Extends the BroadcastRegionJoin trait to implement an inner join. - */ -case class InnerBroadcastRegionJoin[T, U]() extends BroadcastRegionJoin[T, U, T] { - - protected def joinAndFilterFn(tRdd: RDD[(ReferenceRegion, (ReferenceRegion, T))], - uRdd: RDD[(ReferenceRegion, (ReferenceRegion, U))]): RDD[(T, U)] = { - // this is (essentially) performing a cartesian product within each partition... - val joined: RDD[(ReferenceRegion, ((ReferenceRegion, T), (ReferenceRegion, U)))] = - tRdd.join(uRdd) - - // ... so we need to filter the final pairs to make sure they're overlapping. - joined.flatMap(kv => { - val (_, (t: (ReferenceRegion, T), u: (ReferenceRegion, U))) = kv - - if (t._1.overlaps(u._1)) { - Some((t._2, u._2)) - } else { - None - } - }) - } - - protected def regionsFor(u: (ReferenceRegion, U), - regions: Broadcast[MultiContigNonoverlappingRegions]): Iterable[ReferenceRegion] = { - regions.value.regionsFor(u) - } -} - -/** - * Extends the BroadcastRegionJoin trait to implement a right outer join. - */ -case class RightOuterBroadcastRegionJoin[T, U]() extends BroadcastRegionJoin[T, U, Option[T]] { - - protected def joinAndFilterFn(tRdd: RDD[(ReferenceRegion, (ReferenceRegion, T))], - uRdd: RDD[(ReferenceRegion, (ReferenceRegion, U))]): RDD[(Option[T], U)] = { - // this is (essentially) performing a cartesian product within each partition... - val joined: RDD[(ReferenceRegion, (Option[(ReferenceRegion, T)], (ReferenceRegion, U)))] = - tRdd.rightOuterJoin(uRdd) - - // ... so we need to filter the final pairs to make sure they're overlapping. 
- joined.map(kv => { - val (_, (optT: Option[(ReferenceRegion, T)], u: (ReferenceRegion, U))) = kv - - (optT.filter(t => t._1.overlaps(u._1)).map(_._2), u._2) - }) - } - - protected def regionsFor(u: (ReferenceRegion, U), - regions: Broadcast[MultiContigNonoverlappingRegions]): Iterable[ReferenceRegion] = { - val reg = regions.value.regionsFor(u) - if (reg.isEmpty) { - Iterable(u._1) - } else { - reg - } - } -} diff --git a/src/main/scala/org/bdgenomics/adam/rdd/FileMerger.scala b/src/main/scala/org/bdgenomics/adam/rdd/FileMerger.scala index 9c8d6b4540..977957d51a 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/FileMerger.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/FileMerger.scala @@ -187,7 +187,7 @@ object FileMerger extends Logging { // finish the file off if (writeEmptyGzipBlock) { - os.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); + os.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK) } else if (writeCramEOF) { CramIO.issueEOF(CramVersions.DEFAULT_CRAM_VERSION, os) } diff --git a/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala b/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala index 695af7ba09..2f41524c25 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala @@ -396,11 +396,12 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { * @return Returns a new genomic RDD containing all pairs of keys that * overlapped in the genomic coordinate space. */ - def broadcastRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(T, X), Z]](genomicRdd: GenomicRDD[X, Y])( - implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(T, X), Z] = { + def broadcastRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(T, X), Z]]( + genomicRdd: GenomicRDD[X, Y])( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(T, X), Z] = { // key the RDDs and join - GenericGenomicRDD[(T, X)](InnerBroadcastRegionJoin[T, X]().partitionAndJoin(flattenRddByRegions(), + GenericGenomicRDD[(T, X)](InnerTreeRegionJoin[T, X]().partitionAndJoin(flattenRddByRegions(), genomicRdd.flattenRddByRegions()), sequences ++ genomicRdd.sequences, kv => { getReferenceRegions(kv._1) ++ genomicRdd.getReferenceRegions(kv._2) }) @@ -423,11 +424,12 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { * overlapped in the genomic coordinate space, and all keys from the * right RDD that did not overlap a key in the left RDD. */ - def rightOuterBroadcastRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Option[T], X), Z]](genomicRdd: GenomicRDD[X, Y])( - implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Option[T], X), Z] = { + def rightOuterBroadcastRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Option[T], X), Z]]( + genomicRdd: GenomicRDD[X, Y])( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Option[T], X), Z] = { // key the RDDs and join - GenericGenomicRDD[(Option[T], X)](RightOuterBroadcastRegionJoin[T, X]().partitionAndJoin(flattenRddByRegions(), + GenericGenomicRDD[(Option[T], X)](RightOuterTreeRegionJoin[T, X]().partitionAndJoin(flattenRddByRegions(), genomicRdd.flattenRddByRegions()), sequences ++ genomicRdd.sequences, kv => { @@ -437,6 +439,96 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { .asInstanceOf[GenomicRDD[(Option[T], X), Z]] } + /** + * Computes the partition size and final sequence dictionary for a join. + * + * @param optPartitions Optional user-requested number of partitions for the + * end of the shuffle. + * @param genomicRdd The genomic RDD we are joining against. 
+ * @return Returns a tuple containing the (partition size, final sequence + * dictionary after the join). + */ + private[rdd] def joinPartitionSizeAndSequences[X, Y <: GenomicRDD[X, Y]]( + optPartitions: Option[Int], + genomicRdd: GenomicRDD[X, Y]): (Long, SequenceDictionary) = { + + require(!(sequences.isEmpty && genomicRdd.sequences.isEmpty), + "Both RDDs at input to join have an empty sequence dictionary!") + + // what sequences do we wind up with at the end? + val finalSequences = sequences ++ genomicRdd.sequences + + // did the user provide a set partition count? + // if no, take the max partition count from our rdds + val estPartitions = optPartitions.getOrElse(Seq(rdd.partitions.length, + genomicRdd.rdd.partitions.length).max) + + // if the user provides too high of a partition count, the estimated number + // of partitions can go to 0 + val partitions = if (estPartitions >= 1) { + estPartitions + } else { + 1 + } + + (finalSequences.records.map(_.length).sum / partitions, + finalSequences) + } + + /** + * Performs a broadcast inner join between this RDD and another RDD. + * + * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * and broadcast to all the nodes in the cluster. The key equality function + * used for this join is the reference region overlap function. Since this + * is an inner join, all values who do not overlap a value from the other + * RDD are dropped. + * + * @param genomicRdd The right RDD in the join. + * @return Returns a new genomic RDD containing all pairs of keys that + * overlapped in the genomic coordinate space. + */ + def broadcastRegionJoinAndGroupByRight[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Iterable[T], X), Z]](genomicRdd: GenomicRDD[X, Y])( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Iterable[T], X), Z] = { + + // key the RDDs and join + GenericGenomicRDD[(Iterable[T], X)](InnerTreeRegionJoinAndGroupByRight[T, X]().partitionAndJoin(flattenRddByRegions(), + genomicRdd.flattenRddByRegions()), + sequences ++ genomicRdd.sequences, + kv => { (kv._1.flatMap(getReferenceRegions) ++ genomicRdd.getReferenceRegions(kv._2)).toSeq }) + .asInstanceOf[GenomicRDD[(Iterable[T], X), Z]] + } + + /** + * Performs a broadcast right outer join between this RDD and another RDD. + * + * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * and broadcast to all the nodes in the cluster. The key equality function + * used for this join is the reference region overlap function. Since this + * is a right outer join, all values in the left RDD that do not overlap a + * value from the right RDD are dropped. If a value from the right RDD does + * not overlap any values in the left RDD, it will be paired with a `None` + * in the product of the join. + * + * @param genomicRdd The right RDD in the join. + * @return Returns a new genomic RDD containing all pairs of keys that + * overlapped in the genomic coordinate space, and all keys from the + * right RDD that did not overlap a key in the left RDD. 
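The new `joinPartitionSizeAndSequences` helper above turns the requested (or default) partition count into a target partition size in bases: the summed lengths of the merged sequence dictionary divided by the clamped partition count, which is what the shuffle joins below now receive instead of a raw partition count. A rough worked example with made-up numbers:

```scala
// Hypothetical numbers, only to illustrate the arithmetic.
val totalLength: Long = 3200000000L     // sum of record lengths in the merged dictionary
val estPartitions = Seq(512, 256).max   // max of the two RDDs' partition counts
val partitions = if (estPartitions >= 1) estPartitions else 1
val partitionSize = totalLength / partitions // 6250000 bases of reference per partition
```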
+ */ + def rightOuterBroadcastRegionJoinAndGroupByRight[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Iterable[T], X), Z]](genomicRdd: GenomicRDD[X, Y])( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Iterable[T], X), Z] = { + + // key the RDDs and join + GenericGenomicRDD[(Iterable[T], X)](RightOuterTreeRegionJoinAndGroupByRight[T, X]().partitionAndJoin(flattenRddByRegions(), + genomicRdd.flattenRddByRegions()), + sequences ++ genomicRdd.sequences, + kv => { + Seq(kv._1.map(v => getReferenceRegions(v))).flatten.flatten ++ + genomicRdd.getReferenceRegions(kv._2) + }) + .asInstanceOf[GenomicRDD[(Iterable[T], X), Z]] + } + /** * Performs a sort-merge inner join between this RDD and another RDD. * @@ -450,22 +542,18 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { * @return Returns a new genomic RDD containing all pairs of keys that * overlapped in the genomic coordinate space. */ - def shuffleRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(T, X), Z]](genomicRdd: GenomicRDD[X, Y], - optPartitions: Option[Int] = None)( - implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(T, X), Z] = { + def shuffleRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(T, X), Z]]( + genomicRdd: GenomicRDD[X, Y], + optPartitions: Option[Int] = None)( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(T, X), Z] = { - // did the user provide a set partition count? - // if no, take the max partition count from our rdds - val partitions = optPartitions.getOrElse(Seq(rdd.partitions.length, - genomicRdd.rdd.partitions.length).max) - - // what sequences do we wind up with at the end? - val endSequences = sequences ++ genomicRdd.sequences + val (partitionSize, endSequences) = joinPartitionSizeAndSequences(optPartitions, + genomicRdd) // key the RDDs and join GenericGenomicRDD[(T, X)]( InnerShuffleRegionJoin[T, X](endSequences, - partitions, + partitionSize, rdd.context).partitionAndJoin(flattenRddByRegions(), genomicRdd.flattenRddByRegions()), endSequences, @@ -489,22 +577,18 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { * overlapped in the genomic coordinate space, and all keys from the * right RDD that did not overlap a key in the left RDD. */ - def rightOuterShuffleRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Option[T], X), Z]](genomicRdd: GenomicRDD[X, Y], - optPartitions: Option[Int] = None)( - implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Option[T], X), Z] = { + def rightOuterShuffleRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Option[T], X), Z]]( + genomicRdd: GenomicRDD[X, Y], + optPartitions: Option[Int] = None)( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Option[T], X), Z] = { - // did the user provide a set partition count? - // if no, take the max partition count from our rdds - val partitions = optPartitions.getOrElse(Seq(rdd.partitions.length, - genomicRdd.rdd.partitions.length).max) - - // what sequences do we wind up with at the end? - val endSequences = sequences ++ genomicRdd.sequences + val (partitionSize, endSequences) = joinPartitionSizeAndSequences(optPartitions, + genomicRdd) // key the RDDs and join GenericGenomicRDD[(Option[T], X)]( RightOuterShuffleRegionJoin[T, X](endSequences, - partitions, + partitionSize, rdd.context).partitionAndJoin(flattenRddByRegions(), genomicRdd.flattenRddByRegions()), endSequences, @@ -531,22 +615,18 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { * overlapped in the genomic coordinate space, and all keys from the * left RDD that did not overlap a key in the right RDD. 
*/ - def leftOuterShuffleRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(T, Option[X]), Z]](genomicRdd: GenomicRDD[X, Y], - optPartitions: Option[Int] = None)( - implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(T, Option[X]), Z] = { + def leftOuterShuffleRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(T, Option[X]), Z]]( + genomicRdd: GenomicRDD[X, Y], + optPartitions: Option[Int] = None)( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(T, Option[X]), Z] = { - // did the user provide a set partition count? - // if no, take the max partition count from our rdds - val partitions = optPartitions.getOrElse(Seq(rdd.partitions.length, - genomicRdd.rdd.partitions.length).max) - - // what sequences do we wind up with at the end? - val endSequences = sequences ++ genomicRdd.sequences + val (partitionSize, endSequences) = joinPartitionSizeAndSequences(optPartitions, + genomicRdd) // key the RDDs and join GenericGenomicRDD[(T, Option[X])]( LeftOuterShuffleRegionJoin[T, X](endSequences, - partitions, + partitionSize, rdd.context).partitionAndJoin(flattenRddByRegions(), genomicRdd.flattenRddByRegions()), endSequences, @@ -572,22 +652,18 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { * overlapped in the genomic coordinate space, and values that did not * overlap will be paired with a `None`. */ - def fullOuterShuffleRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Option[T], Option[X]), Z]](genomicRdd: GenomicRDD[X, Y], - optPartitions: Option[Int] = None)( - implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Option[T], Option[X]), Z] = { - - // did the user provide a set partition count? - // if no, take the max partition count from our rdds - val partitions = optPartitions.getOrElse(Seq(rdd.partitions.length, - genomicRdd.rdd.partitions.length).max) + def fullOuterShuffleRegionJoin[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Option[T], Option[X]), Z]]( + genomicRdd: GenomicRDD[X, Y], + optPartitions: Option[Int] = None)( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Option[T], Option[X]), Z] = { - // what sequences do we wind up with at the end? - val endSequences = sequences ++ genomicRdd.sequences + val (partitionSize, endSequences) = joinPartitionSizeAndSequences(optPartitions, + genomicRdd) // key the RDDs and join GenericGenomicRDD[(Option[T], Option[X])]( FullOuterShuffleRegionJoin[T, X](endSequences, - partitions, + partitionSize, rdd.context).partitionAndJoin(flattenRddByRegions(), genomicRdd.flattenRddByRegions()), endSequences, @@ -614,22 +690,18 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { * overlapped in the genomic coordinate space, grouped together by * the value they overlapped in the left RDD.. */ - def shuffleRegionJoinAndGroupByLeft[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(T, Iterable[X]), Z]](genomicRdd: GenomicRDD[X, Y], - optPartitions: Option[Int] = None)( - implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(T, Iterable[X]), Z] = { - - // did the user provide a set partition count? - // if no, take the max partition count from our rdds - val partitions = optPartitions.getOrElse(Seq(rdd.partitions.length, - genomicRdd.rdd.partitions.length).max) + def shuffleRegionJoinAndGroupByLeft[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(T, Iterable[X]), Z]]( + genomicRdd: GenomicRDD[X, Y], + optPartitions: Option[Int] = None)( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(T, Iterable[X]), Z] = { - // what sequences do we wind up with at the end? 
- val endSequences = sequences ++ genomicRdd.sequences + val (partitionSize, endSequences) = joinPartitionSizeAndSequences(optPartitions, + genomicRdd) // key the RDDs and join GenericGenomicRDD[(T, Iterable[X])]( InnerShuffleRegionJoinAndGroupByLeft[T, X](endSequences, - partitions, + partitionSize, rdd.context).partitionAndJoin(flattenRddByRegions(), genomicRdd.flattenRddByRegions()), endSequences, @@ -658,22 +730,18 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { * the value they overlapped in the left RDD, and all values from the * right RDD that did not overlap an item in the left RDD. */ - def rightOuterShuffleRegionJoinAndGroupByLeft[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Option[T], Iterable[X]), Z]](genomicRdd: GenomicRDD[X, Y], - optPartitions: Option[Int] = None)( - implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Option[T], Iterable[X]), Z] = { + def rightOuterShuffleRegionJoinAndGroupByLeft[X, Y <: GenomicRDD[X, Y], Z <: GenomicRDD[(Option[T], Iterable[X]), Z]]( + genomicRdd: GenomicRDD[X, Y], + optPartitions: Option[Int] = None)( + implicit tTag: ClassTag[T], xTag: ClassTag[X]): GenomicRDD[(Option[T], Iterable[X]), Z] = { - // did the user provide a set partition count? - // if no, take the max partition count from our rdds - val partitions = optPartitions.getOrElse(Seq(rdd.partitions.length, - genomicRdd.rdd.partitions.length).max) - - // what sequences do we wind up with at the end? - val endSequences = sequences ++ genomicRdd.sequences + val (partitionSize, endSequences) = joinPartitionSizeAndSequences(optPartitions, + genomicRdd) // key the RDDs and join GenericGenomicRDD[(Option[T], Iterable[X])]( RightOuterShuffleRegionJoinAndGroupByLeft[T, X](endSequences, - partitions, + partitionSize, rdd.context).partitionAndJoin(flattenRddByRegions(), genomicRdd.flattenRddByRegions()), endSequences, diff --git a/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala b/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala index ffa64ed849..9a2e956367 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala @@ -17,11 +17,8 @@ */ package org.bdgenomics.adam.rdd -import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } +import org.bdgenomics.adam.models.ReferenceRegion import org.apache.spark.rdd.RDD -import org.apache.spark.SparkContext._ -import scala.Predef._ -import org.apache.spark.SparkContext import scala.reflect.ClassTag /** @@ -35,7 +32,7 @@ import scala.reflect.ClassTag * @tparam RU The type of data yielded by the right RDD at the output of the * join. */ -trait RegionJoin[T, U, RT, RU] { +trait RegionJoin[T, U, RT, RU] extends Serializable { /** * Performs a region join between two RDDs. 
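The shuffle joins above now hand the ShuffleRegionJoin implementations a genomic bin width rather than a raw partition count: joinPartitionSizeAndSequences divides the total length of the merged sequence dictionary by the larger of the two RDDs' partition counts (or by the caller-supplied optPartitions). A minimal, self-contained sketch of that arithmetic, using made-up contig lengths and partition counts rather than values taken from this patch:

object PartitionSizeSketch extends App {
  // Hypothetical contig lengths in bases, standing in for a merged SequenceDictionary.
  val contigLengths = Seq(248956422L, 242193529L, 198295559L)

  // Partition counts of the two RDDs being joined; the max is used when the
  // caller does not pass an explicit partition count.
  val leftPartitions = 128
  val rightPartitions = 512
  val estPartitions = Seq(leftPartitions, rightPartitions).max

  // Clamp to at least one partition, mirroring the guard in the helper.
  val partitions = math.max(estPartitions, 1)

  // Genomic span (in bases) that each partition of the shuffle join covers.
  val partitionSize = contigLengths.sum / partitions

  println(s"$partitions partitions, ~$partitionSize bases per partition")
  // 512 partitions, ~1346573 bases per partition
}

That bin width is what InnerShuffleRegionJoin, LeftOuterShuffleRegionJoin, RightOuterShuffleRegionJoin and FullOuterShuffleRegionJoin now receive as their second constructor argument in place of the old partition count.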
diff --git a/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala b/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala index 0869515de5..f63228a4dc 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala @@ -17,8 +17,8 @@ */ package org.bdgenomics.adam.rdd -import org.apache.spark.SparkContext._ import org.apache.spark.{ Partitioner, SparkContext } +import org.apache.spark.rdd.MetricsContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.ReferenceRegion._ import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } @@ -240,7 +240,7 @@ case class RightOuterShuffleRegionJoinAndGroupByLeft[T, U](sd: SequenceDictionar protected def emptyFn(left: Iterator[((ReferenceRegion, Int), T)], right: Iterator[((ReferenceRegion, Int), U)]): Iterator[(Option[T], Iterable[U])] = { - right.map(v => (None, Iterable(v._2))) + left.map(v => (Some(v._2), Iterable.empty)) ++ right.map(v => (None, Iterable(v._2))) } } @@ -278,8 +278,9 @@ private trait SortedIntervalPartitionJoin[T, U, RT, RU] extends Iterator[(RT, RU protected def pruneCache(to: Long) private def getHits(): Unit = { + assert(!hits.hasNext) // if there is nothing more in left, then I'm done - while (left.hasNext && hits.isEmpty) { + while (left.hasNext) { // there is more in left... val nl = left.next val ((nextLeftRegion, _), nextLeft) = nl @@ -298,7 +299,11 @@ private trait SortedIntervalPartitionJoin[T, U, RT, RU] extends Iterator[(RT, RU // be improved by making cache a fancier data structure than just a list // we filter for things that overlap, where at least one side of the join has a start position // in this partition - hits = processHits(nextLeft, nextLeftRegion) + // + // also, see note "important: fun times with iterators" in this file, which explains + // that these must apparently be two lines + val newHits = processHits(nextLeft, nextLeftRegion) + hits = hits ++ newHits assert(prevLeftRegion == null || (prevLeftRegion.referenceName == nextLeftRegion.referenceName && @@ -318,6 +323,11 @@ private trait SortedIntervalPartitionJoin[T, U, RT, RU] extends Iterator[(RT, RU if (hits.isEmpty) { getHits() } + // if that fails, try advancing and pruning the cache + if (hits.isEmpty) { + advanceCache(binRegion.end) + pruneCache(binRegion.end) + } // if hits is still empty, I must really be at the end hits.hasNext } @@ -377,7 +387,11 @@ private case class SortedIntervalPartitionJoinAndGroupByLeft[T, U]( protected def postProcessHits(iter: Iterator[(T, U)], currentLeft: T): Iterator[(T, Iterable[U])] = { - Iterator((currentLeft, iter.map(_._2).toIterable)) + if (iter.hasNext) { + Iterator((currentLeft, iter.map(_._2).toIterable)) + } else { + Iterator.empty + } } } @@ -425,8 +439,40 @@ private trait SortedIntervalPartitionJoinWithVictims[T, U, RT, RU] extends Sorte } protected def pruneCache(to: Long) { + cache = cache.dropWhile(_._1.end <= to) - hits = hits ++ (victimCache.takeWhile(_._1.end <= to).map(u => postProcessPruned(u._2))) + cache = cache ++ victimCache.takeWhile(_._1.end > to) + victimCache = victimCache.dropWhile(_._1.end > to) + + // important: fun times with iterators + // + // for reasons known only to God, if you combine these two lines down to a + // single line, it causes the hits iterator to be invalidated and become + // empty. + // + // MORE: it seems like there's some funniness with the underlying scala imp'l + // of append on two iterators. 
if the second line is written as: + // + // hits = hits ++ pped + // + // the line works as expected on scala 2.10. on scala 2.11, it occasionally + // fails. oddly enough, if you write the line above and then do a duplicate + // on the hits iterator (which you then reassign to hits), it works. i.e., + // + // hits = hits ++ pped + // val (d, _) = hits.duplicate + // hits = d + // + // works on both scala 2.10 and 2.11 across all unit tests + // + // rewriting it as (pped ++ hits).toIterator seems to work all the time. + // that appends the hits iterator to a ListBuffer, and then returns an iterator + // over the list buffer. essentially, i think there's a bug in the Iterator.++ + // method in scala that occasionally causes it to return an empty iterator, but + // i'm not sure why that is + val pped = (victimCache.takeWhile(_._1.end <= to).map(u => postProcessPruned(u._2))) + hits = (pped ++ hits).toIterator + victimCache = victimCache.dropWhile(_._1.end <= to) } diff --git a/src/main/scala/org/bdgenomics/adam/rdd/TreeRegionJoin.scala b/src/main/scala/org/bdgenomics/adam/rdd/TreeRegionJoin.scala new file mode 100644 index 0000000000..f0c60b0971 --- /dev/null +++ b/src/main/scala/org/bdgenomics/adam/rdd/TreeRegionJoin.scala @@ -0,0 +1,202 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd + +import com.esotericsoftware.kryo.io.{ Input, Output } +import com.esotericsoftware.kryo.{ Kryo, Serializer } +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.instrumentation.Timers._ +import org.bdgenomics.adam.models.ReferenceRegion +import org.bdgenomics.utils.intervalarray.IntervalArray +import scala.annotation.tailrec +import scala.reflect.ClassTag + +/** + * Implements a shuffle free (broadcast) region join. + * + * The broadcast values are stored in a sorted array. It was going to be an + * ensemble of interval trees, but, that didn't work out. + */ +trait TreeRegionJoin[T, U] { + + /** + * Performs an inner region join between two RDDs, and groups by the + * value on the right side of the join. + * + * @param leftRdd RDD on the left side of the join. Will be collected to the + * driver and broadcast. + * @param rightRdd RDD on the right side of the join. + * @return Returns an RDD where each element is a value from the right RDD, + * along with all values from the left RDD that it overlapped. 
+ */ + private[rdd] def runJoinAndGroupByRight( + leftRdd: RDD[(ReferenceRegion, T)], + rightRdd: RDD[(ReferenceRegion, U)])( + implicit tTag: ClassTag[T]): RDD[(Iterable[T], U)] = TreeJoin.time { + + // build the tree from the left RDD + val tree = IntervalArray(leftRdd) + + RunningMapSideJoin.time { + // broadcast this tree + val broadcastTree = leftRdd.context + .broadcast(tree) + + // map and join + rightRdd.map(kv => { + val (rr, u) = kv + + // what values keys does this overlap in the tree? + val overlappingValues = broadcastTree.value + .get(rr) + .map(_._2) + + (overlappingValues, u) + }) + } + } +} + +/** + * Implements an inner region join where the left side of the join is broadcast. + */ +case class InnerTreeRegionJoin[T, U]() extends RegionJoin[T, U, T, U] with TreeRegionJoin[T, U] { + + /** + * Performs an inner region join between two RDDs. + * + * @param baseRDD The 'left' side of the join + * @param joinedRDD The 'right' side of the join + * @param tManifest implicit type of baseRDD + * @param uManifest implicit type of joinedRDD + * @tparam T type of baseRDD + * @tparam U type of joinedRDD + * @return An RDD of pairs (x, y), where x is from baseRDD, y is from joinedRDD, and the region + * corresponding to x overlaps the region corresponding to y. + */ + def partitionAndJoin( + baseRDD: RDD[(ReferenceRegion, T)], + joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], + uManifest: ClassTag[U]): RDD[(T, U)] = { + runJoinAndGroupByRight(baseRDD, joinedRDD) + .flatMap(kv => { + val (leftIterable, right) = kv + leftIterable.map(left => (left, right)) + }) + } +} + +/** + * Implements a right outer region join where the left side of the join is + * broadcast. + */ +case class RightOuterTreeRegionJoin[T, U]() extends RegionJoin[T, U, Option[T], U] with TreeRegionJoin[T, U] { + + /** + * Performs a right outer region join between two RDDs. + * + * @param baseRDD The 'left' side of the join + * @param joinedRDD The 'right' side of the join + * @param tManifest implicit type of baseRDD + * @param uManifest implicit type of joinedRDD + * @tparam T type of baseRDD + * @tparam U type of joinedRDD + * @return An RDD of pairs (Option[x], y), where the optional x value is from + * baseRDD, y is from joinedRDD, and the region corresponding to x overlaps + * the region corresponding to y. If there are no keys in the baseRDD that + * overlap a given key (y) from the joinedRDD, x will be None. + */ + def partitionAndJoin( + baseRDD: RDD[(ReferenceRegion, T)], + joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], + uManifest: ClassTag[U]): RDD[(Option[T], U)] = { + runJoinAndGroupByRight(baseRDD, joinedRDD) + .flatMap(kv => { + val (leftIterable, right) = kv + + if (leftIterable.isEmpty) { + Iterable((None, right)) + } else { + leftIterable.map(left => (Some(left), right)) + } + }) + } +} + +/** + * Performs an inner region join, followed logically by grouping by the right + * value. This is implemented without any shuffling; the join naturally returns + * values on the left grouped by the right value. + */ +case class InnerTreeRegionJoinAndGroupByRight[T, U]() extends RegionJoin[T, U, Iterable[T], U] with TreeRegionJoin[T, U] { + + /** + * Performs an inner join between two RDDs, followed by a groupBy on the + * right object. 
+ * + * @param baseRDD The 'left' side of the join + * @param joinedRDD The 'right' side of the join + * @param tManifest implicit type of baseRDD + * @param uManifest implicit type of joinedRDD + * @tparam T type of baseRDD + * @tparam U type of joinedRDD + * @return An RDD of pairs (Iterable[x], y), where the Iterable[x] is from + * baseRDD, y is from joinedRDD, and all values in the Iterable[x] are + * aligned at regions that overlap the region corresponding to y. If the + * iterable is empty, the key-value pair is filtered out. + */ + def partitionAndJoin( + baseRDD: RDD[(ReferenceRegion, T)], + joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], + uManifest: ClassTag[U]): RDD[(Iterable[T], U)] = { + runJoinAndGroupByRight(baseRDD, joinedRDD) + .filter(_._1.nonEmpty) + } +} + +/** + * Performs a right outer region join, followed logically by grouping by the right + * value. This is implemented without any shuffling; the join naturally returns + * values on the left grouped by the right value. In this implementation, empty + * collections on the left side of the join are kept. + */ +case class RightOuterTreeRegionJoinAndGroupByRight[T, U]() extends RegionJoin[T, U, Iterable[T], U] with TreeRegionJoin[T, U] { + + /** + * Performs a right outer join between two RDDs, followed by a groupBy on the + * right object. + * + * @param baseRDD The 'left' side of the join + * @param joinedRDD The 'right' side of the join + * @param tManifest implicit type of baseRDD + * @param uManifest implicit type of joinedRDD + * @tparam T type of baseRDD + * @tparam U type of joinedRDD + * @return An RDD of pairs (Iterable[x], y), where the Iterable[x] is from + * baseRDD, y is from joinedRDD, and all values in the Iterable[x] are + * aligned at regions that overlap the region corresponding to y. If the + * iterable is empty, the key-value pair is NOT filtered out.
+ */ + def partitionAndJoin( + baseRDD: RDD[(ReferenceRegion, T)], + joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], + uManifest: ClassTag[U]): RDD[(Iterable[T], U)] = { + runJoinAndGroupByRight(baseRDD, joinedRDD) + } +} diff --git a/src/main/scala/org/bdgenomics/adam/rdd/VCFHeaderUtils.scala b/src/main/scala/org/bdgenomics/adam/rdd/VCFHeaderUtils.scala index a6ba0acf8a..204fc902f7 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/VCFHeaderUtils.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/VCFHeaderUtils.scala @@ -21,7 +21,6 @@ import htsjdk.variant.variantcontext.writer.{ Options, VariantContextWriterBuilder } -import htsjdk.variant.vcf.{ VCFHeader, VCFHeaderLine } import htsjdk.variant.vcf.VCFHeader import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FileSystem, Path } diff --git a/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala b/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala index df98a749af..10161d735d 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala @@ -17,7 +17,6 @@ */ package org.bdgenomics.adam.rdd.contig -import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceDictionary } import org.bdgenomics.adam.rdd.ReferencePartitioner diff --git a/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala b/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala index d29557e6e7..6d4eec6132 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala @@ -18,7 +18,6 @@ package org.bdgenomics.adam.rdd.contig import com.google.common.base.Splitter -import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.converters.FragmentConverter import org.bdgenomics.adam.models.{ diff --git a/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala b/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala index cdab4f80e7..6ac0e534d9 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala @@ -21,8 +21,7 @@ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.{ Coverage, ReferenceRegion, - SequenceDictionary, - SequenceRecord + SequenceDictionary } import org.bdgenomics.adam.rdd.GenomicRDD import scala.annotation.tailrec diff --git a/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureParser.scala b/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureParser.scala index 31c1c7f247..abd1e52297 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureParser.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureParser.scala @@ -18,13 +18,11 @@ package org.bdgenomics.adam.rdd.feature import htsjdk.samtools.ValidationStringency -import java.io.File import org.bdgenomics.adam.models.SequenceRecord -import org.bdgenomics.formats.avro.{ Dbxref, Feature, Strand } +import org.bdgenomics.formats.avro.Feature import org.bdgenomics.utils.misc.Logging import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer -import scala.io.Source private[rdd] sealed trait FeatureParser extends Serializable with Logging { diff --git 
a/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala b/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala index dc256e7667..3a257ba848 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala @@ -271,7 +271,7 @@ case class FeatureRDD(rdd: RDD[Feature], * method will always return a Seq of exactly one ReferenceRegion. */ protected def getReferenceRegions(elem: Feature): Seq[ReferenceRegion] = { - Seq(ReferenceRegion(elem)) + Seq(ReferenceRegion.unstranded(elem)) } /** diff --git a/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala b/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala index f44770828f..9b18848040 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala @@ -162,15 +162,15 @@ private[feature] object Features { ) // set list fields if non-empty - if (!aliases.isEmpty) f.setAliases(aliases) - if (!notes.isEmpty) f.setNotes(notes) - if (!parentIds.isEmpty) f.setParentIds(parentIds) - if (!dbxrefs.isEmpty) f.setDbxrefs(dbxrefs) - if (!ontologyTerms.isEmpty) f.setOntologyTerms(ontologyTerms) + if (aliases.nonEmpty) f.setAliases(aliases) + if (notes.nonEmpty) f.setNotes(notes) + if (parentIds.nonEmpty) f.setParentIds(parentIds) + if (dbxrefs.nonEmpty) f.setDbxrefs(dbxrefs) + if (ontologyTerms.nonEmpty) f.setOntologyTerms(ontologyTerms) // set remaining attributes if non-empty; // any duplicate keys are lost at this point, last one in wins - if (!remaining.isEmpty) f.setAttributes(remaining) + if (remaining.nonEmpty) f.setAttributes(remaining) f } diff --git a/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala index b4d3320ca6..b0380b24b3 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala @@ -17,7 +17,6 @@ */ package org.bdgenomics.adam.rdd.read -import htsjdk.samtools.SAMFileHeader import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{ OutputFormat, RecordWriter, TaskAttemptContext } import org.apache.spark.rdd.InstrumentedOutputFormat diff --git a/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala index 28ce20ab13..7ddda38588 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala @@ -17,7 +17,6 @@ */ package org.bdgenomics.adam.rdd.read -import htsjdk.samtools.SAMFileHeader import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{ OutputFormat, RecordWriter, TaskAttemptContext } import org.apache.spark.rdd.InstrumentedOutputFormat diff --git a/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala index f32921cd2a..0f5cd504c7 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala @@ -17,20 +17,13 @@ */ package org.bdgenomics.adam.rdd.read -import htsjdk.samtools._ -import htsjdk.samtools.util.{ - BinaryCodec, - BlockCompressedOutputStream, - BlockCompressedStreamConstants -} import htsjdk.samtools._ import htsjdk.samtools.cram.ref.ReferenceSource import 
htsjdk.samtools.util.{ BinaryCodec, BlockCompressedOutputStream } -import java.io.{ InputStream, OutputStream, StringWriter, Writer } +import java.io.{ OutputStream, StringWriter, Writer } import java.net.URI import java.nio.file.Paths -import org.apache.avro.Schema -import org.apache.hadoop.fs.{ FileSystem, Path } +import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.MetricsContext._ @@ -56,10 +49,8 @@ import org.bdgenomics.adam.rdd.feature.CoverageRDD import org.bdgenomics.adam.rdd.read.realignment.RealignIndels import org.bdgenomics.adam.rdd.read.recalibration.BaseQualityRecalibration import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rich.RichAlignmentRecord import org.bdgenomics.adam.util.ReferenceFile import org.bdgenomics.formats.avro._ -import org.bdgenomics.utils.misc.Logging import org.seqdoop.hadoop_bam._ import scala.collection.JavaConversions._ import scala.language.implicitConversions diff --git a/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala index e358f23ba7..4bf78a180a 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala @@ -17,10 +17,7 @@ */ package org.bdgenomics.adam.rdd.read -import org.bdgenomics.adam.rdd.ADAMContext._ import htsjdk.samtools.{ TextCigarCodec, ValidationStringency } -// NOTE(ryan): this is necessary for Spark <= 1.2.1. -import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.{ MdTag, ReferenceRegion } import org.bdgenomics.adam.util.ReferenceFile @@ -74,7 +71,8 @@ private[read] case class MDTagging( contig <- Option(read.getContigName) if read.getReadMapped } yield { - maybeMDTagRead(read, referenceFileB.value.extract(ReferenceRegion(read))) + maybeMDTagRead(read, referenceFileB.value + .extract(ReferenceRegion.unstranded(read))) }).getOrElse({ numUnmappedReads += 1 read diff --git a/src/main/scala/org/bdgenomics/adam/rdd/read/ReferencePositionPair.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/ReferencePositionPair.scala index bfc9e5ad7d..e26325f107 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/ReferencePositionPair.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/ReferencePositionPair.scala @@ -19,10 +19,8 @@ package org.bdgenomics.adam.rdd.read import com.esotericsoftware.kryo.{ Kryo, Serializer } import com.esotericsoftware.kryo.io.{ Input, Output } -import Ordering.Option import org.bdgenomics.utils.misc.Logging import org.bdgenomics.adam.instrumentation.Timers.CreateReferencePositionPair -import org.bdgenomics.adam.models.ReferenceRegion._ import org.bdgenomics.adam.models.{ ReferencePosition, ReferencePositionSerializer diff --git a/src/main/scala/org/bdgenomics/adam/rdd/read/SingleReadBucket.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/SingleReadBucket.scala index 6ead0da7c2..343f52d5bb 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/SingleReadBucket.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/SingleReadBucket.scala @@ -21,7 +21,6 @@ import com.esotericsoftware.kryo.{ Kryo, Serializer } import com.esotericsoftware.kryo.io.{ Output, Input } import org.bdgenomics.utils.misc.Logging import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.serialization.AvroSerializer import org.bdgenomics.formats.avro.{ AlignmentRecord, diff 
--git a/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala index 56edf5e5cd..183c798f1e 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala @@ -101,7 +101,7 @@ private[realignment] object IndelRealignmentTarget { read: RichAlignmentRecord, maxIndelSize: Int): Seq[IndelRealignmentTarget] = CreateIndelRealignmentTargets.time { - val region = ReferenceRegion(read.record) + val region = ReferenceRegion.unstranded(read.record) val refId = read.record.getContigName var pos = List[ReferenceRegion]() var referencePos = read.record.getStart diff --git a/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala index 75c457608d..4000756833 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala @@ -22,7 +22,6 @@ import org.bdgenomics.utils.misc.Logging import org.apache.spark.rdd.MetricsContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.algorithms.consensus.{ Consensus, ConsensusGenerator, ConsensusGeneratorFromReads } -import org.bdgenomics.adam.models.ReferenceRegion._ import org.bdgenomics.adam.models.{ MdTag, ReferencePosition, ReferenceRegion } import org.bdgenomics.adam.rich.RichAlignmentRecord import org.bdgenomics.adam.rich.RichAlignmentRecord._ diff --git a/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala b/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala index ce1acc4a85..b0e5ddf90f 100644 --- a/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala +++ b/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala @@ -19,7 +19,6 @@ package org.bdgenomics.adam.rdd.read.recalibration import htsjdk.samtools.ValidationStringency import java.io._ -import org.apache.spark.SparkContext._ import org.bdgenomics.utils.misc.Logging import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD diff --git a/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala b/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala index b5ead6e373..2318429440 100644 --- a/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala +++ b/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala @@ -23,10 +23,8 @@ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.{ MdTag, ReferencePosition, - ReferenceRegion, QualityScore } -import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rich.RichAlignmentRecord._ import org.bdgenomics.formats.avro.AlignmentRecord diff --git a/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala b/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala index fa249a4f73..172b2251b9 100644 --- a/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala +++ b/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala @@ -23,7 +23,6 @@ import htsjdk.samtools.{ CigarOperator, TextCigarCodec } -import java.util.regex.Pattern import org.bdgenomics.adam.models.{ Attribute, MdTag, @@ -38,11 +37,11 @@ import scala.math.max object RichAlignmentRecord { - @deprecated("Use explicit coversion 
wherever possible in new development.", + @deprecated("Use explicit conversion wherever possible in new development.", since = "0.21.0") implicit def recordToRichRecord(record: AlignmentRecord): RichAlignmentRecord = new RichAlignmentRecord(record) - @deprecated("Use explicit coversion wherever possible in new development.", + @deprecated("Use explicit conversion wherever possible in new development.", since = "0.21.0") implicit def richRecordToRecord(record: RichAlignmentRecord): AlignmentRecord = record.record } diff --git a/src/main/scala/org/bdgenomics/adam/rich/RichCigar.scala b/src/main/scala/org/bdgenomics/adam/rich/RichCigar.scala index d7bbbb5e35..04db18de2e 100644 --- a/src/main/scala/org/bdgenomics/adam/rich/RichCigar.scala +++ b/src/main/scala/org/bdgenomics/adam/rich/RichCigar.scala @@ -37,7 +37,7 @@ private[adam] case class RichCigar(cigar: Cigar) { def moveLeft(index: Int): RichCigar = { // var elements = List[CigarElement]() // deepclone instead of empty list initialization - var elements = cigar.getCigarElements.map(e => new CigarElement(e.getLength, e.getOperator)) + val elements = cigar.getCigarElements.map(e => new CigarElement(e.getLength, e.getOperator)) /** * Moves an element of a cigar left. diff --git a/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala b/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala index ba8e3246fc..b21fb91baf 100644 --- a/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala +++ b/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala @@ -113,6 +113,7 @@ class ADAMKryoRegistrator extends KryoRegistrator { kryo.register(Class.forName("org.apache.avro.Schema$EnumSchema")) kryo.register(Class.forName("org.apache.avro.Schema$Name")) kryo.register(Class.forName("org.apache.avro.Schema$LongSchema")) + kryo.register(Class.forName("org.apache.avro.generic.GenericData$Array")) // org.apache.hadoop.io kryo.register(classOf[org.apache.hadoop.io.Text]) @@ -144,6 +145,7 @@ class ADAMKryoRegistrator extends KryoRegistrator { // org.bdgenomics.adam.rdd kryo.register(classOf[org.bdgenomics.adam.rdd.GenomeBins]) + kryo.register(Class.forName("org.bdgenomics.adam.rdd.SortedIntervalPartitionJoinAndGroupByLeft$$anonfun$postProcessHits$1")) // org.bdgenomics.adam.rdd.read kryo.register(classOf[org.bdgenomics.adam.rdd.read.FlagStatMetrics]) @@ -220,6 +222,24 @@ class ADAMKryoRegistrator extends KryoRegistrator { kryo.register(classOf[org.bdgenomics.formats.avro.VariantCallingAnnotations], new AvroSerializer[org.bdgenomics.formats.avro.VariantCallingAnnotations]) + // org.bdgenomics.utils.intervalarray + // this is only exposed officially through the genomicrdd trait, thus, we'll only + // register it for said traits + kryo.register(classOf[org.bdgenomics.utils.intervalarray.IntervalArray[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.AlignmentRecord]], + new org.bdgenomics.utils.intervalarray.IntervalArraySerializer[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.AlignmentRecord](kryo)) + kryo.register(classOf[org.bdgenomics.utils.intervalarray.IntervalArray[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.Feature]], + new org.bdgenomics.utils.intervalarray.IntervalArraySerializer[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.Feature](kryo)) + kryo.register(classOf[org.bdgenomics.utils.intervalarray.IntervalArray[org.bdgenomics.adam.models.ReferenceRegion, 
org.bdgenomics.formats.avro.Fragment]], + new org.bdgenomics.utils.intervalarray.IntervalArraySerializer[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.Fragment](kryo)) + kryo.register(classOf[org.bdgenomics.utils.intervalarray.IntervalArray[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.Genotype]], + new org.bdgenomics.utils.intervalarray.IntervalArraySerializer[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.Genotype](kryo)) + kryo.register(classOf[org.bdgenomics.utils.intervalarray.IntervalArray[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.NucleotideContigFragment]], + new org.bdgenomics.utils.intervalarray.IntervalArraySerializer[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.NucleotideContigFragment](kryo)) + kryo.register(classOf[org.bdgenomics.utils.intervalarray.IntervalArray[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.Variant]], + new org.bdgenomics.utils.intervalarray.IntervalArraySerializer[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.Variant](kryo)) + kryo.register(classOf[org.bdgenomics.utils.intervalarray.IntervalArray[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.adam.models.VariantContext]], + new org.bdgenomics.utils.intervalarray.IntervalArraySerializer[org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.adam.models.VariantContext](kryo)) + // org.codehaus.jackson.node kryo.register(classOf[org.codehaus.jackson.node.NullNode]) kryo.register(classOf[org.codehaus.jackson.node.BooleanNode]) @@ -258,15 +278,23 @@ class ADAMKryoRegistrator extends KryoRegistrator { kryo.register(classOf[scala.Array[String]]) kryo.register(Class.forName("scala.Tuple2$mcCC$sp")) + // scala.collection + kryo.register(Class.forName("scala.collection.Iterator$$anon$11")) + kryo.register(Class.forName("scala.collection.Iterator$$anonfun$toStream$1")) + // scala.collection.convert kryo.register(Class.forName("scala.collection.convert.Wrappers$")) // scala.collection.immutable kryo.register(classOf[scala.collection.immutable.::[_]]) kryo.register(classOf[scala.collection.immutable.Range]) + kryo.register(Class.forName("scala.collection.immutable.Stream$Cons")) + kryo.register(Class.forName("scala.collection.immutable.Stream$Empty$")) // scala.collection.mutable kryo.register(classOf[scala.collection.mutable.ArrayBuffer[_]]) + kryo.register(classOf[scala.collection.mutable.ListBuffer[_]]) + kryo.register(Class.forName("scala.collection.mutable.ListBuffer$$anon$1")) kryo.register(classOf[scala.collection.mutable.WrappedArray.ofInt]) kryo.register(classOf[scala.collection.mutable.WrappedArray.ofLong]) kryo.register(classOf[scala.collection.mutable.WrappedArray.ofByte]) diff --git a/src/main/scala/org/bdgenomics/adam/util/ParquetFileTraversable.scala b/src/main/scala/org/bdgenomics/adam/util/ParquetFileTraversable.scala index d62af65981..07fa86eb75 100644 --- a/src/main/scala/org/bdgenomics/adam/util/ParquetFileTraversable.scala +++ b/src/main/scala/org/bdgenomics/adam/util/ParquetFileTraversable.scala @@ -18,7 +18,7 @@ package org.bdgenomics.adam.util import org.apache.avro.generic.IndexedRecord -import org.apache.hadoop.fs.{ FileSystem, Path } +import org.apache.hadoop.fs.Path import org.apache.parquet.avro.AvroParquetReader import org.apache.spark.SparkContext import org.bdgenomics.utils.misc.HadoopUtil diff --git a/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala 
b/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala index c672ee766e..74221a70e7 100644 --- a/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala +++ b/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala @@ -17,7 +17,6 @@ */ package org.bdgenomics.adam.util -import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.{ ReferenceRegion, diff --git a/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala b/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala index aeebb531de..63b43ecf7d 100644 --- a/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala +++ b/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala @@ -15,7 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.bdgenomics.adam.util import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceDictionary } diff --git a/src/test/resources/small.1.bed b/src/test/resources/small.1.bed new file mode 100644 index 0000000000..b62e37fa40 --- /dev/null +++ b/src/test/resources/small.1.bed @@ -0,0 +1,4 @@ +1 143 26423 +1 14397230 26472788 +1 169801934 169801939 +1 240997788 240997796 \ No newline at end of file diff --git a/src/test/resources/small.1.narrowPeak b/src/test/resources/small.1.narrowPeak new file mode 100644 index 0000000000..b3338bc8d6 --- /dev/null +++ b/src/test/resources/small.1.narrowPeak @@ -0,0 +1,20 @@ +1 26472784 26472859 simread:1:26472783:false 0 + 0 -1 -1 -1 +1 240997788 240997863 simread:1:240997787:true 0 + 0 -1 -1 -1 +1 189606654 189606729 simread:1:189606653:true 0 + 0 -1 -1 -1 +1 207027739 207027814 simread:1:207027738:true 0 + 0 -1 -1 -1 +1 14397234 14397309 simread:1:14397233:false 0 + 0 -1 -1 -1 +1 240344443 240344518 simread:1:240344442:true 0 + 0 -1 -1 -1 +1 153978725 153978800 simread:1:153978724:false 0 + 0 -1 -1 -1 +1 237728410 237728485 simread:1:237728409:true 0 + 0 -1 -1 -1 +1 231911907 231911982 simread:1:231911906:false 0 + 0 -1 -1 -1 +1 50683372 50683447 simread:1:50683371:false 0 + 0 -1 -1 -1 +1 37577446 37577521 simread:1:37577445:false 0 + 0 -1 -1 -1 +1 195211966 195212041 simread:1:195211965:false 0 + 0 -1 -1 -1 +1 163841414 163841489 simread:1:163841413:false 0 + 0 -1 -1 -1 +1 101556379 101556454 simread:1:101556378:false 0 + 0 -1 -1 -1 +1 20101801 20101876 simread:1:20101800:true 0 + 0 -1 -1 -1 +1 186794284 186794359 simread:1:186794283:true 0 + 0 -1 -1 -1 +1 165341383 165341458 simread:1:165341382:true 0 + 0 -1 -1 -1 +1 5469107 5469182 simread:1:5469106:true 0 + 0 -1 -1 -1 +1 89554253 89554328 simread:1:89554252:false 0 + 0 -1 -1 -1 +1 169801934 169802009 simread:1:169801933:true 0 + 0 -1 -1 -1 diff --git a/src/test/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanSuite.scala b/src/test/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanSuite.scala index 1994e34260..7ff91afc2f 100644 --- a/src/test/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanSuite.scala @@ -19,7 +19,6 @@ package org.bdgenomics.adam.algorithms.smithwaterman import org.scalatest.FunSuite import scala.math.abs -import scala.util.Random class SmithWatermanSuite extends FunSuite { diff --git a/src/test/scala/org/bdgenomics/adam/converters/AlignmentRecordConverterSuite.scala b/src/test/scala/org/bdgenomics/adam/converters/AlignmentRecordConverterSuite.scala index 4b1e7295d7..d209d52de9 100644 --- 
a/src/test/scala/org/bdgenomics/adam/converters/AlignmentRecordConverterSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/converters/AlignmentRecordConverterSuite.scala @@ -17,7 +17,7 @@ */ package org.bdgenomics.adam.converters -import htsjdk.samtools.{ SamReaderFactory, SAMRecord } +import htsjdk.samtools.SamReaderFactory import java.io.File import org.bdgenomics.adam.models.{ RecordGroupDictionary, @@ -28,7 +28,6 @@ import org.bdgenomics.adam.models.{ } import org.bdgenomics.formats.avro.{ AlignmentRecord, - Contig, Fragment } import org.scalatest.FunSuite diff --git a/src/test/scala/org/bdgenomics/adam/converters/SAMRecordConverterSuite.scala b/src/test/scala/org/bdgenomics/adam/converters/SAMRecordConverterSuite.scala index ed5ed6976f..0feebd47be 100644 --- a/src/test/scala/org/bdgenomics/adam/converters/SAMRecordConverterSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/converters/SAMRecordConverterSuite.scala @@ -20,8 +20,6 @@ package org.bdgenomics.adam.converters import htsjdk.samtools._ import java.io.File import org.scalatest.FunSuite -import org.bdgenomics.adam.models.Attribute -import org.bdgenomics.adam.rdd.ADAMContext._ import scala.collection.JavaConversions._ class SAMRecordConverterSuite extends FunSuite { diff --git a/src/test/scala/org/bdgenomics/adam/converters/TranscriptEffectConverterSuite.scala b/src/test/scala/org/bdgenomics/adam/converters/TranscriptEffectConverterSuite.scala index 3a28c41d67..c63029a05c 100644 --- a/src/test/scala/org/bdgenomics/adam/converters/TranscriptEffectConverterSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/converters/TranscriptEffectConverterSuite.scala @@ -28,7 +28,6 @@ import org.bdgenomics.formats.avro.{ } import org.mockito.Mockito import org.mockito.Mockito.when -import scala.collection.JavaConverters._ class TranscriptEffectConverterSuite extends ADAMFunSuite { final val EMPTY = "" diff --git a/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala b/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala index 3694029274..4232a96946 100644 --- a/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala @@ -18,7 +18,6 @@ package org.bdgenomics.adam.converters import com.google.common.collect.ImmutableList -import htsjdk.samtools.SAMFileReader import htsjdk.variant.utils.SAMSequenceDictionaryExtractor import htsjdk.variant.variantcontext.{ Allele, @@ -33,7 +32,6 @@ import org.bdgenomics.adam.models.{ } import org.bdgenomics.adam.util.{ ADAMFunSuite, PhredUtils } import org.bdgenomics.formats.avro._ -import org.scalatest.FunSuite import scala.collection.JavaConversions._ class VariantContextConverterSuite extends ADAMFunSuite { diff --git a/src/test/scala/org/bdgenomics/adam/models/AlphabetSuite.scala b/src/test/scala/org/bdgenomics/adam/models/AlphabetSuite.scala index 65b6f24160..cdf6cbfdd8 100644 --- a/src/test/scala/org/bdgenomics/adam/models/AlphabetSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/models/AlphabetSuite.scala @@ -19,11 +19,6 @@ package org.bdgenomics.adam.models import org.bdgenomics.adam.util.ADAMFunSuite -/** - * Created by bryan on 4/18/15. 
- * - * Tests the alphabet class - */ class AlphabetSuite extends ADAMFunSuite { val testSymbols = Seq( diff --git a/src/test/scala/org/bdgenomics/adam/models/IndelTableSuite.scala b/src/test/scala/org/bdgenomics/adam/models/IndelTableSuite.scala index 6d87f6a696..e9d6b672d7 100644 --- a/src/test/scala/org/bdgenomics/adam/models/IndelTableSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/models/IndelTableSuite.scala @@ -19,7 +19,7 @@ package org.bdgenomics.adam.models import org.bdgenomics.adam.algorithms.consensus.Consensus import org.bdgenomics.adam.util.ADAMFunSuite -import org.bdgenomics.formats.avro.{ Contig, Variant } +import org.bdgenomics.formats.avro.Variant class IndelTableSuite extends ADAMFunSuite { diff --git a/src/test/scala/org/bdgenomics/adam/models/NonoverlappingRegionsSuite.scala b/src/test/scala/org/bdgenomics/adam/models/NonoverlappingRegionsSuite.scala index a256025a6d..3502e3dd5c 100644 --- a/src/test/scala/org/bdgenomics/adam/models/NonoverlappingRegionsSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/models/NonoverlappingRegionsSuite.scala @@ -110,9 +110,9 @@ class NonoverlappingRegionsSuite extends FunSuite { val record2 = AlignmentRecord.newBuilder(built).setStart(3L).setEnd(4L).build() val baseRecord = AlignmentRecord.newBuilder(built).setCigar("4M").setEnd(5L).build() - val baseMapping = new NonoverlappingRegions(Seq(ReferenceRegion(baseRecord))) - val regions1 = baseMapping.findOverlappingRegions(ReferenceRegion(record1)) - val regions2 = baseMapping.findOverlappingRegions(ReferenceRegion(record2)) + val baseMapping = new NonoverlappingRegions(Seq(ReferenceRegion.unstranded(baseRecord))) + val regions1 = baseMapping.findOverlappingRegions(ReferenceRegion.unstranded(record1)) + val regions2 = baseMapping.findOverlappingRegions(ReferenceRegion.unstranded(record2)) assert(regions1.size === 1) assert(regions2.size === 1) assert(regions1.head === regions2.head) diff --git a/src/test/scala/org/bdgenomics/adam/models/ProgramRecordSuite.scala b/src/test/scala/org/bdgenomics/adam/models/ProgramRecordSuite.scala index 7a593a6fee..6b2fd7e5ae 100644 --- a/src/test/scala/org/bdgenomics/adam/models/ProgramRecordSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/models/ProgramRecordSuite.scala @@ -18,7 +18,6 @@ package org.bdgenomics.adam.models import htsjdk.samtools.SAMProgramRecord -import org.bdgenomics.adam.rdd.ADAMContext._ import org.scalatest.FunSuite class ProgramRecordSuite extends FunSuite { diff --git a/src/test/scala/org/bdgenomics/adam/models/RecordGroupDictionarySuite.scala b/src/test/scala/org/bdgenomics/adam/models/RecordGroupDictionarySuite.scala index 6ac3be96c7..14e807eb4d 100644 --- a/src/test/scala/org/bdgenomics/adam/models/RecordGroupDictionarySuite.scala +++ b/src/test/scala/org/bdgenomics/adam/models/RecordGroupDictionarySuite.scala @@ -18,7 +18,6 @@ package org.bdgenomics.adam.models import htsjdk.samtools.SAMReadGroupRecord -import org.bdgenomics.formats.avro.AlignmentRecord import org.scalatest.FunSuite class RecordGroupDictionarySuite extends FunSuite { diff --git a/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala b/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala index c864cb7721..17eb5e6492 100644 --- a/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala @@ -17,8 +17,15 @@ */ package org.bdgenomics.adam.models -import org.scalatest._ +import htsjdk.variant.variantcontext.{ + Allele, + GenotypeBuilder, + 
VariantContextBuilder +} +import org.bdgenomics.adam.converters.VariantContextConverter import org.bdgenomics.formats.avro._ +import org.scalatest.FunSuite +import scala.collection.JavaConversions._ class ReferenceRegionSuite extends FunSuite { @@ -59,23 +66,37 @@ class ReferenceRegionSuite extends FunSuite { // contained assert(region("chr0", 10, 100).overlaps(region("chr0", 20, 50))) + assert(region("chr0", 10, 100).covers(region("chr0", 20, 50))) + + // different strands + assert(!ReferenceRegion("chr0", 10, 100, strand = Strand.FORWARD) + .overlaps(ReferenceRegion("chr0", 20, 50, strand = Strand.REVERSE))) + assert(ReferenceRegion("chr0", 10, 100, strand = Strand.FORWARD) + .covers(ReferenceRegion("chr0", 20, 50, strand = Strand.REVERSE))) // right side assert(region("chr0", 10, 100).overlaps(region("chr0", 50, 250))) + assert(region("chr0", 10, 100).covers(region("chr0", 50, 250))) // left side assert(region("chr0", 10, 100).overlaps(region("chr0", 5, 15))) + assert(region("chr0", 10, 100).covers(region("chr0", 5, 15))) // left edge assert(region("chr0", 10, 100).overlaps(region("chr0", 5, 11))) + assert(region("chr0", 10, 100).covers(region("chr0", 5, 11))) assert(!region("chr0", 10, 100).overlaps(region("chr0", 5, 10))) + assert(!region("chr0", 10, 100).covers(region("chr0", 5, 10))) // right edge assert(region("chr0", 10, 100).overlaps(region("chr0", 99, 200))) + assert(region("chr0", 10, 100).covers(region("chr0", 99, 200))) assert(!region("chr0", 10, 100).overlaps(region("chr0", 100, 200))) + assert(!region("chr0", 10, 100).covers(region("chr0", 100, 200))) // different sequences assert(!region("chr0", 10, 100).overlaps(region("chr1", 50, 200))) + assert(!region("chr0", 10, 100).covers(region("chr1", 50, 200))) } test("distance(: ReferenceRegion)") { @@ -126,7 +147,7 @@ class ReferenceRegionSuite extends FunSuite { val read = AlignmentRecord.newBuilder() .setReadMapped(false) .build() - ReferenceRegion(read) + ReferenceRegion.unstranded(read) } } @@ -135,10 +156,71 @@ class ReferenceRegionSuite extends FunSuite { val read = AlignmentRecord.newBuilder() .setReadMapped(true) .build() - ReferenceRegion(read) + ReferenceRegion.unstranded(read) + } + } + + test("create stranded region from unmapped read fails") { + intercept[IllegalArgumentException] { + val read = AlignmentRecord.newBuilder() + .setReadMapped(false) + .build() + ReferenceRegion.stranded(read) + } + } + + test("create stranded region from read with null alignment positions fails") { + intercept[IllegalArgumentException] { + val read = AlignmentRecord.newBuilder() + .setReadMapped(true) + .build() + ReferenceRegion.stranded(read) + } + } + + test("create stranded region from read with null alignment strand fails") { + intercept[IllegalArgumentException] { + val read = AlignmentRecord.newBuilder() + .setReadMapped(true) + .setStart(10L) + .setEnd(15L) + .setContigName("ctg") + .setReadNegativeStrand(null) + .build() + ReferenceRegion.stranded(read) } } + test("create stranded region from read on forward strand") { + val read = AlignmentRecord.newBuilder() + .setReadMapped(true) + .setStart(10L) + .setEnd(15L) + .setContigName("ctg") + .setReadNegativeStrand(false) + .build() + val rr = ReferenceRegion.stranded(read) + assert(rr.referenceName === "ctg") + assert(rr.start === 10L) + assert(rr.end === 15L) + assert(rr.strand === Strand.FORWARD) + } + + test("create stranded region from read on reverse strand") { + val read = AlignmentRecord.newBuilder() + .setReadMapped(true) + .setStart(10L) + .setEnd(15L) + 
.setContigName("ctg") + .setReadNegativeStrand(true) + .build() + val rr = ReferenceRegion.stranded(read) + assert(rr.referenceName === "ctg") + assert(rr.start === 10L) + assert(rr.end === 15L) + assert(rr.strand === Strand.REVERSE) + } + test("create region from mapped read contains read start and end") { val read = AlignmentRecord.newBuilder() .setReadMapped(true) @@ -149,9 +231,9 @@ class ReferenceRegionSuite extends FunSuite { .setContigName("chr1") .build() - assert(ReferenceRegion(read).contains(point("chr1", 1L))) - assert(ReferenceRegion(read).contains(point("chr1", 5L))) - assert(!ReferenceRegion(read).contains(point("chr1", 6L))) + assert(ReferenceRegion.unstranded(read).contains(point("chr1", 1L))) + assert(ReferenceRegion.unstranded(read).contains(point("chr1", 5L))) + assert(!ReferenceRegion.unstranded(read).contains(point("chr1", 6L))) } test("validate that adjacent regions can be merged") { @@ -205,7 +287,7 @@ class ReferenceRegionSuite extends FunSuite { .setMismatchingPositions("5") .build() - val r = ReferenceRegion(read) + val r = ReferenceRegion.unstranded(read) assert(r.referenceName === "chrM") assert(r.start === 5L) @@ -260,6 +342,9 @@ class ReferenceRegionSuite extends FunSuite { .build() val g = Genotype.newBuilder() .setVariant(v) + .setContigName("chr") + .setStart(1L) + .setEnd(3L) .build() val rrV = ReferenceRegion(v) val rrG = ReferenceRegion(g) @@ -294,6 +379,22 @@ class ReferenceRegionSuite extends FunSuite { assert(openEnded.overlaps(ReferenceRegion("myCtg", Long.MaxValue - 1L, Long.MaxValue))) } + test("can build an open ended reference region with strand") { + val openEnded = ReferenceRegion.toEnd("myCtg", 45L, strand = Strand.FORWARD) + + assert(!openEnded.overlaps(ReferenceRegion("myCtg", 44L, 45L))) + assert(!openEnded.overlaps(ReferenceRegion("myCtg", 44L, 46L))) + assert(!openEnded.overlaps(ReferenceRegion("myCtg", Long.MaxValue - 1L, Long.MaxValue))) + + assert(!openEnded.covers(ReferenceRegion("myCtg", 44L, 45L))) + assert(openEnded.covers(ReferenceRegion("myCtg", 44L, 46L))) + assert(openEnded.covers(ReferenceRegion("myCtg", Long.MaxValue - 1L, Long.MaxValue))) + + assert(!openEnded.overlaps(ReferenceRegion("myCtg", 44L, 45L, strand = Strand.FORWARD))) + assert(openEnded.overlaps(ReferenceRegion("myCtg", 44L, 46L, strand = Strand.FORWARD))) + assert(openEnded.overlaps(ReferenceRegion("myCtg", Long.MaxValue - 1L, Long.MaxValue, strand = Strand.FORWARD))) + } + test("can build a reference region with an open start position") { val openStart = ReferenceRegion.fromStart("myCtg", 45L) @@ -302,10 +403,114 @@ class ReferenceRegionSuite extends FunSuite { assert(!openStart.overlaps(ReferenceRegion("myCtg", 45L, 46L))) } + test("can build a reference region with an open start position with strand") { + val openStart = ReferenceRegion.fromStart("myCtg", 45L, strand = Strand.REVERSE) + + assert(!openStart.overlaps(ReferenceRegion("myCtg", 0L, 1L))) + assert(!openStart.overlaps(ReferenceRegion("myCtg", 44L, 46L))) + assert(!openStart.overlaps(ReferenceRegion("myCtg", 45L, 46L))) + + assert(openStart.covers(ReferenceRegion("myCtg", 0L, 1L))) + assert(openStart.covers(ReferenceRegion("myCtg", 44L, 46L))) + assert(!openStart.covers(ReferenceRegion("myCtg", 45L, 46L))) + + assert(openStart.overlaps(ReferenceRegion("myCtg", 0L, 1L, strand = Strand.REVERSE))) + assert(openStart.overlaps(ReferenceRegion("myCtg", 44L, 46L, strand = Strand.REVERSE))) + assert(!openStart.overlaps(ReferenceRegion("myCtg", 45L, 46L, strand = Strand.REVERSE))) + } + test("can build a 
reference region that covers the entirety of a contig") { val all = ReferenceRegion.all("myCtg") assert(all.overlaps(ReferenceRegion("myCtg", 0L, 1L))) assert(all.overlaps(ReferenceRegion("myCtg", Long.MaxValue - 1L, Long.MaxValue))) } + + test("can build a reference region that covers the entirety of a contig with strand") { + val all = ReferenceRegion.all("myCtg", strand = Strand.FORWARD) + + assert(!all.overlaps(ReferenceRegion("myCtg", 0L, 1L))) + assert(!all.overlaps(ReferenceRegion("myCtg", Long.MaxValue - 1L, Long.MaxValue))) + + assert(all.covers(ReferenceRegion("myCtg", 0L, 1L))) + assert(all.covers(ReferenceRegion("myCtg", Long.MaxValue - 1L, Long.MaxValue))) + + assert(all.overlaps(ReferenceRegion("myCtg", 0L, 1L, strand = Strand.FORWARD))) + assert(all.overlaps(ReferenceRegion("myCtg", Long.MaxValue - 1L, Long.MaxValue, strand = Strand.FORWARD))) + } + + test("convert a genotype and then get the reference region") { + val converter = new VariantContextConverter + val vcb = new VariantContextBuilder() + .alleles(List(Allele.create("A", true), Allele.create("T"))) + .start(1L) + .stop(1L) + .chr("1") + val vc = vcb.genotypes(GenotypeBuilder.create("NA12878", + vcb.getAlleles(), + Map.empty[String, java.lang.Object])).make() + val gts = converter.convert(vc).flatMap(_.genotypes) + assert(gts.size === 1) + val gt = gts.head + + val rr = ReferenceRegion(gt) + assert(rr.referenceName === "1") + assert(rr.start === 0L) + assert(rr.end === 1L) + } + + test("create region from feature with null alignment positions fails") { + intercept[IllegalArgumentException] { + val feature = Feature.newBuilder() + .build() + ReferenceRegion.unstranded(feature) + } + } + + test("create stranded region from feature with null alignment positions fails") { + intercept[IllegalArgumentException] { + val feature = Feature.newBuilder() + .build() + ReferenceRegion.stranded(feature) + } + } + + test("create stranded region from feature with null alignment strand fails") { + intercept[IllegalArgumentException] { + val feature = Feature.newBuilder() + .setStart(10L) + .setEnd(15L) + .setContigName("ctg") + .build() + ReferenceRegion.stranded(feature) + } + } + + test("create stranded region from feature on forward strand") { + val feature = Feature.newBuilder() + .setStart(10L) + .setEnd(15L) + .setContigName("ctg") + .setStrand(Strand.FORWARD) + .build() + val rr = ReferenceRegion.stranded(feature) + assert(rr.referenceName === "ctg") + assert(rr.start === 10L) + assert(rr.end === 15L) + assert(rr.strand === Strand.FORWARD) + } + + test("create stranded region from feature on reverse strand") { + val feature = Feature.newBuilder() + .setStart(10L) + .setEnd(15L) + .setContigName("ctg") + .setStrand(Strand.REVERSE) + .build() + val rr = ReferenceRegion.stranded(feature) + assert(rr.referenceName === "ctg") + assert(rr.start === 10L) + assert(rr.end === 15L) + assert(rr.strand === Strand.REVERSE) + } } diff --git a/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala b/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala index a07dc6fc79..e0a03d5399 100644 --- a/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala +++ b/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala @@ -19,7 +19,6 @@ package org.bdgenomics.adam.models import java.io.File import htsjdk.samtools.{ - SAMFileReader, SAMSequenceDictionary, SAMSequenceRecord } diff --git a/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala 
b/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index c8c3ec193f..6e7be2b2e7 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -18,8 +18,6 @@ package org.bdgenomics.adam.rdd import java.io.{ File, FileNotFoundException } -import java.util.UUID -import htsjdk.samtools.DiskBasedBAMFileIndex import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.parquet.filter2.dsl.Dsl._ @@ -28,12 +26,10 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models._ import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.read.AlignedReadRDD import org.bdgenomics.adam.util.PhredUtils._ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ -import org.seqdoop.hadoop_bam.{ BAMInputFormat, CRAMInputFormat } -import scala.collection.JavaConversions._ +import org.seqdoop.hadoop_bam.CRAMInputFormat case class TestSaveArgs(var outputPath: String) extends ADAMSaveAnyArgs { var sortFastqOutput = false @@ -48,6 +44,10 @@ case class TestSaveArgs(var outputPath: String) extends ADAMSaveAnyArgs { class ADAMContextSuite extends ADAMFunSuite { + sparkTest("ctr is accessible") { + new ADAMContext(sc) + } + sparkTest("sc.loadParquet should not fail on unmapped reads") { val readsFilepath = testFile("unmapped.sam") diff --git a/src/test/scala/org/bdgenomics/adam/rdd/GenomicPositionPartitionerSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/GenomicPositionPartitionerSuite.scala index ef959184e7..7458943827 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/GenomicPositionPartitionerSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/GenomicPositionPartitionerSuite.scala @@ -17,9 +17,7 @@ */ package org.bdgenomics.adam.rdd -import org.apache.spark.SparkContext._ import org.apache.spark.RangePartitioner -import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.{ ReferencePosition, SequenceRecord, SequenceDictionary } import org.bdgenomics.adam.projections.Projection import org.bdgenomics.adam.rdd.ADAMContext._ diff --git a/src/test/scala/org/bdgenomics/adam/rdd/InnerShuffleRegionJoinSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/InnerShuffleRegionJoinSuite.scala index 88a9095700..7c700a7a1c 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/InnerShuffleRegionJoinSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/InnerShuffleRegionJoinSuite.scala @@ -51,8 +51,8 @@ class InnerShuffleRegionJoinSuite extends ADAMFunSuite { val record2 = AlignmentRecord.newBuilder(built).setStart(3L).setEnd(4L).build() val baseRecord = AlignmentRecord.newBuilder(built).setCigar("4M").setEnd(5L).build() - val baseRdd = sc.parallelize(Seq(baseRecord)).keyBy(ReferenceRegion(_)) - val recordsRdd = sc.parallelize(Seq(record1, record2)).keyBy(ReferenceRegion(_)) + val baseRdd = sc.parallelize(Seq(baseRecord)).keyBy(ReferenceRegion.unstranded(_)) + val recordsRdd = sc.parallelize(Seq(record1, record2)).keyBy(ReferenceRegion.unstranded(_)) assert( InnerShuffleRegionJoin[AlignmentRecord, AlignmentRecord](seqDict, partitionSize, sc) @@ -110,8 +110,8 @@ class InnerShuffleRegionJoinSuite extends ADAMFunSuite { val baseRecord1 = AlignmentRecord.newBuilder(builtRef1).setCigar("4M").setEnd(5L).build() val baseRecord2 = AlignmentRecord.newBuilder(builtRef2).setCigar("4M").setEnd(5L).build() - val baseRdd = sc.parallelize(Seq(baseRecord1, baseRecord2)).keyBy(ReferenceRegion(_)) - 
val recordsRdd = sc.parallelize(Seq(record1, record2, record3)).keyBy(ReferenceRegion(_)) + val baseRdd = sc.parallelize(Seq(baseRecord1, baseRecord2)).keyBy(ReferenceRegion.unstranded(_)) + val recordsRdd = sc.parallelize(Seq(record1, record2, record3)).keyBy(ReferenceRegion.unstranded(_)) assert( InnerShuffleRegionJoin[AlignmentRecord, AlignmentRecord](seqDict, partitionSize, sc) @@ -138,7 +138,7 @@ class InnerShuffleRegionJoinSuite extends ADAMFunSuite { object InnerShuffleRegionJoinSuite { def getReferenceRegion(record: AlignmentRecord): ReferenceRegion = - ReferenceRegion(record) + ReferenceRegion.unstranded(record) def merge(prev: Boolean, next: (AlignmentRecord, AlignmentRecord)): Boolean = prev && getReferenceRegion(next._1).overlaps(getReferenceRegion(next._2)) diff --git a/src/test/scala/org/bdgenomics/adam/rdd/InnerBroadcastRegionJoinSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/InnerTreeRegionJoinSuite.scala similarity index 77% rename from src/test/scala/org/bdgenomics/adam/rdd/InnerBroadcastRegionJoinSuite.scala rename to src/test/scala/org/bdgenomics/adam/rdd/InnerTreeRegionJoinSuite.scala index a5cf3fc9d2..a9439e430d 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/InnerBroadcastRegionJoinSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/InnerTreeRegionJoinSuite.scala @@ -22,7 +22,7 @@ import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig } -class InnerBroadcastRegionJoinSuite extends ADAMFunSuite { +class InnerTreeRegionJoinSuite extends ADAMFunSuite { sparkTest("Ensure same reference regions get passed together") { val contig = Contig.newBuilder @@ -41,24 +41,24 @@ class InnerBroadcastRegionJoinSuite extends ADAMFunSuite { val record1 = builder.build() val record2 = builder.build() - val rdd1 = sc.parallelize(Seq(record1)).keyBy(ReferenceRegion(_)) - val rdd2 = sc.parallelize(Seq(record2)).keyBy(ReferenceRegion(_)) + val rdd1 = sc.parallelize(Seq(record1)).keyBy(ReferenceRegion.unstranded(_)) + val rdd2 = sc.parallelize(Seq(record2)).keyBy(ReferenceRegion.unstranded(_)) - assert(InnerBroadcastRegionJoinSuite.getReferenceRegion(record1) === - InnerBroadcastRegionJoinSuite.getReferenceRegion(record2)) + assert(InnerTreeRegionJoinSuite.getReferenceRegion(record1) === + InnerTreeRegionJoinSuite.getReferenceRegion(record2)) - assert(InnerBroadcastRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( + assert(InnerTreeRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( rdd1, rdd2).aggregate(true)( - InnerBroadcastRegionJoinSuite.merge, - InnerBroadcastRegionJoinSuite.and)) + InnerTreeRegionJoinSuite.merge, + InnerTreeRegionJoinSuite.and)) - assert(InnerBroadcastRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( + assert(InnerTreeRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( rdd1, rdd2) .aggregate(0)( - InnerBroadcastRegionJoinSuite.count, - InnerBroadcastRegionJoinSuite.sum) === 1) + InnerTreeRegionJoinSuite.count, + InnerTreeRegionJoinSuite.sum) === 1) } sparkTest("Overlapping reference regions") { @@ -80,17 +80,17 @@ class InnerBroadcastRegionJoinSuite extends ADAMFunSuite { val record2 = AlignmentRecord.newBuilder(built).setStart(3L).setEnd(4L).build() val baseRecord = AlignmentRecord.newBuilder(built).setCigar("4M").setEnd(5L).build() - val baseRdd = sc.parallelize(Seq(baseRecord)).keyBy(ReferenceRegion(_)) - val recordsRdd = sc.parallelize(Seq(record1, record2)).keyBy(ReferenceRegion(_)) + val 
baseRdd = sc.parallelize(Seq(baseRecord)).keyBy(ReferenceRegion.unstranded(_)) + val recordsRdd = sc.parallelize(Seq(record1, record2)).keyBy(ReferenceRegion.unstranded(_)) - assert(InnerBroadcastRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( + assert(InnerTreeRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( baseRdd, recordsRdd) .aggregate(true)( - InnerBroadcastRegionJoinSuite.merge, - InnerBroadcastRegionJoinSuite.and)) + InnerTreeRegionJoinSuite.merge, + InnerTreeRegionJoinSuite.and)) - assert(InnerBroadcastRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( + assert(InnerTreeRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( baseRdd, recordsRdd).count() === 2) } @@ -129,25 +129,25 @@ class InnerBroadcastRegionJoinSuite extends ADAMFunSuite { val baseRecord1 = AlignmentRecord.newBuilder(builtRef1).setCigar("4M").setEnd(5L).build() val baseRecord2 = AlignmentRecord.newBuilder(builtRef2).setCigar("4M").setEnd(5L).build() - val baseRdd = sc.parallelize(Seq(baseRecord1, baseRecord2)).keyBy(ReferenceRegion(_)) - val recordsRdd = sc.parallelize(Seq(record1, record2, record3)).keyBy(ReferenceRegion(_)) + val baseRdd = sc.parallelize(Seq(baseRecord1, baseRecord2)).keyBy(ReferenceRegion.unstranded(_)) + val recordsRdd = sc.parallelize(Seq(record1, record2, record3)).keyBy(ReferenceRegion.unstranded(_)) - assert(InnerBroadcastRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( + assert(InnerTreeRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( baseRdd, recordsRdd) .aggregate(true)( - InnerBroadcastRegionJoinSuite.merge, - InnerBroadcastRegionJoinSuite.and)) + InnerTreeRegionJoinSuite.merge, + InnerTreeRegionJoinSuite.and)) - assert(InnerBroadcastRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( + assert(InnerTreeRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( baseRdd, recordsRdd).count() === 3) } } -object InnerBroadcastRegionJoinSuite { +object InnerTreeRegionJoinSuite { def getReferenceRegion(record: AlignmentRecord): ReferenceRegion = - ReferenceRegion(record) + ReferenceRegion.unstranded(record) def merge(prev: Boolean, next: (AlignmentRecord, AlignmentRecord)): Boolean = prev && getReferenceRegion(next._1).overlaps(getReferenceRegion(next._2)) diff --git a/src/test/scala/org/bdgenomics/adam/rdd/OuterRegionJoinSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/OuterRegionJoinSuite.scala index 5c5a826155..6791452253 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/OuterRegionJoinSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/OuterRegionJoinSuite.scala @@ -53,8 +53,8 @@ trait OuterRegionJoinSuite extends ADAMFunSuite { .setEnd(11L) .build() - val rdd1 = sc.parallelize(Seq(record1, record3)).keyBy(ReferenceRegion(_)) - val rdd2 = sc.parallelize(Seq(record2, record4)).keyBy(ReferenceRegion(_)) + val rdd1 = sc.parallelize(Seq(record1, record3)).keyBy(ReferenceRegion.unstranded(_)) + val rdd2 = sc.parallelize(Seq(record2, record4)).keyBy(ReferenceRegion.unstranded(_)) val jrdd = runJoin(rdd1, rdd2).cache() @@ -86,8 +86,8 @@ trait OuterRegionJoinSuite extends ADAMFunSuite { val baseRecord = AlignmentRecord.newBuilder(built).setCigar("4M").setEnd(5L).build() val record3 = AlignmentRecord.newBuilder(built).setStart(6L).setEnd(7L).build() - val baseRdd = sc.parallelize(Seq(baseRecord)).keyBy(ReferenceRegion(_)) - val recordsRdd = sc.parallelize(Seq(record1, record2, record3)).keyBy(ReferenceRegion(_)) + val baseRdd = 
sc.parallelize(Seq(baseRecord)).keyBy(ReferenceRegion.unstranded(_)) + val recordsRdd = sc.parallelize(Seq(record1, record2, record3)).keyBy(ReferenceRegion.unstranded(_)) val jrdd = runJoin(baseRdd, recordsRdd).cache diff --git a/src/test/scala/org/bdgenomics/adam/rdd/RightOuterBroadcastRegionJoinSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/RightOuterTreeRegionJoinSuite.scala similarity index 88% rename from src/test/scala/org/bdgenomics/adam/rdd/RightOuterBroadcastRegionJoinSuite.scala rename to src/test/scala/org/bdgenomics/adam/rdd/RightOuterTreeRegionJoinSuite.scala index 5a32005426..6cf4893387 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/RightOuterBroadcastRegionJoinSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/RightOuterTreeRegionJoinSuite.scala @@ -21,11 +21,11 @@ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.formats.avro.AlignmentRecord -class RightOuterBroadcastRegionJoinSuite extends OuterRegionJoinSuite { +class RightOuterTreeRegionJoinSuite extends OuterRegionJoinSuite { def runJoin(leftRdd: RDD[(ReferenceRegion, AlignmentRecord)], rightRdd: RDD[(ReferenceRegion, AlignmentRecord)]): RDD[(Option[AlignmentRecord], AlignmentRecord)] = { - RightOuterBroadcastRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( + RightOuterTreeRegionJoin[AlignmentRecord, AlignmentRecord]().partitionAndJoin( leftRdd, rightRdd) } diff --git a/src/test/scala/org/bdgenomics/adam/rdd/TreeRegionJoinSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/TreeRegionJoinSuite.scala new file mode 100644 index 0000000000..b5ac8e1a9a --- /dev/null +++ b/src/test/scala/org/bdgenomics/adam/rdd/TreeRegionJoinSuite.scala @@ -0,0 +1,85 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.bdgenomics.adam.rdd + +import org.bdgenomics.adam.models.ReferenceRegion +import org.bdgenomics.adam.util.ADAMFunSuite +import org.bdgenomics.formats.avro.{ AlignmentRecord, Variant } +import scala.reflect.ClassTag + +private case class ConcreteTreeRegionJoin[T: ClassTag, U]() extends TreeRegionJoin[T, U] { +} + +class TreeRegionJoinSuite extends ADAMFunSuite { + + sparkTest("run a join between data on a single contig") { + + val rightRdd = sc.parallelize(Seq( + (ReferenceRegion("chr1", 10L, 20L), 0), + (ReferenceRegion("chr1", 15L, 25L), 1), + (ReferenceRegion("chr1", 30L, 50L), 2), + (ReferenceRegion("chr1", 60L, 70L), 3), + (ReferenceRegion("chr1", 90L, 100L), 4))) + .map(kv => { + val (k, v) = kv + // i have made many poor life decisions + (k, Variant.newBuilder + .setStart(v.toLong) + .build) + }) + + val leftRdd = sc.parallelize(Seq( + (ReferenceRegion("chr1", 12L, 22L), 0), + (ReferenceRegion("chr1", 20L, 35L), 1), + (ReferenceRegion("chr1", 40L, 55L), 2), + (ReferenceRegion("chr1", 75L, 85L), 3), + (ReferenceRegion("chr1", 95L, 105L), 4))) + .map(kv => { + val (k, v) = kv + // and this is but another one of them + (k, AlignmentRecord.newBuilder + .setStart(v.toLong) + .build) + }) + + val joinData = ConcreteTreeRegionJoin().runJoinAndGroupByRight(rightRdd, leftRdd) + .map(kv => { + val (k, v) = kv + (k.map(_.getStart.toInt), v.getStart.toInt) + }).collect + + assert(joinData.size === 5) + + val joinMap = joinData.filter(_._1.nonEmpty) + .map(_.swap) + .toMap + .mapValues(_.toSet) + + assert(joinMap.size === 4) + assert(joinMap(0).size === 2) + assert(joinMap(0)(0)) + assert(joinMap(0)(1)) + assert(joinMap(1).size === 2) + assert(joinMap(1)(1)) + assert(joinMap(1)(2)) + assert(joinMap(2).size === 1) + assert(joinMap(2)(2)) + assert(joinMap(4).size === 1) + assert(joinMap(4)(4)) + } +} diff --git a/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala index 0b995cbbdb..0868984a11 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala @@ -538,5 +538,199 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals { assert(coverage.rdd.count == 19) } + + sparkTest("use broadcast join to pull down features mapped to targets") { + val featuresPath = testFile("small.1.narrowPeak") + val targetsPath = testFile("small.1.bed") + + val features = sc.loadFeatures(featuresPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = features.broadcastRegionJoin(targets) + + assert(jRdd.rdd.count === 5L) + } + + sparkTest("use right outer broadcast join to pull down features mapped to targets") { + val featuresPath = testFile("small.1.narrowPeak") + val targetsPath = testFile("small.1.bed") + + val features = sc.loadFeatures(featuresPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = features.rightOuterBroadcastRegionJoin(targets) + + val c = jRdd.rdd.collect + assert(c.count(_._1.isEmpty) === 1) + assert(c.count(_._1.isDefined) === 5) + } + + def sd = { + sc.loadBam(testFile("small.1.sam")) + .sequences + } + + sparkTest("use shuffle join to pull down features mapped to targets") { + val featuresPath = testFile("small.1.narrowPeak") + val targetsPath = testFile("small.1.bed") + + val features = sc.loadFeatures(featuresPath) + .transform(_.repartition(1)) + .copy(sequences = sd) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val 
jRdd = features.shuffleRegionJoin(targets) + val jRdd0 = features.shuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + assert(jRdd.rdd.count === 5L) + assert(jRdd0.rdd.count === 5L) + } + + sparkTest("use right outer shuffle join to pull down features mapped to targets") { + val featuresPath = testFile("small.1.narrowPeak") + val targetsPath = testFile("small.1.bed") + + val features = sc.loadFeatures(featuresPath) + .transform(_.repartition(1)) + .copy(sequences = sd) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = features.rightOuterShuffleRegionJoin(targets) + val jRdd0 = features.rightOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._1.isEmpty) === 1) + assert(c0.count(_._1.isEmpty) === 1) + assert(c.count(_._1.isDefined) === 5) + assert(c0.count(_._1.isDefined) === 5) + } + + sparkTest("use left outer shuffle join to pull down features mapped to targets") { + val featuresPath = testFile("small.1.narrowPeak") + val targetsPath = testFile("small.1.bed") + + val features = sc.loadFeatures(featuresPath) + .transform(_.repartition(1)) + .copy(sequences = sd) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = features.leftOuterShuffleRegionJoin(targets) + val jRdd0 = features.leftOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._2.isEmpty) === 15) + assert(c0.count(_._2.isEmpty) === 15) + assert(c.count(_._2.isDefined) === 5) + assert(c0.count(_._2.isDefined) === 5) + } + + sparkTest("use full outer shuffle join to pull down features mapped to targets") { + val featuresPath = testFile("small.1.narrowPeak") + val targetsPath = testFile("small.1.bed") + + val features = sc.loadFeatures(featuresPath) + .transform(_.repartition(1)) + .copy(sequences = sd) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = features.fullOuterShuffleRegionJoin(targets) + val jRdd0 = features.fullOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c0.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c.count(t => t._1.isDefined && t._2.isEmpty) === 15) + assert(c0.count(t => t._1.isDefined && t._2.isEmpty) === 15) + assert(c.count(t => t._1.isEmpty && t._2.isDefined) === 1) + assert(c0.count(t => t._1.isEmpty && t._2.isDefined) === 1) + assert(c.count(t => t._1.isDefined && t._2.isDefined) === 5) + assert(c0.count(t => t._1.isDefined && t._2.isDefined) === 5) + } + + sparkTest("use shuffle join with group by to pull down features 
mapped to targets") { + val featuresPath = testFile("small.1.narrowPeak") + val targetsPath = testFile("small.1.bed") + + val features = sc.loadFeatures(featuresPath) + .transform(_.repartition(1)) + .copy(sequences = sd) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = features.shuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = features.shuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.size === 5) + assert(c0.size === 5) + assert(c.forall(_._2.size == 1)) + assert(c0.forall(_._2.size == 1)) + } + + sparkTest("use right outer shuffle join with group by to pull down features mapped to targets") { + val featuresPath = testFile("small.1.narrowPeak") + val targetsPath = testFile("small.1.bed") + + val features = sc.loadFeatures(featuresPath) + .transform(_.repartition(1)) + .copy(sequences = sd) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = features.rightOuterShuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = features.rightOuterShuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + + assert(c.count(_._1.isDefined) === 20) + assert(c0.count(_._1.isDefined) === 20) + assert(c.filter(_._1.isDefined).count(_._2.size == 1) === 5) + assert(c0.filter(_._1.isDefined).count(_._2.size == 1) === 5) + assert(c.filter(_._1.isDefined).count(_._2.isEmpty) === 15) + assert(c0.filter(_._1.isDefined).count(_._2.isEmpty) === 15) + assert(c.count(_._1.isEmpty) === 1) + assert(c0.count(_._1.isEmpty) === 1) + assert(c.filter(_._1.isEmpty).forall(_._2.size == 1)) + assert(c0.filter(_._1.isEmpty).forall(_._2.size == 1)) + } } diff --git a/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala index 28b37da642..34a3401455 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala @@ -43,4 +43,186 @@ class FragmentRDDSuite extends ADAMFunSuite { val newRecords = pipedRdd.rdd.count assert(2 * records === newRecords) } + + sparkTest("use broadcast join to pull down fragments mapped to targets") { + val fragmentsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val fragments = sc.loadFragments(fragmentsPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = fragments.broadcastRegionJoin(targets) + + assert(jRdd.rdd.count === 5) + } + + sparkTest("use right outer broadcast join to pull down fragments mapped to targets") { + val fragmentsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val fragments = sc.loadFragments(fragmentsPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = fragments.rightOuterBroadcastRegionJoin(targets) + + val c = jRdd.rdd.collect + assert(c.count(_._1.isEmpty) === 1) + assert(c.count(_._1.isDefined) === 5) + } + + sparkTest("use shuffle join to pull down fragments mapped to targets") { + val 
fragmentsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val fragments = sc.loadFragments(fragmentsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = fragments.shuffleRegionJoin(targets) + val jRdd0 = fragments.shuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + assert(jRdd.rdd.count === 5) + assert(jRdd0.rdd.count === 5) + } + + sparkTest("use right outer shuffle join to pull down fragments mapped to targets") { + val fragmentsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val fragments = sc.loadFragments(fragmentsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = fragments.rightOuterShuffleRegionJoin(targets) + val jRdd0 = fragments.rightOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._1.isEmpty) === 1) + assert(c0.count(_._1.isEmpty) === 1) + assert(c.count(_._1.isDefined) === 5) + assert(c0.count(_._1.isDefined) === 5) + } + + sparkTest("use left outer shuffle join to pull down fragments mapped to targets") { + val fragmentsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val fragments = sc.loadFragments(fragmentsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = fragments.leftOuterShuffleRegionJoin(targets) + val jRdd0 = fragments.leftOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._2.isEmpty) === 15) + assert(c0.count(_._2.isEmpty) === 15) + assert(c.count(_._2.isDefined) === 5) + assert(c0.count(_._2.isDefined) === 5) + } + + sparkTest("use full outer shuffle join to pull down fragments mapped to targets") { + val fragmentsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val fragments = sc.loadFragments(fragmentsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = fragments.fullOuterShuffleRegionJoin(targets) + val jRdd0 = fragments.fullOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c0.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c.count(t => t._1.isDefined && t._2.isEmpty) === 15) + assert(c0.count(t => t._1.isDefined && t._2.isEmpty) === 15) + assert(c.count(t => t._1.isEmpty && t._2.isDefined) === 1) + assert(c0.count(t => t._1.isEmpty && t._2.isDefined) === 1) + assert(c.count(t 
=> t._1.isDefined && t._2.isDefined) === 5) + assert(c0.count(t => t._1.isDefined && t._2.isDefined) === 5) + } + + sparkTest("use shuffle join with group by to pull down fragments mapped to targets") { + val fragmentsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val fragments = sc.loadFragments(fragmentsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = fragments.shuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = fragments.shuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.size === 5) + assert(c0.size === 5) + assert(c.forall(_._2.size == 1)) + assert(c0.forall(_._2.size == 1)) + } + + sparkTest("use right outer shuffle join with group by to pull down fragments mapped to targets") { + val fragmentsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val fragments = sc.loadFragments(fragmentsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = fragments.rightOuterShuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = fragments.rightOuterShuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._1.isDefined) === 20) + assert(c0.count(_._1.isDefined) === 20) + assert(c.filter(_._1.isDefined).count(_._2.size == 1) === 5) + assert(c0.filter(_._1.isDefined).count(_._2.size == 1) === 5) + assert(c.filter(_._1.isDefined).count(_._2.isEmpty) === 15) + assert(c0.filter(_._1.isDefined).count(_._2.isEmpty) === 15) + assert(c.count(_._1.isEmpty) === 1) + assert(c0.count(_._1.isEmpty) === 1) + assert(c.filter(_._1.isEmpty).forall(_._2.size == 1)) + assert(c0.filter(_._1.isEmpty).forall(_._2.size == 1)) + } } diff --git a/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDSuite.scala index b5464ac819..81a97908f3 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDSuite.scala @@ -708,4 +708,187 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val tempBam = sc.loadBam(tempPath) assert(tempBam.rdd.count === ardd.rdd.count) } + + sparkTest("use broadcast join to pull down reads mapped to targets") { + val readsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val reads = sc.loadAlignments(readsPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = reads.broadcastRegionJoin(targets) + + assert(jRdd.rdd.count === 5) + } + + sparkTest("use right outer broadcast join to pull down reads mapped to targets") { + val readsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val reads = sc.loadAlignments(readsPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = reads.rightOuterBroadcastRegionJoin(targets) + + val c = jRdd.rdd.collect + assert(c.count(_._1.isEmpty) === 1) + 
assert(c.count(_._1.isDefined) === 5) + } + + sparkTest("use shuffle join to pull down reads mapped to targets") { + val readsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val reads = sc.loadAlignments(readsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = reads.shuffleRegionJoin(targets) + val jRdd0 = reads.shuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + assert(jRdd.rdd.count === 5) + assert(jRdd0.rdd.count === 5) + } + + sparkTest("use right outer shuffle join to pull down reads mapped to targets") { + val readsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val reads = sc.loadAlignments(readsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = reads.rightOuterShuffleRegionJoin(targets) + val jRdd0 = reads.rightOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._1.isEmpty) === 1) + assert(c0.count(_._1.isEmpty) === 1) + assert(c.count(_._1.isDefined) === 5) + assert(c0.count(_._1.isDefined) === 5) + } + + sparkTest("use left outer shuffle join to pull down reads mapped to targets") { + val readsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val reads = sc.loadAlignments(readsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = reads.leftOuterShuffleRegionJoin(targets) + val jRdd0 = reads.leftOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._2.isEmpty) === 15) + assert(c0.count(_._2.isEmpty) === 15) + assert(c.count(_._2.isDefined) === 5) + assert(c0.count(_._2.isDefined) === 5) + } + + sparkTest("use full outer shuffle join to pull down reads mapped to targets") { + val readsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val reads = sc.loadAlignments(readsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = reads.fullOuterShuffleRegionJoin(targets) + val jRdd0 = reads.fullOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c0.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c.count(t => t._1.isDefined && t._2.isEmpty) === 15) + assert(c0.count(t => t._1.isDefined && t._2.isEmpty) === 15) + assert(c.count(t => t._1.isEmpty && t._2.isDefined) === 1) + assert(c0.count(t => t._1.isEmpty && 
t._2.isDefined) === 1) + assert(c.count(t => t._1.isDefined && t._2.isDefined) === 5) + assert(c0.count(t => t._1.isDefined && t._2.isDefined) === 5) + } + + sparkTest("use shuffle join with group by to pull down reads mapped to targets") { + val readsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val reads = sc.loadAlignments(readsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = reads.shuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = reads.shuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.size === 5) + assert(c0.size === 5) + assert(c.forall(_._2.size == 1)) + assert(c0.forall(_._2.size == 1)) + } + + sparkTest("use right outer shuffle join with group by to pull down reads mapped to targets") { + val readsPath = testFile("small.1.sam") + val targetsPath = testFile("small.1.bed") + + val reads = sc.loadAlignments(readsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = reads.rightOuterShuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = reads.rightOuterShuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + + assert(c.count(_._1.isDefined) === 20) + assert(c0.count(_._1.isDefined) === 20) + assert(c.filter(_._1.isDefined).count(_._2.size == 1) === 5) + assert(c0.filter(_._1.isDefined).count(_._2.size == 1) === 5) + assert(c.filter(_._1.isDefined).count(_._2.isEmpty) === 15) + assert(c0.filter(_._1.isDefined).count(_._2.isEmpty) === 15) + assert(c.count(_._1.isEmpty) === 1) + assert(c0.count(_._1.isEmpty) === 1) + assert(c.filter(_._1.isEmpty).forall(_._2.size == 1)) + assert(c0.filter(_._1.isEmpty).forall(_._2.size == 1)) + } } diff --git a/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala index 7d5e09b4a4..4db692fdd1 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala @@ -23,7 +23,6 @@ import org.bdgenomics.adam.models.{ RecordGroupDictionary, SequenceDictionary } -import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig } diff --git a/src/test/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTargetSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTargetSuite.scala index d085247744..cb95e99221 100644 --- a/src/test/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTargetSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTargetSuite.scala @@ -22,7 +22,7 @@ import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rich.RichAlignmentRecord import org.bdgenomics.adam.util.ADAMFunSuite -import 
org.bdgenomics.formats.avro.{ AlignmentRecord, Contig } +import org.bdgenomics.formats.avro.AlignmentRecord class IndelRealignmentTargetSuite extends ADAMFunSuite { diff --git a/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDDSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDDSuite.scala new file mode 100644 index 0000000000..eddf8b0543 --- /dev/null +++ b/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDDSuite.scala @@ -0,0 +1,208 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.variant + +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.util.ADAMFunSuite + +class GenotypeRDDSuite extends ADAMFunSuite { + + // these tests will all fail until https://github.com/bigdatagenomics/adam/pull/1291 merges + ignore("use broadcast join to pull down genotypes mapped to targets") { + val genotypesPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val genotypes = sc.loadGenotypes(genotypesPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = genotypes.broadcastRegionJoin(targets) + + assert(jRdd.rdd.count === 9L) + } + + ignore("use right outer broadcast join to pull down genotypes mapped to targets") { + val genotypesPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val genotypes = sc.loadGenotypes(genotypesPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = genotypes.rightOuterBroadcastRegionJoin(targets) + + val c = jRdd.rdd.collect + assert(c.count(_._1.isEmpty) === 3) + assert(c.count(_._1.isDefined) === 9) + } + + ignore("use shuffle join to pull down genotypes mapped to targets") { + val genotypesPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val genotypes = sc.loadGenotypes(genotypesPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = genotypes.shuffleRegionJoin(targets) + val jRdd0 = genotypes.shuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + assert(jRdd.rdd.count === 3L) + assert(jRdd0.rdd.count === 3L) + } + + ignore("use right outer shuffle join to pull down genotypes mapped to targets") { + val genotypesPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val genotypes = sc.loadGenotypes(genotypesPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = genotypes.rightOuterShuffleRegionJoin(targets) + val jRdd0 = genotypes.rightOuterShuffleRegionJoin(targets, optPartitions = 
Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._1.isEmpty) === 3) + assert(c0.count(_._1.isEmpty) === 3) + assert(c.count(_._1.isDefined) === 9) + assert(c0.count(_._1.isDefined) === 9) + } + + ignore("use left outer shuffle join to pull down genotypes mapped to targets") { + val genotypesPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val genotypes = sc.loadGenotypes(genotypesPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = genotypes.leftOuterShuffleRegionJoin(targets) + val jRdd0 = genotypes.leftOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._2.isEmpty) === 9) + assert(c0.count(_._2.isEmpty) === 9) + assert(c.count(_._2.isDefined) === 9) + assert(c0.count(_._2.isDefined) === 9) + } + + ignore("use full outer shuffle join to pull down genotypes mapped to targets") { + val genotypesPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val genotypes = sc.loadGenotypes(genotypesPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = genotypes.fullOuterShuffleRegionJoin(targets) + val jRdd0 = genotypes.fullOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c0.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c.count(t => t._1.isDefined && t._2.isEmpty) === 9) + assert(c0.count(t => t._1.isDefined && t._2.isEmpty) === 9) + assert(c.count(t => t._1.isEmpty && t._2.isDefined) === 3) + assert(c0.count(t => t._1.isEmpty && t._2.isDefined) === 3) + assert(c.count(t => t._1.isDefined && t._2.isDefined) === 9) + assert(c0.count(t => t._1.isDefined && t._2.isDefined) === 9) + } + + ignore("use shuffle join with group by to pull down genotypes mapped to targets") { + val genotypesPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val genotypes = sc.loadGenotypes(genotypesPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = genotypes.shuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = genotypes.shuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.size === 9) + assert(c0.size === 9) + assert(c.forall(_._2.size == 1)) + assert(c0.forall(_._2.size == 1)) + } + + ignore("use right outer shuffle join with group by to pull down genotypes mapped to targets") { + val genotypesPath = testFile("small.vcf") + 
val targetsPath = testFile("small.1.bed") + + val genotypes = sc.loadGenotypes(genotypesPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = genotypes.rightOuterShuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = genotypes.rightOuterShuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + + assert(c.count(_._1.isDefined) === 18) + assert(c0.count(_._1.isDefined) === 18) + assert(c.filter(_._1.isDefined).count(_._2.size == 1) === 9) + assert(c0.filter(_._1.isDefined).count(_._2.size == 1) === 9) + assert(c.filter(_._1.isDefined).count(_._2.isEmpty) === 9) + assert(c0.filter(_._1.isDefined).count(_._2.isEmpty) === 9) + assert(c.count(_._1.isEmpty) === 3) + assert(c0.count(_._1.isEmpty) === 3) + assert(c.filter(_._1.isEmpty).forall(_._2.size == 1)) + assert(c0.filter(_._1.isEmpty).forall(_._2.size == 1)) + } +} diff --git a/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantRDDSuite.scala b/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantRDDSuite.scala new file mode 100644 index 0000000000..9a4ca21196 --- /dev/null +++ b/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantRDDSuite.scala @@ -0,0 +1,207 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.bdgenomics.adam.rdd.variant + +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.util.ADAMFunSuite + +class VariantRDDSuite extends ADAMFunSuite { + + sparkTest("use broadcast join to pull down variants mapped to targets") { + val variantsPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val variants = sc.loadVariants(variantsPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = variants.broadcastRegionJoin(targets) + + assert(jRdd.rdd.count === 3L) + } + + sparkTest("use right outer broadcast join to pull down variants mapped to targets") { + val variantsPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val variants = sc.loadVariants(variantsPath) + val targets = sc.loadFeatures(targetsPath) + + val jRdd = variants.rightOuterBroadcastRegionJoin(targets) + + val c = jRdd.rdd.collect + assert(c.count(_._1.isEmpty) === 3) + assert(c.count(_._1.isDefined) === 3) + } + + sparkTest("use shuffle join to pull down variants mapped to targets") { + val variantsPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val variants = sc.loadVariants(variantsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = variants.shuffleRegionJoin(targets) + val jRdd0 = variants.shuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + assert(jRdd.rdd.count === 3L) + assert(jRdd0.rdd.count === 3L) + } + + sparkTest("use right outer shuffle join to pull down variants mapped to targets") { + val variantsPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val variants = sc.loadVariants(variantsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = variants.rightOuterShuffleRegionJoin(targets) + val jRdd0 = variants.rightOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._1.isEmpty) === 3) + assert(c0.count(_._1.isEmpty) === 3) + assert(c.count(_._1.isDefined) === 3) + assert(c0.count(_._1.isDefined) === 3) + } + + sparkTest("use left outer shuffle join to pull down variants mapped to targets") { + val variantsPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val variants = sc.loadVariants(variantsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = variants.leftOuterShuffleRegionJoin(targets) + val jRdd0 = variants.leftOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(_._2.isEmpty) === 3) + assert(c0.count(_._2.isEmpty) === 3) + assert(c.count(_._2.isDefined) === 3) + assert(c0.count(_._2.isDefined) === 3) + } + + sparkTest("use full outer shuffle join to pull down variants mapped to 
targets") { + val variantsPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val variants = sc.loadVariants(variantsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = variants.fullOuterShuffleRegionJoin(targets) + val jRdd0 = variants.fullOuterShuffleRegionJoin(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c0.count(t => t._1.isEmpty && t._2.isEmpty) === 0) + assert(c.count(t => t._1.isDefined && t._2.isEmpty) === 3) + assert(c0.count(t => t._1.isDefined && t._2.isEmpty) === 3) + assert(c.count(t => t._1.isEmpty && t._2.isDefined) === 3) + assert(c0.count(t => t._1.isEmpty && t._2.isDefined) === 3) + assert(c.count(t => t._1.isDefined && t._2.isDefined) === 3) + assert(c0.count(t => t._1.isDefined && t._2.isDefined) === 3) + } + + sparkTest("use shuffle join with group by to pull down variants mapped to targets") { + val variantsPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val variants = sc.loadVariants(variantsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = variants.shuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = variants.shuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + assert(c.size === 3) + assert(c0.size === 3) + assert(c.forall(_._2.size == 1)) + assert(c0.forall(_._2.size == 1)) + } + + sparkTest("use right outer shuffle join with group by to pull down variants mapped to targets") { + val variantsPath = testFile("small.vcf") + val targetsPath = testFile("small.1.bed") + + val variants = sc.loadVariants(variantsPath) + .transform(_.repartition(1)) + val targets = sc.loadFeatures(targetsPath) + .transform(_.repartition(1)) + + val jRdd = variants.rightOuterShuffleRegionJoinAndGroupByLeft(targets) + val jRdd0 = variants.rightOuterShuffleRegionJoinAndGroupByLeft(targets, optPartitions = Some(4)) + + // we can't guarantee that we get exactly the number of partitions requested, + // we get close though + assert(jRdd.rdd.partitions.length === 1) + assert(jRdd0.rdd.partitions.length === 5) + + val c = jRdd.rdd.collect + val c0 = jRdd0.rdd.collect + + assert(c.count(_._1.isDefined) === 6) + assert(c0.count(_._1.isDefined) === 6) + assert(c.filter(_._1.isDefined).count(_._2.size == 1) === 3) + assert(c0.filter(_._1.isDefined).count(_._2.size == 1) === 3) + assert(c.filter(_._1.isDefined).count(_._2.isEmpty) === 3) + assert(c0.filter(_._1.isDefined).count(_._2.isEmpty) === 3) + assert(c.count(_._1.isEmpty) === 3) + assert(c0.count(_._1.isEmpty) === 3) + assert(c.filter(_._1.isEmpty).forall(_._2.size == 1)) + assert(c0.filter(_._1.isEmpty).forall(_._2.size == 1)) + } +} diff --git a/src/test/scala/org/bdgenomics/adam/rich/RichAlignmentRecordSuite.scala b/src/test/scala/org/bdgenomics/adam/rich/RichAlignmentRecordSuite.scala index 09765e8355..d9e5ab4174 100644 --- 
a/src/test/scala/org/bdgenomics/adam/rich/RichAlignmentRecordSuite.scala +++ b/src/test/scala/org/bdgenomics/adam/rich/RichAlignmentRecordSuite.scala @@ -21,7 +21,6 @@ import org.bdgenomics.adam.models.{ ReferencePosition, TagType, Attribute } import org.bdgenomics.adam.rich.RichAlignmentRecord._ import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig } import org.scalatest.FunSuite -import org.scalatest.exceptions.TestFailedException class RichAlignmentRecordSuite extends FunSuite {