From 1da1c6e1b94555f7ac7ea27264db9fa9fa072fe5 Mon Sep 17 00:00:00 2001 From: Neal Sidhwaney Date: Thu, 13 Mar 2014 16:36:08 -0400 Subject: [PATCH] Migrate from reference ids to reference names We decided that using RefSeq names for our contigs was better practice than minting our own IDs and handling complicated sequence dictionary merges. --- .../bdgenomics/adam/cli/CalculateDepth.scala | 8 +- .../org/bdgenomics/adam/cli/FindReads.scala | 2 +- .../org/bdgenomics/adam/cli/FlagStat.scala | 2 +- .../org/bdgenomics/adam/cli/ListDict.scala | 4 +- .../adam/converters/FastaConverter.scala | 12 +- .../adam/converters/SAMRecordConverter.scala | 18 +- .../converters/VariantContextConverter.scala | 1 - .../adam/metrics/AvailableComparisons.scala | 9 +- .../adam/models/ReferenceMapping.scala | 5 +- .../adam/models/ReferencePosition.scala | 40 ++- .../adam/models/ReferenceRegion.scala | 36 +-- .../adam/models/SequenceDictionary.scala | 211 +++----------- .../ADAMNucleotideContigFragmentField.scala | 2 +- .../adam/projections/ADAMRecordField.scala | 2 +- .../org/bdgenomics/adam/rdd/ADAMContext.scala | 64 ++--- .../adam/rdd/ADAMRDDFunctions.scala | 52 ++-- .../org/bdgenomics/adam/rdd/FlagStat.scala | 7 +- .../adam/rdd/GenomicRegionPartitioner.scala | 36 +-- .../adam/rdd/PileupAggregator.scala | 9 +- .../adam/rdd/Reads2PileupProcessor.scala | 5 +- .../org/bdgenomics/adam/rdd/RegionJoin.scala | 59 ++-- .../bdgenomics/adam/rich/DecadentRead.scala | 2 +- .../adam/rich/ReferenceMappingContext.scala | 12 +- .../adam/rich/RichADAMVariant.scala | 6 +- .../adam/rich/RichRDDReferenceRecords.scala | 50 ---- .../adam/util/IntervalListReader.scala | 9 +- .../adam/util/PileupTraversable.scala | 20 +- .../scala/org/bdgenomics/adam/util/Util.scala | 7 + .../bdgenomics/adam/util/VcfHeaderUtils.scala | 4 +- .../adam/converters/FastaConverterSuite.scala | 29 +- .../VariantContextConverterSuite.scala | 5 +- .../adam/metrics/ComparisonsSuite.scala | 16 +- .../adam/models/ReferencePositionSuite.scala | 22 +- .../adam/models/ReferenceRegionSuite.scala | 105 +++---- .../adam/models/SequenceDictionarySuite.scala | 213 ++++---------- .../adam/rdd/ADAMContextSuite.scala | 13 +- .../adam/rdd/ADAMRDDFunctionsSuite.scala | 265 ++++++++++-------- .../rdd/GenomicRegionPartitionerSuite.scala | 56 ++-- .../adam/rdd/GenotypesSummarySuite.scala | 1 - .../adam/rdd/MarkDuplicatesSuite.scala | 60 ++-- .../adam/rdd/PileupAggregationSuite.scala | 14 +- .../bdgenomics/adam/rdd/RegionJoinSuite.scala | 105 ++++--- .../ComparisonTraversalEngineSuite.scala | 17 +- .../ADAMVariantContextRDDFunctionsSuite.scala | 2 +- .../variation/ADAMVariationContextSuite.scala | 2 +- .../ADAMRecordReferenceMappingSuite.scala | 16 +- .../adam/rich/RichADAMVariantSuite.scala | 4 - .../rich/RichRDDReferenceRecordsSuite.scala | 53 ---- adam-format/src/main/resources/avro/adam.avdl | 34 +-- 49 files changed, 731 insertions(+), 995 deletions(-) delete mode 100644 adam-core/src/main/scala/org/bdgenomics/adam/rich/RichRDDReferenceRecords.scala delete mode 100644 adam-core/src/test/scala/org/bdgenomics/adam/rich/RichRDDReferenceRecordsSuite.scala diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala index 39ad661a91..942e830eaa 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala @@ -69,7 +69,7 @@ class CalculateDepth(protected val args: CalculateDepthArgs) extends 
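The core of the migration, as a minimal self-contained sketch: OldPosition and NewPosition are hypothetical stand-ins for the pre- and post-migration ReferencePosition, not the real classes.

// Hypothetical stand-ins illustrating the shape of the change; the real classes
// live in org.bdgenomics.adam.models and carry more behavior than shown here.
case class OldPosition(refId: Int, pos: Long)             // pre-patch: keyed by a minted integer id
case class NewPosition(referenceName: String, pos: Long)  // post-patch: keyed by the contig name

object MigrationSketch extends App {
  // With integer ids, combining two inputs required remapping ids so that
  // "chr1" meant the same thing in both; with names, the key is already shared.
  val fromFileA = OldPosition(refId = 0, pos = 100L) // "chr1" happened to be id 0 in file A
  val fromFileB = OldPosition(refId = 3, pos = 100L) // "chr1" happened to be id 3 in file B

  val byNameA = NewPosition("chr1", 100L)
  val byNameB = NewPosition("chr1", 100L)

  println(fromFileA == fromFileB) // false: same locus, different minted ids
  println(byNameA == byNameB)     // true: names agree, no id remapping needed
}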
ADAMSparkCo // Quiet parquet logging... ParquetLogger.hadoopLoggerLevel(Level.SEVERE) - val proj = Projection(referenceId, referenceName, referenceLength, referenceUrl, start, cigar, readMapped) + val proj = Projection(contig, start, cigar, readMapped) val adamRDD: RDD[ADAMRecord] = sc.adamLoad(args.adamInputPath, projection = Some(proj)) val mappedRDD = adamRDD.filter(_.getReadMapped) @@ -112,7 +112,7 @@ class CalculateDepth(protected val args: CalculateDepthArgs) extends ADAMSparkCo depths.collect().foreach { case (region, count) => println("%20s\t%15s\t% 5d".format( - "%s:%d".format(seqDict(region.refId).name, region.start), + "%s:%d".format(region.referenceName, region.start), variantNames(region), count)) } @@ -144,13 +144,11 @@ class CalculateDepth(protected val args: CalculateDepthArgs) extends ADAMSparkCo throw new IllegalArgumentException("chromosome name \"%s\" wasn't in the sequence dictionary (%s)".format( chrom, seqDict.records.map(_.name).mkString(","))) } - val refId = seqDict(chrom).id val start = array(1).toLong val name = array(2) val end = start + array(3).length - (ReferenceRegion(refId, start, end), name) + (ReferenceRegion(chrom, start, end), name) } }.toSeq) } } - diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FindReads.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FindReads.scala index 8d1ddc8d10..bf5ebd1108 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FindReads.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FindReads.scala @@ -135,7 +135,7 @@ class FindReads(protected val args: FindReadsArgs) extends ADAMSparkCommand[Find case (name: CharSequence, ((bucket1: ReadBucket, bucket2: ReadBucket), generated: Seq[Any])) => { val rec1 = bucket1.allReads().head val rec2 = bucket2.allReads().head - (name, "%s:%d".format(rec1.getReferenceName, rec1.getStart), "%s:%d".format(rec2.getReferenceName, rec2.getStart)) + (name, "%s:%d".format(rec1.getContig.getContigName, rec1.getStart), "%s:%d".format(rec2.getContig.getContigName, rec2.getStart)) } } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala index 11e3b01aa5..e7faaef84b 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala @@ -49,7 +49,7 @@ class FlagStat(protected val args: FlagStatArgs) extends ADAMSparkCommand[FlagSt val projection = Projection( ADAMRecordField.readMapped, ADAMRecordField.mateMapped, ADAMRecordField.readPaired, - ADAMRecordField.referenceId, ADAMRecordField.mateReferenceId, + ADAMRecordField.contig, ADAMRecordField.mateContig, ADAMRecordField.primaryAlignment, ADAMRecordField.duplicateRead, ADAMRecordField.readMapped, ADAMRecordField.mateMapped, ADAMRecordField.firstOfPair, ADAMRecordField.secondOfPair, diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala index 4bdbae180a..6c728afcb1 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala @@ -47,9 +47,9 @@ class ListDict(protected val args: ListDictArgs) extends ADAMSparkCommand[ListDi val dict = sc.adamDictionaryLoad[ADAMRecord](args.inputPath) - dict.recordsIn.sortBy(_.id).foreach { + dict.recordsIn.sortBy(_.name.toString).foreach { rec: SequenceRecord => - println("%d\t%s\t%d".format(rec.id, rec.name, rec.length)) + 
println("%s\t%d".format(rec.name, rec.length)) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala index e9b62eb145..85ff36a329 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala @@ -15,9 +15,10 @@ */ package org.bdgenomics.adam.converters -import org.bdgenomics.adam.avro.ADAMNucleotideContigFragment import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMNucleotideContigFragment } +import org.bdgenomics.adam.rdd.ADAMContext._ import scala.Int import scala.math.Ordering.Int import scala.Predef._ @@ -195,18 +196,19 @@ private[converters] class FastaConverter(fragmentLength: Long) extends Serializa .map(si => { val (bases, index) = si + val contig = ADAMContig.newBuilder + .setContigLength(sequenceLength) + val builder = ADAMNucleotideContigFragment.newBuilder() - .setContigId(id) .setFragmentSequence(bases) - .setContigLength(sequenceLength) .setFragmentNumber(index) .setFragmentStartPosition(index * fragmentLength) .setNumberOfFragmentsInContig(fragmentCount) // map over optional fields - name.foreach(builder.setContigName(_)) + name.foreach(contig.setContigName(_)) description.foreach(builder.setDescription(_)) - + builder.setContig(contig.build) // build and return builder.build() }) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala index 6c04b8ae81..921f6bcf36 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala @@ -17,7 +17,7 @@ package org.bdgenomics.adam.converters import net.sf.samtools.{ SAMReadGroupRecord, SAMRecord } -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } import scala.collection.JavaConverters._ import org.bdgenomics.adam.models.{ Attribute, RecordGroupDictionary, SequenceDictionary } import org.bdgenomics.adam.util.AttributeUtils @@ -36,10 +36,10 @@ class SAMRecordConverter extends Serializable { val readReference: Int = samRecord.getReferenceIndex if (readReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { builder - .setReferenceId(readReference) - .setReferenceName(samRecord.getReferenceName) - .setReferenceLength(dict(samRecord.getReferenceIndex).length) - .setReferenceUrl(dict(samRecord.getReferenceIndex).url) + .setContig(ADAMContig.newBuilder + .setContigName(samRecord.getReferenceName) + .setContigLength(dict(samRecord.getReferenceName).length) + .setReferenceURL(dict(samRecord.getReferenceName).url).build) val start: Int = samRecord.getAlignmentStart if (start != 0) { @@ -58,10 +58,10 @@ class SAMRecordConverter extends Serializable { if (mateReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { builder - .setMateReferenceId(mateReference) - .setMateReference(samRecord.getMateReferenceName) - .setMateReferenceLength(dict(samRecord.getMateReferenceName).length) - .setMateReferenceUrl(dict(samRecord.getMateReferenceName).url) + .setMateContig(ADAMContig.newBuilder + .setContigName(samRecord.getMateReferenceName) + .setContigLength(dict(samRecord.getMateReferenceName).length) + .setReferenceURL(dict(samRecord.getMateReferenceName).url).build) val 
mateStart = samRecord.getMateAlignmentStart if (mateStart > 0) { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala index eaf5b70a82..a49754a4f4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala @@ -140,7 +140,6 @@ class VariantContextConverter(dict: Option[SequenceDictionary] = None) extends S val contig: ADAMContig.Builder = ADAMContig.newBuilder() .setContigName(vc.getChr) - .setContigId(contigId) if (dict.isDefined) { val sr = (dict.get)(vc.getChr) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/metrics/AvailableComparisons.scala b/adam-core/src/main/scala/org/bdgenomics/adam/metrics/AvailableComparisons.scala index f28d870df4..aa94dc3008 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/metrics/AvailableComparisons.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/metrics/AvailableComparisons.scala @@ -15,10 +15,12 @@ */ package org.bdgenomics.adam.metrics -import org.bdgenomics.adam.rich.RichADAMRecord._ +import org.bdgenomics.adam.avro.{ ADAMRecord, ADAMContig } +import org.bdgenomics.adam.models.ReadBucket import org.bdgenomics.adam.projections.FieldValue import org.bdgenomics.adam.projections.ADAMRecordField._ -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.rich.RichADAMRecord._ +import org.bdgenomics.adam.util.Util._; import scala.collection.Map import org.bdgenomics.adam.models.ReadBucket @@ -93,7 +95,7 @@ object MappedPosition extends LongComparisons with Serializable { case 1 => { val r1 = records1.head val r2 = records2.head - if (r1.getReferenceId == r2.getReferenceId) { + if (isSameContig(r1.getContig, r2.getContig)) { val start1 = r1.getStart val start2 = r2.getStart if (start1 > start2) start1 - start2 else start2 - start1 @@ -115,7 +117,6 @@ object MappedPosition extends LongComparisons with Serializable { distance(bucket1.pairedSecondSecondaryMappedReads, bucket2.pairedSecondSecondaryMappedReads)) def schemas: Seq[FieldValue] = Seq( - referenceId, start, firstOfPair) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceMapping.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceMapping.scala index af3313b4fc..0221c58fe1 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceMapping.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceMapping.scala @@ -16,9 +16,6 @@ package org.bdgenomics.adam.models trait ReferenceMapping[T] { - - def getReferenceId(value: T): Int - def remapReferenceId(value: T, newId: Int): T - + def getReferenceName(value: T): String def getReferenceRegion(value: T): ReferenceRegion } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala index 049e3b8306..f92fa4c066 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala @@ -44,8 +44,18 @@ object ReferencePositionWithOrientation { case class ReferencePositionWithOrientation(refPos: Option[ReferencePosition], negativeStrand: Boolean) extends Ordered[ReferencePositionWithOrientation] { + override def compare(that: ReferencePositionWithOrientation): Int = { - val posCompare = 
refPos.compare(that.refPos) + if (refPos.isEmpty && that.refPos.isEmpty) { + return 0 + } + if (refPos.isEmpty) { + return -1 + } + if (that.refPos.isEmpty) { + return 1 + } + val posCompare = refPos.get.compare(that.refPos.get) if (posCompare != 0) { posCompare } else { @@ -61,7 +71,7 @@ object ReferencePosition { * which is not located anywhere along the reference sequences (see, e.g. its use in * GenomicRegionPartitioner). */ - val UNMAPPED = new ReferencePosition(-1, -1) + val UNMAPPED = new ReferencePosition("", -1) /** * Checks to see if a read is mapped with a valid position. @@ -70,9 +80,9 @@ object ReferencePosition { * @return True if read is mapped and has a valid position, else false. */ def mappedPositionCheck(record: ADAMRecord): Boolean = { - val referenceId = Some(record.getReferenceId) + val contig = Some(record.getContig) val start = Some(record.getStart) - record.getReadMapped && referenceId.isDefined && start.isDefined + record.getReadMapped && (contig.isDefined && Some(contig.get.getContigName).isDefined) && start.isDefined } /** @@ -87,7 +97,7 @@ object ReferencePosition { */ def apply(record: ADAMRecord): Option[ReferencePosition] = { if (mappedPositionCheck(record)) { - Some(new ReferencePosition(record.getReferenceId, record.getStart)) + Some(new ReferencePosition(record.getContig.getContigName, record.getStart)) } else { None } @@ -104,7 +114,7 @@ object ReferencePosition { * @return The reference position of this variant. */ def apply(variant: ADAMVariant): ReferencePosition = { - new ReferencePosition(variant.getContig.getContigId, variant.getPosition) + new ReferencePosition(variant.getContig.getContigName, variant.getPosition) } /** @@ -118,7 +128,7 @@ object ReferencePosition { */ def apply(genotype: ADAMGenotype): ReferencePosition = { val variant = genotype.getVariant() - new ReferencePosition(variant.getContig.getContigId, variant.getPosition) + new ReferencePosition(variant.getContig.getContigName, variant.getPosition) } /** @@ -134,7 +144,7 @@ object ReferencePosition { */ def fivePrime(record: ADAMRecord): Option[ReferencePosition] = { if (mappedPositionCheck(record)) { - Some(new ReferencePosition(record.getReferenceId, record.fivePrimePosition.get)) + Some(new ReferencePosition(record.getContig.getContigName, record.fivePrimePosition.get)) } else { None } @@ -148,15 +158,15 @@ object ReferencePosition { * @return The reference position of this pileup. 
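One property of comparing by referenceName worth keeping in mind: String ordering is lexicographic, not numeric. A self-contained sketch, using a simplified stand-in for ReferencePosition:

// Simplified stand-in for ReferencePosition; the real class is in org.bdgenomics.adam.models.
case class Pos(referenceName: String, pos: Long) extends Ordered[Pos] {
  override def compare(that: Pos): Int = {
    // Compare by reference first so coordinate ordering groups contigs together.
    val refCompare = referenceName.compare(that.referenceName)
    if (refCompare != 0) refCompare else pos.compare(that.pos)
  }
}

object OrderingSketch extends App {
  val positions = Seq(Pos("chr2", 5L), Pos("chr10", 5L), Pos("chr1", 5L))
  // Lexicographic ordering puts "chr10" before "chr2", unlike the old integer ids.
  println(positions.sorted.map(_.referenceName)) // List(chr1, chr10, chr2)
}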
*/ def apply(pileup: ADAMPileup): ReferencePosition = { - new ReferencePosition(pileup.getReferenceId, pileup.getPosition) + new ReferencePosition(pileup.getContig.getContigName, pileup.getPosition) } } -case class ReferencePosition(refId: Int, pos: Long) extends Ordered[ReferencePosition] { +case class ReferencePosition(referenceName: String, pos: Long) extends Ordered[ReferencePosition] { - def compare(that: ReferencePosition): Int = { + override def compare(that: ReferencePosition): Int = { // Note: important to compare by reference first for coordinate ordering - val refCompare = refId.compare(that.refId) + val refCompare = referenceName.compare(that.referenceName) if (refCompare != 0) { refCompare } else { @@ -194,13 +204,13 @@ class ReferencePositionWithOrientationSerializer extends Serializer[ReferencePos class ReferencePositionSerializer extends Serializer[ReferencePosition] { def write(kryo: Kryo, output: Output, obj: ReferencePosition) = { - output.writeInt(obj.refId) + output.writeString(obj.referenceName) output.writeLong(obj.pos) } def read(kryo: Kryo, input: Input, klazz: Class[ReferencePosition]): ReferencePosition = { - val refId = input.readInt() + val refName = input.readString() val pos = input.readLong() - new ReferencePosition(refId, pos) + new ReferencePosition(refName, pos) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala index b8cbe14f23..3f94d2c04a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala @@ -34,7 +34,7 @@ object ReferenceRegion { */ def apply(record: ADAMRecord): Option[ReferenceRegion] = { if (record.getReadMapped) { - Some(ReferenceRegion(record.getReferenceId, record.getStart, RichADAMRecord(record).end.get + 1)) + Some(ReferenceRegion(record.getContig.getContigName, record.getStart, RichADAMRecord(record).end.get + 1)) } else { None } @@ -46,7 +46,7 @@ object ReferenceRegion { * @return A 1-wide region at the same location as pos */ def apply(pos: ReferencePosition): ReferenceRegion = - ReferenceRegion(pos.refId, pos.pos, pos.pos + 1) + ReferenceRegion(pos.referenceName, pos.pos, pos.pos + 1) /** * Generates a reference region from assembly data. Returns None if the assembly does not @@ -56,9 +56,11 @@ object ReferenceRegion { * @return Region corresponding to inclusive region of contig fragment. */ def apply(fragment: ADAMNucleotideContigFragment): Option[ReferenceRegion] = { - if (fragment.getContigId != null && fragment.getFragmentStartPosition != null) { + val contig = fragment.getContig + if (contig != null && contig.getContigName != null && + fragment.getFragmentStartPosition != null) { val fragmentSequence = fragment.getFragmentSequence - Some(ReferenceRegion(fragment.getContigId, + Some(ReferenceRegion(contig.getContigName, fragment.getFragmentStartPosition, fragment.getFragmentStartPosition + fragmentSequence.length)) } else { @@ -76,7 +78,7 @@ object ReferenceRegion { * which is not in the region -- i.e. [start, end) define a 0-based * half-open interval. 
*/ -case class ReferenceRegion(refId: Int, start: Long, end: Long) extends Ordered[ReferenceRegion] { +case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Ordered[ReferenceRegion] { assert(start >= 0) assert(end >= start) @@ -110,8 +112,8 @@ case class ReferenceRegion(refId: Int, start: Long, end: Long) extends Ordered[R * @see merge */ def hull(region: ReferenceRegion): ReferenceRegion = { - assert(refId == region.refId, "Cannot compute convex hull of regions on different references.") - ReferenceRegion(refId, min(start, region.start), max(end, region.end)) + assert(referenceName == region.referenceName, "Cannot compute convex hull of regions on different references.") + ReferenceRegion(referenceName, min(start, region.start), max(end, region.end)) } /** @@ -137,7 +139,7 @@ case class ReferenceRegion(refId: Int, start: Long, end: Long) extends Ordered[R * our reference space, we return an empty option (None). */ def distance(other: ReferencePosition): Option[Long] = - if (refId == other.refId) + if (referenceName == other.referenceName) if (other.pos < start) Some(start - other.pos) else if (other.pos >= end) @@ -159,7 +161,7 @@ case class ReferenceRegion(refId: Int, start: Long, end: Long) extends Ordered[R * our reference space, we return an empty option (None). */ def distance(other: ReferenceRegion): Option[Long] = - if (refId == other.refId) + if (referenceName == other.referenceName) if (overlaps(other)) Some(0) else if (other.start >= end) @@ -170,17 +172,17 @@ case class ReferenceRegion(refId: Int, start: Long, end: Long) extends Ordered[R None def contains(other: ReferencePosition): Boolean = - refId == other.refId && start <= other.pos && end > other.pos + referenceName == other.referenceName && start <= other.pos && end > other.pos def contains(other: ReferenceRegion): Boolean = - refId == other.refId && start <= other.start && end >= other.end + referenceName == other.referenceName && start <= other.start && end >= other.end def overlaps(other: ReferenceRegion): Boolean = - refId == other.refId && end > other.start && start < other.end + referenceName == other.referenceName && end > other.start && start < other.end def compare(that: ReferenceRegion): Int = - if (refId != that.refId) - refId.compareTo(that.refId) + if (referenceName != that.referenceName) + referenceName.compareTo(that.referenceName) else if (start != that.start) start.compareTo(that.start) else @@ -189,15 +191,15 @@ case class ReferenceRegion(refId: Int, start: Long, end: Long) extends Ordered[R class ReferenceRegionSerializer extends Serializer[ReferenceRegion] { def write(kryo: Kryo, output: Output, obj: ReferenceRegion) = { - output.writeInt(obj.refId) + output.writeString(obj.referenceName) output.writeLong(obj.start) output.writeLong(obj.end) } def read(kryo: Kryo, input: Input, klazz: Class[ReferenceRegion]): ReferenceRegion = { - val refId = input.readInt() + val referenceName = input.readString() val start = input.readLong() val end = input.readLong() - new ReferenceRegion(refId, start, end) + new ReferenceRegion(referenceName, start, end) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala index e79ffba7b0..bf99ce87b9 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala @@ -29,30 +29,18 @@ import scala.math.Ordering.Implicits._ */ 
class SequenceDictionary(val recordsIn: Array[SequenceRecord]) extends Serializable { - // Intermediate value used to ensure that no referenceName or referenceId is listed twice with a different - // referenceId or referenceName (respectively). Notice the "toSet", which means it's okay to pass an Iterable - // that lists the _same_ SequenceRecord twice. - private val idNamePairs = recordsIn.map(rec => (rec.id, rec.name.toString)).toSet + // Intermediate value used to ensure that no referenceName is listed twice. + private val sequenceNames = recordsIn.groupBy(_.name) - // check that no referenceId value is listed twice, to two different referenceNames - assert(idNamePairs.groupBy(_._1).map(p => (p._1, p._2.size)).filter(p => p._2 > 1).isEmpty, - "Duplicate ID in %s".format(idNamePairs)) - - // check that no referenceName is listed twice, to two different referenceIds - assert(idNamePairs.groupBy(_._2).map(p => (p._1, p._2.size)).filter(p => p._2 > 1).isEmpty, - "Duplicate Name in %s".format(idNamePairs)) + // Check that, for every sequenceName -> SequenceRecord mapping, the + // sequence records are all the same. + assert(sequenceNames.map(k => k._2.toSet).filter(k => k.size > 1).size == 0) // Pre-compute the hashCode, based on a sorted version of the idNamePairs list. - private val _hashCode: Int = idNamePairs.toSeq.sortWith(_ < _).foldLeft(0) { - (hash: Int, p: (Int, CharSequence)) => 37 * (hash + p._1) + p._2.hashCode + private val _hashCode: Int = sequenceNames.map(k => k._1).toSeq.sortWith(_.toString < _.toString).foldLeft(0) { + (hash: Int, p: CharSequence) => 37 * hash + p.hashCode } - // Maps referenceId -> SequenceRecord - private lazy val recordIndices: mutable.Map[Int, SequenceRecord] = - mutable.Map(recordsIn.map { - rec => (rec.id, rec) - }.toSeq: _*) - // Maps referenceName -> SequenceRecord private lazy val recordNames: mutable.Map[CharSequence, SequenceRecord] = mutable.Map(recordsIn.map { @@ -61,13 +49,11 @@ class SequenceDictionary(val recordsIn: Array[SequenceRecord]) extends Serializa rec => (rec.name.toString, rec) }.toSeq: _*) - def assignments: Map[Int, CharSequence] = recordIndices.map { - case (id: Int, rec: SequenceRecord) => - (id, rec.name) + def assignments: Map[String, SequenceRecord] = recordNames.map { + case (name: String, rec: SequenceRecord) => + (name, rec) } - def apply(id: Int): SequenceRecord = recordIndices(id) - /** * Returns the sequence record associated with a specific contig name. * @@ -96,116 +82,7 @@ class SequenceDictionary(val recordsIn: Array[SequenceRecord]) extends Serializa !recordsIn.forall(kv => kv.name.toString != name.toString) } - /** - * Returns true if this sequence dictionary contains a reference with the specified (integer) ID. - * @param id refId to look for - * @return True if the refId is in this dictionary - */ - def containsRefId(id: Int): Boolean = { - recordIndices.contains(id) - } - - /** - * Produces a Map of Int -> Int which maps the referenceIds from this SequenceDictionary - * into referenceIds compatible with the argument SequenceDictionary ('dict'). - * - * There are basically three cases that we have to handle here: - * (1) ids for the same sequence name which are different between the dictionaries. These are - * converted (from this.referenceId into dict.referenceId). - * (2) ids which are in use (for different sequences) between the two dictionaries. 
In this case, - * we mint a new identifier (using nonoverlappingHash) for the sequence in this dictionary - * that won't conflict with any sequence in either dictionary. - * (3) ids for sequences that aren't in the argument dict, and which don't conflict as in (2), - * can be carried over as-is. - * - * (Note: if the source referenceId isn't in the Map produced by mapTo, we can assume that it - * can be used without change in the new dictionary. The method remap, below, actually implements - * this identity.) - * - * The results of this mapTo should be useable by remap to produce a "compatible" dictionary, - * i.e. for all d1 and d2, - * - * d1.remap(d1.mapTo(d2)).isCompatibleWith(d2) - * - * should be true. - * - * @param dict The target dictionary into whose referenceId space the ids of this dictionary should be mapped. - * @return A Map whose values change the referenceIds in this dictionary; every referenceId in the source - * dictionary should be present in this Map - */ - def mapTo(dict: SequenceDictionary): Map[Int, Int] = { - - /* - * we start by assuming that all the sequences in the target dictionary will maintain their - * existing identifiers -- mapTo won't imply any changes to the id/sequence correspondence in - * the target dictionary. - */ - val assign: mutable.Map[Int, CharSequence] = mutable.Map(dict.assignments.toSeq: _*) - - /* - * Next, for every source sequence that is _not_ in the target dictionary, there are two cases: - * 1. the source ID is not in use in the target -- in this case, just carry over the existing - * identifier into the assignment. - * 2. the source ID _is_ already in use in the assignment -- in this case, we assign a new identifier - * for the source sequence, and store it in the assignment. - */ - recordNames.keys.filter(!dict.recordNames.contains(_)).foreach { - name => - val myIdx = recordNames(name).id - if (assign.contains(myIdx)) { - // using dict.nonoverlappingHash (rather than this.nonoverlappingHash) ensures - // that the new identifier won't overlap with any other in the target dictionary - // (and therefore, in the assignment map we're building, above). - assign(dict.nonoverlappingHash(name)) = name - } else { - assign(myIdx) = name - } - } - - /* - * At this point, 'assign' holds the desired id->sequence mapping of the "combined" target - * and source dictionaries; to some extent, we've reverse-engineered the results of - * - * this.remap(this.mapTo(dict)) ++ dict - * - * So now, we reverse the mapping (into sequence->id) and use it to convert source identifiers - * into target identifiers. - */ - val rassign: Map[CharSequence, Int] = Map(assign.toSeq.map(p => (p._2, p._1)): _*) - val idxMap: Map[Int, Int] = Map(recordIndices.keys.map(idx => (idx, rassign(recordIndices(idx).name))).toSeq: _*) - - assert(idxMap.keys.filter(!recordIndices.contains(_)).isEmpty, - "There we keys in the mapTo Map that weren't actually sequence indices") - assert(recordIndices.keys.filter(!idxMap.contains(_)).isEmpty, - "There were keys which weren't remapped by the mapTo idxMap") - - idxMap - } - - /** - * See the note to mapTo, above. - * The results of this remap and mapTo should be to produce a "compatible" dictionary, - * i.e. for all d1 and d2, - * - * d1.remap(d1.mapTo(d2)).isCompatibleWith(d2) - * - * should be true. - * - * @param idTransform The Map[Int,Int] to transform the identifiers of this dictionary; e.g. the output of - * mapTo. - * @return A new SequenceDictionary with just the referenceIds mapped through the given Map argument. 
- */ - def remap(idTransform: Map[Int, Int]): SequenceDictionary = { - def remapIndex(i: Int): Int = - if (idTransform.contains(i)) idTransform(i) else i - - new SequenceDictionary(idNamePairs.map { - case (id, name) => - recordIndices(id).withReferenceId(remapIndex(id)) - }.toArray) - } - - def records: Set[SequenceRecord] = recordIndices.values.toSet + def records: Set[SequenceRecord] = recordNames.values.toSet private[models] def cleanAndMerge(a1: Array[SequenceRecord], a2: Array[SequenceRecord]): Array[SequenceRecord] = { @@ -230,8 +107,7 @@ class SequenceDictionary(val recordsIn: Array[SequenceRecord]) extends Serializa /** * Tests whether two dictionaries are compatible, where "compatible" means that - * shared referenceName values are associated with the same referenceId, and - * shared referenceId values are associated with the same referenceName. + * shared referenceName values are associated with the same SequenceRecord. * * Roughly, two dictionaries are compatible if the ++ operator will succeed when * called on them together. @@ -240,26 +116,22 @@ class SequenceDictionary(val recordsIn: Array[SequenceRecord]) extends Serializa * @return true if the dictionaries are compatible, false otherwise. */ def isCompatibleWith(dict: SequenceDictionary): Boolean = - recordIndices.keys.filter(dict.recordIndices.contains).filter(idx => recordIndices(idx) != dict.recordIndices(idx)).isEmpty && - recordNames.keys.filter(dict.recordNames.contains).filter(name => recordNames(name) != dict.recordNames(name)).isEmpty - - def nonoverlappingHash(x: CharSequence): Int = - SequenceDictionary.nonoverlappingHash(x, idx => recordIndices.contains(idx)) + recordNames.keys.filter(dict.recordNames.contains).filter( + name => recordNames(name) != dict.recordNames(name)).isEmpty override def equals(x: Any): Boolean = { x match { case d: SequenceDictionary => - recordNames == d.recordNames && recordIndices == d.recordIndices + recordNames == d.recordNames case _ => false } } override def hashCode(): Int = _hashCode - override def toString: String = idNamePairs.toString() - /** - * Converts this ADAM style sequence dictionary into a SAM style sequence dictionary. + * Converts this ADAM style sequence dictionary into a SAM style + * sequence dictionary. * * @return Returns a SAM formatted sequence dictionary. 
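To make the name-keyed compatibility rule concrete, a small sketch of the check isCompatibleWith now performs, with plain Maps standing in for the dictionaries (contig names and lengths are illustrative only):

object CompatibilitySketch extends App {
  // Hypothetical stand-in: a dictionary is just name -> (length, md5) here.
  type Dict = Map[String, (Long, String)]

  // Compatible when every shared name maps to the same record.
  def isCompatibleWith(a: Dict, b: Dict): Boolean =
    a.keySet.intersect(b.keySet).forall(name => a(name) == b(name))

  val d1: Dict = Map("chr1" -> (248956422L, "md5a"), "chr2" -> (242193529L, "md5b"))
  val d2: Dict = Map("chr2" -> (242193529L, "md5b"), "chrX" -> (156040895L, "md5x"))
  val d3: Dict = Map("chr1" -> (999L, "other"))

  println(isCompatibleWith(d1, d2)) // true: the shared "chr2" records agree
  println(isCompatibleWith(d1, d3)) // false: "chr1" disagrees, so ++ would not be safe
}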
*/ @@ -318,7 +190,7 @@ object SequenceDictionary { // i've flagged the hadoop-bam team to let them know -- FAN, 2/5/2014 val length = 1 - SequenceRecord(index, name, length.toLong, null) + SequenceRecord(name, length.toLong, null) }): _*) } @@ -362,26 +234,24 @@ object SequenceDictionary { * @param url * @param md5 */ -class SequenceRecord(val id: Int, val name: CharSequence, val length: Long, val url: CharSequence, val md5: CharSequence) extends Serializable { +class SequenceRecord(val name: CharSequence, val length: Long, val url: CharSequence, val md5: CharSequence) extends Serializable { assert(name != null, "SequenceRecord.name is null") assert(name.length > 0, "SequenceRecord.name has length 0") assert(length > 0, "SequenceRecord.length <= 0") - def withReferenceId(newId: Int): SequenceRecord = - new SequenceRecord(newId, name, length, url, md5) - override def equals(x: Any): Boolean = { x match { case y: SequenceRecord => - id == y.id && name == y.name && length == y.length && url == y.url && md5 == y.md5 + name == y.name && length == y.length && ( + md5 == null || y.md5 == null || md5 == y.md5) case _ => false } } - override def hashCode: Int = ((id + name.hashCode) * 37 + length.hashCode) * 37 + override def hashCode: Int = (name.hashCode * 37 + length.hashCode) * 37 - override def toString: String = "%s->%s=%d".format(id, name, length) + override def toString: String = "%s->%s".format(name, length) /** * Converts this sequence record into a SAM sequence record. @@ -404,8 +274,8 @@ class SequenceRecord(val id: Int, val name: CharSequence, val length: Long, val object SequenceRecord { - def apply(id: Int, name: CharSequence, length: Long, url: CharSequence = null, md5: CharSequence = null): SequenceRecord = - new SequenceRecord(id, name, length, url, md5) + def apply(name: CharSequence, length: Long, url: CharSequence = null, md5: CharSequence = null): SequenceRecord = + new SequenceRecord(name, length, url, md5) /** * Converts an ADAM contig into a sequence record. @@ -413,8 +283,9 @@ object SequenceRecord { * @param ctg Contig to convert. * @return Contig expressed as a sequence record. */ - def fromADAMContigFragment(ctg: ADAMNucleotideContigFragment): SequenceRecord = { - apply(ctg.getContigId, ctg.getContigName, ctg.getContigLength, ctg.getUrl) + def fromADAMContigFragment(fragment: ADAMNucleotideContigFragment): SequenceRecord = { + val contig = fragment.getContig + apply(contig.getContigName, contig.getContigLength, contig.getReferenceURL) } /* @@ -424,7 +295,7 @@ object SequenceRecord { * @return A new ADAM sequence record. 
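The relaxed equals treats a missing md5 as a wildcard; a minimal sketch of that rule with a simplified record type (hypothetical, not the real SequenceRecord):

// Simplified stand-in mirroring the md5-tolerant equality in the patched SequenceRecord.
case class Rec(name: String, length: Long, md5: Option[String]) {
  def sameAs(that: Rec): Boolean =
    name == that.name && length == that.length &&
      (md5.isEmpty || that.md5.isEmpty || md5 == that.md5)
}

object EqualitySketch extends App {
  val withMd5    = Rec("chr1", 248956422L, Some("abc123"))
  val withoutMd5 = Rec("chr1", 248956422L, None)
  val otherMd5   = Rec("chr1", 248956422L, Some("def456"))

  println(withMd5.sameAs(withoutMd5)) // true: a missing md5 never blocks a match
  println(withMd5.sameAs(otherMd5))   // false: two present md5s must agree
}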
*/ def fromSamSequenceRecord(seqRecord: SAMSequenceRecord): SequenceRecord = { - apply(seqRecord.getSequenceIndex, seqRecord.getSequenceName, seqRecord.getSequenceLength, seqRecord.getAssembly) + apply(seqRecord.getSequenceName, seqRecord.getSequenceLength, seqRecord.getAssembly) } /** @@ -446,27 +317,27 @@ object SequenceRecord { if (rec.getFirstOfPair) { val left = - if (rec.getReadMapped) - Set(SequenceRecord(rec.getReferenceId, rec.getReferenceName, rec.getReferenceLength, rec.getReferenceUrl)) - else + if (rec.getReadMapped) { + val contig = rec.getContig + Set(SequenceRecord(contig.getContigName, contig.getContigLength, contig.getReferenceURL)) + } else Set() val right = - if (rec.getMateMapped) - Set(SequenceRecord(rec.getMateReferenceId, rec.getMateReference, rec.getMateReferenceLength, rec.getMateReferenceUrl)) - else + if (rec.getMateMapped) { + val contig = rec.getMateContig + Set(SequenceRecord(contig.getContigName, contig.getContigLength, contig.getReferenceURL)) + } else Set() - left ++ right - } else { Set() } } else { - if (rec.getReadMapped) { - Set(SequenceRecord(rec.getReferenceId, rec.getReferenceName, rec.getReferenceLength, rec.getReferenceUrl)) + val contig = rec.getContig + Set(SequenceRecord(contig.getContigName, contig.getContigLength, contig.getReferenceURL)) } else { // If the read isn't mapped, then ignore the fields altogether. Set() @@ -478,8 +349,9 @@ object SequenceRecord { val schema = rec.getSchema if (schema.getField("referenceId") != null) { + // This should be able to be removed now that we've consolidated everything into + // contained fields of type ADAMContig. new SequenceRecord( - rec.get(schema.getField("referenceId").pos()).asInstanceOf[Int], rec.get(schema.getField("referenceName").pos()).asInstanceOf[CharSequence], rec.get(schema.getField("referenceLength").pos()).asInstanceOf[Long], rec.get(schema.getField("referenceUrl").pos()).asInstanceOf[CharSequence], @@ -489,7 +361,6 @@ object SequenceRecord { val contig = rec.get(pos).asInstanceOf[ADAMContig] val contigSchema = contig.getSchema new SequenceRecord( - contig.get(contigSchema.getField("contigId").pos()).asInstanceOf[Int], contig.get(contigSchema.getField("contigName").pos()).asInstanceOf[CharSequence], contig.get(contigSchema.getField("contigLength").pos()).asInstanceOf[Long], contig.get(contigSchema.getField("referenceURL").pos()).asInstanceOf[CharSequence], diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/projections/ADAMNucleotideContigFragmentField.scala b/adam-core/src/main/scala/org/bdgenomics/adam/projections/ADAMNucleotideContigFragmentField.scala index 175c7745ae..d484bdd58d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/projections/ADAMNucleotideContigFragmentField.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/projections/ADAMNucleotideContigFragmentField.scala @@ -27,5 +27,5 @@ import org.bdgenomics.adam.avro.ADAMNucleotideContigFragment */ object ADAMNucleotideContigFragmentField extends FieldEnumeration(ADAMNucleotideContigFragment.SCHEMA$) { - val contigName, contigId, description, fragmentSequence, contigLength, fragmentNumber, fragmentStartPosition, numberOfFragmentsInContig, url = SchemaValue + val contig, description, fragmentSequence, fragmentNumber, fragmentStartPosition, numberOfFragmentsInContig, url = SchemaValue } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/projections/ADAMRecordField.scala b/adam-core/src/main/scala/org/bdgenomics/adam/projections/ADAMRecordField.scala index 037f7dd2d8..edc4b34e86 100644 --- 
a/adam-core/src/main/scala/org/bdgenomics/adam/projections/ADAMRecordField.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/projections/ADAMRecordField.scala @@ -27,5 +27,5 @@ import org.bdgenomics.adam.avro.ADAMRecord */ object ADAMRecordField extends FieldEnumeration(ADAMRecord.SCHEMA$) { - val referenceName, referenceId, referenceLength, referenceUrl, start, mapq, readName, sequence, mateReference, mateAlignmentStart, cigar, qual, recordGroupId, recordGroupName, readPaired, properPair, readMapped, mateMapped, readNegativeStrand, mateNegativeStrand, firstOfPair, secondOfPair, primaryAlignment, failedVendorQualityChecks, duplicateRead, mismatchingPositions, attributes, recordGroupSequencingCenter, recordGroupDescription, recordGroupRunDateEpoch, recordGroupFlowOrder, recordGroupKeySequence, recordGroupLibrary, recordGroupPredictedMedianInsertSize, recordGroupPlatform, recordGroupPlatformUnit, recordGroupSample, mateReferenceId, mateReferenceLength, mateReferenceUrl, origQual = SchemaValue + val contig, start, mapq, readName, sequence, mateAlignmentStart, cigar, qual, recordGroupId, recordGroupName, readPaired, properPair, readMapped, mateMapped, readNegativeStrand, mateNegativeStrand, firstOfPair, secondOfPair, primaryAlignment, failedVendorQualityChecks, duplicateRead, mismatchingPositions, attributes, recordGroupSequencingCenter, recordGroupDescription, recordGroupRunDateEpoch, recordGroupFlowOrder, recordGroupKeySequence, recordGroupLibrary, recordGroupPredictedMedianInsertSize, recordGroupPlatform, recordGroupPlatformUnit, recordGroupSample, mateContig, origQual = SchemaValue } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala index d94c137a31..aee1afd4bb 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala @@ -15,6 +15,19 @@ */ package org.bdgenomics.adam.rdd +import fi.tkk.ics.hadoop.bam.{ SAMRecordWritable, AnySAMInputFormat, VariantContextWritable, VCFInputFormat } +import fi.tkk.ics.hadoop.bam.util.{ SAMHeaderReader, VCFHeaderReader, WrapSeekable } +import java.util.regex.Pattern +import net.sf.samtools.SAMFileHeader +import org.apache.hadoop.fs.FileSystem +import org.apache.avro.Schema +import org.apache.avro.specific.SpecificRecord +import org.apache.hadoop.fs.Path +import org.apache.hadoop.io.{ LongWritable, Text } +import org.apache.hadoop.mapreduce.Job +import org.apache.spark.{ Logging, SparkContext } +import org.apache.spark.rdd.RDD +import org.apache.spark.scheduler.StatsReportListener import org.bdgenomics.adam.avro.{ ADAMPileup, ADAMRecord, @@ -29,21 +42,7 @@ import org.bdgenomics.adam.projections.{ ADAMNucleotideContigFragmentField } import org.bdgenomics.adam.rich.RichADAMRecord -import org.bdgenomics.adam.rich.RichRDDReferenceRecords._ import org.bdgenomics.adam.serialization.ADAMKryoProperties -import fi.tkk.ics.hadoop.bam.{ SAMRecordWritable, AnySAMInputFormat, VariantContextWritable, VCFInputFormat } -import fi.tkk.ics.hadoop.bam.util.{ SAMHeaderReader, VCFHeaderReader, WrapSeekable } -import java.util.regex.Pattern -import net.sf.samtools.SAMFileHeader -import org.apache.hadoop.fs.FileSystem -import org.apache.avro.Schema -import org.apache.avro.specific.SpecificRecord -import org.apache.hadoop.fs.Path -import org.apache.hadoop.io.{ Text, LongWritable } -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.{ Logging, SparkContext } -import 
org.apache.spark.rdd.RDD -import org.apache.spark.scheduler.StatsReportListener import parquet.avro.{ AvroParquetInputFormat, AvroReadSupport } import parquet.filter.UnboundRecordFilter import parquet.hadoop.ParquetInputFormat @@ -226,29 +225,16 @@ class ADAMContext(sc: SparkContext) extends Serializable with Logging { val projection = if (isADAMRecord) { Projection( - ADAMRecordField.referenceId, - ADAMRecordField.referenceName, - ADAMRecordField.referenceLength, - ADAMRecordField.referenceUrl, - ADAMRecordField.mateReferenceId, - ADAMRecordField.mateReference, - ADAMRecordField.mateReferenceLength, - ADAMRecordField.mateReferenceUrl, + ADAMRecordField.contig, + ADAMRecordField.mateContig, ADAMRecordField.readPaired, ADAMRecordField.firstOfPair, ADAMRecordField.readMapped, ADAMRecordField.mateMapped) } else if (isADAMContig) { - Projection(ADAMNucleotideContigFragmentField.contigName, - ADAMNucleotideContigFragmentField.contigId, - ADAMNucleotideContigFragmentField.contigLength, - ADAMNucleotideContigFragmentField.url) + Projection(ADAMNucleotideContigFragmentField.contig) } else { - Projection( - ADAMRecordField.referenceId, - ADAMRecordField.referenceName, - ADAMRecordField.referenceLength, - ADAMRecordField.referenceUrl) + Projection(ADAMRecordField.contig) } if (filePath.endsWith(".bam") || filePath.endsWith(".sam")) { @@ -353,18 +339,8 @@ class ADAMContext(sc: SparkContext) extends Serializable with Logging { (dict, rdd) } - def remap(adams: Seq[(SequenceDictionary, RDD[ADAMRecord])]): Seq[RDD[ADAMRecord]] = { - adams.headOption match { - case None => Seq() - case Some(head) => - head._2 +: adams.tail.map(v => { - if (v._1.equals(head._1)) v._2 - else v._2.remapReferenceId(v._1.mapTo(head._1).toMap)(sc) - }) - } - } - - sc.union(remap(paths.map(loadADAMs))) + // Remapreferenceid code deleted since we don't remap sequence + // dictionaries anymore. 
+ sc.union(paths.map(loadADAMs).map(v => v._2)) } } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala index 8c7c4ea70d..4db0479fa7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala @@ -15,7 +15,17 @@ */ package org.bdgenomics.adam.rdd +import java.io.File +import java.util.logging.Level +import java.util.UUID +import org.apache.avro.specific.SpecificRecord +import org.apache.hadoop.mapreduce.Job +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.Logging +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD import org.bdgenomics.adam.avro.{ + ADAMContig, ADAMPileup, ADAMRecord, ADAMNucleotideContigFragment, @@ -36,14 +46,6 @@ import org.bdgenomics.adam.rdd.recalibration.BaseQualityRecalibration import org.bdgenomics.adam.rich.RichADAMRecord._ import org.bdgenomics.adam.rich.RichADAMRecord import org.bdgenomics.adam.util.{ HadoopUtil, MapTools, ParquetLogger } -import java.io.File -import java.util.logging.Level -import org.apache.avro.specific.SpecificRecord -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.Logging -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD import parquet.avro.{ AvroParquetOutputFormat, AvroWriteSupport } import parquet.hadoop.ParquetOutputFormat import parquet.hadoop.metadata.CompressionCodecName @@ -152,17 +154,20 @@ class ADAMRecordRDDFunctions(rdd: RDD[ADAMRecord]) extends ADAMSequenceDictionar // we place them in a range of referenceIds at the end of the file. // The referenceId is an Int and typical only a few dozen values are even used. // These referenceId values are not stored; they are only used during sorting. - val unmappedReferenceIds = new Iterator[Int] with Serializable { + val unmappedReferenceNames = new Iterator[String] with Serializable { var currentOffsetFromEnd = 0 def hasNext: Boolean = true - def next(): Int = { + def next(): String = { currentOffsetFromEnd += 1 if (currentOffsetFromEnd > 10000) { currentOffsetFromEnd = 0 } - Int.MaxValue - currentOffsetFromEnd + // NB : this is really ugly - any better way to manufacture + // string values that are greater than anything else we care + // about? 
+ "unmapped" + (Int.MaxValue - currentOffsetFromEnd).toString } } @@ -171,7 +176,7 @@ class ADAMRecordRDDFunctions(rdd: RDD[ADAMRecord]) extends ADAMSequenceDictionar case None => // Move unmapped reads to the end of the file ReferencePosition( - unmappedReferenceIds.next(), Long.MaxValue) + unmappedReferenceNames.next(), Long.MaxValue) case Some(pos) => pos } (referencePos, p) @@ -236,12 +241,12 @@ class ADAMRecordRDDFunctions(rdd: RDD[ADAMRecord]) extends ADAMSequenceDictionar def mapToBucket(r: ADAMRecord): List[(ReferencePosition, ADAMRecord)] = { val s = r.getStart / bucketSize val e = RichADAMRecord(r).end.get / bucketSize - val id = r.getReferenceId + val name = r.getContig.getContigName if (s == e) { - List((new ReferencePosition(id, s), r)) + List((new ReferencePosition(name, s), r)) } else { - List((new ReferencePosition(id, s), r), (new ReferencePosition(id, e), r)) + List((new ReferencePosition(name, s), r), (new ReferencePosition(name, e), r)) } } @@ -414,16 +419,15 @@ class ADAMNucleotideContigFragmentRDDFunctions(rdd: RDD[ADAMNucleotideContigFrag * @param dictionary A sequence dictionary containing the IDs to use for remapping. * @return An option containing the remapped contig if it's sequence name was found in the dictionary. */ - def remapContig(contig: ADAMNucleotideContigFragment, dictionary: SequenceDictionary): Option[ADAMNucleotideContigFragment] = { - val name: CharSequence = contig.getContigName + def remapContig(fragment: ADAMNucleotideContigFragment, dictionary: SequenceDictionary): Option[ADAMNucleotideContigFragment] = { + val name: CharSequence = fragment.getContig.getContigName if (dictionary.containsRefName(name)) { - val newId = dictionary(contig.getContigName).id - val newContig = ADAMNucleotideContigFragment.newBuilder(contig) - .setContigId(newId) + // NB : this is a no-op in the non-ref-id world. Should we delete it? 
+ val newFragment = ADAMNucleotideContigFragment.newBuilder(fragment) + .setContig(fragment.getContig) .build() - - Some(newContig) + Some(newFragment) } else { None } @@ -450,11 +454,9 @@ class ADAMNucleotideContigFragmentRDDFunctions(rdd: RDD[ADAMNucleotideContigFrag val str = fragmentSequence.drop(trimStart) .dropRight(trimEnd) - - val reg = new ReferenceRegion(fragment._1.refId, + val reg = new ReferenceRegion(fragment._1.referenceName, fragment._1.start + trimStart, fragment._1.end - trimEnd) - (reg, str) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/FlagStat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/FlagStat.scala index c7aecda959..f6cdfe5094 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/FlagStat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/FlagStat.scala @@ -15,8 +15,9 @@ */ package org.bdgenomics.adam.rdd -import org.bdgenomics.adam.avro.ADAMRecord import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.util.Util._ object FlagStatMetrics { val emptyFailedQuality = new FlagStatMetrics(0, DuplicateMetrics.empty, DuplicateMetrics.empty, 0, 0, 0, 0, 0, 0, 0, 0, 0, true) @@ -40,7 +41,7 @@ object DuplicateMetrics { new DuplicateMetrics(b2i(f(record)), b2i(f(record) && record.getReadMapped && record.getMateMapped), b2i(f(record) && record.getReadMapped && !record.getMateMapped), - b2i(f(record) && (record.getReferenceId != record.getMateReferenceId))) + b2i(f(record) && (!isSameContig(record.getContig, record.getMateContig)))) } (duplicateMetrics(isPrimary), duplicateMetrics(isSecondary)) } @@ -85,7 +86,7 @@ object FlagStat { def apply(rdd: RDD[ADAMRecord]) = { rdd.map { p => - val mateMappedToDiffChromosome = p.getReadPaired && p.getReadMapped && p.getMateMapped && (p.getReferenceId != p.getMateReferenceId) + val mateMappedToDiffChromosome = p.getReadPaired && p.getReadMapped && p.getMateMapped && !isSameContig(p.getContig, p.getMateContig) val (primaryDuplicates, secondaryDuplicates) = DuplicateMetrics(p) new FlagStatMetrics(1, primaryDuplicates, secondaryDuplicates, diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRegionPartitioner.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRegionPartitioner.scala index eeb6b186e7..f8bf9767a3 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRegionPartitioner.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRegionPartitioner.scala @@ -33,21 +33,21 @@ import scala.math._ * @param seqLengths a map relating sequence name-to-length and indicating the set and length of all extant * sequences in the genome. 
*/ -class GenomicRegionPartitioner(val numParts: Int, val seqLengths: Map[Int, Long]) extends Partitioner { +class GenomicRegionPartitioner(val numParts: Int, val seqLengths: Map[String, Long]) extends Partitioner { def this(numParts: Int, seqDict: SequenceDictionary) = this(numParts, GenomicRegionPartitioner.extractLengthMap(seqDict)) - val ids: Seq[Int] = seqLengths.keys.toSeq.sortWith(_ < _) - val lengths: Seq[Long] = ids.map(seqLengths(_)) + val names: Seq[String] = seqLengths.keys.toSeq.sortWith(_ < _) + val lengths: Seq[Long] = names.map(seqLengths(_)) private val cumuls: Seq[Long] = lengths.scan(0L)(_ + _) // total # of bases in the sequence dictionary val totalLength: Long = lengths.reduce(_ + _) - // referenceId -> cumulative length before this sequence (using seqDict.records as the implicit ordering) - val cumulativeLengths: Map[Int, Long] = Map( - ids.zip(cumuls): _*) + // referenceName -> cumulative length before this sequence (using seqDict.records as the implicit ordering) + val cumulativeLengths: Map[String, Long] = Map( + names.zip(cumuls): _*) /** * 'parts' is the total number of partitions for non-UNMAPPED ReferencePositions -- @@ -62,13 +62,12 @@ class GenomicRegionPartitioner(val numParts: Int, val seqLengths: Map[Int, Long] // This allows partitions that cross chromosome boundaries. // The computation is slightly more complicated if you want to avoid this. - def getPart(refId: Int, pos: Long): Int = { - val totalOffset = cumulativeLengths(refId) + pos + def getPart(referenceName: String, pos: Long): Int = { + val totalOffset = cumulativeLengths(referenceName) + pos val totalFraction: Double = totalOffset.toDouble / totalLength - // Need to use 'parts' here, rather than 'numPartitions' -- see the note // on 'parts', above. - floor(totalFraction * parts.toDouble).toInt + min(floor(totalFraction * parts.toDouble).toInt, numPartitions) } key match { @@ -76,7 +75,7 @@ class GenomicRegionPartitioner(val numParts: Int, val seqLengths: Map[Int, Long] case ReferencePosition.UNMAPPED => parts // everything else gets assigned normally. 
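A small self-contained sketch of the partitioning arithmetic, with toy sequence lengths; it omits the extra partition reserved for unmapped reads:

object PartitionSketch extends App {
  val numParts = 4
  // Toy sequence dictionary: referenceName -> length (values are made up).
  val seqLengths: Map[String, Long] = Map("chr1" -> 1000L, "chr2" -> 600L, "chr3" -> 400L)

  val names       = seqLengths.keys.toSeq.sortWith(_ < _)
  val lengths     = names.map(seqLengths(_))
  val totalLength = lengths.sum
  // referenceName -> number of bases that precede it in the sorted order.
  val cumulativeLengths: Map[String, Long] = Map(names.zip(lengths.scan(0L)(_ + _)): _*)

  def getPart(referenceName: String, pos: Long): Int = {
    val totalOffset   = cumulativeLengths(referenceName) + pos
    val totalFraction = totalOffset.toDouble / totalLength
    math.floor(totalFraction * numParts).toInt
  }

  println(getPart("chr1", 0L))   // 0: the start of the genome
  println(getPart("chr2", 100L)) // 2: 1100 of 2000 bases in; partitions may cross chromosome boundaries
  println(getPart("chr3", 399L)) // 3: the last base lands in the final regular partition
}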
- case refpos: ReferencePosition => getPart(refpos.refId, refpos.pos) + case refpos: ReferencePosition => getPart(refpos.referenceName, refpos.pos) // only ReferencePosition values are partitioned using this partitioner case _ => throw new IllegalArgumentException("Only ReferencePosition values can be partitioned by GenomicRegionPartitioner") @@ -86,19 +85,22 @@ class GenomicRegionPartitioner(val numParts: Int, val seqLengths: Map[Int, Long] override def equals(x: Any): Boolean = { x match { case y: GenomicRegionPartitioner => - y.numPartitions == numPartitions && ids == y.ids && lengths == y.lengths + y.numPartitions == numPartitions && names == y.names && lengths == y.lengths case _ => false } } - override def hashCode(): Int = 37 * (37 * parts + ids.hashCode()) + lengths.hashCode() + override def toString(): String = { + return "%d parts, %d partitions, %s" format (parts, numPartitions, cumulativeLengths.toString) + } + + override def hashCode(): Int = 37 * (37 * parts + names.hashCode()) + lengths.hashCode() } object GenomicRegionPartitioner { - def apply(N: Int, lengths: Map[Int, Long]) = new GenomicRegionPartitioner(N, lengths) + def apply(N: Int, lengths: Map[String, Long]) = new GenomicRegionPartitioner(N, lengths) - def extractLengthMap(seqDict: SequenceDictionary): Map[Int, Long] = - Map(seqDict.records.toSeq.map(rec => (rec.id, rec.length)): _*) + def extractLengthMap(seqDict: SequenceDictionary): Map[String, Long] = + Map(seqDict.records.toSeq.map(rec => (rec.name.toString, rec.length)): _*) } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/PileupAggregator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/PileupAggregator.scala index 6e0f081d6b..2b417e880f 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/PileupAggregator.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/PileupAggregator.scala @@ -16,7 +16,7 @@ package org.bdgenomics.adam.rdd -import org.bdgenomics.adam.avro.{ Base, ADAMPileup } +import org.bdgenomics.adam.avro.{ Base, ADAMContig, ADAMPileup } import org.bdgenomics.adam.models.ReferencePosition import org.bdgenomics.adam.rdd.ADAMContext._ import org.apache.spark.rdd.RDD @@ -68,10 +68,13 @@ private[rdd] class PileupAggregator(validate: Boolean = false) extends Serializa "Cannot aggregate pileup with required fields null: " + b) } + // We have to duplicate the existing contig so it doesn't get + // inadvertently modified later on. 
+ val contig = ADAMContig.newBuilder(a.getContig).build + // set copied fields val c = ADAMPileup.newBuilder() - .setReferenceName(a.getReferenceName) - .setReferenceId(a.getReferenceId) + .setContig(contig) .setPosition(a.getPosition) .setRangeOffset(a.getRangeOffset) .setRangeLength(a.getRangeLength) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Reads2PileupProcessor.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Reads2PileupProcessor.scala index 6fa8a33df7..cea443222e 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Reads2PileupProcessor.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Reads2PileupProcessor.scala @@ -16,7 +16,7 @@ package org.bdgenomics.adam.rdd -import org.bdgenomics.adam.avro.{ Base, ADAMPileup, ADAMRecord } +import org.bdgenomics.adam.avro.{ Base, ADAMContig, ADAMPileup, ADAMRecord } import org.apache.spark.rdd.RDD import org.apache.spark.Logging import org.bdgenomics.adam.rich.RichADAMRecord._ @@ -83,8 +83,7 @@ private[rdd] class Reads2PileupProcessor(createSecondaryAlignments: Boolean = fa assert(end != -1L, "Read is mapped but has a null end position. Read:\n" + record) ADAMPileup.newBuilder() - .setReferenceName(record.getReferenceName) - .setReferenceId(record.getReferenceId) + .setContig(record.getContig) .setMapQuality(record.getMapq) .setPosition(referencePos) .setRecordGroupSequencingCenter(record.getRecordGroupSequencingCenter) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala index 0dd1367560..ef38e82b59 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala @@ -98,15 +98,15 @@ object RegionJoin { * - carry a sequence dictionary through the computation. */ - // First, we group the regions in the left side of the join by their refId, + // First, we group the regions in the left side of the join by their referenceName, // and collect them. - val collectedLeft: Seq[(Int, Seq[ReferenceRegion])] = + val collectedLeft: Seq[(String, Seq[ReferenceRegion])] = baseRDD - .map(t => (tMapping.getReferenceId(t), tMapping.getReferenceRegion(t))) // RDD[(Int,ReferenceRegion)] - .groupBy(_._1) // RDD[(Int,Seq[(Int,ReferenceRegion)])] - .map(t => (t._1, t._2.map(_._2))) // RDD[(Int,Seq[ReferenceRegion])] - .collect() // Iterable[(Int,Seq[ReferenceRegion])] - .toSeq // Seq[(Int,Seq[ReferenceRegion])] + .map(t => (tMapping.getReferenceName(t), tMapping.getReferenceRegion(t))) // RDD[(String,ReferenceRegion)] + .groupBy(_._1) // RDD[(String,Seq[(String,ReferenceRegion)])] + .map(t => (t._1, t._2.map(_._2))) // RDD[(String,Seq[ReferenceRegion])] + .collect() // Iterable[(String,Seq[ReferenceRegion])] + .toSeq // Seq[(String,Seq[ReferenceRegion])] // Next, we turn that into a data structure that reduces those regions to their non-overlapping // pieces, which we will use as a partition. @@ -167,7 +167,7 @@ object RegionJoin { * * NonoverlappingRegions takes, as input, and 'input-set' of regions. These are arbitrary ReferenceRegions, * which may be overlapping, identical, disjoint, etc. The input-set of regions _must_ all be located on - * the same reference chromosome (i.e. must all have the same refId); the generalization to reference + * the same reference chromosome (i.e. must all have the same refName); the generalization to reference * regions from multiple chromosomes is in MultiContigNonoverlappingRegions, below. 
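A minimal sketch of the per-reference grouping that feeds MultiContigNonoverlappingRegions, using a simplified stand-in for ReferenceRegion:

// Simplified stand-in for ReferenceRegion.
case class Region(referenceName: String, start: Long, end: Long)

object GroupByNameSketch extends App {
  val regions = Seq(
    Region("chr1", 0L, 100L),
    Region("chr1", 50L, 150L),
    Region("chr2", 10L, 20L))

  // The same shape as collectedLeft in RegionJoin: referenceName -> regions on that contig.
  val byName: Map[String, Seq[Region]] = regions.groupBy(_.referenceName)

  println(byName("chr1").size) // 2: both chr1 regions, possibly overlapping, handled per contig
  println(byName.keySet)       // Set(chr1, chr2)
}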
* * NonoverlappingRegions produces, internally, a 'nonoverlapping-set' of regions. This is basically @@ -190,11 +190,11 @@ class NonoverlappingRegions(seqDict: SequenceDictionary, regions: Seq[ReferenceR assert(seqDict != null, "Sequence Dictionary cannot be null") - val referenceId: Int = regions.head.refId - val referenceLength: Long = seqDict(referenceId).length + val referenceName: String = regions.head.referenceName + val referenceLength: Long = seqDict(referenceName).length // invariant: all the values in the 'regions' list have the same referenceId - assert(regions.forall(_.refId == referenceId)) + assert(regions.forall(_.referenceName == referenceName)) // We represent the distinct unions, the 'nonoverlapping-set' of regions, as a set of endpoints, // so that we can do reasonably-fast binary searching on them to determine the slice of nonoverlapping-set @@ -275,7 +275,7 @@ class NonoverlappingRegions(seqDict: SequenceDictionary, regions: Seq[ReferenceR */ NonoverlappingRegions.alternating(startSlice.zip(endSlice).map { case (start, end) => - ReferenceRegion(referenceId, start, end) + ReferenceRegion(referenceName, start, end) }.toSeq, firstRegionIsHit) } } @@ -312,7 +312,7 @@ class NonoverlappingRegions(seqDict: SequenceDictionary, regions: Seq[ReferenceR } override def toString: String = - "%d:%d-%d (%s)".format(referenceId, endpoints.head, endpoints.last, endpoints.mkString(",")) + "%s:%d-%d (%s)".format(referenceName, endpoints.head, endpoints.last, endpoints.mkString(",")) } object NonoverlappingRegions { @@ -328,34 +328,39 @@ object NonoverlappingRegions { } /** - * Creates a multi-reference-region collection of NonoverlappingRegions -- see the scaladocs to - * NonoverlappingRegions. + * Creates a multi-reference-region collection of NonoverlappingRegions -- see + * the scaladocs to NonoverlappingRegions. * - * @param seqDict A sequence dictionary for all the possible reference regions that could be aligned to. - * @param regions A Seq of ReferencRegions, pre-partitioned by their refIds -- so, for a given pair - * (x, regs) in this Seq, all regions R in regs must satisfy R.refId == x. Furthermore, - * all the x's must be valid refIds with respect to the sequence dictionary. + * @param seqDict A sequence dictionary for all the possible reference regions + * that could be aligned to. + * @param regions A Seq of ReferencRegions, pre-partitioned by their + * referenceNames. So, for a given pair (x, regs) in + * this Seq, all regions R in regs must satisfy + * R.referenceName == x. Furthermore, all the x's must + * be valid reference names with respect to the sequence + * dictionary. 
*/ -class MultiContigNonoverlappingRegions(seqDict: SequenceDictionary, regions: Seq[(Int, Seq[ReferenceRegion])]) - extends Serializable { +class MultiContigNonoverlappingRegions( + seqDict: SequenceDictionary, + regions: Seq[(String, Seq[ReferenceRegion])]) extends Serializable { assert(regions != null, "Regions was set to null") - assert(!regions.map(_._1).exists(!seqDict.containsRefId(_)), - "SeqDict doesn't contain a refId from the regions sequence") + assert(!regions.map(_._1).exists(!seqDict.containsRefName(_)), + "SeqDict doesn't contain a referenceName from the regions sequence") - val regionMap: Map[Int, NonoverlappingRegions] = + val regionMap: Map[String, NonoverlappingRegions] = Map(regions.map(r => (r._1, new NonoverlappingRegions(seqDict, r._2))): _*) def regionsFor[U](regionable: U)(implicit mapping: ReferenceMapping[U]): Iterable[ReferenceRegion] = - regionMap.get(mapping.getReferenceId(regionable)) match { + regionMap.get(mapping.getReferenceName(regionable)) match { case None => Seq() case Some(nr) => nr.regionsFor(regionable) } def filter[U](value: U)(implicit mapping: ReferenceMapping[U]): Boolean = - regionMap.get(mapping.getReferenceId(value)) match { + regionMap.get(mapping.getReferenceName(value)) match { case None => false case Some(nr) => nr.hasRegionsFor(value) } @@ -364,7 +369,7 @@ class MultiContigNonoverlappingRegions(seqDict: SequenceDictionary, regions: Seq object MultiContigNonoverlappingRegions { def apply[T](seqDict: SequenceDictionary, values: Seq[T])(implicit mapping: ReferenceMapping[T]): MultiContigNonoverlappingRegions = { new MultiContigNonoverlappingRegions(seqDict, - values.map(v => (mapping.getReferenceId(v), mapping.getReferenceRegion(v))) + values.map(v => (mapping.getReferenceName(v), mapping.getReferenceRegion(v))) .groupBy(t => t._1) .map(t => (t._1, t._2.map(k => k._2))) .toSeq) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala index 49587d64af..c655d987d0 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala @@ -106,7 +106,7 @@ class DecadentRead(val record: RichADAMRecord) extends Logging { def referenceLocationOption: Option[ReferenceLocation] = assumingAligned { record.readOffsetToReferencePosition(offset). 
- map(refOffset => new ReferenceLocation(record.getReferenceName.toString, refOffset)) + map(refOffset => new ReferenceLocation(record.getContig.getContigName.toString, refOffset)) } def referenceLocation: ReferenceLocation = diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/ReferenceMappingContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/ReferenceMappingContext.scala index ae0820e268..f9011262e4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/ReferenceMappingContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rich/ReferenceMappingContext.scala @@ -24,20 +24,16 @@ import org.bdgenomics.adam.avro.ADAMRecord object ReferenceMappingContext { implicit object ADAMRecordReferenceMapping extends ReferenceMapping[ADAMRecord] with Serializable { - def getReferenceId(value: ADAMRecord): Int = value.getReferenceId - - def remapReferenceId(value: ADAMRecord, newId: Int): ADAMRecord = - ADAMRecord.newBuilder(value).setReferenceId(newId).build() + def getReferenceName(value: ADAMRecord): String = + value.getContig.getContigName.toString def getReferenceRegion(value: ADAMRecord): ReferenceRegion = ReferenceRegion(value).getOrElse(null) } implicit object ReferenceRegionReferenceMapping extends ReferenceMapping[ReferenceRegion] with Serializable { - def getReferenceId(value: ReferenceRegion): Int = value.refId - - def remapReferenceId(value: ReferenceRegion, newId: Int): ReferenceRegion = - ReferenceRegion(newId, value.start, value.end) + def getReferenceName(value: ReferenceRegion): String = + value.referenceName.toString def getReferenceRegion(value: ReferenceRegion): ReferenceRegion = value } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichADAMVariant.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichADAMVariant.scala index 7abfdaeaea..5e79cf1625 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichADAMVariant.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichADAMVariant.scala @@ -17,6 +17,7 @@ package org.bdgenomics.adam.rich import org.bdgenomics.adam.avro.{ ADAMContig, ADAMVariant } +import org.bdgenomics.adam.util.Util._ import java.util.Arrays object RichADAMVariant { @@ -30,11 +31,6 @@ class RichADAMVariant(val variant: ADAMVariant) { variant.getPosition, variant.getReferenceAllele, variant.getVariantAllele) override def hashCode = Arrays.hashCode(hashObjects) - private def isSameContig(left: ADAMContig, right: ADAMContig): Boolean = { - left.getContigName == right.getContigName && ( - left.getContigMD5 == null || right.contigMD5 == null || left.getContigMD5 == right.getContigMD5) - } - override def equals(o: Any) = o match { case that: RichADAMVariant => { variant.getPosition == that.variant.getPosition && diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichRDDReferenceRecords.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichRDDReferenceRecords.scala deleted file mode 100644 index 224a915853..0000000000 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichRDDReferenceRecords.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2014 Genome Bridge LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.bdgenomics.adam.rich - -import org.bdgenomics.adam.avro.ADAMRecord -import org.bdgenomics.adam.models.ReferenceMapping -import ReferenceMappingContext._ -import org.apache.spark.rdd.RDD -import org.apache.spark.SparkContext -import org.apache.avro.specific.SpecificRecord -import scala.reflect.ClassTag - -class RichRDDReferenceRecords[T <: SpecificRecord: ClassTag](rdd: RDD[T], - mapping: ReferenceMapping[T]) - extends Serializable { - - def remapReferenceId(map: Map[Int, Int])(implicit sc: SparkContext): RDD[T] = { - // If the reference IDs match, we don't need to do any remapping, just return the previous RDD - if (map.forall({ case (a, b) => a == b })) rdd - else { - // Broadcast the map variable - val bc = sc.broadcast(map) - rdd.map(r => { - val refId = mapping.getReferenceId(r) - // If the reference ID is the same, we don't need to create a new ADAMRecord - if (bc.value(refId) == refId.toInt) r - else mapping.remapReferenceId(r, bc.value(refId)) - }) - } - } -} - -object RichRDDReferenceRecords extends Serializable { - implicit def adamRDDToRichADAMRDD(rdd: RDD[ADAMRecord]): RichRDDReferenceRecords[ADAMRecord] = - new RichRDDReferenceRecords(rdd, ADAMRecordReferenceMapping) -} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/IntervalListReader.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/IntervalListReader.scala index d996f4e406..d02ee3e4df 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/IntervalListReader.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/IntervalListReader.scala @@ -54,15 +54,14 @@ class IntervalListReader(file: File) extends Traversable[(ReferenceRegion, Strin lines.filter(!_.startsWith("@")).foreach { line => val array = line.split("\t") - val refId = array(0).toInt + val refname = array(0).toString val start = array(1).toLong val end = array(2).toLong val strand = array(3) val name = array(4) - assert(strand == "+") - f((ReferenceRegion(refId, start, end), name)) + f((ReferenceRegion(refname, start, end), name)) } } @@ -70,7 +69,6 @@ class IntervalListReader(file: File) extends Traversable[(ReferenceRegion, Strin val sequenceRecords = mutable.ListBuffer[SequenceRecord]() val regex = Pattern.compile("(\\w{2}):(.*)") - var nextSequenceId = 0 def sequenceDictionary: SequenceDictionary = SequenceDictionary(sequenceRecords: _*) @@ -98,8 +96,7 @@ class IntervalListReader(file: File) extends Traversable[(ReferenceRegion, Strin } } - sequenceRecords ++= List(SequenceRecord(nextSequenceId, name, length, url)) - nextSequenceId += 1 + sequenceRecords ++= List(SequenceRecord(name, length, url)) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/PileupTraversable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/PileupTraversable.scala index 878df879f7..7a472e82ab 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/PileupTraversable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/PileupTraversable.scala @@ -61,9 +61,8 @@ case class DeletionEvent(readName: String, mapQ: Int, qual: Int) extends PileupEvent(readName) -class Pileup(val 
referenceId: Int, +class Pileup(val referenceName: String, val referencePosition: Long, - val referenceName: String, val referenceBase: Option[Base.Value], val matches: List[MatchEvent] = List.empty, val mismatches: List[MismatchEvent] = List.empty, @@ -120,7 +119,7 @@ class PileupTraversable(reads: RDD[ADAMRecord]) extends Traversable[Pileup] with case CigarOperator.I => val insertEvent = new InsertionEvent(record.getReadName.toString, record.getSequence.toString, record.getMapq.toInt, stringToQualitySanger(record.getQual.toString)) - pileupList ::= new Pileup(record.getReferenceId, referencePos, record.getReferenceName.toString, + pileupList ::= new Pileup(record.getContig.getContigName.toString, referencePos, None, insertions = List(insertEvent)) readPos += cigarElement.getLength @@ -133,7 +132,7 @@ class PileupTraversable(reads: RDD[ADAMRecord]) extends Traversable[Pileup] with // sequence match val matchEvent = new MatchEvent(record.getReadName.toString, isReverseStrand, i, cigarElement.getLength, record.getMapq.toInt, stringToQualitySanger(record.getQual.toString)) - pileupList ::= new Pileup(record.getReferenceId, referencePos, record.getReferenceName.toString, + pileupList ::= new Pileup(record.getContig.getContigName.toString, referencePos, Some(baseFromSequence(readPos)), matches = List(matchEvent)) } else { val mismatchBase = mdTag.mismatchedBase(referencePos) @@ -144,7 +143,7 @@ class PileupTraversable(reads: RDD[ADAMRecord]) extends Traversable[Pileup] with val mismatchEvent = new MismatchEvent(record.getReadName.toString, baseFromSequence(readPos), isReverseStrand, i, cigarElement.getLength, record.getMapq.toInt, stringToQualitySanger(record.getQual.toString)) - pileupList ::= new Pileup(record.getReferenceId, referencePos, record.getReferenceName.toString, + pileupList ::= new Pileup(record.getContig.getContigName.toString, referencePos, Some(Base.withName(mismatchBase.get.toString)), mismatches = List(mismatchEvent)) } @@ -162,7 +161,7 @@ class PileupTraversable(reads: RDD[ADAMRecord]) extends Traversable[Pileup] with } val deleteEvent = new DeletionEvent(record.getReadName.toString, i, cigarElement.getLength, record.getMapq.toInt, stringToQualitySanger(record.getQual.toString)) - pileupList ::= new Pileup(record.getReferenceId, referencePos, record.getReferenceName.toString, + pileupList ::= new Pileup(record.getContig.getContigName.toString, referencePos, Some(Base.withName(deletedBase.get.toString)), deletes = List(deleteEvent)) // Consume reference bases but not read bases referencePos += 1 @@ -187,7 +186,7 @@ class PileupTraversable(reads: RDD[ADAMRecord]) extends Traversable[Pileup] with // val queue = new collection.mutable.PriorityQueue[(Int, Long)]() var pileups = SortedMap[Long, ListBuffer[Pileup]]() - var currentReference: Option[Int] = None + var currentReference: Option[String] = None var currentReferencePosition: Option[Long] = None def flushPileups(beforePosition: Option[Long] = None) { @@ -207,7 +206,6 @@ class PileupTraversable(reads: RDD[ADAMRecord]) extends Traversable[Pileup] with var referenceBase: Option[Base.Value] = None pileupsAtLocation foreach { pileup => - assert(pileup.referenceId == currentReference.get) assert(pileup.referencePosition == location) matches ++= pileup.matches mismatches ++= pileup.mismatches @@ -224,7 +222,7 @@ class PileupTraversable(reads: RDD[ADAMRecord]) extends Traversable[Pileup] with referenceBase = pileup.referenceBase } } - f(new Pileup(currentReference.get, location, referenceName.get, referenceBase, + f(new 
Pileup(referenceName.get, location, referenceBase, matches.toList, mismatches.toList, inserts.toList, deletes.toList)) } pileups --= locationsToFlush @@ -233,13 +231,13 @@ class PileupTraversable(reads: RDD[ADAMRecord]) extends Traversable[Pileup] with for (read: ADAMRecord <- reads) { def updateCurrentInfo(read: ADAMRecord) = { - currentReference = Some(read.getReferenceId) + currentReference = Some(read.getContig.getContigName.toString) currentReferencePosition = Some(read.getStart) } currentReference match { case Some(reference) => - if (reference != read.getReferenceId.toInt) { + if (reference != read.getContig.getContigName.toString) { // We're starting a new reference, flush all events from the previous reference flushPileups() updateCurrentInfo(read) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/Util.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/Util.scala index cd6a542780..bae5c3160b 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/Util.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/Util.scala @@ -16,7 +16,14 @@ package org.bdgenomics.adam.util +import org.bdgenomics.adam.avro.ADAMContig + object Util { + def isSameContig(left: ADAMContig, right: ADAMContig): Boolean = { + left.getContigName == right.getContigName && ( + left.getContigMD5 == null || right.contigMD5 == null || left.getContigMD5 == right.getContigMD5) + } + def hashCombine(parts: Int*): Int = if (parts.tail == Nil) parts.head diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfHeaderUtils.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfHeaderUtils.scala index 4d5763e7b8..832722c2f5 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfHeaderUtils.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfHeaderUtils.scala @@ -90,8 +90,8 @@ private[util] class VcfHeaderBuilder(samples: List[String]) { def addContigLines(seqDict: SequenceDictionary) { val contigNames = seqDict.getReferenceNames - contigNames.foreach(ctg => { - val contig = new VCFContigHeaderLine(Map("ID" -> ctg), seqDict(ctg).id) + contigNames.zip(1 to contigNames.size).foreach(ctg => { + val contig = new VCFContigHeaderLine(Map("ID" -> ctg._1), ctg._2) contigLines = contig :: contigLines }) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala index 589f523dad..c9b6208898 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala @@ -41,20 +41,18 @@ class FastaConverterSuite extends SparkFunSuite { test("convert a single record without naming information") { val contig = converter.convert(None, 0, Seq("AAATTTGCGC"), None) - assert(contig.head.getContigId === 0) assert(contig.head.getFragmentSequence.map(_.toString).reduce(_ + _) === "AAATTTGCGC") - assert(contig.head.getContigLength === 10) - assert(contig.head.getContigName === null) + assert(contig.head.getContig.getContigLength === 10) + assert(contig.head.getContig.getContigName === null) assert(contig.head.getDescription === null) } test("convert a single record with naming information") { val contig = converter.convert(Some("chr2"), 1, Seq("NNNN"), Some("hg19")) - assert(contig.head.getContigId === 1) assert(contig.head.getFragmentSequence.map(_.toString).reduce(_ + _) === "NNNN") - assert(contig.head.getContigLength === 4) - 
assert(contig.head.getContigName === "chr2") + assert(contig.head.getContig.getContigLength === 4) + assert(contig.head.getContig.getContigName === "chr2") assert(contig.head.getDescription === "hg19") } @@ -84,10 +82,9 @@ class FastaConverterSuite extends SparkFunSuite { val fastaFragmentSequence = fasta.map(_._2).reduce(_ + _) val convertedFragmentSequence = fastaElement.getFragmentSequence.map(_.toString).reduce(_ + _) - assert(fastaElement.getContigId() === 0) assert(convertedFragmentSequence === fastaFragmentSequence) - assert(fastaElement.getContigLength() == fastaFragmentSequence.length) - assert(fastaElement.getContigName === null) + assert(fastaElement.getContig.getContigLength() == fastaFragmentSequence.length) + assert(fastaElement.getContig.getContigName === null) assert(fastaElement.getDescription === null) } @@ -132,24 +129,22 @@ class FastaConverterSuite extends SparkFunSuite { val adamFasta = FastaConverter(rdd) assert(adamFasta.count === 2) - val fastaElement1 = adamFasta.filter(_.getContigName == "chr1").first() + val fastaElement1 = adamFasta.filter(_.getContig.getContigName == "chr1").first() val fastaFragmentSequence1 = fasta1.drop(1).map(_._2).reduce(_ + _) val convertedFragmentSequence1 = fastaElement1.getFragmentSequence.map(_.toString).reduce(_ + _) - assert(fastaElement1.getContigId() === 0) assert(convertedFragmentSequence1 === fastaFragmentSequence1) - assert(fastaElement1.getContigLength() == fastaFragmentSequence1.length) - assert(fastaElement1.getContigName().toString === "chr1") + assert(fastaElement1.getContig.getContigLength() == fastaFragmentSequence1.length) + assert(fastaElement1.getContig.getContigName().toString === "chr1") assert(fastaElement1.getDescription === null) - val fastaElement2 = adamFasta.filter(_.getContigName == "chr2").first() + val fastaElement2 = adamFasta.filter(_.getContig.getContigName == "chr2").first() val fastaFragmentSequence2 = fasta2.drop(1).map(_._2).reduce(_ + _) val convertedFragmentSequence2 = fastaElement2.getFragmentSequence.map(_.toString).reduce(_ + _) - assert(fastaElement2.getContigId() === 1) assert(convertedFragmentSequence2 === fastaFragmentSequence2) - assert(fastaElement2.getContigLength() == fastaFragmentSequence2.length) - assert(fastaElement2.getContigName().toString === "chr2") + assert(fastaElement2.getContig.getContigLength() == fastaFragmentSequence2.length) + assert(fastaElement2.getContig.getContigName().toString === "chr2") assert(fastaElement2.getDescription === null) } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala index f092814dd4..5d62f44766 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala @@ -24,7 +24,8 @@ import org.bdgenomics.adam.avro._ import scala.Some class VariantContextConverterSuite extends FunSuite { - val dictionary = SequenceDictionary(SequenceRecord(1, "chr1", 249250621, "file://ucsc.hg19.fasta", "1b22b98cdeb4a9304cb5d48026a85128")) + val dictionary = SequenceDictionary(SequenceRecord("chr1", 249250621, + "file://ucsc.hg19.fasta", "1b22b98cdeb4a9304cb5d48026a85128")) def gatkSNVBuilder: VariantContextBuilder = new VariantContextBuilder() .alleles(List(Allele.create("A", true), Allele.create("T"))) @@ -39,7 +40,7 @@ class VariantContextConverterSuite extends FunSuite { .chr("chr1") def 
adamSNVBuilder: ADAMVariant.Builder = ADAMVariant.newBuilder() - .setContig(ADAMContig.newBuilder().setContigId(1).setContigName("chr1").build) + .setContig(ADAMContig.newBuilder().setContigName("chr1").build) .setPosition(0L) .setReferenceAllele("A") .setVariantAllele("T") diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/metrics/ComparisonsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/metrics/ComparisonsSuite.scala index de4e894f3f..7c9b612987 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/metrics/ComparisonsSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/metrics/ComparisonsSuite.scala @@ -16,7 +16,7 @@ package org.bdgenomics.adam.metrics import org.bdgenomics.adam.util.SparkFunSuite -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } import org.bdgenomics.adam.models.SingleReadBucket class ComparisonsSuite extends SparkFunSuite { @@ -37,12 +37,18 @@ class ComparisonsSuite extends SparkFunSuite { srbRDD.first() } + val contig: ADAMContig = ADAMContig.newBuilder + .setContigName("chr1") + .build + val contig2: ADAMContig = ADAMContig.newBuilder + .setContigName("chr2") + .build val record: ADAMRecord = ADAMRecord.newBuilder() + .setContig(contig) .setReadName("test") .setDuplicateRead(false) .setMapq(10) .setQual("abcdef") - .setReferenceId(1) .setStart(100) .setPrimaryAlignment(true) .setRecordGroupName("groupid") @@ -56,10 +62,10 @@ class ComparisonsSuite extends SparkFunSuite { .build()) bucketMapqUnset = srb(ADAMRecord.newBuilder() + .setContig(contig) .setReadName("test") .setDuplicateRead(false) .setQual("abcdef") - .setReferenceId(1) .setStart(100) .setPrimaryAlignment(true) .setRecordGroupName("groupid") @@ -75,10 +81,10 @@ class ComparisonsSuite extends SparkFunSuite { .build()) bucketQualUnset = srb(ADAMRecord.newBuilder() + .setContig(contig) .setReadName("test") .setDuplicateRead(false) .setMapq(10) - .setReferenceId(1) .setStart(100) .setPrimaryAlignment(true) .setRecordGroupName("groupid") @@ -86,7 +92,7 @@ class ComparisonsSuite extends SparkFunSuite { .build()) bucketMovedChromosome = srb(ADAMRecord.newBuilder(record) - .setReferenceId(2) + .setContig(contig2) .setStart(200) .build()) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferencePositionSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferencePositionSuite.scala index 98131a4d5b..11e302987d 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferencePositionSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferencePositionSuite.scala @@ -21,9 +21,13 @@ import org.bdgenomics.adam.avro.{ ADAMContig, ADAMGenotype, ADAMPileup, ADAMReco class ReferencePositionSuite extends FunSuite { test("create reference position from mapped read") { + val contig = ADAMContig.newBuilder + .setContigName("chr1") + .build + val read = ADAMRecord.newBuilder() + .setContig(contig) .setStart(1L) - .setReferenceId(1) .setReadMapped(true) .build() @@ -33,7 +37,7 @@ class ReferencePositionSuite extends FunSuite { val refPos = refPosOpt.get - assert(refPos.refId === 1) + assert(refPos.referenceName === "chr1") assert(refPos.pos === 1L) } @@ -48,20 +52,23 @@ class ReferencePositionSuite extends FunSuite { } test("create reference position from pileup") { + val contig = ADAMContig.newBuilder + .setContigName("chr2") + .build + val pileup = ADAMPileup.newBuilder() .setPosition(2L) - .setReferenceId(2) + .setContig(contig) .build() val refPos = ReferencePosition(pileup) - 
assert(refPos.refId === 2) + assert(refPos.referenceName === "chr2") assert(refPos.pos === 2L) } test("create reference position from variant") { val contig = ADAMContig.newBuilder() - .setContigId(10) .setContigName("chr10") .build() val variant = ADAMVariant.newBuilder() @@ -73,13 +80,12 @@ class ReferencePositionSuite extends FunSuite { val refPos = ReferencePosition(variant) - assert(refPos.refId === 10) + assert(refPos.referenceName === "chr10") assert(refPos.pos === 10L) } test("create reference position from genotype") { val contig = ADAMContig.newBuilder() - .setContigId(10) .setContigName("chr10") .build() val variant = ADAMVariant.newBuilder() @@ -94,7 +100,7 @@ class ReferencePositionSuite extends FunSuite { val refPos = ReferencePosition(genotype) - assert(refPos.refId === 10) + assert(refPos.referenceName === "chr10") assert(refPos.pos === 100L) } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala index ff070872b4..16ed44b91c 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala @@ -16,37 +16,37 @@ package org.bdgenomics.adam.models import org.scalatest._ -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } class ReferenceRegionSuite extends FunSuite { test("contains(: ReferenceRegion)") { - assert(region(0, 10, 100).contains(region(0, 50, 70))) - assert(region(0, 10, 100).contains(region(0, 10, 100))) - assert(!region(0, 10, 100).contains(region(1, 50, 70))) - assert(region(0, 10, 100).contains(region(0, 50, 100))) - assert(!region(0, 10, 100).contains(region(0, 50, 101))) + assert(region("chr0", 10, 100).contains(region("chr0", 50, 70))) + assert(region("chr0", 10, 100).contains(region("chr0", 10, 100))) + assert(!region("chr0", 10, 100).contains(region("chr1", 50, 70))) + assert(region("chr0", 10, 100).contains(region("chr0", 50, 100))) + assert(!region("chr0", 10, 100).contains(region("chr0", 50, 101))) } test("contains(: ReferencePosition)") { - assert(region(0, 10, 100).contains(point(0, 50))) - assert(region(0, 10, 100).contains(point(0, 10))) - assert(region(0, 10, 100).contains(point(0, 99))) - assert(!region(0, 10, 100).contains(point(0, 100))) - assert(!region(0, 10, 100).contains(point(1, 50))) + assert(region("chr0", 10, 100).contains(point("chr0", 50))) + assert(region("chr0", 10, 100).contains(point("chr0", 10))) + assert(region("chr0", 10, 100).contains(point("chr0", 99))) + assert(!region("chr0", 10, 100).contains(point("chr0", 100))) + assert(!region("chr0", 10, 100).contains(point("chr1", 50))) } test("merge") { intercept[AssertionError] { - region(0, 10, 100).merge(region(1, 10, 100)) + region("chr0", 10, 100).merge(region("chr1", 10, 100)) } - val r1 = region(0, 10, 100) - val r2 = region(0, 0, 15) - val r3 = region(0, 50, 150) + val r1 = region("chr0", 10, 100) + val r2 = region("chr0", 0, 15) + val r3 = region("chr0", 50, 150) - val r12 = region(0, 0, 100) - val r13 = region(0, 10, 150) + val r12 = region("chr0", 0, 100) + val r13 = region("chr0", 10, 150) assert(r1.merge(r1) === r1) assert(r1.merge(r2) === r12) @@ -56,66 +56,66 @@ class ReferenceRegionSuite extends FunSuite { test("overlaps") { // contained - assert(region(0, 10, 100).overlaps(region(0, 20, 50))) + assert(region("chr0", 10, 100).overlaps(region("chr0", 20, 50))) // right side - assert(region(0, 
10, 100).overlaps(region(0, 50, 250))) + assert(region("chr0", 10, 100).overlaps(region("chr0", 50, 250))) // left side - assert(region(0, 10, 100).overlaps(region(0, 5, 15))) + assert(region("chr0", 10, 100).overlaps(region("chr0", 5, 15))) // left edge - assert(region(0, 10, 100).overlaps(region(0, 5, 11))) - assert(!region(0, 10, 100).overlaps(region(0, 5, 10))) + assert(region("chr0", 10, 100).overlaps(region("chr0", 5, 11))) + assert(!region("chr0", 10, 100).overlaps(region("chr0", 5, 10))) // right edge - assert(region(0, 10, 100).overlaps(region(0, 99, 200))) - assert(!region(0, 10, 100).overlaps(region(0, 100, 200))) + assert(region("chr0", 10, 100).overlaps(region("chr0", 99, 200))) + assert(!region("chr0", 10, 100).overlaps(region("chr0", 100, 200))) // different sequences - assert(!region(0, 10, 100).overlaps(region(1, 50, 200))) + assert(!region("chr0", 10, 100).overlaps(region("chr1", 50, 200))) } test("distance(: ReferenceRegion)") { // distance on the right - assert(region(0, 10, 100).distance(region(0, 200, 300)) === Some(101)) + assert(region("chr0", 10, 100).distance(region("chr0", 200, 300)) === Some(101)) // distance on the left - assert(region(0, 100, 200).distance(region(0, 10, 50)) === Some(51)) + assert(region("chr0", 100, 200).distance(region("chr0", 10, 50)) === Some(51)) // different sequences - assert(region(0, 100, 200).distance(region(1, 10, 50)) === None) + assert(region("chr0", 100, 200).distance(region("chr1", 10, 50)) === None) // touches on the right - assert(region(0, 10, 100).distance(region(0, 100, 200)) === Some(1)) + assert(region("chr0", 10, 100).distance(region("chr0", 100, 200)) === Some(1)) // overlaps - assert(region(0, 10, 100).distance(region(0, 50, 150)) === Some(0)) + assert(region("chr0", 10, 100).distance(region("chr0", 50, 150)) === Some(0)) // touches on the left - assert(region(0, 10, 100).distance(region(0, 0, 10)) === Some(1)) + assert(region("chr0", 10, 100).distance(region("chr0", 0, 10)) === Some(1)) } test("distance(: ReferencePosition)") { // middle - assert(region(0, 10, 100).distance(point(0, 50)) === Some(0)) + assert(region("chr0", 10, 100).distance(point("chr0", 50)) === Some(0)) // left edge - assert(region(0, 10, 100).distance(point(0, 10)) === Some(0)) + assert(region("chr0", 10, 100).distance(point("chr0", 10)) === Some(0)) // right edge - assert(region(0, 10, 100).distance(point(0, 100)) === Some(1)) + assert(region("chr0", 10, 100).distance(point("chr0", 100)) === Some(1)) // right - assert(region(0, 10, 100).distance(point(0, 150)) === Some(51)) + assert(region("chr0", 10, 100).distance(point("chr0", 150)) === Some(51)) // left - assert(region(0, 100, 200).distance(point(0, 50)) === Some(50)) + assert(region("chr0", 100, 200).distance(point("chr0", 50)) === Some(50)) // different sequences - assert(region(0, 100, 200).distance(point(1, 50)) === None) + assert(region("chr0", 100, 200).distance(point("chr1", 50)) === None) } @@ -133,26 +133,29 @@ class ReferenceRegionSuite extends FunSuite { .setSequence("AAAAA") .setStart(1L) .setCigar("5M") - .setReferenceId(1) + .setContig(ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(10) + .build) .build() assert(ReferenceRegion(read).isDefined) - assert(ReferenceRegion(read).get.contains(point(1, 1L))) - assert(ReferenceRegion(read).get.contains(point(1, 5L))) + assert(ReferenceRegion(read).get.contains(point("chr1", 1L))) + assert(ReferenceRegion(read).get.contains(point("chr1", 5L))) } test("validate that adjacent regions can be merged") { - val r1 = region(1, 
0L, 6L) - val r2 = region(1, 6L, 10L) + val r1 = region("chr1", 0L, 6L) + val r2 = region("chr1", 6L, 10L) assert(r1.distance(r2).get === 1) assert(r1.isAdjacent(r2)) - assert(r1.merge(r2) == region(1, 0L, 10L)) + assert(r1.merge(r2) == region("chr1", 0L, 10L)) } test("validate that non-adjacent regions cannot be merged") { - val r1 = region(1, 0L, 5L) - val r2 = region(1, 7L, 10L) + val r1 = region("chr1", 0L, 5L) + val r2 = region("chr1", 7L, 10L) assert(!r1.isAdjacent(r2)) @@ -162,8 +165,8 @@ class ReferenceRegionSuite extends FunSuite { } test("compute convex hull of two sets") { - val r1 = region(1, 0L, 5L) - val r2 = region(1, 7L, 10L) + val r1 = region("chr1", 0L, 5L) + val r2 = region("chr1", 7L, 10L) assert(!r1.isAdjacent(r2)) @@ -177,9 +180,9 @@ class ReferenceRegionSuite extends FunSuite { assert(hull1.end == 10L) } - def region(refId: Int, start: Long, end: Long): ReferenceRegion = - ReferenceRegion(refId, start, end) + def region(refName: String, start: Long, end: Long): ReferenceRegion = + ReferenceRegion(refName, start, end) - def point(refId: Int, pos: Long): ReferencePosition = - ReferencePosition(refId, pos) + def point(refName: String, pos: Long): ReferencePosition = + ReferencePosition(refName, pos) } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala index cb79e76d24..1cb7ee2cfa 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala @@ -24,157 +24,74 @@ import org.scalatest.FunSuite class SequenceDictionarySuite extends FunSuite { - test("Can retrieve sequence by ID") { - val rec1 = record(0, "foo") - val rec2 = record(1, "bar") - assert(SequenceDictionary(rec1, rec2)(rec1.id) === rec1) - } - test("containsRefName works as expected 1") { - val rec1 = record(0, "foo") - val rec2 = record(1, "bar") + val rec1 = record("foo") + val rec2 = record("bar") assert(SequenceDictionary(rec1, rec2).containsRefName("foo")) assert(SequenceDictionary(rec1, rec2).containsRefName("bar")) assert(!SequenceDictionary(rec1, rec2).containsRefName("foo ")) } test("containsRefName works as expected 2") { - val rec1 = record(0, "foo") - val rec2 = record(1, "bar") + val rec1 = record("foo") + val rec2 = record("bar") val sd = SequenceDictionary(rec1, rec2) val names = sd.getReferenceNames() assert(names.toList.filter(_ == "foo").length === 1) assert(names.toList.filter(_ == "bar").length === 1) } - test("containsRefId works as expected") { - val rec1 = record(0, "foo") - val rec2 = record(1, "bar") - - assert(SequenceDictionary(rec1, rec2).containsRefId(0)) - assert(SequenceDictionary(rec1, rec2).containsRefId(1)) - assert(!SequenceDictionary(rec1, rec2).containsRefId(2)) - } - test("Can retrieve sequence by Name") { - val rec1 = record(0, "foo") - val rec2 = record(1, "bar") + val rec1 = record("foo") + val rec2 = record("bar") assert(SequenceDictionary(rec1, rec2)(rec1.name) === rec1) } test("SequenceDictionaries with same single element are equal") { - assert(SequenceDictionary(record(0, "foo")) === SequenceDictionary(record(0, "foo"))) + assert( + SequenceDictionary(record("foo")) === SequenceDictionary(record("foo"))) } test("SequenceDictionaries with same two elements are equals") { - assert(SequenceDictionary(record(0, "foo"), record(1, "bar")) === - SequenceDictionary(record(0, "foo"), record(1, "bar"))) + 
assert(SequenceDictionary(record("foo"), record("bar")) === + SequenceDictionary(record("foo"), record("bar"))) } test("SequenceDictionaries with different elements are unequal") { - assert(SequenceDictionary(record(0, "foo"), record(1, "bar")) != - SequenceDictionary(record(0, "foo"), record(1, "quux"))) + assert(SequenceDictionary(record("foo"), record("bar")) != + SequenceDictionary(record("foo"), record("quux"))) } test("SequenceDictionaries with same elements in different order are equal") { - assert(SequenceDictionary(record(0, "foo"), record(1, "bar")) === - SequenceDictionary(record(1, "bar"), record(0, "foo"))) - } - - test("double referenceIds throws an exception") { - intercept[AssertionError] { - SequenceDictionary(record(0, "foo"), record(0, "bar")) - } - } - - test("double referenceNames throws an exception") { - intercept[AssertionError] { - SequenceDictionary(record(0, "foo"), record(1, "foo")) - } - } - - test("mapTo generates correct identifier mappings") { - val fromDict = SequenceDictionary( - record(0, "foo"), - record(1, "bar"), - record(2, "quux")) - - val toDict = SequenceDictionary(record(10, "bar"), record(20, "quux")) - - assert(fromDict.mapTo(toDict) === Map(0 -> 0, 1 -> 10, 2 -> 20)) + assert(SequenceDictionary(record("foo"), record("bar")) === + SequenceDictionary(record("bar"), record("foo"))) } test("isCompatible tests equality on overlap") { - val s1 = SequenceDictionary(record(0, "foo"), record(1, "bar")) - val s2 = SequenceDictionary(record(1, "bar"), record(2, "quux")) - val s3 = SequenceDictionary(record(0, "foo"), record(2, "bar")) - + val s1 = SequenceDictionary(record("foo"), record("bar")) + val s2 = SequenceDictionary(record("bar"), record("quux")) + val s3 = SequenceDictionary(record("foo"), record("bar")) + val s4 = SequenceDictionary(record("foo", 1001)) assert(s1 isCompatibleWith s2) - assert(!(s1 isCompatibleWith s3)) - } - - test("remap and mapTo generate equality for dictionaries with the same names") { - val s1 = SequenceDictionary(record(1, "foo"), record(2, "bar")) - val s2 = SequenceDictionary(record(20, "bar"), record(10, "foo")) - - assert(s1.mapTo(s2) === Map(1 -> 10, 2 -> 20)) - assert(s1.remap(s1.mapTo(s2)) === s2) - } - - test("all five cases for toMap") { - val s1 = SequenceDictionary(record(1, "s1"), record(3, "s2"), record(4, "s4"), record(6, "s6")) - val s2 = SequenceDictionary(record(1, "s1"), record(2, "s2"), record(4, "s3"), record(5, "s5")) - - val map = s1.mapTo(s2) - - assert(map(1) === 1) - assert(!map.contains(2)) - assert(map(3) === 2) - assert(map(4) === s2.nonoverlappingHash("s4")) - assert(!map.contains(5)) - assert(map(6) === 6) - } - - test("mapTo and remap produce a compatible dictionary") { - val s1 = SequenceDictionary(record(1, "s1"), record(3, "s2"), record(2, "s3"), record(5, "s4")) - val s2 = SequenceDictionary(record(1, "s1"), record(2, "s2"), record(3, "s3"), record(5, "s5"), - record("s4".hashCode, "s6")) - - val map = s1.mapTo(s2) - - // double check that the linear probing for new sequence idx assignment is operational. 
- // -- this should match up with SequenceDictionary.nonoverlappingHash - assert(map(5) === "s4".hashCode + 1) - - assert(s1.remap(map).isCompatibleWith(s2)) - } - - test("toMap handles permutations correctly") { - val s1 = SequenceDictionary(record(1, "s2"), record(2, "s3"), record(3, "s1")) - val s2 = SequenceDictionary(record(1, "s1"), record(2, "s2"), record(3, "s3")) - - val map = s1.mapTo(s2) - - assert(map(1) === 2) - assert(map(2) === 3) - assert(map(3) === 1) + assert(s1 isCompatibleWith s3) + assert(!(s3 isCompatibleWith s4)) } test("the addition + works correctly") { val s1 = SequenceDictionary() - val s2 = SequenceDictionary(record(1, "foo")) - val s3 = SequenceDictionary(record(1, "foo"), record(2, "bar")) + val s2 = SequenceDictionary(record("foo")) + val s3 = SequenceDictionary(record("foo"), record("bar")) - assert(s1 + record(1, "foo") === s2) - assert(s2 + record(1, "foo") === s2) - assert(s2 + record(2, "bar") === s3) + assert(s1 + record("foo") === s2) + assert(s2 + record("foo") === s2) + assert(s2 + record("bar") === s3) } test("the append operation ++ works correctly") { val s1 = SequenceDictionary() - val s2a = SequenceDictionary(record(1, "foo")) - val s2b = SequenceDictionary(record(2, "bar")) - val s3 = SequenceDictionary(record(1, "foo"), record(2, "bar")) + val s2a = SequenceDictionary(record("foo")) + val s2b = SequenceDictionary(record("bar")) + val s3 = SequenceDictionary(record("foo"), record("bar")) assert(s1 ++ s1 === s1) assert(s1 ++ s2a === s2a) @@ -183,10 +100,10 @@ class SequenceDictionarySuite extends FunSuite { } test("containsRefName works correctly") { - val dict = SequenceDictionary(record(0, "chr0"), - record(1, "chr1"), - record(2, "chr2"), - record(3, "chr3")) + val dict = SequenceDictionary(record("chr0"), + record("chr1"), + record("chr2"), + record("chr3")) val str0: String = "chr0" val str1: java.lang.String = "chr1" val str2: CharSequence = "chr2" @@ -199,44 +116,40 @@ class SequenceDictionarySuite extends FunSuite { } test("apply on name works correctly") { - val dict = SequenceDictionary(record(0, "chr0"), - record(1, "chr1"), - record(2, "chr2"), - record(3, "chr3")) + val dict = SequenceDictionary( + record("chr0"), + record("chr1"), + record("chr2"), + record("chr3")) val str0: String = "chr0" val str1: java.lang.String = "chr1" val str2: CharSequence = "chr2" val str3: java.lang.CharSequence = "chr3" - assert(dict(str0).id === 0) - assert(dict(str1).id === 1) - assert(dict(str2).id === 2) - assert(dict(str3).id === 3) + assert(dict(str0).name === "chr0") + assert(dict(str1).name === "chr1") + assert(dict(str2).name === "chr2") + assert(dict(str3).name === "chr3") } - // TODO (nealsid): Update this test case once we move ADAMRecord - // over to using ADAMContig for it's location and fromSpecificRecord - // is also updated. 
- // test("get record from variant using specific record") { - // val contig = ADAMContig.newBuilder - // .setContigId(0) - // .setContigName("chr0") - // .setContigLength(1000) - // .setReferenceURL("http://bigdatagenomics.github.io/chr0") - // .build() - // val variant = ADAMVariant.newBuilder() - // .setContig(contig) - // .setReferenceAllele("A") - // .setVariantAllele("T") - // .build() - - // val rec = SequenceRecord.fromSpecificRecord(variant) - - // assert(rec.id === 0) - // assert(rec.name === "chr0") - // assert(rec.length === 1000L) - // assert(rec.url === "http://bigdatagenomics.github.io/chr0") - // } + test("get record from variant using specific record") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .setContigLength(1000) + .setReferenceURL("http://bigdatagenomics.github.io/chr0") + .build() + val variant = ADAMVariant.newBuilder() + .setContig(contig) + .setReferenceAllele("A") + .setVariantAllele("T") + .build() + + val rec = SequenceRecord.fromSpecificRecord(variant) + + assert(rec.name === "chr0") + assert(rec.length === 1000L) + assert(rec.url === "http://bigdatagenomics.github.io/chr0") + } test("convert from sam sequence record and back") { val sr = new SAMSequenceRecord("chr0", 1000) @@ -248,7 +161,6 @@ class SequenceDictionarySuite extends FunSuite { assert(conv.name === "chr0") assert(conv.length === 1000L) assert(conv.url === "http://bigdatagenomics.github.io/chr0") - assert(conv.id === -1) val convSr = conv.toSAMSequenceRecord @@ -264,11 +176,10 @@ class SequenceDictionarySuite extends FunSuite { val asd = SequenceDictionary.fromSAMSequenceDictionary(ssd) - assert(asd(0).name === "chr0") - assert(asd("chr0").id === 0) + assert(asd("chr0").name === "chr0") } - def record(id: Int, name: String, length: Int = 1000, url: String = null): SequenceRecord = - SequenceRecord(id, name, length, url) + def record(name: String, length: Int = 1000, url: String = null): SequenceRecord = + SequenceRecord(name, length, url) } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index 059ac483c1..94fd2d2566 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -17,7 +17,7 @@ package org.bdgenomics.adam.rdd import parquet.filter.UnboundRecordFilter import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } import org.bdgenomics.adam.util.SparkFunSuite import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.util.PhredUtils._ @@ -53,13 +53,16 @@ class ADAMContextSuite extends SparkFunSuite { } sparkTest("loadADAMFromPaths can load simple RDDs that have just been saved") { + val contig = ADAMContig.newBuilder + .setContigName("abc") + .setContigLength(1000000) + .setReferenceURL("http://abc") + .build + val a0 = ADAMRecord.newBuilder() .setRecordGroupName("group0") .setReadName("read0") - .setReferenceId(0) - .setReferenceName("abc") - .setReferenceUrl("http://abc") - .setReferenceLength(1000000) + .setContig(contig) .setStart(100) .setPrimaryAlignment(true) .setReadPaired(false) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctionsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctionsSuite.scala index 2da2a2f27e..f894fdafe8 100644 --- 
a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctionsSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctionsSuite.scala @@ -40,14 +40,18 @@ import scala.util.Random class ADAMRDDFunctionsSuite extends SparkFunSuite { sparkTest("can convert pileups to rods, bases at different pos, same reference") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build + val p0 = ADAMPileup.newBuilder() .setPosition(0L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.A) .build() val p1 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.C) .build() @@ -63,14 +67,22 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("can convert pileups to rods, bases at same pos, different reference") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build + + val contig1 = ADAMContig.newBuilder + .setContigName("chr1") + .build + val p0 = ADAMPileup.newBuilder() .setPosition(0L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.A) .build() val p1 = ADAMPileup.newBuilder() .setPosition(0L) - .setReferenceId(1) + .setContig(contig1) .setReadBase(Base.C) .build() @@ -79,21 +91,26 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { val rods = pileups.adamPileupsToRods(1) assert(rods.count === 2) - assert(rods.filter(_.position.refId == 0).count === 1) - assert(rods.filter(_.position.refId == 0).flatMap(_.pileups).filter(_.getReadBase == Base.A).count === 1) - assert(rods.filter(_.position.refId == 1).count === 1) - assert(rods.filter(_.position.refId == 1).flatMap(_.pileups).filter(_.getReadBase == Base.C).count === 1) + assert(rods.filter(_.position.referenceName == "chr0").count === 1) + assert(rods.filter(_.position.referenceName == "chr0").flatMap(_.pileups).filter(_.getReadBase == Base.A).count === 1) + assert(rods.filter(_.position.referenceName == "chr1").count === 1) + assert(rods.filter(_.position.referenceName == "chr1").flatMap(_.pileups).filter(_.getReadBase == Base.C).count === 1) } sparkTest("can convert pileups to rods, bases at same pos") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build + val p0 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.A) .build() + val p1 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.C) .build() @@ -109,15 +126,19 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("can convert pileups to rods, bases at same pos, split by different sample") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build + val p0 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.A) .setRecordGroupSample("0") .build() val p1 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.C) .setRecordGroupSample("1") .build() @@ -141,15 +162,19 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("can convert pileups to rods, bases at same pos, split by same sample") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build + val p0 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.A) .setRecordGroupSample("1") .build() val p1 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.C) .setRecordGroupSample("1") .build() @@ -172,14 +197,18 @@ class 
ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("check coverage, bases at different pos") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build + val p0 = ADAMPileup.newBuilder() .setPosition(0L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.A) .build() val p1 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.C) .build() @@ -193,14 +222,18 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("check coverage, bases at same pos") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build + val p0 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.A) .build() val p1 = ADAMPileup.newBuilder() .setPosition(1L) - .setReferenceId(0) + .setContig(contig) .setReadBase(Base.C) .build() @@ -220,7 +253,10 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { val mapped = random.nextBoolean() val builder = ADAMRecord.newBuilder().setReadMapped(mapped) if (mapped) { - builder.setReferenceId(random.nextInt(numReadsToCreate / 10)).setStart(random.nextInt(1000000)) + val contig = ADAMContig.newBuilder + .setContigName(random.nextInt(numReadsToCreate / 10).toString) + .build + builder.setContig(contig).setStart(random.nextInt(1000000)) } builder.build() } @@ -231,14 +267,18 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { assert(unmapped.forall(p => p._2 > mapped.takeRight(1)(0)._2)) // Make sure that we appropriately sorted the reads val expectedSortedReads = mapped.sortWith( - (a, b) => a._1.getReferenceId < b._1.getReferenceId && a._1.getStart < b._1.getStart) + (a, b) => a._1.getContig.getContigName.toString < b._1.getContig.getContigName.toString && a._1.getStart < b._1.getStart) assert(expectedSortedReads === mapped) } sparkTest("convert an RDD of reads into rods") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build + val r0 = ADAMRecord.newBuilder .setStart(1L) - .setReferenceId(0) + .setContig(contig) .setSequence("ACG") .setMapq(30) .setCigar("3M") @@ -250,7 +290,7 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { .build() val r1 = ADAMRecord.newBuilder .setStart(2L) - .setReferenceId(0) + .setContig(contig) .setSequence("CG") .setMapq(40) .setCigar("2M") @@ -262,7 +302,7 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { .build() val r2 = ADAMRecord.newBuilder .setStart(3L) - .setReferenceId(0) + .setContig(contig) .setSequence("G") .setMapq(50) .setCigar("1M") @@ -278,7 +318,7 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { val rods = reads.adamRecords2Rods() assert(rods.count === 3) - assert(rods.collect.forall(_.position.refId == 0)) + assert(rods.collect.forall(_.position.referenceName == "chr0")) assert(rods.filter(_.position.pos == 1L).count === 1) assert(rods.filter(_.position.pos == 1L).first.pileups.length === 1) assert(rods.filter(_.position.pos == 1L).first.pileups.forall(_.getReadBase == Base.A)) @@ -291,9 +331,17 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("convert an RDD of reads into rods, different references") { + val contig0 = ADAMContig.newBuilder + .setContigName("chr0") + .build + + val contig1 = ADAMContig.newBuilder + .setContigName("chr1") + .build + val r0 = ADAMRecord.newBuilder .setStart(1L) - .setReferenceId(0) + .setContig(contig0) .setSequence("AC") .setMapq(30) .setCigar("2M") @@ -305,7 +353,7 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { .build() val r1 = ADAMRecord.newBuilder .setStart(2L) - .setReferenceId(0) + 
.setContig(contig0) .setSequence("C") .setMapq(40) .setCigar("1M") @@ -317,7 +365,7 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { .build() val r2 = ADAMRecord.newBuilder .setStart(2L) - .setReferenceId(1) + .setContig(contig1) .setSequence("G") .setMapq(50) .setCigar("1M") @@ -333,76 +381,36 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { val rods = reads.adamRecords2Rods() assert(rods.count === 3) - assert(rods.filter(_.position.refId == 0).count === 2) - assert(rods.filter(_.position.refId == 1).count === 1) - assert(rods.filter(_.position.pos == 1L).filter(_.position.refId == 0).count === 1) - assert(rods.filter(_.position.pos == 1L).filter(_.position.refId == 0).first.pileups.length === 1) - assert(rods.filter(_.position.pos == 1L).filter(_.position.refId == 0).first.pileups.forall(_.getReadBase == Base.A)) - assert(rods.filter(_.position.pos == 2L).filter(_.position.refId == 0).count === 1) - assert(rods.filter(_.position.pos == 2L).filter(_.position.refId == 0).first.pileups.length === 2) - assert(rods.filter(_.position.pos == 2L).filter(_.position.refId == 0).first.pileups.forall(_.getReadBase == Base.C)) - assert(rods.filter(_.position.pos == 2L).filter(_.position.refId == 1).count === 1) - assert(rods.filter(_.position.pos == 2L).filter(_.position.refId == 1).first.pileups.length === 1) - assert(rods.filter(_.position.pos == 2L).filter(_.position.refId == 1).first.pileups.forall(_.getReadBase == Base.G)) - } - - sparkTest("can remap contig ids") { - val dict = SequenceDictionary(SequenceRecord(0, "chr0", 1000L, "http://bigdatagenomics.github.io/chr0.fa"), - SequenceRecord(1, "chr1", 1000L, "http://bigdatagenomics.github.io/chr0.fa")) - val ctg0 = ADAMNucleotideContigFragment.newBuilder() - .setContigName("chr0") - .setContigId(1) - .setContigLength(1000L) - .build() - val ctg1 = ADAMNucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigId(2) - .setContigLength(1000L) - .build() - - val rdd = sc.parallelize(List(ctg0, ctg1)) - - val remapped = rdd.adamRewriteContigIds(dict) - - assert(remapped.count === 2) - assert(remapped.filter(_.getContigName.toString == "chr0").first.getContigId === 0) - assert(remapped.filter(_.getContigName.toString == "chr1").first.getContigId === 1) + assert(rods.filter(_.position.referenceName == "chr0").count === 2) + assert(rods.filter(_.position.referenceName == "chr1").count === 1) + assert(rods.filter(_.position.pos == 1L).filter(_.position.referenceName == "chr0").count === 1) + assert(rods.filter(_.position.pos == 1L).filter(_.position.referenceName == "chr0").first.pileups.length === 1) + assert(rods.filter(_.position.pos == 1L).filter(_.position.referenceName == "chr0").first.pileups.forall(_.getReadBase == Base.A)) + assert(rods.filter(_.position.pos == 2L).filter(_.position.referenceName == "chr0").count === 1) + assert(rods.filter(_.position.pos == 2L).filter(_.position.referenceName == "chr0").first.pileups.length === 2) + assert(rods.filter(_.position.pos == 2L).filter(_.position.referenceName == "chr0").first.pileups.forall(_.getReadBase == Base.C)) + assert(rods.filter(_.position.pos == 2L).filter(_.position.referenceName == "chr1").count === 1) + assert(rods.filter(_.position.pos == 2L).filter(_.position.referenceName == "chr1").first.pileups.length === 1) + assert(rods.filter(_.position.pos == 2L).filter(_.position.referenceName == "chr1").first.pileups.forall(_.getReadBase == Base.G)) } - sparkTest("can remap contig ids while filtering out contigs that aren't in dict") { - val dict = 
SequenceDictionary(SequenceRecord(0, "chr0", 1000L, "http://bigdatagenomics.github.io/chr0.fa"), - SequenceRecord(1, "chr1", 1000L, "http://bigdatagenomics.github.io/chr0.fa")) - val ctg0 = ADAMNucleotideContigFragment.newBuilder() + sparkTest("generate sequence dict from fasta") { + val contig0 = ADAMContig.newBuilder .setContigName("chr0") - .setContigId(1) - .setContigLength(1000L) - .build() - val ctg1 = ADAMNucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigId(2) .setContigLength(1000L) - .build() - - val rdd = sc.parallelize(List(ctg0, ctg1)) - - val remapped = rdd.adamRewriteContigIds(dict) + .setReferenceURL("http://bigdatagenomics.github.io/chr0.fa") + .build - assert(remapped.count === 1) - assert(remapped.filter(_.getContigName.toString == "chr0").first.getContigId === 0) - assert(remapped.filter(_.getContigName.toString == "chr2").count === 0) - } + val contig1 = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(900L) + .build - sparkTest("generate sequence dict from fasta") { val ctg0 = ADAMNucleotideContigFragment.newBuilder() - .setContigName("chr0") - .setContigId(1) - .setContigLength(1000L) - .setUrl("http://bigdatagenomics.github.io/chr0.fa") + .setContig(contig0) .build() val ctg1 = ADAMNucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigId(2) - .setContigLength(900L) + .setContig(contig1) .build() val rdd = sc.parallelize(List(ctg0, ctg1)) @@ -410,17 +418,14 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { val dict = rdd.adamGetSequenceDictionary() assert(dict.containsRefName("chr0")) - assert(dict("chr0").id === 1) assert(dict("chr0").length === 1000L) assert(dict("chr0").url.toString == "http://bigdatagenomics.github.io/chr0.fa") assert(dict.containsRefName("chr1")) - assert(dict("chr1").id === 2) assert(dict("chr1").length === 900L) } sparkTest("recover samples from variant context") { val contig0 = ADAMContig.newBuilder() - .setContigId(1) .setContigName("chr0") .build val variant0 = ADAMVariant.newBuilder() @@ -454,7 +459,6 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { sparkTest("get sequence dictionary from variant context") { val contig0 = ADAMContig.newBuilder() .setContigName("chr0") - .setContigId(0) .setContigLength(1000) .build val variant0 = ADAMVariant.newBuilder() @@ -481,9 +485,8 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { val vc = ADAMVariantContext.buildFromGenotypes(genotypeSeq) val sequenceDict = sc.parallelize(List(vc)).adamGetSequenceDictionary() - assert(sequenceDict("chr0").id === 0) - println(sequenceDict(0).name.getClass) - assert(sequenceDict(0).name.toString === "chr0") + assert(sequenceDict("chr0") != None) + assert(sequenceDict("chr0").length === 1000) } sparkTest("characterizeTags counts integer tag values correctly") { @@ -551,12 +554,15 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("recover reference string from a single contig fragment") { + val contig = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(7L) + .build + val sequence = "ACTGTAC" val fragment = ADAMNucleotideContigFragment.newBuilder() - .setContigId(1) - .setContigName("chr1") + .setContig(contig) .setFragmentSequence(sequence) - .setContigLength(7L) .setFragmentNumber(0) .setFragmentStartPosition(0L) .setNumberOfFragmentsInContig(1) @@ -569,17 +575,20 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("recover trimmed reference string from a single contig fragment") { + val contig = ADAMContig.newBuilder + .setContigName("chr1") + 
.setContigLength(7L) + .build + val sequence = "ACTGTAC" val fragment = ADAMNucleotideContigFragment.newBuilder() - .setContigId(1) - .setContigName("chr1") + .setContig(contig) .setFragmentSequence(sequence) - .setContigLength(7L) .setFragmentNumber(0) .setFragmentStartPosition(0L) .setNumberOfFragmentsInContig(1) .build() - val region = new ReferenceRegion(1, 1L, 6L) + val region = new ReferenceRegion("chr1", 1L, 6L) val rdd = sc.parallelize(List(fragment)) @@ -587,33 +596,37 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("recover reference string from multiple contig fragments") { + val contig1 = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(7L) + .build + + val contig2 = ADAMContig.newBuilder + .setContigName("chr2") + .setContigLength(11L) + .build + val sequence = "ACTGTACTC" val sequence0 = sequence.take(7) // ACTGTAC - val sequence1 = sequence.drop(3).take(5) // GTACT + val sequence1 = sequence.drop(3).take(5) // GTACT val sequence2 = sequence.takeRight(6).reverse // CTCATG val fragment0 = ADAMNucleotideContigFragment.newBuilder() - .setContigId(1) - .setContigName("chr1") + .setContig(contig1) .setFragmentSequence(sequence0) - .setContigLength(7L) .setFragmentNumber(0) .setFragmentStartPosition(0L) .setNumberOfFragmentsInContig(1) .build() val fragment1 = ADAMNucleotideContigFragment.newBuilder() - .setContigId(2) - .setContigName("chr2") + .setContig(contig2) .setFragmentSequence(sequence1) - .setContigLength(11L) .setFragmentNumber(0) .setFragmentStartPosition(0L) .setNumberOfFragmentsInContig(2) .build() val fragment2 = ADAMNucleotideContigFragment.newBuilder() - .setContigId(2) - .setContigName("chr2") + .setContig(contig2) .setFragmentSequence(sequence2) - .setContigLength(11L) .setFragmentNumber(1) .setFragmentStartPosition(5L) .setNumberOfFragmentsInContig(2) @@ -628,39 +641,43 @@ class ADAMRDDFunctionsSuite extends SparkFunSuite { } sparkTest("recover trimmed reference string from multiple contig fragments") { + val contig1 = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(7L) + .build + + val contig2 = ADAMContig.newBuilder + .setContigName("chr2") + .setContigLength(11L) + .build + val sequence = "ACTGTACTC" val sequence0 = sequence.take(7) // ACTGTAC - val sequence1 = sequence.drop(3).take(5) // GTACT + val sequence1 = sequence.drop(3).take(5) // GTACT val sequence2 = sequence.takeRight(6).reverse // CTCATG val fragment0 = ADAMNucleotideContigFragment.newBuilder() - .setContigId(1) - .setContigName("chr1") + .setContig(contig1) .setFragmentSequence(sequence0) - .setContigLength(7L) .setFragmentNumber(0) .setFragmentStartPosition(0L) .setNumberOfFragmentsInContig(1) .build() val fragment1 = ADAMNucleotideContigFragment.newBuilder() - .setContigId(2) - .setContigName("chr2") + .setContig(contig2) .setFragmentSequence(sequence1) - .setContigLength(11L) .setFragmentNumber(0) .setFragmentStartPosition(0L) .setNumberOfFragmentsInContig(2) .build() val fragment2 = ADAMNucleotideContigFragment.newBuilder() - .setContigId(2) - .setContigName("chr2") + .setContig(contig2) .setFragmentSequence(sequence2) - .setContigLength(11L) .setFragmentNumber(1) .setFragmentStartPosition(5L) .setNumberOfFragmentsInContig(2) .build() - val region0 = new ReferenceRegion(1, 1L, 6L) - val region1 = new ReferenceRegion(2, 3L, 9L) + val region0 = new ReferenceRegion("chr1", 1L, 6L) + val region1 = new ReferenceRegion("chr2", 3L, 9L) val rdd = sc.parallelize(List(fragment0, fragment1, fragment2)) diff --git 
a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenomicRegionPartitionerSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenomicRegionPartitionerSuite.scala index a5273921e0..d56864edff 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenomicRegionPartitionerSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenomicRegionPartitionerSuite.scala @@ -17,7 +17,7 @@ package org.bdgenomics.adam.rdd import org.bdgenomics.adam.models.{ ReferencePosition, SequenceRecord, SequenceDictionary } import org.bdgenomics.adam.util.SparkFunSuite -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.projections.Projection @@ -29,50 +29,50 @@ import scala.util.Random class GenomicRegionPartitionerSuite extends SparkFunSuite { test("partitions the UNMAPPED ReferencePosition into the top partition") { - val parter = new GenomicRegionPartitioner(10, SequenceDictionary(record(0, "foo", 1000))) + val parter = new GenomicRegionPartitioner(10, SequenceDictionary(record("foo", 1000))) assert(parter.numPartitions === 11) assert(parter.getPartition(ReferencePosition.UNMAPPED) === 10) } test("partitioning into N pieces on M total sequence length, where N > M, results in M partitions") { - val parter = new GenomicRegionPartitioner(10, SequenceDictionary(record(0, "foo", 9))) + val parter = new GenomicRegionPartitioner(10, SequenceDictionary(record("foo", 9))) assert(parter.numPartitions === 10) } test("correctly partitions a single dummy sequence into two pieces") { - val parter = new GenomicRegionPartitioner(2, SequenceDictionary(record(0, "foo", 10))) - assert(parter.getPartition(ReferencePosition(0, 3)) === 0) - assert(parter.getPartition(ReferencePosition(0, 7)) === 1) + val parter = new GenomicRegionPartitioner(2, SequenceDictionary(record("foo", 10))) + assert(parter.getPartition(ReferencePosition("foo", 3)) === 0) + assert(parter.getPartition(ReferencePosition("foo", 7)) === 1) } test("correctly counts cumulative lengths") { - val parter = new GenomicRegionPartitioner(3, SequenceDictionary(record(0, "foo", 20), record(1, "bar", 10))) + val parter = new GenomicRegionPartitioner(3, SequenceDictionary(record("foo", 20), record("bar", 10))) - assert(parter.cumulativeLengths(0) === 0) - assert(parter.cumulativeLengths(1) === 20) + assert(parter.cumulativeLengths("bar") === 0) + assert(parter.cumulativeLengths("foo") === 10) } test("correctly partitions positions across two dummy sequences") { - val parter = new GenomicRegionPartitioner(3, SequenceDictionary(record(0, "foo", 20), record(1, "bar", 10))) - + val parter = new GenomicRegionPartitioner(3, SequenceDictionary(record("bar", 20), record("foo", 10))) // check easy examples - assert(parter.getPartition(ReferencePosition(0, 8)) === 0) - assert(parter.getPartition(ReferencePosition(0, 18)) === 1) - assert(parter.getPartition(ReferencePosition(1, 8)) === 2) + assert(parter.getPartition(ReferencePosition("foo", 8)) === 2) + assert(parter.getPartition(ReferencePosition("foo", 18)) === 3) + assert(parter.getPartition(ReferencePosition("bar", 18)) === 1) + assert(parter.getPartition(ReferencePosition("bar", 8)) === 0) // check edge cases - assert(parter.getPartition(ReferencePosition(0, 0)) === 0) - assert(parter.getPartition(ReferencePosition(0, 10)) === 1) - assert(parter.getPartition(ReferencePosition(1, 0)) === 2) + assert(parter.getPartition(ReferencePosition("foo", 0)) === 2) + 
assert(parter.getPartition(ReferencePosition("foo", 10)) === 3) + assert(parter.getPartition(ReferencePosition("bar", 0)) === 0) } sparkTest("test that we can range partition ADAMRecords") { val rand = new Random(1000L) val count = 1000 - val pos = sc.parallelize((1 to count).map(i => adamRecord(0, "1", "read_%d".format(i), rand.nextInt(100), readMapped = true))) + val pos = sc.parallelize((1 to count).map(i => adamRecord("chr1", "read_%d".format(i), rand.nextInt(100), readMapped = true))) val parts = 200 - val pairs = pos.map(p => (ReferencePosition(p.getReferenceId, p.getStart), p)) + val pairs = pos.map(p => (ReferencePosition(p.contig.getContigName, p.getStart), p)) val parter = new RangePartitioner(parts, pairs) val partitioned = pairs.sortByKey().partitionBy(parter) @@ -95,14 +95,14 @@ class GenomicRegionPartitionerSuite extends SparkFunSuite { val p = { import org.bdgenomics.adam.projections.ADAMRecordField._ - Projection(referenceName, referenceId, start, readName, readMapped) + Projection(contig, start, readName, readMapped) } val rdd: RDD[ADAMRecord] = sc.adamLoad(filename, projection = Some(p)) assert(rdd.count() === 200) val keyed = - rdd.map(rec => (ReferencePosition(rec.getReferenceId, rec.getStart), rec)).sortByKey() + rdd.map(rec => (ReferencePosition(rec.getContig.getContigName, rec.getStart), rec)).sortByKey() val keys = keyed.map(_._1).collect() assert(!keys.exists(rp => parter.getPartition(rp) < 0 || parter.getPartition(rp) >= parts)) @@ -118,17 +118,21 @@ class GenomicRegionPartitionerSuite extends SparkFunSuite { assert(partSizes.count() === parts + 1) } - def adamRecord(referenceId: Int, referenceName: String, readName: String, start: Long, readMapped: Boolean) = + def adamRecord(referenceName: String, readName: String, start: Long, readMapped: Boolean) = { + val contig = ADAMContig.newBuilder + .setContigName(referenceName) + .build + ADAMRecord.newBuilder() - .setReferenceId(referenceId) - .setReferenceName(referenceName) + .setContig(contig) .setReadName(readName) .setReadMapped(readMapped) .setStart(start) .build() + } - def record(id: Int, name: CharSequence, length: Long) = - SequenceRecord(id, name, length, "") + def record(name: CharSequence, length: Long) = + SequenceRecord(name, length, "") } class PositionKeyed[U <: Serializable] extends Serializable { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenotypesSummarySuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenotypesSummarySuite.scala index 9d574e832a..22267d9168 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenotypesSummarySuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenotypesSummarySuite.scala @@ -22,7 +22,6 @@ import org.bdgenomics.adam.util.SparkFunSuite class GenotypesSummarySuite extends SparkFunSuite { private val contig = ADAMContig.newBuilder() - .setContigId(1) .setContigName("abc") .setContigLength(100) .build diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/MarkDuplicatesSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/MarkDuplicatesSuite.scala index a84f7a546e..5834a963b6 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/MarkDuplicatesSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/MarkDuplicatesSuite.scala @@ -16,7 +16,7 @@ package org.bdgenomics.adam.rdd import org.bdgenomics.adam.util.SparkFunSuite -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } import org.bdgenomics.adam.rdd.ADAMContext._ import 
java.util.UUID @@ -26,15 +26,20 @@ class MarkDuplicatesSuite extends SparkFunSuite { ADAMRecord.newBuilder().setReadMapped(false).build() } - def createMappedRead(referenceId: Int, position: Long, + def createMappedRead(referenceName: String, position: Long, readName: String = UUID.randomUUID().toString, avgPhredScore: Int = 20, numClippedBases: Int = 0, isPrimaryAlignment: Boolean = true, isNegativeStrand: Boolean = false) = { assert(avgPhredScore >= 10 && avgPhredScore <= 50) val qual = (for (i <- 0 until 100) yield (avgPhredScore + 33).toChar).toString() val cigar = if (numClippedBases > 0) "%dS%dM".format(numClippedBases, 100 - numClippedBases) else "100M" + + val contig = ADAMContig.newBuilder + .setContigName("reference%s".format(referenceName)) + .build + ADAMRecord.newBuilder() - .setReferenceId(referenceId) + .setContig(contig) .setStart(position) .setQual(qual) .setCigar(cigar) @@ -44,28 +49,35 @@ class MarkDuplicatesSuite extends SparkFunSuite { .setRecordGroupName("machine foo") .setRecordGroupId(0) .setRecordGroupLibrary("library bar") - .setReferenceName("reference%d".format(referenceId)) .setDuplicateRead(false) .setReadNegativeStrand(isNegativeStrand) .build() } - def createPair(firstReferenceId: Int, firstPosition: Long, - secondReferenceId: Int, secondPosition: Long, + def createPair(firstReferenceName: String, firstPosition: Long, + secondReferenceName: String, secondPosition: Long, readName: String = UUID.randomUUID().toString, avgPhredScore: Int = 20): Seq[ADAMRecord] = { - val firstOfPair = createMappedRead(firstReferenceId, firstPosition, + val firstContig = ADAMContig.newBuilder + .setContigName(firstReferenceName) + .build + + val secondContig = ADAMContig.newBuilder + .setContigName(secondReferenceName) + .build + + val firstOfPair = createMappedRead(firstReferenceName, firstPosition, readName = readName, avgPhredScore = avgPhredScore) firstOfPair.setFirstOfPair(true) firstOfPair.setMateMapped(true) - firstOfPair.setMateReferenceId(secondReferenceId) + firstOfPair.setMateContig(secondContig) firstOfPair.setMateAlignmentStart(secondPosition) firstOfPair.setReadPaired(true) - val secondOfPair = createMappedRead(secondReferenceId, secondPosition, + val secondOfPair = createMappedRead(secondReferenceName, secondPosition, readName = readName, avgPhredScore = avgPhredScore, isNegativeStrand = true) secondOfPair.setSecondOfPair(true) secondOfPair.setMateMapped(true) - secondOfPair.setMateReferenceId(firstReferenceId) + secondOfPair.setMateContig(firstContig) secondOfPair.setMateAlignmentStart(firstPosition) secondOfPair.setReadPaired(true) Seq(firstOfPair, secondOfPair) @@ -76,15 +88,15 @@ class MarkDuplicatesSuite extends SparkFunSuite { } sparkTest("single read") { - val read = createMappedRead(0, 100) + val read = createMappedRead("0", 100) val marked = markDuplicates(read) // Can't have duplicates with a single read, should return the read unchanged. 
assert(marked(0) == read) } sparkTest("reads at different positions") { - val read1 = createMappedRead(0, 42) - val read2 = createMappedRead(0, 43) + val read1 = createMappedRead("0", 42) + val read2 = createMappedRead("0", 43) val marked = markDuplicates(read1, read2) // Reads shouldn't be modified assert(marked.contains(read1) && marked.contains(read2)) @@ -92,9 +104,9 @@ class MarkDuplicatesSuite extends SparkFunSuite { sparkTest("reads at the same position") { val poorReads = for (i <- 0 until 10) yield { - createMappedRead(1, 42, avgPhredScore = 20, readName = "poor%d".format(i)) + createMappedRead("1", 42, avgPhredScore = 20, readName = "poor%d".format(i)) } - val bestRead = createMappedRead(1, 42, avgPhredScore = 30, readName = "best") + val bestRead = createMappedRead("1", 42, avgPhredScore = 30, readName = "best") val marked = markDuplicates(List(bestRead) ++ poorReads: _*) val (dups, nonDup) = marked.partition(p => p.getDuplicateRead) assert(nonDup.size == 1 && nonDup(0) == bestRead) @@ -103,12 +115,12 @@ class MarkDuplicatesSuite extends SparkFunSuite { sparkTest("reads at the same position with clipping") { val poorClippedReads = for (i <- 0 until 5) yield { - createMappedRead(1, 44, numClippedBases = 2, avgPhredScore = 20, readName = "poorClipped%d".format(i)) + createMappedRead("1", 44, numClippedBases = 2, avgPhredScore = 20, readName = "poorClipped%d".format(i)) } val poorUnclippedReads = for (i <- 0 until 5) yield { - createMappedRead(1, 42, avgPhredScore = 20, readName = "poorUnclipped%d".format(i)) + createMappedRead("1", 42, avgPhredScore = 20, readName = "poorUnclipped%d".format(i)) } - val bestRead = createMappedRead(1, 42, avgPhredScore = 30, readName = "best") + val bestRead = createMappedRead("1", 42, avgPhredScore = 30, readName = "best") val marked = markDuplicates(List(bestRead) ++ poorClippedReads ++ poorUnclippedReads: _*) val (dups, nonDup) = marked.partition(p => p.getDuplicateRead) assert(nonDup.size == 1 && nonDup(0) == bestRead) @@ -117,9 +129,9 @@ class MarkDuplicatesSuite extends SparkFunSuite { sparkTest("reads on reverse strand") { val poorReads = for (i <- 0 until 7) yield { - createMappedRead(10, 42, isNegativeStrand = true, avgPhredScore = 20, readName = "poor%d".format(i)) + createMappedRead("10", 42, isNegativeStrand = true, avgPhredScore = 20, readName = "poor%d".format(i)) } - val bestRead = createMappedRead(10, 42, isNegativeStrand = true, avgPhredScore = 30, readName = "best") + val bestRead = createMappedRead("10", 42, isNegativeStrand = true, avgPhredScore = 30, readName = "best") val marked = markDuplicates(List(bestRead) ++ poorReads: _*) val (dups, nonDup) = marked.partition(p => p.getDuplicateRead) assert(nonDup.size == 1 && nonDup(0) == bestRead) @@ -137,9 +149,9 @@ class MarkDuplicatesSuite extends SparkFunSuite { sparkTest("read pairs") { val poorPairs = for ( i <- 0 until 10; - read <- createPair(0, 10, 0, 210, avgPhredScore = 20, readName = "poor%d".format(i)) + read <- createPair("0", 10, "0", 210, avgPhredScore = 20, readName = "poor%d".format(i)) ) yield read - val bestPair = createPair(0, 10, 0, 210, avgPhredScore = 30, readName = "best") + val bestPair = createPair("0", 10, "0", 210, avgPhredScore = 30, readName = "best") val marked = markDuplicates(bestPair ++ poorPairs: _*) val (dups, nonDups) = marked.partition(_.getDuplicateRead) assert(nonDups.size == 2 && nonDups.forall(p => p.getReadName.toString == "best")) @@ -148,10 +160,10 @@ class MarkDuplicatesSuite extends SparkFunSuite { sparkTest("read pairs with fragments") { 
val fragments = for (i <- 0 until 10) yield { - createMappedRead(2, 33, avgPhredScore = 40, readName = "fragment%d".format(i)) + createMappedRead("2", 33, avgPhredScore = 40, readName = "fragment%d".format(i)) } // Even though the phred score is lower, pairs always score higher than fragments - val pairs = createPair(2, 33, 2, 200, avgPhredScore = 20, readName = "pair") + val pairs = createPair("2", 33, "2", 200, avgPhredScore = 20, readName = "pair") val marked = markDuplicates(fragments ++ pairs: _*) val (dups, nonDups) = marked.partition(_.getDuplicateRead) assert(nonDups.size == 2 && nonDups.forall(p => p.getReadName.toString == "pair")) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/PileupAggregationSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/PileupAggregationSuite.scala index ec02a47952..d953ec833f 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/PileupAggregationSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/PileupAggregationSuite.scala @@ -17,7 +17,7 @@ package org.bdgenomics.adam.rdd import org.scalatest.FunSuite -import org.bdgenomics.adam.avro.{ ADAMPileup, Base } +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMPileup, Base } class PileupAggregationSuite extends FunSuite { @@ -53,7 +53,13 @@ class PileupAggregationSuite extends FunSuite { } test("aggregating a pileup with a single base type") { + val c0 = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(1000) + .build + val p0 = ADAMPileup.newBuilder() + .setContig(c0) .setPosition(1L) .setReadBase(Base.A) .setMapQuality(9) @@ -66,6 +72,7 @@ class PileupAggregationSuite extends FunSuite { .setReadEnd(1L) .build() val p1 = ADAMPileup.newBuilder() + .setContig(c0) .setPosition(1L) .setReadBase(Base.A) .setMapQuality(11) @@ -98,7 +105,11 @@ class PileupAggregationSuite extends FunSuite { } test("aggregating a pileup with a single base type, multiple bases at a position") { + val contig = ADAMContig.newBuilder + .setContigName("chr0") + .build val p0 = ADAMPileup.newBuilder() + .setContig(contig) .setPosition(1L) .setReadBase(Base.A) .setMapQuality(8) @@ -111,6 +122,7 @@ class PileupAggregationSuite extends FunSuite { .setReadEnd(1L) .build() val p1 = ADAMPileup.newBuilder() + .setContig(contig) .setPosition(1L) .setReadBase(Base.A) .setMapQuality(11) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/RegionJoinSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/RegionJoinSuite.scala index 47037cc01b..0787af44c2 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/RegionJoinSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/RegionJoinSuite.scala @@ -17,7 +17,7 @@ package org.bdgenomics.adam.rdd import org.bdgenomics.adam.util.SparkFunSuite -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } import org.bdgenomics.adam.models.{ SequenceRecord, SequenceDictionary, ReferenceRegion, ReferenceMapping } import org.bdgenomics.adam.rich.ReferenceMappingContext._ import org.apache.spark.SparkContext._ @@ -27,8 +27,8 @@ class RegionJoinSuite extends SparkFunSuite { before { seqDict = SequenceDictionary( - SequenceRecord(1, "1", 5, "test://chrom1"), - SequenceRecord(2, "2", 5, "test://chrom2")) + SequenceRecord("chr1", 5, "test://chrom1"), + SequenceRecord("chr2", 5, "test://chrom2")) } test("alternating returns an alternating seq of items") { @@ -51,7 +51,7 @@ class RegionJoinSuite extends SparkFunSuite { } test("Single region returns itself") { - val region = 
new ReferenceRegion(1, 1, 2) + val region = new ReferenceRegion("chr1", 1, 2) val regions = new NonoverlappingRegions(seqDict, Seq(region)) val result = regions.findOverlappingRegions(region) assert(result.size === 1) @@ -60,17 +60,17 @@ class RegionJoinSuite extends SparkFunSuite { test("Two adjacent regions will be merged") { val regions = new NonoverlappingRegions(seqDict, Seq( - ReferenceRegion(1, 10, 20), - ReferenceRegion(1, 20, 30))) + ReferenceRegion("chr1", 10, 20), + ReferenceRegion("chr1", 20, 30))) assert(regions.endpoints === Array(10L, 30L)) } test("Nonoverlapping regions will all be returned") { - val region1 = new ReferenceRegion(1, 1, 2) - val region2 = new ReferenceRegion(1, 3, 5) - val testRegion3 = new ReferenceRegion(1, 1, 4) - val testRegion1 = new ReferenceRegion(1, 4, 5) + val region1 = new ReferenceRegion("chr1", 1, 2) + val region2 = new ReferenceRegion("chr1", 3, 5) + val testRegion3 = new ReferenceRegion("chr1", 1, 4) + val testRegion1 = new ReferenceRegion("chr1", 4, 5) val regions = new NonoverlappingRegions(seqDict, Seq(region1, region2)) // this should be 2, not 3, because binaryRegionSearch is (now) no longer returning @@ -81,20 +81,23 @@ class RegionJoinSuite extends SparkFunSuite { } test("Many overlapping regions will all be merged") { - val region1 = new ReferenceRegion(1, 1, 3) - val region2 = new ReferenceRegion(1, 2, 4) - val region3 = new ReferenceRegion(1, 3, 5) - val testRegion = new ReferenceRegion(1, 1, 4) + val region1 = new ReferenceRegion("chr1", 1, 3) + val region2 = new ReferenceRegion("chr1", 2, 4) + val region3 = new ReferenceRegion("chr1", 3, 5) + val testRegion = new ReferenceRegion("chr1", 1, 4) val regions = new NonoverlappingRegions(seqDict, Seq(region1, region2, region3)) assert(regions.findOverlappingRegions(testRegion).size === 1) } test("ADAMRecords return proper references") { + val contig = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(5L) + .setReferenceURL("test://chrom1") + .build + val built = ADAMRecord.newBuilder() - .setReferenceId(1) - .setReferenceName("1") - .setReferenceLength(5) - .setReferenceUrl("test://chrom1") + .setContig(contig) .setStart(1) .setReadMapped(true) .setCigar("1M") @@ -113,11 +116,14 @@ class RegionJoinSuite extends SparkFunSuite { } sparkTest("Ensure same reference regions get passed together") { + val contig = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(5L) + .setReferenceURL("test://chrom1") + .build + val builder = ADAMRecord.newBuilder() - .setReferenceId(1) - .setReferenceName("1") - .setReferenceLength(5) - .setReferenceUrl("test://chrom1") + .setContig(contig) .setStart(1) .setReadMapped(true) .setCigar("1M") @@ -150,11 +156,14 @@ class RegionJoinSuite extends SparkFunSuite { } sparkTest("Overlapping reference regions") { + val contig = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(5L) + .setReferenceURL("test://chrom1") + .build + val built = ADAMRecord.newBuilder() - .setReferenceId(1) - .setReferenceName("1") - .setReferenceLength(5) - .setReferenceUrl("test://chrom1") + .setContig(contig) .setStart(1) .setReadMapped(true) .setCigar("1M") @@ -184,20 +193,26 @@ class RegionJoinSuite extends SparkFunSuite { } sparkTest("Multiple reference regions do not throw exception") { + val contig1 = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(5L) + .setReferenceURL("test://chrom1") + .build + + val contig2 = ADAMContig.newBuilder + .setContigName("chr2") + .setContigLength(5L) + .setReferenceURL("test://chrom2") + 
.build + val builtRef1 = ADAMRecord.newBuilder() - .setReferenceId(1) - .setReferenceName("1") - .setReferenceLength(5) - .setReferenceUrl("test://chrom1") + .setContig(contig1) .setStart(1) .setReadMapped(true) .setCigar("1M") .build() val builtRef2 = ADAMRecord.newBuilder() - .setReferenceId(2) - .setReferenceName("2") - .setReferenceLength(5) - .setReferenceUrl("test://chrom2") + .setContig(contig2) .setStart(1) .setReadMapped(true) .setCigar("1M") @@ -229,20 +244,26 @@ class RegionJoinSuite extends SparkFunSuite { } sparkTest("regionJoin contains the same results as cartesianRegionJoin") { + val contig1 = ADAMContig.newBuilder + .setContigName("chr1") + .setContigLength(5L) + .setReferenceURL("test://chrom1") + .build + + val contig2 = ADAMContig.newBuilder + .setContigName("chr2") + .setContigLength(5L) + .setReferenceURL("test://chrom2") + .build + val builtRef1 = ADAMRecord.newBuilder() - .setReferenceId(1) - .setReferenceName("1") - .setReferenceLength(5) - .setReferenceUrl("test://chrom1") + .setContig(contig1) .setStart(1) .setReadMapped(true) .setCigar("1M") .build() val builtRef2 = ADAMRecord.newBuilder() - .setReferenceId(2) - .setReferenceName("2") - .setReferenceLength(5) - .setReferenceUrl("test://chrom2") + .setContig(contig2) .setStart(1) .setReadMapped(true) .setCigar("1M") diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/comparisons/ComparisonTraversalEngineSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/comparisons/ComparisonTraversalEngineSuite.scala index 4988d3e1b1..db5568fb3b 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/comparisons/ComparisonTraversalEngineSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/comparisons/ComparisonTraversalEngineSuite.scala @@ -16,7 +16,7 @@ package org.bdgenomics.adam.rdd.comparisons import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } import org.bdgenomics.adam.util.{ Histogram, SparkFunSuite } import org.bdgenomics.adam.projections.ADAMRecordField import org.bdgenomics.adam.metrics.aggregators.HistogramAggregator @@ -25,11 +25,14 @@ import org.bdgenomics.adam.metrics.MappedPosition class ComparisonTraversalEngineSuite extends SparkFunSuite { sparkTest("generate works on a simple RDD") { + val c0 = ADAMContig.newBuilder + .setContigName("chr0") + .build val a0 = ADAMRecord.newBuilder() + .setContig(c0) .setRecordGroupName("group0") .setReadName("read0") - .setReferenceId(0) .setStart(100) .setPrimaryAlignment(true) .setReadPaired(false) @@ -49,7 +52,7 @@ class ComparisonTraversalEngineSuite extends SparkFunSuite { val b = sc.parallelize(Seq(b0, b1)) import ADAMRecordField._ - val fields = Seq(recordGroupId, readName, referenceId, start, primaryAlignment, readPaired, readMapped) + val fields = Seq(recordGroupId, readName, contig, start, primaryAlignment, readPaired, readMapped) val engine = new ComparisonTraversalEngine(fields, a, b)(sc) @@ -67,10 +70,14 @@ class ComparisonTraversalEngineSuite extends SparkFunSuite { } sparkTest("combine works on a simple RDD") { + val c0 = ADAMContig.newBuilder + .setContigName("chr0") + .build + val a0 = ADAMRecord.newBuilder() .setRecordGroupName("group0") .setReadName("read0") - .setReferenceId(0) + .setContig(c0) .setStart(100) .setPrimaryAlignment(true) .setReadPaired(false) @@ -93,7 +100,7 @@ class ComparisonTraversalEngineSuite extends SparkFunSuite { val b = sc.parallelize(Seq(b0, b1, b2)) import ADAMRecordField._ - val fields = Seq(recordGroupId, 
readName, referenceId, start, primaryAlignment, readPaired, readMapped) + val fields = Seq(recordGroupId, readName, contig, start, primaryAlignment, readPaired, readMapped) val engine = new ComparisonTraversalEngine(fields, a, b)(sc) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variation/ADAMVariantContextRDDFunctionsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variation/ADAMVariantContextRDDFunctionsSuite.scala index fe10db57c0..234552d82a 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variation/ADAMVariantContextRDDFunctionsSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variation/ADAMVariantContextRDDFunctionsSuite.scala @@ -26,7 +26,7 @@ class ADAMVariantContextRDDFunctionsSuite extends SparkFunSuite { sparkTest("joins SNV database annotation") { val v0 = ADAMVariant.newBuilder - .setContig(ADAMContig.newBuilder.setContigName("11").setContigId(11).build) + .setContig(ADAMContig.newBuilder.setContigName("11").build) .setPosition(17409572) .setReferenceAllele("T") .setVariantAllele("C") diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variation/ADAMVariationContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variation/ADAMVariationContextSuite.scala index b6634930d6..acb5a107a2 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variation/ADAMVariationContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variation/ADAMVariationContextSuite.scala @@ -30,7 +30,7 @@ class ADAMVariationContextSuite extends SparkFunSuite { def variants: RDD[ADAMVariantContext] = { val v0 = ADAMVariant.newBuilder - .setContig(ADAMContig.newBuilder.setContigId(1).setContigName("chr11").build) + .setContig(ADAMContig.newBuilder.setContigName("chr11").build) .setPosition(17409572) .setReferenceAllele("T") .setVariantAllele("C") diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rich/ADAMRecordReferenceMappingSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rich/ADAMRecordReferenceMappingSuite.scala index 366eb74eaf..1268470c94 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rich/ADAMRecordReferenceMappingSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rich/ADAMRecordReferenceMappingSuite.scala @@ -16,21 +16,17 @@ package org.bdgenomics.adam.rich import org.bdgenomics.adam.util.SparkFunSuite -import org.bdgenomics.adam.avro.ADAMRecord +import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord } import org.bdgenomics.adam.rich.ReferenceMappingContext.ADAMRecordReferenceMapping class ADAMRecordReferenceMappingSuite extends SparkFunSuite { sparkTest("test getReferenceId returns the right referenceId") { - val rec = ADAMRecord.newBuilder().setReferenceId(12).build() - assert(ADAMRecordReferenceMapping.getReferenceId(rec) === 12) - } - - sparkTest("test that remapReferenceId really changes the referenceId") { - val rec = ADAMRecord.newBuilder().setReferenceId(12).build() - val rec2 = ADAMRecordReferenceMapping.remapReferenceId(rec, 15) + val contig = ADAMContig.newBuilder + .setContigName("chr12") + .build - assert(ADAMRecordReferenceMapping.getReferenceId(rec) === 12) - assert(ADAMRecordReferenceMapping.getReferenceId(rec2) === 15) + val rec = ADAMRecord.newBuilder().setContig(contig).build() + assert(ADAMRecordReferenceMapping.getReferenceName(rec) === "chr12") } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichADAMVariantSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichADAMVariantSuite.scala index 
bc8af41a53..3d60fbd2a3 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichADAMVariantSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichADAMVariantSuite.scala @@ -22,12 +22,10 @@ import org.bdgenomics.adam.avro.{ ADAMVariant, ADAMContig } class RichADAMVariantSuite extends FunSuite { test("Equality without reference MD5") { val c1 = ADAMContig.newBuilder() - .setContigId(0) .setContigName("chr1") .setContigMD5("1b22b98cdeb4a9304cb5d48026a85128") .build val c2 = ADAMContig.newBuilder() - .setContigId(0) .setContigName("chr1") .build() @@ -52,12 +50,10 @@ class RichADAMVariantSuite extends FunSuite { test("Equality with reference MD5") { val c1 = ADAMContig.newBuilder() - .setContigId(0) .setContigName("chr1") .setContigMD5("1b22b98cdeb4a9304cb5d48026a85128") .build val c2 = ADAMContig.newBuilder() - .setContigId(0) .setContigName("chr1") .setContigMD5("1b22b98cdeb4a9304cb5d48026a85127") .build diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichRDDReferenceRecordsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichRDDReferenceRecordsSuite.scala deleted file mode 100644 index 5700762160..0000000000 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichRDDReferenceRecordsSuite.scala +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2014 Genome Bridge LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.bdgenomics.adam.rich - -import org.bdgenomics.adam.util.SparkFunSuite -import org.bdgenomics.adam.avro.ADAMRecord - -import org.bdgenomics.adam.rich.RichRDDReferenceRecords._ - -class RichRDDReferenceRecordsSuite extends SparkFunSuite { - - sparkTest("can convert referenceId values of RDD[ADAMRecord]") { - - val r1 = adamRecord(0, "foo", "read1", 1000L, true) - val r2 = adamRecord(1, "bar", "read2", 2000L, true) - - val map = Map(r1.getReadName -> r1, r2.getReadName -> r2) - - val rdd = sc.parallelize(Seq(r1, r2)) - - val remap = Map(0 -> 10, 1 -> 11) - val remapped = rdd.remapReferenceId(remap)(sc) - - val seq = remapped.collect().sortBy(_.getReadName.toString) - assert(seq.size === 2) - assert(seq(0).getReadName.toString === "read1") - assert(seq(1).getReadName.toString === "read2") - assert(seq(0).getReferenceId === 10) - assert(seq(1).getReferenceId === 11) - } - - def adamRecord(referenceId: Int, referenceName: String, readName: String, start: Long, readMapped: Boolean) = - ADAMRecord.newBuilder() - .setReferenceId(referenceId) - .setReferenceName(referenceName) - .setReadName(readName) - .setReadMapped(readMapped) - .setStart(start) - .build() -} diff --git a/adam-format/src/main/resources/avro/adam.avdl b/adam-format/src/main/resources/avro/adam.avdl index 5229275cba..5402abbe63 100644 --- a/adam-format/src/main/resources/avro/adam.avdl +++ b/adam-format/src/main/resources/avro/adam.avdl @@ -1,6 +1,13 @@ @namespace("org.bdgenomics.adam.avro") protocol ADAM { +record ADAMContig { + union { null, string } contigName = null; + union { null, long } contigLength = null; + union { null, string } contigMD5 = null; + union { null, string } referenceURL = null; +} + record ADAMRecord { /** @@ -9,10 +16,8 @@ record ADAMRecord { * of the schema, collectively form the contents * of the Sequence Dictionary embedded in the these * records from the BAM / SAM itself. - * TODO: this should be moved to ADAMContig */ - union { null, string } referenceName = null; - union { null, int } referenceId = null; + union { null, ADAMContig } contig = null; // 0-based reference position start union { null, long } start = null; @@ -59,14 +64,7 @@ record ADAMRecord { union { null, string } recordGroupPlatformUnit = null; union { null, string } recordGroupSample = null; - union { null, int } mateReferenceId = null; - - // See note, above. - union { null, long } referenceLength = null; - union { null, string } referenceUrl = null; - - union { null, long } mateReferenceLength = null; - union { null, string } mateReferenceUrl = null; + union { null, ADAMContig} mateContig = null; } enum Base { @@ -93,10 +91,8 @@ record ADAMNucleotideContigFragment { // stores a contig of nucleotides; this may be a reference chromosome, may be an // assembly, may be a BAC. very long contigs (>1Mbp) need to be split into fragments. // it seems that they are too long to load in a single go. 
-  union { null, string } contigName = null;
-  union { null, int } contigId = null;
+  union { null, ADAMContig } contig = null;
   union { null, string } description = null;
-  union { null, string } url = null;
   union { null, string } fragmentSequence = null; // sequence of bases in this fragment
   union { null, long } contigLength = null; // length of the total contig (all fragments)
   union { null, int } fragmentNumber = null; // ordered number for this fragment
@@ -105,8 +101,7 @@ record ADAMNucleotideContigFragment {
 }
 
 record ADAMPileup {
-  union { null, string } referenceName = null;
-  union { null, int } referenceId = null;
+  union { null, ADAMContig } contig = null;
   union { null, long } position = null;
   union { null, int } rangeOffset = null;
   union { null, int } rangeLength = null;
@@ -142,13 +137,6 @@ record ADAMNestedPileup {
   array readEvidence;
 }
 
-record ADAMContig {
-  union { null, int } contigId = null;
-  union { null, string } contigName = null;
-  union { null, long } contigLength = null;
-  union { null, string } contigMD5 = null;
-  union { null, string } referenceURL = null;
-}
 
 record ADAMVariant {
   union { null, ADAMContig } contig = null;
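
Taken together, the test changes above suggest the post-migration usage pattern below. This is a minimal sketch assembled from the builders and constructors exercised in this patch (ADAMContig, ADAMRecord, ReferencePosition, ReferenceRegion, SequenceRecord, SequenceDictionary); it is not part of the patch itself, and the contig name, length, and URL values are illustrative only.

import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord }
import org.bdgenomics.adam.models.{ ReferencePosition, ReferenceRegion, SequenceDictionary, SequenceRecord }

// A contig is now identified by its RefSeq-style name; no integer contigId is minted.
val chr1 = ADAMContig.newBuilder
  .setContigName("chr1")       // illustrative name
  .setContigLength(1000L)      // illustrative length
  .setReferenceURL("http://bigdatagenomics.github.io/chr1.fa") // illustrative URL
  .build

// Reads embed the contig record instead of carrying separate
// referenceId / referenceName / referenceLength / referenceUrl fields.
val read = ADAMRecord.newBuilder()
  .setContig(chr1)
  .setStart(100L)
  .setReadMapped(true)
  .setCigar("100M")
  .build()

// Positions, regions, and sequence dictionary entries are keyed by reference name as well.
val pos = ReferencePosition("chr1", 100L)
val region = ReferenceRegion("chr1", 100L, 200L)
val dict = SequenceDictionary(SequenceRecord("chr1", 1000L, "http://bigdatagenomics.github.io/chr1.fa"))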