Skip to content

Commit

Permalink
Merge pull request #217 from hammerlab/ref-id-to-name
Browse files Browse the repository at this point in the history
Stop using reference IDs and use reference names instead
  • Loading branch information
fnothaft committed Apr 22, 2014
2 parents 964ce7c + 1da1c6e commit b777945
Show file tree
Hide file tree
Showing 49 changed files with 731 additions and 995 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class CalculateDepth(protected val args: CalculateDepthArgs) extends ADAMSparkCo
// Quiet parquet logging...
ParquetLogger.hadoopLoggerLevel(Level.SEVERE)

val proj = Projection(referenceId, referenceName, referenceLength, referenceUrl, start, cigar, readMapped)
val proj = Projection(contig, start, cigar, readMapped)

val adamRDD: RDD[ADAMRecord] = sc.adamLoad(args.adamInputPath, projection = Some(proj))
val mappedRDD = adamRDD.filter(_.getReadMapped)
Expand Down Expand Up @@ -112,7 +112,7 @@ class CalculateDepth(protected val args: CalculateDepthArgs) extends ADAMSparkCo
depths.collect().foreach {
case (region, count) =>
println("%20s\t%15s\t% 5d".format(
"%s:%d".format(seqDict(region.refId).name, region.start),
"%s:%d".format(region.referenceName, region.start),
variantNames(region),
count))
}
Expand Down Expand Up @@ -144,13 +144,11 @@ class CalculateDepth(protected val args: CalculateDepthArgs) extends ADAMSparkCo
throw new IllegalArgumentException("chromosome name \"%s\" wasn't in the sequence dictionary (%s)".format(
chrom, seqDict.records.map(_.name).mkString(",")))
}
val refId = seqDict(chrom).id
val start = array(1).toLong
val name = array(2)
val end = start + array(3).length
(ReferenceRegion(refId, start, end), name)
(ReferenceRegion(chrom, start, end), name)
}
}.toSeq)
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class FindReads(protected val args: FindReadsArgs) extends ADAMSparkCommand[Find
case (name: CharSequence, ((bucket1: ReadBucket, bucket2: ReadBucket), generated: Seq[Any])) => {
val rec1 = bucket1.allReads().head
val rec2 = bucket2.allReads().head
(name, "%s:%d".format(rec1.getReferenceName, rec1.getStart), "%s:%d".format(rec2.getReferenceName, rec2.getStart))
(name, "%s:%d".format(rec1.getContig.getContigName, rec1.getStart), "%s:%d".format(rec2.getContig.getContigName, rec2.getStart))
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class FlagStat(protected val args: FlagStatArgs) extends ADAMSparkCommand[FlagSt

val projection = Projection(
ADAMRecordField.readMapped, ADAMRecordField.mateMapped, ADAMRecordField.readPaired,
ADAMRecordField.referenceId, ADAMRecordField.mateReferenceId,
ADAMRecordField.contig, ADAMRecordField.mateContig,
ADAMRecordField.primaryAlignment,
ADAMRecordField.duplicateRead, ADAMRecordField.readMapped, ADAMRecordField.mateMapped,
ADAMRecordField.firstOfPair, ADAMRecordField.secondOfPair,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ class ListDict(protected val args: ListDictArgs) extends ADAMSparkCommand[ListDi

val dict = sc.adamDictionaryLoad[ADAMRecord](args.inputPath)

dict.recordsIn.sortBy(_.id).foreach {
dict.recordsIn.sortBy(_.name.toString).foreach {
rec: SequenceRecord =>
println("%d\t%s\t%d".format(rec.id, rec.name, rec.length))
println("%s\t%d".format(rec.name, rec.length))
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
*/
package org.bdgenomics.adam.converters

import org.bdgenomics.adam.avro.ADAMNucleotideContigFragment
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.bdgenomics.adam.avro.{ ADAMContig, ADAMNucleotideContigFragment }
import org.bdgenomics.adam.rdd.ADAMContext._
import scala.Int
import scala.math.Ordering.Int
import scala.Predef._
Expand Down Expand Up @@ -195,18 +196,19 @@ private[converters] class FastaConverter(fragmentLength: Long) extends Serializa
.map(si => {
val (bases, index) = si

val contig = ADAMContig.newBuilder
.setContigLength(sequenceLength)

val builder = ADAMNucleotideContigFragment.newBuilder()
.setContigId(id)
.setFragmentSequence(bases)
.setContigLength(sequenceLength)
.setFragmentNumber(index)
.setFragmentStartPosition(index * fragmentLength)
.setNumberOfFragmentsInContig(fragmentCount)

// map over optional fields
name.foreach(builder.setContigName(_))
name.foreach(contig.setContigName(_))
description.foreach(builder.setDescription(_))

builder.setContig(contig.build)
// build and return
builder.build()
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ package org.bdgenomics.adam.converters

import net.sf.samtools.{ SAMReadGroupRecord, SAMRecord }

import org.bdgenomics.adam.avro.ADAMRecord
import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord }
import scala.collection.JavaConverters._
import org.bdgenomics.adam.models.{ Attribute, RecordGroupDictionary, SequenceDictionary }
import org.bdgenomics.adam.util.AttributeUtils
Expand All @@ -36,10 +36,10 @@ class SAMRecordConverter extends Serializable {
val readReference: Int = samRecord.getReferenceIndex
if (readReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
builder
.setReferenceId(readReference)
.setReferenceName(samRecord.getReferenceName)
.setReferenceLength(dict(samRecord.getReferenceIndex).length)
.setReferenceUrl(dict(samRecord.getReferenceIndex).url)
.setContig(ADAMContig.newBuilder
.setContigName(samRecord.getReferenceName)
.setContigLength(dict(samRecord.getReferenceName).length)
.setReferenceURL(dict(samRecord.getReferenceName).url).build)

val start: Int = samRecord.getAlignmentStart
if (start != 0) {
Expand All @@ -58,10 +58,10 @@ class SAMRecordConverter extends Serializable {

if (mateReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
builder
.setMateReferenceId(mateReference)
.setMateReference(samRecord.getMateReferenceName)
.setMateReferenceLength(dict(samRecord.getMateReferenceName).length)
.setMateReferenceUrl(dict(samRecord.getMateReferenceName).url)
.setMateContig(ADAMContig.newBuilder
.setContigName(samRecord.getMateReferenceName)
.setContigLength(dict(samRecord.getMateReferenceName).length)
.setReferenceURL(dict(samRecord.getMateReferenceName).url).build)

val mateStart = samRecord.getMateAlignmentStart
if (mateStart > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ class VariantContextConverter(dict: Option[SequenceDictionary] = None) extends S

val contig: ADAMContig.Builder = ADAMContig.newBuilder()
.setContigName(vc.getChr)
.setContigId(contigId)

if (dict.isDefined) {
val sr = (dict.get)(vc.getChr)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
*/
package org.bdgenomics.adam.metrics

import org.bdgenomics.adam.rich.RichADAMRecord._
import org.bdgenomics.adam.avro.{ ADAMRecord, ADAMContig }
import org.bdgenomics.adam.models.ReadBucket
import org.bdgenomics.adam.projections.FieldValue
import org.bdgenomics.adam.projections.ADAMRecordField._
import org.bdgenomics.adam.avro.ADAMRecord
import org.bdgenomics.adam.rich.RichADAMRecord._
import org.bdgenomics.adam.util.Util._;
import scala.collection.Map
import org.bdgenomics.adam.models.ReadBucket

Expand Down Expand Up @@ -93,7 +95,7 @@ object MappedPosition extends LongComparisons with Serializable {
case 1 => {
val r1 = records1.head
val r2 = records2.head
if (r1.getReferenceId == r2.getReferenceId) {
if (isSameContig(r1.getContig, r2.getContig)) {
val start1 = r1.getStart
val start2 = r2.getStart
if (start1 > start2) start1 - start2 else start2 - start1
Expand All @@ -115,7 +117,6 @@ object MappedPosition extends LongComparisons with Serializable {
distance(bucket1.pairedSecondSecondaryMappedReads, bucket2.pairedSecondSecondaryMappedReads))

// Record fields this comparison needs to have loaded.
// `contig` takes the place of the retired `referenceId`: the distance computation in this
// object compares `r1.getContig` with `r2.getContig`, so the contig must be listed here
// alongside `start`.
// NOTE(review): assumes `schemas` drives the projection used when records are loaded — confirm
// against the caller.
def schemas: Seq[FieldValue] = Seq(
  contig,
  start,
  firstOfPair)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,6 @@
package org.bdgenomics.adam.models

// Abstraction over values of type T that can be related to a reference sequence.
// NOTE(review): this listing appears to interleave pre-change and post-change lines of a diff;
// the two ID-based members below look like the ones being retired — confirm before implementing.
trait ReferenceMapping[T] {

// Integer reference identifier for `value`.
def getReferenceId(value: T): Int
// Returns a T carrying `newId`; presumably a copy of `value` — verify against implementations.
def remapReferenceId(value: T, newId: Int): T

// Reference name for `value`, as a String.
def getReferenceName(value: T): String
// ReferenceRegion associated with `value`.
def getReferenceRegion(value: T): ReferenceRegion
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,18 @@ object ReferencePositionWithOrientation {

case class ReferencePositionWithOrientation(refPos: Option[ReferencePosition], negativeStrand: Boolean)
extends Ordered[ReferencePositionWithOrientation] {

override def compare(that: ReferencePositionWithOrientation): Int = {
val posCompare = refPos.compare(that.refPos)
if (refPos.isEmpty && that.refPos.isEmpty) {
return 0
}
if (refPos.isEmpty) {
return -1
}
if (that.refPos.isEmpty) {
return 1
}
val posCompare = refPos.get.compare(that.refPos.get)
if (posCompare != 0) {
posCompare
} else {
Expand All @@ -61,7 +71,7 @@ object ReferencePosition {
* which is not located anywhere along the reference sequences (see, e.g. its use in
* GenomicRegionPartitioner).
*/
val UNMAPPED = new ReferencePosition(-1, -1)
val UNMAPPED = new ReferencePosition("", -1)

/**
* Checks to see if a read is mapped with a valid position.
Expand All @@ -70,9 +80,9 @@ object ReferencePosition {
* @return True if read is mapped and has a valid position, else false.
*/
def mappedPositionCheck(record: ADAMRecord): Boolean = {
  // Option(...) turns a null into None. The previous Some(...) wrappers were always
  // defined, so the null guards never fired and a record without a contig hit a
  // NullPointerException on getContigName.
  val contigName = Option(record.getContig).flatMap(c => Option(c.getContigName))
  val start = Option(record.getStart)
  // A usable position needs the mapped flag, a named contig, and a start coordinate.
  record.getReadMapped && contigName.isDefined && start.isDefined
}

/**
Expand All @@ -87,7 +97,7 @@ object ReferencePosition {
*/
def apply(record: ADAMRecord): Option[ReferencePosition] = {
if (mappedPositionCheck(record)) {
Some(new ReferencePosition(record.getReferenceId, record.getStart))
Some(new ReferencePosition(record.getContig.getContigName, record.getStart))
} else {
None
}
Expand All @@ -104,7 +114,7 @@ object ReferencePosition {
* @return The reference position of this variant.
*/
def apply(variant: ADAMVariant): ReferencePosition = {
// NOTE(review): the two constructor calls look like the before/after lines of a diff; as
// written only the last expression (keyed by contig name) is the returned value.
new ReferencePosition(variant.getContig.getContigId, variant.getPosition)
new ReferencePosition(variant.getContig.getContigName, variant.getPosition)
}

/**
Expand All @@ -118,7 +128,7 @@ object ReferencePosition {
*/
def apply(genotype: ADAMGenotype): ReferencePosition = {
// The position is taken from the variant attached to the genotype.
val variant = genotype.getVariant()
// NOTE(review): the two constructor calls look like the before/after lines of a diff; as
// written only the last expression (keyed by contig name) is the returned value.
new ReferencePosition(variant.getContig.getContigId, variant.getPosition)
new ReferencePosition(variant.getContig.getContigName, variant.getPosition)
}

/**
Expand All @@ -134,7 +144,7 @@ object ReferencePosition {
*/
def fivePrime(record: ADAMRecord): Option[ReferencePosition] = {
if (mappedPositionCheck(record)) {
Some(new ReferencePosition(record.getReferenceId, record.fivePrimePosition.get))
Some(new ReferencePosition(record.getContig.getContigName, record.fivePrimePosition.get))
} else {
None
}
Expand All @@ -148,15 +158,15 @@ object ReferencePosition {
* @return The reference position of this pileup.
*/
def apply(pileup: ADAMPileup): ReferencePosition = {
// NOTE(review): the two constructor calls look like the before/after lines of a diff; as
// written only the last expression (keyed by the pileup's contig name) is the returned value.
new ReferencePosition(pileup.getReferenceId, pileup.getPosition)
new ReferencePosition(pileup.getContig.getContigName, pileup.getPosition)
}
}

case class ReferencePosition(refId: Int, pos: Long) extends Ordered[ReferencePosition] {
case class ReferencePosition(referenceName: String, pos: Long) extends Ordered[ReferencePosition] {

def compare(that: ReferencePosition): Int = {
override def compare(that: ReferencePosition): Int = {
// Note: important to compare by reference first for coordinate ordering
val refCompare = refId.compare(that.refId)
val refCompare = referenceName.compare(that.referenceName)
if (refCompare != 0) {
refCompare
} else {
Expand Down Expand Up @@ -194,13 +204,13 @@ class ReferencePositionWithOrientationSerializer extends Serializer[ReferencePos

// Serializer for ReferencePosition values.
// NOTE(review): this listing appears to interleave the pre-change (refId: Int) and post-change
// (referenceName: String) lines of a diff — confirm which set is live, since it decides the
// wire format.
class ReferencePositionSerializer extends Serializer[ReferencePosition] {
// Emits the reference identifier first, then the position.
def write(kryo: Kryo, output: Output, obj: ReferencePosition) = {
output.writeInt(obj.refId)
output.writeString(obj.referenceName)
output.writeLong(obj.pos)
}

// Consumes the fields in the same order write() emitted them and rebuilds the position.
def read(kryo: Kryo, input: Input, klazz: Class[ReferencePosition]): ReferencePosition = {
val refId = input.readInt()
val refName = input.readString()
val pos = input.readLong()
new ReferencePosition(refId, pos)
new ReferencePosition(refName, pos)
}
}
Loading

0 comments on commit b777945

Please sign in to comment.