diff --git a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/BiallelicGenotyper.scala b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/BiallelicGenotyper.scala index bdd85d72..1fd68f90 100644 --- a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/BiallelicGenotyper.scala +++ b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/BiallelicGenotyper.scala @@ -216,11 +216,11 @@ class BiallelicGenotyper( // load reads val projection = Some(Filter(AlignmentRecordField.attributes, - AlignmentRecordField.origQual, - AlignmentRecordField.recordGroupName)) + AlignmentRecordField.originalQuality, + AlignmentRecordField.readGroupId)) val reads = sc.loadAlignments(args.inputPath, optProjection = projection) - val samples = reads.recordGroups.recordGroups.map(_.sample).toSet + val samples = reads.readGroups.readGroups.map(_.sampleId).toSet require(samples.nonEmpty, "Didn't see any samples attached to input. Did you forget to add read groups?") require(samples.size <= 1, diff --git a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/MergeDiscovered.scala b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/MergeDiscovered.scala index 67bab2dc..0b4b3e5a 100644 --- a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/MergeDiscovered.scala +++ b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/MergeDiscovered.scala @@ -56,7 +56,7 @@ class MergeDiscovered( sc.loadVariants(args.inputPath) .transformDataset(_.dropDuplicates("start", "end", - "contigName", + "referenceName", "referenceAllele", "alternateAllele")) .saveAsParquet(args.outputPath) diff --git a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Reassemble.scala b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Reassemble.scala index 7ddf923f..d0f1b8d0 100644 --- a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Reassemble.scala +++ b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Reassemble.scala @@ -20,7 +20,7 @@ package org.bdgenomics.avocado.cli import 
org.apache.spark.SparkContext import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs -import org.bdgenomics.adam.rdd.read.{ AlignmentRecordRDD, MDTagging } +import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, MDTagging } import org.bdgenomics.avocado.realigner.Realigner import org.bdgenomics.utils.cli._ import org.kohsuke.args4j.{ Argument, Option => Args4jOption } diff --git a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/TrioGenotyper.scala b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/TrioGenotyper.scala index 8fdb78aa..bbc3bc1c 100644 --- a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/TrioGenotyper.scala +++ b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/TrioGenotyper.scala @@ -21,8 +21,8 @@ import org.apache.spark.SparkContext import org.bdgenomics.adam.projections.{ AlignmentRecordField, Filter } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD -import org.bdgenomics.adam.rdd.variant.GenotypeRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.variant.GenotypeDataset import org.bdgenomics.avocado.genotyping.{ BiallelicGenotyper => Biallelic, DiscoverVariants => Discover, @@ -198,8 +198,8 @@ class TrioGenotyper( // load reads val projection = Some(Filter(AlignmentRecordField.attributes, - AlignmentRecordField.origQual, - AlignmentRecordField.recordGroupName)) + AlignmentRecordField.originalQuality, + AlignmentRecordField.readGroupId)) val firstParentReads = sc.loadAlignments(args.firstParentPath, optProjection = projection) val secondParentReads = sc.loadAlignments(args.secondParentPath, @@ -250,7 +250,7 @@ class TrioGenotyper( copyNumber, false) - val genotypes = GenotypeRDD(sc.union(firstParentGenotypes.rdd, + val genotypes = GenotypeDataset(sc.union(firstParentGenotypes.rdd, secondParentGenotypes.rdd, childGenotypes.rdd), 
variants.sequences, diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyper.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyper.scala index 65cd171f..b00e8ebe 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyper.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyper.scala @@ -25,10 +25,10 @@ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types.IntegerType import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.adam.rdd.GenomeBins -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD + GenotypeDataset, + VariantDataset } import org.bdgenomics.adam.util.PhredUtils import org.bdgenomics.avocado.Timers._ @@ -85,21 +85,21 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging { * @param maxMapQ The highest mapping quality to allow. * @return Returns genotype calls. */ - def call(reads: AlignmentRecordRDD, - variants: VariantRDD, + def call(reads: AlignmentRecordDataset, + variants: VariantDataset, copyNumber: CopyNumberMap, scoreAllSites: Boolean, optDesiredPartitionCount: Option[Int] = None, optDesiredPartitionSize: Option[Int] = None, optDesiredMaxCoverage: Option[Int] = None, maxQuality: Int = 93, - maxMapQ: Int = 93): GenotypeRDD = CallGenotypes.time { + maxMapQ: Int = 93): GenotypeDataset = CallGenotypes.time { // validate metadata require(variants.sequences.isCompatibleWith(reads.sequences), "Variant sequence dictionary (%s) is not compatible with read dictionary (%s).".format( variants.sequences, reads.sequences)) - val samples = reads.recordGroups.recordGroups.map(_.sample).toSet + val samples = reads.readGroups.readGroups.map(_.sampleId).toSet require(samples.size == 1, "Currently, we only support a single sample. 
Saw: %s.".format( samples.mkString(", "))) @@ -124,11 +124,11 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging { val genotypeRdd = observationsToGenotypes(observationRdd, samples.head) - GenotypeRDD(genotypeRdd, + GenotypeDataset(genotypeRdd, variants.sequences, samples.map(s => { Sample.newBuilder() - .setSampleId(s) + .setId(s) .setName(s) .build() }).toSeq, org.bdgenomics.adam.converters.DefaultHeaderLines.allHeaderLines) @@ -153,7 +153,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging { * @param maxMapQ The highest mapping quality to allow. * @return Returns genotype calls. */ - def discoverAndCall(reads: AlignmentRecordRDD, + def discoverAndCall(reads: AlignmentRecordDataset, copyNumber: CopyNumberMap, scoreAllSites: Boolean, optDesiredPartitionCount: Option[Int] = None, @@ -162,7 +162,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging { optDesiredPartitionSize: Option[Int] = None, optDesiredMaxCoverage: Option[Int] = None, maxQuality: Int = 93, - maxMapQ: Int = 93): GenotypeRDD = { + maxMapQ: Int = 93): GenotypeDataset = { // get rdd storage level and warn if not persisted val readSl = reads.rdd.getStorageLevel @@ -442,7 +442,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging { // flatten schema val flatFields = Seq( - observationsDf("_1.contigName").as("contigName"), + observationsDf("_1.referenceName").as("referenceName"), observationsDf("_1.start").as("start"), observationsDf("_1.referenceAllele").as("referenceAllele"), observationsDf("_1.alternateAllele").as("alternateAllele"), @@ -493,14 +493,14 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging { sum("totalCoverage").as("totalCoverage"), first("isRef").as("isRef"), first("copyNumber").as("copyNumber")) - val aggregatedObservationsDf = joinedObservationsDf.groupBy("contigName", + val aggregatedObservationsDf = 
joinedObservationsDf.groupBy("referenceName", "start", "referenceAllele", "alternateAllele") .agg(aggCols.head, aggCols.tail: _*) // re-nest the output - val firstField = struct(aggregatedObservationsDf("contigName"), + val firstField = struct(aggregatedObservationsDf("referenceName"), aggregatedObservationsDf("start"), aggregatedObservationsDf("referenceAllele"), aggregatedObservationsDf("alternateAllele")) @@ -733,7 +733,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging { .setVariantCallingAnnotations(vcAnnotations) .setStart(v.getStart) .setEnd(v.getEnd) - .setContigName(v.getContigName) + .setReferenceName(v.getReferenceName) .setSampleId(sample) .setStrandBiasComponents(sbComponents .map(i => i: java.lang.Integer)) diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoverVariants.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoverVariants.scala index 9be29fd8..66c02c30 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoverVariants.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoverVariants.scala @@ -19,8 +19,8 @@ package org.bdgenomics.avocado.genotyping import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD -import org.bdgenomics.adam.rdd.variant.VariantRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.variant.VariantDataset import org.bdgenomics.avocado.Timers._ import org.bdgenomics.avocado.models.{ Clipped, @@ -41,19 +41,19 @@ import scala.annotation.tailrec object DiscoverVariants extends Serializable with Logging { /** - * Discovers all variants in an RDD of reads. + * Discovers all variants in a dataset of reads. * - * @param aRdd RDD of reads. + * @param aRdd Dataset of reads. 
* @param optPhredThreshold An optional threshold that discards all variants * not supported by bases of at least a given phred score. - * @return Returns an RDD of variants. + * @return Returns a dataset of variants. */ private[avocado] def apply( - aRdd: AlignmentRecordRDD, + aRdd: AlignmentRecordDataset, optPhredThreshold: Option[Int] = None, - optMinObservations: Option[Int] = None): VariantRDD = DiscoveringVariants.time { + optMinObservations: Option[Int] = None): VariantDataset = DiscoveringVariants.time { - VariantRDD(variantsInRdd(aRdd.rdd, + VariantDataset(variantsInRdd(aRdd.rdd, optPhredThreshold = optPhredThreshold, optMinObservations = optMinObservations), aRdd.sequences, @@ -87,7 +87,7 @@ object DiscoverVariants extends Serializable with Logging { val uniqueVariants = optMinObservations.fold({ variantDs.distinct })(mo => { - variantDs.groupBy(variantDs("contigName"), + variantDs.groupBy(variantDs("referenceName"), variantDs("start"), variantDs("referenceAllele"), variantDs("alternateAllele")) @@ -132,8 +132,8 @@ object DiscoverVariants extends Serializable with Logging { // get the read sequence, contig, etc val sequence = read.getSequence - val qual = read.getQual - val contigName = read.getContigName + val qual = read.getQuality + val referenceName = read.getReferenceName // advance to the first alignment match @tailrec def fastForward( @@ -198,7 +198,7 @@ object DiscoverVariants extends Serializable with Logging { val newVars = (0 until length).flatMap(i => { if (qual(i).toInt - 33 >= phredThreshold) { Some(DiscoveredVariant( - contigName, + referenceName, pos + i, ref(i).toString, sequence(idx + i).toString)) @@ -216,7 +216,7 @@ object DiscoverVariants extends Serializable with Logging { val insQuals = qual.substring(idx - 1, idx + length).map(_.toInt - 33).sum / length val newVar = if (insQuals >= phredThreshold) { DiscoveredVariant( - contigName, + referenceName, pos - 1, lastRef, sequence.substring(idx - 1, idx + length)) :: variants @@ -230,7 
+230,7 @@ object DiscoverVariants extends Serializable with Logging { val delLength = ref.size val newVar = if (qual(idx - 1).toInt - 33 >= phredThreshold) { DiscoveredVariant( - contigName, + referenceName, pos - 1, lastRef + ref, sequence.substring(idx - 1, idx)) :: variants diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoveredVariant.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoveredVariant.scala index 1a7277c9..8be94d32 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoveredVariant.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoveredVariant.scala @@ -30,25 +30,25 @@ private[genotyping] object DiscoveredVariant { * @return Returns a case class-based representation of the variant. */ def apply(variant: Variant): DiscoveredVariant = { - new DiscoveredVariant(variant.getContigName, + new DiscoveredVariant(variant.getReferenceName, variant.getStart.toInt, variant.getReferenceAllele, Some(variant.getAlternateAllele)) } /** - * @param contigName The contig this variant is on. + * @param referenceName The reference this variant is on. * @param start The position this variant starts at. * @param referenceAllele The reference allele this variant varies from. * @param alternateAllele The substituted allele. * @return Returns a discovered variant with a defined alternate allele. */ def apply( - contigName: String, + referenceName: String, start: Int, referenceAllele: String, alternateAllele: String): DiscoveredVariant = { - new DiscoveredVariant(contigName, start, referenceAllele, Some(alternateAllele)) + new DiscoveredVariant(referenceName, start, referenceAllele, Some(alternateAllele)) } /** @@ -64,13 +64,13 @@ private[genotyping] object DiscoveredVariant { /** * A variant site and alleles. * - * @param contigName The contig this variant is on. + * @param referenceName The reference this variant is on. 
* @param start The position this variant starts at. * @param referenceAllele The reference allele this variant varies from. * @param alternateAllele The substituted allele. */ case class DiscoveredVariant( - contigName: String, + referenceName: String, start: Int, referenceAllele: String, alternateAllele: Option[String]) { @@ -87,7 +87,7 @@ case class DiscoveredVariant( */ def toVariant: Variant = { val builder = Variant.newBuilder - .setContigName(contigName) + .setReferenceName(referenceName) .setStart(start.toLong) .setEnd(end.toLong) .setReferenceAllele(referenceAllele) @@ -100,10 +100,10 @@ case class DiscoveredVariant( } def overlaps(v: DiscoveredVariant): Boolean = { - contigName == v.contigName && start < v.end && end > v.start + referenceName == v.referenceName && start < v.end && end > v.start } def overlaps(rr: ReferenceRegion): Boolean = { - contigName == rr.referenceName && start < rr.end && end > rr.start + referenceName == rr.referenceName && start < rr.end && end > rr.start } } diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/JointAnnotatorCaller.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/JointAnnotatorCaller.scala index dc9ceaf3..6c66d0be 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/JointAnnotatorCaller.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/JointAnnotatorCaller.scala @@ -21,8 +21,8 @@ import breeze.stats.distributions.Binomial import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.VariantContext import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantContextRDD + GenotypeDataset, + VariantContextDataset } import org.bdgenomics.adam.util.PhredUtils import org.bdgenomics.avocado.util.LogUtils @@ -49,7 +49,7 @@ object JointAnnotatorCaller extends Serializable { * @param genotypes The genotypes to jointly process. * @return Returns a squared off and annotated set of variant contexts. 
*/ - def apply(genotypes: GenotypeRDD): VariantContextRDD = { + def apply(genotypes: GenotypeDataset): VariantContextDataset = { apply(genotypes.toVariantContexts) } @@ -59,7 +59,7 @@ object JointAnnotatorCaller extends Serializable { * @param variantContexts The squared off sites to process. * @return Returns a squared off and annotated set of variant contexts. */ - def apply(variantContexts: VariantContextRDD): VariantContextRDD = { + def apply(variantContexts: VariantContextDataset): VariantContextDataset = { variantContexts.transform(_.flatMap(annotateSite)) } diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/Observer.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/Observer.scala index 79a7d806..6336ea01 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/Observer.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/Observer.scala @@ -52,11 +52,11 @@ private[genotyping] object Observer extends Serializable { // for convenience, get the sample name, mapping quality, sequence, // qualities, and the contig name - val sampleId = read.getRecordGroupSample - val contigName = read.getContigName - val mapQ = read.getMapq + val sampleId = read.getReadGroupSampleId + val referenceName = read.getReferenceName + val mapQ = read.getMappingQuality val readSequence = read.getSequence - val readQualities = read.getQual + val readQualities = read.getQuality val forwardStrand = !read.getReadNegativeStrand // map over the alignment operators and generate allelic observations @@ -73,7 +73,7 @@ private[genotyping] object Observer extends Serializable { (0 until length).map(idx => { // the key is the (site, allele, sampleId) - val key = (ReferenceRegion(contigName, pos, pos + 1), + val key = (ReferenceRegion(referenceName, pos, pos + 1), readSequence(readIdx).toString, sampleId) @@ -104,7 +104,7 @@ private[genotyping] object Observer extends Serializable { // the key is the (site, allele, 
sampleId) // insertions associate to the site to their left, hence the -1 - val key = (ReferenceRegion(contigName, pos - 1, pos), + val key = (ReferenceRegion(referenceName, pos - 1, pos), bases, sampleId) @@ -124,7 +124,7 @@ private[genotyping] object Observer extends Serializable { // the key is the (site, allele, sampleId) // deletions have an empty string for the allele - val key = (ReferenceRegion(contigName, oldPos, pos), + val key = (ReferenceRegion(referenceName, oldPos, pos), "", sampleId) diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/SquareOffReferenceModel.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/SquareOffReferenceModel.scala index a8f26d6b..1e334052 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/SquareOffReferenceModel.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/SquareOffReferenceModel.scala @@ -23,9 +23,9 @@ import org.apache.spark.sql.functions._ import org.bdgenomics.adam.models.{ ReferenceRegion, VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD, - VariantContextRDD + GenotypeDataset, + VariantDataset, + VariantContextDataset } import org.bdgenomics.adam.sql.{ Genotype => GenotypeProduct, @@ -65,7 +65,7 @@ object SquareOffReferenceModel { * allele was called across all samples, with genotype likelihood models for * all samples that had data at the site. */ - def apply(genotypes: GenotypeRDD): VariantContextRDD = { + def apply(genotypes: GenotypeDataset): VariantContextDataset = { val variants = extractVariants(genotypes) @@ -80,14 +80,14 @@ object SquareOffReferenceModel { * allele was called across all samples, with genotype likelihood models for * all samples that had data at the site. 
*/ - def apply(genotypes: GenotypeRDD, - variants: VariantRDD): VariantContextRDD = { + def apply(genotypes: GenotypeDataset, + variants: VariantDataset): VariantContextDataset = { // join variants back against genotypes val sites = variants.shuffleRegionJoinAndGroupByLeft(genotypes) variants.rdd.unpersist() - val calls = sites.transmute[VariantContext, VariantContextProduct, VariantContextRDD]( + val calls = sites.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( (rdd: RDD[(Variant, Iterable[Genotype])]) => rdd.map(s => squareOffSite(s._1, s._2))) calls.replaceSamples(genotypes.samples) @@ -132,7 +132,7 @@ object SquareOffReferenceModel { * @param genotypes Genotypes containing both called sites and reference models. * @return Returns sites where a variant was seen in at least one sample. */ - def extractVariants(genotypes: GenotypeRDD): VariantRDD = { + def extractVariants(genotypes: GenotypeDataset): VariantDataset = { val altString = GenotypeAllele.ALT.toString() @@ -144,7 +144,7 @@ object SquareOffReferenceModel { val trimUdf = udf((a: String, b: String) => trimRight(a, b)) val trimmerUdf = udf((a: String, b: Int) => a.dropRight(b)) - genotypes.transmuteDataset[Variant, VariantProduct, VariantRDD]((ds: Dataset[GenotypeProduct]) => { + genotypes.transmuteDataset[Variant, VariantProduct, VariantDataset]((ds: Dataset[GenotypeProduct]) => { import ds.sparkSession.implicits._ @@ -164,7 +164,7 @@ object SquareOffReferenceModel { trimmedVariants.dropDuplicates("start", "end", - "contigName", + "referenceName", "referenceAllele", "alternateAllele") }) @@ -210,7 +210,7 @@ object SquareOffReferenceModel { genotypes.find(gt => { gt.getStart == variant.getStart && gt.getEnd == variant.getEnd && - gt.getContigName == variant.getContigName && + gt.getReferenceName == variant.getReferenceName && gt.getVariant.getReferenceAllele == variant.getReferenceAllele && gt.getVariant.getAlternateAllele == variant.getAlternateAllele }) diff --git 
a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/TrioCaller.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/TrioCaller.scala index 7a5d7c34..859096ee 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/TrioCaller.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/TrioCaller.scala @@ -18,8 +18,8 @@ package org.bdgenomics.avocado.genotyping import org.bdgenomics.adam.models.VariantContext -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD -import org.bdgenomics.adam.rdd.variant.{ GenotypeRDD, VariantContextRDD } +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantContextDataset } import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele } import scala.collection.JavaConversions._ @@ -42,10 +42,10 @@ object TrioCaller extends Serializable { * @param rdd The reads to extract the sample ID from. * @return The sample ID. */ - def extractSampleId(rdd: AlignmentRecordRDD): String = { - require(!rdd.recordGroups.isEmpty, "Record groups are empty.") - val samples = rdd.recordGroups.recordGroups - .map(rg => rg.sample) + def extractSampleId(rdd: AlignmentRecordDataset): String = { + require(!rdd.readGroups.isEmpty, "Read groups are empty.") + val samples = rdd.readGroups.readGroups + .map(rg => rg.sampleId) .distinct require(samples.size == 1, "Had multiple sample names (%s) attached to reads.".format( @@ -57,18 +57,18 @@ object TrioCaller extends Serializable { /** * Trio calls genotypes in a pedigree with two parents and one child. * - * @param rdd RDD of base genotypes. + * @param genotypes Dataset of base genotypes. * @param firstParentId The sample ID for the first parent. * @param secondParentId The sample ID for the second parent. * @param childId The sample ID for the child. * @return Returns the final genotypes. 
*/ - def apply(rdd: GenotypeRDD, + def apply(genotypes: GenotypeDataset, firstParentId: String, secondParentId: String, - childId: String): GenotypeRDD = { + childId: String): GenotypeDataset = { - apply(rdd.toVariantContexts, + apply(genotypes.toVariantContexts, firstParentId, secondParentId, childId).toGenotypes @@ -77,17 +77,17 @@ object TrioCaller extends Serializable { /** * Trio calls genotypes in a pedigree with two parents and one child. * - * @param rdd RDD of base genotypes. + * @param genotypes Dataset of base genotypes. * @param firstParentId The sample ID for the first parent. * @param secondParentId The sample ID for the second parent. * @param childId The sample ID for the child. * @return Returns the final genotypes. */ - private[genotyping] def apply(rdd: VariantContextRDD, + private[genotyping] def apply(genotypes: VariantContextDataset, firstParentId: String, secondParentId: String, - childId: String): VariantContextRDD = { - rdd.transform(rdd => { + childId: String): VariantContextDataset = { + genotypes.transform(rdd => { rdd.filter(!filterRef(_)) .map(processVariant(_, firstParentId, secondParentId, childId)) .filter(!filterRef(_)) @@ -124,7 +124,7 @@ object TrioCaller extends Serializable { def makeNoCall(sampleId: String): Genotype = { Genotype.newBuilder - .setContigName(vc.variant.variant.getContigName) + .setReferenceName(vc.variant.variant.getReferenceName) .setStart(vc.variant.variant.getStart) .setEnd(vc.variant.variant.getEnd) .setVariant(vc.variant.variant) diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/models/CopyNumberMap.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/models/CopyNumberMap.scala index fd74b7ef..80c8052b 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/models/CopyNumberMap.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/models/CopyNumberMap.scala @@ -19,7 +19,7 @@ package org.bdgenomics.avocado.models import org.apache.spark.SparkContext._ import 
org.bdgenomics.adam.models.ReferenceRegion -import org.bdgenomics.adam.rdd.feature.FeatureRDD +import org.bdgenomics.adam.rdd.feature.FeatureDataset import scala.math.{ max, min } private[avocado] object CopyNumberMap extends Serializable { @@ -40,10 +40,11 @@ private[avocado] object CopyNumberMap extends Serializable { * Creates a copy number variant map from CNVs stored as features. * * @param basePloidy The ploidy of this sample. + * @param features Dataset of features. * @return Returns a map containing copy number variants. */ def apply(basePloidy: Int, - features: FeatureRDD): CopyNumberMap = { + features: FeatureDataset): CopyNumberMap = { val cnvMap = features.rdd .flatMap(f => f.getFeatureType match { @@ -68,19 +69,19 @@ private[avocado] object CopyNumberMap extends Serializable { * An object that stores copy number variation. * * @param basePloidy The ploidy of this sample. - * @param variantsByContig A map mapping contig names to the regions containing - * copy number variants. These regions are sorted per contig, and are in + * @param variantsByReference A map mapping reference names to the regions containing + * copy number variants. These regions are sorted per reference, and are in * tuples with the observed copy number over that region. */ private[avocado] case class CopyNumberMap private ( val basePloidy: Int, - private[models] val variantsByContig: Map[String, Seq[(ReferenceRegion, Int)]]) { + private[models] val variantsByReference: Map[String, Seq[(ReferenceRegion, Int)]]) { /** * @return The lowest copy number seen over all regions. */ def minPloidy: Int = { - variantsByContig.values + variantsByReference.values .flatMap(s => s.map(_._2)) .fold(basePloidy)(_ min _) } @@ -89,7 +90,7 @@ private[avocado] case class CopyNumberMap private ( * @return The highest copy number seen over all regions. 
*/ def maxPloidy: Int = { - variantsByContig.values + variantsByReference.values .flatMap(s => s.map(_._2)) .fold(basePloidy)(_ max _) } @@ -103,7 +104,7 @@ private[avocado] case class CopyNumberMap private ( def overlappingVariants( rr: ReferenceRegion): Iterable[(ReferenceRegion, Int)] = { - variantsByContig.get(rr.referenceName) + variantsByReference.get(rr.referenceName) .fold(Iterable.empty[(ReferenceRegion, Int)])(i => { i.dropWhile(!_._1.overlaps(rr)) .takeWhile(_._1.overlaps(rr)) diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/realigner/Realigner.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/realigner/Realigner.scala index f007b815..529363b3 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/realigner/Realigner.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/realigner/Realigner.scala @@ -18,7 +18,7 @@ package org.bdgenomics.avocado.realigner import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.avocado.Timers._ import org.bdgenomics.avocado.models.{ Clipped, @@ -43,8 +43,8 @@ object Realigner extends Logging { * @param kmerLength The length k of the k-mers. * @return Returns the realigned reads. 
*/ - def realign(reads: AlignmentRecordRDD, - kmerLength: Int): AlignmentRecordRDD = { + def realign(reads: AlignmentRecordDataset, + kmerLength: Int): AlignmentRecordDataset = { reads.transform(realignRdd(_, kmerLength)) } diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala index e1efe565..7d218660 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala @@ -18,7 +18,7 @@ package org.bdgenomics.avocado.util import htsjdk.variant.vcf.{ VCFFilterHeaderLine, VCFHeaderLine } -import org.bdgenomics.adam.rdd.variant.GenotypeRDD +import org.bdgenomics.adam.rdd.variant.GenotypeDataset import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele, @@ -166,17 +166,17 @@ private[avocado] trait HardFilterGenotypesArgs extends Serializable { private[avocado] object HardFilterGenotypes extends Serializable { /** - * Applies hard filters to a GenotypeRDD. + * Applies hard filters to a GenotypeDataset. * - * @param grdd GenotypeRDD to filter. + * @param genotypes GenotypeDataset to filter. * @param args The hard filter configuration to apply. * @param filterRefGenotypes If true, discards homozygous ref calls. - * @return A new GenotypeRDD of hard filtered genotypes. + * @return A new GenotypeDataset of hard filtered genotypes. 
*/ - def apply(grdd: GenotypeRDD, + def apply(genotypes: GenotypeDataset, args: HardFilterGenotypesArgs, filterRefGenotypes: Boolean = true, - emitAllGenotypes: Boolean = false): GenotypeRDD = { + emitAllGenotypes: Boolean = false): GenotypeDataset = { // make snp and indel filters val snpFilters = buildSnpHardFilters(args) @@ -242,7 +242,7 @@ private[avocado] object HardFilterGenotypes extends Serializable { // flat map the filters over the genotype rdd val minQuality = args.minQuality - grdd.transform(rdd => { + genotypes.transform(rdd => { rdd.flatMap(filterGenotype(_, minQuality, snpFilters, diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardLimiter.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardLimiter.scala index d24149ba..f8e967e5 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardLimiter.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardLimiter.scala @@ -109,7 +109,7 @@ private[avocado] object HardLimiter extends Serializable { val (lastRead, _) = kv assert(lastRead.getStart <= readStart, "New read (%s) is before last read (%s).".format(read, lastRead)) - assert(lastRead.getContigName == read._1.getContigName) + assert(lastRead.getReferenceName == read._1.getReferenceName) }) // any read that ends before this new read starts can be flushed diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/PrefilterReads.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/PrefilterReads.scala index bd739081..2deef675 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/PrefilterReads.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/PrefilterReads.scala @@ -18,7 +18,7 @@ package org.bdgenomics.avocado.util import org.bdgenomics.adam.models.SequenceDictionary -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.formats.avro.AlignmentRecord 
trait PrefilterReadsArgs extends Serializable { @@ -50,36 +50,36 @@ trait PrefilterReadsArgs extends Serializable { } /** - * Reifies an input AlignmentRecordRDD down to the contigs and reads we + * Reifies an input AlignmentRecordDataset down to the references and reads we * want to genotype. */ object PrefilterReads extends Serializable { /** - * Filters out reads and contigs that should not be processed. + * Filters out reads and references that should not be processed. * - * @param rdd RDD of reads and associated metadata. + * @param reads Dataset of reads and associated metadata. * @param args Arguments specifying the filters to apply. - * @return Returns a new AlignmentRecordRDD where reads that we don't want - * to use in genotyping have been discarded, and where contigs that we + * @return Returns a new AlignmentRecordDataset where reads that we don't want + * to use in genotyping have been discarded, and where references that we * don't want to genotype have been removed. */ - def apply(rdd: AlignmentRecordRDD, - args: PrefilterReadsArgs): AlignmentRecordRDD = { + def apply(reads: AlignmentRecordDataset, + args: PrefilterReadsArgs): AlignmentRecordDataset = { // get filter functions - val contigFn = contigFilterFn(args) - val readFn = readFilterFn(args, contigFn) + val referenceFn = referenceFilterFn(args) + val readFn = readFilterFn(args, referenceFn) - // filter contigs and construct a new sequence dictionary - val sequences = new SequenceDictionary(rdd.sequences + // filter references and construct a new sequence dictionary + val sequences = new SequenceDictionary(reads.sequences .records - .filter(r => contigFn(r.name))) + .filter(r => referenceFn(r.name))) - // filter reads and construct a new rdd - rdd.transform(r => { + // filter reads and construct a new dataset + reads.transform(r => { r.filter(readFn) - .map(maybeNullifyMate(_, contigFn)) + .map(maybeNullifyMate(_, referenceFn)) }).replaceSequences(sequences) } @@ -87,13 +87,13 @@ object 
PrefilterReads extends Serializable { * Nullifies the mate mapping info for reads whose mate is filtered. * * Needed to generate SAM/BAM/CRAM files containing filtered reads. - * If this isn't run, the conversion will error as the mate contig + * If this isn't run, the conversion will error as the mate reference * names are not found in the sequence dictionary. * * @param read Read to check for filtered mate. - * @param filterFn The function to use to filter contig names. + * @param filterFn The function to use to filter reference names. * @return Returns a read whose mate mapping info has been nullified if the - * mate mapping fields indicate that the mate is mapped to a contig that has + * mate mapping fields indicate that the mate is mapped to a reference that has * been filtered out. */ private[util] def maybeNullifyMate( @@ -102,12 +102,12 @@ object PrefilterReads extends Serializable { if (read.getReadPaired && read.getMateMapped) { - if (filterFn(read.getMateContigName)) { + if (filterFn(read.getMateReferenceName)) { read } else { AlignmentRecord.newBuilder(read) .setMateMapped(false) - .setMateContigName(null) + .setMateReferenceName(null) .build } } else { @@ -116,11 +116,11 @@ object PrefilterReads extends Serializable { } /** - * @param args The arguments specifying which contigs to keep. - * @return Returns a function that returns true if a contig with a given name + * @param args The arguments specifying which references to keep. + * @return Returns a function that returns true if a reference with a given name * should be kept. 
*/ - protected[util] def contigFilterFn(args: PrefilterReadsArgs): (String => Boolean) = { + protected[util] def referenceFilterFn(args: PrefilterReadsArgs): (String => Boolean) = { val fns = Iterable(filterNonGrcAutosome(_), filterNonGrcSex(_), filterNonGrcMitochondrial(_), filterGrcAutosome(_), filterGrcSex(_), filterGrcMitochondrial(_)) val filteredFns = Iterable(true, !args.autosomalOnly, args.keepMitochondrialChromosome, @@ -140,18 +140,18 @@ object PrefilterReads extends Serializable { /** * @param args The arguments specifying which reads to keep. - * @param contigFilterFn A function that determines which contigs should be - * kept, given the contig name. + * @param referenceFilterFn A function that determines which references should be + * kept, given the reference name. * @return Returns a function that returns true if a read should be kept. */ protected[util] def readFilterFn( args: PrefilterReadsArgs, - contigFilterFn: (String => Boolean)): (AlignmentRecord => Boolean) = { + referenceFilterFn: (String => Boolean)): (AlignmentRecord => Boolean) = { def baseFilterFn(r: AlignmentRecord): Boolean = { (filterMapped(r, args.keepNonPrimary) && filterMappingQuality(r, args.minMappingQuality) && - contigFilterFn(r.getContigName)) + referenceFilterFn(r.getReferenceName)) } if (args.keepDuplicates) { @@ -190,76 +190,76 @@ object PrefilterReads extends Serializable { */ protected[util] def filterMappingQuality(read: AlignmentRecord, minMappingQuality: Int): Boolean = { - // if mapq is not set, ignore - if (read.getMapq == null) { + // if mappingQuality is not set, ignore + if (read.getMappingQuality == null) { true } else { - read.getMapq > minMappingQuality + read.getMappingQuality > minMappingQuality } } /** - * @param contigName Contig name to test for filtration. - * @return Returns true if the contig matches the naming scheme for GRCh + * @param referenceName Reference name to test for filtration. 
+ * @return Returns true if the reference matches the naming scheme for GRCh * autosomal chromosomes. */ - protected[util] def filterGrcAutosome(contigName: String): Boolean = { - contigName != null && - contigName.size >= 4 && - contigName.startsWith("chr") && contigName.drop(3).forall(_.isDigit) + protected[util] def filterGrcAutosome(referenceName: String): Boolean = { + referenceName != null && + referenceName.size >= 4 && + referenceName.startsWith("chr") && referenceName.drop(3).forall(_.isDigit) } /** - * @param contigName Contig name to test for filtration. - * @return Returns true if the contig matches the naming scheme for GRCh + * @param referenceName Reference name to test for filtration. + * @return Returns true if the reference matches the naming scheme for GRCh * sex chromosomes. */ - protected[util] def filterGrcSex(contigName: String): Boolean = { - if (contigName != null && - contigName.length == 4 && - contigName.startsWith("chr")) { - contigName(3) == 'X' || contigName(3) == 'Y' || - contigName(3) == 'Z' || contigName(3) == 'W' + protected[util] def filterGrcSex(referenceName: String): Boolean = { + if (referenceName != null && + referenceName.length == 4 && + referenceName.startsWith("chr")) { + referenceName(3) == 'X' || referenceName(3) == 'Y' || + referenceName(3) == 'Z' || referenceName(3) == 'W' } else { false } } /** - * @param contigName Contig name to test for filtration. - * @return Returns true if the contig matches the GRCh mitochondrial + * @param referenceName Reference name to test for filtration. + * @return Returns true if the reference matches the GRCh mitochondrial * chromosome name. */ - protected[util] def filterGrcMitochondrial(contigName: String): Boolean = { - contigName != null && contigName == "chrM" + protected[util] def filterGrcMitochondrial(referenceName: String): Boolean = { + referenceName != null && referenceName == "chrM" } /** - * @param contigName Contig name to test for filtration. 
- * @return Returns true if the contig matches the naming scheme for HG/UCSC + * @param referenceName Reference name to test for filtration. + * @return Returns true if the reference matches the naming scheme for HG/UCSC * autosomal chromosomes. */ - protected[util] def filterNonGrcAutosome(contigName: String): Boolean = { - contigName != null && contigName.forall(_.isDigit) + protected[util] def filterNonGrcAutosome(referenceName: String): Boolean = { + referenceName != null && referenceName.forall(_.isDigit) } /** - * @param contigName Contig name to test for filtration. - * @return Returns true if the contig matches the naming scheme for HG/UCSC + * @param referenceName Reference name to test for filtration. + * @return Returns true if the reference matches the naming scheme for HG/UCSC * sex chromosomes. */ - protected[util] def filterNonGrcSex(contigName: String): Boolean = { - contigName != null && - (contigName == "X" || contigName == "Y" || - contigName == "Z" || contigName == "W") + protected[util] def filterNonGrcSex(referenceName: String): Boolean = { + referenceName != null && + (referenceName == "X" || referenceName == "Y" || + referenceName == "Z" || referenceName == "W") } /** - * @param contigName Contig name to test for filtration. - * @return Returns true if the contig matches the HG/UCSC mitochondrial + * @param referenceName Reference name to test for filtration. + * @return Returns true if the reference matches the HG/UCSC mitochondrial * chromosome name. 
*/ - protected[util] def filterNonGrcMitochondrial(contigName: String): Boolean = { - contigName != null && contigName == "MT" + protected[util] def filterNonGrcMitochondrial(referenceName: String): Boolean = { + referenceName != null && referenceName == "MT" } } diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/RewriteHets.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/RewriteHets.scala index 14e899a8..ad991ef6 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/RewriteHets.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/RewriteHets.scala @@ -17,7 +17,7 @@ */ package org.bdgenomics.avocado.util -import org.bdgenomics.adam.rdd.variant.GenotypeRDD +import org.bdgenomics.adam.rdd.variant.GenotypeDataset import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele } import scala.collection.JavaConversions._ @@ -54,15 +54,15 @@ private[avocado] trait RewriteHetsArgs extends Serializable { object RewriteHets extends Serializable { /** - * Identifies high allelic fraction het calls in an RDD of genotypes and + * Identifies high allelic fraction het calls in a dataset of genotypes and * rewrites them as homozygous alt calls. * - * @param rdd The RDD of genotypes to filter. + * @param genotypes The dataset of genotypes to filter. * @param args The arguments to configure the rewriter. - * @return Returns a new RDD of genotypes. + * @return Returns a new dataset of genotypes. 
*/ - def apply(rdd: GenotypeRDD, - args: RewriteHetsArgs): GenotypeRDD = { + def apply(genotypes: GenotypeDataset, + args: RewriteHetsArgs): GenotypeDataset = { val maxSnpAllelicFraction = args.maxHetSnpAltAllelicFraction val maxIndelAllelicFraction = args.maxHetIndelAltAllelicFraction @@ -70,13 +70,13 @@ object RewriteHets extends Serializable { val rewriteHetIndels = !args.disableHetIndelRewriting if (rewriteHetSnps || rewriteHetIndels) { - rdd.transform(gtRdd => gtRdd.map(processGenotype(_, + genotypes.transform(gtRdd => gtRdd.map(processGenotype(_, maxSnpAllelicFraction, maxIndelAllelicFraction, rewriteHetSnps, rewriteHetIndels))) } else { - rdd + genotypes } } diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyperSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyperSuite.scala index 89207621..4676d6ce 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyperSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyperSuite.scala @@ -18,15 +18,15 @@ package org.bdgenomics.avocado.genotyping import org.bdgenomics.adam.models.{ - RecordGroup, - RecordGroupDictionary, + ReadGroup, + ReadGroupDictionary, SequenceDictionary, SequenceRecord } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.feature.FeatureRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD -import org.bdgenomics.adam.rdd.variant.VariantRDD +import org.bdgenomics.adam.rdd.feature.FeatureDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.variant.VariantDataset import org.bdgenomics.avocado.AvocadoFunSuite import org.bdgenomics.avocado.models.{ CopyNumberMap, Observation } import org.bdgenomics.avocado.util.{ @@ -51,26 +51,26 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { .toSeq val perfectRead = AlignmentRecord.newBuilder - .setContigName("1") + 
.setReferenceName("1") .setStart(10L) .setEnd(25L) .setCigar("15M") .setMismatchingPositions("15") .setSequence("ATGGTCCACGAATAA") - .setQual("DEFGHIIIIIHGFED") - .setMapq(50) + .setQuality("DEFGHIIIIIHGFED") + .setMappingQuality(50) .setReadMapped(true) .build val snpRead = AlignmentRecord.newBuilder(perfectRead) .setMismatchingPositions("6C8") .setSequence("ATGGTCAACGAATAA") - .setMapq(40) + .setMappingQuality(40) .setReadNegativeStrand(true) .build val snp = Variant.newBuilder - .setContigName("1") + .setReferenceName("1") .setStart(16L) .setEnd(17L) .setReferenceAllele("C") @@ -78,7 +78,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { .build val cnSnp = Variant.newBuilder - .setContigName("1") + .setReferenceName("1") .setStart(17L) .setEnd(18L) .setReferenceAllele("A") @@ -86,14 +86,14 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { .build val cnvDup = Feature.newBuilder - .setContigName("1") + .setReferenceName("1") .setStart(17L) .setEnd(18L) .setFeatureType("DUP") .build val cnvDel = Feature.newBuilder - .setContigName("1") + .setReferenceName("1") .setStart(17L) .setEnd(18L) .setFeatureType("DEL") @@ -144,16 +144,16 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { sparkTest("score snps in a read overlapping a copy number dup boundary") { val genotypes = BiallelicGenotyper.call( - AlignmentRecordRDD(sc.parallelize(Seq(snpRead)), + AlignmentRecordDataset(sc.parallelize(Seq(snpRead)), SequenceDictionary.empty, - RecordGroupDictionary(Seq(RecordGroup("rg1", "rg1"))), + ReadGroupDictionary(Seq(ReadGroup("rg1", "rg1"))), Seq.empty), - VariantRDD(sc.parallelize(Seq(snp, cnSnp)), + VariantDataset(sc.parallelize(Seq(snp, cnSnp)), SequenceDictionary.empty, Seq.empty), CopyNumberMap(2, - FeatureRDD(sc.parallelize(Seq(cnvDup)), - SequenceDictionary.empty)), + FeatureDataset(sc.parallelize(Seq(cnvDup)), + SequenceDictionary.empty, Seq.empty)), false, maxQuality = 40, maxMapQ = 40) @@ -180,16 +180,16 @@ class BiallelicGenotyperSuite 
extends AvocadoFunSuite { sparkTest("score snps in a read overlapping a copy number del boundary") { val genotypes = BiallelicGenotyper.call( - AlignmentRecordRDD(sc.parallelize(Seq(snpRead)), + AlignmentRecordDataset(sc.parallelize(Seq(snpRead)), SequenceDictionary.empty, - RecordGroupDictionary(Seq(RecordGroup("rg1", "rg1"))), + ReadGroupDictionary(Seq(ReadGroup("rg1", "rg1"))), Seq.empty), - VariantRDD(sc.parallelize(Seq(snp, cnSnp)), + VariantDataset(sc.parallelize(Seq(snp, cnSnp)), SequenceDictionary.empty, Seq.empty), CopyNumberMap(2, - FeatureRDD(sc.parallelize(Seq(cnvDel)), - SequenceDictionary.empty)), + FeatureDataset(sc.parallelize(Seq(cnvDel)), + SequenceDictionary.empty, Seq.empty)), false, maxQuality = 40, maxMapQ = 40) @@ -307,7 +307,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { assert(genotype.getVariant === snp) assert(genotype.getStart === snp.getStart) assert(genotype.getEnd === snp.getEnd) - assert(genotype.getContigName === snp.getContigName) + assert(genotype.getReferenceName === snp.getReferenceName) assert(genotype.getSampleId === "sample") assert(genotype.getGenotypeQuality === 73) assert(genotype.getAlleles.size === 2) @@ -327,7 +327,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.104160.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val variants = DiscoverVariants(reads) @@ -384,7 +384,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878_snp_A2G_chr20_225058.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val genotypes = BiallelicGenotyper.discoverAndCall(reads, @@ -413,7 +413,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878_snp_A2G_chr20_225058.sam") val reads = 
sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val genotypes = BiallelicGenotyper.discoverAndCall(reads, @@ -446,7 +446,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.832736.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val genotypes = BiallelicGenotyper.discoverAndCall(reads, @@ -480,7 +480,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.839395.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val genotypes = BiallelicGenotyper.discoverAndCall(reads, @@ -548,7 +548,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.567239.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val variants = DiscoverVariants(reads) @@ -575,7 +575,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.875159.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val variants = DiscoverVariants(reads) @@ -601,7 +601,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.1_1777263.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val gts = BiallelicGenotyper.discoverAndCall(reads, @@ -637,7 +637,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.877715.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + 
rdd.filter(_.getMappingQuality > 0) }) val gts = BiallelicGenotyper.discoverAndCall(reads, CopyNumberMap.empty(2), false) @@ -655,7 +655,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.886049.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val gts = BiallelicGenotyper.discoverAndCall(reads, CopyNumberMap.empty(2), false) @@ -673,7 +673,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.889159.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val gts = BiallelicGenotyper.discoverAndCall(reads, @@ -705,7 +705,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.866511.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }).realignIndels() val gts = BiallelicGenotyper.discoverAndCall(reads, @@ -720,7 +720,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.905130.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val gts = BiallelicGenotyper.discoverAndCall(reads, @@ -735,7 +735,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { assert(gts.size === 1) val gt = gts.head - assert(gt.getVariant.getContigName === "1") + assert(gt.getVariant.getReferenceName === "1") assert(gt.getVariant.getStart === 905129L) assert(gt.getVariant.getEnd === 905132L) assert(gt.getVariant.getReferenceAllele === "ATG") @@ -747,7 +747,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.905130.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - 
rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val genotypes = BiallelicGenotyper.discoverAndCall(reads, @@ -770,7 +770,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { assert(refCountByGt.count(_ == 0) === 2) val gt = gts.filter(_.getVariant.getStart == 905129L).head - assert(gt.getVariant.getContigName === "1") + assert(gt.getVariant.getReferenceName === "1") assert(gt.getVariant.getEnd === 905132L) assert(gt.getVariant.getReferenceAllele === "ATG") assert(gt.getVariant.getAlternateAllele === "A") @@ -781,7 +781,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.907170.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 0) + rdd.filter(_.getMappingQuality > 0) }) val gts = BiallelicGenotyper.discoverAndCall(reads, @@ -793,7 +793,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { assert(gts.size === 1) val gt = gts.head - assert(gt.getVariant.getContigName === "1") + assert(gt.getVariant.getReferenceName === "1") assert(gt.getVariant.getStart === 907169L) assert(gt.getVariant.getEnd === 907171L) assert(gt.getVariant.getReferenceAllele === "AG") @@ -805,7 +805,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val readPath = resourceUrl("NA12878.chr1.240898.sam") val reads = sc.loadAlignments(readPath.toString) .transform(rdd => { - rdd.filter(_.getMapq > 10) + rdd.filter(_.getMappingQuality > 10) }) val gts = BiallelicGenotyper.discoverAndCall(reads, @@ -817,7 +817,7 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { assert(gts.size === 1) val gt = gts.head - assert(gt.getVariant.getContigName === "1") + assert(gt.getVariant.getReferenceName === "1") assert(gt.getVariant.getStart === 240897L) assert(gt.getVariant.getEnd === 240898L) assert(gt.getVariant.getReferenceAllele === "T") @@ -829,14 +829,14 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { def makeRead(allele: Char): AlignmentRecord = { 
assert(allele != 'T') AlignmentRecord.newBuilder - .setContigName("ctg") + .setReferenceName("ctg") .setStart(10L) .setEnd(15L) .setSequence("AC%sTG".format(allele)) .setCigar("5M") .setMismatchingPositions("2T2") - .setQual(Seq(50, 50, 50, 50, 50).map(q => (q + 33).toInt).mkString) - .setMapq(50) + .setQuality(Seq(50, 50, 50, 50, 50).map(q => (q + 33).toInt).mkString) + .setMappingQuality(50) .setReadMapped(true) .setPrimaryAlignment(true) .build @@ -844,10 +844,10 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { val reads = Seq(makeRead('A'), makeRead('A'), makeRead('A'), makeRead('A'), makeRead('C'), makeRead('C'), makeRead('C'), makeRead('C')) - val readRdd = AlignmentRecordRDD( + val readRdd = AlignmentRecordDataset( sc.parallelize(reads), SequenceDictionary(SequenceRecord("ctg", 16L)), - RecordGroupDictionary(Seq(RecordGroup("rg1", "rg1"))), + ReadGroupDictionary(Seq(ReadGroup("rg1", "rg1"))), Seq.empty) val gts = BiallelicGenotyper.discoverAndCall(readRdd, @@ -876,12 +876,12 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite { assert(gts.size === 2) val taaaGt = gts.filter(_.getVariant.getAlternateAllele === "TAAA").head - assert(taaaGt.getVariant.getContigName === "1") + assert(taaaGt.getVariant.getReferenceName === "1") assert(taaaGt.getVariant.getReferenceAllele === "T") assert(taaaGt.getVariant.getAlternateAllele === "TAAA") assert(taaaGt.getAlleles.count(_ == GenotypeAllele.ALT) === 2) val caaaGt = gts.filter(_.getVariant.getAlternateAllele === "CAAA").head - assert(caaaGt.getVariant.getContigName === "1") + assert(caaaGt.getVariant.getReferenceName === "1") assert(caaaGt.getVariant.getReferenceAllele === "T") assert(caaaGt.getVariant.getAlternateAllele === "CAAA") assert(caaaGt.getAlleles.count(_ == GenotypeAllele.OTHER_ALT) === 2) diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoverVariantsSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoverVariantsSuite.scala index 
17332534..f61a0735 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoverVariantsSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoverVariantsSuite.scala @@ -18,11 +18,11 @@ package org.bdgenomics.avocado.genotyping import org.bdgenomics.adam.models.{ - RecordGroupDictionary, + ReadGroupDictionary, SequenceDictionary, SequenceRecord } -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.avocado.AvocadoFunSuite import org.bdgenomics.formats.avro.{ AlignmentRecord, Variant } @@ -31,104 +31,104 @@ class DiscoverVariantsSuite extends AvocadoFunSuite { val unalignedRead = AlignmentRecord.newBuilder() .setReadMapped(false) .setSequence("ACACATGA") - .setQual("!!!!!!!!") + .setQuality("!!!!!!!!") .build val perfectReadMCigar = AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("1") + .setReferenceName("1") .setStart(10L) .setEnd(18L) .setSequence("ACACATGA") - .setQual("!!!!!!!!") + .setQuality("!!!!!!!!") .setCigar("8M") .setMismatchingPositions("8") .build val perfectReadEqCigar = AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("1") + .setReferenceName("1") .setStart(10L) .setEnd(18L) .setSequence("ACACATGA") - .setQual("!!!!!!!!") + .setQuality("!!!!!!!!") .setCigar("8=") .setMismatchingPositions("8") .build val snpReadMCigar = AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("1") + .setReferenceName("1") .setStart(10L) .setEnd(18L) .setSequence("ACACATGA") - .setQual("!!!!!!!!") + .setQuality("!!!!!!!!") .setCigar("8M") .setMismatchingPositions("4C3") .build val snpReadEqCigar = AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("1") + .setReferenceName("1") .setStart(10L) .setEnd(18L) .setSequence("ACACATGA") - .setQual("!!!!!!!!") + .setQuality("!!!!!!!!") .setCigar("4=1X3=") .setMismatchingPositions("4C3") .build val snpReadHardClip = 
AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("1") + .setReferenceName("1") .setStart(10L) .setEnd(18L) .setSequence("ACACATGA") - .setQual("!!!!!!!!") + .setQuality("!!!!!!!!") .setCigar("2H8M") .setMismatchingPositions("4C3") .build val snpReadSoftClip = AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("1") + .setReferenceName("1") .setStart(10L) .setEnd(18L) .setSequence("TGACACATGA") - .setQual("!!!!!!!!!!") + .setQuality("!!!!!!!!!!") .setCigar("2S8M") .setMismatchingPositions("4C3") .build val insertRead = AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("2") + .setReferenceName("2") .setStart(10L) .setEnd(18L) .setSequence("ACACTTATGA") - .setQual("!!!!!!!!!!") + .setQuality("!!!!!!!!!!") .setCigar("4M2I4M") .setMismatchingPositions("8") .build val deleteRead = AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("3") + .setReferenceName("3") .setStart(10L) .setEnd(20L) .setSequence("ACACATGA") - .setQual("!!!!!!!!") + .setQuality("!!!!!!!!") .setCigar("4M2D4M") .setMismatchingPositions("4^TT4") .build val mnpRead = AlignmentRecord.newBuilder() .setReadMapped(true) - .setContigName("3") + .setReferenceName("3") .setStart(10L) .setEnd(18L) .setSequence("ACACATGA") - .setQual("!!!!!!!!") + .setQuality("!!!!!!!!") .setCigar("8M") .setMismatchingPositions("3T0T3") .build @@ -153,7 +153,7 @@ class DiscoverVariantsSuite extends AvocadoFunSuite { } def validateSnp(snp: Variant) { - assert(snp.getContigName() === "1") + assert(snp.getReferenceName() === "1") assert(snp.getStart() === 14L) assert(snp.getEnd() === 15L) assert(snp.getReferenceAllele === "C") @@ -185,7 +185,7 @@ class DiscoverVariantsSuite extends AvocadoFunSuite { } def validateInsertion(ins: Variant) { - assert(ins.getContigName() === "2") + assert(ins.getReferenceName() === "2") assert(ins.getStart() === 13L) assert(ins.getEnd() === 14L) assert(ins.getReferenceAllele() === "C") @@ -206,7 +206,7 @@ class DiscoverVariantsSuite 
extends AvocadoFunSuite { } def validateDeletion(del: Variant) { - assert(del.getContigName() === "3") + assert(del.getReferenceName() === "3") assert(del.getStart() === 13L) assert(del.getEnd() === 16L) assert(del.getReferenceAllele() === "CTT") @@ -233,12 +233,12 @@ class DiscoverVariantsSuite extends AvocadoFunSuite { snpReadMCigar, snpReadEqCigar, insertRead, deleteRead)) - val readRdd = AlignmentRecordRDD(rdd, + val readRdd = AlignmentRecordDataset(rdd, SequenceDictionary( SequenceRecord("1", 50L), SequenceRecord("2", 40L), SequenceRecord("3", 30L)), - RecordGroupDictionary.empty, + ReadGroupDictionary.empty, Seq.empty) val variantRdd = DiscoverVariants(readRdd) @@ -250,7 +250,7 @@ class DiscoverVariantsSuite extends AvocadoFunSuite { test("break TT->CA mnp into two snps") { val variants = DiscoverVariants.variantsInRead(mnpRead, 0) assert(variants.size === 2) - assert(variants.forall(_.contigName == "3")) + assert(variants.forall(_.referenceName == "3")) assert(variants.forall(_.referenceAllele == "T")) val optC = variants.find(_.alternateAllele == Some("C")) assert(optC.isDefined) diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoveredVariantSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoveredVariantSuite.scala index b3e7a90f..003c2e80 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoveredVariantSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoveredVariantSuite.scala @@ -24,7 +24,7 @@ class DiscoveredVariantSuite extends FunSuite { test("round trip conversion to/from variant") { val variant = Variant.newBuilder - .setContigName("ctg") + .setReferenceName("ctg") .setStart(100L) .setEnd(101L) .setReferenceAllele("A") diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/JointAnnotatorCallerSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/JointAnnotatorCallerSuite.scala index 
f8df3644..3410d451 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/JointAnnotatorCallerSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/JointAnnotatorCallerSuite.scala @@ -31,7 +31,7 @@ import scala.collection.JavaConversions._ class JointAnnotatorCallerSuite extends AvocadoFunSuite { val baseGt = Genotype.newBuilder - .setContigName("chr1") + .setReferenceName("chr1") .setStart(1000) .setEnd(1001) .setVariant(Variant.newBuilder diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/ObserverSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/ObserverSuite.scala index d5743a0a..3f0a5307 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/ObserverSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/ObserverSuite.scala @@ -33,10 +33,10 @@ class ObserverSuite extends AvocadoFunSuite { val read = AlignmentRecord.newBuilder .setStart(10L) .setEnd(11L) - .setContigName("ctg") + .setReferenceName("ctg") .setSequence("AAAA") - .setQual("****") - .setMapq(0) + .setQuality("****") + .setMappingQuality(0) .setReadMapped(true) .setCigar("4S") .setMismatchingPositions("0") @@ -68,17 +68,17 @@ class ObserverSuite extends AvocadoFunSuite { val read = AlignmentRecord.newBuilder .setStart(10L) .setEnd(15L) - .setContigName("ctg") + .setReferenceName("ctg") .setSequence("ACGT") - .setQual(Array(20, 30, 40, 50) + .setQuality(Array(20, 30, 40, 50) .map(v => (v + 33).toChar) .mkString) .setReadMapped(true) .setReadNegativeStrand(false) .setCigar("4M") .setMismatchingPositions("4") - .setMapq(50) - .setRecordGroupSample("sample") + .setMappingQuality(50) + .setReadGroupSampleId("sample") .build() val obs = Observer.observeRead(read) @@ -115,17 +115,17 @@ class ObserverSuite extends AvocadoFunSuite { val read = AlignmentRecord.newBuilder .setStart(10L) .setEnd(12L) - .setContigName("ctg") + .setReferenceName("ctg") .setSequence("ACGT") - 
.setQual(Array(20, 30, 40, 50) + .setQuality(Array(20, 30, 40, 50) .map(v => (v + 33).toChar) .mkString) .setReadMapped(true) .setReadNegativeStrand(false) .setCigar("1M2I1M") .setMismatchingPositions("2") - .setMapq(50) - .setRecordGroupSample("sample") + .setMappingQuality(50) + .setReadGroupSampleId("sample") .build() val obs = Observer.observeRead(read) @@ -174,17 +174,17 @@ class ObserverSuite extends AvocadoFunSuite { val read = AlignmentRecord.newBuilder .setStart(10L) .setEnd(17L) - .setContigName("ctg") + .setReferenceName("ctg") .setSequence("ACGT") - .setQual(Array(20, 30, 40, 50) + .setQuality(Array(20, 30, 40, 50) .map(v => (v + 33).toChar) .mkString) .setReadMapped(true) .setReadNegativeStrand(false) .setCigar("2M2D2M") .setMismatchingPositions("2^NN2") - .setMapq(50) - .setRecordGroupSample("sample") + .setMappingQuality(50) + .setReadGroupSampleId("sample") .build() val obs = Observer.observeRead(read) diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/SquareOffReferenceModelSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/SquareOffReferenceModelSuite.scala index 485e9152..f379dfb7 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/SquareOffReferenceModelSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/SquareOffReferenceModelSuite.scala @@ -58,7 +58,7 @@ class SquareOffReferenceModelSuite extends AvocadoFunSuite { val variants = discoveredVariants.rdd.collect assert(variants.size === 3) - assert(variants.forall(_.getContigName == "chr22")) + assert(variants.forall(_.getReferenceName == "chr22")) val s602 = variants.filter(_.getStart == 16157602L) assert(s602.size === 1) assert(s602.forall(_.getReferenceAllele == "G")) @@ -77,14 +77,14 @@ class SquareOffReferenceModelSuite extends AvocadoFunSuite { test("find genotype if variant is present") { val variant = Variant.newBuilder() - .setContigName("ctg") + .setReferenceName("ctg") .setStart(1L) 
.setEnd(2L) .setReferenceAllele("A") .setAlternateAllele("T") .build val genotypes = Iterable(Genotype.newBuilder - .setContigName("ctg") + .setReferenceName("ctg") .setStart(1L) .setEnd(2L) .setVariant(Variant.newBuilder() @@ -101,14 +101,14 @@ class SquareOffReferenceModelSuite extends AvocadoFunSuite { test("don't find genotype if variant is not present") { val variant = Variant.newBuilder() - .setContigName("ctg") + .setReferenceName("ctg") .setStart(1L) .setEnd(2L) .setReferenceAllele("A") .setAlternateAllele("T") .build val genotypes = Iterable(Genotype.newBuilder - .setContigName("ctg") + .setReferenceName("ctg") .setStart(1L) .setEnd(10L) .setVariant(Variant.newBuilder() @@ -126,14 +126,14 @@ class SquareOffReferenceModelSuite extends AvocadoFunSuite { val variant = Variant.newBuilder .setStart(100L) .setEnd(101L) - .setContigName("ctg") + .setReferenceName("ctg") .setReferenceAllele("A") .setAlternateAllele("G") .build val genotypes = Iterable(Genotype.newBuilder .setStart(90L) .setEnd(110L) - .setContigName("ctg") + .setReferenceName("ctg") .setNonReferenceLikelihoods(Seq(0.0, -1.0, -2.0) .map(d => d: java.lang.Double)) .build) @@ -146,7 +146,7 @@ class SquareOffReferenceModelSuite extends AvocadoFunSuite { optExcisedGenotype.foreach(gt => { assert(gt.getStart === 100L) assert(gt.getEnd === 101L) - assert(gt.getContigName === "ctg") + assert(gt.getReferenceName === "ctg") assert(gt.getVariant.getReferenceAllele === "A") assert(gt.getVariant.getAlternateAllele === "G") assert(gt.getGenotypeLikelihoods.size === 3) @@ -160,12 +160,12 @@ class SquareOffReferenceModelSuite extends AvocadoFunSuite { val variant = Variant.newBuilder .setStart(100L) .setEnd(101L) - .setContigName("ctg") + .setReferenceName("ctg") .setReferenceAllele("A") .setAlternateAllele("G") .build val genotypes = Iterable(Genotype.newBuilder - .setContigName("ctg") + .setReferenceName("ctg") .setStart(100L) .setEnd(101L) .setVariant(Variant.newBuilder() @@ -176,7 +176,7 @@ class 
SquareOffReferenceModelSuite extends AvocadoFunSuite { .build, Genotype.newBuilder .setStart(90L) .setEnd(110L) - .setContigName("ctg") + .setReferenceName("ctg") .setNonReferenceLikelihoods(Seq(0.0, -1.0, -2.0) .map(d => d: java.lang.Double)) .setSampleId("sample2") @@ -189,7 +189,7 @@ class SquareOffReferenceModelSuite extends AvocadoFunSuite { assert(vc.genotypes.size === 2) assert(vc.genotypes.forall(_.getStart == 100L)) assert(vc.genotypes.forall(_.getEnd == 101L)) - assert(vc.genotypes.forall(_.getContigName == "ctg")) + assert(vc.genotypes.forall(_.getReferenceName == "ctg")) assert(vc.genotypes.forall(_.getVariant.getReferenceAllele == "A")) assert(vc.genotypes.forall(_.getVariant.getAlternateAllele == "G")) assert(vc.genotypes.count(_.getSampleId == "sample1") === 1) diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/TrioCallerSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/TrioCallerSuite.scala index 005a2402..3b6be3e6 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/TrioCallerSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/TrioCallerSuite.scala @@ -19,13 +19,13 @@ package org.bdgenomics.avocado.genotyping import htsjdk.samtools.ValidationStringency import org.bdgenomics.adam.models.{ - RecordGroup, - RecordGroupDictionary, + ReadGroup, + ReadGroupDictionary, SequenceDictionary, VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.avocado.AvocadoFunSuite import org.bdgenomics.formats.avro.{ AlignmentRecord, @@ -38,42 +38,42 @@ import scala.collection.JavaConversions._ class TrioCallerSuite extends AvocadoFunSuite { - def makeRdd(recordGroups: RecordGroupDictionary): AlignmentRecordRDD = { - AlignmentRecordRDD(sc.emptyRDD[AlignmentRecord], + def makeRdd(readGroups: ReadGroupDictionary): 
AlignmentRecordDataset = { + AlignmentRecordDataset(sc.emptyRDD[AlignmentRecord], SequenceDictionary.empty, - recordGroups, + readGroups, Seq.empty[ProcessingStep]) } sparkTest("cannot have a sample with no record groups") { intercept[IllegalArgumentException] { - TrioCaller.extractSampleId(makeRdd(RecordGroupDictionary.empty)) + TrioCaller.extractSampleId(makeRdd(ReadGroupDictionary.empty)) } } sparkTest("cannot have a sample with discordant sample ids") { intercept[IllegalArgumentException] { - TrioCaller.extractSampleId(makeRdd(RecordGroupDictionary(Seq( - RecordGroup("sample1", "rg1"), - RecordGroup("sample2", "rg2"))))) + TrioCaller.extractSampleId(makeRdd(ReadGroupDictionary(Seq( + ReadGroup("sample1", "rg1"), + ReadGroup("sample2", "rg2"))))) } } sparkTest("extract id from a single read group") { - val sampleId = TrioCaller.extractSampleId(makeRdd(RecordGroupDictionary(Seq( - RecordGroup("sample1", "rg1"))))) + val sampleId = TrioCaller.extractSampleId(makeRdd(ReadGroupDictionary(Seq( + ReadGroup("sample1", "rg1"))))) assert(sampleId === "sample1") } sparkTest("extract id from multiple read groups") { - val sampleId = TrioCaller.extractSampleId(makeRdd(RecordGroupDictionary(Seq( - RecordGroup("sample1", "rg1"), - RecordGroup("sample1", "rg2"))))) + val sampleId = TrioCaller.extractSampleId(makeRdd(ReadGroupDictionary(Seq( + ReadGroup("sample1", "rg1"), + ReadGroup("sample1", "rg2"))))) assert(sampleId === "sample1") } val variant = Variant.newBuilder - .setContigName("chr") + .setReferenceName("chr") .setStart(100L) .setEnd(101L) .setReferenceAllele("A") diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/models/CopyNumberMapSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/models/CopyNumberMapSuite.scala index 0eec38a7..11d2cc51 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/models/CopyNumberMapSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/models/CopyNumberMapSuite.scala @@ -18,7 +18,7 @@ 
package org.bdgenomics.avocado.models import org.bdgenomics.adam.models.ReferenceRegion -import org.bdgenomics.adam.rdd.feature.FeatureRDD +import org.bdgenomics.adam.rdd.feature.FeatureDataset import org.bdgenomics.avocado.AvocadoFunSuite import org.bdgenomics.formats.avro.Feature @@ -30,24 +30,24 @@ class CopyNumberMapSuite extends AvocadoFunSuite { assert(emptyMap.basePloidy === 2) assert(emptyMap.minPloidy === 2) assert(emptyMap.maxPloidy === 2) - assert(emptyMap.variantsByContig.isEmpty) + assert(emptyMap.variantsByReference.isEmpty) } sparkTest("create a map with only diploid features") { val cnvs = Seq(Feature.newBuilder .setStart(100L) .setEnd(201L) - .setContigName("chr1") + .setReferenceName("chr1") .setFeatureType("DIP") .build) val emptyMap = CopyNumberMap(2, - FeatureRDD(sc.parallelize(cnvs))) + FeatureDataset(sc.parallelize(cnvs))) assert(emptyMap.basePloidy === 2) assert(emptyMap.minPloidy === 2) assert(emptyMap.maxPloidy === 2) - assert(emptyMap.variantsByContig.isEmpty) + assert(emptyMap.variantsByReference.isEmpty) assert(emptyMap.overlappingVariants(ReferenceRegion("chr1", 100L, 201L)) .isEmpty) } @@ -56,42 +56,42 @@ class CopyNumberMapSuite extends AvocadoFunSuite { val cnvs = Seq(Feature.newBuilder .setStart(100L) .setEnd(201L) - .setContigName("chr1") + .setReferenceName("chr1") .setFeatureType("DIP") .build, Feature.newBuilder .setStart(1000L) .setEnd(2000L) - .setContigName("chr1") + .setReferenceName("chr1") .setFeatureType("DUP") .build, Feature.newBuilder .setStart(2000L) .setEnd(3000L) - .setContigName("chr1") + .setReferenceName("chr1") .setFeatureType("DEL") .build, Feature.newBuilder .setStart(2000L) .setEnd(3000L) - .setContigName("chr2") + .setReferenceName("chr2") .setFeatureType("DEL") .build) val cnvMap = CopyNumberMap(2, - FeatureRDD(sc.parallelize(cnvs))) + FeatureDataset(sc.parallelize(cnvs))) assert(cnvMap.basePloidy === 2) assert(cnvMap.minPloidy === 1) assert(cnvMap.maxPloidy === 3) - assert(cnvMap.variantsByContig.size === 
2) - val chr1Cnvs = cnvMap.variantsByContig("chr1") + assert(cnvMap.variantsByReference.size === 2) + val chr1Cnvs = cnvMap.variantsByReference("chr1") assert(chr1Cnvs.size === 2) assert(chr1Cnvs(0)._1 === ReferenceRegion("chr1", 1000L, 2000L)) assert(chr1Cnvs(0)._2 === 3) assert(chr1Cnvs(1)._1 === ReferenceRegion("chr1", 2000L, 3000L)) assert(chr1Cnvs(1)._2 === 1) - val chr2Cnvs = cnvMap.variantsByContig("chr2") + val chr2Cnvs = cnvMap.variantsByReference("chr2") assert(chr2Cnvs.size === 1) assert(chr2Cnvs(0)._1 === ReferenceRegion("chr2", 2000L, 3000L)) assert(chr2Cnvs(0)._2 === 1) diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/RealignerSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/RealignerSuite.scala index c21615be..e9cb71a0 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/RealignerSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/RealignerSuite.scala @@ -20,11 +20,11 @@ package org.bdgenomics.avocado.realigner import org.bdgenomics.adam.models.{ SequenceDictionary, SequenceRecord, - RecordGroup, - RecordGroupDictionary + ReadGroup, + ReadGroupDictionary } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.avocado.AvocadoFunSuite import org.bdgenomics.avocado.models.{ Clipped, @@ -192,9 +192,9 @@ class RealignerSuite extends AvocadoFunSuite { def makeAndRealignRdd(reads: Seq[AlignmentRecord], kmerLength: Int): Array[AlignmentRecord] = { - val gRdd = AlignmentRecordRDD(sc.parallelize(reads), + val gRdd = AlignmentRecordDataset(sc.parallelize(reads), SequenceDictionary(SequenceRecord("ctg", 50L)), - RecordGroupDictionary(Seq(RecordGroup("rg", "rg"))), + ReadGroupDictionary(Seq(ReadGroup("rg", "rg"))), Seq.empty) // realign the genomic rdd @@ -219,8 +219,8 @@ class RealignerSuite extends AvocadoFunSuite { 
AlignmentRecord.newBuilder() .setReadName(rId.toString) - .setContigName("ctg") - .setRecordGroupName("rg") + .setReferenceName("ctg") + .setReadGroupId("rg") .setReadMapped(true) .setSequence(sequence.drop(rId).take(readLength)) .setStart(rId.toLong) @@ -262,8 +262,8 @@ class RealignerSuite extends AvocadoFunSuite { AlignmentRecord.newBuilder() .setReadName(rId.toString) - .setContigName("ctg") - .setRecordGroupName("rg") + .setReferenceName("ctg") + .setReadGroupId("rg") .setReadMapped(true) .setSequence(sequence.drop(rId).take(readLength)) .setStart(rId.toLong) diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/HardLimiterSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/HardLimiterSuite.scala index e991bf87..f0fe3ff9 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/HardLimiterSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/HardLimiterSuite.scala @@ -26,7 +26,7 @@ class HardLimiterSuite extends AvocadoFunSuite { val reads = (0 to 5).map(i => { AlignmentRecord.newBuilder() - .setContigName("ctg") + .setReferenceName("ctg") .setStart(i.toLong) .setEnd(i.toLong + 3L) .build() @@ -139,10 +139,10 @@ class HardLimiterSuite extends AvocadoFunSuite { } } - test("adding a read that is on the wrong contig should fire an assert") { + test("adding a read that is on the wrong reference should fire an assert") { intercept[AssertionError] { val randomRead = AlignmentRecord.newBuilder() - .setContigName("random") + .setReferenceName("random") .setStart(100L) .setEnd(101L) .build() diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/PrefilterReadsSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/PrefilterReadsSuite.scala index 657ced72..f420518b 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/PrefilterReadsSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/PrefilterReadsSuite.scala @@ -19,12 +19,12 @@ package 
org.bdgenomics.avocado.util import org.bdgenomics.avocado.AvocadoFunSuite import org.bdgenomics.adam.models.{ - RecordGroupDictionary, + ReadGroupDictionary, SequenceDictionary, SequenceRecord } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.formats.avro.AlignmentRecord case class TestPrefilterReadsArgs(var autosomalOnly: Boolean = false, @@ -68,7 +68,7 @@ class PrefilterReadsSuite extends AvocadoFunSuite { assert(!PrefilterReads.filterMapped(unmappedRead, true)) } - val contigNames = Seq("chr1", + val referenceNames = Seq("chr1", "1", "chrX", "X", @@ -90,7 +90,7 @@ class PrefilterReadsSuite extends AvocadoFunSuite { } } - contigNames.zipWithIndex + referenceNames.zipWithIndex .foreach(p => assertIdx(p._2, p._1)) } @@ -119,31 +119,31 @@ class PrefilterReadsSuite extends AvocadoFunSuite { } test("filter autosomal chromosomes from generator") { - testChromosomeHelperSet(PrefilterReads.contigFilterFn(TestPrefilterReadsArgs(autosomalOnly = true)), Set(0, 1)) + testChromosomeHelperSet(PrefilterReads.referenceFilterFn(TestPrefilterReadsArgs(autosomalOnly = true)), Set(0, 1)) } test("filter autosomal + sex chromosomes from generator") { - testChromosomeHelperSet(PrefilterReads.contigFilterFn(TestPrefilterReadsArgs()), Set(0, 1, + testChromosomeHelperSet(PrefilterReads.referenceFilterFn(TestPrefilterReadsArgs()), Set(0, 1, 2, 3, 4, 5)) } test("filter all chromosomes from generator") { - testChromosomeHelperSet(PrefilterReads.contigFilterFn(TestPrefilterReadsArgs(keepMitochondrialChromosome = true)), Set(0, 1, 2, 3, 4, 5, 6, 7)) + testChromosomeHelperSet(PrefilterReads.referenceFilterFn(TestPrefilterReadsArgs(keepMitochondrialChromosome = true)), Set(0, 1, 2, 3, 4, 5, 6, 7)) } - test("update a read whose mate is mapped to a filtered contig") { + test("update a read whose mate is mapped to a filtered reference") { val read = 
AlignmentRecord.newBuilder() .setReadPaired(true) .setMateMapped(true) - .setMateContigName("notARealContig") + .setMateReferenceName("notARealReference") .build - val filters = PrefilterReads.contigFilterFn(TestPrefilterReadsArgs()) + val filters = PrefilterReads.referenceFilterFn(TestPrefilterReadsArgs()) val nullified = PrefilterReads.maybeNullifyMate(read, filters) assert(!nullified.getMateMapped) - assert(nullified.getMateContigName == null) + assert(nullified.getMateReferenceName == null) } val reads = Seq(AlignmentRecord.newBuilder() @@ -154,12 +154,12 @@ class PrefilterReadsSuite extends AvocadoFunSuite { AlignmentRecord.newBuilder() .setReadMapped(true) .setDuplicateRead(false)).flatMap(rb => { - contigNames.map(cn => rb.setContigName(cn).build) + referenceNames.map(cn => rb.setReferenceName(cn).build) }) def testReadHelperSet(testArgs: PrefilterReadsArgs, passIdxSet: Set[Int]) { val testFn = PrefilterReads.readFilterFn(testArgs, - PrefilterReads.contigFilterFn(testArgs)) + PrefilterReads.referenceFilterFn(testArgs)) def assertIdx(idx: Int, testRead: AlignmentRecord) = { if (passIdxSet(idx)) { @@ -203,16 +203,16 @@ class PrefilterReadsSuite extends AvocadoFunSuite { Set(16, 17, 18, 19, 20, 21, 22, 23)) } - val sequences = new SequenceDictionary(contigNames.map(cn => SequenceRecord(cn, 10L)) + val sequences = new SequenceDictionary(referenceNames.map(cn => SequenceRecord(cn, 10L)) .toVector) - def testRdd(args: PrefilterReadsArgs, numReads: Int, numContigs: Int) { + def testRdd(args: PrefilterReadsArgs, numReads: Int, numReferences: Int) { - val readRdd = AlignmentRecordRDD(sc.parallelize(reads), sequences, RecordGroupDictionary.empty, Seq.empty) + val readRdd = AlignmentRecordDataset(sc.parallelize(reads), sequences, ReadGroupDictionary.empty, Seq.empty) val filteredRdd = PrefilterReads(readRdd, args) assert(filteredRdd.rdd.count === numReads) - assert(filteredRdd.sequences.records.size === numContigs) + assert(filteredRdd.sequences.records.size === 
numReferences) } sparkTest("filter rdd of reads mapped to autosomal chromosomes from generator") { diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/RewriteHetsSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/RewriteHetsSuite.scala index 05084351..d82451d3 100644 --- a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/RewriteHetsSuite.scala +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/RewriteHetsSuite.scala @@ -19,7 +19,7 @@ package org.bdgenomics.avocado.util import org.bdgenomics.avocado.AvocadoFunSuite import org.bdgenomics.adam.models.SequenceDictionary -import org.bdgenomics.adam.rdd.variant.GenotypeRDD +import org.bdgenomics.adam.rdd.variant.GenotypeDataset import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele, Variant } import scala.collection.JavaConversions._ @@ -165,9 +165,9 @@ class RewriteHetsSuite extends AvocadoFunSuite { homRefSnp, homRefIndel, homAltSnp, homAltIndel) - def gtRdd: GenotypeRDD = { + def gtRdd: GenotypeDataset = { val rdd = sc.parallelize(genotypes) - GenotypeRDD(rdd, + GenotypeDataset(rdd, SequenceDictionary.empty, Seq.empty, Seq.empty) diff --git a/pom.xml b/pom.xml index 77de3787..761ed6d1 100644 --- a/pom.xml +++ b/pom.xml @@ -15,15 +15,15 @@ avocado: A Variant Caller, Distributed - 0.24.0 - 1.8.0 + 0.26.0 + 1.8.2 1.8 - 2.11.4 + 2.11.12 2.11 - 2.2.0 + 2.3.2 - 2.6.0 - 0.2.11 + 2.7.5 + 0.2.14 1.1.1 @@ -71,7 +71,7 @@ org.apache.maven.plugins maven-assembly-plugin - 2.4.1 + 3.1.0 org.apache.maven.plugins @@ -86,7 +86,7 @@ pl.project13.maven git-commit-id-plugin - 2.2.1 + 2.2.2 true @@ -122,7 +122,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.5.1 + 3.8.0 ${java.version} ${java.version} @@ -137,7 +137,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.7 + 2.22.1 @@ -153,12 +153,12 @@ exec-maven-plugin org.codehaus.mojo - 1.3.2 + 1.5.0 org.codehaus.mojo build-helper-maven-plugin - 1.10 + 1.12 org.apache.maven.plugins @@ -170,6 +170,11 @@ 
-Psonatype-oss-release -DskipTests + + org.apache.maven.plugins + maven-jar-plugin + 3.1.0 + @@ -276,24 +281,34 @@ - - - Sonatype - http://oss.sonatype.org/content/repositories/snapshots/ - - - Apache - http://people.apache.org/repo/m2-snapshot-repository - - - + + org.seqdoop + hadoop-bam + 7.9.2 + + + org.seqdoop + htsjdk + + + org.bdgenomics.utils utils-cli-spark2_2.11 ${utils.version} + + org.bdgenomics.utils + utils-intervalrdd-spark2_2.11 + ${utils.version} + + + org.bdgenomics.utils + utils-io-spark2_2.11 + ${utils.version} + org.bdgenomics.avocado avocado-core_2.11 @@ -312,6 +327,12 @@ ${utils.version} test-jar test + + + org.apache.spark + * + + org.bdgenomics.utils @@ -368,7 +389,7 @@ org.scalatest scalatest_${scala.version.prefix} - 2.2.6 + 3.0.6 test @@ -379,12 +400,12 @@ commons-io commons-io - 1.3.2 + 2.6 args4j args4j - 2.0.23 + 2.0.31 org.apache.spark @@ -426,7 +447,7 @@ org.apache.maven.plugins maven-source-plugin - 2.2.1 + 3.0.1 attach-sources @@ -440,7 +461,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.9.1 + 3.0.1 attach-javadocs @@ -495,7 +516,7 @@ scoverage-maven-plugin ${scoverage.plugin.version} - 2.11.4 + 2.11.12 org.bdgenomics.avocado.Timers true 90