From 7a5f55e13a6286166fe273b68ea46377d30b967d Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Mon, 13 Feb 2017 14:20:38 -0800 Subject: [PATCH] [ADAM-1381] Fix Variant end position. Resolves #1381. Sets variant end position to the proper site for symbolic alleles. --- .../converters/VariantContextConverter.scala | 15 +++++++++++++-- .../VariantContextConverterSuite.scala | 16 ++++++++++++++++ .../bdgenomics/adam/rdd/ADAMContextSuite.scala | 2 -- .../rdd/variant/VariantContextRDDSuite.scala | 1 + 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala index 0ef009592c..e65cbfada7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala @@ -106,6 +106,16 @@ private[adam] object VariantContextConverter { } } + private val OPT_NON_REF = Some(Allele.create("<NON_REF>", false)) + + private def optNonRef(v: Variant): Option[Allele] = { + if (v.getAlternateAllele != null) { + None + } else { + OPT_NON_REF + } + } + /** * Converts the alleles in a variant into a Java collection of htsjdk alleles. 
* @@ -115,7 +125,8 @@ private[adam] object VariantContextConverter { */ private def convertAlleles(v: Variant): java.util.Collection[Allele] = { val asSeq = Seq(convertAlleleOpt(v.getReferenceAllele, true), - convertAlleleOpt(v.getAlternateAllele)).flatten + convertAlleleOpt(v.getAlternateAllele), + optNonRef(v)).flatten asSeq } @@ -1848,7 +1859,7 @@ private[adam] class VariantContextConverter( val builder = new VariantContextBuilder() .chr(v.getContigName) .start(v.getStart + 1) - .stop(v.getStart + v.getReferenceAllele.length) + .stop(v.getEnd) .alleles(VariantContextConverter.convertAlleles(v)) // bind the conversion functions and fold diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala index 25af1493d6..9fa6e1122e 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala @@ -42,6 +42,7 @@ import org.bdgenomics.adam.models.{ SequenceDictionary, VariantContext => ADAMVariantContext } +import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.util.{ ADAMFunSuite, PhredUtils } import org.bdgenomics.formats.avro._ import scala.collection.JavaConversions._ @@ -85,6 +86,7 @@ class VariantContextConverterSuite extends ADAMFunSuite { def adamSNVBuilder(contig: String = "1"): Variant.Builder = Variant.newBuilder() .setContigName(contig) .setStart(0L) + .setEnd(1L) .setReferenceAllele("A") .setAlternateAllele("T") @@ -1863,6 +1865,7 @@ class VariantContextConverterSuite extends ADAMFunSuite { val v = Variant.newBuilder .setContigName("1") .setStart(0L) + .setEnd(1L) .setReferenceAllele("A") .setAlternateAllele("T") .build @@ -2534,4 +2537,17 @@ class VariantContextConverterSuite extends ADAMFunSuite { 
assert(adamGt.getVariantCallingAnnotations.getAttributes.containsKey("STRING_G")) assert(adamGt.getVariantCallingAnnotations.getAttributes.get("STRING_G") === "foo,bar,baz") } + + sparkTest("respect end position for symbolic alts") { + val vcRecords = sc.loadVcf(testFile("gvcf_dir/gvcf_multiallelic.g.vcf")) + .rdd + .collect() + + val symbolic = vcRecords.filter(_.variant.variant.getStart == 16157520L) + .head + val optHtsjdkVc = converter.convert(symbolic) + + assert(optHtsjdkVc.isDefined) + assert(optHtsjdkVc.get.getEnd === 16157602) + } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index a58465c466..10fdbdfb42 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -399,8 +399,6 @@ class ADAMContextSuite extends ADAMFunSuite { val path = new File(testFile("gvcf_dir/gvcf_multiallelic.g.vcf")).getParent() val variants = sc.loadVcf(path).toVariantRDD - // Not sure that the count should be 7 below, however the current failure to read the mult-allelic site happens - // before this assertion is even reached assert(variants.rdd.count === 6) } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala index bc1ba02549..ce548a17b2 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala @@ -42,6 +42,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { val v0 = Variant.newBuilder .setContigName("chr11") .setStart(17409572L) + .setEnd(17409573L) .setReferenceAllele("T") .setAlternateAllele("C") .setNames(ImmutableList.of("rs3131972", "rs201888535"))