From 93b2510f78d230b9dfdb8e8cda1a18e34c0249f5 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Tue, 6 Mar 2018 17:10:58 -0800 Subject: [PATCH] [ADAM-1939] Allow validation stringency to waive off FLAG arrays. Resolves #1939. --- .../converters/VariantContextConverter.scala | 20 ++++++++++++++++--- .../resources/invalid/small.INFO_flag.vcf | 5 +++++ .../adam/rdd/ADAMContextSuite.scala | 7 +++++++ 3 files changed, 29 insertions(+), 3 deletions(-) create mode 100644 adam-core/src/test/resources/invalid/small.INFO_flag.vcf diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala index 45d3f7cbd3..1c2d7bc58f 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala @@ -296,7 +296,7 @@ class VariantContextConverter( import VariantContextConverter._ // format fns gatk --> bdg, extract fns bdg --> gatk - private val variantFormatFn = makeVariantFormatFn(headerLines) + private val variantFormatFn = makeVariantFormatFn(headerLines, stringency) private val variantExtractFn = makeVariantExtractFn(headerLines) private val genotypeFormatFn = makeGenotypeFormatFn(headerLines) private val genotypeExtractFn = makeGenotypeExtractFn(headerLines) @@ -1636,7 +1636,8 @@ class VariantContextConverter( } private def makeVariantFormatFn( - headerLines: Seq[VCFHeaderLine]): (HtsjdkVariantContext, Option[String], Int, Boolean) => (Variant, Variant) = { + headerLines: Seq[VCFHeaderLine], + stringency: ValidationStringency = ValidationStringency.STRICT): (HtsjdkVariantContext, Option[String], Int, Boolean) => (Variant, Variant) = { val attributeFns: Iterable[(HtsjdkVariantContext, Int, Array[Int]) => Option[(String, String)]] = headerLines .flatMap(hl => hl match { @@ -1648,7 +1649,20 @@ class VariantContextConverter( if (DefaultHeaderLines.infoHeaderLines .find(_.getID == key) .isEmpty) { - lineToVariantContextExtractor(il) + try { + lineToVariantContextExtractor(il) + } catch { + case t: Throwable => { + if (stringency == ValidationStringency.STRICT) { + throw t + } else { + if (stringency == ValidationStringency.LENIENT) { + log.warn("Saw invalid info field %s. Ignoring...".format(t)) + } + None + } + } + } } else { None } diff --git a/adam-core/src/test/resources/invalid/small.INFO_flag.vcf b/adam-core/src/test/resources/invalid/small.INFO_flag.vcf new file mode 100644 index 0000000000..4c1bf4c83a --- /dev/null +++ b/adam-core/src/test/resources/invalid/small.INFO_flag.vcf @@ -0,0 +1,5 @@ +##fileformat=VCFv4.1 +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 14397 . CTGT C 139.12 IndelQD AC=2;AF=0.333;AN=6;BaseQRankSum=1.800;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384 GT:AD:DP:FT:GQ:PL 0/1:16,4:20:rd:99:120,0,827 0/1:8,2:10:dp;rd:60:60,0,414 0/0:39,0:39:PASS:99:0,116,2114 \ No newline at end of file diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index fd42565bd1..448944c7b4 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -179,6 +179,13 @@ class ADAMContextSuite extends ADAMFunSuite { assert(last.getName === "gn|BPY2C;ccds|CCDS44030;ens|ENSG00000185894;vega|OTTHUMG00000045199") } + sparkTest("can read a small .vcf file with a validation issue") { + val path = testFile("invalid/small.INFO_flag.vcf") + + val vcs = sc.loadVcf(path, stringency = ValidationStringency.LENIENT) + assert(vcs.rdd.count === 1) + } + sparkTest("can read a small .vcf file") { val path = testFile("small.vcf")