From 9246472f702c72ef5134faa079c219718af4bbcb Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Wed, 23 Mar 2016 16:28:56 +0000 Subject: [PATCH] add validation stringency to bam parsing, flagstat --- .../org/bdgenomics/adam/cli/FlagStat.scala | 19 ++++++++++++++----- .../org/bdgenomics/adam/rdd/ADAMContext.scala | 6 ++++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala index 5c4f485631..7ca51bfb6c 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala @@ -17,15 +17,15 @@ */ package org.bdgenomics.adam.cli -import org.apache.hadoop.fs.{ Path, FileSystem } -import org.apache.hadoop.mapreduce.Job +import htsjdk.samtools.ValidationStringency +import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.projections.{ Projection, AlignmentRecordField } +import org.bdgenomics.adam.projections.{ AlignmentRecordField, Projection } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.formats.avro.AlignmentRecord import org.bdgenomics.utils.cli._ -import org.kohsuke.args4j.Argument +import org.kohsuke.args4j.{ Argument, Option ⇒ Args4jOption } object FlagStat extends BDGCommandCompanion { val commandName: String = "flagstat" @@ -41,6 +41,8 @@ class FlagStatArgs extends Args4jBase with ParquetArgs { val inputPath: String = null @Argument(required = false, metaVar = "OUTPUT", usage = "Optionally write the stats to this file.", index = 1) val outputPath: String = null + @Args4jOption(required = false, name = "-stringency", usage = "Set the parsing stringency: SILENT, LENIENT, STRICT.") + val stringency: String = "SILENT" } class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagStatArgs] { @@ -65,7 +67,14 @@ class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagSta AlignmentRecordField.supplementaryAlignment ) - val adamFile: RDD[AlignmentRecord] = sc.loadAlignments(args.inputPath, projection = Some(projection)) + val stringency = ValidationStringency.valueOf(args.stringency) + + val adamFile: RDD[AlignmentRecord] = + sc.loadAlignments( + args.inputPath, + projection = Some(projection), + stringency = stringency + ) val (failedVendorQuality, passedVendorQuality) = adamFile.adamFlagStat() diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala index 31e97ceb39..70c4a577c2 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala @@ -261,7 +261,8 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * * @see loadAlignments */ - def loadBam(filePath: String): AlignmentRecordRDD = { + def loadBam(filePath: String, + validationStringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD = { val path = new Path(filePath) val fs = Option( @@ -289,6 +290,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log // We need to separately read the header, so that we can inject the sequence dictionary // data into each individual Read (see the argument to samRecordConverter.convert, // below). + sc.hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, validationStringency.toString) val samHeader = SAMHeaderReader.readSAMHeaderFrom(fp, sc.hadoopConfiguration) log.info("Loaded header from " + fp) val sd = adamBamDictionaryLoad(samHeader) @@ -858,7 +860,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log if (filePath.endsWith(".sam") || filePath.endsWith(".bam")) { log.info("Loading " + filePath + " as SAM/BAM and converting to AlignmentRecords. Projection is ignored.") - loadBam(filePath) + loadBam(filePath, stringency) } else if (filePath.endsWith(".ifq")) { log.info("Loading " + filePath + " as interleaved FASTQ and converting to AlignmentRecords. Projection is ignored.") loadInterleavedFastq(filePath)