Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add validation stringency to bam parsing, flagstat #976

Merged
merged 1 commit into from
Mar 24, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
*/
package org.bdgenomics.adam.cli

import org.apache.hadoop.fs.{ Path, FileSystem }
import org.apache.hadoop.mapreduce.Job
import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.projections.{ Projection, AlignmentRecordField }
import org.bdgenomics.adam.projections.{ AlignmentRecordField, Projection }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.formats.avro.AlignmentRecord
import org.bdgenomics.utils.cli._
import org.kohsuke.args4j.Argument
import org.kohsuke.args4j.{ Argument, Option ⇒ Args4jOption }

object FlagStat extends BDGCommandCompanion {
val commandName: String = "flagstat"
Expand All @@ -41,6 +41,8 @@ class FlagStatArgs extends Args4jBase with ParquetArgs {
val inputPath: String = null
@Argument(required = false, metaVar = "OUTPUT", usage = "Optionally write the stats to this file.", index = 1)
val outputPath: String = null
@Args4jOption(required = false, name = "-stringency", usage = "Set the parsing stringency: SILENT, LENIENT, STRICT.")
val stringency: String = "SILENT"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the default if SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY is not set?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default is STRICT. flagstat is a simple enough operation that I thought SILENT was, more often than not, what a user would want.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I agree with you on flagstat defaulting to SILENT, and I see the default in loadBam is STRICT. +1

}

class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagStatArgs] {
Expand All @@ -65,7 +67,14 @@ class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagSta
AlignmentRecordField.supplementaryAlignment
)

val adamFile: RDD[AlignmentRecord] = sc.loadAlignments(args.inputPath, projection = Some(projection))
val stringency = ValidationStringency.valueOf(args.stringency)

val adamFile: RDD[AlignmentRecord] =
sc.loadAlignments(
args.inputPath,
projection = Some(projection),
stringency = stringency
)

val (failedVendorQuality, passedVendorQuality) = adamFile.adamFlagStat()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,8 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log
*
* @see loadAlignments
*/
def loadBam(filePath: String): AlignmentRecordRDD = {
def loadBam(filePath: String,
validationStringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD = {
val path = new Path(filePath)
val fs =
Option(
Expand Down Expand Up @@ -289,6 +290,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log
// We need to separately read the header, so that we can inject the sequence dictionary
// data into each individual Read (see the argument to samRecordConverter.convert,
// below).
sc.hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, validationStringency.toString)
val samHeader = SAMHeaderReader.readSAMHeaderFrom(fp, sc.hadoopConfiguration)
log.info("Loaded header from " + fp)
val sd = adamBamDictionaryLoad(samHeader)
Expand Down Expand Up @@ -858,7 +860,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log
if (filePath.endsWith(".sam") ||
filePath.endsWith(".bam")) {
log.info("Loading " + filePath + " as SAM/BAM and converting to AlignmentRecords. Projection is ignored.")
loadBam(filePath)
loadBam(filePath, stringency)
} else if (filePath.endsWith(".ifq")) {
log.info("Loading " + filePath + " as interleaved FASTQ and converting to AlignmentRecords. Projection is ignored.")
loadInterleavedFastq(filePath)
Expand Down