Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement contig-to-RefSeq translation, refactor SequenceDictionary with... #238

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ package org.bdgenomics.adam.cli
import org.bdgenomics.adam.avro.ADAMGenotype
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.variation.ADAMVariationContext._
import org.kohsuke.args4j.Argument
import org.kohsuke.args4j.{ Option => Args4jOption, Argument }
import org.apache.spark.rdd.RDD
import org.apache.spark.{ Logging, SparkContext }
import org.apache.hadoop.mapreduce.Job
import java.io.File
import org.bdgenomics.adam.models.SequenceDictionary
import scala.Option

object ADAM2Vcf extends ADAMCommandCompanion {

Expand All @@ -35,17 +38,25 @@ object ADAM2Vcf extends ADAMCommandCompanion {
}

class ADAM2VcfArgs extends Args4jBase with ParquetArgs with SparkArgs {
@Args4jOption(required = false, name = "-dict", usage = "Reference dictionary")
var dictionaryFile: File = _

@Argument(required = true, metaVar = "ADAM", usage = "The ADAM variant files to convert", index = 0)
var adamFile: String = _

@Argument(required = true, metaVar = "VCF", usage = "Location to write VCF data", index = 1)
var outputPath: String = null
}

class ADAM2Vcf(val args: ADAM2VcfArgs) extends ADAMSparkCommand[ADAM2VcfArgs] with Logging {
class ADAM2Vcf(val args: ADAM2VcfArgs) extends ADAMSparkCommand[ADAM2VcfArgs] with DictionaryCommand with Logging {
val companion = ADAM2Vcf

def run(sc: SparkContext, job: Job) {
var dictionary: Option[SequenceDictionary] = loadSequenceDictionary(args.dictionaryFile)
if (dictionary.isDefined)
log.info("Using contig translation")

val adamGTs: RDD[ADAMGenotype] = sc.adamLoad(args.adamFile)
sc.adamVCFSave(args.outputPath, adamGTs.toADAMVariantContext)
sc.adamVCFSave(args.outputPath, adamGTs.toADAMVariantContext, dict = dictionary)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ class Bam2ADAM(args: Bam2ADAMArgs) extends ADAMCommand {
val samReader = new SAMFileReader(new File(args.bamFile), null, true)
samReader.setValidationStringency(args.validationStringency)

val seqDict = SequenceDictionary.fromSAMReader(samReader)
val seqDict = SequenceDictionary(samReader)
val rgDict = RecordGroupDictionary.fromSAMReader(samReader)

println(seqDict)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2014. Mount Sinai School of Medicine
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.bdgenomics.adam.cli

import java.io.{ FileOutputStream, File }
import org.apache.commons.io.IOUtils
import org.bdgenomics.adam.models.SequenceDictionary
import net.sf.samtools.SAMFileReader

trait DictionaryCommand {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this trait. Good idea!

private def getDictionaryFile(name: String): Option[File] = {
val stream = ClassLoader.getSystemClassLoader.getResourceAsStream("dictionaries/" + name)
if (stream == null)
return None
val file = File.createTempFile(name, ".dict")
file.deleteOnExit()
IOUtils.copy(stream, new FileOutputStream(file))
Some(file)
}

private def getDictionary(file: File) = Some(SequenceDictionary(SAMFileReader.getSequenceDictionary(file)))

def loadSequenceDictionary(file: File): Option[SequenceDictionary] = {
if (file != null) {
if (file.exists)
getDictionary(file)
else getDictionaryFile(file.getName) match {
case Some(file) => getDictionary(file)
case _ => None
}
} else None
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class ListDict(protected val args: ListDictArgs) extends ADAMSparkCommand[ListDi

val dict = sc.adamDictionaryLoad[ADAMRecord](args.inputPath)

dict.recordsIn.sortBy(_.name.toString).foreach {
dict.records.foreach {
rec: SequenceRecord =>
println("%s\t%d".format(rec.name, rec.length))
}
Expand Down
35 changes: 23 additions & 12 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@

package org.bdgenomics.adam.cli

import org.bdgenomics.adam.models.ADAMVariantContext
import org.bdgenomics.adam.models.{ SequenceDictionary, ADAMVariantContext }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.variation.ADAMVariationContext._
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.{ Logging, SparkContext }
import org.apache.spark.rdd.RDD
import org.kohsuke.args4j.Argument
import org.kohsuke.args4j.{ Option => Args4jOption, Argument }
import java.io.File

object Vcf2ADAM extends ADAMCommandCompanion {
val commandName = "vcf2adam"
Expand All @@ -34,21 +35,31 @@ object Vcf2ADAM extends ADAMCommandCompanion {
}

class Vcf2ADAMArgs extends Args4jBase with ParquetArgs with SparkArgs {
@Argument(required = true, metaVar = "VCF",
usage = "The VCF file to convert", index = 0)
var vcfFile: String = _
@Argument(required = true, metaVar = "ADAM",
usage = "Location to write ADAM Variant data", index = 1)
@Args4jOption(required = false, name = "-dict", usage = "Reference dictionary")
var dictionaryFile: File = _

@Argument(required = true, metaVar = "VCF", usage = "The VCF file to convert", index = 0)
var vcfPath: String = _

@Argument(required = true, metaVar = "ADAM", usage = "Location to write ADAM Variant data", index = 1)
var outputPath: String = null
}

class Vcf2ADAM(val args: Vcf2ADAMArgs) extends ADAMSparkCommand[Vcf2ADAMArgs] {
class Vcf2ADAM(val args: Vcf2ADAMArgs) extends ADAMSparkCommand[Vcf2ADAMArgs] with DictionaryCommand with Logging {
val companion = Vcf2ADAM

def run(sc: SparkContext, job: Job) {
var adamVariants: RDD[ADAMVariantContext] = sc.adamVCFLoad(args.vcfFile)
adamVariants.flatMap(p => p.genotypes).adamSave(args.outputPath,
blockSize = args.blockSize, pageSize = args.pageSize,

var dictionary: Option[SequenceDictionary] = loadSequenceDictionary(args.dictionaryFile)
if (dictionary.isDefined)
log.info("Using contig translation")

var adamVariants: RDD[ADAMVariantContext] = sc.adamVCFLoad(args.vcfPath, dict = dictionary)

adamVariants.flatMap(p => p.genotypes).adamSave(
args.outputPath,
blockSize = args.blockSize,
pageSize = args.pageSize,
compressCodec = args.compressionCodec,
disableDictionaryEncoding = args.disableDictionary)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ package org.bdgenomics.adam.converters

import net.sf.samtools.{ SAMReadGroupRecord, SAMRecord }

import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord }
import org.bdgenomics.adam.avro.ADAMRecord
import scala.collection.JavaConverters._
import org.bdgenomics.adam.models.{ Attribute, RecordGroupDictionary, SequenceDictionary }
import org.bdgenomics.adam.models.{ SequenceRecord, Attribute, RecordGroupDictionary, SequenceDictionary }
import org.bdgenomics.adam.util.AttributeUtils

class SAMRecordConverter extends Serializable {
Expand All @@ -35,11 +35,7 @@ class SAMRecordConverter extends Serializable {
// This prevents looking up a -1 in the sequence dictionary
val readReference: Int = samRecord.getReferenceIndex
if (readReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
builder
.setContig(ADAMContig.newBuilder
.setContigName(samRecord.getReferenceName)
.setContigLength(dict(samRecord.getReferenceName).length)
.setReferenceURL(dict(samRecord.getReferenceName).url).build)
builder.setContig(SequenceRecord.toADAMContig(dict(samRecord.getReferenceName).get))

val start: Int = samRecord.getAlignmentStart
if (start != 0) {
Expand All @@ -57,11 +53,7 @@ class SAMRecordConverter extends Serializable {
val mateReference: Int = samRecord.getMateReferenceIndex

if (mateReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
builder
.setMateContig(ADAMContig.newBuilder
.setContigName(samRecord.getMateReferenceName)
.setContigLength(dict(samRecord.getMateReferenceName).length)
.setReferenceURL(dict(samRecord.getMateReferenceName).url).build)
builder.setMateContig(SequenceRecord.toADAMContig(dict(samRecord.getMateReferenceName).get))

val mateStart = samRecord.getMateAlignmentStart
if (mateStart > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

package org.bdgenomics.adam.converters

import org.broadinstitute.variant.variantcontext.VariantContext
import org.broadinstitute.variant.variantcontext.{ Genotype, VariantContext }
import org.apache.avro.Schema
import org.apache.avro.specific.SpecificRecord
import org.broadinstitute.variant.vcf._
Expand Down Expand Up @@ -101,7 +101,9 @@ object VariantAnnotationConverter extends Serializable {
AttrKey("gtFilters", VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)),
AttrKey("genotypeLikelihoods", VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_PL_KEY)),
AttrKey("phaseQuality", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")),
AttrKey("phaseSetId", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phase set")))
AttrKey("phaseSetId", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phase set")),
AttrKey("minReadDepth", attrAsInt _, new VCFFormatHeaderLine("MIN_DP", 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")),
AttrKey("strandBiasComponents", attrAsInt _, new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")))

lazy val infoHeaderLines: Seq[VCFCompoundHeaderLine] = INFO_KEYS.map(_.hdrLine)
lazy val formatHeaderLines: Seq[VCFCompoundHeaderLine] = FORMAT_KEYS.map(_.hdrLine)
Expand Down Expand Up @@ -143,6 +145,17 @@ object VariantAnnotationConverter extends Serializable {
fillRecord(VCF2VariantCallingAnnotations, vc, call)
}

def convert(g: Genotype, genotype: ADAMGenotype): ADAMGenotype = {
for ((v, a) <- VariantAnnotationConverter.VCF2GenotypeAnnotations) {
// Add extended attributes if present
val attr = g.getExtendedAttribute(v)
if (attr != null && attr != VCFConstants.MISSING_VALUE_v4) {
genotype.put(a._1, a._2(attr))
}
}
genotype
}

def mergeAnnotations(leftRecord: ADAMDatabaseVariantAnnotation, rightRecord: ADAMDatabaseVariantAnnotation): ADAMDatabaseVariantAnnotation = {
val mergedAnnotation = ADAMDatabaseVariantAnnotation.newBuilder(leftRecord).build()
val numFields = ADAMDatabaseVariantAnnotation.getClassSchema.getFields.size
Expand Down
Loading