Skip to content

Commit

Permalink
Merge pull request #243 from massie/pr238
Browse files Browse the repository at this point in the history
Rebase PR#238 onto master
  • Loading branch information
massie committed May 14, 2014
2 parents b0b0a06 + 1bf03f6 commit 0b2cf5d
Show file tree
Hide file tree
Showing 36 changed files with 640 additions and 818 deletions.
17 changes: 14 additions & 3 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ package org.bdgenomics.adam.cli
import org.bdgenomics.adam.avro.ADAMGenotype
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.variation.ADAMVariationContext._
import org.kohsuke.args4j.Argument
import org.kohsuke.args4j.{ Option => Args4jOption, Argument }
import org.apache.spark.rdd.RDD
import org.apache.spark.{ Logging, SparkContext }
import org.apache.hadoop.mapreduce.Job
import java.io.File
import org.bdgenomics.adam.models.SequenceDictionary
import scala.Option

object ADAM2Vcf extends ADAMCommandCompanion {

Expand All @@ -35,17 +38,25 @@ object ADAM2Vcf extends ADAMCommandCompanion {
}

class ADAM2VcfArgs extends Args4jBase with ParquetArgs with SparkArgs {
@Args4jOption(required = false, name = "-dict", usage = "Reference dictionary")
var dictionaryFile: File = _

@Argument(required = true, metaVar = "ADAM", usage = "The ADAM variant files to convert", index = 0)
var adamFile: String = _

@Argument(required = true, metaVar = "VCF", usage = "Location to write VCF data", index = 1)
var outputPath: String = null
}

class ADAM2Vcf(val args: ADAM2VcfArgs) extends ADAMSparkCommand[ADAM2VcfArgs] with Logging {
class ADAM2Vcf(val args: ADAM2VcfArgs) extends ADAMSparkCommand[ADAM2VcfArgs] with DictionaryCommand with Logging {
val companion = ADAM2Vcf

def run(sc: SparkContext, job: Job) {
var dictionary: Option[SequenceDictionary] = loadSequenceDictionary(args.dictionaryFile)
if (dictionary.isDefined)
log.info("Using contig translation")

val adamGTs: RDD[ADAMGenotype] = sc.adamLoad(args.adamFile)
sc.adamVCFSave(args.outputPath, adamGTs.toADAMVariantContext)
sc.adamVCFSave(args.outputPath, adamGTs.toADAMVariantContext, dict = dictionary)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ class Bam2ADAM(args: Bam2ADAMArgs) extends ADAMCommand {
val samReader = new SAMFileReader(new File(args.bamFile), null, true)
samReader.setValidationStringency(args.validationStringency)

val seqDict = SequenceDictionary.fromSAMReader(samReader)
val seqDict = SequenceDictionary(samReader)
val rgDict = RecordGroupDictionary.fromSAMReader(samReader)

println(seqDict)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2014. Mount Sinai School of Medicine
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.bdgenomics.adam.cli

import java.io.{ FileOutputStream, File }
import org.apache.commons.io.IOUtils
import org.bdgenomics.adam.models.SequenceDictionary
import net.sf.samtools.SAMFileReader

trait DictionaryCommand {
private def getDictionaryFile(name: String): Option[File] = {
val stream = ClassLoader.getSystemClassLoader.getResourceAsStream("dictionaries/" + name)
if (stream == null)
return None
val file = File.createTempFile(name, ".dict")
file.deleteOnExit()
IOUtils.copy(stream, new FileOutputStream(file))
Some(file)
}

private def getDictionary(file: File) = Some(SequenceDictionary(SAMFileReader.getSequenceDictionary(file)))

def loadSequenceDictionary(file: File): Option[SequenceDictionary] = {
if (file != null) {
if (file.exists)
getDictionary(file)
else getDictionaryFile(file.getName) match {
case Some(file) => getDictionary(file)
case _ => None
}
} else None
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class ListDict(protected val args: ListDictArgs) extends ADAMSparkCommand[ListDi

val dict = sc.adamDictionaryLoad[ADAMRecord](args.inputPath)

dict.recordsIn.sortBy(_.name.toString).foreach {
dict.records.foreach {
rec: SequenceRecord =>
println("%s\t%d".format(rec.name, rec.length))
}
Expand Down
35 changes: 23 additions & 12 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@

package org.bdgenomics.adam.cli

import org.bdgenomics.adam.models.ADAMVariantContext
import org.bdgenomics.adam.models.{ SequenceDictionary, ADAMVariantContext }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.variation.ADAMVariationContext._
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.{ Logging, SparkContext }
import org.apache.spark.rdd.RDD
import org.kohsuke.args4j.Argument
import org.kohsuke.args4j.{ Option => Args4jOption, Argument }
import java.io.File

object Vcf2ADAM extends ADAMCommandCompanion {
val commandName = "vcf2adam"
Expand All @@ -34,21 +35,31 @@ object Vcf2ADAM extends ADAMCommandCompanion {
}

class Vcf2ADAMArgs extends Args4jBase with ParquetArgs with SparkArgs {
@Argument(required = true, metaVar = "VCF",
usage = "The VCF file to convert", index = 0)
var vcfFile: String = _
@Argument(required = true, metaVar = "ADAM",
usage = "Location to write ADAM Variant data", index = 1)
@Args4jOption(required = false, name = "-dict", usage = "Reference dictionary")
var dictionaryFile: File = _

@Argument(required = true, metaVar = "VCF", usage = "The VCF file to convert", index = 0)
var vcfPath: String = _

@Argument(required = true, metaVar = "ADAM", usage = "Location to write ADAM Variant data", index = 1)
var outputPath: String = null
}

class Vcf2ADAM(val args: Vcf2ADAMArgs) extends ADAMSparkCommand[Vcf2ADAMArgs] {
class Vcf2ADAM(val args: Vcf2ADAMArgs) extends ADAMSparkCommand[Vcf2ADAMArgs] with DictionaryCommand with Logging {
val companion = Vcf2ADAM

def run(sc: SparkContext, job: Job) {
var adamVariants: RDD[ADAMVariantContext] = sc.adamVCFLoad(args.vcfFile)
adamVariants.flatMap(p => p.genotypes).adamSave(args.outputPath,
blockSize = args.blockSize, pageSize = args.pageSize,

var dictionary: Option[SequenceDictionary] = loadSequenceDictionary(args.dictionaryFile)
if (dictionary.isDefined)
log.info("Using contig translation")

var adamVariants: RDD[ADAMVariantContext] = sc.adamVCFLoad(args.vcfPath, dict = dictionary)

adamVariants.flatMap(p => p.genotypes).adamSave(
args.outputPath,
blockSize = args.blockSize,
pageSize = args.pageSize,
compressCodec = args.compressionCodec,
disableDictionaryEncoding = args.disableDictionary)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ package org.bdgenomics.adam.converters

import net.sf.samtools.{ SAMReadGroupRecord, SAMRecord }

import org.bdgenomics.adam.avro.{ ADAMContig, ADAMRecord }
import org.bdgenomics.adam.avro.ADAMRecord
import scala.collection.JavaConverters._
import org.bdgenomics.adam.models.{ Attribute, RecordGroupDictionary, SequenceDictionary }
import org.bdgenomics.adam.models.{ SequenceRecord, Attribute, RecordGroupDictionary, SequenceDictionary }
import org.bdgenomics.adam.util.AttributeUtils

class SAMRecordConverter extends Serializable {
Expand All @@ -35,11 +35,7 @@ class SAMRecordConverter extends Serializable {
// This prevents looking up a -1 in the sequence dictionary
val readReference: Int = samRecord.getReferenceIndex
if (readReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
builder
.setContig(ADAMContig.newBuilder
.setContigName(samRecord.getReferenceName)
.setContigLength(dict(samRecord.getReferenceName).length)
.setReferenceURL(dict(samRecord.getReferenceName).url).build)
builder.setContig(SequenceRecord.toADAMContig(dict(samRecord.getReferenceName).get))

val start: Int = samRecord.getAlignmentStart
if (start != 0) {
Expand All @@ -57,11 +53,7 @@ class SAMRecordConverter extends Serializable {
val mateReference: Int = samRecord.getMateReferenceIndex

if (mateReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
builder
.setMateContig(ADAMContig.newBuilder
.setContigName(samRecord.getMateReferenceName)
.setContigLength(dict(samRecord.getMateReferenceName).length)
.setReferenceURL(dict(samRecord.getMateReferenceName).url).build)
builder.setMateContig(SequenceRecord.toADAMContig(dict(samRecord.getMateReferenceName).get))

val mateStart = samRecord.getMateAlignmentStart
if (mateStart > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

package org.bdgenomics.adam.converters

import org.broadinstitute.variant.variantcontext.VariantContext
import org.broadinstitute.variant.variantcontext.{ Genotype, VariantContext }
import org.apache.avro.Schema
import org.apache.avro.specific.SpecificRecord
import org.broadinstitute.variant.vcf._
Expand Down Expand Up @@ -101,7 +101,9 @@ object VariantAnnotationConverter extends Serializable {
AttrKey("gtFilters", VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)),
AttrKey("genotypeLikelihoods", VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_PL_KEY)),
AttrKey("phaseQuality", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")),
AttrKey("phaseSetId", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phase set")))
AttrKey("phaseSetId", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phase set")),
AttrKey("minReadDepth", attrAsInt _, new VCFFormatHeaderLine("MIN_DP", 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")),
AttrKey("strandBiasComponents", attrAsInt _, new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")))

lazy val infoHeaderLines: Seq[VCFCompoundHeaderLine] = INFO_KEYS.map(_.hdrLine)
lazy val formatHeaderLines: Seq[VCFCompoundHeaderLine] = FORMAT_KEYS.map(_.hdrLine)
Expand Down Expand Up @@ -143,6 +145,17 @@ object VariantAnnotationConverter extends Serializable {
fillRecord(VCF2VariantCallingAnnotations, vc, call)
}

def convert(g: Genotype, genotype: ADAMGenotype): ADAMGenotype = {
for ((v, a) <- VariantAnnotationConverter.VCF2GenotypeAnnotations) {
// Add extended attributes if present
val attr = g.getExtendedAttribute(v)
if (attr != null && attr != VCFConstants.MISSING_VALUE_v4) {
genotype.put(a._1, a._2(attr))
}
}
genotype
}

def mergeAnnotations(leftRecord: ADAMDatabaseVariantAnnotation, rightRecord: ADAMDatabaseVariantAnnotation): ADAMDatabaseVariantAnnotation = {
val mergedAnnotation = ADAMDatabaseVariantAnnotation.newBuilder(leftRecord).build()
val numFields = ADAMDatabaseVariantAnnotation.getClassSchema.getFields.size
Expand Down
Loading

0 comments on commit 0b2cf5d

Please sign in to comment.