From 8f5d44d687caa49405af1e8f948cc5f599678d13 Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Wed, 24 Mar 2021 12:49:42 -0500 Subject: [PATCH] Provide access to header to model conversions. --- .../adam/converters/AlignmentConverter.scala | 260 +++++++++++++++++- .../adam/converters/SAMRecordConverter.scala | 220 --------------- .../converters/VariantContextConverter.scala | 91 +++--- .../org/bdgenomics/adam/ds/ADAMContext.scala | 8 +- .../adam/ds/read/AnySAMOutFormatter.scala | 4 +- .../adam/models/ReadGroupDictionary.scala | 4 + .../converters/AlignmentConverterSuite.scala | 108 +++++++- .../converters/SAMRecordConverterSuite.scala | 129 --------- 8 files changed, 426 insertions(+), 398 deletions(-) delete mode 100644 adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala delete mode 100644 adam-core/src/test/scala/org/bdgenomics/adam/converters/SAMRecordConverterSuite.scala diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentConverter.scala index 56483a218f..9c51f78a88 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentConverter.scala @@ -17,16 +17,218 @@ */ package org.bdgenomics.adam.converters -import htsjdk.samtools.{ SAMFileHeader, SAMRecord } -import org.bdgenomics.adam.models._ +import grizzled.slf4j.Logging +import htsjdk.samtools.{ + SAMFileHeader, + SAMProgramRecord, + SAMReadGroupRecord, + SAMRecord, + SAMUtils +} +import org.bdgenomics.adam.models.{ + Alphabet, + Attribute, + ReadGroupDictionary, + SequenceDictionary +} import org.bdgenomics.adam.rich.RichAlignment -import org.bdgenomics.formats.avro.{ Alignment, Fragment } -import scala.collection.JavaConversions._ +import org.bdgenomics.adam.util.AttributeUtils +import org.bdgenomics.formats.avro.{ + Alignment, + Fragment, + ProcessingStep +} +import scala.collection.JavaConverters._ /** - * This class contains methods to convert Alignments to other formats. + * Conversions between Alignments and other formats. */ -class AlignmentConverter extends Serializable { +class AlignmentConverter extends Serializable with Logging { + + /** + * Returns true if a tag should not be kept in the attributes field. + * + * The SAM/BAM format supports attributes, which is a key/value pair map. In + * ADAM, we have promoted some of these fields to "primary" fields, so that we + * can more efficiently access them. These include the MD tag, which describes + * substitutions against the reference; the OQ tag, which describes the + * original read base quality scores; and the OP and OC tags, which describe the + * original read alignment position and CIGAR. + * + * @param attrTag Tag name to check. + * @return Returns true if the tag should be skipped. + */ + private[converters] def skipTag(attrTag: String): Boolean = attrTag match { + case "OQ" => true + case "OP" => true + case "OC" => true + case "MD" => true + case _ => false + } + + /** + * Converts a SAM record into an Avro Alignment. + * + * @param samRecord Record to convert. + * @return Returns the original record converted into Avro. + */ + def convert(samRecord: SAMRecord): Alignment = { + try { + val cigar: String = samRecord.getCigarString + val startTrim = if (cigar == "*") { + 0 + } else { + val count = cigar.takeWhile(_.isDigit).toInt + val operator = cigar.dropWhile(_.isDigit).head + + if (operator == 'H') { + count + } else { + 0 + } + } + val endTrim = if (cigar.endsWith("H")) { + // must reverse string as takeWhile is not implemented in reverse direction + cigar.dropRight(1).reverse.takeWhile(_.isDigit).reverse.toInt + } else { + 0 + } + + val builder: Alignment.Builder = Alignment.newBuilder + .setReadName(samRecord.getReadName) + .setSequence(samRecord.getReadString) + .setCigar(cigar) + .setBasesTrimmedFromStart(startTrim) + .setBasesTrimmedFromEnd(endTrim) + .setOriginalQualityScores(SAMUtils.phredToFastq(samRecord.getOriginalBaseQualities)) + + // if the quality scores string is "*", then we null it in the record + // or, in other words, we only set the quality scores if it is not "*" + val qualityScores = samRecord.getBaseQualityString + if (qualityScores != "*") { + builder.setQualityScores(qualityScores) + } + + // Only set the reference information if the read is aligned, matching the mate reference + // This prevents looking up a -1 in the sequence dictionary + val readReference: Int = samRecord.getReferenceIndex + if (readReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + builder.setReferenceName(samRecord.getReferenceName) + + // set read alignment flag + val start: Int = samRecord.getAlignmentStart + assert(start != 0, "Start cannot equal 0 if reference is set.") + builder.setStart(start - 1L) + + // set OP and OC flags, if applicable + if (samRecord.getAttribute("OP") != null) { + builder.setOriginalStart(samRecord.getIntegerAttribute("OP").toLong - 1) + builder.setOriginalCigar(samRecord.getStringAttribute("OC")) + } + + val end = start.toLong - 1 + samRecord.getCigar.getReferenceLength + builder.setEnd(end) + // set mapping quality + val mapq: Int = samRecord.getMappingQuality + + if (mapq != SAMRecord.UNKNOWN_MAPPING_QUALITY) { + builder.setMappingQuality(mapq) + } + + } + + // set mapping flags + // oddly enough, it appears that reads can show up with mapping + // info (mapq, cigar, position) + // even if the read unmapped flag is set... + + // While the meaning of the ReadMapped, ReadNegativeStand, + // PrimaryAlignmentFlag and SupplementaryAlignmentFlag + // are unclear when the read is not mapped or reference is not defined, + // it is nonetheless favorable to set these flags in the ADAM file + // in same way as they appear in the input BAM inorder to match exactly + // the statistics output by other programs, specifically Samtools Flagstat + + builder.setReadMapped(!samRecord.getReadUnmappedFlag) + builder.setReadNegativeStrand(samRecord.getReadNegativeStrandFlag) + builder.setPrimaryAlignment(!samRecord.getNotPrimaryAlignmentFlag) + builder.setSupplementaryAlignment(samRecord.getSupplementaryAlignmentFlag) + + // Position of the mate/next segment + val mateReference: Int = samRecord.getMateReferenceIndex + + if (mateReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + builder.setMateReferenceName(samRecord.getMateReferenceName) + + val mateStart = samRecord.getMateAlignmentStart + if (mateStart > 0) { + // We subtract one here to be 0-based offset + builder.setMateAlignmentStart(mateStart - 1L) + } + } + + // The Avro scheme defines all flags as defaulting to 'false'. We only + // need to set the flags that are true. + if (samRecord.getFlags != 0) { + if (samRecord.getReadPairedFlag) { + builder.setReadPaired(true) + if (samRecord.getMateNegativeStrandFlag) { + builder.setMateNegativeStrand(true) + } + if (!samRecord.getMateUnmappedFlag) { + builder.setMateMapped(true) + } + if (samRecord.getProperPairFlag) { + builder.setProperPair(true) + } + if (samRecord.getFirstOfPairFlag) { + builder.setReadInFragment(0) + } + if (samRecord.getSecondOfPairFlag) { + builder.setReadInFragment(1) + } + } + if (samRecord.getDuplicateReadFlag) { + builder.setDuplicateRead(true) + } + if (samRecord.getReadFailsVendorQualityCheckFlag) { + builder.setFailedVendorQualityChecks(true) + } + } + + var tags = List[Attribute]() + val tlen = samRecord.getInferredInsertSize + if (tlen != 0) { + builder.setInsertSize(tlen.toLong) + } + if (samRecord.getAttributes != null) { + samRecord.getAttributes.asScala.foreach { + attr => + if (attr.tag == "MD") { + builder.setMismatchingPositions(attr.value.toString) + } else if (!skipTag(attr.tag)) { + tags ::= AttributeUtils.convertSAMTagAndValue(attr) + } + } + } + if (tags.nonEmpty) { + builder.setAttributes(tags.mkString("\t")) + } + + val recordGroup: SAMReadGroupRecord = samRecord.getReadGroup + if (recordGroup != null) { + builder.setReadGroupId(recordGroup.getReadGroupId) + .setReadGroupSampleId(recordGroup.getSample) + } + + builder.build + } catch { + case t: Throwable => { + error("Conversion of read: " + samRecord + " failed.") + throw t + } + } + } /** * Prepare a single record for conversion to FASTQ and similar formats by @@ -329,7 +531,7 @@ class AlignmentConverter extends Serializable { .foreach(builder.setAttribute("MD", _)) Option(adamRecord.getOriginalQualityScores) .map(s => s.getBytes.map(v => (v - 33).toByte)) // not ascii, but short int - .foreach(builder.setOriginalBaseQualities(_)) + .foreach(builder.setOriginalBaseQualities) Option(adamRecord.getOriginalCigar) .foreach(builder.setAttribute("OC", _)) Option(adamRecord.getOriginalStart) @@ -373,7 +575,7 @@ class AlignmentConverter extends Serializable { * Alignment per sequence in the Fragment. */ def convertFragment(fragment: Fragment): Iterable[Alignment] = { - asScalaBuffer(fragment.getAlignments).toIterable + asScalaBuffer(fragment.getAlignments) } } @@ -383,7 +585,47 @@ class AlignmentConverter extends Serializable { * Singleton object exists due to cross reference from * org.bdgenomics.adam.rdd.read.AlignmentDatasetFunctions. */ -private[adam] object AlignmentConverter extends Serializable { +object AlignmentConverter extends Serializable { + + /** + * Return the references in the specified SAM file header. + * + * @param header SAM file header + * @return the references in the specified SAM file header + */ + def references(header: SAMFileHeader): SequenceDictionary = { + SequenceDictionary(header) + } + + /** + * Return the read groups in the specified SAM file header. + * + * @param header SAM file header + * @return the read groups in the specified SAM file header + */ + def readGroups(header: SAMFileHeader): ReadGroupDictionary = { + ReadGroupDictionary.fromSAMHeader(header) + } + + /** + * Return the processing steps in the specified SAM file header. + * + * @param header SAM file header + * @return the processing steps in the specified SAM file header + */ + def processingSteps(header: SAMFileHeader): Seq[ProcessingStep] = { + val programRecords = header.getProgramRecords().asScala + programRecords.map(convertSAMProgramRecord) + } + + private def convertSAMProgramRecord(record: SAMProgramRecord): ProcessingStep = { + val builder = ProcessingStep.newBuilder.setId(record.getId) + Option(record.getPreviousProgramGroupId).foreach(builder.setPreviousId) + Option(record.getProgramVersion).foreach(builder.setVersion) + Option(record.getProgramName).foreach(builder.setProgramName) + Option(record.getCommandLine).foreach(builder.setCommandLine) + builder.build + } /** * Checks to see if a read name has a index suffix. diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala deleted file mode 100644 index faf997b019..0000000000 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala +++ /dev/null @@ -1,220 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.converters - -import grizzled.slf4j.Logging -import htsjdk.samtools.{ - SAMReadGroupRecord, - SAMRecord, - SAMUtils -} -import org.bdgenomics.adam.models.Attribute -import org.bdgenomics.adam.util.AttributeUtils -import org.bdgenomics.formats.avro.Alignment -import scala.collection.JavaConverters._ - -/** - * Helper class for converting SAMRecords into Alignments. - */ -private[adam] class SAMRecordConverter extends Serializable with Logging { - - /** - * Returns true if a tag should not be kept in the attributes field. - * - * The SAM/BAM format supports attributes, which is a key/value pair map. In - * ADAM, we have promoted some of these fields to "primary" fields, so that we - * can more efficiently access them. These include the MD tag, which describes - * substitutions against the reference; the OQ tag, which describes the - * original read base quality scores; and the OP and OC tags, which describe the - * original read alignment position and CIGAR. - * - * @param attrTag Tag name to check. - * @return Returns true if the tag should be skipped. - */ - private[converters] def skipTag(attrTag: String): Boolean = attrTag match { - case "OQ" => true - case "OP" => true - case "OC" => true - case "MD" => true - case _ => false - } - - /** - * Converts a SAM record into an Avro Alignment. - * - * @param samRecord Record to convert. - * @return Returns the original record converted into Avro. - */ - def convert(samRecord: SAMRecord): Alignment = { - try { - val cigar: String = samRecord.getCigarString - val startTrim = if (cigar == "*") { - 0 - } else { - val count = cigar.takeWhile(_.isDigit).toInt - val operator = cigar.dropWhile(_.isDigit).head - - if (operator == 'H') { - count - } else { - 0 - } - } - val endTrim = if (cigar.endsWith("H")) { - // must reverse string as takeWhile is not implemented in reverse direction - cigar.dropRight(1).reverse.takeWhile(_.isDigit).reverse.toInt - } else { - 0 - } - - val builder: Alignment.Builder = Alignment.newBuilder - .setReadName(samRecord.getReadName) - .setSequence(samRecord.getReadString) - .setCigar(cigar) - .setBasesTrimmedFromStart(startTrim) - .setBasesTrimmedFromEnd(endTrim) - .setOriginalQualityScores(SAMUtils.phredToFastq(samRecord.getOriginalBaseQualities)) - - // if the quality scores string is "*", then we null it in the record - // or, in other words, we only set the quality scores if it is not "*" - val qualityScores = samRecord.getBaseQualityString - if (qualityScores != "*") { - builder.setQualityScores(qualityScores) - } - - // Only set the reference information if the read is aligned, matching the mate reference - // This prevents looking up a -1 in the sequence dictionary - val readReference: Int = samRecord.getReferenceIndex - if (readReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - builder.setReferenceName(samRecord.getReferenceName) - - // set read alignment flag - val start: Int = samRecord.getAlignmentStart - assert(start != 0, "Start cannot equal 0 if reference is set.") - builder.setStart(start - 1L) - - // set OP and OC flags, if applicable - if (samRecord.getAttribute("OP") != null) { - builder.setOriginalStart(samRecord.getIntegerAttribute("OP").toLong - 1) - builder.setOriginalCigar(samRecord.getStringAttribute("OC")) - } - - val end = start.toLong - 1 + samRecord.getCigar.getReferenceLength - builder.setEnd(end) - // set mapping quality - val mapq: Int = samRecord.getMappingQuality - - if (mapq != SAMRecord.UNKNOWN_MAPPING_QUALITY) { - builder.setMappingQuality(mapq) - } - - } - - // set mapping flags - // oddly enough, it appears that reads can show up with mapping - // info (mapq, cigar, position) - // even if the read unmapped flag is set... - - // While the meaning of the ReadMapped, ReadNegativeStand, - // PrimaryAlignmentFlag and SupplementaryAlignmentFlag - // are unclear when the read is not mapped or reference is not defined, - // it is nonetheless favorable to set these flags in the ADAM file - // in same way as they appear in the input BAM inorder to match exactly - // the statistics output by other programs, specifically Samtools Flagstat - - builder.setReadMapped(!samRecord.getReadUnmappedFlag) - builder.setReadNegativeStrand(samRecord.getReadNegativeStrandFlag) - builder.setPrimaryAlignment(!samRecord.getNotPrimaryAlignmentFlag) - builder.setSupplementaryAlignment(samRecord.getSupplementaryAlignmentFlag) - - // Position of the mate/next segment - val mateReference: Int = samRecord.getMateReferenceIndex - - if (mateReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - builder.setMateReferenceName(samRecord.getMateReferenceName) - - val mateStart = samRecord.getMateAlignmentStart - if (mateStart > 0) { - // We subtract one here to be 0-based offset - builder.setMateAlignmentStart(mateStart - 1L) - } - } - - // The Avro scheme defines all flags as defaulting to 'false'. We only - // need to set the flags that are true. - if (samRecord.getFlags != 0) { - if (samRecord.getReadPairedFlag) { - builder.setReadPaired(true) - if (samRecord.getMateNegativeStrandFlag) { - builder.setMateNegativeStrand(true) - } - if (!samRecord.getMateUnmappedFlag) { - builder.setMateMapped(true) - } - if (samRecord.getProperPairFlag) { - builder.setProperPair(true) - } - if (samRecord.getFirstOfPairFlag) { - builder.setReadInFragment(0) - } - if (samRecord.getSecondOfPairFlag) { - builder.setReadInFragment(1) - } - } - if (samRecord.getDuplicateReadFlag) { - builder.setDuplicateRead(true) - } - if (samRecord.getReadFailsVendorQualityCheckFlag) { - builder.setFailedVendorQualityChecks(true) - } - } - - var tags = List[Attribute]() - val tlen = samRecord.getInferredInsertSize - if (tlen != 0) { - builder.setInsertSize(tlen.toLong) - } - if (samRecord.getAttributes != null) { - samRecord.getAttributes.asScala.foreach { - attr => - if (attr.tag == "MD") { - builder.setMismatchingPositions(attr.value.toString) - } else if (!skipTag(attr.tag)) { - tags ::= AttributeUtils.convertSAMTagAndValue(attr) - } - } - } - if (tags.nonEmpty) { - builder.setAttributes(tags.mkString("\t")) - } - - val recordGroup: SAMReadGroupRecord = samRecord.getReadGroup - if (recordGroup != null) { - builder.setReadGroupId(recordGroup.getReadGroupId) - .setReadGroupSampleId(recordGroup.getSample) - } - - builder.build - } catch { - case t: Throwable => { - error("Conversion of read: " + samRecord + " failed.") - throw t - } - } - } -} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala index 1eb6c0c551..fa3eb6f9fd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala @@ -17,18 +17,16 @@ */ package org.bdgenomics.adam.converters -import com.google.common.base.Splitter import com.google.common.collect.ImmutableList import grizzled.slf4j.Logging import htsjdk.samtools.ValidationStringency import htsjdk.variant.variantcontext.{ Allele, - Genotype => HtsjdkGenotype, GenotypeBuilder, - GenotypesContext, GenotypeLikelihoods, - VariantContext => HtsjdkVariantContext, - VariantContextBuilder + VariantContextBuilder, + Genotype => HtsjdkGenotype, + VariantContext => HtsjdkVariantContext } import htsjdk.variant.vcf.{ VCFCompoundHeaderLine, @@ -42,7 +40,6 @@ import htsjdk.variant.vcf.{ } import java.util.Collections import org.apache.hadoop.conf.Configuration -import org.bdgenomics.utils.misc.MathUtils import org.bdgenomics.adam.models.{ SequenceDictionary, VariantContext => ADAMVariantContext @@ -50,9 +47,10 @@ import org.bdgenomics.adam.models.{ import org.bdgenomics.adam.util.PhredUtils import org.bdgenomics.formats.avro._ import org.slf4j.Logger + import scala.annotation.tailrec import scala.collection.JavaConversions._ -import scala.collection.mutable.{ Buffer, HashMap } +import scala.collection.mutable /** * Object for converting between htsjdk and ADAM VariantContexts. @@ -189,7 +187,7 @@ object VariantContextConverter { } } - private[adam] def cleanAndMixInSupportedLines( + def cleanAndMixInSupportedLines( headerLines: Seq[VCFHeaderLine], stringency: ValidationStringency, log: Logger): Seq[VCFHeaderLine] = { @@ -264,11 +262,42 @@ object VariantContextConverter { }) ++ DefaultHeaderLines.allHeaderLines } - private[adam] def headerLines(header: VCFHeader): Seq[VCFHeaderLine] = { + /** + * Return the header lines in the specified VCF file header. + * + * @param header VCF file header + * @return the header lines in the specified VCF file header + */ + def headerLines(header: VCFHeader): Seq[VCFHeaderLine] = { (header.getFilterLines ++ header.getFormatHeaderLines ++ header.getInfoHeaderLines ++ - header.getOtherHeaderLines).toSeq + header.getOtherHeaderLines) + } + + /** + * Return the references in the specified VCF file header. + * + * @param header VCF file header + * @return the references in the specified VCF file header + */ + def references(header: VCFHeader): SequenceDictionary = { + SequenceDictionary.fromVCFHeader(header) + } + + /** + * Return the samples in the specified VCF file header. + * + * @param header VCF file header + * @return the samples in the specified VCF file header + */ + def samples(header: VCFHeader): Seq[Sample] = { + asScalaBuffer(header.getGenotypeSamples) + .map(s => { + Sample.newBuilder() + .setId(s) + .build() + }) } def apply(headerLines: Seq[VCFHeaderLine], @@ -296,7 +325,7 @@ class VariantContextConverter( setNestedAnnotationInGenotype: Boolean) extends Serializable with Logging { import VariantContextConverter._ - // format fns gatk --> bdg, extract fns bdg --> gatk + // format fns htsjdk --> bdg, extract fns bdg --> htsjdk private val variantFormatFn = makeVariantFormatFn(headerLines, stringency) private val variantExtractFn = makeVariantExtractFn(headerLines) private val genotypeFormatFn = makeGenotypeFormatFn(headerLines) @@ -328,10 +357,10 @@ class VariantContextConverter( } /** - * Converts a GATK variant context into one or more ADAM variant context(s). + * Converts a htsjdk variant context into one or more ADAM variant context(s). * - * @param vc GATK variant context to convert. - * @return The specified GATK variant context converted into one or more ADAM variant context(s) + * @param vc htsjdk variant context to convert. + * @return The specified htsjdk variant context converted into one or more ADAM variant context(s) */ def convert( vc: HtsjdkVariantContext): Seq[ADAMVariantContext] = { @@ -442,7 +471,7 @@ class VariantContextConverter( * from the array of variant names */ private def joinNames(variant: Variant): Option[String] = { - if (variant.getNames != null && variant.getNames.length > 0) { + if (variant.getNames != null && variant.getNames.nonEmpty) { Some(variant.getNames.mkString(VCFConstants.ID_FIELD_SEPARATOR)) } else { None @@ -455,7 +484,7 @@ class VariantContextConverter( vc: HtsjdkVariantContext, vb: Variant.Builder): Variant.Builder = { - splitIds(vc).fold(vb)(vb.setNames(_)) + splitIds(vc).fold(vb)(vb.setNames) } private[converters] def formatQuality( @@ -495,7 +524,7 @@ class VariantContextConverter( v: Variant, vcb: VariantContextBuilder): VariantContextBuilder = { - joinNames(v).fold(vcb.noID())(vcb.id(_)) + joinNames(v).fold(vcb.noID())(vcb.id) } private[converters] def extractQuality( @@ -546,7 +575,7 @@ class VariantContextConverter( index: Int): VariantAnnotation.Builder = { Option(vc.getAttributeAsString("AA", null)) - .fold(vab)(vab.setAncestralAllele(_)) + .fold(vab)(vab.setAncestralAllele) } private[converters] def formatDbSnp( @@ -556,7 +585,7 @@ class VariantContextConverter( index: Int): VariantAnnotation.Builder = { Option(vc.getAttribute("DB").asInstanceOf[java.lang.Boolean]) - .fold(vab)(vab.setDbSnp(_)) + .fold(vab)(vab.setDbSnp) } private[converters] def formatHapMap2( @@ -566,7 +595,7 @@ class VariantContextConverter( index: Int): VariantAnnotation.Builder = { Option(vc.getAttribute("H2").asInstanceOf[java.lang.Boolean]) - .fold(vab)(vab.setHapMap2(_)) + .fold(vab)(vab.setHapMap2) } private[converters] def formatHapMap3( @@ -576,7 +605,7 @@ class VariantContextConverter( index: Int): VariantAnnotation.Builder = { Option(vc.getAttribute("H3").asInstanceOf[java.lang.Boolean]) - .fold(vab)(vab.setHapMap3(_)) + .fold(vab)(vab.setHapMap3) } private[converters] def formatValidated( @@ -586,7 +615,7 @@ class VariantContextConverter( index: Int): VariantAnnotation.Builder = { Option(vc.getAttribute("VALIDATED").asInstanceOf[java.lang.Boolean]) - .fold(vab)(vab.setValidated(_)) + .fold(vab)(vab.setValidated) } private[converters] def formatThousandGenomes( @@ -596,7 +625,7 @@ class VariantContextConverter( index: Int): VariantAnnotation.Builder = { Option(vc.getAttribute("1000G").asInstanceOf[java.lang.Boolean]) - .fold(vab)(vab.setThousandGenomes(_)) + .fold(vab)(vab.setThousandGenomes) } private[converters] def formatSomatic( @@ -607,7 +636,7 @@ class VariantContextConverter( // default somatic to false if unspecified Option(vc.getAttribute("SOMATIC").asInstanceOf[java.lang.Boolean]) - .fold(vab.setSomatic(false))(vab.setSomatic(_)) + .fold(vab.setSomatic(false))(vab.setSomatic) } private[converters] def formatAlleleCount( @@ -864,7 +893,7 @@ class VariantContextConverter( gIndices: Array[Int]): Genotype.Builder = { // AD is an array type field - if (g.hasAD && gIdx < g.getAD.size) { + if (g.hasAD && gIdx < g.getAD.length) { val ad = g.getAD gb.setReferenceReadDepth(ad(0)) .setAlternateReadDepth(ad(gIdx)) @@ -1373,7 +1402,7 @@ class VariantContextConverter( indices: List[Int]): List[T] = { if (indices.isEmpty) { array.toList - } else if (indices.max < array.size) { + } else if (indices.max < array.length) { indices.map(idx => array(idx)) } else { List.empty @@ -1489,7 +1518,7 @@ class VariantContextConverter( try { val l: java.util.List[String] = obj.asInstanceOf[java.util.List[String]] // java.util.List has a conflicing toString. Get implicit to buffer first - val sL: Buffer[String] = l + val sL: mutable.Buffer[String] = l sL.toArray } catch { case cce: ClassCastException => { @@ -1690,7 +1719,7 @@ class VariantContextConverter( variantBuilder.setSplitFromMultiAllelic(true) } - alt.foreach(variantBuilder.setAlternateAllele(_)) + alt.foreach(variantBuilder.setAlternateAllele) // bind the conversion functions and fold val boundFns: Iterable[Variant.Builder => Variant.Builder] = variantFormatFns @@ -2181,7 +2210,7 @@ class VariantContextConverter( def convert(vc: ADAMVariantContext): HtsjdkVariantContext = { val v = vc.variant.variant val hasNonRefAlleles = vc.genotypes - .exists(_.getNonReferenceLikelihoods.length != 0) + .exists(_.getNonReferenceLikelihoods.nonEmpty) val builder = new VariantContextBuilder() .chr(v.getReferenceName) .start(v.getStart + 1) @@ -2366,10 +2395,10 @@ class VariantContextConverter( } /** - * Convert an ADAM variant context into a GATK variant context. + * Convert an ADAM variant context into a htsjdk variant context. * * @param vc ADAM variant context to convert. - * @return The specified ADAM variant context converted into a GATK variant context. + * @return The specified ADAM variant context converted into a htsjdk variant context. */ def convert( vc: ADAMVariantContext): Option[HtsjdkVariantContext] = { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/ds/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/ds/ADAMContext.scala index c9026d77f4..652be3ef49 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/ds/ADAMContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/ds/ADAMContext.scala @@ -2053,9 +2053,9 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log sc.newAPIHadoopFile(pathName, classOf[AnySAMInputFormat], classOf[LongWritable], classOf[SAMRecordWritable], ContextUtil.getConfiguration(job)) } - val samRecordConverter = new SAMRecordConverter + val alignmentConverter = new AlignmentConverter - AlignmentDataset(records.map(p => samRecordConverter.convert(p._2.get)), + AlignmentDataset(records.map(p => alignmentConverter.convert(p._2.get)), seqDict, readGroups, programs) @@ -2167,8 +2167,8 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log classOf[SAMRecordWritable], conf) - val samRecordConverter = new SAMRecordConverter - AlignmentDataset(records.map(p => samRecordConverter.convert(p._2.get)), + val alignmentConverter = new AlignmentConverter + AlignmentDataset(records.map(p => alignmentConverter.convert(p._2.get)), seqDict, readGroups, programs) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/AnySAMOutFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/AnySAMOutFormatter.scala index af5c6d5021..bf61a88f4a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/AnySAMOutFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/ds/read/AnySAMOutFormatter.scala @@ -19,7 +19,7 @@ package org.bdgenomics.adam.ds.read import htsjdk.samtools._ import java.io.InputStream -import org.bdgenomics.adam.converters.SAMRecordConverter +import org.bdgenomics.adam.converters.AlignmentConverter import org.bdgenomics.adam.ds.OutFormatter import org.bdgenomics.formats.avro.Alignment import scala.annotation.tailrec @@ -55,7 +55,7 @@ private case class SAMIteratorConverter(val reader: SamReader) extends Iterator[ val iter = reader.iterator() // make converter and empty dicts - val converter = new SAMRecordConverter + val converter = new AlignmentConverter def hasNext: Boolean = { iter.hasNext diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadGroupDictionary.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadGroupDictionary.scala index e8cf2f620d..cb3adc890b 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadGroupDictionary.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadGroupDictionary.scala @@ -206,6 +206,10 @@ object ReadGroup { Option(readGroup.getPlatform), Option(readGroup.getPlatformUnit)) } + + def fromAvro(readGroups: Seq[ReadGroupMetadata]): ReadGroupDictionary = { + new ReadGroupDictionary(readGroups.map(rgm => ReadGroup.fromAvro(rgm))) + } } /** diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/AlignmentConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/AlignmentConverterSuite.scala index 98e71334cd..8baee0a8e0 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/AlignmentConverterSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/AlignmentConverterSuite.scala @@ -163,14 +163,14 @@ class AlignmentConverterSuite extends FunSuite { } def getSAMRecordFromReadName(readName: String): (Alignment, Alignment) = { - val samToADAMConverter = new SAMRecordConverter + val alignmentConverter = new AlignmentConverter val SAMTestFile = new File(getClass.getClassLoader.getResource("bqsr1.sam").getFile) val newSAMReader = SamReaderFactory.makeDefault().open(SAMTestFile) // Obtain SAMRecord val newSAMRecord = newSAMReader.iterator().dropWhile(r => r.getReadName != readName) - val firstRecord = samToADAMConverter.convert(newSAMRecord.next()) - val secondRecord = samToADAMConverter.convert(newSAMRecord.next()) + val firstRecord = alignmentConverter.convert(newSAMRecord.next()) + val secondRecord = alignmentConverter.convert(newSAMRecord.next()) (firstRecord, secondRecord) } @@ -336,5 +336,107 @@ class AlignmentConverterSuite extends FunSuite { val converted = adamRecordConverter.convertFragment(fragment) assert(converted.head.getReadNegativeStrand) } + test("testing the fields in an Alignment obtained from a mapped samRecord conversion") { + + val testRecordConverter = new AlignmentConverter + val testFileString = getClass.getClassLoader.getResource("reads12.sam").getFile + val testFile = new File(testFileString) + + // Iterator of SamReads in the file that each have a samRecord for conversion + val testIterator = SamReaderFactory.makeDefault().open(testFile) + val testSAMRecord = testIterator.iterator().next() + + // set the oq, md, oc, and op attributes + testSAMRecord.setOriginalBaseQualities("*****".getBytes.map(v => (v - 33).toByte)) + testSAMRecord.setAttribute("MD", "100") + testSAMRecord.setAttribute("OC", "100M") + testSAMRecord.setAttribute("OP", 1) + + // Convert samRecord to Alignment + val testAlignment = testRecordConverter.convert(testSAMRecord) + + // Validating Conversion + assert(testAlignment.getCigar === testSAMRecord.getCigarString) + assert(testAlignment.getDuplicateRead === testSAMRecord.getDuplicateReadFlag) + assert(testAlignment.getEnd.toInt === testSAMRecord.getAlignmentEnd) + assert(testAlignment.getMappingQuality.toInt === testSAMRecord.getMappingQuality) + assert(testAlignment.getStart.toInt === (testSAMRecord.getAlignmentStart - 1)) + assert(testAlignment.getReadInFragment == 0) + assert(testAlignment.getFailedVendorQualityChecks === testSAMRecord.getReadFailsVendorQualityCheckFlag) + assert(!testAlignment.getPrimaryAlignment === testSAMRecord.getNotPrimaryAlignmentFlag) + assert(!testAlignment.getReadMapped === testSAMRecord.getReadUnmappedFlag) + assert(testAlignment.getReadName === testSAMRecord.getReadName) + assert(testAlignment.getReadNegativeStrand === testSAMRecord.getReadNegativeStrandFlag) + assert(!testAlignment.getReadPaired) + assert(testAlignment.getReadInFragment != 1) + assert(testAlignment.getSupplementaryAlignment === testSAMRecord.getSupplementaryAlignmentFlag) + assert(testAlignment.getOriginalQualityScores === "*****") + assert(testAlignment.getMismatchingPositions === "100") + assert(testAlignment.getOriginalCigar === "100M") + assert(testAlignment.getOriginalStart === 0L) + assert(testAlignment.getAttributes === "XS:i:0\tAS:i:75\tNM:i:0") + } + + test("testing the fields in an Alignment obtained from an unmapped samRecord conversion") { + + val testRecordConverter = new AlignmentConverter + val testFileString = getClass.getClassLoader.getResource("reads12.sam").getFile + val testFile = new File(testFileString) + + // Iterator of SamReads in the file that each have a samRecord for conversion + val testIterator = SamReaderFactory.makeDefault().open(testFile) + val testSAMRecord = testIterator.iterator().next() + + // Convert samRecord to Alignment + val testAlignment = testRecordConverter.convert(testSAMRecord) + + // Validating Conversion + assert(testAlignment.getCigar === testSAMRecord.getCigarString) + assert(testAlignment.getDuplicateRead === testSAMRecord.getDuplicateReadFlag) + assert(testAlignment.getEnd.toInt === testSAMRecord.getAlignmentEnd) + assert(testAlignment.getMappingQuality.toInt === testSAMRecord.getMappingQuality) + assert(testAlignment.getStart.toInt === (testSAMRecord.getAlignmentStart - 1)) + assert(testAlignment.getReadInFragment == 0) + assert(testAlignment.getFailedVendorQualityChecks === testSAMRecord.getReadFailsVendorQualityCheckFlag) + assert(!testAlignment.getPrimaryAlignment === testSAMRecord.getNotPrimaryAlignmentFlag) + assert(!testAlignment.getReadMapped === testSAMRecord.getReadUnmappedFlag) + assert(testAlignment.getReadName === testSAMRecord.getReadName) + assert(testAlignment.getReadNegativeStrand === testSAMRecord.getReadNegativeStrandFlag) + assert(!testAlignment.getReadPaired) + assert(testAlignment.getReadInFragment != 1) + assert(testAlignment.getSupplementaryAlignment === testSAMRecord.getSupplementaryAlignmentFlag) + } + + test("'*' quality gets nulled out") { + + val newRecordConverter = new AlignmentConverter + val newTestFile = new File(getClass.getClassLoader.getResource("unmapped.sam").getFile) + val newSAMReader = SamReaderFactory.makeDefault().open(newTestFile) + + // Obtain SAMRecord + val newSAMRecordIter = { + val samIter = asScalaIterator(newSAMReader.iterator()) + samIter.toIterable.dropWhile(!_.getReadUnmappedFlag) + } + val newSAMRecord = newSAMRecordIter.toIterator.next() + + // null out quality + newSAMRecord.setBaseQualityString("*") + + // Conversion + val newAlignment = newRecordConverter.convert(newSAMRecord) + + // Validating Conversion + assert(newAlignment.getQualityScores === null) + } + + test("don't keep denormalized fields") { + val rc = new AlignmentConverter + + assert(rc.skipTag("MD")) + assert(rc.skipTag("OQ")) + assert(rc.skipTag("OP")) + assert(rc.skipTag("OC")) + } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/SAMRecordConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/SAMRecordConverterSuite.scala deleted file mode 100644 index 8f990ba67c..0000000000 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/SAMRecordConverterSuite.scala +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.converters - -import htsjdk.samtools._ -import java.io.File -import org.scalatest.FunSuite -import scala.collection.JavaConversions._ - -class SAMRecordConverterSuite extends FunSuite { - - test("testing the fields in an Alignment obtained from a mapped samRecord conversion") { - - val testRecordConverter = new SAMRecordConverter - val testFileString = getClass.getClassLoader.getResource("reads12.sam").getFile - val testFile = new File(testFileString) - - // Iterator of SamReads in the file that each have a samRecord for conversion - val testIterator = SamReaderFactory.makeDefault().open(testFile) - val testSAMRecord = testIterator.iterator().next() - - // set the oq, md, oc, and op attributes - testSAMRecord.setOriginalBaseQualities("*****".getBytes.map(v => (v - 33).toByte)) - testSAMRecord.setAttribute("MD", "100") - testSAMRecord.setAttribute("OC", "100M") - testSAMRecord.setAttribute("OP", 1) - - // Convert samRecord to Alignment - val testAlignment = testRecordConverter.convert(testSAMRecord) - - // Validating Conversion - assert(testAlignment.getCigar === testSAMRecord.getCigarString) - assert(testAlignment.getDuplicateRead === testSAMRecord.getDuplicateReadFlag) - assert(testAlignment.getEnd.toInt === testSAMRecord.getAlignmentEnd) - assert(testAlignment.getMappingQuality.toInt === testSAMRecord.getMappingQuality) - assert(testAlignment.getStart.toInt === (testSAMRecord.getAlignmentStart - 1)) - assert(testAlignment.getReadInFragment == 0) - assert(testAlignment.getFailedVendorQualityChecks === testSAMRecord.getReadFailsVendorQualityCheckFlag) - assert(!testAlignment.getPrimaryAlignment === testSAMRecord.getNotPrimaryAlignmentFlag) - assert(!testAlignment.getReadMapped === testSAMRecord.getReadUnmappedFlag) - assert(testAlignment.getReadName === testSAMRecord.getReadName) - assert(testAlignment.getReadNegativeStrand === testSAMRecord.getReadNegativeStrandFlag) - assert(!testAlignment.getReadPaired) - assert(testAlignment.getReadInFragment != 1) - assert(testAlignment.getSupplementaryAlignment === testSAMRecord.getSupplementaryAlignmentFlag) - assert(testAlignment.getOriginalQualityScores === "*****") - assert(testAlignment.getMismatchingPositions === "100") - assert(testAlignment.getOriginalCigar === "100M") - assert(testAlignment.getOriginalStart === 0L) - assert(testAlignment.getAttributes === "XS:i:0\tAS:i:75\tNM:i:0") - } - - test("testing the fields in an Alignment obtained from an unmapped samRecord conversion") { - - val testRecordConverter = new SAMRecordConverter - val testFileString = getClass.getClassLoader.getResource("reads12.sam").getFile - val testFile = new File(testFileString) - - // Iterator of SamReads in the file that each have a samRecord for conversion - val testIterator = SamReaderFactory.makeDefault().open(testFile) - val testSAMRecord = testIterator.iterator().next() - - // Convert samRecord to Alignment - val testAlignment = testRecordConverter.convert(testSAMRecord) - - // Validating Conversion - assert(testAlignment.getCigar === testSAMRecord.getCigarString) - assert(testAlignment.getDuplicateRead === testSAMRecord.getDuplicateReadFlag) - assert(testAlignment.getEnd.toInt === testSAMRecord.getAlignmentEnd) - assert(testAlignment.getMappingQuality.toInt === testSAMRecord.getMappingQuality) - assert(testAlignment.getStart.toInt === (testSAMRecord.getAlignmentStart - 1)) - assert(testAlignment.getReadInFragment == 0) - assert(testAlignment.getFailedVendorQualityChecks === testSAMRecord.getReadFailsVendorQualityCheckFlag) - assert(!testAlignment.getPrimaryAlignment === testSAMRecord.getNotPrimaryAlignmentFlag) - assert(!testAlignment.getReadMapped === testSAMRecord.getReadUnmappedFlag) - assert(testAlignment.getReadName === testSAMRecord.getReadName) - assert(testAlignment.getReadNegativeStrand === testSAMRecord.getReadNegativeStrandFlag) - assert(!testAlignment.getReadPaired) - assert(testAlignment.getReadInFragment != 1) - assert(testAlignment.getSupplementaryAlignment === testSAMRecord.getSupplementaryAlignmentFlag) - } - - test("'*' quality gets nulled out") { - - val newRecordConverter = new SAMRecordConverter - val newTestFile = new File(getClass.getClassLoader.getResource("unmapped.sam").getFile) - val newSAMReader = SamReaderFactory.makeDefault().open(newTestFile) - - // Obtain SAMRecord - val newSAMRecordIter = { - val samIter = asScalaIterator(newSAMReader.iterator()) - samIter.toIterable.dropWhile(!_.getReadUnmappedFlag) - } - val newSAMRecord = newSAMRecordIter.toIterator.next() - - // null out quality - newSAMRecord.setBaseQualityString("*") - - // Conversion - val newAlignment = newRecordConverter.convert(newSAMRecord) - - // Validating Conversion - assert(newAlignment.getQualityScores === null) - } - - test("don't keep denormalized fields") { - val rc = new SAMRecordConverter - - assert(rc.skipTag("MD")) - assert(rc.skipTag("OQ")) - assert(rc.skipTag("OP")) - assert(rc.skipTag("OC")) - } -}