-
Notifications
You must be signed in to change notification settings - Fork 308
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merge VariantAnnotation and DatabaseVariantAnnotation records #1250
Changes from 11 commits
80f9557
5c30085
12f245c
8c18150
028b245
b3e3438
b43a252
9b33c81
bcb92e7
c799092
bd15743
95c4107
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,6 @@ | |
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.bdgenomics.adam.converters | ||
|
||
import htsjdk.samtools.ValidationStringency | ||
|
@@ -31,7 +30,14 @@ import org.bdgenomics.formats.avro.{ | |
import org.bdgenomics.utils.misc.Logging | ||
import scala.collection.JavaConverters._ | ||
|
||
object VariantAnnotations extends Serializable with Logging { | ||
/** | ||
* Convert between htsjdk VCF INFO reserved key "ANN" values and TranscriptEffects. | ||
*/ | ||
private[adam] object TranscriptEffectConverter extends Serializable with Logging { | ||
|
||
/** | ||
* Look up variant annotation messages by name and message code. | ||
*/ | ||
private val MESSAGES: Map[String, VariantAnnotationMessage] = Map( | ||
// name -> enum | ||
ERROR_CHROMOSOME_NOT_FOUND.name() -> ERROR_CHROMOSOME_NOT_FOUND, | ||
|
@@ -57,15 +63,33 @@ object VariantAnnotations extends Serializable with Logging { | |
"I3" -> INFO_NON_REFERENCE_ANNOTATION | ||
) | ||
|
||
/** | ||
* Split effects by <code>&</code> character. | ||
* | ||
* @param s effects to split | ||
* @return effects split by <code>&</code> character | ||
*/ | ||
private def parseEffects(s: String): List[String] = { | ||
s.split("&").toList | ||
} | ||
|
||
/** | ||
* Split variant effect messages by <code>&</code> character. | ||
* | ||
* @param s variant effect messages to split | ||
* @return variant effect messages split by <code>&</code> character | ||
*/ | ||
private def parseMessages(s: String): List[VariantAnnotationMessage] = { | ||
// todo: haven't seen a delimiter here, assuming it is also '&' | ||
s.split("&").map(MESSAGES.get(_)).toList.flatten | ||
} | ||
|
||
/** | ||
* Split a single or fractional value into optional numerator and denominator values. | ||
* | ||
* @param s single or fractional value to split | ||
* @return single or fractional value split into optional numerator and denominator values | ||
*/ | ||
private def parseFraction(s: String): (Option[Integer], Option[Integer]) = { | ||
if ("".equals(s)) { | ||
return (None, None) | ||
|
@@ -78,10 +102,23 @@ object VariantAnnotations extends Serializable with Logging { | |
} | ||
} | ||
|
||
/** | ||
* Set a value via a function if the value is not empty. | ||
* | ||
* @param s value to set | ||
* @param setFn function to call if the value is not empty | ||
*/ | ||
def setIfNotEmpty(s: String, setFn: String => Unit) { | ||
Option(s).filter(_.nonEmpty).foreach(setFn) | ||
} | ||
|
||
/** | ||
* Parse zero or one transcript effects from the specified string value. | ||
* | ||
* @param s value to parse | ||
* @param stringency validation stringency | ||
* @return zero or one transcript effects parsed from the specified string value | ||
*/ | ||
private[converters] def parseTranscriptEffect( | ||
s: String, | ||
stringency: ValidationStringency): Seq[TranscriptEffect] = { | ||
|
@@ -110,7 +147,8 @@ object VariantAnnotations extends Serializable with Logging { | |
|
||
val te = TranscriptEffect.newBuilder() | ||
setIfNotEmpty(alternateAllele, te.setAlternateAllele(_)) | ||
if (!effects.isEmpty) te.setEffects(effects.asJava) | ||
if (effects.nonEmpty) te.setEffects(effects.asJava) | ||
// note: annotationImpact is output by SnpEff but is not part of the VCF ANN specification | ||
setIfNotEmpty(geneName, te.setGeneName(_)) | ||
setIfNotEmpty(geneId, te.setGeneId(_)) | ||
setIfNotEmpty(featureType, te.setFeatureType(_)) | ||
|
@@ -132,26 +170,115 @@ object VariantAnnotations extends Serializable with Logging { | |
Seq(te.build()) | ||
} | ||
|
||
/** | ||
* Parse the VCF INFO reserved key "ANN" value into zero or more TranscriptEffects. | ||
* | ||
* @param s string to parse | ||
* @param stringency validation stringency | ||
* @return the VCF INFO reserved key "ANN" value parsed into zero or more TranscriptEffects | ||
*/ | ||
private[converters] def parseAnn( | ||
s: String, | ||
stringency: ValidationStringency): List[TranscriptEffect] = { | ||
|
||
s.split(",").map(parseTranscriptEffect(_, stringency)).flatten.toList | ||
} | ||
|
||
def createVariantAnnotation( | ||
/** | ||
* Convert the htsjdk VCF INFO reserved key "ANN" value into zero or more TranscriptEffects, | ||
* matching on alternate allele. | ||
* | ||
* @param variant variant | ||
* @param vc htsjdk variant context | ||
* @param stringency validation stringency, defaults to strict | ||
* @return the htsjdk VCF INFO reserved key "ANN" value converted into zero or more | ||
* TranscriptEffects, matching on alternate allele, and wrapped in an option | ||
*/ | ||
def convertToTranscriptEffects( | ||
variant: Variant, | ||
vc: VariantContext, | ||
stringency: ValidationStringency = ValidationStringency.STRICT): VariantAnnotation = { | ||
stringency: ValidationStringency = ValidationStringency.STRICT): Option[List[TranscriptEffect]] = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of returning There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would make my brain hurt less. The thought is elsewhere it matters whether this field has been set, so checking Option seemed more correct than checking for an empty list. |
||
|
||
def parseAndFilter(attr: String): Option[List[TranscriptEffect]] = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This method should be private. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, sorry, I misread this and didn't notice that it is nested inside another function. |
||
if (attr == VCFConstants.MISSING_VALUE_v4) { | ||
None | ||
} else { | ||
val filtered = parseAnn(attr, stringency) | ||
.filter(_.getAlternateAllele == variant.getAlternateAllele) | ||
if (filtered.isEmpty) { | ||
None | ||
} else { | ||
Some(filtered) | ||
} | ||
} | ||
} | ||
|
||
val va = VariantAnnotation.newBuilder() | ||
.setVariant(variant) | ||
val attrOpt = Option(vc.getAttributeAsString("ANN", null)) | ||
try { | ||
attrOpt.flatMap(attr => parseAndFilter(attr)) | ||
} catch { | ||
case t: Throwable => { | ||
if (stringency == ValidationStringency.STRICT) { | ||
throw t | ||
} else if (stringency == ValidationStringency.LENIENT) { | ||
log.warn("Could not convert VCF INFO reserved key ANN value to TranscriptEffect, caught %s.".format(t)) | ||
} | ||
None | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Convert the specified transcript effects into a string suitable for a VCF INFO reserved | ||
* key "ANN" value. | ||
* | ||
* @param effects zero or more transcript effects | ||
* @return the specified transcript effects converted into a string suitable for a VCF INFO | ||
* reserved key "ANN" value | ||
*/ | ||
def convertToVcfInfoAnnValue(effects: Seq[TranscriptEffect]): String = { | ||
def toFraction(numerator: java.lang.Integer, denominator: java.lang.Integer): String = { | ||
val numOpt = Option(numerator) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've seen There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah yeah, scala tries to coerce to the Scala native type. You need to do: Option(myJavaInt: java.lang.Integer) To prevent that. That may not be the exact syntax but the syntax shows up in AlignmentRecordComverter, IIRC. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This NPE with Option types still needs to be fixed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As far as I can see, this is fine with The unit test There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, SGTM! |
||
val denomOpt = Option(denominator) | ||
|
||
(numOpt, denomOpt) match { | ||
case (None, None) => { | ||
"" | ||
} | ||
case (Some(n), None) => { | ||
"%d".format(n) | ||
} | ||
case (None, Some(d)) => { | ||
log.warn("Incorrect fractional value ?/%d, missing numerator".format(d)) | ||
"" | ||
} | ||
case (Some(n), Some(d)) => { | ||
"%d/%d".format(n, d) | ||
} | ||
} | ||
} | ||
|
||
val attr = vc.getAttributeAsString("ANN", null) | ||
if (attr != null && attr != VCFConstants.MISSING_VALUE_v4) { | ||
va.setTranscriptEffects(parseAnn(attr, stringency).asJava) | ||
def toAnn(te: TranscriptEffect): String = { | ||
Seq( | ||
Option(te.getAlternateAllele).getOrElse(""), // 0 | ||
te.getEffects.asScala.mkString("&"), // 1 | ||
"", // annotationImpact | ||
Option(te.getGeneName).getOrElse(""), // 3 | ||
Option(te.getGeneId).getOrElse(""), | ||
Option(te.getFeatureType).getOrElse(""), | ||
Option(te.getFeatureId).getOrElse(""), | ||
Option(te.getBiotype).getOrElse(""), | ||
toFraction(te.getRank, te.getTotal), // 8 | ||
Option(te.getTranscriptHgvs).getOrElse(""), // 9 | ||
Option(te.getProteinHgvs).getOrElse(""), // 10 | ||
toFraction(te.getCdnaPosition, te.getCdnaLength), // 11 | ||
toFraction(te.getCdsPosition, te.getCdsLength), // 12 | ||
toFraction(te.getProteinPosition, te.getProteinLength), // 13 | ||
Option(te.getDistance).getOrElse(""), | ||
te.getMessages.asScala.mkString("&") | ||
).mkString("|") | ||
} | ||
|
||
va.build() | ||
effects.map(toAnn).mkString(",") | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unrelated to this PR, as this line is unchanged, but whenever possible, I prefer
Iterable
toSeq
unless you need random lookup by index.