Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added custom VariantClassification severity ordering. #7673

Merged
merged 9 commits into from
Mar 8, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;

import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

public abstract class BaseFuncotatorArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;
Expand Down Expand Up @@ -115,4 +116,11 @@ public abstract class BaseFuncotatorArgumentCollection implements Serializable {
doc = "The minimum number of bases for a variant to be annotated as a segment. Recommended to be changed only for use with FuncotateSegments. Defaults to " + FuncotatorUtils.DEFAULT_MIN_NUM_BASES_FOR_VALID_SEGMENT
)
public int minNumBasesForValidSegment = FuncotatorUtils.DEFAULT_MIN_NUM_BASES_FOR_VALID_SEGMENT;

@Argument(
fullName = FuncotatorArgumentDefinitions.CUSTOM_VARIANT_CLASS_ORDER_FILE,
optional = true,
doc = "TSV File containing custom Variant Classification severity map of the form: VARIANT_CLASSIFICATION\tSEV. VARIANT_CLASSIFICAITON must match one of the VariantClassification names (" + GencodeFuncotation.VariantClassification.ALL_VC_NAMES + "). SEV is an unsigned integer, where lower is sorted first."
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
)
public GATKPath customVariantClassificationOrderFile = null;
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ public class FuncotatorArgumentDefinitions {
public static final String FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION = "force-b37-to-hg19-reference-contig-conversion";
public static final String MIN_NUM_BASES_FOR_SEGMENT_FUNCOTATION = "min-num-bases-for-segment-funcotation";

// Full name of the argument that supplies the custom variant classification severity file.
// NOTE(review): the flag text is spelled "classifiation" (the "c" after "classifi" is missing).
// This string is the user-facing argument name, so correcting it changes the command line
// interface -- TODO confirm whether it can be fixed before release.
public static final String CUSTOM_VARIANT_CLASS_ORDER_FILE = "custom-variant-classifiation-order";
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved

// ------------------------------------------------------------
// Helper Types:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,11 @@ public FuncotatorEngine(final BaseFuncotatorArgumentCollection funcotatorArgs,

// Determine whether we have to convert given variants from B37 to HG19:
mustConvertInputContigsToHg19 = determineReferenceAndDatasourceCompatibility();

// Read in the custom variant classification order file here so that it can be shared across all engines:
if (funcotatorArgs.customVariantClassificationOrderFile != null) {
FuncotatorUtils.setVariantClassificationCustomSeverity(funcotatorArgs.customVariantClassificationOrderFile);
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
Expand All @@ -34,9 +35,12 @@
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -2344,4 +2348,51 @@ public static <T,U> LinkedHashMap<T,U> createLinkedHashMapFromLists(final List<T
}, LinkedHashMap::new));
}

/**
* Set the severity for {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification}s as specified in a given input file.
* @param customSeverityFile {@link GATKPath} to TSV file containing VARIANT_CLASSIFICATION SEV information.
*/
public static void setVariantClassificationCustomSeverity(final GATKPath customSeverityFile) {
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
try {
logger.info("Setting custom variant classification severities from: " + customSeverityFile);

if ( !Files.exists(customSeverityFile.toPath()) ) {
throw new UserException("Custom severity file does not exist: " + customSeverityFile);
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
}
final BufferedReader reader = new BufferedReader(new InputStreamReader(customSeverityFile.getInputStream()));
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved

int lineNum = 1;
while ( reader.ready() ) {
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
final String line = reader.readLine();

// Ignore empty lines:
if (line.length() == 0) {
continue;
}

final String[] lineFields = line.split("\t");
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
if (lineFields.length != 2) {
throw new UserException("Line " + lineNum + " has " + lineFields.length + " fields! Each TSV line must have 2 fields!");
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
}

try {
final String vcName = lineFields[ 0 ];
final int sev = Integer.parseInt(lineFields[ 1 ]);

logger.info(" Setting new Variant Classification severity: " + vcName + " = " + sev);
GencodeFuncotation.VariantClassification.valueOf(vcName).setSeverity(sev);
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
}
catch (final NumberFormatException ex) {
throw new UserException("Severity on line " + lineNum + " is not an integer ("+ lineFields[1] +")! Custom severities must be integer values!", ex);
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
}

lineNum += 1;
}
}
catch (final IOException ex) {
throw new UserException("Could not read from custom Variant Classification file: " + customSeverityFile, ex);
}
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;

import java.io.File;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.stream.Collectors;

/**
* Arguments to be be used by the {@link Funcotator} {@link org.broadinstitute.hellbender.engine.GATKTool},
Expand Down Expand Up @@ -47,6 +53,4 @@ public class FuncotatorVariantArgumentCollection extends BaseFuncotatorArgumentC
doc = "When input VCF has already been annotated, still annotate again."
)
public boolean reannotateVCF = false;


}
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import java.util.LinkedHashSet;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import java.util.stream.Stream;

/**
* A class to represent a Functional Annotation. Each instance represents the annotations on a single transcript.
Expand Down Expand Up @@ -824,26 +824,58 @@ public enum VariantClassification {
/** Variant lies on one of the lincRNAs. */
LINCRNA("LINCRNA", 4);

/**
 * Variable to store the list of all valid {@link VariantClassification} types.
 * This is used for command-line argument documentation and MUST be maintained if / when any of the
 * VariantClassification values / names are updated.
 *
 * It is written out as a literal (rather than derived from {@code values()}) because it is
 * concatenated into an annotation attribute, and annotation attribute values must be
 * compile-time constant expressions.
 */
public static final String ALL_VC_NAMES = "COULD_NOT_DETERMINE, INTRON, FIVE_PRIME_UTR, THREE_PRIME_UTR, IGR, FIVE_PRIME_FLANK, THREE_PRIME_FLANK, MISSENSE, NONSENSE, NONSTOP, SILENT, SPLICE_SITE, IN_FRAME_DEL, IN_FRAME_INS, FRAME_SHIFT_INS, FRAME_SHIFT_DEL, START_CODON_SNP, START_CODON_INS, START_CODON_DEL, DE_NOVO_START_IN_FRAME, DE_NOVO_START_OUT_FRAME, RNA, LINCRNA";
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved

/**
* The relative severity of each {@link VariantClassification}.
* Lower numbers are considered more severe.
* Higher numbers are considered less severe.
*/
final private int relativeSeverity;
private int relativeSeverity;

/**
* The default value for the {@link VariantClassification#relativeSeverity} of this {@link VariantClassification}.
*/
final private int defaultRelativeSeverity;

/** The serialized version of this {@link VariantClassification} */
final private String serialized;

/**
 * Create a {@link VariantClassification} whose current severity starts out equal to its default.
 *
 * @param serialized The serialized (string) form of this classification.
 * @param sev The default relative severity; also used as the initial current severity.
 */
VariantClassification(final String serialized, final int sev) {
    this.defaultRelativeSeverity = sev;
    this.relativeSeverity = sev;
    this.serialized = serialized;
}

/**
 * Restore every {@link VariantClassification} to the severity it was constructed with,
 * discarding any values applied through {@link #setSeverity(int)}.
 */
public static void resetSeveritiesToDefault() {
    final VariantClassification[] allClassifications = VariantClassification.values();
    for (int i = 0; i < allClassifications.length; ++i) {
        final VariantClassification classification = allClassifications[i];
        final int defaultSeverity = classification.getDefaultSeverity();
        classification.setSeverity(defaultSeverity);
    }
}

/**
 * Get the current relative severity, which may have been changed via {@link #setSeverity(int)}.
 *
 * @return The {@link VariantClassification#relativeSeverity} of {@code this} {@link VariantClassification}.
 */
public int getSeverity() {
    return this.relativeSeverity;
}

/**
 * Get the severity this classification was constructed with.
 *
 * @return The {@link VariantClassification#defaultRelativeSeverity} of {@code this} {@link VariantClassification}.
 */
public int getDefaultSeverity() {
    return this.defaultRelativeSeverity;
}

/**
 * Replace the current {@link VariantClassification#relativeSeverity} of {@code this} {@link VariantClassification}.
 * The default severity is not affected.
 *
 * @param sev The new relative severity.
 */
public void setSeverity(final int sev) {
    relativeSeverity = sev;
}

@Override
public String toString() {
return serialized;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.collections.MapUtils;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.engine.ReferenceFileSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.FuncotatorReferenceTestUtils;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.TableFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotationBuilder;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;
import org.broadinstitute.hellbender.tools.funcotator.metadata.VcfFuncotationMetadata;
Expand Down Expand Up @@ -2471,4 +2473,93 @@ public void testCreateLinkedHashMapFromListsWithIllegalArgs(final List<String> k
FuncotatorUtils.createLinkedHashMapFromLists(keys, values);
}

@DataProvider
public Object[][] provideForTestSetVariantClassificationCustomSeverity() {

final Map<GencodeFuncotation.VariantClassification, Integer> expected1 = new HashMap<>();
expected1.put(GencodeFuncotation.VariantClassification.COULD_NOT_DETERMINE, 0);
expected1.put(GencodeFuncotation.VariantClassification.INTRON, 1);
expected1.put(GencodeFuncotation.VariantClassification.FIVE_PRIME_UTR, 2);
expected1.put(GencodeFuncotation.VariantClassification.THREE_PRIME_UTR, 4);
expected1.put(GencodeFuncotation.VariantClassification.IGR, 8);
expected1.put(GencodeFuncotation.VariantClassification.FIVE_PRIME_FLANK, 16);
expected1.put(GencodeFuncotation.VariantClassification.THREE_PRIME_FLANK, 32);
expected1.put(GencodeFuncotation.VariantClassification.MISSENSE, 64);
expected1.put(GencodeFuncotation.VariantClassification.NONSENSE, 128);
expected1.put(GencodeFuncotation.VariantClassification.NONSTOP, 256);
expected1.put(GencodeFuncotation.VariantClassification.SILENT, 512);
expected1.put(GencodeFuncotation.VariantClassification.SPLICE_SITE, 1024);
expected1.put(GencodeFuncotation.VariantClassification.IN_FRAME_DEL, 2048);
expected1.put(GencodeFuncotation.VariantClassification.IN_FRAME_INS, 4096);
expected1.put(GencodeFuncotation.VariantClassification.FRAME_SHIFT_INS, 8192);
expected1.put(GencodeFuncotation.VariantClassification.FRAME_SHIFT_DEL, 16384);
expected1.put(GencodeFuncotation.VariantClassification.START_CODON_SNP, 32768);
expected1.put(GencodeFuncotation.VariantClassification.START_CODON_INS, 65536);
expected1.put(GencodeFuncotation.VariantClassification.START_CODON_DEL, 131072);
expected1.put(GencodeFuncotation.VariantClassification.DE_NOVO_START_IN_FRAME, 262144);
expected1.put(GencodeFuncotation.VariantClassification.DE_NOVO_START_OUT_FRAME, 524288);
expected1.put(GencodeFuncotation.VariantClassification.RNA, 1048576);
expected1.put(GencodeFuncotation.VariantClassification.LINCRNA, 2097152);
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved

final Map<GencodeFuncotation.VariantClassification, Integer> expected2 = new HashMap<>();
expected2.put(GencodeFuncotation.VariantClassification.COULD_NOT_DETERMINE, 0);
expected2.put(GencodeFuncotation.VariantClassification.INTRON, 1);
expected2.put(GencodeFuncotation.VariantClassification.FIVE_PRIME_UTR, 2);
expected2.put(GencodeFuncotation.VariantClassification.THREE_PRIME_UTR, 4);
expected2.put(GencodeFuncotation.VariantClassification.IGR, 8);
expected2.put(GencodeFuncotation.VariantClassification.SPLICE_SITE, 1024);
expected2.put(GencodeFuncotation.VariantClassification.IN_FRAME_DEL, 2048);
expected2.put(GencodeFuncotation.VariantClassification.IN_FRAME_INS, 4096);
expected2.put(GencodeFuncotation.VariantClassification.RNA, 1048576);
expected2.put(GencodeFuncotation.VariantClassification.LINCRNA, 2097152);

final Map<GencodeFuncotation.VariantClassification, Integer> expected3 = new HashMap<>();
expected3.put(GencodeFuncotation.VariantClassification.LINCRNA, 2097152);

return new Object[][] {
{new GATKPath(largeFileTestDir + "funcotator/custom_vc_order_files/" + "test1.tsv"), expected1},
{new GATKPath(largeFileTestDir + "funcotator/custom_vc_order_files/" + "test2.tsv"), expected2},
{new GATKPath(largeFileTestDir + "funcotator/custom_vc_order_files/" + "test3.tsv"), expected3},
};
}

/**
 * Applies a custom severity file, checks that each expected severity was set, then checks that
 * resetting restores every default.
 *
 * The severities live in shared enum state, so the reset happens in a {@code finally} block:
 * a failing assertion must not leave modified severities behind for other tests.
 */
@Test(dataProvider = "provideForTestSetVariantClassificationCustomSeverity")
public void testSetVariantClassificationCustomSeverity(final GATKPath customVcFile,
                                                       final Map<GencodeFuncotation.VariantClassification, Integer> expectedVCSevMap) {
    try {
        // Set new severity:
        FuncotatorUtils.setVariantClassificationCustomSeverity(customVcFile);

        // Check we've set it properly:
        for ( final Map.Entry<GencodeFuncotation.VariantClassification, Integer> expected : expectedVCSevMap.entrySet() ) {
            Assert.assertEquals(expected.getKey().getSeverity(), expected.getValue().intValue());
        }
    }
    finally {
        // Reset severity, even if an assertion above failed:
        GencodeFuncotation.VariantClassification.resetSeveritiesToDefault();
    }

    // Check that the reset works:
    for ( final GencodeFuncotation.VariantClassification vc : GencodeFuncotation.VariantClassification.values() ) {
        Assert.assertEquals(vc.getSeverity(), vc.getDefaultSeverity());
    }
}

/**
 * A severity column that is not an integer must be reported as a {@link UserException}.
 */
@Test(expectedExceptions = {UserException.class})
public void testSetVariantClassificationCustomSeverityNonIntSev() {
    final GATKPath customVcFile = new GATKPath(largeFileTestDir + "funcotator/custom_vc_order_files/non_int_sev.tsv");
    try {
        FuncotatorUtils.setVariantClassificationCustomSeverity(customVcFile);
    }
    finally {
        // Lines before the malformed one may already have been applied to the shared enum state;
        // restore the defaults so other tests are unaffected.
        GencodeFuncotation.VariantClassification.resetSeveritiesToDefault();
    }
}

/**
 * A line that does not have exactly two tab-separated fields must be reported as a {@link UserException}.
 */
@Test(expectedExceptions = {UserException.class})
public void testSetVariantClassificationCustomSeverityWrongColumnsTsv() {
    final GATKPath customVcFile = new GATKPath(largeFileTestDir + "funcotator/custom_vc_order_files/wrong_num_columns.tsv");
    try {
        FuncotatorUtils.setVariantClassificationCustomSeverity(customVcFile);
    }
    finally {
        // Lines before the malformed one may already have been applied to the shared enum state;
        // restore the defaults so other tests are unaffected.
        GencodeFuncotation.VariantClassification.resetSeveritiesToDefault();
    }
}

/**
 * Pointing the utility at a path that does not exist must be reported as a {@link UserException}.
 */
@Test(expectedExceptions = {UserException.class})
public void testSetVariantClassificationCustomSeverityNonexistentFile() {
    final String missingFileUri = getSafeNonExistentPath("TEST").toUri().toString();
    FuncotatorUtils.setVariantClassificationCustomSeverity(new GATKPath(missingFileUri));
}

jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
}
Original file line number Diff line number Diff line change
Expand Up @@ -399,4 +399,27 @@ public void testGetField(final GencodeFuncotation gencodeFuncotation, final Stri
public void testGetFieldFail(final GencodeFuncotation gencodeFuncotation, final String fieldName) {
gencodeFuncotation.getField(fieldName);
}

/**
 * Checks that severities start at their defaults, that {@code setSeverity} takes effect, and that
 * {@code resetSeveritiesToDefault} restores every default.
 *
 * The severities live in shared enum state, so the reset happens in a {@code finally} block:
 * a failing assertion must not leave a modified severity behind for other tests.
 */
@Test
public void testVariantClassificationSeverityOverride() {

    // Check that we start with the defaults:
    for ( final GencodeFuncotation.VariantClassification vc : GencodeFuncotation.VariantClassification.values() ) {
        Assert.assertEquals(vc.getSeverity(), vc.getDefaultSeverity());
    }

    try {
        // Now set some values and make sure they stick:
        for ( int i = 98; i < 173; ++i ) {
            GencodeFuncotation.VariantClassification.COULD_NOT_DETERMINE.setSeverity(i);
            Assert.assertEquals(GencodeFuncotation.VariantClassification.COULD_NOT_DETERMINE.getSeverity(), i);
        }
    }
    finally {
        // Now reset the values, even if an assertion above failed:
        GencodeFuncotation.VariantClassification.resetSeveritiesToDefault();
    }

    // Check that the reset works:
    for ( final GencodeFuncotation.VariantClassification vc : GencodeFuncotation.VariantClassification.values() ) {
        Assert.assertEquals(vc.getSeverity(), vc.getDefaultSeverity());
    }
}
}
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown