Skip to content

Commit

Permalink
More funcotator fixes (#4783)
Browse files Browse the repository at this point in the history
* Some other fixes:
- Ordering specified in the header did not match the variants and hg19/b37 flexibility was being applied to VCF datasources and inducing a lot of missed annotations.
- Sanitizing funcotations in VCF to pass vcf validation.
- Implemented lookahead caching as a parameter in `Funcotator`.  Used the default in `VariantWalkerBase` as the default for `Funcotator`. 
- Added Funcotator tests for Clinvar and Gencode v28 in hg38, and mixed chr/no-chr GENCODE and datasources would still annotate properly for hg19.
- Eased restrictions so that Gencode v28 would be recognized as a valid gtf.  Future versions of Gencode will not fail just based on the version number and warning will be emitted instead.
- (Dev) Tarring and untarring test references to shrink size in git lfs.  Created a class to minimize the number of copies we make of each untarred test reference.
  • Loading branch information
LeeTL1220 authored May 24, 2018
1 parent 2cc7abd commit c8f53cd
Show file tree
Hide file tree
Showing 50 changed files with 4,202 additions and 186 deletions.
10 changes: 8 additions & 2 deletions scripts/mutect2_wdl/mutect2.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ workflow Mutect2 {
String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"])
Boolean? filter_oncotator_maf
Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true])
Boolean? filter_funcotations
Boolean filter_funcotations_or_default = select_first([filter_funcotations, true])
String? oncotator_extra_args

Int? preemptible_attempts
Expand Down Expand Up @@ -341,7 +343,8 @@ workflow Mutect2 {
annotation_defaults = annotation_defaults,
annotation_overrides = annotation_overrides,
gatk_docker = gatk_docker,
gatk_override = gatk_override
gatk_override = gatk_override,
filter_funcotations = filter_funcotations_or_default
}
}

Expand Down Expand Up @@ -906,12 +909,14 @@ task Funcotate {
Array[String]? transcript_selection_list
Array[String]? annotation_defaults
Array[String]? annotation_overrides
Boolean filter_funcotations

# ==============
# Process input args:
String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else ""
String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else ""
String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else ""
String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else ""
# ==============
# runtime
Expand Down Expand Up @@ -962,7 +967,8 @@ task Funcotate {
${"--transcript-selection-mode " + transcript_selection_mode} \
${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \
${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides}
${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \
${filter_funcotations_args}
>>>

runtime {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.transformers.ReadTransformer;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
Expand Down Expand Up @@ -834,8 +833,7 @@ record = header.getProgramRecord(pgID);
*/
protected FeatureInput<? extends Feature> addFeatureInputsAfterInitialization(final String filePath,
final String name,
final Class<? extends Feature> featureType,
final int featureQueryLookahead) {
final Class<? extends Feature> featureType, final int featureQueryLookahead) {

final FeatureInput<? extends Feature> featureInput = new FeatureInput<>(filePath, name);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ private List<Feature> getFeatureListFromMap(final Map<String, List<Feature>> fea
.collect(Collectors.toList());
}
else {
featureList = featureSourceMap.get( getName() );
featureList = featureSourceMap.getOrDefault( getName(), Collections.emptyList() );
}
return featureList;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ default void setFieldSerializationOverrideValues(final Map<String,String> fieldS
}

/**
* TODO: This interface should have nothing specific to a VCF. That should be the job of the VCFOutputRenderer to sanitize any strings. https://github.com/broadinstitute/gatk/issues/4797
* Converts this {@link Funcotation} to a string suitable for insertion into a VCF file.
* {@code manualAnnotationString} should be written first, followed by the inherent annotations in this {@link Funcotation}.
* @param manualAnnotationString A {@link String} of manually-provided annotations to add to this {@link Funcotation}.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.mafOutput.MafOutputRenderer;
import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand Down Expand Up @@ -247,11 +246,11 @@ public class Funcotator extends VariantWalker {
// Optional args:

@Argument(
fullName = FuncotatorArgumentDefinitions.IGNORE_FILTERED_VARIANTS_LONG_NAME,
fullName = FuncotatorArgumentDefinitions.REMOVE_FILTERED_VARIANTS_LONG_NAME,
optional = true,
doc = "Whether to ignore variants that have been filtered or to annotate them."
doc = "Ignore/drop variants that have been filtered in the input. These variants will not appear in the output file."
)
protected boolean includeFilteredVariants = false;
protected boolean removeFilteredVariants = false;

@Argument(
fullName = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME,
Expand Down Expand Up @@ -284,15 +283,22 @@ public class Funcotator extends VariantWalker {
@Argument(
fullName = FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME,
optional = true,
doc = "Allow for the HG19 Reference version of GENCODE to match with B37 Contig names. (May create erroneous annotations in some contigs where B37 != HG19)."
doc = "Allow for the HG19 Reference version of GENCODE (or any other datasource) to match with B37 Contig names. (May create erroneous annotations in some contigs where B37 != HG19)."
)
protected boolean allowHg19GencodeContigNamesWithB37 = true;
protected boolean allowHg19ContigNamesWithB37 = true;

@Argument(
fullName = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_NAME,
optional = true,
minValue = 0,
doc = "Number of base-pairs to cache when querying variants."
)
protected int lookaheadFeatureCachingInBp = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE;

//==================================================================================================================

private OutputRenderer outputRenderer;
private final List<DataSourceFuncotationFactory> dataSourceFactories = new ArrayList<>();
private final List<GencodeFuncotationFactory> gencodeFuncotationFactories = new ArrayList<>();

private final List<FeatureInput<? extends Feature>> manualLocatableFeatureInputs = new ArrayList<>();

Expand Down Expand Up @@ -328,15 +334,8 @@ public void onTraversalStart() {
DataSourceUtils.createDataSourceFuncotationFactoriesForDataSources(configData, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet)
);

// Sort our data source factories to ensure they're always in the same order:
dataSourceFactories.sort( Comparator.comparing(DataSourceFuncotationFactory::getName) );

// Identify and store any Gencode data sources:
for ( final DataSourceFuncotationFactory factory : dataSourceFactories ) {
if ( factory.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE) ) {
gencodeFuncotationFactories.add( (GencodeFuncotationFactory) factory );
}
}
// Sort our data source factories to ensure they're always in the same order: gencode datasources first
dataSourceFactories.sort(DataSourceUtils::datasourceComparator);

// Determine which annotations are accounted for (by the funcotation factories) and which are not.
final LinkedHashMap<String, String> unaccountedForDefaultAnnotations = getUnaccountedForAnnotations( dataSourceFactories, annotationDefaultsMap );
Expand Down Expand Up @@ -365,7 +364,7 @@ public void onTraversalStart() {
}

// Check for reference version (in)compatibility:
if ( allowHg19GencodeContigNamesWithB37 &&
if ( allowHg19ContigNamesWithB37 &&
referenceVersion.equals(FuncotatorArgumentDefinitions.HG19_REFERENCE_VERSION_STRING) ) {
// NOTE AND WARNING:
// hg19 is from ucsc. b37 is from the genome reference consortium.
Expand Down Expand Up @@ -444,9 +443,8 @@ private LinkedHashMap<String, String> getUnaccountedForAnnotations( final List<D
*/
private void enqueueAndHandleVariant(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext) {

//TODO: Check against Issue #4358 - https://github.com/broadinstitute/gatk/issues/4358
// Check to see if we're annotating filtered variants and ignore this if we're told to:
if ( (!includeFilteredVariants) && variant.isFiltered() ) {
if ( removeFilteredVariants && variant.isFiltered() ) {
// We can ignore this variant since it was filtered out.
return;
}
Expand All @@ -456,11 +454,12 @@ private void enqueueAndHandleVariant(final VariantContext variant, final Referen

// Create a variant context for annotation that has a new contig based on whether we need to overwrite the
// contig names in the next section.
//TODO: From @lbergelson - restructure this so that it first checks if the contig will be renamed before creating the builder, so it doesn't need to go from VariantContext -> builder -> VariantContext everytime
final VariantContextBuilder variantContextBuilderForFixedContigForDataSources = new VariantContextBuilder(variant);
VariantContext variantContextFixedContigForDataSources = variant;

// Check to see if we need to query with a different reference convention (i.e. "chr1" vs "1").
if ( allowHg19GencodeContigNamesWithB37 && inputReferenceIsB37 ) {
if (allowHg19ContigNamesWithB37 && inputReferenceIsB37) {
final VariantContextBuilder variantContextBuilderForFixedContigForDataSources = new VariantContextBuilder(variant);

// Construct a new contig and new interval with no "chr" in front of it:
final String hg19Contig = FuncotatorUtils.convertB37ContigToHg19Contig( variant.getContig() );
final SimpleInterval hg19Interval = new SimpleInterval(hg19Contig, variant.getStart(), variant.getEnd());
Expand All @@ -471,8 +470,13 @@ private void enqueueAndHandleVariant(final VariantContext variant, final Referen
for ( final FeatureInput<? extends Feature> featureInput : manualLocatableFeatureInputs ) {
@SuppressWarnings("unchecked")
final List<Feature> featureList = (List<Feature>)featureContext.getValues(featureInput, hg19Interval);

// TODO: This is a little sloppy, since it checks every datasource twice. Once for hg19 contig names and once for b37 contig names. See https://github.com/broadinstitute/gatk/issues/4798
featureList.addAll(featureContext.getValues(featureInput));
featureSourceMap.put( featureInput.getName(), featureList);
}
// Get our VariantContext for annotation:
variantContextFixedContigForDataSources = variantContextBuilderForFixedContigForDataSources.make();
}
else {
for ( final FeatureInput<? extends Feature> featureInput : manualLocatableFeatureInputs ) {
Expand All @@ -482,34 +486,26 @@ private void enqueueAndHandleVariant(final VariantContext variant, final Referen
}
}

// Get our VariantContext for annotation:
final VariantContext variantContextFixedContigForDataSources = variantContextBuilderForFixedContigForDataSources.make();

// Create a place to keep our funcotations:
final List<Funcotation> funcotations = new ArrayList<>();

// Annotate with Gencode first:
// Create a list of GencodeFuncotation to use for other Data Sources:
final List<GencodeFuncotation> gencodeFuncotations = new ArrayList<>();

for ( final GencodeFuncotationFactory factory : gencodeFuncotationFactories ) {
final List<Funcotation> funcotationsFromGencodeFactory = factory.createFuncotations(variantContextFixedContigForDataSources, referenceContext, featureSourceMap);
funcotations.addAll(funcotationsFromGencodeFactory);
gencodeFuncotations.addAll(
funcotationsFromGencodeFactory.stream()
.map(x -> (GencodeFuncotation)x)
.collect(Collectors.toList())
);
}

// Annotate with the rest of the data sources:
for ( final DataSourceFuncotationFactory funcotationFactory : dataSourceFactories ) {

// Make sure we don't double up on the Gencodes:
if ( funcotationFactory.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE) ) {
continue;
// Perform the actual annotation. Note that we leverage the ordering of datasources here (i.e. that gencode/transcript
// datasources always appear first)
for (final DataSourceFuncotationFactory funcotationFactory : dataSourceFactories ) {
if (funcotationFactory.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE)) {
final List<Funcotation> funcotationsFromGencodeFactory = funcotationFactory.createFuncotations(variantContextFixedContigForDataSources, referenceContext, featureSourceMap);
funcotations.addAll(funcotationsFromGencodeFactory);
gencodeFuncotations.addAll(
funcotationsFromGencodeFactory.stream()
.map(x -> (GencodeFuncotation)x)
.collect(Collectors.toList()));
} else {
funcotations.addAll( funcotationFactory.createFuncotations(variantContextFixedContigForDataSources, referenceContext, featureSourceMap, gencodeFuncotations) );
}
funcotations.addAll( funcotationFactory.createFuncotations(variantContextFixedContigForDataSources, referenceContext, featureSourceMap, gencodeFuncotations) );
}

outputRenderer.write(variant, funcotations);
Expand Down Expand Up @@ -580,9 +576,7 @@ private void addFeaturesForLocatableDataSource(final Path dataSourceFile,
IOUtils.getPath( dataSourceProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_SRC_FILE) )
).toUri().toString(),
name,
featureClazz,
VariantWalkerBase.FEATURE_CACHE_LOOKAHEAD
);
featureClazz, lookaheadFeatureCachingInBp);

// Add our feature input to our list of manual inputs:
manualLocatableFeatureInputs.add(featureInput);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.funcotator;

import org.broadinstitute.hellbender.engine.VariantWalkerBase;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.xsv.SimpleKeyXsvFuncotationFactory;
Expand Down Expand Up @@ -27,7 +28,7 @@ public class FuncotatorArgumentDefinitions {
// ------------------------------------------------------------
// Definitions for optional arguments:

public static final String IGNORE_FILTERED_VARIANTS_LONG_NAME = "ignore-filtered-variants";
public static final String REMOVE_FILTERED_VARIANTS_LONG_NAME = "remove-filtered-variants";

public static final String TRANSCRIPT_SELECTION_MODE_LONG_NAME = "transcript-selection-mode";
public static final TranscriptSelectionMode TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE = TranscriptSelectionMode.CANONICAL;
Expand All @@ -46,6 +47,9 @@ public class FuncotatorArgumentDefinitions {
public static final String HG19_REFERENCE_VERSION_STRING = "hg19";
public static final String HG38_REFERENCE_VERSION_STRING = "hg38";

public static final String LOOKAHEAD_CACHE_IN_BP_NAME = "lookahead-cache-bp";
public static final int LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE = VariantWalkerBase.FEATURE_CACHE_LOOKAHEAD;

// ------------------------------------------------------------
// Helper Types:

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
package org.broadinstitute.hellbender.tools.funcotator.dataSources;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.DataSourceFuncotationFactory;
Expand Down Expand Up @@ -700,4 +704,28 @@ else if ( !Files.isReadable(sourceFilePath) ) {
" - " + field + " is not readable: " + sourceFilePath);
}
}

/**
* Sort by putting gencode datasources first and then non-gencode in alphabetical order.
*
* Note that this must match how the datasources are being rendered in the {@link Funcotator#apply(VariantContext, ReadsContext, ReferenceContext, FeatureContext)}
* @param f1 datasource to compare. Never {@code null}
* @param f2 datasource to compare. Never {@code null}
* @return -1 if f1 should be put first, 1 if f2 should be put first.
*/
public static int datasourceComparator(final DataSourceFuncotationFactory f1, final DataSourceFuncotationFactory f2) {
Utils.nonNull(f1);
Utils.nonNull(f2);
final boolean isF1Gencode = f1.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE);
final boolean isF2Gencode = f2.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE);
if (isF1Gencode == isF2Gencode) {
return f1.getInfoString().compareTo(f2.getInfoString());
} else {
if (isF1Gencode) {
return -1;
} else {
return 1;
}
}
}
}
Loading

0 comments on commit c8f53cd

Please sign in to comment.