Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve VCF fields in MAF output #4872

Merged
merged 7 commits into from
Jun 15, 2018
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.funcotator;

import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;

import java.util.LinkedHashSet;
import java.util.Map;
Expand Down Expand Up @@ -77,4 +78,10 @@ default String serializeToVcfString(final String manualAnnotationString) {
* @return Return whether the field exists in this {@link Funcotation}.
*/
boolean hasField(final String fieldName);

/**
 * Retrieve the metadata associated with this {@link Funcotation}.
 *
 * @return Metadata for this {@link Funcotation}. Never {@code null}. All fields in {@code getFieldNames} should be
 * represented.
 */
FuncotationMetadata getMetadata();
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public class FuncotationMap {
/** Standard Logger. */
protected static final Logger logger = LogManager.getLogger(FuncotationMap.class);


/**
 * Backing store: each key maps to the set of {@link Funcotation}s registered under it.
 * Both the map and the per-key sets are "Linked" collections, so iteration follows insertion order.
 * NOTE(review): keys are presumably transcript IDs (per the field name) -- confirm against the add methods.
 */
final private Map<String, LinkedHashSet<Funcotation>> txToFuncotations = new LinkedHashMap<>();

/** Private so that instances cannot be constructed directly from outside this class. */
private FuncotationMap() {}
Expand Down Expand Up @@ -218,8 +219,7 @@ public static FuncotationMap createAsAllTableFuncotationsFromVcf(final String tr
.toMap(i -> funcotationKeys[i], i-> values[i]));

final List<String> valuesAsList = Arrays.asList(funcotationKeys).stream().map(k -> simpleNameValuePairs.get(k)).collect(Collectors.toList());
result.add(simpleNameValuePairs.getOrDefault(transcriptFieldName, NO_TRANSCRIPT_AVAILABLE_KEY), new TableFuncotation(Arrays.asList(funcotationKeys), valuesAsList, altAllele, datasourceName));

result.add(simpleNameValuePairs.getOrDefault(transcriptFieldName, NO_TRANSCRIPT_AVAILABLE_KEY), TableFuncotation.create(Arrays.asList(funcotationKeys), valuesAsList, altAllele, datasourceName, null));
}
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.mafOutput.MafOutputRenderer;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;
import org.broadinstitute.hellbender.tools.funcotator.metadata.VcfFuncotationMetadata;
import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.codecs.gencode.GencodeGtfFeature;
Expand Down Expand Up @@ -228,19 +230,19 @@ public class Funcotator extends VariantWalker {
fullName = FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME,
doc = "The version of the Human Genome reference to use (e.g. hg19, hg38, etc.). This will correspond to a sub-folder of each data source corresponding to that data source for the given reference."
)
protected String referenceVersion;
private String referenceVersion;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the change to private for the input args? (I have no objection.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got rid of IntelliJ warnings.

No action.


@Argument(
fullName = FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME,
doc = "The path to a data source folder for Funcotator. May be specified more than once to handle multiple data source folders."
)
protected List<String> dataSourceDirectories;
private List<String> dataSourceDirectories;

@Argument(
fullName = FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME,
doc = "The output file format. Either VCF or MAF"
)
protected FuncotatorArgumentDefinitions.OutputFormatType outputFormatType;
private FuncotatorArgumentDefinitions.OutputFormatType outputFormatType;

//-----------------------------------------------------
// Optional args:
Expand All @@ -250,57 +252,57 @@ public class Funcotator extends VariantWalker {
optional = true,
doc = "Ignore/drop variants that have been filtered in the input. These variants will not appear in the output file."
)
protected boolean removeFilteredVariants = false;
private boolean removeFilteredVariants = false;

@Argument(
fullName = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME,
optional = true,
doc = "Method of detailed transcript selection. This will select the transcript for detailed annotation (CANONICAL or BEST_EFFECT)."
doc = "Method of detailed transcript selection. This will select the transcript for detailed annotation (CANONICAL, ALL, or BEST_EFFECT)."
)
protected TranscriptSelectionMode transcriptSelectionMode = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE;
private TranscriptSelectionMode transcriptSelectionMode = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE;

@Argument(
fullName = FuncotatorArgumentDefinitions.TRANSCRIPT_LIST_LONG_NAME,
optional = true,
doc = "File to use as a list of transcripts (one transcript ID per line, version numbers are ignored) OR A set of transcript IDs to use for annotation to override selected transcript."
)
protected Set<String> userTranscriptIdSet = new HashSet<>();
private Set<String> userTranscriptIdSet = new HashSet<>();

@Argument(
fullName = FuncotatorArgumentDefinitions.ANNOTATION_DEFAULTS_LONG_NAME,
optional = true,
doc = "Annotations to include in all annotated variants if the annotation is not specified in the data sources (in the format <ANNOTATION>:<VALUE>). This will add the specified annotation to every annotated variant if it is not already present."
)
protected List<String> annotationDefaults = new ArrayList<>();
private List<String> annotationDefaults = new ArrayList<>();

@Argument(
fullName = FuncotatorArgumentDefinitions.ANNOTATION_OVERRIDES_LONG_NAME,
optional = true,
doc = "Override values for annotations (in the format <ANNOTATION>:<VALUE>). Replaces existing annotations of the given name with given values."
)
protected List<String> annotationOverrides = new ArrayList<>();
private List<String> annotationOverrides = new ArrayList<>();

@Argument(
fullName = FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME,
optional = true,
doc = "Allow for the HG19 Reference version of GENCODE (or any other datasource) to match with B37 Contig names. (May create erroneous annotations in some contigs where B37 != HG19)."
)
protected boolean allowHg19ContigNamesWithB37 = true;
private boolean allowHg19ContigNamesWithB37 = true;

@Argument(
fullName = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_NAME,
optional = true,
minValue = 0,
doc = "Number of base-pairs to cache when querying variants."
)
protected int lookaheadFeatureCachingInBp = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE;
private int lookaheadFeatureCachingInBp = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE;

@Argument(
fullName = FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_OVERRIDE_LONG_NAME,
optional = true,
doc = "(Advanced/Use at your own risk) Use in conjunction with allow hg19 contig names with b37. If you also select this flag, no check that your input reference is b37 is actually performed. Otherwise, ignored. Typically, this option is useful in integration tests (written by devs) only."
)
protected boolean allowHg19ContigNamesWithB37Lenient = false;
private boolean allowHg19ContigNamesWithB37Lenient = false;

//==================================================================================================================

Expand All @@ -311,6 +313,8 @@ public class Funcotator extends VariantWalker {

private boolean inputReferenceIsB37 = false;

private FuncotationMetadata inputMetadata;

//==================================================================================================================

@Override
Expand Down Expand Up @@ -344,6 +348,9 @@ public void onTraversalStart() {
// Sort our data source factories to ensure they're always in the same order: gencode datasources first
dataSourceFactories.sort(DataSourceUtils::datasourceComparator);

// Create the metadata directly from the input.
inputMetadata = VcfFuncotationMetadata.create(new ArrayList<>(getHeaderForVariants().getInfoHeaderLines()));

// Determine which annotations are accounted for (by the funcotation factories) and which are not.
final LinkedHashMap<String, String> unaccountedForDefaultAnnotations = getUnaccountedForAnnotations( dataSourceFactories, annotationDefaultsMap );
final LinkedHashMap<String, String> unaccountedForOverrideAnnotations = getUnaccountedForAnnotations( dataSourceFactories, annotationOverridesMap );
Expand Down Expand Up @@ -519,6 +526,13 @@ private void enqueueAndHandleVariant(final VariantContext variant, final Referen
}
}

// Create the funcotations for the input and add to all txID mappings.
final List<String> txIds = funcotationMap.getTranscriptList();

for (final String txId: txIds) {
funcotationMap.add(txId, FuncotatorUtils.createFuncotations(variant, inputMetadata, FuncotatorConstants.DATASOURCE_NAME_FOR_INPUT_VCFS));
}

// At this point there is only one transcript ID in the funcotation map if canonical or best effect are selected
outputRenderer.write(variant, funcotationMap);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package org.broadinstitute.hellbender.tools.funcotator;

/**
 * Holder for constant values used by Funcotator.
 */
public class FuncotatorConstants {
    /**
     * Datasource name to use for Funcotations created from input variants from a VCF.
     *
     * Declared {@code final} so that the value cannot be reassigned at runtime.
     */
    public static final String DATASOURCE_NAME_FOR_INPUT_VCFS = "INPUT_VCF";

}
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.TableFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;
import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -2022,7 +2024,7 @@ public static String[] extractFuncotatorKeysFromHeaderDescription(final String f
*/
public static String sanitizeFuncotationForVcf(final String individualFuncotation) {
Utils.nonNull(individualFuncotation);
return StringUtils.replaceEach(individualFuncotation, new String[]{",", ";", "=", "\t", "|"}, new String[]{"_%2C_", "_%3B_", "_%3D_", "_%09_", "_%7C_"});
return StringUtils.replaceEach(individualFuncotation, new String[]{",", ";", "=", "\t", "|", " "}, new String[]{"_%2C_", "_%3B_", "_%3D_", "_%09_", "_%7C_", "_%20_"});
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the VCF files are TSVs, do we need to escape a space in the values?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was getting errors when I left it in...

No action.

}

/**
Expand Down Expand Up @@ -2075,5 +2077,44 @@ public static boolean isGencodeFuncotation(final Funcotation f) {
public static boolean areAnyGencodeFuncotation(final List<Funcotation> funcotations) {
    // Stop at the first element that FuncotatorUtils.isGencodeFuncotation accepts.
    for (final Funcotation candidate : funcotations) {
        if (FuncotatorUtils.isGencodeFuncotation(candidate)) {
            return true;
        }
    }
    // Empty list, or no element matched.
    return false;
}

/**
* Create funcotations (one for each alt allele) corresponding to the given variant context.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make this more descriptive? I understand what this is doing but the documentation here is pretty sparse.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

*
* Assumes that the fields in the variant context are named exactly the same as what is in the metadata. Additionally, the
* metadata must include all variant attributes.
*
* @param vc The variant context to derive funcotations. Never {@code null}
* @param metadata Existing metadata that matches the variant context info field attributes exactly. Never {@code null}
* @param datasourceName Name to use as the datasource in the funcotations. Never {@code null}
* @return A list of funcotations (one per alternate allele) based on the variant context (INFO) attributes. Empty only when the variant context has no alternate alleles. Never {@code null}
*/
public static List<Funcotation> createFuncotations(final VariantContext vc, final FuncotationMetadata metadata, final String datasourceName) {

    Utils.nonNull(vc);
    Utils.nonNull(metadata);
    Utils.nonNull(datasourceName);

    // Field names are the IDs of the metadata header lines, in the order the metadata reports them.
    final List<String> allFields = metadata.retrieveAllHeaderInfo().stream().map(h -> h.getID()).collect(Collectors.toList());

    // Same names as a set, so the membership test below is a hash lookup instead of a list scan per attribute.
    final Set<String> knownFields = allFields.stream().collect(Collectors.toSet());

    // Reject the variant if it carries any attribute that the metadata does not describe.
    final Set<String> attributesNotInMetadata = vc.getAttributes().keySet().stream().filter(k -> !knownFields.contains(k)).collect(Collectors.toSet());
    if (!attributesNotInMetadata.isEmpty()) {
        throw new UserException.MalformedFile("Not all attributes in the variant context appear in the metadata: " + String.join(", ", attributesNotInMetadata) + " .... Please add these attributes to the input metadata (e.g. VCF Header).");
    }

    // We must have a value for everything in the metadata; attributes missing from the variant become "".
    // The values do not depend on the allele, so they are read once rather than once per alt allele.
    final List<String> funcotationFieldValues = new ArrayList<>(allFields.size());
    for (final String funcotationFieldName : allFields) {
        funcotationFieldValues.add(vc.getAttributeAsString(funcotationFieldName, ""));
    }

    // One funcotation per alternate allele. Each gets its own copy of the value list so that
    // the resulting funcotations never share a mutable list.
    final List<Funcotation> result = new ArrayList<>();
    for (final Allele allele : vc.getAlternateAlleles()) {
        result.add(TableFuncotation.create(allFields, new ArrayList<>(funcotationFieldValues), allele, datasourceName, metadata));
    }

    return result;
}
}

Loading