Optimize some Mutect-related tools #5073

Merged
merged 4 commits on Aug 2, 2018
8 changes: 5 additions & 3 deletions scripts/mutect2_wdl/mutect2.wdl
@@ -725,7 +725,7 @@ task CalculateContamination {
Int? mem

# Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem) then mem * 1000 else 7000
+ Int machine_mem = if defined(mem) then mem * 1000 else 3000
Int command_mem = machine_mem - 500

command {
@@ -734,11 +734,13 @@ task CalculateContamination {
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

if [[ -f "${normal_bam}" ]]; then
- gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O normal_pileups.table
+ gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
+     -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal_pileups.table
NORMAL_CMD="-matched normal_pileups.table"
fi

- gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O pileups.table
+ gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
+     -V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table
gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table --tumor-segmentation segments.table $NORMAL_CMD
}

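For orientation, here is roughly what the updated tumor pileup command renders to once the WDL expressions are substituted (file names are placeholders, not values from this PR; with the new defaults, command_mem = 3000 - 500 = 2500 MB):

# intervals supplied: --interval-set-rule INTERSECTION restricts traversal to sites that are
# both inside the user intervals and present in the contamination resource
gatk --java-options "-Xmx2500m" GetPileupSummaries -R ref.fasta -I tumor.bam \
    --interval-set-rule INTERSECTION -L intervals.interval_list \
    -V variants_for_contamination.vcf.gz -L variants_for_contamination.vcf.gz -O pileups.table

# intervals omitted: the optional WDL interpolation collapses to nothing, so only the
# contamination resource defines the traversal
gatk --java-options "-Xmx2500m" GetPileupSummaries -R ref.fasta -I tumor.bam \
    -V variants_for_contamination.vcf.gz -L variants_for_contamination.vcf.gz -O pileups.table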
8 changes: 5 additions & 3 deletions scripts/mutect2_wdl/mutect2_nio.wdl
@@ -698,7 +698,7 @@ task CalculateContamination {
Int? mem

# Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem) then mem * 1000 else 7000
+ Int machine_mem = if defined(mem) then mem * 1000 else 3000
Int command_mem = machine_mem - 500

command {
@@ -707,11 +707,13 @@ task CalculateContamination {
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

if [[ -f "${normal_bam}" ]]; then
- gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O normal_pileups.table
+ gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
+     -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal_pileups.table
NORMAL_CMD="-matched normal_pileups.table"
fi

- gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O pileups.table
+ gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
+     -V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table
gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table --tumor-segmentation segments.table $NORMAL_CMD
}

@@ -77,7 +77,7 @@ private void initializeTruthVariantsIfNecessary() {
}

if (truthVariants == null) {
- truthVariants = new FeatureDataSource<>(new FeatureInput<>(truthVariantsFile, "truth"), CACHE_LOOKAHEAD, VariantContext.class);
+ truthVariants = new FeatureDataSource<>(new FeatureInput<>(truthVariantsFile, "truth"), CACHE_LOOKAHEAD, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer);
Collaborator:
@davidbenjamin You should make this same change to the evalVariants FeatureDataSource.

Contributor Author:
Thanks @cmnbroad, did it.

}
}

@@ -101,7 +101,7 @@ protected final void onStartup() {
super.onStartup();

initializeTruthVariantsIfNecessary();
- evalVariants = new FeatureDataSource<>(new FeatureInput<>(evalVariantsFile, "eval"), CACHE_LOOKAHEAD, VariantContext.class);
+ evalVariants = new FeatureDataSource<>(new FeatureInput<>(evalVariantsFile, "eval"), CACHE_LOOKAHEAD, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer);

if ( hasUserSuppliedIntervals() ) {
truthVariants.setIntervalsForTraversal(userIntervals);
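As the review thread above notes, the same prefetch-buffer arguments are now threaded through both data sources in this walker. A minimal sketch of the resulting pattern, pieced together from the two hunks (member names as in the diff; the surrounding class is assumed):

// Both sources pass the tool's cloud prefetch buffer sizes to FeatureDataSource, so VCFs
// read over NIO (for example gs:// paths in the _nio workflow) are prefetched in large
// chunks rather than fetched piecemeal.
truthVariants = new FeatureDataSource<>(new FeatureInput<>(truthVariantsFile, "truth"),
        CACHE_LOOKAHEAD, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer);
evalVariants = new FeatureDataSource<>(new FeatureInput<>(evalVariantsFile, "eval"),
        CACHE_LOOKAHEAD, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer);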
@@ -2,16 +2,14 @@

import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
+ import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
- import org.broadinstitute.hellbender.engine.FeatureContext;
- import org.broadinstitute.hellbender.engine.MultiVariantWalker;
- import org.broadinstitute.hellbender.engine.ReadsContext;
- import org.broadinstitute.hellbender.engine.ReferenceContext;
+ import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
@@ -55,17 +53,31 @@
* gatk GetPileupSummaries \
* -I tumor.bam \
* -V common_biallelic.vcf.gz \
+ *   -L common_biallelic.vcf.gz \
* -O pileups.table
* </pre>
*
* <pre>
* gatk GetPileupSummaries \
* -I normal.bam \
* -V common_biallelic.vcf.gz \
- *   -L chr1 \
+ *   -L common_biallelic.vcf.gz \
* -O pileups.table
* </pre>
*
+ * Although the sites (-L) and variants (-V) resources will often be identical, this need not be the case. For example,
Contributor:
Typically these files would only be different if you were trying to use a subset of the variants? I.e. -L was a subset of -V? If so, can you add that to the doc?

Contributor Author:
Done.

+ * <pre>
+ * gatk GetPileupSummaries \
+ *   -I normal.bam \
+ *   -V gnomad.vcf.gz \
+ *   -L common_snps.interval_list \
+ *   -O pileups.table
+ * </pre>
+ * attempts to get pileups at a list of common snps and emits output for those sites that are present in gnomAD, using the
+ * allele frequencies from gnomAD. Note that the sites may be a subset of the variants, the variants may be a subset of the sites,
+ * or they may overlap partially. In all cases pileup summaries are emitted for the overlap and nowhere else. The most common use
+ * case in which sites and variants differ is when the variants resource is a large file and the sites are an interval list subset from that file.
+ *
* <p>
* GetPileupSummaries tabulates results into six columns as shown below.
* The alt_count and allele_frequency correspond to the ALT allele in the germline resource.
@@ -94,7 +106,7 @@
programGroup = CoverageAnalysisProgramGroup.class)
@BetaFeature
@DocumentedFeature
- public class GetPileupSummaries extends MultiVariantWalker {
+ public class GetPileupSummaries extends LocusWalker {

public static final String MAX_SITE_AF_LONG_NAME = "maximum-population-allele-frequency";
public static final String MIN_SITE_AF_LONG_NAME = "minimum-population-allele-frequency";
@@ -112,6 +124,9 @@ public class GetPileupSummaries extends MultiVariantWalker {
doc="The output table", optional=false)
private File outputTable;

+ @Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME, shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, doc = "A VCF file containing variants and allele frequencies")
+ public FeatureInput<VariantContext> variants;
+
@Argument(fullName = MIN_SITE_AF_LONG_NAME,
shortName = MIN_SITE_AF_SHORT_NAME,
doc = "Minimum population allele frequency of sites to consider. A low value increases accuracy at the expense of speed.", optional = true)
@@ -127,8 +142,6 @@ public class GetPileupSummaries extends MultiVariantWalker {

private final List<PileupSummary> pileupSummaries = new ArrayList<>();

- private VariantContext lastVariant = null;
-
private boolean sawVariantsWithoutAlleleFrequency = false;
private boolean sawVariantsWithAlleleFrequency = false;

@@ -142,6 +155,16 @@ public boolean requiresReference() {
return false;
}

+ @Override
+ public boolean requiresIntervals() {
+ return true;
+ }
+
+ @Override
+ public boolean requiresFeatures() {
+ return true;
+ }
+
@Override
public List<ReadFilter> getDefaultReadFilters() {
final List<ReadFilter> filters = new ArrayList<>();
@@ -160,24 +183,26 @@ public List<ReadFilter> getDefaultReadFilters() {

@Override
public void onTraversalStart() {
- final boolean alleleFrequencyInHeader = getHeaderForVariants().getInfoHeaderLines().stream()
+ final boolean alleleFrequencyInHeader = ((VCFHeader) getHeaderForFeatures(variants)).getInfoHeaderLines().stream()
.anyMatch(line -> line.getID().equals(VCFConstants.ALLELE_FREQUENCY_KEY));
if (!alleleFrequencyInHeader) {
throw new UserException.BadInput("Population vcf does not have an allele frequency (AF) info field in its header.");
}
}

@Override
- public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext ) {
- // if we input multiple sources of variants, ignore repeats
- if (lastVariant != null && vc.getStart() == lastVariant.getStart()) {
+ public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) {
+ final List<VariantContext> vcs = featureContext.getValues(variants);
+ if (vcs.isEmpty()) {
return;
- } else if ( vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc) ) {
- final ReadPileup pileup = GATKProtectedVariantContextUtils.getPileup(vc, readsContext)
+ }
+ final VariantContext vc = vcs.get(0);
+
+ if ( vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc) ) {
+ final ReadPileup pileup = alignmentContext.getBasePileup()
.makeFilteredPileup(pe -> pe.getRead().getMappingQuality() >= minMappingQuality);
pileupSummaries.add(new PileupSummary(vc, pileup));
}
- lastVariant = vc;
}

@Override
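Pieced together from the hunks above, the new LocusWalker-style traversal body reads roughly as follows; the engine now walks the -L sites and hands each locus's pileup to the tool, so there is no per-variant read query and no need to deduplicate repeated records with lastVariant. This is a consolidated sketch of the diff, not new code:

@Override
public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) {
    // Look up the germline resource at this locus; skip loci with no overlapping variant.
    final List<VariantContext> vcs = featureContext.getValues(variants);
    if (vcs.isEmpty()) {
        return;
    }
    final VariantContext vc = vcs.get(0);

    // Only biallelic SNPs whose population allele frequency is in the configured range contribute a row.
    if (vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc)) {
        final ReadPileup pileup = alignmentContext.getBasePileup()
                .makeFilteredPileup(pe -> pe.getRead().getMappingQuality() >= minMappingQuality);
        pileupSummaries.add(new PileupSummary(vc, pileup));
    }
}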
@@ -22,6 +22,7 @@ public void test() {
final String[] args = {
"-I", NA12878.getAbsolutePath(),
"-V", thousandGenomes.getAbsolutePath(),
"-L", thousandGenomes.getAbsolutePath(),
"-O", output.getAbsolutePath(),
"-" + GetPileupSummaries.MAX_SITE_AF_SHORT_NAME, "0.9"
};
@@ -66,6 +67,7 @@ public void testNoAFFieldInHeader() {
final String[] args = {
"-I", NA12878.getAbsolutePath(),
"-V", vcfWithoutAF.getAbsolutePath(),
"-L", vcfWithoutAF.getAbsolutePath(),
"-O", output.getAbsolutePath(),
};
runCommandLine(args);