-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize some Mutect-related tools #5073
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,16 +2,14 @@ | |
|
||
import htsjdk.variant.variantcontext.VariantContext; | ||
import htsjdk.variant.vcf.VCFConstants; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.BetaFeature; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup; | ||
import org.broadinstitute.hellbender.engine.FeatureContext; | ||
import org.broadinstitute.hellbender.engine.MultiVariantWalker; | ||
import org.broadinstitute.hellbender.engine.ReadsContext; | ||
import org.broadinstitute.hellbender.engine.ReferenceContext; | ||
import org.broadinstitute.hellbender.engine.*; | ||
import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter; | ||
import org.broadinstitute.hellbender.engine.filters.ReadFilter; | ||
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; | ||
|
@@ -55,17 +53,31 @@ | |
* gatk GetPileupSummaries \ | ||
* -I tumor.bam \ | ||
* -V common_biallelic.vcf.gz \ | ||
* -L common_biallelic.vcf.gz \ | ||
* -O pileups.table | ||
* </pre> | ||
* | ||
* <pre> | ||
* gatk GetPileupSummaries \ | ||
* -I normal.bam \ | ||
* -V common_biallelic.vcf.gz \ | ||
* -L chr1 \ | ||
* -L common_biallelic.vcf.gz \ | ||
* -O pileups.table | ||
* </pre> | ||
* | ||
* Although the sites (-L) and variants (-V) resources will often be identical, this need not be the case. For example, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typically these files would only be different if you were trying to use a subset of the variants? I.e. -L was a subset of -V? If so, can you add that to the doc? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
* <pre> | ||
* gatk GetPileupSummaries \ | ||
* -I normal.bam \ | ||
* -V gnomad.vcf.gz \ | ||
* -L common_snps.interval_list \ | ||
* -O pileups.table | ||
* </pre> | ||
* attempts to get pileups at a list of common snps and emits output for those sites that are present in gnomAD, using the | ||
* allele frequencies from gnomAD. Note that the sites may be a subset of the variants, the variants may be a subset of the sites, | ||
* or they may overlap partially. In all cases pileup summaries are emitted for the overlap and nowhere else. The most common use | ||
* case in which sites and variants differ is when the variants resources is a large file and the sites is an interval list subset from that file. | ||
* | ||
* <p> | ||
* GetPileupSummaries tabulates results into six columns as shown below. | ||
* The alt_count and allele_frequency correspond to the ALT allele in the germline resource. | ||
|
@@ -94,7 +106,7 @@ | |
programGroup = CoverageAnalysisProgramGroup.class) | ||
@BetaFeature | ||
@DocumentedFeature | ||
public class GetPileupSummaries extends MultiVariantWalker { | ||
public class GetPileupSummaries extends LocusWalker { | ||
|
||
public static final String MAX_SITE_AF_LONG_NAME = "maximum-population-allele-frequency"; | ||
public static final String MIN_SITE_AF_LONG_NAME = "minimum-population-allele-frequency"; | ||
|
@@ -112,6 +124,9 @@ public class GetPileupSummaries extends MultiVariantWalker { | |
doc="The output table", optional=false) | ||
private File outputTable; | ||
|
||
@Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME, shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, doc = "A VCF file containing variants and allele frequencies") | ||
public FeatureInput<VariantContext> variants; | ||
|
||
@Argument(fullName = MIN_SITE_AF_LONG_NAME, | ||
shortName = MIN_SITE_AF_SHORT_NAME, | ||
doc = "Minimum population allele frequency of sites to consider. A low value increases accuracy at the expense of speed.", optional = true) | ||
|
@@ -127,8 +142,6 @@ public class GetPileupSummaries extends MultiVariantWalker { | |
|
||
private final List<PileupSummary> pileupSummaries = new ArrayList<>(); | ||
|
||
private VariantContext lastVariant = null; | ||
|
||
private boolean sawVariantsWithoutAlleleFrequency = false; | ||
private boolean sawVariantsWithAlleleFrequency = false; | ||
|
||
|
@@ -142,6 +155,16 @@ public boolean requiresReference() { | |
return false; | ||
} | ||
|
||
@Override | ||
public boolean requiresIntervals() { | ||
return true; | ||
} | ||
|
||
@Override | ||
public boolean requiresFeatures() { | ||
return true; | ||
} | ||
|
||
@Override | ||
public List<ReadFilter> getDefaultReadFilters() { | ||
final List<ReadFilter> filters = new ArrayList<>(); | ||
|
@@ -160,24 +183,26 @@ public List<ReadFilter> getDefaultReadFilters() { | |
|
||
@Override | ||
public void onTraversalStart() { | ||
final boolean alleleFrequencyInHeader = getHeaderForVariants().getInfoHeaderLines().stream() | ||
final boolean alleleFrequencyInHeader = ((VCFHeader) getHeaderForFeatures(variants)).getInfoHeaderLines().stream() | ||
.anyMatch(line -> line.getID().equals(VCFConstants.ALLELE_FREQUENCY_KEY)); | ||
if (!alleleFrequencyInHeader) { | ||
throw new UserException.BadInput("Population vcf does not have an allele frequency (AF) info field in its header."); | ||
} | ||
} | ||
|
||
@Override | ||
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext ) { | ||
// if we input multiple sources of variants, ignore repeats | ||
if (lastVariant != null && vc.getStart() == lastVariant.getStart()) { | ||
public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) { | ||
final List<VariantContext> vcs = featureContext.getValues(variants); | ||
if (vcs.isEmpty()) { | ||
return; | ||
} else if ( vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc) ) { | ||
final ReadPileup pileup = GATKProtectedVariantContextUtils.getPileup(vc, readsContext) | ||
} | ||
final VariantContext vc = vcs.get(0); | ||
|
||
if ( vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc) ) { | ||
final ReadPileup pileup = alignmentContext.getBasePileup() | ||
.makeFilteredPileup(pe -> pe.getRead().getMappingQuality() >= minMappingQuality); | ||
pileupSummaries.add(new PileupSummary(vc, pileup)); | ||
} | ||
lastVariant = vc; | ||
} | ||
|
||
@Override | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@davidbenjamin You should make this same change to the
evalVariants
FeatureDataSource
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @cmnbroad, did it.