Improvements to Mutect2's Permutect training data mode #8663

Merged: 4 commits, Jan 26, 2024
Changes from 2 commits

File: FeaturizedReadSets.java
@@ -1,26 +1,22 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;
package org.broadinstitute.hellbender.tools.walkers.mutect;


import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality;
import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;
import java.util.stream.Collectors;
@@ -31,69 +27,23 @@
* [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is
* 1,2,3,4|5,6,7,8
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
summary="Featurized read sets for Mutect3 training data")
public class FeaturizedReadSets implements JumboGenotypeAnnotation {
public static final int DEFAULT_BASE_QUALITY = 25;

private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE;
public class FeaturizedReadSets {
private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class);

public static final int FEATURES_PER_READ = 11;
public static final int DEFAULT_BASE_QUALITY = 25;

private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);

// downsample ref reads to this count if needed
private final int maxRefCount;

public FeaturizedReadSets(final int maxRefCount) {
this.maxRefCount = maxRefCount;
}

public FeaturizedReadSets() {
this(DEFAULT_MAX_REF_COUNT);
}

@Override
public void annotate(final ReferenceContext ref,
final FeatureContext features,
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final AlleleLikelihoods<Fragment, Allele> fragmentLikelihoods,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods) {
Utils.nonNull(vc);
Utils.nonNull(g);
Utils.nonNull(gb);

if ( likelihoods == null) {
return;
}

final List<List<List<Integer>>> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()),
likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE);

// flatten twice: over all reads supporting each allele and over all alleles
// we can partition by allele with the countsInAlleleOrder annotation
// and by read using the constant feature vector size
final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream()
.flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream))
.mapToInt(n -> n)
.toArray();

final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray();

gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder);
gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder);
}
private FeaturizedReadSets() { }

public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
final Collection<String> samples,
final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
final int refDownsample,
final int altDownsample) {
return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap());
final int altDownsample,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode);
}

// returns Lists (in allele order) of lists of read vectors supporting each allele
@@ -103,7 +53,8 @@ public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
final int refDownsample,
final int altDownsample,
final Map<Allele, Integer> altDownsampleMap) {
final Map<Allele, Integer> altDownsampleMap,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final Map<Allele, List<GATKRead>> readsByAllele = likelihoods.alleles().stream()
.collect(Collectors.toMap(a -> a, a -> new ArrayList<>()));

@@ -126,17 +77,14 @@ public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
.forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele)));

return vc.getAlleles().stream()
.map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList()))
.map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode)).collect(Collectors.toList()))
.collect(Collectors.toList());
}


@Override
public List<String> getKeyNames() {
return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY);
}

private static List<Integer> featurize(final GATKRead read, final VariantContext vc, final Map<GATKRead, Haplotype> bestHaplotypes) {
private static List<Integer> featurize(final GATKRead read, final VariantContext vc,
final Map<GATKRead, Haplotype> bestHaplotypes,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final List<Integer> result = new ArrayList<>();
result.add(read.getMappingQuality());
result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY));
@@ -151,23 +99,41 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext

result.add(Math.abs(read.getFragmentLength()));

// distances from ends of fragment
final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
result.add(vc.getStart() - fragmentStart);
result.add(fragmentEnd - vc.getEnd());
if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) {
// distances from ends of fragment
final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
result.add(vc.getStart() - fragmentStart);
result.add(fragmentEnd - vc.getEnd());
}

// Ultima-specific read tags
if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) {
result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s
result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float between 0 and 1, so we multiply by 1000 and truncate to an integer
}

// mismatches versus best haplotype
final Haplotype haplotype = bestHaplotypes.get(read);
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);
Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features");

// TODO: fix this
// I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina
if (!bestHaplotypes.containsKey(read)) {
logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(),
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);
} else {
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);
}
Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");

return result;
}
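The removed annotate() method above documents a flatten-and-partition convention: per-read feature vectors are concatenated over reads and then over alleles, while the parallel counts array (FEATURIZED_READ_SETS_COUNTS_KEY) records how many reads support each allele, so a consumer can recover the nested structure from the fixed feature-vector length. A minimal decoding sketch, assuming that convention; the class and method names are illustrative and not part of the PR:

import java.util.ArrayList;
import java.util.List;

public class ReadVectorDecoding {
    // Inverse of the flattening in the removed annotate() method:
    // 'flattened' holds every feature, with reads concatenated in allele order;
    // 'countsInAlleleOrder' gives the number of reads supporting each allele;
    // 'numReadFeatures' is the fixed per-read vector length (11 for both modes in this PR).
    static List<List<List<Integer>>> decodeReadVectors(final int[] flattened,
                                                       final int[] countsInAlleleOrder,
                                                       final int numReadFeatures) {
        final List<List<List<Integer>>> readVectorsByAllele = new ArrayList<>();
        int offset = 0;
        for (final int readCount : countsInAlleleOrder) {
            final List<List<Integer>> readsForAllele = new ArrayList<>();
            for (int read = 0; read < readCount; read++) {
                final List<Integer> features = new ArrayList<>(numReadFeatures);
                for (int i = 0; i < numReadFeatures; i++) {
                    features.add(flattened[offset++]);
                }
                readsForAllele.add(features);
            }
            readVectorsByAllele.add(readsForAllele);
        }
        return readVectorsByAllele;
    }
}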
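The Ultima branch of featurize() packs the float-valued rq tag into the integer feature vector by scaling. Note that the cast in the diff truncates toward zero rather than rounding, so for example 0.9999f encodes to 999. A round-trip sketch under the tag semantics stated in the code comments (rq between 0 and 1); both helper names are illustrative, not part of the PR:

public class UltimaTagEncoding {
    // Matches the diff: (int) (1000 * rq). The cast truncates toward zero.
    static int encodeRq(final float rq) {
        return (int) (1000 * rq);
    }

    // Approximate inverse for a downstream consumer: recovers rq to three
    // decimal places.
    static double decodeRq(final int encoded) {
        return encoded / 1000.0;
    }
}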
File: M2ArgumentCollection.java

@@ -81,6 +81,7 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample";
public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset";
public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth";
public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode";

public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10;
public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20;
@@ -205,6 +206,25 @@ public double getDefaultAlleleFrequency()
@Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection")
public File mutect3Dataset;

@Advanced
@Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. Depends on sequencing technology.")
public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;

public enum Mutect3DatasetMode {
ILLUMINA(11),
Contributor:
I get a little nervous when you hardcode the sequencing technology, especially when both modes are given the same value. Why not just make this a default? What is the justification for it? (We do not need to block this PR over this.)

Contributor Author:
Since Ultima uses unpaired reads, it can't be encoded in the same way as Illumina data. It is only a coincidence that the read tensor dimension is 11 in both cases.

Contributor Author:
(And note that this is an Advanced argument.)
ULTIMA(11);

private final int numReadFeatures;

Mutect3DatasetMode(final int numReadFeatures) {
this.numReadFeatures = numReadFeatures;
}

public int getNumReadFeatures() {
return numReadFeatures;
}
}

/**
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
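Putting the new mode argument together with the existing Mutect3 data-collection flags, a hypothetical invocation for generating an Ultima-mode training dataset might look like the following; the file names are placeholders, and only the --mutect3-* flags are defined in this PR:

gatk Mutect2 \
    -R reference.fasta \
    -I tumor.bam \
    --mutect3-dataset training-data.txt \
    --mutect3-dataset-mode ULTIMA \
    --mutect3-training-truth truth.vcf \
    -O output.vcf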