Skip to content

Commit

Permalink
Using cigar complexity to break ties in uninformative reads' best hap…
Browse files Browse the repository at this point in the history
…lotypes (#5359)
  • Loading branch information
davidbenjamin authored Nov 28, 2018
1 parent 349d423 commit 7226ad9
Show file tree
Hide file tree
Showing 13 changed files with 110 additions and 52 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.reference.ReferenceSequenceFile;
Expand Down Expand Up @@ -30,6 +31,7 @@

import java.io.File;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
Expand All @@ -41,6 +43,12 @@ public final class AssemblyBasedCallerUtils {
public static final String SUPPORTED_ALLELES_TAG="XA";
public static final String CALLABLE_REGION_TAG = "CR";
public static final String ALIGNMENT_REGION_TAG = "AR";
public static final Function<Haplotype, Double> HAPLOTYPE_ALIGNMENT_TIEBREAKING_PRIORITY = h -> {
final Cigar cigar = h.getCigar();
final int referenceTerm = (h.isReference() ? 1 : 0);
final int cigarTerm = cigar == null ? 0 : (1 - cigar.numCigarElements());
return (double) referenceTerm + cigarTerm;
};

/**
* Returns a map with the original read as a key and the realigned read as the value.
Expand All @@ -50,7 +58,7 @@ public final class AssemblyBasedCallerUtils {
* @return never {@code null}
*/
public static Map<GATKRead, GATKRead> realignReadsToTheirBestHaplotype(final ReadLikelihoods<Haplotype> originalReadLikelihoods, final Haplotype refHaplotype, final Locatable paddedReferenceLoc, final SmithWatermanAligner aligner) {
final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestAlleles = originalReadLikelihoods.bestAllelesBreakingTies();
final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestAlleles = originalReadLikelihoods.bestAllelesBreakingTies(HAPLOTYPE_ALIGNMENT_TIEBREAKING_PRIORITY);
final Map<GATKRead, GATKRead> result = new HashMap<>(bestAlleles.size());

for (final ReadLikelihoods<Haplotype>.BestAllele bestAllele : bestAlleles) {
Expand Down Expand Up @@ -291,7 +299,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
public static void annotateReadLikelihoodsWithRegions(final ReadLikelihoods<Haplotype> likelihoods,
final Locatable callableRegion) {
//assign alignment regions to each read
final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestHaplotypes = likelihoods.bestAllelesBreakingTies();
final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestHaplotypes = likelihoods.bestAllelesBreakingTies(HAPLOTYPE_ALIGNMENT_TIEBREAKING_PRIORITY);
for (final ReadLikelihoods<Haplotype>.BestAllele bestHaplotype : bestHaplotypes) {
final GATKRead read = bestHaplotype.read;
final Haplotype haplotype = bestHaplotype.allele;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand Down Expand Up @@ -366,7 +367,7 @@ public void normalizeLikelihoods(final double maximumLikelihoodDifferenceCap) {
private void normalizeLikelihoodsPerRead(final double maximumBestAltLikelihoodDifference,
final double[][] sampleValues, final int sampleIndex, final int readIndex) {

final BestAllele bestAlternativeAllele = searchBestAllele(sampleIndex,readIndex,false, false);
final BestAllele bestAlternativeAllele = searchBestAllele(sampleIndex,readIndex,false);

final double worstLikelihoodCap = bestAlternativeAllele.likelihood + maximumBestAltLikelihoodDifference;

Expand Down Expand Up @@ -421,11 +422,12 @@ public List<A> alleles() {
* @param sampleIndex including sample index.
* @param readIndex target read index.
*
* @param useReferenceIfUninformative
* @param priorities An array of allele priorities (higher values have higher priority) to be used, if present, to break ties for
* uninformative likelihoods, in which case the read is assigned to the allele with the higher score.
* @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null}
* if non-could be found.
*/
private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference, boolean useReferenceIfUninformative) {
private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference, Optional<double[]> priorities) {
final int alleleCount = alleles.numberOfAlleles();
if (alleleCount == 0 || (alleleCount == 1 && referenceAlleleIndex == 0 && !canBeReference)) {
return new BestAllele(sampleIndex, readIndex, -1, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
Expand All @@ -434,33 +436,56 @@ private BestAllele searchBestAllele(final int sampleIndex, final int readIndex,
final double[][] sampleValues = valuesBySampleIndex[sampleIndex];
int bestAlleleIndex = canBeReference || referenceAlleleIndex != 0 ? 0 : 1;

int secondBestIndex = 0;
double bestLikelihood = sampleValues[bestAlleleIndex][readIndex];
double secondBestLikelihood = Double.NEGATIVE_INFINITY;

for (int a = bestAlleleIndex + 1; a < alleleCount; a++) {
if (!canBeReference && referenceAlleleIndex == a) {
continue;
}
final double candidateLikelihood = sampleValues[a][readIndex];
if (candidateLikelihood > bestLikelihood) {
secondBestIndex = bestAlleleIndex;
bestAlleleIndex = a;
secondBestLikelihood = bestLikelihood;
bestLikelihood = candidateLikelihood;
} else if (candidateLikelihood > secondBestLikelihood) {
secondBestIndex = a;
secondBestLikelihood = candidateLikelihood;
}
}

// if our read is not informative against the ref we set the ref as the best allele. This is so that bamouts don't
// spuriously show deletions in ref reads that end in STRs
if (useReferenceIfUninformative && canBeReference && referenceAlleleIndex != MISSING_REF && bestAlleleIndex != referenceAlleleIndex) {
final double referenceLikelihood = sampleValues[referenceAlleleIndex][readIndex];
if ( bestLikelihood - referenceLikelihood < BestAllele.INFORMATIVE_THRESHOLD ) {
secondBestLikelihood = bestLikelihood;
bestAlleleIndex = referenceAlleleIndex;
bestLikelihood = referenceLikelihood;
if (priorities.isPresent() && bestLikelihood - secondBestLikelihood < BestAllele.INFORMATIVE_THRESHOLD) {
double bestPriority = priorities.get()[bestAlleleIndex];
double secondBestPriority = priorities.get()[secondBestIndex];
for (int a = 0; a < alleleCount; a++) {
final double candidateLikelihood = sampleValues[a][readIndex];
if (a == bestAlleleIndex || (!canBeReference && a == referenceAlleleIndex) || bestLikelihood - candidateLikelihood > BestAllele.INFORMATIVE_THRESHOLD) {
continue;
}
final double candidatePriority = priorities.get()[a];

if (candidatePriority > bestPriority) {
secondBestIndex = bestAlleleIndex;
bestAlleleIndex = a;
secondBestPriority = bestPriority;
bestPriority = candidatePriority;
} else if (candidatePriority > secondBestPriority) {
secondBestIndex = a;
secondBestPriority = candidatePriority;
}
}
}
return new BestAllele(sampleIndex,readIndex,bestAlleleIndex,bestLikelihood,secondBestLikelihood);

bestLikelihood = sampleValues[bestAlleleIndex][readIndex];
secondBestLikelihood = secondBestIndex != bestAlleleIndex ? sampleValues[secondBestIndex][readIndex] : Double.NEGATIVE_INFINITY;

return new BestAllele(sampleIndex, readIndex, bestAlleleIndex, bestLikelihood, secondBestLikelihood);
}

private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference) {
return searchBestAllele(sampleIndex, readIndex, canBeReference, Optional.empty());
}

public void changeReads(final Map<GATKRead, GATKRead> readRealignments) {
Expand Down Expand Up @@ -903,7 +928,7 @@ public void updateNonRefAlleleLikelihoods(final AlleleList<A> allelesToConsider)
final double[][] sampleValues = valuesBySampleIndex[s];
final int readCount = sampleValues[0].length;
for (int r = 0; r < readCount; r++) {
final BestAllele bestAllele = searchBestAllele(s, r, true, false);
final BestAllele bestAllele = searchBestAllele(s, r, true);
int numberOfQualifiedAlleleLikelihoods = 0;
for (int i = 0; i < alleleCount; i++) {
final double alleleLikelihood = sampleValues[i][r];
Expand Down Expand Up @@ -960,25 +985,40 @@ public void contaminationDownsampling(final Map<String, Double> perSampleDownsam
/**
* Returns the collection of best allele estimates for the reads based on the read-likelihoods.
* "Ties" where the ref likelihood is within {@code ReadLikelihoods.INFORMATIVE_THRESHOLD} of the greatest likelihood
* are broken in favor of the reference.
* are broken by the {@code tieBreakingPriority} function.
*
* @throws IllegalStateException if there is no alleles.
*
* @return never {@code null}, one element per read in the read-likelihoods collection.
*/
public Collection<BestAllele> bestAllelesBreakingTies(final Function<A, Double> tieBreakingPriority) {
return IntStream.range(0, numberOfSamples()).boxed().flatMap(n -> bestAllelesBreakingTies(n, tieBreakingPriority).stream()).collect(Collectors.toList());
}

/**
* Default version where ties are broken in favor of the reference allele
*/
public Collection<BestAllele> bestAllelesBreakingTies() {
return IntStream.range(0, numberOfSamples()).boxed().flatMap(n -> bestAllelesBreakingTies(n).stream()).collect(Collectors.toList());
}

/**
* Returns the collection of best allele estimates for one sample's reads based on the read-likelihoods.
* "Ties" where the ref likelihood is within {@code ReadLikelihoods.INFORMATIVE_THRESHOLD} of the greatest likelihood
* are broken in favor of the reference.
* are broken by the {@code tieBreakingPriority} function.
*
* @throws IllegalStateException if there is no alleles.
*
* @return never {@code null}, one element per read in the read-likelihoods collection.
*/
public Collection<BestAllele> bestAllelesBreakingTies(final String sample, final Function<A, Double> tieBreakingPriority) {
final int sampleIndex = indexOfSample(sample);
return bestAllelesBreakingTies(sampleIndex, tieBreakingPriority);
}

/**
* Default version where ties are broken in favor of the reference allele
*/
public Collection<BestAllele> bestAllelesBreakingTies(final String sample) {
final int sampleIndex = indexOfSample(sample);
return bestAllelesBreakingTies(sampleIndex);
Expand All @@ -987,25 +1027,36 @@ public Collection<BestAllele> bestAllelesBreakingTies(final String sample) {
/**
* Returns the collection of best allele estimates for one sample's reads reads based on the read-likelihoods.
* "Ties" where the ref likelihood is within {@code ReadLikelihoods.INFORMATIVE_THRESHOLD} of the greatest likelihood
* are broken in favor of the reference.
* are broken by the {@code tieBreakingPriority} function.
*
* @throws IllegalStateException if there is no alleles.
*
* @return never {@code null}, one element per read in the read-likelihoods collection.
*/
private Collection<BestAllele> bestAllelesBreakingTies(final int sampleIndex) {
private Collection<BestAllele> bestAllelesBreakingTies(final int sampleIndex, final Function<A, Double> tieBreakingPriority) {
Utils.validIndex(sampleIndex, numberOfSamples());

//TODO: this currently just does ref vs alt. Really we want CIGAR complexity.
final Optional<double[]> priorities = alleles() == null ? Optional.empty() :
Optional.of(alleles().stream().mapToDouble(tieBreakingPriority::apply).toArray());

final GATKRead[] sampleReads = readsBySampleIndex[sampleIndex];
final int readCount = sampleReads.length;
final List<BestAllele> result = new ArrayList<>(readCount);
for (int r = 0; r < readCount; r++) {
result.add(searchBestAllele(sampleIndex, r, true, true));
result.add(searchBestAllele(sampleIndex, r, true, priorities));
}

return result;
}

/**
* Default version where ties are broken in favor of the reference allele
*/
private Collection<BestAllele> bestAllelesBreakingTies(final int sampleIndex) {
return bestAllelesBreakingTies(sampleIndex, a -> a.isReference() ? 1.0 : 0);
}


/**
* Returns reads stratified by their best allele.
Expand Down Expand Up @@ -1048,7 +1099,7 @@ private void readsByBestAlleleMap(final int sampleIndex, final Map<A, List<GATKR
final int readCount = reads.length;

for (int r = 0; r < readCount; r++) {
final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true, false);
final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true);
if (!bestAllele.isInformative()) {
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@
20 10004194 . C <NON_REF> . . END=10004221 GT:DP:GQ:MIN_DP:PL ./.:50:99:46:0,105,1575 ./.
20 10004222 . C CA,<NON_REF> . . BaseQRankSum=0.335;DP=54;ExcessHet=3.01;MQRankSum=-2.042e+00;RAW_MQ=194729.00;ReadPosRankSum=1.27 GT:AD:DP:GQ:PL:SB ./.:31,8,0:39:50:50,0,923,153,952,1105:18,13,4,4 ./.
20 10004223 . A AG,<NON_REF> . . BaseQRankSum=-4.800e-01;DP=62;ExcessHet=3.01;MQRankSum=1.89;RAW_MQ=210769.00;ReadPosRankSum=0.339 GT:AD:DP:GQ:PL:SB ./.:36,11,0:47:85:85,0,982,193,1017,1210:16,20,5,6 ./.
20 10004224 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:66:0:66:0,0,1403 ./.
20 10004224 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:66:0:66:0,0,1325 ./.
20 10004225 . A <NON_REF> . . END=10004240 GT:DP:GQ:MIN_DP:PL ./.:70:99:65:0,117,1755 ./.
20 10004241 . G <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:66:0:66:0,0,1965 ./.
20 10004242 . A <NON_REF> . . END=10004350 GT:DP:GQ:MIN_DP:PL ./.:73:99:58:0,120,1800 ./.
Expand Down Expand Up @@ -914,7 +914,7 @@
20 10040773 . T <NON_REF> . . END=10040811 GT:DP:GQ:MIN_DP:PL ./.:77:99:75:0,120,1800 ./.
20 10040812 . AT A,<NON_REF> . . DP=79;ExcessHet=3.01;RAW_MQ=203299.00 GT:AD:DP:GQ:PL:SB ./.:0,71,0:71:99:2645,214,0,2645,214,2645:0,0,36,35 ./.
20 10040813 . T *,<NON_REF> . . DP=79 GT:AD:DP:GQ:PL:SB ./.:0,71,0:71:99:2645,214,0,2645,214,2645:0,0,36,35 ./.
20 10040814 . T <NON_REF> . . END=10040820 GT:DP:GQ:MIN_DP:PL ./.:78:99:77:0,120,1800 ./.
20 10040814 . T <NON_REF> . . END=10040820 GT:DP:GQ:MIN_DP:PL ./.:77:99:75:0,120,1800 ./.
20 10040821 . T A,<NON_REF> . . BaseQRankSum=-2.100e-02;DP=78;ExcessHet=3.01;MQRankSum=-9.700e-02;RAW_MQ=199699.00;ReadPosRankSum=-1.385e+00 GT:AD:DP:GQ:PL:SB ./.:38,40,0:78:99:1200,0,1099,1314,1219,2533:20,18,21,19 ./.
20 10040822 . A <NON_REF> . . END=10041303 GT:DP:GQ:MIN_DP:PL ./.:71:99:54:0,119,1800 ./.
20 10041304 . C T,<NON_REF> . . DP=68;ExcessHet=3.01;RAW_MQ=239810.00 GT:AD:DP:GQ:PL:SB ./.:0,68,0:68:99:2794,204,0,2794,204,2794:0,0,34,34 ./.
Expand Down Expand Up @@ -1198,9 +1198,9 @@
20 10087821 . A *,<NON_REF> . . DP=99 GT:AD:DP:GQ:PL:SB ./.:35,0,0:53:99:466,669,2577,661,2237,2189:17,18,8,10 ./.
20 10087822 . G *,<NON_REF> . . DP=99 GT:AD:DP:GQ:PL:SB ./.:35,0,0:53:99:466,669,2577,661,2237,2189:17,18,8,10 ./.
20 10087823 . A <NON_REF> . . END=10087841 GT:DP:GQ:MIN_DP:PL ./.:86:99:83:0,120,1800 ./.
20 10087842 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:82:0:82:0,0,2206 ./.
20 10087843 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:82:73:82:0,73,2973 ./.
20 10087844 . G <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:80:0:80:0,0,2190 ./.
20 10087842 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:82:0:82:0,0,2353 ./.
20 10087843 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:82:99:82:0,120,1800 ./.
20 10087844 . G <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:80:0:80:0,0,2416 ./.
20 10087845 . A <NON_REF> . . END=10088062 GT:DP:GQ:MIN_DP:PL ./.:95:99:71:0,120,1800 ./.
20 10088063 . C T,<NON_REF> . . BaseQRankSum=3.40;DP=93;ExcessHet=3.01;MQRankSum=0.515;RAW_MQ=327579.00;ReadPosRankSum=0.285 GT:AD:DP:GQ:PL:SB ./.:49,44,0:93:99:1446,0,1403,1592,1535,3128:24,25,18,26 ./.
20 10088064 . G <NON_REF> . . END=10088698 GT:DP:GQ:MIN_DP:PL ./.:87:99:60:0,120,1800 ./.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,7 @@
20 10132717 . T A,<NON_REF> 83.77 . BaseQRankSum=0.593;DP=47;ExcessHet=3.0103;MLEAC=1,0;MLEAF=0.500,0.00;MQRankSum=-3.720;RAW_MQ=114046.00;ReadPosRankSum=-0.480 GT:AD:DP:GQ:PGT:PID:PL:SB 0/1:22,9,0:31:99:0|1:10132706_G_GAC:112,0,872,179,888,1068:6,16,6,3
20 10132718 . G <NON_REF> . . END=10132718 GT:DP:GQ:MIN_DP:PL 0/0:48:90:48:0,90,1350
20 10132719 . T A,<NON_REF> 99.77 . BaseQRankSum=0.592;DP=48;ExcessHet=3.0103;MLEAC=1,0;MLEAF=0.500,0.00;MQRankSum=-3.720;RAW_MQ=117646.00;ReadPosRankSum=-0.829 GT:AD:DP:GQ:PGT:PID:PL:SB 0/1:22,9,0:31:99:0|1:10132706_G_GAC:128,0,872,195,888,1084:6,16,6,3
20 10132720 . G <NON_REF> . . END=10132720 GT:DP:GQ:MIN_DP:PL 0/0:48:90:48:0,90,1350
20 10132721 . A <NON_REF> . . END=10132721 GT:DP:GQ:MIN_DP:PL 0/0:47:81:47:0,81,1215
20 10132720 . G <NON_REF> . . END=10132721 GT:DP:GQ:MIN_DP:PL 0/0:48:99:47:0,99,1485
20 10132722 . CAG C,GAG,CAGAGAGAGAGAGAGAGAG,<NON_REF> 951.73 . DP=48;ExcessHet=3.0103;MLEAC=1,1,0,0;MLEAF=0.500,0.500,0.00,0.00;RAW_MQ=117646.00 GT:AD:DP:GQ:PL:SB 1/2:0,15,9,1,0:25:99:989,671,679,357,0,685,675,373,439,1166,1028,718,465,831,1176:0,0,9,16
20 10132725 . A <NON_REF> . . END=10132726 GT:DP:GQ:MIN_DP:PL 0/0:52:72:51:0,72,1080
20 10132727 . A <NON_REF> . . END=10132728 GT:DP:GQ:MIN_DP:PL 0/0:53:81:52:0,81,1215
Expand Down
Loading

0 comments on commit 7226ad9

Please sign in to comment.