Skip to content

Commit

Permalink
The db_pd_hap_cleaning PR, squashed
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin committed Jun 15, 2023
1 parent abe8148 commit 678d09e
Show file tree
Hide file tree
Showing 10 changed files with 311 additions and 326 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ public void writeAlleleLikelihoods(final AlleleLikelihoods<GATKRead, Haplotype>

final Haplotype first_hap = haplotypes.get(0);
try {
if (outputInterval == null || outputInterval.contains(first_hap.getLocation())) {
output.write(String.format("> Location %s:%d-%d\n", first_hap.getLocation().getContig(),
first_hap.getStartPosition(), first_hap.getStopPosition()));
if (outputInterval == null || outputInterval.contains(first_hap)) {
output.write(String.format("> Location %s:%d-%d\n", first_hap.getContig(),
first_hap.getStart(), first_hap.getEnd()));
output.write(">> Haplotypes\n");
for (int i = 0; i < haplotypes.size(); i++) {
output.write(String.format("%04d\t%s\n", i, haplotypes.get(i).toString()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -706,16 +706,10 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
System.out.println("CallingSpan: " + region.getSpan());
}
assemblyResult = PartiallyDeterminedHaplotypeComputationEngine.generatePDHaplotypes(assemblyResult,
region.getSpan(),
assemblyResult.getReferenceHaplotype(),
assemblyVariants,
badPileupEvents,
goodPileupEvents,
hcArgs.pileupDetectionArgs.snpAdjacentToAssemblyIndel,
aligner,
hcArgs.getHaplotypeToReferenceSWParameters(),
hcArgs.pileupDetectionArgs.determinePDHaps,
hcArgs.pileupDetectionArgs.debugPileupStdout);
hcArgs);
}

// Legacy Pileupcaller code. Supplement the assembly haps with artifical haps constructed from the discovered pileupcaller
Expand Down Expand Up @@ -773,13 +767,13 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
if (HaplotypeCallerGenotypingDebugger.isEnabled()) {
HaplotypeCallerGenotypingDebugger.println("\nUnclipped Haplotypes("+haplotypes.size()+"):");
for (Haplotype haplotype : untrimmedAssemblyResult.getHaplotypeList()) {
HaplotypeCallerGenotypingDebugger.println("["+haplotype.getStartPosition()+"-"+haplotype.getStopPosition()+"] k="+haplotype.getKmerSize()+" len: "+haplotype.length()+" "+haplotype.getCigar()+(haplotype.isReference()?"ref":""));
HaplotypeCallerGenotypingDebugger.println("["+haplotype.getStart()+"-"+haplotype.getEnd()+"] k="+haplotype.getKmerSize()+" len: "+haplotype.length()+" "+haplotype.getCigar()+(haplotype.isReference()?"ref":""));
HaplotypeCallerGenotypingDebugger.println(haplotype.toString());
}

HaplotypeCallerGenotypingDebugger.println("\nClipped Haplotyes("+haplotypes.size()+"):");
for (Haplotype haplotype : haplotypes) {
HaplotypeCallerGenotypingDebugger.println("["+haplotype.getStartPosition()+"-"+haplotype.getStopPosition()+"] k="+haplotype.getKmerSize()+" len: "+haplotype.length()+" "+haplotype.getCigar()+(haplotype.isReference()?"ref":""));
HaplotypeCallerGenotypingDebugger.println("["+haplotype.getStart()+"-"+haplotype.getEnd()+"] k="+haplotype.getKmerSize()+" len: "+haplotype.length()+" "+haplotype.getCigar()+(haplotype.isReference()?"ref":""));
HaplotypeCallerGenotypingDebugger.println(haplotype.toString());
}
HaplotypeCallerGenotypingDebugger.println("");
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ public int compare(final Haplotype o1, final Haplotype o2) {
int delta = (o1.isReference() ? 1 : 0) - (o2.isReference() ? 1 : 0);

if ( delta == 0 ) {
delta = o1.getLocation().getContig().compareTo(o2.getLocation().getContig());
delta = o1.getContig().compareTo(o2.getContig());
if ( delta == 0 ) {
delta = o1.getLocation().getStart() - o2.getLocation().getStart();
delta = o1.getStart() - o2.getStart();
if ( delta == 0 ) {
delta = o1.getLocation().getEnd() - o2.getLocation().getEnd();
delta = o1.getEnd() - o2.getEnd();
if ( delta == 0 ) {
double ddelta = o1.getScore() - o2.getScore();
if ( ddelta < 0 )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import org.apache.logging.log4j.LogManager;
Expand All @@ -15,9 +16,11 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResultSet;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.LongHomopolymerHaplotypeCollapsingEngine;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReadErrorCorrector;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.hellbender.utils.Histogram;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
Expand All @@ -26,16 +29,13 @@
import org.broadinstitute.hellbender.utils.read.CigarUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.LongHomopolymerHaplotypeCollapsingEngine;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import java.util.stream.Collectors;

import static java.lang.String.format;

public final class ReadThreadingAssembler {
private static final Logger logger = LogManager.getLogger(ReadThreadingAssembler.class);

Expand Down Expand Up @@ -252,7 +252,7 @@ private void assembleGraphsAndExpandKmersGivenHaplotypes(final Haplotype refHapl
if (assembledResult != null && assembledResult.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION) {
// do some QC on the graph
sanityCheckGraph(assembledResult.getThreadingGraph(), refHaplotype);
assembledResult.getThreadingGraph().postProcessForHaplotypeFinding(debugGraphOutputPath, refHaplotype.getLocation());
assembledResult.getThreadingGraph().postProcessForHaplotypeFinding(debugGraphOutputPath, refHaplotype);
// add it to graphs with meaningful non-reference features
assemblyResultByRTGraph.put(assembledResult.getThreadingGraph(), assembledResult);
nonRefRTGraphs.add(assembledResult.getThreadingGraph());
Expand Down Expand Up @@ -484,25 +484,25 @@ private AssemblyResult getResultSetForRTGraph(final AbstractReadThreadingGraph r
// Performs the various transformations necessary on a sequence graph
private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph, final Haplotype refHaplotype) {
if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.0.non_ref_removed.dot");
printDebugGraphTransform(seqGraph, dotFileName(refHaplotype, seqGraph.getKmerSize(),".1.0.non_ref_removed.dot"));
}
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
seqGraph.zipLinearChains();
if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.1.zipped.dot");
printDebugGraphTransform(seqGraph, dotFileName(refHaplotype, seqGraph.getKmerSize(), ".1.1.zipped.dot"));
}

// now go through and prune the graph, removing vertices no longer connected to the reference chain
seqGraph.removeSingletonOrphanVertices();
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();

if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph." + seqGraph.getKmerSize() + ".1.2.pruned.dot");
printDebugGraphTransform(seqGraph, dotFileName(refHaplotype, seqGraph.getKmerSize(), ".1.2.pruned.dot"));
}
seqGraph.simplifyGraph();

if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.3.merged.dot");
printDebugGraphTransform(seqGraph, dotFileName(refHaplotype, seqGraph.getKmerSize(), ".1.3.merged.dot"));
}

// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
Expand All @@ -524,7 +524,7 @@ private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph, final Haplotype
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
}
if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph." + seqGraph.getKmerSize() + ".1.4.final.dot");
printDebugGraphTransform(seqGraph, dotFileName(refHaplotype, seqGraph.getKmerSize(), ".1.4.final.dot"));
}
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph, null);
}
Expand Down Expand Up @@ -674,7 +674,7 @@ private AssemblyResult createGraph(final Iterable<GATKRead> reads,
rtgraph.buildGraphIfNecessary();

if (debugGraphTransformations) {
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot");
printDebugGraphTransform(rtgraph, dotFileName(refHaplotype, kmerSize, ".0.0.raw_readthreading_graph.dot"));
}

// It's important to prune before recovering dangling ends so that we don't waste time recovering bad ends.
Expand Down Expand Up @@ -714,7 +714,7 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int
}

if (debugGraphTransformations) {
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.chain_pruned_readthreading_graph.dot");
printDebugGraphTransform(rtgraph, dotFileName(refHaplotype, kmerSize, ".0.1.chain_pruned_readthreading_graph.dot"));
}

// look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if
Expand All @@ -730,14 +730,14 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int
}

if (debugGraphTransformations) {
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.cleaned_readthreading_graph.dot");
printDebugGraphTransform(rtgraph, dotFileName(refHaplotype, kmerSize, ".0.2.cleaned_readthreading_graph.dot"));
}

// Either return an assembly result with a sequence graph or with an unchanged sequence graph deptending on the kmer duplication behavior
if (generateSeqGraph) {
final SeqGraph initialSeqGraph = rtgraph.toSequenceGraph();
if (debugGraphTransformations) {
rtgraph.printGraph(new File(debugGraphOutputPath, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.3.initial_seqgraph.dot"), 10000);
rtgraph.printGraph(new File(debugGraphOutputPath, dotFileName(refHaplotype, kmerSize, ".0.3.initial_seqgraph.dot")), 10000);
}

// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
Expand All @@ -749,7 +749,7 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int
logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
}
if (debugGraphTransformations) {
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.4.initial_seqgraph.dot");
printDebugGraphTransform(initialSeqGraph, dotFileName(refHaplotype, kmerSize, ".0.4.initial_seqgraph.dot"));
}
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction

Expand Down Expand Up @@ -939,4 +939,9 @@ public AssemblyResultSet generateEmptyLLocalAssemblyResult(final AssemblyRegion

return resultSet;
}

private static String dotFileName(final Haplotype refHaplotype, final int kmerSize, final String suffix) {
return (refHaplotype.getGenomeLocation() == null ? "" : IntervalUtils.locatableToString(refHaplotype) )
+ "-sequenceGraph." + kmerSize + suffix;
}
}
6 changes: 6 additions & 0 deletions src/main/java/org/broadinstitute/hellbender/utils/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -824,6 +824,12 @@ public static void validate(final boolean condition, final Supplier<String> msg)
}
}

public static void printIf(final boolean condition, final Supplier<String> msg){
if (condition){
System.out.println(msg.get());
}
}


/**
* Calculates the optimum initial size for a hash table given the maximum number
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public static EventMap fromHaplotype(final Haplotype haplotype, final byte[] ref
}

public static EventMap fromHaplotype(final Haplotype haplotype, final byte[] ref, final int maxMnpDistance) {
return new EventMap(getEvents(haplotype, ref, haplotype.getLocation(), maxMnpDistance));
return new EventMap(getEvents(haplotype, ref, haplotype, maxMnpDistance));
}

// this is really just a convenient way to make EventMap objects in unit tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,11 @@
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.CigarBuilder;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import spire.math.All;

import java.util.Arrays;
import java.util.Comparator;

public class Haplotype extends SimpleAllele {
public class Haplotype extends SimpleAllele implements Locatable{
private static final long serialVersionUID = 1L;

/**
Expand Down Expand Up @@ -187,23 +186,22 @@ public String toString() {
return getDisplayString();
}

/**
* Get the span of this haplotype (may be null)
* @return a potentially null genome loc
*/
public Locatable getLocation() {
return this.genomeLocation;
}

public void setGenomeLocation(final Locatable genomeLocation) {
this.genomeLocation = genomeLocation;
}

public long getStartPosition() {
@Override
public String getContig() {
return genomeLocation.getContig();
}

@Override
public int getStart() {
return genomeLocation.getStart();
}

public long getStopPosition() {
@Override
public int getEnd() {
return genomeLocation.getEnd();
}

Expand Down Expand Up @@ -260,7 +258,7 @@ public void setCigar( final Cigar cigar ) {
public Haplotype insertAllele(final Allele refAllele, final Allele altAllele, final int insertLocation) {
//special case for zeroth base deletion. In this case the common base is "outside" of the contig
final byte[] myBases = getBases();
if ((refAllele.length()>altAllele.length()) && (insertLocation==(int)getStartPosition()-1)){
if ((refAllele.length()>altAllele.length()) && (insertLocation==getStart()-1)){
int delSize = refAllele.length() - altAllele.length();
if (delSize > myBases.length){
return null;
Expand All @@ -273,7 +271,7 @@ public Haplotype insertAllele(final Allele refAllele, final Allele altAllele, fi
}


final Pair<Integer, CigarOperator> haplotypeInsertLocationAndOperator = ReadUtils.getReadIndexForReferenceCoordinate((int) getStartPosition(), cigar, insertLocation);
final Pair<Integer, CigarOperator> haplotypeInsertLocationAndOperator = ReadUtils.getReadIndexForReferenceCoordinate(getStart(), cigar, insertLocation);

// can't insert outside the haplotype or into a deletion
if( haplotypeInsertLocationAndOperator.getLeft() == ReadUtils.READ_INDEX_NOT_FOUND || !haplotypeInsertLocationAndOperator.getRight().consumesReadBases() ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public String toString() {
String output = "HapLen:"+length() +", "+new String(getDisplayBases());
output = output + "\nUnresolved Bases["+alternateBases.length+"] "+Arrays.toString(alternateBases);
return output + "\n"+getCigar().toString()+" "+ constituentBuiltEvents.stream()
.map(v ->(v==this.alleleBearingEvent ?"*":"")+ getDRAGENDebugEventString((int)getStartPosition()).apply(v) )
.map(v ->(v==this.alleleBearingEvent ?"*":"")+ getDRAGENDebugEventString( getStart()).apply(v) )
.collect(Collectors.joining("->"));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,8 @@ public void testTrim(final Haplotype full, final GenomeLoc trimTo, final Haploty
final Haplotype actual = full.trim(trimTo);
if ( expected != null ) {
Assert.assertEquals(actual.getBases(), expected.getBases());
Assert.assertEquals(actual.getStartPosition(), trimTo.getStart());
Assert.assertEquals(actual.getStopPosition(), trimTo.getStop());
Assert.assertEquals(actual.getStart(), trimTo.getStart());
Assert.assertEquals(actual.getEnd(), trimTo.getStop());
Assert.assertEquals(actual.getCigar(), expected.getCigar());
Assert.assertEquals(actual.getAlignmentStartHapwrtRef(), expected.getAlignmentStartHapwrtRef());
} else {
Expand Down

0 comments on commit 678d09e

Please sign in to comment.