Skip to content

Commit

Permalink
(SV) Overhaul tests on assembly-based non-complex breakpoint and type…
Browse files Browse the repository at this point in the history
… inference code

Also

  * expand cases on BND format variants
  * change to how VariantContext are extracted from NARL and SVType
  * refactor the tool DiscoverVariantsFromContigAlignmentsSAMSpark into a new worker class ContigChimericAlignmentIterativeInterpreter
  • Loading branch information
SHuang-Broad committed Jun 22, 2018
1 parent ee7c717 commit 7c291ba
Show file tree
Hide file tree
Showing 56 changed files with 4,724 additions and 3,839 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,6 @@ public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection
@Argument(doc = "Maximum size deletion to call based on imprecise evidence without corroborating read depth evidence",
fullName = "max-callable-imprecise-deletion-size", optional=true)
public int maxCallableImpreciseVariantDeletionSize = DEFAULT_MAX_CALLABLE_IMPRECISE_DELETION_SIZE;

@Advanced
@Hidden
@Argument(doc = "output cpx variants in a format that is more human friendly, primarily for debugging purposes",
fullName = "cpx-for-human-eye", optional = true)
public boolean outputCpxResultsInHumanReadableFormat = false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContigGenerator;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.ContigAlignmentsModifier;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ContigChimericAlignmentIterativeInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ImpreciseVariantDetector;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.AlignedAssemblyOrExcuse;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
Expand Down Expand Up @@ -176,10 +177,8 @@ protected void runTool( final JavaSparkContext ctx ) {
final SvDiscoveryInputMetaData svDiscoveryInputMetaData = getSvDiscoveryInputData(ctx, headerForReads, assembledEvidenceResults);

// TODO: 1/14/18 this is to be phased-out: old way of calling precise variants
// assembled breakpoints
@SuppressWarnings("deprecation")
List<VariantContext> assemblyBasedVariants =
org.broadinstitute.hellbender.tools.spark.sv.discovery.DiscoverVariantsFromContigAlignmentsSAMSpark
ContigChimericAlignmentIterativeInterpreter
.discoverVariantsFromChimeras(svDiscoveryInputMetaData, parsedAlignments);

final List<VariantContext> annotatedVariants = processEvidenceTargetLinks(assemblyBasedVariants, svDiscoveryInputMetaData);
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryPipelineSpark;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.*;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.CpxVariantInterpreter;
Expand All @@ -34,6 +35,8 @@
import scala.Tuple2;

import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
Expand Down Expand Up @@ -138,7 +141,7 @@ protected void runTool(final JavaSparkContext ctx) {
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
StructuralVariationDiscoveryPipelineSpark.broadcastCNVCalls(ctx, getHeaderForReads(),
discoverStageArgs.cnvCallsFile);
final String outputPrefixWithSampleName = outputPrefix + SVUtils.getSampleId(getHeaderForReads()) + "_";
final String outputPrefixWithSampleName = getOutputPrefix();
final SvDiscoveryInputMetaData svDiscoveryInputMetaData =
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile, outputPrefixWithSampleName,
null, null, null,
Expand All @@ -152,6 +155,20 @@ protected void runTool(final JavaSparkContext ctx) {
dispatchJobs(ctx, contigsByPossibleRawTypes, svDiscoveryInputMetaData, assemblyRawAlignments, writeSAMFiles);
}

/**
* @return prefix of outputs, with {@link #outputPrefix} decorated with sample name and trailing underscore
*/
private String getOutputPrefix() {
if ( Files.exists(Paths.get(outputPrefix)) ) {
if (Files.isDirectory(Paths.get(outputPrefix))) // existing directory
return outputPrefix + (outputPrefix.endsWith("/") ? "" : "/") + SVUtils.getSampleId(getHeaderForReads()) + "_";
else
throw new UserException("Provided prefix for output is pointing to an existing file: " + outputPrefix); // to avoid accidental override of a file
} else { // prefixForOutput doesn't point to an existing file or directory
return outputPrefix + (outputPrefix.endsWith("/") ? "" : "_") + SVUtils.getSampleId(getHeaderForReads()) + "_";
}
}

//==================================================================================================================

public static final class AssemblyContigsClassifiedByAlignmentSignatures {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,41 +1,169 @@
package org.broadinstitute.hellbender.tools.spark.sv.discovery;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.EnumUtils;
import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.Collections;
import java.util.Map;
import java.io.IOException;
import java.util.*;

import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.INTERVAL_VARIANT_ID_FIELD_SEPARATOR;


/**
* Various types of structural variations
* Various types of structural variations.
* Holding minimum information on VariantContext
* it can hold:
* CHR, POS, ID, REF, ALT, SVTYPE, END (optional), SVLEN(optional)
*/
public abstract class SvType {
public static final int NO_APPLICABLE_END = -1;
public static final int NO_APPLICABLE_LEN = -1;

// fields whose name starts with "variant" are following the VCF spec, i.e. they can be used directly to construct VariantContextBuilder
protected final String variantChromosome;
protected final int variantStart;
protected final int variantStop;
protected final String variantId;
protected final Allele refAllele;
protected final Allele altAllele;
protected final int svLen;
protected final Map<String, String> extraAttributes;
protected final Map<String, Object> typeSpecificAttributes;

protected static final Map<String, String> noExtraAttributes = Collections.emptyMap();
protected static final Map<String, Object> noExtraAttributes = Collections.emptyMap();

protected SvType(final String id, final Allele altAllele, final int len, final Map<String, String> typeSpecificExtraAttributes) {
variantId = id;
public SvType(final String variantChromosome, final int variantStart, final int variantStop, final String variantId,
final Allele refAllele, final Allele altAllele, final int svLen, final Map<String, Object> typeSpecificAttributes) {
this.variantChromosome = variantChromosome;
this.variantStart = variantStart;
this.variantStop = variantStop;
this.variantId = variantId;
this.refAllele = refAllele;
this.altAllele = altAllele;
svLen = len;
extraAttributes = typeSpecificExtraAttributes;
this.svLen = svLen;
this.typeSpecificAttributes = typeSpecificAttributes;
}

public final VariantContextBuilder getBasicInformation() {
if ( ! hasApplicableEnd() ) {
VariantContextBuilder builder = new VariantContextBuilder()
.chr(variantChromosome).start(variantStart).stop(variantStart)
.id(variantId)
.alleles(new ArrayList<>(Arrays.asList(refAllele, altAllele)))
.attribute(GATKSVVCFConstants.SVTYPE, toString());
typeSpecificAttributes.forEach(builder::attribute);
return builder;
} else { // assuming if there's valid END, there must be valid SVLEN
VariantContextBuilder builder = new VariantContextBuilder()
.chr(variantChromosome).start(variantStart).stop(variantStop)
.id(variantId)
.alleles(new ArrayList<>(Arrays.asList(refAllele, altAllele)))
.attribute(VCFConstants.END_KEY, variantStop)
.attribute(GATKSVVCFConstants.SVTYPE, toString())
.attribute(GATKSVVCFConstants.SVLEN, svLen);
typeSpecificAttributes.forEach(builder::attribute);
return builder;
}
}

public String getVariantChromosome() {
return variantChromosome;
}
public int getVariantStart() {
return variantStart;
}
public int getVariantStop() {
return variantStop;
}
public final String getInternalVariantId() {
return variantId;
}
Allele getRefAllele() {
return refAllele;
}
public final Allele getAltAllele() {
return altAllele;
}
public final int getSVLength() {
return svLen;
}
public final Map<String, String> getTypeSpecificAttributes() {
return extraAttributes;
public final Map<String, Object> getTypeSpecificAttributes() {
return typeSpecificAttributes;
}

public abstract boolean hasApplicableEnd();
public abstract boolean hasApplicableLength();

@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;

final SvType svType = (SvType) o;

if (svLen != svType.svLen) return false;
if (!variantId.equals(svType.variantId)) return false;
if (!altAllele.equals(svType.altAllele)) return false;
if (typeSpecificAttributes.size() != svType.typeSpecificAttributes.size()) return false;

for ( final Map.Entry<String, Object> act : typeSpecificAttributes.entrySet() ) {
if ( svType.typeSpecificAttributes.containsKey(act.getKey())) {
if ( ! act.getValue().equals(svType.typeSpecificAttributes.get(act.getKey())))
return false;
} else
return false;
}
return true;
}

@Override
public int hashCode() {
int result = variantId.hashCode();
result = 31 * result + altAllele.hashCode();
result = 31 * result + svLen;
result = 31 * result + typeSpecificAttributes.hashCode();
return result;
}

// TODO: 5/23/18 any better way to do this?
public static SortedSet<String> getKnownTypes() {
final SortedSet<String> knownTypes = new TreeSet<>( EnumUtils.getEnumMap(SimpleSVType.SupportedType.class).keySet() );

knownTypes.add(GATKSVVCFConstants.CPX_SV_SYB_ALT_ALLELE_STR);

for (final BreakEndVariantType.SupportedType supportedType : BreakEndVariantType.SupportedType.values()) {
knownTypes.add(supportedType.name());
}

return Collections.unmodifiableSortedSet(knownTypes);
}

public static String makeLocationString(final String chr1, final int pos1, final String chr2, final int pos2) {
return chr1 + INTERVAL_VARIANT_ID_FIELD_SEPARATOR
+ pos1 + INTERVAL_VARIANT_ID_FIELD_SEPARATOR
+ (chr2.equals(chr1) ? "" : chr2 + INTERVAL_VARIANT_ID_FIELD_SEPARATOR)
+ pos2;
}

public static String makeLocationString(final NovelAdjacencyAndAltHaplotype novelAdjacencyAndAltHaplotype) {
String leftContig = novelAdjacencyAndAltHaplotype.getLeftJustifiedLeftRefLoc().getContig();
String rightContig = novelAdjacencyAndAltHaplotype.getLeftJustifiedRightRefLoc().getContig();
int pos1 = novelAdjacencyAndAltHaplotype.getLeftJustifiedLeftRefLoc().getStart();
int pos2 = novelAdjacencyAndAltHaplotype.getLeftJustifiedRightRefLoc().getEnd();
return makeLocationString(leftContig, pos1, rightContig, pos2);
}

static byte[] extractRefBases(final SimpleInterval interval, final ReferenceMultiSource reference) {
try {
return reference.getReferenceBases(interval).getBases();
} catch (final IOException ioex) {
throw new GATKException("Failed to extract bases from region: " + interval.toString());
}
}
public abstract boolean isBreakEndOnly();
}
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ static boolean gappedAlignmentOffersBetterCoverage(final AlignmentInterval gappe
* then this function returns all the original {@code differentRepresentationsForOneContig}
*/
@VisibleForTesting
static List<GoodAndBadMappings> filterSecondaryConfigurationsByMappingQualityThreshold(
public static List<GoodAndBadMappings> filterSecondaryConfigurationsByMappingQualityThreshold(
final List<GoodAndBadMappings> differentRepresentationsForOneContig,
final int threshold) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,10 +278,10 @@ public static boolean hasIncompletePictureFromTwoAlignments(final AlignmentInter
* head and tail alignment contain each other in terms of their reference span
* </li>
* <li>
* head and tail alignment mapped to different chromosome
* head and tail alignment mapped to different chromosomes
* </li>
* <li>
* head and tail alignment mapped to different strands
* head and tail alignment mapped to opposite strands of the same chromosome
* </li>
* <li>
* head and tail alignment have reference order switch, indicating we have likely
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ private static Cigar constructNewCigar(final List<CigarElement> left, final List
return new Cigar(SvCigarUtils.compactifyNeighboringSoftClippings(cigar.getCigarElements()));
}

public static enum AlnModType {
public enum AlnModType {
NONE, UNDERGONE_OVERLAP_REMOVAL, EXTRACTED_FROM_LARGER_ALIGNMENT, FROM_SPLIT_GAPPED_ALIGNMENT;

public enum ModTypeString {
Expand Down
Loading

0 comments on commit 7c291ba

Please sign in to comment.