* This tool packages the algorithms described in {@link FindBreakpointEvidenceSpark} and
- * {@link DiscoverVariantsFromContigAlignmentsSAMSpark} as an integrated workflow. Please consult the
+ * {@link org.broadinstitute.hellbender.tools.spark.sv.discovery.DiscoverVariantsFromContigAlignmentsSAMSpark}
+ * as an integrated workflow. Please consult the
* descriptions of those tools for more details about the algorithms employed. In brief, input reads are examined
* for evidence of structural variation in a genomic region, regions so identified are locally assembled, and
* the local assemblies are called for structural variation.
@@ -108,16 +113,20 @@ public class StructuralVariationDiscoveryPipelineSpark extends GATKSparkTool {
@ArgumentCollection
private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs
= new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection();
+
@Argument(doc = "sam file for aligned contigs", fullName = "contig-sam-file")
private String outputAssemblyAlignments;
- @Argument(doc = "filename for output vcf", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+
+ @Argument(doc = "directory for VCF output, including those from experimental interpretation tool if so requested, " +
+ "will be created if not present; sample name will be appended after the provided argument",
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
- private String vcfOutputFileName;
+ private String variantsOutDir;
+
@Advanced
- @Argument(doc = "directory to output results of our prototyping breakpoint and type inference tool in addition to the master VCF;" +
- " the directory contains multiple VCF's for different types and record-generating SAM files of assembly contigs,",
- fullName = "exp-variants-out-dir", optional = true)
- private String expVariantsOutDir;
+ @Argument(doc = "flag to signal that user wants to run experimental interpretation tool as well",
+ fullName = "exp-interpret", optional = true)
+ private Boolean expInterpret = false;
@Override
public boolean requiresReads()
@@ -162,13 +171,7 @@ protected void runTool( final JavaSparkContext ctx ) {
// todo: when we call imprecise variants don't return here
if(parsedAlignments.isEmpty()) return;
- final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast = broadcastCNVCalls(ctx, headerForReads, discoverStageArgs.cnvCallsFile);
- final SvDiscoveryInputData svDiscoveryInputData =
- new SvDiscoveryInputData(ctx, discoverStageArgs, vcfOutputFileName,
- assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
- makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
- cnvCallsBroadcast,
- getReads(), getHeaderForReads(), getReference(), localLogger);
+ final SvDiscoveryInputData svDiscoveryInputData = getSvDiscoveryInputData(ctx, headerForReads, assembledEvidenceResults);
// TODO: 1/14/18 this is to be phased-out: old way of calling precise variants
// assembled breakpoints
@@ -182,15 +185,36 @@ protected void runTool( final JavaSparkContext ctx ) {
final String outputPath = svDiscoveryInputData.outputPath;
final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputData.referenceSequenceDictionaryBroadcast.getValue();
final Logger toolLogger = svDiscoveryInputData.toolLogger;
- SVVCFWriter.writeVCF(annotatedVariants, outputPath, refSeqDictionary, toolLogger);
+ SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, toolLogger);
- // TODO: 1/14/18 this is the next version of precise variant calling
- if ( expVariantsOutDir != null ) {
- svDiscoveryInputData.updateOutputPath(expVariantsOutDir);
+ if ( expInterpret ) {
experimentalInterpretation(ctx, assembledEvidenceResults, svDiscoveryInputData, evidenceAndAssemblyArgs.crossContigsToIgnoreFile);
}
}
+ private SvDiscoveryInputData getSvDiscoveryInputData(final JavaSparkContext ctx,
+ final SAMFileHeader headerForReads,
+ final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults) {
+ final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
+ broadcastCNVCalls(ctx, headerForReads, discoverStageArgs.cnvCallsFile);
+ try {
+ if ( !java.nio.file.Files.exists(Paths.get(variantsOutDir)) ) {
+ IOUtils.createDirectory(variantsOutDir);
+ }
+ } catch (final IOException ioex) {
+ throw new GATKException("Failed to create output directory " + variantsOutDir + " though it does not yet exist", ioex);
+ }
+
+ final String outputPrefixWithSampleName = variantsOutDir + (variantsOutDir.endsWith("/") ? "" : "/")
+ + SVUtils.getSampleId(headerForReads) + "_";
+
+ return new SvDiscoveryInputData(ctx, discoverStageArgs, outputPrefixWithSampleName,
+ assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
+ makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
+ cnvCallsBroadcast,
+ getReads(), getHeaderForReads(), getReference(), localLogger);
+ }
+
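For orientation, a minimal sketch of how the output file names end up being composed under this scheme. The directory and sample name below are hypothetical placeholders, and the "NonComplex.vcf"/"Cpx.vcf" suffixes assume the experimental path dispatches through SvDiscoverFromLocalAssemblyContigAlignmentsSpark.dispatchJobs as shown further down in this patch:

    // illustrative sketch only; "outDir" and "NA12878" are placeholder values
    final String variantsOutDir = "outDir";
    final String sampleId      = "NA12878";                              // from SVUtils.getSampleId(header)
    final String prefix        = variantsOutDir + "/" + sampleId + "_";  // outputPrefixWithSampleName above
    final String masterVcf     = prefix + "inv_del_ins.vcf";             // written by SVVCFWriter.writeVCF in runTool
    // with --exp-interpret, the experimental interpretation tool adds a further prefix:
    final String expPrefix     = prefix + "experimentalInterpretation_";
    final String nonComplexVcf = expPrefix + "NonComplex.vcf";
    final String cpxVcf        = expPrefix + "Cpx.vcf";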
/**
* Uses the input EvidenceTargetLinks to
*
@@ -240,7 +264,7 @@ private void experimentalInterpretation(final JavaSparkContext ctx,
final SvDiscoveryInputData svDiscoveryInputData,
final String nonCanonicalChromosomeNamesFile) {
- if ( expVariantsOutDir == null )
+ if ( ! expInterpret )
return;
final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast = svDiscoveryInputData.referenceSequenceDictionaryBroadcast;
@@ -263,7 +287,8 @@ private void experimentalInterpretation(final JavaSparkContext ctx,
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast = svDiscoveryInputData.cnvCallsBroadcast;
final SvDiscoveryInputData updatedSvDiscoveryInputData =
- new SvDiscoveryInputData(sampleId, svDiscoveryInputData.discoverStageArgs, expVariantsOutDir,
+ new SvDiscoveryInputData(sampleId, svDiscoveryInputData.discoverStageArgs,
+ svDiscoveryInputData.outputPath + "experimentalInterpretation_",
svDiscoveryInputData.metadata, svDiscoveryInputData.assembledIntervals,
svDiscoveryInputData.evidenceTargetLinks, reads, svDiscoveryInputData.toolLogger,
referenceBroadcast, referenceSequenceDictionaryBroadcast, headerBroadcast, cnvCallsBroadcast);
@@ -368,7 +393,7 @@ public static List filterAndConvertToAlignedContigDirect(final It
.filter(AlignedAssemblyOrExcuse::isNotFailure)
.map(alignedAssembly -> getAlignedContigsInOneAssembly(alignedAssembly, refNames, header))
.flatMap(Utils::stream) // size == total # of contigs from all successful assemblies
- .filter(contig -> !contig.alignmentIntervals.isEmpty()) // filter out unmapped and contigs without primary alignments
+ .filter(contig -> !contig.getAlignments().isEmpty()) // filter out unmapped and contigs without primary alignments
.collect(Collectors.toList());
}
@@ -388,9 +413,9 @@ public static Iterable getAlignedContigsInOneAssembly(final Align
.mapToObj( contigIdx -> {
final byte[] contigSequence = assembly.getContig(contigIdx).getSequence();
final String contigName = AlignedAssemblyOrExcuse.formatContigName(alignedAssembly.getAssemblyId(), contigIdx);
- final List<AlignmentInterval> arOfAContig
+ final List<AlignmentInterval> alignmentsForOneContig
= getAlignmentsForOneContig(contigName, contigSequence, allAlignments.get(contigIdx), refNames, header);
- return new AlignedContig(contigName, contigSequence, arOfAContig, false);
+ return new AlignedContig(contigName, contigSequence, alignmentsForOneContig);
} ).collect(Collectors.toList());
}
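The call above uses the new three-argument AlignedContig constructor; the hasEquallyGoodAlnConfigurations flag now travels with the wrapper class instead. A minimal before/after sketch (variable names such as alignments, insertionMappings and saTag are illustrative only):

    // before this patch: the flag was part of AlignedContig itself
    //   new AlignedContig(contigName, contigSequence, alignments, /*hasEquallyGoodAlnConfigurations=*/ false);
    // after this patch: AlignedContig carries only name, sequence and alignments,
    // and the flag is held by AssemblyContigWithFineTunedAlignments
    final AlignedContig contig = new AlignedContig(contigName, contigSequence, alignments);
    final AssemblyContigWithFineTunedAlignments wrapped =
            new AssemblyContigWithFineTunedAlignments(contig, insertionMappings,
                    /*hasEquallyGoodAlnConfigurations=*/ false, saTag);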
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java
index 377f6662f3c..46932a84380 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java
@@ -29,6 +29,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.Utils;
import scala.Tuple2;
@@ -106,9 +107,10 @@ public final class DiscoverVariantsFromContigAlignmentsSAMSpark extends GATKSpar
private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs =
new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection();
- @Argument(doc = "\"output path for discovery (non-genotyped) VCF", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+ @Argument(doc = "prefix for discovery (non-genotyped) VCF; sample name will be appended after the provided argument, followed by \"_inv_del_ins.vcf\"",
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
- private String vcfOutputFileName;
+ private String prefixForOutput;
@Override
public boolean requiresReference() {
@@ -133,20 +135,23 @@ protected void runTool(final JavaSparkContext ctx) {
discoverStageArgs.cnvCallsFile);
final SvDiscoveryInputData svDiscoveryInputData =
- new SvDiscoveryInputData(ctx, discoverStageArgs, vcfOutputFileName,
+ new SvDiscoveryInputData(ctx,
+ discoverStageArgs, prefixForOutput + "_" + SVUtils.getSampleId(getHeaderForReads()) + "_inv_del_ins.vcf",
null, null, null,
cnvCallsBroadcast,
getReads(), getHeaderForReads(), getReference(), localLogger);
final JavaRDD<AlignedContig> parsedContigAlignments =
- new SvDiscoverFromLocalAssemblyContigAlignmentsSpark.SAMFormattedContigAlignmentParser(svDiscoveryInputData.assemblyRawAlignments, svDiscoveryInputData.headerBroadcast.getValue(), true)
+ new SvDiscoverFromLocalAssemblyContigAlignmentsSpark
+ .SAMFormattedContigAlignmentParser(svDiscoveryInputData.assemblyRawAlignments,
+ svDiscoveryInputData.headerBroadcast.getValue(), true)
.getAlignedContigs();
// assembly-based breakpoints
List<VariantContext> annotatedVariants = discoverVariantsFromChimeras(svDiscoveryInputData, parsedContigAlignments);
final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputData.referenceSequenceDictionaryBroadcast.getValue();
- SVVCFWriter.writeVCF(annotatedVariants, vcfOutputFileName, refSeqDictionary, localLogger);
+ SVVCFWriter.writeVCF(annotatedVariants, svDiscoveryInputData.outputPath, refSeqDictionary, localLogger);
}
@Deprecated
@@ -155,9 +160,9 @@ public static List discoverVariantsFromChimeras(final SvDiscover
final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast = svDiscoveryInputData.referenceSequenceDictionaryBroadcast;
final JavaPairRDD<byte[], List<ChimericAlignment>> contigSeqAndChimeras =
- alignedContigs.filter(alignedContig -> alignedContig.alignmentIntervals.size() > 1)
+ alignedContigs.filter(alignedContig -> alignedContig.getAlignments().size() > 1)
.mapToPair(alignedContig ->
- new Tuple2<>(alignedContig.contigSequence,
+ new Tuple2<>(alignedContig.getContigSequence(),
ChimericAlignment.parseOneContig(alignedContig, referenceSequenceDictionaryBroadcast.getValue(),
true, DEFAULT_MIN_ALIGNMENT_LENGTH,
CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD, true)));
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java
index f47a8842190..086fcf355f3 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java
@@ -1,10 +1,7 @@
package org.broadinstitute.hellbender.tools.spark.sv.discovery;
import com.google.common.annotations.VisibleForTesting;
-import htsjdk.samtools.CigarOperator;
-import htsjdk.samtools.SAMFileHeader;
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.*;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.logging.log4j.LogManager;
@@ -24,16 +21,14 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
-import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryPipelineSpark;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.*;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.*;
-import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
-import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import scala.Tuple2;
@@ -113,10 +108,10 @@ public final class SvDiscoverFromLocalAssemblyContigAlignmentsSpark extends GATK
fullName = "non-canonical-contig-names-file", optional = true)
public String nonCanonicalChromosomeNamesFile;
- @Argument(doc = "output directory for outputting VCF files for each type of variant, and if enabled, the signaling assembly contig's alignments",
+ @Argument(doc = "prefix for output files (including VCF files and if enabled, the signaling assembly contig's alignments); sample name will be appended after the provided argument",
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
- private String outputDir;
+ private String outputPrefix;
@Argument(doc = "output SAM files", fullName = "write-sam", optional = true)
private boolean writeSAMFiles;
@@ -142,8 +137,9 @@ protected void runTool(final JavaSparkContext ctx) {
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
StructuralVariationDiscoveryPipelineSpark.broadcastCNVCalls(ctx, getHeaderForReads(),
discoverStageArgs.cnvCallsFile);
+ final String outputPrefixWithSampleName = outputPrefix + SVUtils.getSampleId(getHeaderForReads()) + "_";
final SvDiscoveryInputData svDiscoveryInputData =
- new SvDiscoveryInputData(ctx, discoverStageArgs, outputDir,
+ new SvDiscoveryInputData(ctx, discoverStageArgs, outputPrefixWithSampleName,
null, null, null,
cnvCallsBroadcast,
getReads(), getHeaderForReads(), getReference(), localLogger);
@@ -167,7 +163,6 @@ public static EnumMap>
final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast = svDiscoveryInputData.referenceSequenceDictionaryBroadcast;
final Broadcast<SAMFileHeader> headerBroadcast = svDiscoveryInputData.headerBroadcast;
final JavaRDD<GATKRead> assemblyRawAlignments = svDiscoveryInputData.assemblyRawAlignments;
- final String outputPath = svDiscoveryInputData.outputPath;
final Logger toolLogger = svDiscoveryInputData.toolLogger;
// filter alignments and split the gaps, hence the name "reconstructed"
@@ -184,18 +179,18 @@ public static EnumMap>
AssemblyContigAlignmentSignatureClassifier.classifyContigs(contigsWithChimericAlignmentsReconstructed,
referenceSequenceDictionaryBroadcast, toolLogger);
- try {
- IOUtils.createDirectory(outputPath);
- // write SAM file, if requested, for each type of possibly variant as recognized in {@link RawTypes}
- // and stored in {@code contigsByPossibleRawTypes} by extracting original alignments,
- if (writeSAMFiles) {
- contigsByPossibleRawTypes.forEach(
- (k, v) ->
- writeSAM(v, k.name(), assemblyRawAlignments, headerBroadcast, outputPath, toolLogger));
- }
- } catch (final IOException x) {
- throw new UserException.CouldNotCreateOutputFile("Could not create file at path:" +
- outputPath + " due to " + x.getMessage(), x);
+ // write SAM file, if requested, for original alignments of contigs recognized as "Ambiguous", "Incomplete", and "MisAssemblySuspect"
+ if (writeSAMFiles) {
+ final String outputPrefix = svDiscoveryInputData.outputPath;
+
+ final Set<String> ambiguousContigNames = new HashSet<>( contigsByPossibleRawTypes.get(RawTypes.Ambiguous).map(AssemblyContigWithFineTunedAlignments::getContigName).collect() );
+ final Set<String> incompleteContigNames = new HashSet<>( contigsByPossibleRawTypes.get(RawTypes.Incomplete).map(AssemblyContigWithFineTunedAlignments::getContigName).collect() );
+ final Set<String> misAssemblySuspectContigNames = new HashSet<>( contigsByPossibleRawTypes.get(RawTypes.MisAssemblySuspect).map(AssemblyContigWithFineTunedAlignments::getContigName).collect() );
+
+ final SAMFileHeader header = headerBroadcast.getValue();
+ SvDiscoveryUtils.writeSAMRecords(assemblyRawAlignments, ambiguousContigNames, outputPrefix + RawTypes.Ambiguous.name() + ".bam", header);
+ SvDiscoveryUtils.writeSAMRecords(assemblyRawAlignments, incompleteContigNames, outputPrefix + RawTypes.Incomplete.name() + ".bam", header);
+ SvDiscoveryUtils.writeSAMRecords(assemblyRawAlignments, misAssemblySuspectContigNames, outputPrefix + RawTypes.MisAssemblySuspect.name() + ".bam", header);
}
return contigsByPossibleRawTypes;
@@ -206,24 +201,23 @@ public static EnumMap>
/**
* Sends assembly contigs classified based on their alignment signature to
* a corresponding breakpoint location inference unit.
- * {@link AssemblyContigAlignmentSignatureClassifier.RawTypes#InsDel},
- * {@link AssemblyContigAlignmentSignatureClassifier.RawTypes#IntraChrStrandSwitch},
- * {@link AssemblyContigAlignmentSignatureClassifier.RawTypes#MappedInsertionBkpt}, and
- * {@link AssemblyContigAlignmentSignatureClassifier.RawTypes#Cpx} (PR to be reviewed and merged)
- * each will have its own VCF output in the directory specified in {@link #outputDir},
- * whereas
+ *
+ * Two VCF files will be output: {@link #outputPrefix}"NonComplex.vcf" and {@link #outputPrefix}"Cpx.vcf".
+ *
+ * Note that
* {@link AssemblyContigAlignmentSignatureClassifier.RawTypes#Incomplete},
* {@link AssemblyContigAlignmentSignatureClassifier.RawTypes#Ambiguous}, and
* {@link AssemblyContigAlignmentSignatureClassifier.RawTypes#MisAssemblySuspect}
* currently DO NOT generate any VCF yet.
- * However, if flag {@link #writeSAMFiles} is turned on, alignments of all contigs that are classified to be any of
+ * However, if flag {@link #writeSAMFiles} is turned on,
+ * alignments of all contigs that are classified to be any of
* {@link AssemblyContigAlignmentSignatureClassifier.RawTypes}
- * will be extracted and put in SAM files in {@link #outputDir} too.
+ * will be extracted and written to SAM files whose names start with {@link #outputPrefix} too.
*/
public static void dispatchJobs(final EnumMap<RawTypes, JavaRDD<AssemblyContigWithFineTunedAlignments>> contigsByPossibleRawTypes,
final SvDiscoveryInputData svDiscoveryInputData) {
- final String outputDir = svDiscoveryInputData.outputPath;
+ final String outputPrefixWithSampleName = svDiscoveryInputData.outputPath;
// TODO: 1/10/18 bring back read annotation, see ticket 4228
forNonComplexVariants(contigsByPossibleRawTypes, svDiscoveryInputData);
@@ -231,7 +225,7 @@ public static void dispatchJobs(final EnumMap
final List<VariantContext> complexVariants =
CpxVariantInterpreter.inferCpxVariant(contigsByPossibleRawTypes.get(RawTypes.Cpx), svDiscoveryInputData);
- svDiscoveryInputData.updateOutputPath(outputDir+"/"+RawTypes.Cpx.name()+".vcf");
+ svDiscoveryInputData.updateOutputPath(outputPrefixWithSampleName + RawTypes.Cpx.name() + ".vcf");
SVVCFWriter.writeVCF(complexVariants, svDiscoveryInputData.outputPath,
svDiscoveryInputData.referenceSequenceDictionaryBroadcast.getValue(), svDiscoveryInputData.toolLogger);
}
@@ -243,9 +237,9 @@ private static void forNonComplexVariants(final EnumMap
final Broadcast<ReferenceMultiSource> referenceBroadcast = svDiscoveryInputData.referenceBroadcast;
final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast = svDiscoveryInputData.referenceSequenceDictionaryBroadcast;
final Broadcast> cnvCallsBroadcast = svDiscoveryInputData.cnvCallsBroadcast;
- final String outputDir = svDiscoveryInputData.outputPath;
+ final String outputPrefixWithSampleName = svDiscoveryInputData.outputPath;
- svDiscoveryInputData.updateOutputPath(outputDir + "/nonComplex.vcf");
+ svDiscoveryInputData.updateOutputPath(outputPrefixWithSampleName + "NonComplex.vcf");
final JavaRDD<AssemblyContigWithFineTunedAlignments> nonComplexSignatures =
contigsByPossibleRawTypes.get(RawTypes.InsDel)
@@ -306,23 +300,6 @@ private static Iterator getVariantContextIterator(final Tuple2
- private static void writeSAM(final JavaRDD<AssemblyContigWithFineTunedAlignments> filteredContigs, final String rawTypeString,
- final JavaRDD<GATKRead> originalAlignments, final Broadcast<SAMFileHeader> headerBroadcast,
- final String outputDir, final Logger toolLogger) {
-
- final Set<String> filteredReadNames = new HashSet<>( filteredContigs.map(AssemblyContigWithFineTunedAlignments::getContigName).distinct().collect() );
- toolLogger.info(filteredReadNames.size() + " contigs indicating " + rawTypeString);
- final JavaRDD<SAMRecord> splitLongReads = originalAlignments.filter(read -> filteredReadNames.contains(read.getName()))
- .map(read -> read.convertToSAMRecord(headerBroadcast.getValue()));
- SVFileUtils.writeSAMFile(outputDir+"/"+rawTypeString+".sam", splitLongReads.collect().iterator(),
- headerBroadcast.getValue(), false);
- }
-
public static final class SAMFormattedContigAlignmentParser extends AlignedContigGenerator implements Serializable {
private static final long serialVersionUID = 1L;
@@ -389,7 +366,7 @@ public static AlignedContig parseReadsAndOptionallySplitGappedAlignments(final I
parsedAlignments = unSplitAIList.collect(Collectors.toList());
}
}
- return new AlignedContig(primaryAlignment.getReadName(), contigSequence, parsedAlignments, false);
+ return new AlignedContig(primaryAlignment.getReadName(), contigSequence, parsedAlignments);
}
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtils.java
index 57a491ae882..4905982b8e6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtils.java
@@ -1,32 +1,32 @@
package org.broadinstitute.hellbender.tools.spark.sv.discovery;
-import htsjdk.samtools.SAMSequenceDictionary;
-import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.*;
import org.apache.logging.log4j.Logger;
+import org.apache.spark.api.java.JavaRDD;
+import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.BreakpointComplications;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFReader;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.nio.file.Files;
-import java.util.HashSet;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class SvDiscoveryUtils {
- //==================================================================================================================
+
public static void evaluateIntervalsAndNarls(final List<SVInterval> assembledIntervals,
final List<NovelAdjacencyAndAltHaplotype> narls,
@@ -98,6 +98,8 @@ private static int getAmbiguity(final BreakpointComplications complications) {
return complications.getHomologyForwardStrandRep().length();
}
+ //==================================================================================================================
+
public static Set<String> getCanonicalChromosomes(final String nonCanonicalContigNamesFile,
@Nonnull final SAMSequenceDictionary dictionary) {
final LinkedHashSet<String> allContigs = Utils.nonNull(dictionary).getSequences().stream().map(SAMSequenceRecord::getSequenceName)
@@ -112,4 +114,37 @@ public static Set getCanonicalChromosomes(final String nonCanonicalConti
throw new UserException("Can't read nonCanonicalContigNamesFile file "+nonCanonicalContigNamesFile, ioe);
}
}
+
+ //==================================================================================================================
+
+ /**
+ * Writes the records from {@code originalAlignments} whose read names appear in {@code readNameToInclude}
+ * to {@code outputPath}; ".bam" output is coordinate-sorted and ".sam" output is queryname-sorted.
+ */
+ public static void writeSAMRecords(final JavaRDD<GATKRead> originalAlignments, final Set<String> readNameToInclude,
+ final String outputPath, final SAMFileHeader header) {
+ final SAMFileHeader cloneHeader = header.clone();
+ final SAMRecordComparator localComparator;
+ if (outputPath.toLowerCase().endsWith("bam")) {
+ cloneHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
+ localComparator = new SAMRecordCoordinateComparator();
+ } else if (outputPath.toLowerCase().endsWith("sam")) {
+ cloneHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
+ localComparator = new SAMRecordQueryNameComparator();
+ } else {
+ throw new IllegalArgumentException("Unsupported output format " + outputPath);
+ }
+
+ final List<GATKRead> reads = originalAlignments.collect();
+ final List<SAMRecord> samRecords = new ArrayList<>();
+ reads.forEach(gatkRead -> {
+ if ( readNameToInclude.contains(gatkRead.getName()) ) {
+ samRecords.add(gatkRead.convertToSAMRecord(cloneHeader));
+ }
+ });
+
+ samRecords.sort(localComparator);
+ SVFileUtils.writeSAMFile( outputPath, samRecords.iterator(), cloneHeader, true);
+ }
}
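A minimal usage sketch of the new helper, mirroring the caller in SvDiscoverFromLocalAssemblyContigAlignmentsSpark above; the output path and contig name are placeholders:

    // assumed already in scope: JavaRDD<GATKRead> assemblyRawAlignments and SAMFileHeader header
    final Set<String> namesToKeep = new HashSet<>(Collections.singletonList("asm000001:tig00001")); // placeholder
    // ".bam" output is coordinate-sorted, ".sam" output is queryname-sorted (see implementation above)
    SvDiscoveryUtils.writeSAMRecords(assemblyRawAlignments, namesToKeep, "/path/to/out_Ambiguous.bam", header);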
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContig.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContig.java
index 16f80c3db06..b79763c3a73 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContig.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContig.java
@@ -4,7 +4,6 @@
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
-import org.broadinstitute.hellbender.utils.Utils;
import scala.Tuple2;
import java.util.ArrayList;
@@ -22,18 +21,18 @@
@DefaultSerializer(AlignedContig.Serializer.class)
public final class AlignedContig {
- public final String contigName;
- public final byte[] contigSequence;
- public final List<AlignmentInterval> alignmentIntervals;
- public final boolean hasEquallyGoodAlnConfigurations;
+ private final String contigName;
+ private final byte[] contigSequence;
+ private final List<AlignmentInterval> alignmentIntervals;
- public AlignedContig(final String contigName, final byte[] contigSequence, final List<AlignmentInterval> alignmentIntervals,
- final boolean hasEquallyGoodAlnConfigurations) {
+ // throws if the alignments list is null
+ public AlignedContig(final String contigName, final byte[] contigSequence, final List<AlignmentInterval> alignmentIntervals) {
+ if (alignmentIntervals == null) {
+ throw new IllegalArgumentException("AlignedContig being constructed with null alignments: " + contigName);
+ }
this.contigName = contigName;
this.contigSequence = contigSequence;
- this.alignmentIntervals = Utils.stream( Utils.nonNull(alignmentIntervals) )
- .sorted(getAlignmentIntervalComparator()).collect(Collectors.toList());
- this.hasEquallyGoodAlnConfigurations = hasEquallyGoodAlnConfigurations;
+ this.alignmentIntervals = alignmentIntervals.stream().sorted(getAlignmentIntervalComparator()).collect(Collectors.toList());
}
AlignedContig(final Kryo kryo, final Input input) {
@@ -51,43 +50,49 @@ public AlignedContig(final String contigName, final byte[] contigSequence, final
for (int i = 0; i < nAlignments; ++i) {
alignmentIntervals.add(new AlignmentInterval(kryo, input));
}
-
- hasEquallyGoodAlnConfigurations = input.readBoolean();
}
- public static Comparator<AlignmentInterval> getAlignmentIntervalComparator() {
- Comparator<AlignmentInterval> comparePos = (AlignmentInterval a1, AlignmentInterval a2) -> Integer.compare(a1.startInAssembledContig, a2.startInAssembledContig);
- Comparator<AlignmentInterval> compareRefTig = (AlignmentInterval a1, AlignmentInterval a2) -> a1.referenceSpan.getContig().compareTo(a2.referenceSpan.getContig());
- Comparator<AlignmentInterval> compareRefSpanStart = (AlignmentInterval a1, AlignmentInterval a2) -> a1.referenceSpan.getStart() - a2.referenceSpan.getStart();
+ static Comparator<AlignmentInterval> getAlignmentIntervalComparator() {
+ Comparator<AlignmentInterval> comparePos = Comparator.comparingInt(aln -> aln.startInAssembledContig);
+ Comparator<AlignmentInterval> compareRefTig = Comparator.comparing(aln -> aln.referenceSpan.getContig());
+ Comparator<AlignmentInterval> compareRefSpanStart = Comparator.comparingInt(aln -> aln.referenceSpan.getStart());
return comparePos.thenComparing(compareRefTig).thenComparing(compareRefSpanStart);
}
- public boolean hasOnly2Alignments() {
+ boolean hasOnly2Alignments() {
return alignmentIntervals.size() == 2;
}
- public boolean isInformative() {
- return alignmentIntervals.size() > 1;
- }
-
/**
- * @return first alignment of the contig, {@code null} if it is unmapped.
+ * @return first alignment of the contig
*/
public AlignmentInterval getHeadAlignment() {
- if (alignmentIntervals.isEmpty())
- return null;
return alignmentIntervals.get(0);
}
/**
- * @return last alignment of the contig, {@code null} if it is unmapped.
+ * @return last alignment of the contig
*/
public AlignmentInterval getTailAlignment() {
- if (alignmentIntervals.isEmpty())
- return null;
return alignmentIntervals.get(alignmentIntervals.size() - 1);
}
+ public String getContigName() {
+ return contigName;
+ }
+
+ public byte[] getContigSequence() {
+ return contigSequence;
+ }
+
+ public boolean isUnmapped() {
+ return alignmentIntervals.isEmpty();
+ }
+
+ public List<AlignmentInterval> getAlignments() {
+ return alignmentIntervals;
+ }
+
@Override
public String toString() {
return formatContigInfo(
@@ -102,13 +107,12 @@ public static String formatContigInfo(final Tuple2> tigName
}
@Override
- public boolean equals(Object o) {
+ public boolean equals(final Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
- AlignedContig that = (AlignedContig) o;
+ final AlignedContig that = (AlignedContig) o;
- if (hasEquallyGoodAlnConfigurations != that.hasEquallyGoodAlnConfigurations) return false;
if (!contigName.equals(that.contigName)) return false;
if (!Arrays.equals(contigSequence, that.contigSequence)) return false;
return alignmentIntervals.equals(that.alignmentIntervals);
@@ -119,7 +123,6 @@ public int hashCode() {
int result = contigName.hashCode();
result = 31 * result + Arrays.hashCode(contigSequence);
result = 31 * result + alignmentIntervals.hashCode();
- result = 31 * result + (hasEquallyGoodAlnConfigurations ? 1 : 0);
return result;
}
@@ -135,7 +138,6 @@ void serialize(final Kryo kryo, final Output output) {
output.writeInt(alignmentIntervals.size());
alignmentIntervals.forEach(it -> it.serialize(kryo, output));
- output.writeBoolean(hasEquallyGoodAlnConfigurations);
}
public static final class Serializer extends com.esotericsoftware.kryo.Serializer<AlignedContig> {
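With the fields made private, call sites elsewhere in this patch switch to the new accessors. A sketch of the general pattern, where process(...) is a hypothetical consumer:

    // old: contig.alignmentIntervals.isEmpty() / contig.contigName / null-checking getHeadAlignment()
    // new: explicit isUnmapped() check plus accessors; head/tail alignments no longer return null
    if (!contig.isUnmapped() && contig.getAlignments().size() > 1) {
        process(contig.getContigName(), contig.getContigSequence(),
                contig.getHeadAlignment(), contig.getTailAlignment());   // process(...) is hypothetical
    }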
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPicker.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPicker.java
index 897aa1b91e4..6bd6142dd28 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPicker.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPicker.java
@@ -117,11 +117,11 @@ private static JavaRDD convertRawAlignmentsToAlignedContigAndFilt
*/
@VisibleForTesting
static boolean notDiscardForBadMQ(final AlignedContig contig) {
- if (contig.alignmentIntervals.size() < 2 ) {
- return (!contig.alignmentIntervals.isEmpty()) && contig.alignmentIntervals.get(0).mapQual > ALIGNMENT_MQ_THRESHOLD;
+ if (contig.getAlignments().size() < 2 ) {
+ return (!contig.getAlignments().isEmpty()) && contig.getAlignments().get(0).mapQual > ALIGNMENT_MQ_THRESHOLD;
} else {
int notBadMappingsCount = 0; // either more than 1 non-bad mappings, or at least 1 non-bad mapping containing large gap
- for (final AlignmentInterval alignment : contig.alignmentIntervals) {
+ for (final AlignmentInterval alignment : contig.getAlignments()) {
if (alignment.mapQual > ALIGNMENT_MQ_THRESHOLD) {
if (alignment.containsGapOfEqualOrLargerSize(GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY)) {
return true;// early return when a not-bad one contains a large gap
@@ -148,7 +148,7 @@ static JavaPairRDD, List> gatherBestC
final Set<String> canonicalChromosomes = SvDiscoveryUtils.getCanonicalChromosomes(nonCanonicalContigNamesFile, dictionary);
return parsedContigAlignments
- .mapToPair(alignedContig -> new Tuple2<>(new Tuple2<>(alignedContig.contigName,alignedContig.contigSequence),
+ .mapToPair(alignedContig -> new Tuple2<>(new Tuple2<>(alignedContig.getContigName(), alignedContig.getContigSequence()),
pickBestConfigurations(alignedContig, canonicalChromosomes, scoreDiffTolerance)))
.mapToPair(nameSeqAndConfigurations -> new Tuple2<>(nameSeqAndConfigurations._1,
filterSecondaryConfigurationsByMappingQualityThreshold(nameSeqAndConfigurations._2, mqThreshold)));
@@ -174,6 +174,10 @@ public static final class GoodAndBadMappings {
private final List<AlignmentInterval> badMappings;
private final AlignmentInterval goodMappingToNonCanonicalChromosome;
+ public GoodAndBadMappings(@Nonnull final List<AlignmentInterval> goodMappings) {
+ this(goodMappings, Collections.emptyList(), null);
+ }
+
public GoodAndBadMappings(@Nonnull final List<AlignmentInterval> goodMappings, @Nonnull final List<AlignmentInterval> badMappings,
final AlignmentInterval goodMappingToNonCanonicalChr) {
this.goodMappings = goodMappings;
@@ -248,15 +252,15 @@ public static List pickBestConfigurations(final AlignedConti
final Set<String> canonicalChromosomes,
final Double scoreDiffTolerance) {
// nothing to score if only one alignment
- if (alignedContig.alignmentIntervals.size() == 1) {
+ if (alignedContig.getAlignments().size() == 1) {
return Collections.singletonList(
- new GoodAndBadMappings(Collections.singletonList(alignedContig.alignmentIntervals.get(0)),
+ new GoodAndBadMappings(Collections.singletonList(alignedContig.getAlignments().get(0)),
Collections.emptyList())
);
}
// step 1: get max aligner score of mappings to canonical chromosomes and speed up in case of too many mappings
- final int maxCanonicalChrAlignerScore = alignedContig.alignmentIntervals.stream()
+ final int maxCanonicalChrAlignerScore = alignedContig.getAlignments().stream()
.filter(alignmentInterval -> canonicalChromosomes.contains(alignmentInterval.referenceSpan.getContig()))
.mapToInt(ai -> ai.alnScore).max().orElse(0); // possible that no mapping to canonical chromosomes
@@ -278,21 +282,22 @@ public static List pickBestConfigurations(final AlignedConti
// step 2: generate, and score configurations
return getGoodAndBadMappings(goodMappings, badMappings, goodMappingToNonCanonicalChromosome, canonicalChromosomes,
- newMaxCanonicalChrAlignerScore, scoreDiffTolerance, alignedContig.contigName);
+ newMaxCanonicalChrAlignerScore, scoreDiffTolerance, alignedContig.getContigName());
}
// speed up if number of alignments is too high (>10)
// if mapped to canonical chromosomes, MQ must be >10; otherwise, must have AS higher than max canonical aligner score
- private static GoodAndBadMappings speedUpWhenTooManyMappings(final AlignedContig alignedContig,
- final Set<String> canonicalChromosomes,
- final int maxCanonicalChrAlignerScore) {
+ @VisibleForTesting
+ static GoodAndBadMappings speedUpWhenTooManyMappings(final AlignedContig alignedContig,
+ final Set<String> canonicalChromosomes,
+ final int maxCanonicalChrAlignerScore) {
final List<AlignmentInterval> goods;
final List<AlignmentInterval> bads;
- if (alignedContig.alignmentIntervals.size() > 10) {
+ if (alignedContig.getAlignments().size() > 10) {
goods = new ArrayList<>();
bads = new ArrayList<>();
- for(final AlignmentInterval alignment : alignedContig.alignmentIntervals) {
+ for(final AlignmentInterval alignment : alignedContig.getAlignments()) {
final boolean isGood = (!canonicalChromosomes.contains(alignment.referenceSpan.getContig()) && alignment.alnScore > maxCanonicalChrAlignerScore)
|| alignment.mapQual > ALIGNMENT_MQ_THRESHOLD_FOR_SPEED_BOOST;
if (isGood)
@@ -301,7 +306,7 @@ private static GoodAndBadMappings speedUpWhenTooManyMappings(final AlignedContig
bads.add(alignment);
}
} else {
- goods = alignedContig.alignmentIntervals;
+ goods = alignedContig.getAlignments();
bads = Collections.emptyList();
}
return new GoodAndBadMappings(goods, bads);
@@ -408,7 +413,7 @@ public static Iterator reConstructContigF
if (bestConfigurations.size() > 1) { // more than one best configuration
return bestConfigurations.stream()
.map(configuration -> updateContigMappingsWithGapSplit(contigName, contigSeq, configuration, true))
- .sorted(sortConfigurations())
+ .sorted(getConfigurationComparator())
.iterator();
} else {
return Collections.singletonList(updateContigMappingsWithGapSplit(contigName, contigSeq, bestConfigurations.get(0), false)).iterator();
@@ -426,18 +431,20 @@ private static AssemblyContigWithFineTunedAlignments updateContigMappingsWithGap
}
return new AssemblyContigWithFineTunedAlignments(
- new AlignedContig(contigName, contigSeq, goodAndBadMappings.goodMappings, setResultContigAsAmbiguous),
+ new AlignedContig(contigName, contigSeq, goodAndBadMappings.goodMappings),
goodAndBadMappings.badMappings.stream().map(AlignmentInterval::toPackedString).collect(Collectors.toList()),
+ setResultContigAsAmbiguous,
goodAndBadMappings.getMayBeNullGoodMappingToNonCanonicalChromosome());
}
/**
* when two configurations are the same,
- * put the one with less alignments,
- * or less summed mismatches if still tie
- * first
+ * implement ordering that
+ * prefers the configuration with fewer alignments,
+ * breaking ties by preferring the one with fewer summed mismatches
*/
- private static Comparator<AssemblyContigWithFineTunedAlignments> sortConfigurations() {
+ @VisibleForTesting
+ static Comparator<AssemblyContigWithFineTunedAlignments> getConfigurationComparator() {
final Comparator<AssemblyContigWithFineTunedAlignments> numFirst
= Comparator.comparingInt(tig -> tig.getAlignments().size());
final Comparator<AssemblyContigWithFineTunedAlignments> mismatchSecond
@@ -503,8 +510,9 @@ static GoodAndBadMappings splitGaps(final GoodAndBadMappings configuration) {
return new GoodAndBadMappings(good, bad, configuration.goodMappingToNonCanonicalChromosome);
}
- private static boolean gappedAlignmentOffersBetterCoverage(final AlignmentInterval gapped,
- final AlignmentInterval overlappingNonGapped) {
+ @VisibleForTesting
+ static boolean gappedAlignmentOffersBetterCoverage(final AlignmentInterval gapped,
+ final AlignmentInterval overlappingNonGapped) {
final int diff = gapped.getSizeOnRead() - overlappingNonGapped.getSizeOnRead();
if ( diff == 0) {
return gapped.alnScore > overlappingNonGapped.alnScore;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigWithFineTunedAlignments.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigWithFineTunedAlignments.java
index da1e814dc75..a6d7a650eb4 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigWithFineTunedAlignments.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigWithFineTunedAlignments.java
@@ -12,6 +12,11 @@
import java.util.List;
import java.util.Set;
+/**
+ * A wrapper around {@link AlignedContig} to
+ * represent a mapped assembly contig whose alignments
+ * went through {@link AssemblyContigAlignmentsConfigPicker}.
+ */
@DefaultSerializer(AssemblyContigWithFineTunedAlignments.Serializer.class)
public final class AssemblyContigWithFineTunedAlignments {
private static final AlignedContig.Serializer contigSerializer = new AlignedContig.Serializer();
@@ -23,6 +28,8 @@ public final class AssemblyContigWithFineTunedAlignments {
// useful for annotation but not essential for reliable event interpretation
private final List<String> insertionMappings;
+ private final boolean hasEquallyGoodAlnConfigurations;
+
/**
* for signalling (i.e. not null) that the alignments went through the special treatment in
* {@link AssemblyContigAlignmentsConfigPicker#getBetterNonCanonicalMapping(Set, List, int)}
@@ -30,23 +37,27 @@ public final class AssemblyContigWithFineTunedAlignments {
private final String saTAGForGoodMappingToNonCanonicalChromosome;
public AssemblyContigWithFineTunedAlignments(final AlignedContig contig) {
- this(Utils.nonNull(contig), emptyInsertionMappings, NO_GOOD_MAPPING_TO_NON_CANONICAL_CHROMOSOME);
+ this(Utils.nonNull(contig), emptyInsertionMappings, false, NO_GOOD_MAPPING_TO_NON_CANONICAL_CHROMOSOME);
}
// {@code goodMappingToNonCanonicalChromosome} could be null
public AssemblyContigWithFineTunedAlignments(final AlignedContig contig,
final List<String> insertionMappings,
+ final boolean hasEquallyGoodAlnConfigurations,
final AlignmentInterval goodMappingToNonCanonicalChromosome) {
this(Utils.nonNull(contig), Utils.nonNull(insertionMappings),
+ hasEquallyGoodAlnConfigurations,
goodMappingToNonCanonicalChromosome == null ? NO_GOOD_MAPPING_TO_NON_CANONICAL_CHROMOSOME
: goodMappingToNonCanonicalChromosome.toSATagString());
}
public AssemblyContigWithFineTunedAlignments(final AlignedContig contig,
final List<String> insertionMappings,
+ final boolean hasEquallyGoodAlnConfigurations,
final String saTAGForGoodMappingToNonCanonicalChromosome) {
this.sourceTig = contig;
this.insertionMappings = insertionMappings;
+ this.hasEquallyGoodAlnConfigurations = hasEquallyGoodAlnConfigurations;
this.saTAGForGoodMappingToNonCanonicalChromosome = saTAGForGoodMappingToNonCanonicalChromosome;
}
@@ -58,6 +69,7 @@ public AssemblyContigWithFineTunedAlignments(final AlignedContig contig,
insertionMappings.add(input.readString());
}
+ hasEquallyGoodAlnConfigurations = input.readBoolean();
saTAGForGoodMappingToNonCanonicalChromosome = input.readString();
}
@@ -66,7 +78,7 @@ public AlignedContig getSourceContig() {
}
public List<AlignmentInterval> getAlignments() {
- return sourceTig.alignmentIntervals;
+ return sourceTig.getAlignments();
}
public AlignmentInterval getHeadAlignment() {
@@ -78,11 +90,11 @@ public AlignmentInterval getTailAlignment() {
}
public String getContigName() {
- return sourceTig.contigName;
+ return sourceTig.getContigName();
}
public byte[] getContigSequence() {
- return sourceTig.contigSequence;
+ return sourceTig.getContigSequence();
}
public List<String> getInsertionMappings() {
@@ -91,7 +103,7 @@ public List getInsertionMappings() {
// after fine tuning, a contig may have no good alignment left, or only 1
public final boolean isInformative() {
- return sourceTig.isInformative();
+ return sourceTig.getAlignments().size() > 1;
}
/**
@@ -114,7 +126,7 @@ public final boolean hasIncompletePicture() {
}
public final boolean hasEquallyGoodAlnConfigurations() {
- return sourceTig.hasEquallyGoodAlnConfigurations;
+ return hasEquallyGoodAlnConfigurations;
}
//==================================================================================================================
@@ -141,8 +153,8 @@ public boolean hasIncompletePictureFromTwoAlignments() {
}
private boolean hasIncompletePictureDueToRefSpanContainment() {
- return oneRefSpanContainsTheOther(sourceTig.alignmentIntervals.get(0).referenceSpan,
- sourceTig.alignmentIntervals.get(1).referenceSpan);
+ return oneRefSpanContainsTheOther(sourceTig.getHeadAlignment().referenceSpan,
+ sourceTig.getTailAlignment().referenceSpan);
}
/**
@@ -152,8 +164,8 @@ private boolean hasIncompletePictureDueToRefSpanContainment() {
*/
private boolean hasIncompletePictureDueToOverlappingRefOrderSwitch() {
- final AlignmentInterval one = sourceTig.alignmentIntervals.get(0);
- final AlignmentInterval two = sourceTig.alignmentIntervals.get(1);
+ final AlignmentInterval one = sourceTig.getHeadAlignment();
+ final AlignmentInterval two = sourceTig.getTailAlignment();
final SimpleInterval referenceSpanOne = one.referenceSpan;
final SimpleInterval referenceSpanTwo = two.referenceSpan;
@@ -212,10 +224,10 @@ private boolean hasIncompletePictureDueToOverlappingRefOrderSwitch() {
* This is similar to the case where a particular mapping is unreliable without confident left and right flanking.
*/
private boolean hasIncompletePictureFromMultipleAlignments() {
- final int goodAlignmentsCount = sourceTig.alignmentIntervals.size();
+ final int goodAlignmentsCount = sourceTig.getAlignments().size();
- final AlignmentInterval head = sourceTig.alignmentIntervals.get(0),
- tail = sourceTig.alignmentIntervals.get(goodAlignmentsCount-1);
+ final AlignmentInterval head = sourceTig.getHeadAlignment(),
+ tail = sourceTig.getTailAlignment();
// mapped to different chr or strand
if ( !head.referenceSpan.getContig().equals(tail.referenceSpan.getContig()) || head.forwardStrand != tail.forwardStrand)
@@ -233,7 +245,7 @@ private boolean hasIncompletePictureFromMultipleAlignments() {
Math.min(referenceSpanHead.getStart(), referenceSpanTail.getStart()),
Math.max(referenceSpanHead.getEnd(), referenceSpanTail.getEnd()));
final boolean notCompleteDupRegion =
- sourceTig.alignmentIntervals
+ sourceTig.getAlignments()
.subList(1, goodAlignmentsCount - 1) // take middle (no head no tail) alignments' ref span
.stream().map(ai -> ai.referenceSpan)
.anyMatch( middle -> middle.overlaps(validRegion) && !validRegion.contains(middle));
@@ -256,8 +268,8 @@ private static boolean oneRefSpanContainsTheOther(final SimpleInterval one, fina
public boolean firstAndLastAlignmentMappedToSameChr() {
- final String firstMappedChr = this.sourceTig.alignmentIntervals.get(0).referenceSpan.getContig();
- final String lastMappedChr = this.sourceTig.alignmentIntervals.get(this.sourceTig.alignmentIntervals.size() - 1).referenceSpan.getContig();
+ final String firstMappedChr = this.sourceTig.getHeadAlignment().referenceSpan.getContig();
+ final String lastMappedChr = this.sourceTig.getTailAlignment().referenceSpan.getContig();
return firstMappedChr.equals(lastMappedChr);
}
@@ -268,6 +280,7 @@ void serialize(final Kryo kryo, final Output output) {
for (final String mapping : insertionMappings) {
output.writeString(mapping);
}
+ output.writeBoolean(hasEquallyGoodAlnConfigurations);
output.writeString(saTAGForGoodMappingToNonCanonicalChromosome);
}
@@ -288,6 +301,7 @@ public String toString() {
final StringBuilder sb = new StringBuilder("AssemblyContigWithFineTunedAlignments{");
sb.append("sourceTig=").append(sourceTig);
sb.append(", insertionMappings=").append(insertionMappings);
+ sb.append(", hasEquallyGoodAlnConfigurations=").append(hasEquallyGoodAlnConfigurations);
sb.append(", saTAGForGoodMappingToNonCanonicalChromosome='").append(saTAGForGoodMappingToNonCanonicalChromosome).append('\'');
sb.append('}');
return sb.toString();
@@ -300,6 +314,7 @@ public boolean equals(final Object o) {
final AssemblyContigWithFineTunedAlignments that = (AssemblyContigWithFineTunedAlignments) o;
+ if (hasEquallyGoodAlnConfigurations != that.hasEquallyGoodAlnConfigurations) return false;
if (!sourceTig.equals(that.sourceTig)) return false;
if (!insertionMappings.equals(that.insertionMappings)) return false;
return saTAGForGoodMappingToNonCanonicalChromosome.equals(that.saTAGForGoodMappingToNonCanonicalChromosome);
@@ -309,6 +324,7 @@ public boolean equals(final Object o) {
public int hashCode() {
int result = sourceTig.hashCode();
result = 31 * result + insertionMappings.hashCode();
+ result = 31 * result + (hasEquallyGoodAlnConfigurations ? 1 : 0);
result = 31 * result + saTAGForGoodMappingToNonCanonicalChromosome.hashCode();
return result;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/ContigAlignmentsModifier.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/ContigAlignmentsModifier.java
index cd49ee3ccf1..bcabd033cb6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/ContigAlignmentsModifier.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/ContigAlignmentsModifier.java
@@ -323,7 +323,7 @@ public static Iterable splitGappedAlignment(final AlignmentIn
final AlignmentInterval split = new AlignmentInterval(referenceInterval, contigIntervalStart, contigIntervalEnd,
cigarForNewAlignmentInterval, oneRegion.forwardStrand, originalMapQ,
- AlignmentInterval.NO_NM, oneRegion.alnScore, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT);
+ AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT);
result.add(split);
@@ -366,7 +366,7 @@ public static Iterable splitGappedAlignment(final AlignmentIn
result.add(new AlignmentInterval(lastReferenceInterval,
contigIntervalStart, unclippedContigLen-clippedNBasesFromEnd, lastForwardStrandCigar,
oneRegion.forwardStrand, originalMapQ,
- AlignmentInterval.NO_NM, oneRegion.alnScore, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT));
+ AlignmentInterval.NO_NM, AlignmentInterval.NO_AS, ContigAlignmentsModifier.AlnModType.FROM_SPLIT_GAPPED_ALIGNMENT));
return result;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/AssemblyContigAlignmentSignatureClassifier.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/AssemblyContigAlignmentSignatureClassifier.java
index b175b3a845e..23c4a2e1309 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/AssemblyContigAlignmentSignatureClassifier.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/AssemblyContigAlignmentSignatureClassifier.java
@@ -150,11 +150,11 @@ static EnumMap> process
static boolean indicatesIntraChrStrandSwitchBkpts(final AssemblyContigWithFineTunedAlignments decoratedContig) {
final AlignedContig contig = decoratedContig.getSourceContig();
- if (contig.alignmentIntervals.size() != 2)
+ if (contig.getAlignments().size() != 2)
return false;
- final AlignmentInterval first = contig.alignmentIntervals.get(0);
- final AlignmentInterval last = contig.alignmentIntervals.get(1);
+ final AlignmentInterval first = contig.getAlignments().get(0);
+ final AlignmentInterval last = contig.getAlignments().get(1);
if ( !first.referenceSpan.getContig().equals(last.referenceSpan.getContig()) )
return false;
@@ -163,11 +163,11 @@ static boolean indicatesIntraChrStrandSwitchBkpts(final AssemblyContigWithFineTu
static boolean indicatesIntraChrTandemDupBkpts(final AssemblyContigWithFineTunedAlignments decoratedContig) {
final AlignedContig contig = decoratedContig.getSourceContig();
- if (contig.alignmentIntervals.size() != 2)
+ if (contig.getAlignments().size() != 2)
return false;
- final AlignmentInterval first = contig.alignmentIntervals.get(0);
- final AlignmentInterval last = contig.alignmentIntervals.get(1);
+ final AlignmentInterval first = contig.getAlignments().get(0);
+ final AlignmentInterval last = contig.getAlignments().get(1);
if ( !first.referenceSpan.getContig().equals(last.referenceSpan.getContig()) )
return false;
@@ -329,10 +329,11 @@ static MultipleAlignmentReclassificationResults reClassifyContigsWithMultipleAli
final AssemblyContigAlignmentsConfigPicker.GoodAndBadMappings refinedMappings =
removeNonUniqueMappings(tig.getAlignments(), mapQThresholdInclusive, uniqReadLenInclusive);
- final AlignedContig updatedTig = new AlignedContig(tig.getSourceContig().contigName, tig.getSourceContig().contigSequence,
- refinedMappings.getGoodMappings(), tig.hasEquallyGoodAlnConfigurations());
+ final AlignedContig updatedTig = new AlignedContig(tig.getContigName(), tig.getContigSequence(),
+ refinedMappings.getGoodMappings());
return new AssemblyContigWithFineTunedAlignments(updatedTig,
refinedMappings.getBadMappingsAsCompactStrings(),
+ tig.hasEquallyGoodAlnConfigurations(),
tig.getSAtagForGoodMappingToNonCanonicalChromosome());
});
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/BreakpointComplications.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/BreakpointComplications.java
index 290ba7a85ec..4790bf4756a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/BreakpointComplications.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/BreakpointComplications.java
@@ -88,6 +88,7 @@ public Map toVariantAttributes() {
if ( !getInsertedSequenceForwardStrandRep().isEmpty() ) {
attributeMap.put(GATKSVVCFConstants.INSERTED_SEQUENCE, getInsertedSequenceForwardStrandRep());
+ attributeMap.put(GATKSVVCFConstants.INSERTED_SEQUENCE_LENGTH, getInsertedSequenceForwardStrandRep().length());
}
if ( !getHomologyForwardStrandRep().isEmpty() ) {
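For illustration, an insertion record's attribute map now carries the inserted sequence and its length side by side; the values below are made up:

    // hypothetical values, for illustration only
    attributeMap.put(GATKSVVCFConstants.INSERTED_SEQUENCE, "ACGTACGT");
    attributeMap.put(GATKSVVCFConstants.INSERTED_SEQUENCE_LENGTH, 8);   // "ACGTACGT".length()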
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ChimericAlignment.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ChimericAlignment.java
index 10f8496f0b4..fb90fafece0 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ChimericAlignment.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ChimericAlignment.java
@@ -329,11 +329,11 @@ public static List parseOneContig(final AlignedContig aligned
final int mapQualThresholdInclusive,
final boolean filterWhollyContainedAlignments) {
- if (alignedContig.alignmentIntervals.size() < 2) {
+ if (alignedContig.getAlignments().size() < 2) {
return new ArrayList<>();
}
- final Iterator<AlignmentInterval> iterator = alignedContig.alignmentIntervals.iterator();
+ final Iterator<AlignmentInterval> iterator = alignedContig.getAlignments().iterator();
// fast forward to the first alignment region with high MapQ
AlignmentInterval current = iterator.next();
@@ -343,7 +343,7 @@ public static List parseOneContig(final AlignedContig aligned
}
}
- final List<ChimericAlignment> results = new ArrayList<>(alignedContig.alignmentIntervals.size() - 1);
+ final List<ChimericAlignment> results = new ArrayList<>(alignedContig.getAlignments().size() - 1);
final List<String> insertionMappings = new ArrayList<>();
// then iterate over the AR's in pair to identify CA's.
@@ -366,7 +366,7 @@ public static List parseOneContig(final AlignedContig aligned
// this was initially developed for ins/del (and tested for that purpose), simple translocations travel through a different code path at the moment.
// TODO: ultimately we need to merge these two code paths
final ChimericAlignment chimericAlignment = new ChimericAlignment(current, next, insertionMappings,
- alignedContig.contigName, referenceDictionary);
+ alignedContig.getContigName(), referenceDictionary);
// the following check/filter is due to the fact that simple translocations are to be handled in a different code path
if (chimericAlignment.isNeitherSimpleTranslocationNorIncompletePicture())
results.add(chimericAlignment);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInducingAssemblyContig.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInducingAssemblyContig.java
index d641f32ea74..7ff8c2b2154 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInducingAssemblyContig.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInducingAssemblyContig.java
@@ -283,10 +283,10 @@ static final class BasicInfo {
@VisibleForTesting
BasicInfo(final AlignedContig contig) {
+ if (contig.isUnmapped())
+ throw new GATKException("Trying to extract information from unmapped read: " + contig.toString());
final AlignmentInterval head = contig.getHeadAlignment();
final AlignmentInterval tail = contig.getTailAlignment();
- if (head == null || tail == null)
- throw new GATKException("Head or tail alignment is null from contig:\n" + contig.toString());
eventPrimaryChromosome = head.referenceSpan.getContig();
forwardStrandRep = head.forwardStrand;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInterpreter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInterpreter.java
index 1034a8f1049..436f2e692a5 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInterpreter.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInterpreter.java
@@ -80,8 +80,9 @@ static AssemblyContigWithFineTunedAlignments furtherPreprocess(final AssemblyCon
return new AssemblyContigWithFineTunedAlignments(
new AlignedContig(contigWithFineTunedAlignments.getContigName(), contigWithFineTunedAlignments.getContigSequence(),
- deOverlappedAlignmentConfiguration, contigWithFineTunedAlignments.hasEquallyGoodAlnConfigurations()),
+ deOverlappedAlignmentConfiguration),
contigWithFineTunedAlignments.getInsertionMappings(),
+ contigWithFineTunedAlignments.hasEquallyGoodAlnConfigurations(),
contigWithFineTunedAlignments.getSAtagForGoodMappingToNonCanonicalChromosome());
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
index 5ba5283ae40..7e8cf65f99c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
@@ -41,6 +41,7 @@ public final class GATKSVVCFConstants {
// applicable to all precise records when available
public static final String SEQ_ALT_HAPLOTYPE = "SEQ_ALT_HAPLOTYPE";
public static final String INSERTED_SEQUENCE = "INSSEQ";
+ public static final String INSERTED_SEQUENCE_LENGTH = "INSLEN";
public static final String INSERTED_SEQUENCE_MAPPINGS = "INSSEQ_MAP";
public static final String HOMOLOGY = "HOMSEQ";
public static final String HOMOLOGY_LENGTH = "HOMLEN";
@@ -84,7 +85,7 @@ public final class GATKSVVCFConstants {
= Stream.of("SVTYPE", "SVLEN", "MATEID", "INV", "DEL", "INS", "DUP", "DUP:INV",
"CIPOS", "CIEND", "IMPRECISE", "READ_PAIR_SUPPORT", "SPLIT_READ_SUPPORT",
"CTG_NAMES", "TOTAL_MAPPINGS", "MAPPING_QUALITIES", "HQ_MAPPINGS", "ALIGN_LENGTHS", "MAX_ALIGN_LENGTH",
- "SEQ_ALT_HAPLOTYPE", "INSSEQ", "INSSEQ_MAP", "HOMSEQ", "HOMLEN", "DUP_REPEAT_UNIT_REF_SPAN",
+ "SEQ_ALT_HAPLOTYPE", "INSSEQ", "INSLEN", "INSSEQ_MAP", "HOMSEQ", "HOMLEN", "DUP_REPEAT_UNIT_REF_SPAN",
"DUP_SEQ_CIGARS", "DUP_NUM", "DUP_ANNOTATIONS_IMPRECISE", "CONTRACTION", "EXPANSION", "DUP_ORIENTATIONS",
"INV33", "INV55", "EXTERNAL_CNV_CALLS", "DUP_IMPRECISE_AFFECTED_RANGE", "CTG_GOOD_NONCANONICAL_MAPPING")
.sorted().collect(Collectors.toList());
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFHeaderLines.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFHeaderLines.java
index 8989a6ce305..543bd546888 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFHeaderLines.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFHeaderLines.java
@@ -83,6 +83,10 @@ private static void addSymbAltAlleleLine(final VCFSimpleHeaderLine line) {
VCFHeaderLineCount.UNBOUNDED,
VCFHeaderLineType.String,
"Inserted sequence at the breakpoint"));
+ addInfoLine(new VCFInfoHeaderLine(GATKSVVCFConstants.INSERTED_SEQUENCE_LENGTH,
+ VCFHeaderLineCount.A,
+ VCFHeaderLineType.Integer,
+ "Length of inserted sequence (note for duplication records, this does not count the extra copies of the duplicated sequence)"));
addInfoLine(new VCFInfoHeaderLine(GATKSVVCFConstants.INSERTED_SEQUENCE_MAPPINGS,
VCFHeaderLineCount.UNBOUNDED,
VCFHeaderLineType.String,
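To see how the new INSLEN header line pairs with the existing INSSEQ annotation, here is a hedged sketch using htsjdk's VariantContextBuilder; the class name, coordinates, and allele values are made up for illustration, and this is not the pipeline's actual annotation code:

    import htsjdk.variant.variantcontext.Allele;
    import htsjdk.variant.variantcontext.VariantContext;
    import htsjdk.variant.variantcontext.VariantContextBuilder;
    import java.util.Arrays;

    // Illustration only: INSLEN carries the length of the INSSEQ value.
    public class InsLenSketch {
        public static void main(final String[] args) {
            final String insertedSeq = "ACGTACGT";  // made-up inserted sequence
            final VariantContext vc = new VariantContextBuilder(
                    "sketch", "21", 69294, 69294,   // made-up source and coordinates
                    Arrays.asList(Allele.create("N", true), Allele.create("<INS>", false)))
                    .attribute("INSSEQ", insertedSeq)           // GATKSVVCFConstants.INSERTED_SEQUENCE
                    .attribute("INSLEN", insertedSeq.length())  // GATKSVVCFConstants.INSERTED_SEQUENCE_LENGTH
                    .make();
            System.out.println(vc);
        }
    }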
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducerUnitTest.java
index 473002298e0..dfa5c4f233f 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducerUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducerUnitTest.java
@@ -181,12 +181,12 @@ private Object[][] dataForIntegrativeTest() {
// simple insertion
data.add(new Object[]{forSimpleInsertion_plus, Stream.concat( commonAttributes.stream(),
- Sets.newHashSet(INSERTED_SEQUENCE).stream()).sorted().collect(Collectors.toList()),
+ Sets.newHashSet(INSERTED_SEQUENCE, INSERTED_SEQUENCE_LENGTH).stream()).sorted().collect(Collectors.toList()),
altHaplotypeSeq, broadcastCNVCalls, referenceBroadcast, refSeqDictBroadcast});
// long range substitution
data.add(new Object[]{forLongRangeSubstitution_minus, Stream.concat( commonAttributes.stream(),
- Sets.newHashSet(INSERTED_SEQUENCE).stream()).sorted().collect(Collectors.toList()),
+ Sets.newHashSet(INSERTED_SEQUENCE, INSERTED_SEQUENCE_LENGTH).stream()).sorted().collect(Collectors.toList()),
altHaplotypeSeq, broadcastCNVCalls, referenceBroadcast, refSeqDictBroadcast});
// simple deletion with homology
@@ -206,7 +206,7 @@ private Object[][] dataForIntegrativeTest() {
// simple tandem dup expansion from 1 unit to 2 units and novel insertion
data.add(new Object[]{forSimpleTanDupExpansionWithNovelIns_minus, Stream.concat( commonAttributes.stream(),
- Sets.newHashSet(DUP_TAN_EXPANSION_STRING, DUP_REPEAT_UNIT_REF_SPAN, DUP_SEQ_CIGARS, DUPLICATION_NUMBERS, DUP_ORIENTATIONS, INSERTED_SEQUENCE).stream()).sorted().collect(Collectors.toList()),
+ Sets.newHashSet(DUP_TAN_EXPANSION_STRING, DUP_REPEAT_UNIT_REF_SPAN, DUP_SEQ_CIGARS, DUPLICATION_NUMBERS, DUP_ORIENTATIONS, INSERTED_SEQUENCE, INSERTED_SEQUENCE_LENGTH).stream()).sorted().collect(Collectors.toList()),
altHaplotypeSeq, broadcastCNVCalls, referenceBroadcast, refSeqDictBroadcast});
// tandem dup expansion from 1 unit to 2 units with pseudo-homology
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SVTestUtils.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SVTestUtils.java
index 61eb5fa6f58..bfef640a288 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SVTestUtils.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SVTestUtils.java
@@ -78,9 +78,9 @@ public static AlignedContig fromPrimarySAMRecordString(final String samRecordStr
for (final String sup : supplements) {
alignments.add( new AlignmentInterval(sup) );
}
- return new AlignedContig(readName, sequence, alignments, false);
+ return new AlignedContig(readName, sequence, alignments);
} else {
- return new AlignedContig(readName, sequence, Collections.singletonList(primaryAlignment), false);
+ return new AlignedContig(readName, sequence, Collections.singletonList(primaryAlignment));
}
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SimpleSVDiscoveryTestDataProvider.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SimpleSVDiscoveryTestDataProvider.java
index 8253206f7d8..4b4fe5e6ce9 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SimpleSVDiscoveryTestDataProvider.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SimpleSVDiscoveryTestDataProvider.java
@@ -196,8 +196,8 @@ public static final class TestDataForSimpleSVs {
// reference intervals are changed from real values, which were generated on a different chromosome, to a fake but valid value.
final AlignmentInterval region1 = new AlignmentInterval(new SimpleInterval("21", 69149, 69294), 1, 146, TextCigarCodec.decode("146M51S"), true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE);
final AlignmentInterval region2 = new AlignmentInterval(new SimpleInterval("21", 69315, 69364), 148, 197, TextCigarCodec.decode("147S50M"), false, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE);
- final AlignedContig alignedContig = new AlignedContig("asm000001:tig00001", contigSeq, Arrays.asList(region1, region2), false);
- final NovelAdjacencyAndAltHaplotype breakpoints = new NovelAdjacencyAndAltHaplotype(new ChimericAlignment(region1, region2, Collections.emptyList(), alignedContig.contigName, b37_seqDict), alignedContig.contigSequence, b37_seqDict);
+ final AlignedContig alignedContig = new AlignedContig("asm000001:tig00001", contigSeq, Arrays.asList(region1, region2));
+ final NovelAdjacencyAndAltHaplotype breakpoints = new NovelAdjacencyAndAltHaplotype(new ChimericAlignment(region1, region2, Collections.emptyList(), alignedContig.getContigName(), b37_seqDict), alignedContig.getContigSequence(), b37_seqDict);
return new TestDataForSimpleSVs(region1, region2, breakpoints, "asm000001:tig00001");
}
@@ -208,7 +208,7 @@ public static final class TestDataForSimpleSVs {
final AlignmentInterval region1 = new AlignmentInterval(new SimpleInterval(chrForLongContig1, 20138007, 20142231), 1, contigSequence.length - 1986, TextCigarCodec.decode("1986S236M2D1572M1I798M5D730M1I347M4I535M"), false, 60, 36, 100, ContigAlignmentsModifier.AlnModType.NONE);
final AlignmentInterval region2 = new AlignmentInterval(new SimpleInterval(chrForLongContig1, 20152030, 20154634), 3604, contigSequence.length, TextCigarCodec.decode("3603H24M1I611M1I1970M"), true, 60, 36, 100, ContigAlignmentsModifier.AlnModType.NONE);
- final AlignedContig alignedContig = new AlignedContig("asm702700:tig00001", contigSequence, Arrays.asList(region1, region2), false);
+ final AlignedContig alignedContig = new AlignedContig("asm702700:tig00001", contigSequence, Arrays.asList(region1, region2));
final NovelAdjacencyAndAltHaplotype breakpoints = new NovelAdjacencyAndAltHaplotype(ChimericAlignment.parseOneContig(alignedContig, b37_seqDict, true, StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH, StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD, true).get(0), contigSequence, b37_seqDict);
return new TestDataForSimpleSVs(region1, region2, breakpoints, "asm702700:tig00001");
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtilsUnitTest.java
new file mode 100644
index 00000000000..4c5da7b859f
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtilsUnitTest.java
@@ -0,0 +1,64 @@
+package org.broadinstitute.hellbender.tools.spark.sv.discovery;
+
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.ValidationStringency;
+import org.apache.spark.api.java.JavaRDD;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.broadinstitute.hellbender.engine.spark.SparkContextFactory;
+import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.test.BaseTest;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+
+public class SvDiscoveryUtilsUnitTest extends GATKBaseTest {
+
+ @Test(groups = "sv")
+ public void test() {
+
+ ReadsSparkSource readSource = new ReadsSparkSource(SparkContextFactory.getTestSparkContext(), ValidationStringency.LENIENT);
+ final String referenceFile = b38_reference_20_21;
+
+ final String inputBam = toolsTestDir + "/spark/sv/utils/testSAMWriter_chr20.bam";
+
+ JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, referenceFile);
+
+ final List<GATKRead> temp = rddParallelReads.collect();
+ final byte[] bases = temp.get(0).getBases();
+ Assert.assertNotNull(bases);
+ SAMFileHeader header = readSource.getHeader(inputBam, referenceFile);
+
+ final File tempDirNew = BaseTest.createTempDir("new");
+ tempDirNew.deleteOnExit();
+
+ // test output empty set
+ String outputPath = tempDirNew.getAbsolutePath() + "/out.bam";
+ SvDiscoveryUtils.writeSAMRecords(rddParallelReads, Collections.emptySet(), outputPath, header);
+ Assert.assertTrue(readSource.getParallelReads(outputPath, referenceFile).collect().isEmpty());
+
+ // test output BAM
+ final HashSet<String> names = new HashSet<>(Arrays.asList("asm000004:tig00016",
+ "asm028919:tig00012",
+ "asm016781:tig00011",
+ "asm028306:tig00006",
+ "asm028311:tig00040",
+ "asm000004:tig00007",
+ "asm000004:tig00032",
+ "asm016781:tig00011",
+ "asm028317:tig00003",
+ "asm000004:tig00022"));
+ SvDiscoveryUtils.writeSAMRecords(rddParallelReads, names, outputPath, header);
+ Assert.assertEquals(readSource.getParallelReads(outputPath, referenceFile).collect().size(), 16);
+
+ // test output SAM
+ outputPath = tempDirNew.getAbsolutePath() + "/out.sam";
+ SvDiscoveryUtils.writeSAMRecords(rddParallelReads, names, outputPath, header);
+ Assert.assertEquals(readSource.getParallelReads(outputPath, referenceFile).collect().size(), 16);
+ }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedAssemblyUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedAssemblyUnitTest.java
index c0c10bc5b0c..8217ca4b50c 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedAssemblyUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedAssemblyUnitTest.java
@@ -73,11 +73,11 @@ private Object[][] createInputsAndExpectedResults_Serialization() {
alignmentIntervalsForSimpleInversion.add(alignmentIntervalRight);
if (pair == 0) {
- allContigs.add( new AlignedContig(AlignedAssemblyOrExcuse.formatContigName(0, 0), dummySequenceForContigOne, alignmentIntervalsForSimpleInversion, false) );
+ allContigs.add( new AlignedContig(AlignedAssemblyOrExcuse.formatContigName(0, 0), dummySequenceForContigOne, alignmentIntervalsForSimpleInversion) );
} else if (pair <3) {
- allContigs.add( new AlignedContig(AlignedAssemblyOrExcuse.formatContigName(1, pair-1), pair==1 ? dummySequenceForContigTwo : dummySequenceForContigThree, alignmentIntervalsForSimpleInversion, false) );
+ allContigs.add( new AlignedContig(AlignedAssemblyOrExcuse.formatContigName(1, pair-1), pair==1 ? dummySequenceForContigTwo : dummySequenceForContigThree, alignmentIntervalsForSimpleInversion) );
} else {
- allContigs.add( new AlignedContig(AlignedAssemblyOrExcuse.formatContigName(2, 0), dummySequenceForContigFour, alignmentIntervalsForSimpleInversion, false) );
+ allContigs.add( new AlignedContig(AlignedAssemblyOrExcuse.formatContigName(2, 0), dummySequenceForContigFour, alignmentIntervalsForSimpleInversion) );
}
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContigGeneratorUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContigGeneratorUnitTest.java
index 8e748e22e0f..a399b39a69e 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContigGeneratorUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContigGeneratorUnitTest.java
@@ -21,10 +21,7 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -71,9 +68,9 @@ public void testConvertGATKReadsToAlignedContig() {
final List<SAMRecord> reads = Stream.of(read1, read2, read3).map(read -> read.convertToSAMRecord(null)).collect(Collectors.toList());
final AlignedContig alignedContig = SvDiscoverFromLocalAssemblyContigAlignmentsSpark.SAMFormattedContigAlignmentParser.parseReadsAndOptionallySplitGappedAlignments(reads, GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, true);
- assertEquals(alignedContig.contigSequence, read2.getBases());
+ assertEquals(alignedContig.getContigSequence(), read2.getBases());
- assertEquals(alignedContig.alignmentIntervals.size(), 3);
+ assertEquals(alignedContig.getAlignments().size(), 3);
final byte[] read4Bytes = SimpleSVDiscoveryTestDataProvider.LONG_CONTIG1.getBytes();
final String read4Seq = new String(read4Bytes);
@@ -100,9 +97,9 @@ public void testConvertGATKReadsToAlignedContig() {
final AlignedContig alignedContig2 = SvDiscoverFromLocalAssemblyContigAlignmentsSpark.SAMFormattedContigAlignmentParser.parseReadsAndOptionallySplitGappedAlignments(reads2, GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, true);
// these should be the reverse complements of each other
- assertEquals(alignedContig2.contigSequence.length, read4.getBases().length);
+ assertEquals(alignedContig2.getContigSequence().length, read4.getBases().length);
- final List<AlignmentInterval> alignmentIntervals2 = alignedContig2.alignmentIntervals;
+ final List<AlignmentInterval> alignmentIntervals2 = alignedContig2.getAlignments();
assertEquals(alignmentIntervals2.size(), 2);
final AlignmentInterval alignmentInterval4 = alignmentIntervals2.get(0);
@@ -153,21 +150,21 @@ public void testConvertAlignedAssemblyOrExcuseToAlignedContigsDirectAndConcordan
final Iterator<AlignedContig> it = alignedContigsIncludingUnmapped.iterator();
- Assert.assertTrue(it.next().alignmentIntervals.isEmpty());
- Assert.assertTrue(it.next().alignmentIntervals.isEmpty());
+ Assert.assertTrue(it.next().getAlignments().isEmpty());
+ Assert.assertTrue(it.next().getAlignments().isEmpty());
- final List<AlignmentInterval> alignmentIntervalsForCleanContig = it.next().alignmentIntervals;
+ final List<AlignmentInterval> alignmentIntervalsForCleanContig = it.next().getAlignments();
Assert.assertEquals(alignmentIntervalsForCleanContig.size(), 1);
Assert.assertEquals(alignmentIntervalsForCleanContig.get(0), new AlignmentInterval(new SimpleInterval(dummyRefName, 1000001, 1001000), 1, 1000, TextCigarCodec.decode("1000M"), true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE));
- final List<AlignmentInterval> alignmentIntervalsForContigWithGappedAlignment = it.next().alignmentIntervals;
+ final List<AlignmentInterval> alignmentIntervalsForContigWithGappedAlignment = it.next().getAlignments();
Assert.assertEquals(alignmentIntervalsForContigWithGappedAlignment.size(), 3);
// test direct conversion (essentially the filtering step)
final List<AlignedContig> parsedContigsViaDirectRoute
= StructuralVariationDiscoveryPipelineSpark.InMemoryAlignmentParser.filterAndConvertToAlignedContigDirect(Collections.singleton(alignedAssembly), refNames, null);
Assert.assertEquals(parsedContigsViaDirectRoute.size(), 2);
- Assert.assertTrue( parsedContigsViaDirectRoute.containsAll(Utils.stream(alignedContigsIncludingUnmapped).filter(ctg -> !ctg.alignmentIntervals.isEmpty()).collect(Collectors.toList())) );
+ Assert.assertTrue( parsedContigsViaDirectRoute.containsAll(Utils.stream(alignedContigsIncludingUnmapped).filter(ctg -> !ctg.getAlignments().isEmpty()).collect(Collectors.toList())) );
// concordance test with results obtained via SAM route
final List<AlignedContig> parsedContigsViaSAMRoute
@@ -195,6 +192,20 @@ public void testConvertUnmappedRecords() {
SvDiscoverFromLocalAssemblyContigAlignmentsSpark.SAMFormattedContigAlignmentParser.
parseReadsAndOptionallySplitGappedAlignments(Collections.singletonList(unmappedSam),
GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, true);
- Assert.assertTrue(unmappedContig.alignmentIntervals.isEmpty());
+ Assert.assertTrue(unmappedContig.isUnmapped());
+ }
+
+ @DataProvider(name = "forNullOrEmptyAlignments")
+ private Object[][] forNullOrEmptyAlignments() {
+ final List