diff --git a/scripts/sv/runWholePipeline.sh b/scripts/sv/runWholePipeline.sh
index ed822519f17..4daea29e78c 100755
--- a/scripts/sv/runWholePipeline.sh
+++ b/scripts/sv/runWholePipeline.sh
@@ -58,7 +58,7 @@ case ${GATK_SV_TOOL} in
     "StructuralVariationDiscoveryPipelineSpark")
         TOOL_OPTIONS="\
             -I ${INPUT_BAM} \
-            -O ${PROJECT_OUTPUT_DIR}/variants/inv_del_ins.vcf \
+            -O ${PROJECT_OUTPUT_DIR}/variants/ \
             -R ${REF_TWOBIT} \
             --aligner-index-image ${REF_INDEX_IMAGE} \
             --exclusion-intervals ${INTERVAL_KILL_LIST} \
@@ -67,9 +67,9 @@ case ${GATK_SV_TOOL} in
             --breakpoint-intervals ${PROJECT_OUTPUT_DIR}/intervals \
             --high-coverage-intervals "${PROJECT_OUTPUT_DIR}/highCoverageIntervals.bed" \
             --fastq-dir ${PROJECT_OUTPUT_DIR}/fastq \
-            --contig-sam-file ${PROJECT_OUTPUT_DIR}/assemblies.sam \
+            --contig-sam-file ${PROJECT_OUTPUT_DIR}/assemblies.bam \
             --target-link-file ${PROJECT_OUTPUT_DIR}/target_links.bedpe \
-            --exp-variants-out-dir ${PROJECT_OUTPUT_DIR}/experimentalVariantInterpretations"
+            --exp-interpret"
         ;;
     "ExtractSVEvidenceSpark")
         TOOL_OPTIONS="\
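For orientation, a direct invocation of the pipeline under the new interface might look like the sketch below; the `gatk` launcher usage is standard, but every path here is a made-up placeholder and the optional interval/evidence arguments from the script above are omitted. Note the two behavioral changes: `-O` now names a directory rather than a VCF file, and experimental interpretation is requested with the bare `--exp-interpret` flag instead of a separate output directory.

```bash
# Hypothetical invocation (placeholder paths); mirrors the updated script above.
gatk StructuralVariationDiscoveryPipelineSpark \
    -I /data/sample.bam \
    -O /data/output/variants/ \
    -R /ref/genome.2bit \
    --aligner-index-image /ref/genome.img \
    --contig-sam-file /data/output/assemblies.bam \
    --exp-interpret
```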

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java
index 8196da77ff7..b2b83a7747f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java
@@ -17,6 +17,7 @@
 import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
 import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
 import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
+import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.tools.spark.sv.discovery.AnnotatedVariantProducer;
 import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoverFromLocalAssemblyContigAlignmentsSpark;
 import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputData;
@@ -33,10 +34,13 @@
 import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment;
 import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignmentUtils;
 import org.broadinstitute.hellbender.utils.fermi.FermiLiteAssembly;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
 import scala.Serializable;
 
+import java.io.IOException;
+import java.nio.file.Paths;
 import java.util.EnumMap;
 import java.util.List;
 import java.util.stream.Collectors;
@@ -49,7 +53,8 @@
  * Runs the structural variation discovery workflow on a single sample
  *
  * This tool packages the algorithms described in {@link FindBreakpointEvidenceSpark} and
- * {@link DiscoverVariantsFromContigAlignmentsSAMSpark} as an integrated workflow. Please consult the
+ * {@link org.broadinstitute.hellbender.tools.spark.sv.discovery.DiscoverVariantsFromContigAlignmentsSAMSpark}
+ * as an integrated workflow. Please consult the
  * descriptions of those tools for more details about the algorithms employed. In brief, input reads are examined
  * for evidence of structural variation in a genomic region, regions so identified are locally assembled, and
  * the local assemblies are called for structural variation.
@@ -108,16 +113,20 @@ public class StructuralVariationDiscoveryPipelineSpark extends GATKSparkTool {
     @ArgumentCollection
     private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs
             = new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection();
+
     @Argument(doc = "sam file for aligned contigs", fullName = "contig-sam-file")
     private String outputAssemblyAlignments;
-    @Argument(doc = "filename for output vcf", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+
+    @Argument(doc = "directory for VCF output, including VCFs from the experimental interpretation tool if requested; " +
+            "will be created if not present; the sample name will be appended to the provided path",
+            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
             fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
-    private String vcfOutputFileName;
+    private String variantsOutDir;
+
     @Advanced
-    @Argument(doc = "directory to output results of our prototyping breakpoint and type inference tool in addition to the master VCF;" +
-            " the directory contains multiple VCF's for different types and record-generating SAM files of assembly contigs,",
-            fullName = "exp-variants-out-dir", optional = true)
-    private String expVariantsOutDir;
+    @Argument(doc = "flag to signal that the user wants to run the experimental interpretation tool as well",
+            fullName = "exp-interpret", optional = true)
+    private Boolean expInterpret = false;
 
     @Override
     public boolean requiresReads()
@@ -162,13 +171,7 @@ protected void runTool( final JavaSparkContext ctx ) {
         // todo: when we call imprecise variants don't return here
         if(parsedAlignments.isEmpty()) return;
 
-        final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast = broadcastCNVCalls(ctx, headerForReads, discoverStageArgs.cnvCallsFile);
-        final SvDiscoveryInputData svDiscoveryInputData =
-                new SvDiscoveryInputData(ctx, discoverStageArgs, vcfOutputFileName,
-                        assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
-                        makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
-                        cnvCallsBroadcast,
-                        getReads(), getHeaderForReads(), getReference(), localLogger);
+        final SvDiscoveryInputData svDiscoveryInputData = getSvDiscoveryInputData(ctx, headerForReads, assembledEvidenceResults);
 
         // TODO: 1/14/18 this is to be phased-out: old way of calling precise variants
         // assembled breakpoints
@@ -182,15 +185,36 @@
         final String outputPath = svDiscoveryInputData.outputPath;
         final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputData.referenceSequenceDictionaryBroadcast.getValue();
         final Logger toolLogger = svDiscoveryInputData.toolLogger;
-        SVVCFWriter.writeVCF(annotatedVariants, outputPath, refSeqDictionary, toolLogger);
+        SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, toolLogger);
 
-        // TODO: 1/14/18 this is the next version of precise variant calling
-        if ( expVariantsOutDir != null ) {
-            svDiscoveryInputData.updateOutputPath(expVariantsOutDir);
+        if ( expInterpret ) {
             experimentalInterpretation(ctx, assembledEvidenceResults, svDiscoveryInputData, evidenceAndAssemblyArgs.crossContigsToIgnoreFile);
         }
     }
 
+    private SvDiscoveryInputData getSvDiscoveryInputData(final JavaSparkContext ctx,
+                                                         final SAMFileHeader headerForReads,
+                                                         final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults) {
+        final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
+                broadcastCNVCalls(ctx, headerForReads, discoverStageArgs.cnvCallsFile);
+        try {
+            if ( !java.nio.file.Files.exists(Paths.get(variantsOutDir)) ) {
+                IOUtils.createDirectory(variantsOutDir);
+            }
+        } catch (final IOException ioex) {
+            throw new GATKException("Failed to create output directory " + variantsOutDir, ioex);
+        }
+
+        final String outputPrefixWithSampleName = variantsOutDir + (variantsOutDir.endsWith("/") ? "" : "/")
+                + SVUtils.getSampleId(headerForReads) + "_";
+
+        return new SvDiscoveryInputData(ctx, discoverStageArgs, outputPrefixWithSampleName,
+                assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
+                makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
+                cnvCallsBroadcast,
+                getReads(), getHeaderForReads(), getReference(), localLogger);
+    }
+
     /**
      * Uses the input EvidenceTargetLinks to
      *
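To make the new naming convention concrete: getSvDiscoveryInputData appends the sample name from the BAM header plus an underscore to the `-O` directory, and each downstream writer then appends its own file name to that prefix. A minimal shell sketch of the same logic, with made-up values for the directory and sample name:

```bash
# Placeholder values; the real tool reads the sample name from the BAM header via SVUtils.getSampleId.
OUT_DIR=/data/output/variants      # value passed via -O (hypothetical)
SAMPLE=NA12878                     # hypothetical sample name
mkdir -p "${OUT_DIR}"              # the tool creates the directory if it does not exist
PREFIX="${OUT_DIR%/}/${SAMPLE}_"   # same effect as the endsWith("/") check in the Java code
echo "${PREFIX}inv_del_ins.vcf"    # where the master VCF now lands
```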