Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

(SV) Misc. improvements to the new location inference and type interpretation tool #4562

Merged
merged 1 commit into from
Mar 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions scripts/sv/runWholePipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ case ${GATK_SV_TOOL} in
"StructuralVariationDiscoveryPipelineSpark")
TOOL_OPTIONS="\
-I ${INPUT_BAM} \
-O ${PROJECT_OUTPUT_DIR}/variants/inv_del_ins.vcf \
-O ${PROJECT_OUTPUT_DIR}/variants/ \
-R ${REF_TWOBIT} \
--aligner-index-image ${REF_INDEX_IMAGE} \
--exclusion-intervals ${INTERVAL_KILL_LIST} \
Expand All @@ -67,9 +67,9 @@ case ${GATK_SV_TOOL} in
--breakpoint-intervals ${PROJECT_OUTPUT_DIR}/intervals \
--high-coverage-intervals "${PROJECT_OUTPUT_DIR}/highCoverageIntervals.bed" \
--fastq-dir ${PROJECT_OUTPUT_DIR}/fastq \
--contig-sam-file ${PROJECT_OUTPUT_DIR}/assemblies.sam \
--contig-sam-file ${PROJECT_OUTPUT_DIR}/assemblies.bam \
--target-link-file ${PROJECT_OUTPUT_DIR}/target_links.bedpe \
--exp-variants-out-dir ${PROJECT_OUTPUT_DIR}/experimentalVariantInterpretations"
--exp-interpret"
;;
"ExtractSVEvidenceSpark")
TOOL_OPTIONS="\
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.AnnotatedVariantProducer;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoverFromLocalAssemblyContigAlignmentsSpark;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputData;
Expand All @@ -33,10 +34,13 @@
import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment;
import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignmentUtils;
import org.broadinstitute.hellbender.utils.fermi.FermiLiteAssembly;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
import scala.Serializable;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.EnumMap;
import java.util.List;
import java.util.stream.Collectors;
Expand All @@ -49,7 +53,8 @@
* Runs the structural variation discovery workflow on a single sample
*
* <p>This tool packages the algorithms described in {@link FindBreakpointEvidenceSpark} and
* {@link DiscoverVariantsFromContigAlignmentsSAMSpark} as an integrated workflow. Please consult the
* {@link org.broadinstitute.hellbender.tools.spark.sv.discovery.DiscoverVariantsFromContigAlignmentsSAMSpark}
* as an integrated workflow. Please consult the
* descriptions of those tools for more details about the algorithms employed. In brief, input reads are examined
* for evidence of structural variation in a genomic region, regions so identified are locally assembled, and
* the local assemblies are called for structural variation.</p>
Expand Down Expand Up @@ -108,16 +113,20 @@ public class StructuralVariationDiscoveryPipelineSpark extends GATKSparkTool {
@ArgumentCollection
private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs
= new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection();

@Argument(doc = "sam file for aligned contigs", fullName = "contig-sam-file")
private String outputAssemblyAlignments;
@Argument(doc = "filename for output vcf", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,

@Argument(doc = "directory for VCF output, including those from experimental interpretation tool if so requested, " +
"will be created if not present; sample name will be appended after the provided argument",
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
private String vcfOutputFileName;
private String variantsOutDir;

@Advanced
@Argument(doc = "directory to output results of our prototyping breakpoint and type inference tool in addition to the master VCF;" +
" the directory contains multiple VCF's for different types and record-generating SAM files of assembly contigs,",
fullName = "exp-variants-out-dir", optional = true)
private String expVariantsOutDir;
@Argument(doc = "flag to signal that user wants to run experimental interpretation tool as well",
fullName = "exp-interpret", optional = true)
private Boolean expInterpret = false;

@Override
public boolean requiresReads()
Expand Down Expand Up @@ -162,13 +171,7 @@ protected void runTool( final JavaSparkContext ctx ) {
// todo: when we call imprecise variants don't return here
if(parsedAlignments.isEmpty()) return;

final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast = broadcastCNVCalls(ctx, headerForReads, discoverStageArgs.cnvCallsFile);
final SvDiscoveryInputData svDiscoveryInputData =
new SvDiscoveryInputData(ctx, discoverStageArgs, vcfOutputFileName,
assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
cnvCallsBroadcast,
getReads(), getHeaderForReads(), getReference(), localLogger);
final SvDiscoveryInputData svDiscoveryInputData = getSvDiscoveryInputData(ctx, headerForReads, assembledEvidenceResults);

// TODO: 1/14/18 this is to be phased-out: old way of calling precise variants
// assembled breakpoints
Expand All @@ -182,15 +185,36 @@ protected void runTool( final JavaSparkContext ctx ) {
final String outputPath = svDiscoveryInputData.outputPath;
final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputData.referenceSequenceDictionaryBroadcast.getValue();
final Logger toolLogger = svDiscoveryInputData.toolLogger;
SVVCFWriter.writeVCF(annotatedVariants, outputPath, refSeqDictionary, toolLogger);
SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, toolLogger);

// TODO: 1/14/18 this is the next version of precise variant calling
if ( expVariantsOutDir != null ) {
svDiscoveryInputData.updateOutputPath(expVariantsOutDir);
if ( expInterpret != null ) {
experimentalInterpretation(ctx, assembledEvidenceResults, svDiscoveryInputData, evidenceAndAssemblyArgs.crossContigsToIgnoreFile);
}
}

private SvDiscoveryInputData getSvDiscoveryInputData(final JavaSparkContext ctx,
                                                     final SAMFileHeader headerForReads,
                                                     final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults) {
    // Broadcast the external CNV calls (if a file was provided) so executors can use them during annotation.
    final Broadcast<SVIntervalTree<VariantContext>> broadcastOfCnvCalls =
            broadcastCNVCalls(ctx, headerForReads, discoverStageArgs.cnvCallsFile);

    // Ensure the user-requested output directory exists before any VCF writer attempts to use it.
    try {
        final boolean dirAlreadyPresent = java.nio.file.Files.exists(Paths.get(variantsOutDir));
        if ( !dirAlreadyPresent ) {
            IOUtils.createDirectory(variantsOutDir);
        }
    } catch (final IOException ioex) {
        throw new GATKException("Failed to create output directory " + variantsOutDir + " though it does not yet exist", ioex);
    }

    // Every output file is prefixed with "<variantsOutDir>/<sampleId>_".
    final String dirWithTrailingSlash = variantsOutDir.endsWith("/") ? variantsOutDir : variantsOutDir + "/";
    final String outputPrefixWithSampleName = dirWithTrailingSlash + SVUtils.getSampleId(headerForReads) + "_";

    return new SvDiscoveryInputData(ctx, discoverStageArgs, outputPrefixWithSampleName,
            assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
            makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
            broadcastOfCnvCalls,
            getReads(), getHeaderForReads(), getReference(), localLogger);
}

/**
* Uses the input EvidenceTargetLinks to
* <ul>
Expand Down Expand Up @@ -240,7 +264,7 @@ private void experimentalInterpretation(final JavaSparkContext ctx,
final SvDiscoveryInputData svDiscoveryInputData,
final String nonCanonicalChromosomeNamesFile) {

if ( expVariantsOutDir == null )
if ( ! expInterpret )
return;

final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast = svDiscoveryInputData.referenceSequenceDictionaryBroadcast;
Expand All @@ -263,7 +287,8 @@ private void experimentalInterpretation(final JavaSparkContext ctx,
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast = svDiscoveryInputData.cnvCallsBroadcast;

final SvDiscoveryInputData updatedSvDiscoveryInputData =
new SvDiscoveryInputData(sampleId, svDiscoveryInputData.discoverStageArgs, expVariantsOutDir,
new SvDiscoveryInputData(sampleId, svDiscoveryInputData.discoverStageArgs,
svDiscoveryInputData.outputPath + "experimentalInterpretation_",
svDiscoveryInputData.metadata, svDiscoveryInputData.assembledIntervals,
svDiscoveryInputData.evidenceTargetLinks, reads, svDiscoveryInputData.toolLogger,
referenceBroadcast, referenceSequenceDictionaryBroadcast, headerBroadcast, cnvCallsBroadcast);
Expand Down Expand Up @@ -368,7 +393,7 @@ public static List<AlignedContig> filterAndConvertToAlignedContigDirect(final It
.filter(AlignedAssemblyOrExcuse::isNotFailure)
.map(alignedAssembly -> getAlignedContigsInOneAssembly(alignedAssembly, refNames, header))
.flatMap(Utils::stream) // size == total # of contigs' from all successful assemblies
.filter(contig -> !contig.alignmentIntervals.isEmpty()) // filter out unmapped and contigs without primary alignments
.filter(contig -> !contig.getAlignments().isEmpty()) // filter out unmapped and contigs without primary alignments
.collect(Collectors.toList());
}

Expand All @@ -388,9 +413,9 @@ public static Iterable<AlignedContig> getAlignedContigsInOneAssembly(final Align
.mapToObj( contigIdx -> {
final byte[] contigSequence = assembly.getContig(contigIdx).getSequence();
final String contigName = AlignedAssemblyOrExcuse.formatContigName(alignedAssembly.getAssemblyId(), contigIdx);
final List<AlignmentInterval> arOfAContig
final List<AlignmentInterval> alignmentsForOneContig
= getAlignmentsForOneContig(contigName, contigSequence, allAlignments.get(contigIdx), refNames, header);
return new AlignedContig(contigName, contigSequence, arOfAContig, false);
return new AlignedContig(contigName, contigSequence, alignmentsForOneContig);
} ).collect(Collectors.toList());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.Utils;
import scala.Tuple2;
Expand Down Expand Up @@ -106,9 +107,10 @@ public final class DiscoverVariantsFromContigAlignmentsSAMSpark extends GATKSpar
private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs =
new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection();

@Argument(doc = "\"output path for discovery (non-genotyped) VCF", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
@Argument(doc = "prefix for discovery (non-genotyped) VCF; sample name will be appended after the provided argument, followed by \"_inv_del_ins.vcf\"",
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
private String vcfOutputFileName;
private String prefixForOutput;

@Override
public boolean requiresReference() {
Expand All @@ -133,20 +135,23 @@ protected void runTool(final JavaSparkContext ctx) {
discoverStageArgs.cnvCallsFile);

final SvDiscoveryInputData svDiscoveryInputData =
new SvDiscoveryInputData(ctx, discoverStageArgs, vcfOutputFileName,
new SvDiscoveryInputData(ctx,
discoverStageArgs, prefixForOutput + "_" + SVUtils.getSampleId(getHeaderForReads()) + "_inv_del_ins.vcf",
null, null, null,
cnvCallsBroadcast,
getReads(), getHeaderForReads(), getReference(), localLogger);

final JavaRDD<AlignedContig> parsedContigAlignments =
new SvDiscoverFromLocalAssemblyContigAlignmentsSpark.SAMFormattedContigAlignmentParser(svDiscoveryInputData.assemblyRawAlignments, svDiscoveryInputData.headerBroadcast.getValue(), true)
new SvDiscoverFromLocalAssemblyContigAlignmentsSpark
.SAMFormattedContigAlignmentParser(svDiscoveryInputData.assemblyRawAlignments,
svDiscoveryInputData.headerBroadcast.getValue(), true)
.getAlignedContigs();

// assembly-based breakpoints
List<VariantContext> annotatedVariants = discoverVariantsFromChimeras(svDiscoveryInputData, parsedContigAlignments);

final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputData.referenceSequenceDictionaryBroadcast.getValue();
SVVCFWriter.writeVCF(annotatedVariants, vcfOutputFileName, refSeqDictionary, localLogger);
SVVCFWriter.writeVCF(annotatedVariants, svDiscoveryInputData.outputPath, refSeqDictionary, localLogger);
}

@Deprecated
Expand All @@ -155,9 +160,9 @@ public static List<VariantContext> discoverVariantsFromChimeras(final SvDiscover
final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast = svDiscoveryInputData.referenceSequenceDictionaryBroadcast;

final JavaPairRDD<byte[], List<ChimericAlignment>> contigSeqAndChimeras =
alignedContigs.filter(alignedContig -> alignedContig.alignmentIntervals.size() > 1)
alignedContigs.filter(alignedContig -> alignedContig.getAlignments().size() > 1)
.mapToPair(alignedContig ->
new Tuple2<>(alignedContig.contigSequence,
new Tuple2<>(alignedContig.getContigSequence(),
ChimericAlignment.parseOneContig(alignedContig, referenceSequenceDictionaryBroadcast.getValue(),
true, DEFAULT_MIN_ALIGNMENT_LENGTH,
CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD, true)));
Expand Down
Loading