Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tool to detect CRAM base corruption caused by GATK issue 8768 #8819

Merged
merged 6 commits into from
Jun 29, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package org.broadinstitute.hellbender.tools;

import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.hellbender.tools.filediagnostics.CRAMIssue8768Analyzer;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import picard.cmdline.programgroups.OtherProgramGroup;

/**
* A diagnostic tool that analyzes a CRAM input file to look for conditions that trigger issue 8768.
*
* Writes a report to a file indicating whether the file appears to have read base corruption caused by 8768.
*
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a 1-2 paragraph description of the bug here, including under what circumstances it gets triggered, how it manifests, and what versions of GATK/Picard it affects. We should also post this information in a comment to issue 8768.

* By default, the output report will have a summary of the average mismatch rate for all suspected bad containers
* and a few presumed good containers in order to determine if there is a large difference in the base mismatch rate.
* To analyze the base mismatch rate for ALL containers, use the "verbose" option.
*
* Works on files ending in .cram.
*
* Sample Usage:
*
* gatk CRAMIssue8768Detector -I input.cram -O output.txt -R reference.fasta
*/
@ExperimentalFeature
@WorkflowProperties
@CommandLineProgramProperties(
summary = "Analyze a CRAM file for issue 8768",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Analyze a CRAM file to check for base corruption caused by issue 8768"

oneLineSummary = "Analyze a CRAM file for issue 8768",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Analyze a CRAM file to check for base corruption caused by issue 8768"

programGroup = OtherProgramGroup.class
)
public class CRAMIssue8768Detector extends CommandLineProgram {
// default average mismatch rate threshold above which we consider the file to be corrupt
private static final double DEFAULT_MISMATCH_RATE_THRESHOLD = 0.05;

@Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
doc = "Input path of file to analyze",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

file -> CRAM file

common = true)
@WorkflowInput
public GATKPath inputPath;

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc = "Output diagnostics text file",
common = true)
@WorkflowOutput
public GATKPath textOutputPath;

public static final String OUTPUT_TSV__ARG_NAME = "output-tsv";
@Argument(fullName = OUTPUT_TSV__ARG_NAME,
shortName = OUTPUT_TSV__ARG_NAME,
doc = "Output diagnostics tsv file",
optional = true)
@WorkflowOutput
public GATKPath tsvOutputPath;

@Argument(fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME,
shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME,
doc = "Reference for the CRAM file",
common = true)
@WorkflowOutput
public GATKPath referencePath;

public static final String MISMATCH_RATE_THRESHOLD_ARG_NAME = "mismatch-rate-threshold";
@Argument(fullName = MISMATCH_RATE_THRESHOLD_ARG_NAME,
shortName = MISMATCH_RATE_THRESHOLD_ARG_NAME,
doc = "Mismatch rate threshold above which we consider the file to be corrupt",
optional = true)
public double mismatchRateThreshold = DEFAULT_MISMATCH_RATE_THRESHOLD;

public static final String VERBOSE_ARG_NAME = "verbose";
@Argument(fullName = VERBOSE_ARG_NAME,
shortName= VERBOSE_ARG_NAME,
doc="Calculate and print the mismatch rate for all containers",
optional=true)
public boolean verbose = false;

public static final String ECHO_ARG_NAME = "echo-to-stdout";
@Argument(fullName = ECHO_ARG_NAME,
shortName= ECHO_ARG_NAME,
doc="Echo text output to stdout",
optional=true)
public boolean echoToStdout = false;

private CRAMIssue8768Analyzer cramAnalyzer;

@Override
protected Object doWork() {
cramAnalyzer = new CRAMIssue8768Analyzer(
inputPath,
textOutputPath,
tsvOutputPath,
referencePath,
mismatchRateThreshold,
verbose,
echoToStdout);
cramAnalyzer.doAnalysis();
return cramAnalyzer.getRetCode();
}

@Override
protected void onShutdown() {
if ( cramAnalyzer != null ) {
try {
cramAnalyzer.close();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import org.broadinstitute.hellbender.tools.filediagnostics.HTSAnalyzerFactory;
import picard.cmdline.programgroups.OtherProgramGroup;

import java.io.File;

/**
* A diagnostic tool that prints meta information about a GATK input file.
*
Expand Down Expand Up @@ -43,8 +41,8 @@ public class PrintFileDiagnostics extends CommandLineProgram {
doc = "Outut file for diagnostics (must be a local file)",
optional = false,
common = true)
@WorkflowInput
public File outputFile;
@WorkflowOutput
public GATKPath outputPath;

@Argument(shortName="count-limit",
fullName="count-limit",
Expand All @@ -56,7 +54,7 @@ public class PrintFileDiagnostics extends CommandLineProgram {
@Override
protected void onStartup() {
super.onStartup();
htsAnalyzer = HTSAnalyzerFactory.getFileAnalyzer(inputPath, outputFile, countLimit);
htsAnalyzer = HTSAnalyzerFactory.getFileAnalyzer(inputPath, outputPath, countLimit);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,17 @@
*/
public class BAIAnalyzer extends HTSAnalyzer {

public BAIAnalyzer(final GATKPath inputPath, final File outputFile) {
super(inputPath, outputFile);
public BAIAnalyzer(final GATKPath inputPath, final GATKPath outputPath) {
super(inputPath, outputPath);
}

/**
* Run the analyzer for the file.
*/
protected void doAnalysis() {
System.out.println(String.format("\nOutput written to %s\n", outputFile));
BAMIndexer.createAndWriteIndex(inputPath.toPath().toFile(), outputFile, true);
System.out.println(String.format("\nOutput written to %s\n", outputPath));
// note this method is not nio aware
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably note this limitation in the class-level docs as well

BAMIndexer.createAndWriteIndex(inputPath.toPath().toFile(), new File(outputPath.getRawInputString()), true);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,22 @@
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.hellbender.engine.GATKPath;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.*;
import java.nio.file.Files;

/**
* Analyzer for CRAM (.crai) index files.
*/
public class CRAIAnalyzer extends HTSAnalyzer {

final FileOutputStream fos;
final OutputStream fos;

public CRAIAnalyzer(final GATKPath inputPath, final File outputFile) {
super(inputPath, outputFile);
public CRAIAnalyzer(final GATKPath inputPath, final GATKPath outputPath) {
super(inputPath, outputPath);
try {
fos = new FileOutputStream(outputFile);
fos = Files.newOutputStream(outputPath.toPath());
} catch (final IOException e) {

throw new RuntimeIOException(e);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,11 @@
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.hellbender.engine.GATKPath;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.util.Base64;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

Expand All @@ -36,13 +33,13 @@ public class CRAMAnalyzer extends HTSAnalyzer {
long coreBlocksDataSize = 0L;
long recordCount = 0;
final long countLimit;
final FileOutputStream fos;
final OutputStream fos;

public CRAMAnalyzer(final GATKPath inputPathName, final File outputFile, final long countLimit) {
super(inputPathName, outputFile);
public CRAMAnalyzer(final GATKPath inputPathName, final GATKPath outputPath, final long countLimit) {
super(inputPathName, outputPath);
this.countLimit = countLimit;
try {
fos = new FileOutputStream(outputFile);
fos = Files.newOutputStream(outputPath.toPath());
} catch (final IOException e) {
throw new RuntimeIOException(e);
}
Expand Down
Loading
Loading