Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adapt CollectRrbsMetrics to SinglePassSamProgram #940

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion src/main/java/picard/analysis/CollectMultipleMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ public class CollectMultipleMetrics extends CommandLineProgram {
static final String USAGE_DETAILS ="This 'meta-metrics' tool runs one or more of the metrics collection modules at the same" +
" time to cut down on the time spent reading in data from input files. Available modules include " +
"CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, QualityScoreDistribution, MeanQualityByCycle, " +
"CollectBaseDistributionByCycle, CollectGcBiasMetrics, RnaSeqMetrics, CollectSequencingArtifactMetrics, and CollectQualityYieldMetrics. " +
"CollectBaseDistributionByCycle, CollectGcBiasMetrics, RnaSeqMetrics, CollectSequencingArtifactMetrics, " +
"CollectQualityYieldMetrics and CollectRrbsMetrics. " +
"The tool produces outputs of '.pdf' and '.txt' files for each module, except for the " +
"CollectAlignmentSummaryMetrics module, which outputs only a '.txt' file. Output files are named by specifying a base name " +
"(without any file extensions).<br /><br />" +
Expand Down Expand Up @@ -103,6 +104,29 @@ SinglePassSamProgram makeInstance(final String outbase, final String outext, fin
}

public static enum Program implements ProgramInterface {
CollectRrbsMetrics {
@Override
public SinglePassSamProgram makeInstance(final String outbase, final String outext, final File input, final File reference, final Set<MetricAccumulationLevel> metricAccumulationLevel, final File dbSnp, final File intervals) {
final CollectRrbsMetrics program = new CollectRrbsMetrics();
program.OUTPUT = new File(outbase + ".rrbc.detail_metrics" + outext);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rrbc->rrbs (x3)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

program.SUMMARY_OUTPUT = new File(outbase + ".rrbc.summary_metrics" + outext);
program.CHART_OUTPUT = new File(outbase + ".rrbc.pdf");
program.INPUT = input;
program.METRIC_ACCUMULATION_LEVEL = metricAccumulationLevel;
program.ASSUME_SORTED = false;
program.setReferenceSequence(reference);

return program;
}
@Override
public boolean needsReferenceSequence() {
return true;
}
@Override
public boolean supportsMetricAccumulationLevel() {
return true;
}
},
CollectAlignmentSummaryMetrics {
@Override
public boolean needsReferenceSequence() {
Expand Down
127 changes: 65 additions & 62 deletions src/main/java/picard/analysis/CollectRrbsMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,11 @@
import htsjdk.samtools.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.argumentcollections.ReferenceArgumentCollection;
import picard.cmdline.programgroups.Metrics;
Expand All @@ -62,7 +56,7 @@
programGroup = Metrics.class
)
@DocumentedFeature
public class CollectRrbsMetrics extends CommandLineProgram {
public class CollectRrbsMetrics extends SinglePassSamProgram {
static final String USAGE_SUMMARY = "<b>Collects metrics from reduced representation bisulfite sequencing (Rrbs) data.</b> ";
static final String USAGE_DETAILS = "<p>This tool uses reduced representation bisulfite sequencing (Rrbs) data to determine cytosine " +
"methylation status across all reads of a genomic DNA sequence. For a primer on bisulfite sequencing and cytosine methylation, " +
Expand All @@ -82,7 +76,18 @@ public class CollectRrbsMetrics extends CommandLineProgram {
"The detailed metrics table includes the coordinates of all of the CpG sites for the experiment as well as the conversion rates " +
"observed for each site.</p>" +

"<h4>Usage example:</h4>" +
"<p>It is possible to launch CollectRrbsMetrics tool by setting all output files or by indicating a base name for metrics files only. See examples:</p>" +

"<h4>Usage example 1:</h4>" +
"<pre>" +
"java -jar picard.jar CollectRrbsMetrics \\<br />" +
" I=input.bam \\<br />" +
" O=detail_output.bam \\<br />" +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

O=detail_output.txt, not .bam

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

" CHART=chart_output.pdf \\<br />" +
" S=summary_output.bam \\<br />" +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.txt not .bam

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

" R=reference_sequence.fasta" +
"</pre>" +
"<h4>Usage example 2:</h4>" +
"<pre>" +
"java -jar picard.jar CollectRrbsMetrics \\<br />" +
" R=reference_sequence.fasta \\<br />" +
Expand All @@ -95,13 +100,11 @@ public class CollectRrbsMetrics extends CommandLineProgram {
" for a complete description of both the detail and summary metrics produced by this tool.</p>" +
"<hr />";

// Path to R file for plotting purposes
// Path to R file for plotting purposes

private static final String R_SCRIPT = "picard/analysis/rrbsQc.R";
private static final String R_SCRIPT = "picard/analysis/rrbsQc.R";

@Argument(doc = "The BAM or SAM file containing aligned reads. Must be coordinate sorted", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
public File INPUT;
@Argument(doc = "Base name for output files", shortName = StandardOptionDefinitions.METRICS_FILE_SHORT_NAME)
@Argument(doc = "Base name for output files", shortName = StandardOptionDefinitions.METRICS_FILE_SHORT_NAME, mutex = {"OUTPUT", "CHART_OUTPUT", "SUMMARY_OUTPUT"})
public String METRICS_FILE_PREFIX;
@Argument(doc = "Minimum read length")
public int MINIMUM_READ_LENGTH = 5;
Expand All @@ -112,18 +115,19 @@ public class CollectRrbsMetrics extends CommandLineProgram {
@Argument(doc = "Maximum percentage of mismatches in a read for it to be considered, with a range of 0-1")
public double MAX_MISMATCH_RATE = 0.1;
@Argument(doc = "Set of sequence names to consider, if not specified all sequences will be used", optional = true)
public Set<String> SEQUENCE_NAMES = new HashSet<String>();
@Argument(shortName = StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME,
doc = "If true, assume that the input file is coordinate sorted even if the header says otherwise.")
public boolean ASSUME_SORTED = false;
public Set<String> SEQUENCE_NAMES = new HashSet<>();
@Argument(shortName = "LEVEL", doc = "The level(s) at which to accumulate metrics. ")
public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
@Argument(shortName = "CHART", doc = "The PDF file to render the chart to.", mutex = {"METRICS_FILE_PREFIX"})
public File CHART_OUTPUT;
@Argument(shortName = "S", doc = "The text file to write summary metrics to.", mutex = {"METRICS_FILE_PREFIX"})
public File SUMMARY_OUTPUT;

public static final String DETAIL_FILE_EXTENSION = "rrbs_detail_metrics";
public static final String SUMMARY_FILE_EXTENSION = "rrbs_summary_metrics";
public static final String PDF_FILE_EXTENSION = "rrbs_qc.pdf";

private static final Log log = Log.getInstance(CollectRrbsMetrics.class);
private RrbsMetricsCollector metricsCollector;

// return a custom argument collection since this tool uses a (required) argument name
// of "REFERENCE", not "REFERENCE_SEQUENCE"
Expand All @@ -147,33 +151,43 @@ public static void main(final String[] args) {
}

@Override
protected int doWork() {
if (!METRICS_FILE_PREFIX.endsWith(".")) {
METRICS_FILE_PREFIX = METRICS_FILE_PREFIX + ".";
}
final File SUMMARY_OUT = new File(METRICS_FILE_PREFIX + SUMMARY_FILE_EXTENSION);
final File DETAILS_OUT = new File(METRICS_FILE_PREFIX + DETAIL_FILE_EXTENSION);
final File PLOTS_OUT = new File(METRICS_FILE_PREFIX + PDF_FILE_EXTENSION);
assertIoFiles(SUMMARY_OUT, DETAILS_OUT, PLOTS_OUT);

final SamReader samReader = SamReaderFactory.makeDefault().open(INPUT);
if (!ASSUME_SORTED && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new PicardException("The input file " + INPUT.getAbsolutePath() + " does not appear to be coordinate sorted");
protected void setup(final SAMFileHeader header, final File samFile) {
if (METRICS_FILE_PREFIX != null) {
if (!METRICS_FILE_PREFIX.endsWith(".")) {
METRICS_FILE_PREFIX = METRICS_FILE_PREFIX + ".";
}
OUTPUT = new File(METRICS_FILE_PREFIX + DETAIL_FILE_EXTENSION);
SUMMARY_OUTPUT = new File(METRICS_FILE_PREFIX + SUMMARY_FILE_EXTENSION);
CHART_OUTPUT = new File(METRICS_FILE_PREFIX + PDF_FILE_EXTENSION);
}
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsWritable(SUMMARY_OUTPUT);
IOUtil.assertFileIsWritable(CHART_OUTPUT);
metricsCollector = new RrbsMetricsCollector(
METRIC_ACCUMULATION_LEVEL,
header.getReadGroups(),
C_QUALITY_THRESHOLD,
NEXT_BASE_QUALITY_THRESHOLD,
MINIMUM_READ_LENGTH,
MAX_MISMATCH_RATE
);
}

final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final ProgressLogger progressLogger = new ProgressLogger(log);
@Override
protected void acceptRead(final SAMRecord samRecord, final ReferenceSequence ref) {
if (!samRecord.getReadUnmappedFlag() && !isSequenceFiltered(samRecord.getReferenceName())) {
metricsCollector.acceptRecord(samRecord, ref);
}
}

final RrbsMetricsCollector metricsCollector = new RrbsMetricsCollector(METRIC_ACCUMULATION_LEVEL, samReader.getFileHeader().getReadGroups(),
C_QUALITY_THRESHOLD, NEXT_BASE_QUALITY_THRESHOLD, MINIMUM_READ_LENGTH, MAX_MISMATCH_RATE);
private boolean isSequenceFiltered(final String sequenceName) {
return SEQUENCE_NAMES != null
&& !SEQUENCE_NAMES.isEmpty()
&& !SEQUENCE_NAMES.contains(sequenceName);
}

for (final SAMRecord samRecord : samReader) {
progressLogger.record(samRecord);
if (!samRecord.getReadUnmappedFlag() && !isSequenceFiltered(samRecord.getReferenceName())) {
final ReferenceSequence referenceSequence = refWalker.get(samRecord.getReferenceIndex());
metricsCollector.acceptRecord(samRecord, referenceSequence);
}
}
@Override
protected void finish() {
metricsCollector.finish();
final MetricsFile<RrbsMetrics, Comparable<?>> rrbsMetrics = getMetricsFile();
metricsCollector.addAllLevelsToFile(rrbsMetrics);
Expand All @@ -182,34 +196,23 @@ protected int doWork() {
// we get it out split it apart to the two separate MetricsFiles and write them to file
final MetricsFile<RrbsSummaryMetrics, ?> summaryFile = getMetricsFile();
final MetricsFile<RrbsCpgDetailMetrics, ?> detailsFile = getMetricsFile();
for (final RrbsMetrics rrbsMetric : rrbsMetrics.getMetrics()) {
rrbsMetrics.getMetrics().forEach(rrbsMetric -> {
summaryFile.addMetric(rrbsMetric.getSummaryMetrics());
for (final RrbsCpgDetailMetrics detailMetric : rrbsMetric.getDetailMetrics()) {
detailsFile.addMetric(detailMetric);
}
}
summaryFile.write(SUMMARY_OUT);
detailsFile.write(DETAILS_OUT);
RExecutor.executeFromClasspath(R_SCRIPT, DETAILS_OUT.getAbsolutePath(), SUMMARY_OUT.getAbsolutePath(), PLOTS_OUT.getAbsolutePath());
CloserUtil.close(samReader);
return 0;
}
rrbsMetric.getDetailMetrics().forEach(detailsFile::addMetric);
});

private boolean isSequenceFiltered(final String sequenceName) {
return (SEQUENCE_NAMES != null) && (!SEQUENCE_NAMES.isEmpty()) && (!SEQUENCE_NAMES.contains(sequenceName));
}
summaryFile.write(SUMMARY_OUTPUT);
detailsFile.write(OUTPUT);

private void assertIoFiles(final File summaryFile, final File detailsFile, final File plotsFile) {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
IOUtil.assertFileIsWritable(summaryFile);
IOUtil.assertFileIsWritable(detailsFile);
IOUtil.assertFileIsWritable(plotsFile);
RExecutor.executeFromClasspath(R_SCRIPT,
OUTPUT.getAbsolutePath(),
SUMMARY_OUTPUT.getAbsolutePath(),
CHART_OUTPUT.getAbsolutePath());
}

@Override
protected String[] customCommandLineValidation() {
final List<String> errorMsgs = new ArrayList<String>();
final List<String> errorMsgs = new ArrayList<>();
if (MAX_MISMATCH_RATE < 0 || MAX_MISMATCH_RATE > 1) {
errorMsgs.add("MAX_MISMATCH_RATE must be in the range of 0-1");
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/picard/analysis/SinglePassSamProgram.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public abstract class SinglePassSamProgram extends CommandLineProgram {
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.")
public File INPUT;

@Argument(shortName = "O", doc = "File to write the output to.")
@Argument(shortName = "O", doc = "File to write the output to.", optional = true)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this optional?

Copy link
Contributor Author

@tcibinan tcibinan Sep 29, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review @yfarjoun.

CollectRrbsMetrics didn't have OUTPUT argument (only METRICS_FILE_PREFIX). But now it extends from SinglePassSamProgram and you can't run the metrics without mandatory argument OUTPUT.

So OUTPUT was made optional in order to be able to run CollectRrbsMetrics without it.

Also we can use that excessive OUTPUT argument with two more to specify output files explicitly. It can be useful and it is a way to use this unnecessary argument.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thinking about this more, this isn't "OK". this changes the behavior of all the other programs that extend from SinglePassSamProgram.

can you find a solution that doens't change the behavior/API for the other programs?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I owuld almost prefer to simply rename METRICS_FILE_PREFIX -> "OUTPUT" and to do away with the specific metric/chart/summary output variables....the only problem is that it would create an API change...

@cnorman, perhaps you can suggest solutions for this...given your knowledge of the parsing system.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest that you implement it like CollectGcBiasMetrics is implemented, i.e. OUTPUT is for the detailed_metrics, SUMMARY_OUTPUT is for summary_metrics and CHART_OUTPUT for the pdf. You can use METRICS_FILE_PREFIX as Mutex, or remove it and change the API for this program (but not for all of the Single-Pass-Sam programs)

public File OUTPUT;

@Argument(doc = "If true (default), then the sort order in the header file will be ignored.",
Expand Down
52 changes: 52 additions & 0 deletions src/test/java/picard/analysis/CollectMultipleMetricsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import picard.cmdline.CommandLineProgramTest;
import picard.metrics.MultilevelMetrics;
import picard.sam.SortSam;
import static picard.analysis.GcBiasMetricsCollector.PerUnitGcBiasMetricsCollector.*;

Expand All @@ -33,11 +34,62 @@
public class CollectMultipleMetricsTest extends CommandLineProgramTest {

private static final File TEST_DATA_DIR = new File("testdata/picard/sam");
private static final File TEST_DATA_DIR_RRBS = new File("testdata/picard/metrics");

public String getCommandLineProgramName() {
return CollectMultipleMetrics.class.getSimpleName();
}

@Test
public void testCollectRrbsMetrics() throws IOException {
final File input = new File(TEST_DATA_DIR_RRBS, "chrMReads.sam");
final File reference = new File(TEST_DATA_DIR_RRBS, "chrM.reference.fasta");
final File outfile = File.createTempFile("crmt.", ".rrbs_summary_metrics");
outfile.deleteOnExit();

final File summary = new File(outfile + ".rrbc.summary_metrics");
final File detail = new File(outfile + ".rrbc.detail_metrics");
final File pdf = new File(outfile + ".rrbc.pdf");
summary.deleteOnExit();
detail.deleteOnExit();
pdf.deleteOnExit();

final String[] args = new String[] {
"INPUT=" + input.getAbsolutePath(),
"OUTPUT=" + outfile.getAbsolutePath(),
"REFERENCE_SEQUENCE=" + reference.getAbsolutePath(),
"PROGRAM=" + null,
"PROGRAM=" + CollectMultipleMetrics.Program.CollectRrbsMetrics.name()};

Assert.assertEquals(runPicardCommandLine(args), 0);

final RrbsSummaryMetrics metrics = getMultilevelMetrics(summary);
Assert.assertEquals(metrics.READS_ALIGNED.intValue(), 5);
Assert.assertEquals(metrics.NON_CPG_BASES.intValue(), 15);
Assert.assertEquals(metrics.NON_CPG_CONVERTED_BASES.intValue(), 11);
Assert.assertEquals(metrics.PCT_NON_CPG_BASES_CONVERTED, 0.733333);
Assert.assertEquals(metrics.CPG_BASES_SEEN.intValue(), 5);
Assert.assertEquals(metrics.CPG_BASES_CONVERTED.intValue(), 1);
Assert.assertEquals(metrics.PCT_CPG_BASES_CONVERTED, 0.2);
Assert.assertEquals(metrics.MEAN_CPG_COVERAGE, 1.666667);
Assert.assertEquals(metrics.MEDIAN_CPG_COVERAGE.intValue(), 2);
Assert.assertEquals(metrics.READS_WITH_NO_CPG.intValue(), 1);
Assert.assertEquals(metrics.READS_IGNORED_SHORT.intValue(), 1);
Assert.assertEquals(metrics.READS_IGNORED_MISMATCHES.intValue(), 1);

final RrbsCpgDetailMetrics metricsCpg = getMultilevelMetrics(detail);
Assert.assertEquals(metricsCpg.SEQUENCE_NAME, "chrM");
Assert.assertEquals(metricsCpg.POSITION.intValue(), 60);
Assert.assertEquals(metricsCpg.TOTAL_SITES.intValue(), 1);
Assert.assertEquals(metricsCpg.CONVERTED_SITES.intValue(), 1);
Assert.assertEquals(metricsCpg.PCT_CONVERTED.intValue(), 1);
}

private <T extends MultilevelMetrics> T getMultilevelMetrics(final File file) throws FileNotFoundException {
final MetricsFile<T, ?> retVal = new MetricsFile<T, Integer>();
retVal.read(new FileReader(file));
return retVal.getMetrics().get(0);
}

@Test
public void testAlignmentSummaryViaMultipleMetrics() throws IOException {
Expand Down
Loading