Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleaning up old interval code #6465

Merged
merged 3 commits into from
Mar 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public Object doWork() {

private static GenomeLocSortedSet getGenomeLocs(final GenomeLocParser genomeLocParser, final String intervalFile) {

final List<GenomeLoc> genomeLocs = IntervalUtils.intervalFileToList(genomeLocParser, intervalFile);
final List<GenomeLoc> genomeLocs = IntervalUtils.parseIntervalArguments(genomeLocParser, intervalFile);
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, genomeLocs, IntervalMergingRule.ALL);
}
}
110 changes: 43 additions & 67 deletions src/main/java/org/broadinstitute/hellbender/utils/IntervalUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,19 @@
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;

Expand All @@ -42,8 +54,8 @@ public final class IntervalUtils {
/**
* Recognized extensions for interval files
*/
public static final List<String> INTERVAL_FILE_EXTENSIONS = Collections.unmodifiableList(Arrays.asList(
".list", ".interval_list", ".intervals", ".picard"
public static final List<String> GATK_INTERVAL_FILE_EXTENSIONS = Collections.unmodifiableList(Arrays.asList(
".list", ".intervals"
));

/**
Expand Down Expand Up @@ -303,27 +315,23 @@ public static List<GenomeLoc> parseIntervalArguments(final GenomeLocParser parse
}
// If it's a Feature-containing file, convert it to a list of intervals
else if ( FeatureManager.isFeatureFile(IOUtils.getPath(arg)) ) {
rawIntervals.addAll(featureFileToIntervals(parser, arg));
}
// If it's an interval file, add its contents to the raw interval list
else if ( isIntervalFile(arg) ) {
try {
rawIntervals.addAll(intervalFileToList(parser, arg));
}
catch ( final UserException.MalformedGenomeLoc e ) {
throw e;
}
catch ( final Exception e ) {
throw new UserException.MalformedFile(new File(arg), "Interval file could not be parsed in any supported format.", e);
rawIntervals.addAll(featureFileToIntervals(parser, arg));
} catch (final IllegalArgumentException e){
throw new UserException.MalformedFile(IOUtils.getPath(arg), "Failure while loading intervals from file.", e);
}
}
// If it's an interval file, add its contents to the raw interval list
else if ( isGatkIntervalFile(arg) ) {
rawIntervals.addAll(gatkIntervalFileToList(parser, arg));
}
// If it's neither a Feature-containing file nor an interval file, but is an existing file, throw an error.
// Note that since contigs can contain periods in their names, we can't use the mere presence of an "extension"
// as evidence that the user intended the String to be interpreted as a file.
else if ( new File(arg).exists() ) {
throw new UserException.CouldNotReadInputFile(arg, String.format("The file %s exists, but does not contain Features " +
"(ie., is not in a supported Feature file format such as vcf, bcf, or bed), " +
"and does not have one of the supported interval file extensions (" + INTERVAL_FILE_EXTENSIONS + "). " +
"(ie., is not in a supported Feature file format such as vcf, bcf, bed, or interval_list), " +
"and does not have one of the supported interval file extensions (" + GATK_INTERVAL_FILE_EXTENSIONS + "). " +
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kind of a nit, but this error message will no longer include ".interval_list", which is an acceptable extension.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically interval_list is a feature file now

"Please rename your file with the appropriate extension. If %s is NOT supposed to be a file, " +
"please move or rename the file at location %s", arg, arg, new File(arg).getAbsolutePath()));
}
Expand Down Expand Up @@ -356,67 +364,35 @@ public static List<GenomeLoc> featureFileToIntervals( final GenomeLocParser pars
}

/**
* Read a file of genome locations to process. The file may be in Picard
* or GATK interval format.
* Read a file of genome locations to process, this file must be in GATK interval format.
*
* @param glParser GenomeLocParser
* @param fileName interval file
* @return List<GenomeLoc> List of Genome Locs that have been parsed from file
*/
public static List<GenomeLoc> intervalFileToList(final GenomeLocParser glParser, final String fileName) {
private static List<GenomeLoc> gatkIntervalFileToList(final GenomeLocParser glParser, final String fileName) {
Utils.nonNull(glParser, "glParser is null");
Utils.nonNull(fileName, "file name is null");

final Path inputPath = IOUtils.getPath(fileName);
final List<GenomeLoc> ret = new ArrayList<>();

/**
* First try to read the file as a Picard interval file since that's well structured --
* we'll fail quickly if it's not a valid file.
*/
boolean isPicardInterval = false;
try {
// Note: Picard will skip over intervals with contigs not in the sequence dictionary
final IntervalList il = IntervalList.fromPath(inputPath);
isPicardInterval = true;

for (final Interval interval : il.getIntervals()) {
// The current Agilent exome interval list is off-by-one on all end positions. Until this is fixed we
// need to tolerate intervals where the end is one before the start. We should remove this once a
// corrected version of the interval list is released. This is tracked in:
// https://github.com/broadinstitute/gatk/issues/2089
if (interval.getStart() - interval.getEnd() == 1 ) {
logger.warn("Ignoring possibly incorrectly converted length 1 interval : " + interval);
}
else if ( glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)) {
ret.add(glParser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true));
} else {
throw new UserException(inputPath.toUri() + " has an invalid interval : " + interval) ;
try (final PathLineIterator reader = new PathLineIterator(inputPath)) {
for (final String line : reader) {
final String trimmedLine = line.trim();
if (!trimmedLine.isEmpty()) {
ret.add(glParser.parseGenomeLoc(trimmedLine));
}
}
}
// if that didn't work, try parsing file as a GATK interval file
catch (final Exception e) {
if ( isPicardInterval ) // definitely a picard file, but we failed to parse
{
throw new UserException.CouldNotReadInputFile(inputPath, e);
} else {
try (PathLineIterator reader = new PathLineIterator(inputPath)) {
for (final String line : reader) {
final String trimmedLine = line.trim();
if (!trimmedLine.isEmpty()) {
ret.add(glParser.parseGenomeLoc(trimmedLine));
}
}
}
if (ret.isEmpty()) {
throw new UserException.MalformedFile(inputPath, "It contains no intervals.");
}
return ret;
} catch (final UserException e){
throw e;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This block seems unnecessary ? It might be better to just rely on the next catch block, which will always include the inputPath name.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I thought it might replace a more specific error with a less specific one. Since userexceptions don't show the stack trace you'd lose the underlying one in most cases.

} catch ( final Exception e ) {
throw new UserException.MalformedFile(inputPath, "Interval file could not be parsed as a GATK interval file", e);
}

if ( ret.isEmpty() ) {
throw new UserException.MalformedFile(inputPath, "It contains no intervals.");
}

return ret;
}

/**
Expand Down Expand Up @@ -547,21 +523,21 @@ private static <T extends Comparable<T>> List<T> reverse(final List<T> l) {
* @param str token to identify as a filename.
* @return true if the token looks like a filename, or false otherwise.
*/
public static boolean isIntervalFile(final String str) {
return isIntervalFile(str, true);
public static boolean isGatkIntervalFile(final String str) {
return isGatkIntervalFile(str, true);
}

/**
* Check if string argument was intended as a file
* Accepted file extensions are defined in {@link #INTERVAL_FILE_EXTENSIONS}
* Accepted file extensions are defined in {@link #GATK_INTERVAL_FILE_EXTENSIONS}
* @param str token to identify as a filename.
* @param checkExists if true throws an exception if the file doesn't exist and has an interval file extension
* @return true if the token looks like an interval file name, or false otherwise.
*/
public static boolean isIntervalFile(final String str, final boolean checkExists) {
public static boolean isGatkIntervalFile(final String str, final boolean checkExists) {
Utils.nonNull(str);
boolean hasIntervalFileExtension = false;
for ( final String extension : INTERVAL_FILE_EXTENSIONS ) {
for ( final String extension : GATK_INTERVAL_FILE_EXTENSIONS) {
if ( str.toLowerCase().endsWith(extension) ) {
hasIntervalFileExtension = true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ public void testPicardIntervalList() throws Exception {
final TestGATKToolWithReads tool = new TestGATKToolWithReads();
final CommandLineParser clp = new CommandLineArgumentParser(tool);
final File bamFile = new File(publicTestDir + "org/broadinstitute/hellbender/engine/reads_data_source_test1.bam");
final File intervalsFile = new File(publicTestDir + "picard_intervals.list");
final File intervalsFile = new File(publicTestDir + "picard_intervals.interval_list");
final String[] args = {
"-I", bamFile.getCanonicalPath(),
"-L", intervalsFile.getCanonicalPath()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ public void testTetraploidRun() throws IOException {
args.addArgument("variant",getToolTestDataDir()+"tetraploid-gvcf-1.vcf");
args.addArgument("variant",getToolTestDataDir()+"tetraploid-gvcf-2.vcf");
args.addArgument("variant",getToolTestDataDir()+"tetraploid-gvcf-3.vcf");
args.addArgument("intervals", getToolTestDataDir() + "tetraploid-gvcfs.intervals");
args.addArgument("intervals", getToolTestDataDir() + "tetraploid-gvcfs.interval_list");

runCommandLine(args);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package org.broadinstitute.hellbender.tools.walkers;

import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.IntervalList;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.Test;
import picard.util.IntervalList.IntervalListScatterMode;
Expand Down Expand Up @@ -151,7 +153,7 @@ private static void verifyScatteredFilesExist(final int scatterCount, final File
}

private static List<SimpleInterval> readIntervals(final File intervalsFile) {
return IntervalUtils.intervalFileToList(GLP, intervalsFile.getAbsolutePath()).stream().map(SimpleInterval::new).collect(Collectors.toList());
return Utils.stream(IntervalList.fromFile(intervalsFile)).map(SimpleInterval::new).collect(Collectors.toList());
}

private static void checkIntervalSizes(final int scatterCount, final File outputDir, final int expectedTotalLength, String extension) {
Expand Down
Loading