-
Notifications
You must be signed in to change notification settings - Fork 591
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Issue #7159 Create tool for producing genomic regions (as a BED file) (
#8942) * Initial commit and basic code to read gtf * add: code to write to bed & integration test * fix: make getAllFeatures public and use the nesting of features to get to transcripts * add: filtering transcripts by basic tag * add: sorts by contig and start (need to fix - sorting lexicographically) * fix: now sorts by contig then start & output is correct * fix: make dictionary an arg * add: comments + simplified CompareGtfInfo * refactor: apply method test: add separate tests for gene and transcript * refactor: onTraversalSuccess and writeToBed * add: more tests * fix: test files in correct dir pt1. (files are too large) * fix: test files in correct dir pt2. * add: compareFiles and ground truth bed files * fix: runGtfToBed assert * add: comments to GtfToBed * fix: error handling for different versions of gtf and dictionary * fix: edited some bad conventions * fix: remove spaces from input file fullName * add: gtf file with MYT1L and MAPK1 * add: many transcripts unit test and refactoring * add: tiebreaker sorting by id * add: make sort by basic optional * add: html doc comment * fix: dictionary arg * fix: add "Gencode" to description * add: sample mouse gencode testing * fix: Remove arg shortnames * fix: rename and move CompareGtfInfo * fix: kebab-case args * fix: update html doc * fix: use IntegrationTestSpec.assertEqualTextFiles() * fix: remove unnecessary test of pik3ca * fix: remove set functions in GtfInfo * fix: style of comparator * fix: style of comparator * fix: use Files.newOutputStream() to write and logger for errors * fix: use getBestAvailableSequenceDictionary() * fix: use dataProvider for integration tests * fix: better encapsulation * fix: move mapk1.gtf to large dir * fix: arg names * fix: rename reference dict. * fix: sequence-dictionary arg javadoc * add: javadoc to GtfInfo * add: dictionary exception and corresponding test * add: test with fasta file as reference arg * add: javadoc for fasta file * fix: javadoc and onTraversalStart exception
- Loading branch information
1 parent
9f2fbb5
commit 3c1430d
Showing
22 changed files
with
8,675 additions
and
3 deletions.
There are no files selected for viewing
66 changes: 66 additions & 0 deletions
66
src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfInfo.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package org.broadinstitute.hellbender.tools.walkers.conversion; | ||
|
||
import htsjdk.samtools.util.Interval; | ||
|
||
/** | ||
* A class that represents information extracted from a feature in a GTF file. | ||
* The {@code GtfInfo} object encapsulates details about a specific gene or transcript, | ||
* including its type, gene name, and interval. | ||
* | ||
* <p>The {@code GtfInfo.Type} enum specifies whether the features is a | ||
* {@code GENE} or a {@code TRANSCRIPT}.</p> | ||
* | ||
* <p>The interval specifies the feature's contig (chromosome), start position, | ||
* and end position, providing the precise location of the gene or transcript | ||
* on the genome.</p> | ||
* | ||
* <p>Example usage:</p> | ||
* <pre> | ||
* Interval interval = new Interval("chr1", 1000, 2000); | ||
* GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, "MAPK1"); | ||
* </pre> | ||
**/ | ||
|
||
public class GtfInfo { | ||
|
||
public enum Type { | ||
GENE, | ||
TRANSCRIPT | ||
} | ||
|
||
private final Type type; | ||
private final String geneName; | ||
private final Interval interval; | ||
|
||
public GtfInfo(Interval interval, Type type, String geneName) { | ||
this.interval = interval; | ||
this.type = type; | ||
this.geneName = geneName; | ||
} | ||
|
||
public Type getType() { | ||
return type; | ||
} | ||
|
||
public String getGeneName() { | ||
return geneName; | ||
} | ||
|
||
public Interval getInterval() { | ||
return interval; | ||
} | ||
|
||
public Integer getStart() { | ||
return interval.getStart(); | ||
} | ||
|
||
public Integer getEnd() { | ||
return interval.getEnd(); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "GtfInfo{ " + "type = " + type + " geneName = " + geneName + "interval = " + interval; | ||
} | ||
|
||
} |
322 changes: 322 additions & 0 deletions
322
src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfToBed.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,322 @@ | ||
package org.broadinstitute.hellbender.tools.walkers.conversion; | ||
|
||
import htsjdk.samtools.SAMSequenceDictionary; | ||
import htsjdk.samtools.util.*; | ||
import htsjdk.tribble.Feature; | ||
import org.apache.log4j.LogManager; | ||
import org.apache.log4j.Logger; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.argparser.WorkflowProperties; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup; | ||
import org.broadinstitute.hellbender.engine.*; | ||
import org.broadinstitute.hellbender.exceptions.GATKException; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.Utils; | ||
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature; | ||
|
||
import java.io.IOException; | ||
import java.io.OutputStream; | ||
import java.nio.file.Files; | ||
import java.util.*; | ||
|
||
/** | ||
* <p>Convert Gencode GTF files to BED format with options for gene and transcript level processing. | ||
* This tool allows for the extraction of gene and transcript information from Gencode GTF files and | ||
* outputs the data in BED format.</p> | ||
* | ||
* | ||
* <p>The conversion process includes sorting entries | ||
* by karyotype, providing flexibility in the selection of either gene or transcript level | ||
* data, and an option to only use basic tags. It ensures that the BED output is sorted and formatted correctly for subsequent use. | ||
* Note that it has been tested for both human and mouse Gencode GTFs. </p> | ||
* | ||
* <h3>Usage examples</h3> | ||
* <p>Example commands to run GtfToBed for typical scenarios:</p> | ||
* | ||
* <h4>(i) Convert GTF to BED with gene level data</h4> | ||
* <p>This mode extracts and converts gene data from the input GTF file to BED format:</p> | ||
* | ||
* <pre> | ||
* java -jar GtfToBed.jar \ | ||
* -gtf-path input.gtf \ | ||
* -sequence-dictionary dictionary.dict \ | ||
= * -output output.bed \ | ||
* </pre> | ||
* | ||
* <h4>(ii) Convert GTF to BED with transcript level data</h4> | ||
* <p>This mode extracts and converts transcript data from the input GTF file to BED format:</p> | ||
* | ||
* <pre> | ||
* java -jar GtfToBed.jar \ | ||
* -gtf-path input.gtf \ | ||
* -sequence-dictionary dictionary.dict \ | ||
* -sort-by-transcript \ | ||
* -output output.bed \ | ||
* </pre> | ||
* | ||
* <h4>(iii) Convert GTF to BED with transcript level data, filtering for only those with the basic tag</h4> | ||
* <p>This mode extracts and converts basic transcript data from the input GTF file to BED format:</p> | ||
* | ||
* <pre> | ||
* java -jar GtfToBed.jar \ | ||
* -gtf-path input.gtf \ | ||
* -sequence-dictionary dictionary.dict \ | ||
* -sort-by-transcript \ | ||
* -use-basic-transcript \ | ||
* -output output.bed \ | ||
* </pre> | ||
* | ||
* <h4>(iiii) </h4> Convert GTF to BED with transcript level data using a reference FASTA instead of DICT file. | ||
* <p>This mode extracts and converts transcript data from the input GTF file to BED format using a FASTA file:</p> | ||
* | ||
* <pre> | ||
* java -jar GtfToBed.jar \ | ||
* -gtf-path input.gtf \ | ||
* -reference ref.fa \ | ||
* -sort-by-transcript \ | ||
* -output output.bed \ | ||
* </pre> | ||
*/ | ||
|
||
@CommandLineProgramProperties( | ||
summary = "Converts Gencode GTF files to Bed file format with each row of bed file being either a gene or a transcript.", | ||
oneLineSummary = "Gencode GTF to BED", | ||
programGroup = ShortVariantDiscoveryProgramGroup.class | ||
) | ||
|
||
@DocumentedFeature | ||
@WorkflowProperties | ||
public class GtfToBed extends FeatureWalker<GencodeGtfFeature> { | ||
public static final String SORT_BY_TRANSCRIPT_LONG_NAME = "sort-by-transcript"; | ||
public static final String USE_BASIC_TRANSCRIPT_LONG_NAME = "use-basic-transcript"; | ||
public static final String INPUT_LONG_NAME = "gtf-path"; | ||
protected final Logger logger = LogManager.getLogger(this.getClass()); | ||
|
||
@Argument(fullName = INPUT_LONG_NAME, doc = "Path to Gencode GTF file") | ||
public GATKPath inputFile; | ||
|
||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME , doc = "Output BED file") | ||
public GATKPath outputFile; | ||
|
||
@Argument(fullName = SORT_BY_TRANSCRIPT_LONG_NAME, doc = "Make each row of BED file sorted by transcript", optional = true) | ||
public boolean sortByTranscript = false; | ||
|
||
@Argument(fullName = USE_BASIC_TRANSCRIPT_LONG_NAME, doc = "Only use basic transcripts") | ||
public boolean sortByBasic = false; | ||
|
||
//stores either gene or transcript ID and summary information about the feature | ||
private final Map<String, GtfInfo> featureInfoMap = new HashMap<>(); | ||
|
||
//Sequence Dictionary | ||
private SAMSequenceDictionary sequenceDictionary = null; | ||
|
||
@Override | ||
protected boolean isAcceptableFeatureType(Class<? extends Feature> featureType) { | ||
return featureType.isAssignableFrom(GencodeGtfFeature.class); | ||
} | ||
|
||
// runs per line of gtf file | ||
@Override | ||
public void apply(GencodeGtfFeature feature, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { | ||
// list of all features of the gene | ||
List<GencodeGtfFeature> geneFeatures = feature.getAllFeatures(); | ||
|
||
// process each gtf feature in the list of gene features | ||
for (GencodeGtfFeature gtfFeature : geneFeatures) { | ||
// the basic tag is in optional fields | ||
List<GencodeGtfFeature.OptionalField<?>> optionalFields = getOptionalFields(gtfFeature); | ||
|
||
// if the gtf feature is a Gene | ||
if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.GENE) { | ||
processGeneFeature(gtfFeature); | ||
} | ||
|
||
// check if the gtf feature is a transcript. If user only wants basic transcripts check that it has the basic tag | ||
else if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.TRANSCRIPT) { | ||
if (sortByBasic) { | ||
for (GencodeGtfFeature.OptionalField<?> field : optionalFields) { | ||
if ("basic".equals(field.getValue())) { | ||
processTranscriptFeature(gtfFeature); | ||
} | ||
} | ||
} else { | ||
processTranscriptFeature(gtfFeature); | ||
} | ||
} | ||
} | ||
} | ||
|
||
// gets the tag out of the list of optional fields | ||
private List<GencodeGtfFeature.OptionalField<?>> getOptionalFields(GencodeGtfFeature gtfFeature) { | ||
List<GencodeGtfFeature.OptionalField<?>> optionalFields = null; | ||
try { | ||
optionalFields = gtfFeature.getOptionalField("tag"); | ||
} catch (Exception e) { | ||
logger.error("Could not retrieve optional fields: ", e); | ||
} | ||
return optionalFields; | ||
} | ||
|
||
// stores the gene ID and Interval info in hashmap | ||
private void processGeneFeature(GencodeGtfFeature gtfFeature) { | ||
final int geneStart = gtfFeature.getStart(); | ||
final int geneEnd = gtfFeature.getEnd(); | ||
final Interval interval = new Interval(gtfFeature.getContig(), geneStart, geneEnd); | ||
|
||
// put the interval, type as gene, and the name of gene | ||
final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, gtfFeature.getGeneName()); | ||
|
||
// store in hashmap with key as geneId | ||
featureInfoMap.put(gtfFeature.getGeneId(), gtfInfo); | ||
} | ||
|
||
// stores the transcript ID and Interval info in hashmap | ||
private void processTranscriptFeature(GencodeGtfFeature gtfFeature) { | ||
//get interval and put the interval, type as transcript, and the name of the gene it's in | ||
final Interval interval = new Interval(gtfFeature.getContig(), gtfFeature.getStart(), gtfFeature.getEnd()); | ||
final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.TRANSCRIPT, gtfFeature.getGeneName()); | ||
|
||
//store in hashmap with key as transcriptId | ||
featureInfoMap.put(gtfFeature.getTranscriptId(), gtfInfo); | ||
|
||
//update start/end of corresponding gene if needed | ||
updateGeneStart(gtfFeature); | ||
updateGeneEnd(gtfFeature); | ||
} | ||
|
||
// update the gene interval start position based on the transcript | ||
private void updateGeneStart(GencodeGtfFeature gtfFeature) { | ||
// get the start value of the gene | ||
int geneStart = featureInfoMap.get(gtfFeature.getGeneId()).getStart(); | ||
|
||
// if the transcript start is less than the gene start | ||
if (gtfFeature.getStart() < geneStart) { | ||
// set the gene start to be the transcript start | ||
geneStart = gtfFeature.getStart(); | ||
updateGeneInterval(gtfFeature, geneStart, featureInfoMap.get(gtfFeature.getGeneId()).getEnd()); | ||
} | ||
} | ||
|
||
// update the gene interval end position based on the transcript | ||
private void updateGeneEnd(GencodeGtfFeature gtfFeature) { | ||
// get the end value of the gene | ||
int geneEnd = featureInfoMap.get(gtfFeature.getGeneId()).getEnd(); | ||
|
||
// if the transcript end is greater than the gene end | ||
if (gtfFeature.getEnd() > geneEnd) { | ||
// set the gene end to be the transcript end | ||
geneEnd = gtfFeature.getEnd(); | ||
updateGeneInterval(gtfFeature, featureInfoMap.get(gtfFeature.getGeneId()).getStart(), geneEnd); | ||
} | ||
} | ||
|
||
// updates an interval of the gene if it needs to be changed | ||
private void updateGeneInterval(GencodeGtfFeature gtfFeature, int geneStart, int geneEnd) { | ||
Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd); | ||
GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName()); | ||
featureInfoMap.put(gtfFeature.getGeneId(), gtfGeneInfo); | ||
} | ||
|
||
@Override | ||
public void onTraversalStart() { | ||
sequenceDictionary = getBestAvailableSequenceDictionary(); | ||
if(sequenceDictionary == null){ | ||
throw new UserException("Sequence Dictionary must be specified (" + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME + ")."); | ||
} | ||
} | ||
|
||
// runs immediately after it has gone through each line of gtf (apply method) | ||
@Override | ||
public Object onTraversalSuccess() { | ||
// create linked hash map to store sorted values of idToInfo | ||
LinkedHashMap<String, GtfInfo> karyotypeIdToInfo = getSortedMap(sequenceDictionary); | ||
|
||
// if user wants to sort by transcript only use transcripts else only use genes | ||
GtfInfo.Type selectedType = sortByTranscript ? GtfInfo.Type.TRANSCRIPT : GtfInfo.Type.GENE; | ||
writeToBed(selectedType, karyotypeIdToInfo); | ||
|
||
return null; | ||
} | ||
|
||
//Compare GtfInfo objects positionally by contig and start position. If transcripts have the same contig and start, compare by TranscriptId | ||
public static class GtfInfoComparator implements Comparator<Map.Entry<String, GtfInfo>> { | ||
|
||
private final SAMSequenceDictionary dictionary; | ||
|
||
GtfInfoComparator(SAMSequenceDictionary dictionary) { | ||
this.dictionary = dictionary; | ||
} | ||
|
||
// compare two entries of a map where key = geneId or transcriptId and value = gtfInfo object | ||
@Override | ||
public int compare(Map.Entry<String, GtfInfo> e1, Map.Entry<String, GtfInfo> e2) { | ||
final Interval e1Interval = e1.getValue().getInterval(); | ||
final Interval e2Interval = e2.getValue().getInterval(); | ||
|
||
Utils.nonNull(dictionary.getSequence(e1Interval.getContig()), "could not get sequence for " + e1Interval.getContig()); | ||
Utils.nonNull(dictionary.getSequence(e2Interval.getContig()), "could not get sequence for " + e2Interval.getContig()); | ||
|
||
//compare by contig, then start, then by key | ||
return Comparator | ||
.comparingInt((Map.Entry<String, GtfInfo> e) -> | ||
dictionary.getSequence(e.getValue().getInterval().getContig()).getSequenceIndex()) | ||
.thenComparingInt(e -> e.getValue().getInterval().getStart()) | ||
.thenComparing(Map.Entry::getKey) | ||
.compare(e1,e2); | ||
|
||
|
||
} | ||
} | ||
|
||
// sorts the map containing the features based on contig and start position | ||
private LinkedHashMap<String, GtfInfo> getSortedMap(SAMSequenceDictionary sequenceDictionary) { | ||
// create a list that has the keys and values of idToInfo and sort the list using GtfInfoComparator | ||
List<Map.Entry<String, GtfInfo>> entries = new ArrayList<>(featureInfoMap.entrySet()); | ||
entries.sort(new GtfInfoComparator(sequenceDictionary)); | ||
|
||
// put each (sorted) entry in the list into a linked hashmap | ||
LinkedHashMap<String, GtfInfo> karyotypeIdToInfo = new LinkedHashMap<>(); | ||
for (Map.Entry<String, GtfInfo> entry : entries) { | ||
karyotypeIdToInfo.put(entry.getKey(), entry.getValue()); | ||
} | ||
|
||
return karyotypeIdToInfo; | ||
} | ||
|
||
// writes to bed file | ||
private void writeToBed(GtfInfo.Type type, Map<String, GtfInfo> sortedMap) { | ||
try (final OutputStream writer = Files.newOutputStream(outputFile.toPath())) { | ||
for (Map.Entry<String, GtfInfo> entry : sortedMap.entrySet()) { | ||
if (entry.getValue().getType() == type) { | ||
String line = formatBedLine(entry, type); | ||
writer.write((line + System.lineSeparator()).getBytes()); | ||
} | ||
} | ||
} catch (IOException e) { | ||
throw new GATKException("Error writing to BED file", e); | ||
} | ||
} | ||
|
||
// formats each line of the bed file depending on whether user has selected gene or transcript | ||
private String formatBedLine(Map.Entry<String, GtfInfo> entry, GtfInfo.Type type) { | ||
GtfInfo info = entry.getValue(); | ||
String line = info.getInterval().getContig() + "\t" + | ||
info.getInterval().getStart() + "\t" + | ||
info.getInterval().getEnd() + "\t" + | ||
info.getGeneName(); | ||
|
||
if (type == GtfInfo.Type.TRANSCRIPT) { | ||
line += "," + entry.getKey(); | ||
} | ||
|
||
return line; | ||
} | ||
|
||
@Override | ||
public GATKPath getDrivingFeaturePath() { | ||
return inputFile; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.