Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add step to filter gtf for only chromosomes that exist in the fasta file #274

Merged
merged 24 commits into from
Sep 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,9 @@ script:
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --aligner hisat2
# Run with STAR and Salmon
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon
# Run with STAR and Salmon + build transcriptome from genome fasta + gtf
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon --transcript_fasta false
# Run with Salmon and no alignment
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon --skipAlignment
# Test gff -> gtf conversion
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --gff false
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
* Add `--gencode` option for compatibility of Salmon and featureCounts biotypes with GENCODE gene annotations
* Use `file` instead of `new File` to create `pipeline_report.{html,txt}` files, and properly create subfolders
* Add `--skipAlignment` option to only use pseudo-alignment and no alignment with STAR or HiSat2
* Check that gtf features are on chromosomes that exist in the genome fasta file [#274](https://github.com/nf-core/rnaseq/pull/274)
* Maintain all gff features upon gtf conversion (keeps gene_biotype)


### Dependency Updates

Expand Down
72 changes: 72 additions & 0 deletions bin/filter_gtf_for_genes_in_genome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env python
from __future__ import print_function
import logging
from itertools import groupby
import argparse

# Create a logger
logging.basicConfig(format='%(name)s - %(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)

def is_header(line):
return line[0] == '>'


def extract_fasta_seq_names(fasta_name):
"""
modified from Brent Pedersen
Correct Way To Parse A Fasta File In Python
given a fasta file. yield tuples of header, sequence
from https://www.biostars.org/p/710/
"""
"first open the file outside "
fh = open(fasta_name)

# ditch the boolean (x[0]) and just keep the header or sequence since
# we know they alternate.
faiter = (x[1] for x in groupby(fh, is_header))

for i, header in enumerate(faiter):
line = next(header)
if is_header(line):
# drop the ">"
headerStr = line[1:].strip().split()[0]
yield headerStr


def extract_genes_in_genome(fasta, gtf_in, gtf_out):
seq_names_in_genome = set(extract_fasta_seq_names(fasta))
logger.info("Extracted chromosome sequence names from : %s" % fasta)
logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome)))
seq_names_in_gtf = set([])

n_total_lines = 0
n_lines_in_genome = 0
with open(gtf_out, 'w') as f:
with open(gtf_in) as g:

for line in g.readlines():
n_total_lines += 1
seq_name_gtf = line.split("\t")[0]
seq_names_in_gtf.add(seq_name_gtf)
if seq_name_gtf in seq_names_in_genome:
n_lines_in_genome += 1
f.write(line)
logger.info("Extracted %d / %d lines from %s matching sequences in %s" %
(n_lines_in_genome, n_total_lines, gtf_in, fasta))
logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_name_gtf)))

logger.info("Wrote matching lines to %s" % gtf_out)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="""Filter GTF only for features in the genome""")
parser.add_argument("--gtf", type=str, help="GTF file")
parser.add_argument("--fasta", type=str, help="Genome fasta file")
parser.add_argument("-o", "--output", dest='output',
default='genes_in_genome.gtf',
type=str, help="GTF features on fasta genome sequences")

args = parser.parse_args()
extract_genes_in_genome(args.fasta, args.gtf, args.output)
1 change: 1 addition & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,6 @@ params {
// Genome references
fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fa'
gtf = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gtf'
gff = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gff'
transcript_fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/transcriptome.fasta'
}
14 changes: 11 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -395,9 +395,13 @@ process get_software_versions {
/*
* PREPROCESSING - Convert GFF3 to GTF
*/
if(params.gff){
// Prefer gtf over gff
log.info "Prefer GTF over GFF, so ignoring provided GFF in favor of GTF"
if(params.gff && !params.gtf){
process convertGFFtoGTF {
tag "$gff"
publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
saveAs: { params.saveReference ? it : null }, mode: 'copy'

input:
file gff from gffFile
Expand All @@ -408,9 +412,11 @@ if(params.gff){

script:
"""
gffread $gff -T -o ${gff.baseName}.gtf
gffread $gff --keep-exon-attrs -F -T -o ${gff.baseName}.gtf
apeltzer marked this conversation as resolved.
Show resolved Hide resolved
"""
}
} else {
log.info "Prefer GTF over GFF, so ignoring provided GFF in favor of GTF"
}

/*
Expand Down Expand Up @@ -557,8 +563,10 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){
file "*.fa" into ch_fasta_for_salmon_index

script:
// filter_gtf_for_genes_in_genome.py is bundled in this package, in rnaseq/bin
"""
gffread -w transcripts.fa -g $fasta $gtf
filter_gtf_for_genes_in_genome.py --gtf $gtf --fasta $fasta -o ${gtf.baseName}__in__${fasta.baseName}.gtf
gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf
"""
}
}
Expand Down