nf-core · apeltzer · Sep 6, 2019 · Aug 30, 2019 · Aug 30, 2019 · Aug 30, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -44,5 +44,9 @@ script:
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --aligner hisat2
   # Run with STAR and Salmon
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon
+  # Run with STAR and Salmon + build transcriptome from genome fasta + gtf
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon --transcript_fasta false
   # Run with Salmon and no alignment
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon --skipAlignment
+  # Test gff -> gtf conversion
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --gff false
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,9 @@
 * Add `--gencode` option for compatibility of Salmon and featureCounts biotypes with GENCODE gene annotations
 * Use `file` instead of `new File` to create `pipeline_report.{html,txt}` files, and properly create subfolders
 * Add `--skipAlignment` option to only use pseudo-alignment and no alignment with STAR or HiSat2
+* Check that gtf features are on chromosomes that exist in the genome fasta file [#274](https://github.com/nf-core/rnaseq/pull/274)
+* Maintain all gff features upon gtf conversion (keeps gene_biotype)
+
 
 ### Dependency Updates
 

diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+from __future__ import print_function
+import logging
+from itertools import groupby
+import argparse
+
+# Create a logger
+logging.basicConfig(format='%(name)s - %(asctime)s %(levelname)s: %(message)s')
+logger = logging.getLogger(__file__)
+logger.setLevel(logging.INFO)
+
+def is_header(line):
+    return line[0] == '>'
+
+
+def extract_fasta_seq_names(fasta_name):
+    """
+    modified from Brent Pedersen
+    Correct Way To Parse A Fasta File In Python
+    given a fasta file. yield tuples of header, sequence
+    from https://www.biostars.org/p/710/
+    """
+    "first open the file outside "
+    fh = open(fasta_name)
+
+    # ditch the boolean (x[0]) and just keep the header or sequence since
+    # we know they alternate.
+    faiter = (x[1] for x in groupby(fh, is_header))
+
+    for i, header in enumerate(faiter):
+        line = next(header)
+        if is_header(line):
+            # drop the ">"
+            headerStr = line[1:].strip().split()[0]
+        yield headerStr
+
+
+def extract_genes_in_genome(fasta, gtf_in, gtf_out):
+    seq_names_in_genome = set(extract_fasta_seq_names(fasta))
+    logger.info("Extracted chromosome sequence names from : %s" % fasta)
+    logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome)))
+    seq_names_in_gtf = set([])
+
+    n_total_lines = 0
+    n_lines_in_genome = 0
+    with open(gtf_out, 'w') as f:
+        with open(gtf_in) as g:
+
+            for line in g.readlines():
+                n_total_lines += 1
+                seq_name_gtf = line.split("\t")[0]
+                seq_names_in_gtf.add(seq_name_gtf)
+                if seq_name_gtf in seq_names_in_genome:
+                    n_lines_in_genome += 1
+                    f.write(line)
+    logger.info("Extracted %d / %d lines from %s matching sequences in %s" %
+                (n_lines_in_genome, n_total_lines, gtf_in, fasta))
+    logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_name_gtf)))
+
+    logger.info("Wrote matching lines to %s" % gtf_out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="""Filter GTF only for features in the genome""")
+    parser.add_argument("--gtf", type=str, help="GTF file")
+    parser.add_argument("--fasta", type=str, help="Genome fasta file")
+    parser.add_argument("-o", "--output", dest='output',
+                        default='genes_in_genome.gtf',
+                        type=str, help="GTF features on fasta genome sequences")
+
+    args = parser.parse_args()
+    extract_genes_in_genome(args.fasta, args.gtf, args.output)
diff --git a/conf/test.config b/conf/test.config
@@ -25,5 +25,6 @@ params {
   // Genome references
   fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fa'
   gtf = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gtf'
+  gff = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gff'
   transcript_fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/transcriptome.fasta'
 }
diff --git a/main.nf b/main.nf
@@ -395,9 +395,13 @@ process get_software_versions {
 /*
  * PREPROCESSING - Convert GFF3 to GTF
  */
-if(params.gff){
+// Prefer gtf over gff
+log.info "Prefer GTF over GFF, so ignoring provided GFF in favor of GTF"
+if(params.gff && !params.gtf){
     process convertGFFtoGTF {
         tag "$gff"
+        publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
+                   saveAs: { params.saveReference ? it : null }, mode: 'copy'
 
         input:
         file gff from gffFile
@@ -408,9 +412,11 @@ if(params.gff){
 
         script:
         """
-        gffread $gff -T -o ${gff.baseName}.gtf
+        gffread $gff --keep-exon-attrs -F -T -o ${gff.baseName}.gtf
         """
     }
+} else {
+  log.info "Prefer GTF over GFF, so ignoring provided GFF in favor of GTF"
 }
 
 /*
@@ -557,8 +563,10 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){
             file "*.fa" into ch_fasta_for_salmon_index
 
             script:
+	          // filter_gtf_for_genes_in_genome.py is bundled in this package, in rnaseq/bin
             """
-            gffread -w transcripts.fa -g $fasta $gtf
+            filter_gtf_for_genes_in_genome.py --gtf $gtf --fasta $fasta -o ${gtf.baseName}__in__${fasta.baseName}.gtf
+            gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf
             """
         }
     }