From 25d616b6c20baa22c47becb71a6669e2982d5980 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 30 Aug 2019 14:57:04 -0700 Subject: [PATCH 01/24] Add script to filter gtf on seqnames in genome fasta --- bin/filter_gtf_for_genes_in_genome.py | 62 +++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 bin/filter_gtf_for_genes_in_genome.py diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py new file mode 100644 index 000000000..d4ba5570f --- /dev/null +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +from __future__ import print_function +import logging +from itertools import groupby +import argparse + +# Create a logger +logging.basicConfig(format='%(name)s - %(asctime)s %(levelname)s: %(message)s') +logger = logging.getLogger(__file__) +logger.setLevel(logging.INFO) + + +def extract_fasta_seq_names(fasta_name): + """ + modified from Brent Pedersen + Correct Way To Parse A Fasta File In Python + given a fasta file. yield tuples of header, sequence + from https://www.biostars.org/p/710/ + """ + "first open the file outside " + fh = open(fasta_name) + + # ditch the boolean (x[0]) and just keep the header or sequence since + # we know they alternate. + faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">")) + + for header in faiter: + # drop the ">" + headerStr = header.__next__()[1:].strip() + + yield headerStr + + +def extract_genes_in_genome(fasta, gtf_in, gtf_out): + seq_names_in_genome = set(extract_fasta_seq_names(fasta)) + logger.info("Extracted chromosome sequence names from : %s" % fasta) + + n_total_lines = 0 + n_lines_in_genome = 0 + with open(gtf_out, 'w') as f: + with open(gtf_in) as g: + + for line in g.readlines(): + n_total_lines += 1 + if line.split('\t')[0] in seq_names_in_genome: + n_lines_in_genome += 1 + f.write(line) + logger.info("Extracted %d / %d lines from %s matching sequences in %s" % + (n_lines_in_genome, n_total_lines, gtf_in, fasta)) + logger.info("Wrote matching lines to %s" % gtf_out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Filter GTF only for features in the genome""") + parser.add_argument("--gtf", type=str, help="GTF file") + parser.add_argument("--fasta", type=str, help="Genome fasta file") + parser.add_argument("-o", "--output", dest='output', + default='genes_in_genome.gtf', + type=str, help="GTF features on fasta genome sequences") + + args = parser.parse_args() + extract_genes_in_genome(args.gtf, args.fasta, args.output) From 64bcc6a0f492355f0f81b145cae47200a4635a94 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 30 Aug 2019 15:06:18 -0700 Subject: [PATCH 02/24] Add filter for genes in genome --- .idea/libraries/R_User_Library.xml | 6 +++ .idea/markdown-navigator.xml | 71 ++++++++++++++++++++++++++++++ .idea/vcs.xml | 6 +++ main.nf | 3 +- 4 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 .idea/libraries/R_User_Library.xml create mode 100644 .idea/markdown-navigator.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml new file mode 100644 index 000000000..71f5ff749 --- /dev/null +++ b/.idea/libraries/R_User_Library.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/markdown-navigator.xml b/.idea/markdown-navigator.xml new file mode 100644 index 000000000..3efe9d64c --- /dev/null +++ b/.idea/markdown-navigator.xml @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 000000000..94a25f7f4 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/main.nf b/main.nf index 153459e30..21809dff4 100644 --- a/main.nf +++ b/main.nf @@ -558,7 +558,8 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){ script: """ - gffread -w transcripts.fa -g $fasta $gtf +filter_gtf_for_genes_in_genome.py $gtf $fasta ${gtf.baseName}__in__${genome.baseName}.gtf + gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${genome.baseName}.gtf """ } } From 090b65c8ff243e285f577118b685d1a74cde1733 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 30 Aug 2019 15:06:39 -0700 Subject: [PATCH 03/24] remove pycharm nonsesnse --- .idea/libraries/R_User_Library.xml | 6 --- .idea/markdown-navigator.xml | 71 ------------------------------ .idea/vcs.xml | 6 --- 3 files changed, 83 deletions(-) delete mode 100644 .idea/libraries/R_User_Library.xml delete mode 100644 .idea/markdown-navigator.xml delete mode 100644 .idea/vcs.xml diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml deleted file mode 100644 index 71f5ff749..000000000 --- a/.idea/libraries/R_User_Library.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/markdown-navigator.xml b/.idea/markdown-navigator.xml deleted file mode 100644 index 3efe9d64c..000000000 --- a/.idea/markdown-navigator.xml +++ /dev/null @@ -1,71 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7f4..000000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From a8e582b6f9557b4a032edc95a26cc592e20f137f Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 30 Aug 2019 15:57:34 -0700 Subject: [PATCH 04/24] Retain all gff features upon conversion to gtf --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 21809dff4..2e23f203a 100644 --- a/main.nf +++ b/main.nf @@ -408,7 +408,7 @@ if(params.gff){ script: """ - gffread $gff -T -o ${gff.baseName}.gtf + gffread $gff -F -T -o ${gff.baseName}.gtf """ } } @@ -557,6 +557,7 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){ file "*.fa" into ch_fasta_for_salmon_index script: + // filter_gtf_for_genes_in_genome.py is bundled in this package, in in rnaseq/bin """ filter_gtf_for_genes_in_genome.py $gtf $fasta ${gtf.baseName}__in__${genome.baseName}.gtf gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${genome.baseName}.gtf From 96110a39bf9edcfb2e533f51d56688372c925071 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 2 Sep 2019 11:51:07 -0700 Subject: [PATCH 05/24] Add changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c3342447..fd93fce1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ * Add `--gencode` option for compatibility of Salmon and featureCounts biotypes with GENCODE gene annotations * Use `file` instead of `new File` to create `pipeline_report.{html,txt}` files, and properly create subfolders * Add `--skipAlignment` option to only use pseudo-alignment and no alignment with STAR or HiSat2 +* Check that gtf features are on chromosomes that exist in the genome fasta file [#274](https://github.com/nf-core/rnaseq/pull/274) ### Dependency Updates From 96e681a5544a69e775f410fea8e97aaab77f890d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 2 Sep 2019 12:01:17 -0700 Subject: [PATCH 06/24] Add another line in changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd93fce1f..2ee34656e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,7 +30,8 @@ * Use `file` instead of `new File` to create `pipeline_report.{html,txt}` files, and properly create subfolders * Add `--skipAlignment` option to only use pseudo-alignment and no alignment with STAR or HiSat2 * Check that gtf features are on chromosomes that exist in the genome fasta file [#274](https://github.com/nf-core/rnaseq/pull/274) - +* Maintain all gff features upon gtf conversion (keeps gene_biotype) + ### Dependency Updates * Picard 2.20.0 -> 2.20.2 From 337a4da6b50b5371b1c1c35a03efd02f83786259 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 3 Sep 2019 09:32:40 -0700 Subject: [PATCH 07/24] Save gtf file converted from gff --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index 2e23f203a..4b9eb27b3 100644 --- a/main.nf +++ b/main.nf @@ -398,6 +398,8 @@ process get_software_versions { if(params.gff){ process convertGFFtoGTF { tag "$gff" + publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir }, + saveAs: { params.saveReference ? it : null }, mode: 'copy' input: file gff from gffFile From 9ae4e4e4ee138c72ad0fcf62d3fd713a47b86a17 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 3 Sep 2019 09:32:54 -0700 Subject: [PATCH 08/24] Keep exon attributes in conversion to gtf --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 4b9eb27b3..22846a261 100644 --- a/main.nf +++ b/main.nf @@ -410,7 +410,7 @@ if(params.gff){ script: """ - gffread $gff -F -T -o ${gff.baseName}.gtf + gffread $gff --keep-exon-attrs -F -T -o ${gff.baseName}.gtf """ } } From 082f348833622ed29a19fa69c78439e2c92c6f11 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 3 Sep 2019 09:33:08 -0700 Subject: [PATCH 09/24] Whitespace fixes --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 22846a261..4cca5fe14 100644 --- a/main.nf +++ b/main.nf @@ -561,7 +561,7 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){ script: // filter_gtf_for_genes_in_genome.py is bundled in this package, in in rnaseq/bin """ -filter_gtf_for_genes_in_genome.py $gtf $fasta ${gtf.baseName}__in__${genome.baseName}.gtf + filter_gtf_for_genes_in_genome.py $gtf $fasta ${gtf.baseName}__in__${genome.baseName}.gtf gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${genome.baseName}.gtf """ } From f4cdcb5631b8565087e6cb9c5fe3563e5a44d733 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 3 Sep 2019 09:35:59 -0700 Subject: [PATCH 10/24] whitespace fixes --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ee34656e..6275b403a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,9 +29,10 @@ * Add `--gencode` option for compatibility of Salmon and featureCounts biotypes with GENCODE gene annotations * Use `file` instead of `new File` to create `pipeline_report.{html,txt}` files, and properly create subfolders * Add `--skipAlignment` option to only use pseudo-alignment and no alignment with STAR or HiSat2 -* Check that gtf features are on chromosomes that exist in the genome fasta file [#274](https://github.com/nf-core/rnaseq/pull/274) +* Check that gtf features are on chromosomes that exist in the genome fasta file [#274](https://github.com/nf-core/rnaseq/pull/274) * Maintain all gff features upon gtf conversion (keeps gene_biotype) - + + ### Dependency Updates * Picard 2.20.0 -> 2.20.2 From 8b6893049a7e578218e549693ac5053c1e75eb21 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 3 Sep 2019 09:50:44 -0700 Subject: [PATCH 11/24] Add gff to test config --- .travis.yml | 2 ++ conf/test.config | 1 + 2 files changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index a1ebc155b..239dee90d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,3 +46,5 @@ script: - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon # Run with Salmon and no alignment - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon --skipAlignment + # Test gff -> gtf conversion + - nextflow run -profile test,docker . --gff false diff --git a/conf/test.config b/conf/test.config index 5a4a0980b..6ed07bdcd 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,5 +25,6 @@ params { // Genome references fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fa' gtf = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gtf' + gff = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gff' transcript_fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/transcriptome.fasta' } From 9daeced73ee42641f3b7aa3221d2a98006bf59e9 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 07:54:22 -0700 Subject: [PATCH 12/24] Fix typo Co-Authored-By: Alexander Peltzer --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 4cca5fe14..b9afcdb58 100644 --- a/main.nf +++ b/main.nf @@ -559,7 +559,7 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){ file "*.fa" into ch_fasta_for_salmon_index script: - // filter_gtf_for_genes_in_genome.py is bundled in this package, in in rnaseq/bin + // filter_gtf_for_genes_in_genome.py is bundled in this package, in rnaseq/bin """ filter_gtf_for_genes_in_genome.py $gtf $fasta ${gtf.baseName}__in__${genome.baseName}.gtf gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${genome.baseName}.gtf From e61ad3289d74697d95afa2001d5fa014c2413582 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 08:03:34 -0700 Subject: [PATCH 13/24] Prefer gtf over gff --- main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b9afcdb58..da87377d9 100644 --- a/main.nf +++ b/main.nf @@ -395,7 +395,9 @@ process get_software_versions { /* * PREPROCESSING - Convert GFF3 to GTF */ -if(params.gff){ +// Prefer gtf over gff +log.info "Prefer GTF over GFF, so ignoring provided GFF in favor of GTF" +if(params.gff && !params.gtf){ process convertGFFtoGTF { tag "$gff" publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir }, From a7bd0adadcac0a532ccf7d578e142e34e848b121 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 10:15:14 -0700 Subject: [PATCH 14/24] Fix nextflow run command on travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 239dee90d..ee025ec3c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -47,4 +47,4 @@ script: # Run with Salmon and no alignment - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon --skipAlignment # Test gff -> gtf conversion - - nextflow run -profile test,docker . --gff false + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --gff false From aa341cd3748ce1fe26e96bd715ff99107c5698e8 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 16:13:48 -0700 Subject: [PATCH 15/24] Move notice about ignoring GFF to when one is actually ignoring it --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index da87377d9..6d8868d59 100644 --- a/main.nf +++ b/main.nf @@ -415,6 +415,8 @@ if(params.gff && !params.gtf){ gffread $gff --keep-exon-attrs -F -T -o ${gff.baseName}.gtf """ } +} else { + log.info "Prefer GTF over GFF, so ignoring provided GFF in favor of GTF" } /* From 99b2b802896565552ac3f21fdc2905382749d98b Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 16:24:46 -0700 Subject: [PATCH 16/24] Use genome fasta name --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 6d8868d59..e67de399b 100644 --- a/main.nf +++ b/main.nf @@ -563,10 +563,10 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){ file "*.fa" into ch_fasta_for_salmon_index script: - // filter_gtf_for_genes_in_genome.py is bundled in this package, in rnaseq/bin + // filter_gtf_for_genes_in_genome.py is bundled in this package, in rnaseq/bin """ - filter_gtf_for_genes_in_genome.py $gtf $fasta ${gtf.baseName}__in__${genome.baseName}.gtf - gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${genome.baseName}.gtf + filter_gtf_for_genes_in_genome.py $gtf $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf + gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf """ } } From c02b7abe3f9845e47bb0f346c23fe19eb836df9d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 16:37:15 -0700 Subject: [PATCH 17/24] Change permissions to executable --- bin/filter_gtf_for_genes_in_genome.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/filter_gtf_for_genes_in_genome.py diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py old mode 100644 new mode 100755 From 4d6a56f2f560a9bc56664e05a1c308cf772726a7 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 16:42:22 -0700 Subject: [PATCH 18/24] Add test for extracting fasta transcripts --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index ee025ec3c..7b06789d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,6 +44,8 @@ script: - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --aligner hisat2 # Run with STAR and Salmon - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon + # Run with STAR and Salmon + build transcriptome from genome fasta + gtf + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon --transcript_fasta false # Run with Salmon and no alignment - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon --skipAlignment # Test gff -> gtf conversion From f4068d0f5b949446f26c2578c981c256f1a70c47 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 16:42:36 -0700 Subject: [PATCH 19/24] Add proper flags for filter_gtf_for_genes_in_genome.py --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index e67de399b..c7394c6a4 100644 --- a/main.nf +++ b/main.nf @@ -565,7 +565,7 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){ script: // filter_gtf_for_genes_in_genome.py is bundled in this package, in rnaseq/bin """ - filter_gtf_for_genes_in_genome.py $gtf $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf + filter_gtf_for_genes_in_genome.py --gtf $gtf --fasta $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf """ } From 866cc8dd847357cbb6a1e1f0544f0aa58b34b1bb Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 16:46:30 -0700 Subject: [PATCH 20/24] Add output flag --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index c7394c6a4..a468f8cf0 100644 --- a/main.nf +++ b/main.nf @@ -565,7 +565,7 @@ if(params.pseudo_aligner == 'salmon' && !params.salmon_index){ script: // filter_gtf_for_genes_in_genome.py is bundled in this package, in rnaseq/bin """ - filter_gtf_for_genes_in_genome.py --gtf $gtf --fasta $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf + filter_gtf_for_genes_in_genome.py --gtf $gtf --fasta $fasta -o ${gtf.baseName}__in__${fasta.baseName}.gtf gffread -F -w transcripts.fa -g $fasta ${gtf.baseName}__in__${fasta.baseName}.gtf """ } From 4c0484cda3cea8a6c477a831d0f09300d23bfe8e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 16:52:56 -0700 Subject: [PATCH 21/24] Split fasta id name by whitespace to get name --- bin/filter_gtf_for_genes_in_genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py index d4ba5570f..4df621c32 100755 --- a/bin/filter_gtf_for_genes_in_genome.py +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -26,7 +26,7 @@ def extract_fasta_seq_names(fasta_name): for header in faiter: # drop the ">" - headerStr = header.__next__()[1:].strip() + headerStr = header.__next__()[1:].strip().split()[0] yield headerStr From 7c626f8036bfe62a02215fdef7e8ca0f04de2b6a Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 16:58:24 -0700 Subject: [PATCH 22/24] Print out which genome seqnames were found --- bin/filter_gtf_for_genes_in_genome.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py index 4df621c32..35ce77650 100755 --- a/bin/filter_gtf_for_genes_in_genome.py +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -34,6 +34,7 @@ def extract_fasta_seq_names(fasta_name): def extract_genes_in_genome(fasta, gtf_in, gtf_out): seq_names_in_genome = set(extract_fasta_seq_names(fasta)) logger.info("Extracted chromosome sequence names from : %s" % fasta) + logger.info("\n".join(seq_names_in_genome)) n_total_lines = 0 n_lines_in_genome = 0 From 0579d5202c148a491c9832702df58606604935c5 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Sep 2019 17:00:04 -0700 Subject: [PATCH 23/24] Add more log message --- bin/filter_gtf_for_genes_in_genome.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py index 35ce77650..c90095496 100755 --- a/bin/filter_gtf_for_genes_in_genome.py +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -34,7 +34,8 @@ def extract_fasta_seq_names(fasta_name): def extract_genes_in_genome(fasta, gtf_in, gtf_out): seq_names_in_genome = set(extract_fasta_seq_names(fasta)) logger.info("Extracted chromosome sequence names from : %s" % fasta) - logger.info("\n".join(seq_names_in_genome)) + logger.info("\n".join(sorted(x for x in seq_names_in_genome))) + seq_names_in_gtf = set([]) n_total_lines = 0 n_lines_in_genome = 0 @@ -43,11 +44,16 @@ def extract_genes_in_genome(fasta, gtf_in, gtf_out): for line in g.readlines(): n_total_lines += 1 - if line.split('\t')[0] in seq_names_in_genome: + seq_name_gtf = line.split("\t")[0] + seq_names_in_gtf.add(seq_name_gtf) + if seq_name_gtf in seq_names_in_genome: n_lines_in_genome += 1 f.write(line) logger.info("Extracted %d / %d lines from %s matching sequences in %s" % (n_lines_in_genome, n_total_lines, gtf_in, fasta)) + logger.info("All sequence IDs from GTF") + logger.info("\n".join(sorted(x for x in seq_name_gtf))) + logger.info("Wrote matching lines to %s" % gtf_out) From 6c59356551c5f6801d60de5895aec193e88be3a6 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Sep 2019 06:12:40 -0700 Subject: [PATCH 24/24] Fix extracting overlapping chromosome names for gtf and fasta --- bin/filter_gtf_for_genes_in_genome.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py index c90095496..73e778e08 100755 --- a/bin/filter_gtf_for_genes_in_genome.py +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -9,6 +9,9 @@ logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) +def is_header(line): + return line[0] == '>' + def extract_fasta_seq_names(fasta_name): """ @@ -22,19 +25,20 @@ def extract_fasta_seq_names(fasta_name): # ditch the boolean (x[0]) and just keep the header or sequence since # we know they alternate. - faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">")) - - for header in faiter: - # drop the ">" - headerStr = header.__next__()[1:].strip().split()[0] + faiter = (x[1] for x in groupby(fh, is_header)) + for i, header in enumerate(faiter): + line = next(header) + if is_header(line): + # drop the ">" + headerStr = line[1:].strip().split()[0] yield headerStr def extract_genes_in_genome(fasta, gtf_in, gtf_out): seq_names_in_genome = set(extract_fasta_seq_names(fasta)) logger.info("Extracted chromosome sequence names from : %s" % fasta) - logger.info("\n".join(sorted(x for x in seq_names_in_genome))) + logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome))) seq_names_in_gtf = set([]) n_total_lines = 0 @@ -51,8 +55,7 @@ def extract_genes_in_genome(fasta, gtf_in, gtf_out): f.write(line) logger.info("Extracted %d / %d lines from %s matching sequences in %s" % (n_lines_in_genome, n_total_lines, gtf_in, fasta)) - logger.info("All sequence IDs from GTF") - logger.info("\n".join(sorted(x for x in seq_name_gtf))) + logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_name_gtf))) logger.info("Wrote matching lines to %s" % gtf_out) @@ -66,4 +69,4 @@ def extract_genes_in_genome(fasta, gtf_in, gtf_out): type=str, help="GTF features on fasta genome sequences") args = parser.parse_args() - extract_genes_in_genome(args.gtf, args.fasta, args.output) + extract_genes_in_genome(args.fasta, args.gtf, args.output)