-
Notifications
You must be signed in to change notification settings - Fork 4
/
gff2gtf.pl
executable file
·127 lines (103 loc) · 4.08 KB
/
gff2gtf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#! /usr/bin/perl
# Converts a GFF (argument 1) to a GTF file for use with SNPGenie.
# Copyright (C) 2015 Chase W. Nelson
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# AUTHOR: Chase W. Nelson
# CONTACT1: nelsoncw@email.sc.edu
# CONTACT2: cwnelson88@gmail.com
# AFFILIATION1: Austin L. Hughes lab, University of South Carolina (Columbia, SC, USA)
# AFFILIATION2: Wen-Hsiung Li lab, Academia Sinica (Taipei, Taiwan)
# ACKNOWLEDGMENTS: written by C.W.N. with support from a National Science Foundation
# Graduate Research Fellowship (DGE-0929297), a National Science Foundation East Asian
# and Pacific Summer Institutes Fellowship, and a University of South Carolina
# Presidential Fellowship.
use strict;
#use warnings;
use IO::Handle;
# Usage guard: the script takes exactly one argument, the GFF file to convert.
# Any other argument count aborts with the usage message below.
unless (@ARGV == 1) {
    my @usage_parts = (
        "\n\n## WARNING: The SNPGenie script gff2gtf needs exactly 1 argument:",
        "\n## A GFF file with '+' strand data relative to the reference sequence against which SNPs were called.",
        "\n## Only the \"ID\" tag will be used to identify the gene name, e.g., \"ID=GENE_001\"",
        "\n\n## For example: snpgenie-gff2gtf.pl my_cds_file.gff\n\n",
    );

    # The message ends in a newline, so die() does not append file/line info.
    die join('', @usage_parts);
}
my $gff_file_nm = $ARGV[0];

# Generate the new file name: everything in front of the ".gff" (or "gff.txt")
# part of the input name, followed by "_gff_converted.gtf".
#
# The patterns are tied to the final path component ([^/\\]* up to the end of
# the string), so a ".gff" that happens to occur in a directory name cannot
# truncate the path and send the output to an unrelated location. The prefix
# is taken from a capture group rather than $`, which imposes a speed penalty
# on every regex in the program on older perls.
my $new_gtf_file_nm;
if ($gff_file_nm =~ m{\A(.*?)\.gff[^/\\]*\z}s) {
    $new_gtf_file_nm = $1 . "_gff_converted.gtf";
} elsif ($gff_file_nm =~ m{\A(.*?)gff\.txt[^/\\]*\z}s) {
    $new_gtf_file_nm = $1 . "_gff_converted.gtf";
} else {
    die "\nFirst and only argument must be a .gff file\n\n";
}

# Refuse to touch an existing file: the output is opened in append mode later,
# so converting twice would otherwise duplicate every record.
if (-e $new_gtf_file_nm) {
    die "\n## $new_gtf_file_nm already exists in this directory; delete before proceeding\n\n";
}

print "\n## Converting $gff_file_nm to $new_gtf_file_nm for SNPGenie...\n";
# Copy every CDS record of the GFF file into the GTF file. The first eight
# columns are passed through unchanged; the attribute column is replaced by a
# single gene_id built from the record's "ID" tag.

# The input is opened first so that an unreadable GFF does not leave an empty
# GTF behind, which would trip the "already exists" check on the next run.
open(my $gff_fh, '<', $gff_file_nm)
    or die "\n## Cannot open $gff_file_nm for reading: $!\n\n";
open(my $gtf_fh, '>>', $new_gtf_file_nm)
    or die "\n## Cannot open $new_gtf_file_nm for writing: $!\n\n";

RECORD: while (my $record = <$gff_fh>) {
    # A FASTA section (directive or first sequence header) ends the
    # annotation part of the file; nothing after it is a feature line.
    last RECORD if $record =~ /^##FASTA/ || $record =~ /^\>/;

    # Skip directives and comments.
    next RECORD if $record =~ /^#/;

    # Strip the line ending, tolerating CRLF files: a leftover "\r" would be
    # captured into the gene ID below, because \s matches it.
    $record =~ s/\r?\n\z//;

    my ($seqname, $source, $feature, $start, $end,
        $score, $strand, $frame, $group) = split(/\t/, $record);

    # Decide on the feature column itself; the text "CDS" elsewhere on the
    # line (e.g. in another feature's attributes) does not make a CDS record.
    next RECORD unless defined $feature && $feature eq 'CDS';

    unless (defined $group) {
        warn "## WARNING: skipping CDS record at line $. of $gff_file_nm: "
            . "fewer than 9 tab-separated columns\n";
        next RECORD;
    }

    # "ID=CDS:<name>" is tried first: with the general pattern alone such an
    # ID would be cut at the colon and every gene would be named "CDS".
    my $gene_id;
    if ($group =~ /ID=CDS\:([\w\s\.']+)/) {
        $gene_id = $1;
    } elsif ($group =~ /ID=([\w\s\.']+)/) {
        $gene_id = $1;
    }

    # Without an ID there is no gene name to write; an empty gene_id would
    # only produce an unusable GTF line.
    unless (defined $gene_id) {
        warn "## WARNING: skipping CDS record at line $. of $gff_file_nm: "
            . "no usable \"ID\" tag\n";
        next RECORD;
    }

    print {$gtf_fh} join("\t",
        $seqname, $source, $feature, $start, $end,
        $score, $strand, $frame,
        "gene_id \"$gene_id\";"), "\n";
}

close $gff_fh;

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so this close is checked.
close $gtf_fh
    or die "\n## Error while writing $new_gtf_file_nm: $!\n\n";

print "\n## GTF for SNPGenie file has been written to $new_gtf_file_nm\n\n";