-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_slimmed_gff3.py
executable file
·87 lines (70 loc) · 2.82 KB
/
get_slimmed_gff3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
"""
> get_slimmed_gff3.py <
Based on input gff3 files, pick the gene model that produces the longest
cDNA/protein sequence, then produces a gff3 file that only contains one
gene model per gene.
"""
import argparse
import re
import parse_gff3
parser = argparse.ArgumentParser(description="""
Based on input gff3 files, pick the gene model that produces the longest
cDNA/protein sequence, then produces a gff3 file that only contains one
gene model per gene.""")
parser.add_argument('scaffold_gff3', metavar="gff3_file",
type=argparse.FileType('r'),
help="corresponding gff3 file of the genome.")
args = parser.parse_args()
# read genome details into memory
scaffold_gff3 = parse_gff3.parse_gff3(args.scaffold_gff3, 'exon')
# pick longest transcript in the gff3 file
scaffold_gff3 = parse_gff3.pick_longest_mRNA(scaffold_gff3)
# store which transcript is the longest in a dict
longest_mRNA = {}
for scaf in scaffold_gff3:
for gene in scaffold_gff3[scaf]:
# double check that there is only one gene model per gene; crash
# noisily if not
tx = list(scaffold_gff3[scaf][gene].mRNAs.keys())
assert len(tx) == 1, f'{gene} does not have one gene model per gene!'
longest_mRNA[gene] = tx[0]
# dump all longest mRNAs into a list
all_longest_mRNAs = list(longest_mRNA.values())
# rewind the gff3 file now, then depending on what's in the line, different
# strategies are adopted
# 1. comment lines: retain only comment lines for longest gene model
# 2. content lines with no "Parent=": these are genes. retain everything
# 3. content lines with "Parent=": retain if parent is the longest gene model
args.scaffold_gff3.seek(0)
for line in args.scaffold_gff3:
line = line.strip()
# if line is blank, just print it out, then go onto next iteration
if not line:
print (line)
continue
if line[0] == '#':
# comment line has two types: those that contain protein sequence, and
# those that do not
if 'PROT' in line:
genem = line.split(' ')[1]
else:
genem = line.split(' ')[2]
# remove commas in gene model name, if they exist
if genem[-1] == ',':
genem = genem[:-1]
if genem in all_longest_mRNAs:
print (line)
else:
# these are content lines
if 'Parent=' in line:
# check whether this line is an "mRNA" line. if it is, the
# transcript name is under 'Name='
if '\tmRNA\t' in line:
genem = re.search(r'Name=(.*?)$', line)[1]
else:
genem = re.search(r'Parent=(.*?)(;|$)', line)[1]
if genem in all_longest_mRNAs:
print (line)
else:
print (line)