forked from nextgenusfs/genome_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gb_stats.py
executable file
·102 lines (86 loc) · 2.79 KB
/
gb_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
#This script generates some basic genome stats from a genbank flat file format (.gbk)
#Run like: gb_stats.py file.gbk
#The N50 calculation is from Cara Magnabosco https://caramagnabosco.wordpress.com/2014/02/20/calculate-the-n50-of-assembled-contigs/
import sys
import numpy
from Bio import SeqIO
import argparse
parser=argparse.ArgumentParser(
description='''Script that takes Genbank file input and computes statistics ''',
epilog="""Jon Palmer (2015) palmer.jona@gmail.com""")
parser.add_argument('gbk', help='Genbank file (Required)')
args=parser.parse_args()
if len(sys.argv) < 2:
parser.print_usage()
sys.exit(1)
gbk_filename = args.gbk
gene_length = 0
avg_gene_length = 0
num_records = list(SeqIO.parse(gbk_filename, "genbank"))
contigs = len(num_records)
percent = "%"
ending = "Mb"
ending2 = "bp"
cds_count = 0
gene_count = 0
trna_count = 0
total = 0
C_count = 0
G_count = 0
seqlength = []
for record in SeqIO.parse(gbk_filename, "genbank"):
total += len(record.seq)
bp = len(record.seq)
seqlength.append(bp)
mb_total = float(total) / 1000000
C_count += record.seq.count("C")
G_count += record.seq.count("G")
GC_total = C_count + G_count
GC_count = GC_total / float(total) * 100
for f in record.features:
if f.type == "source":
organism = f.qualifiers.get("organism", ["???"])[0]
isolate = f.qualifiers.get("strain", ["???"])[0]
if f.type == "CDS":
cds_count = cds_count + 1
if f.type == "gene":
gene_count = gene_count + 1
if f.type == "tRNA":
trna_count = trna_count + 1
if f.type == "gene":
gene_length = gene_length + len(f)
avg_gene_length = gene_length / gene_count
seqlength = sorted(seqlength)
unique = []
for entry in seqlength:
if not entry in unique:
unique.append(entry)
n50 = []
for entry in unique:
multiplier = seqlength.count(entry) * entry
for i in range(multiplier):
n50.append(entry)
index = len(n50)/2
avg = []
if index % 2==0:
first = n50[index -1]
second = n50[index]
avg.append(first)
avg.append(second)
n50 = numpy.mean(avg)
n50 = n50
else:
n50 = n50[index -1]
print "-----------------------------------------"
print("%s %s" % (organism, isolate))
print "-----------------------------------------"
print ("Scaffolds:\t{:,}".format(contigs))
print ("Total bp:\t{:.2f} {}".format (mb_total, ending))
print ("Largest Contig:\t{:,} {}".format(seqlength[-1], ending2))
print ("Scaffold N50:\t{:,g} {}".format(n50, ending2))
print ("GC Content:\t%.2f %s" % (GC_count,percent))
print ("Genes:\t\t{:,}".format(gene_count))
print ("Protein Coding:\t{:,}".format(cds_count))
print ("tRNA:\t\t{:,}".format(trna_count))
print ("Avg gene len:\t{:,g} {}".format (avg_gene_length, ending2))