Skip to content

Commit e39cac1

Browse files
committed
FastaStats shows Contig N50 as well.
1 parent b5a74b1 commit e39cac1

File tree

2 files changed

+36
-13
lines changed

2 files changed

+36
-13
lines changed

DNASkittleUtils/FastaStats.py

+35-12
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# N50 = contig length such that contigs of that length or longer contain at least half of all assembled bases
1313
from __future__ import print_function, division, absolute_import, with_statement
1414
import sys
15+
from itertools import chain
1516

1617
from DNASkittleUtils.Contigs import read_contigs
1718

@@ -25,7 +26,7 @@ def cumulative_sum(numbers_list):
2526
return running_sums
2627

2728

28-
def collect_n50_stats(scaffold_lengths):
29+
def collect_n50_stats(scaffold_lengths, prefix=''):
2930
"""N50:
3031
the length of the shortest contig such that the sum of contigs of equal
3132
length or longer is at least 50% of the total length of all contigs"""
@@ -37,40 +38,62 @@ def collect_n50_stats(scaffold_lengths):
3738
csum = cumulative_sum(all_len)
3839

3940
assembly_size = sum(scaffold_lengths)
40-
stats['N'] = int(assembly_size)
41+
stats[prefix + 'N'] = int(assembly_size)
4142
halfway_point = (assembly_size // 2)
4243

4344
# get index for cumsum >= N/2
4445
for i, x in enumerate(csum):
4546
if x >= halfway_point:
46-
stats['N50'] = all_len[i]
47+
stats[prefix + 'N50'] = all_len[i]
4748
break
4849

4950
# N90
50-
stats['nx90'] = int(assembly_size * 0.90)
51+
stats[prefix + 'nx90'] = int(assembly_size * 0.90)
5152

5253
# index for cumsum >= 0.9*N
5354
for i, x in enumerate(csum):
54-
if x >= stats['nx90']:
55-
stats['N90'] = all_len[i]
55+
if x >= stats[prefix + 'nx90']:
56+
stats[prefix + 'N90'] = all_len[i]
5657
break
5758

5859
return stats
5960

6061

6162
def scaffold_lengths_from_fasta(input_fasta_path):
    """Load the scaffolds of a FASTA file together with their lengths.

    :param input_fasta_path: path handed straight to ``read_contigs``.
    :return: ``(scaffolds, lengths)`` where ``scaffolds`` is whatever
        ``read_contigs`` returned and ``lengths`` is a list holding
        ``len(record.seq)`` for each record, in iteration order.
    """
    # NOTE(review): callers iterate ``scaffolds`` again after this function
    # has walked it once - presumably read_contigs returns a list, confirm.
    scaffolds = read_contigs(input_fasta_path)
    lengths = []
    for record in scaffolds:
        lengths.append(len(record.seq))
    return scaffolds, lengths
66+
67+
68+
def split_by_N(scaffolds):
    """Return the lengths of the N-free pieces (contigs) of every scaffold.

    Each scaffold's ``seq`` is cut at every ``'N'`` character and the length
    of each non-empty piece is collected.

    :param scaffolds: iterable of records whose ``seq`` attribute supports
        ``.split('N')``.
    :return: list of piece lengths, in scaffold order and then left-to-right
        within each scaffold.  An empty input yields an empty list.
    """
    lengths = []
    for scaffold in scaffolds:
        pieces = scaffold.seq.split('N')
        # A run of k adjacent Ns makes split() emit k-1 empty strings; those
        # are gap artefacts, not contigs, so they are left out.  The total
        # length is unaffected because they contribute 0.
        lengths.extend(len(piece) for piece in pieces if piece)
    return lengths
6575

6676

6777
def all_stats(input_fasta):
    """Collect scaffold-level and contig-level statistics for a FASTA file.

    Scaffold lengths come from ``scaffold_lengths_from_fasta``; contig
    lengths come from cutting those scaffolds with ``split_by_N``.  Both
    length lists are summarised by ``collect_n50_stats``, using the key
    prefixes ``'Scaffold '`` and ``'Contig '`` respectively.

    :param input_fasta: path of the FASTA file to analyse.
    :return: one dict holding both sets of prefixed statistics plus ``'N%'``,
        the percentage of the scaffold total not covered by contigs.
    """
    scaffolds, lengths = scaffold_lengths_from_fasta(input_fasta)
    stats = collect_n50_stats(lengths, prefix='Scaffold ')
    stats.update(collect_n50_stats(split_by_N(scaffolds), prefix='Contig '))

    scaffold_total = stats['Scaffold N']
    if scaffold_total:
        stats['N%'] = (1 - (stats['Contig N'] / float(scaffold_total))) * 100
    else:
        # Empty input: there is nothing to take a share of, and dividing by
        # the zero total would raise ZeroDivisionError.
        stats['N%'] = 0.0
    return stats
7085

7186

7287
if __name__ == '__main__':
    # Command-line entry point: print every statistic for the FASTA file
    # named by the first argument, one "label: value" line each.
    if len(sys.argv) < 2:
        sys.exit('Usage: FastaStats.py <input.fasta>')
    input_fasta_name = sys.argv[1]
    assembly_stats = all_stats(input_fasta_name)
    label_order = ['Scaffold N', 'Scaffold N50', 'Scaffold N90', 'Scaffold nx90',
                   'Contig N', 'Contig N50', 'Contig N90', 'Contig nx90',
                   'N%']
    # Preferred labels first, skipping any that were not computed (N50/N90
    # are only set when their search loop finds a match), then every
    # remaining label in whatever order the dict yields it.
    ordered_keys = [key for key in label_order if key in assembly_stats]
    ordered_keys += [key for key in assembly_stats if key not in label_order]
    for key in ordered_keys:
        print(key + ":", "{:,}".format(assembly_stats[key]))
98+
99+

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name='DNASkittleUtils',
5-
version='1.0.7',
5+
version='1.0.9',
66
description='Bioinformatics functions that have been useful in multiple projects. Manipulating FASTA files, executing pipelines, etc.',
77
author='Josiah Seaman',
88
author_email='josiah.seaman@gmail.com',

0 commit comments

Comments
 (0)