12
12
# N50 = contig length so that half of the contigs are longer and 1/2 of contigs are shorter
13
13
from __future__ import print_function , division , absolute_import , with_statement
14
14
import sys
15
+ from itertools import chain
15
16
16
17
from DNASkittleUtils .Contigs import read_contigs
17
18
@@ -25,7 +26,7 @@ def cumulative_sum(numbers_list):
25
26
return running_sums
26
27
27
28
28
- def collect_n50_stats (scaffold_lengths ):
29
+ def collect_n50_stats (scaffold_lengths , prefix = '' ):
29
30
"""N50:
30
31
the length of the shortest contig such that the sum of contigs of equal
31
32
length or longer is at least 50% of the total length of all contigs"""
@@ -37,40 +38,62 @@ def collect_n50_stats(scaffold_lengths):
37
38
csum = cumulative_sum (all_len )
38
39
39
40
assembly_size = sum (scaffold_lengths )
40
- stats ['N' ] = int (assembly_size )
41
+ stats [prefix + 'N' ] = int (assembly_size )
41
42
halfway_point = (assembly_size // 2 )
42
43
43
44
# get index for cumsum >= N/2
44
45
for i , x in enumerate (csum ):
45
46
if x >= halfway_point :
46
- stats ['N50' ] = all_len [i ]
47
+ stats [prefix + 'N50' ] = all_len [i ]
47
48
break
48
49
49
50
# N90
50
- stats ['nx90' ] = int (assembly_size * 0.90 )
51
+ stats [prefix + 'nx90' ] = int (assembly_size * 0.90 )
51
52
52
53
# index for csumsum >= 0.9*N
53
54
for i , x in enumerate (csum ):
54
- if x >= stats ['nx90' ]:
55
- stats ['N90' ] = all_len [i ]
55
+ if x >= stats [prefix + 'nx90' ]:
56
+ stats [prefix + 'N90' ] = all_len [i ]
56
57
break
57
58
58
59
return stats
59
60
60
61
61
62
def scaffold_lengths_from_fasta (input_fasta_path ):
62
- contigs = read_contigs (input_fasta_path )
63
- lengths = [len (x .seq ) for x in contigs ]
63
+ scaffolds = read_contigs (input_fasta_path )
64
+ lengths = [len (x .seq ) for x in scaffolds ]
65
+ return scaffolds , lengths
66
+
67
+
68
+ def split_by_N (scaffolds ):
69
+ length_collection = set ()
70
+ for scaffold in scaffolds :
71
+ pieces = scaffold .seq .split ('N' )
72
+ length_collection .add ((len (p ) for p in pieces ))
73
+ lengths = list (chain (* length_collection ))
64
74
return lengths
65
75
66
76
67
77
def all_stats (input_fasta ):
68
- lengths = scaffold_lengths_from_fasta (input_fasta )
69
- return collect_n50_stats (lengths )
78
+ scaffolds , lengths = scaffold_lengths_from_fasta (input_fasta )
79
+ scaffold_stats = collect_n50_stats (lengths , prefix = 'Scaffold ' )
80
+ contig_lengths = split_by_N (scaffolds )
81
+ contig_stats = collect_n50_stats (contig_lengths , prefix = 'Contig ' )
82
+ scaffold_stats .update (contig_stats )
83
+ scaffold_stats ['N%' ] = (1 - (scaffold_stats ['Contig N' ] / float (scaffold_stats ['Scaffold N' ]))) * 100
84
+ return scaffold_stats
70
85
71
86
72
87
if __name__ == '__main__' :
73
- input_fasta_name = sys .argv [1 ]
88
+ input_fasta_name = sys .argv [1 ]
74
89
assembly_stats = all_stats (input_fasta_name )
75
- for key in assembly_stats :
90
+ label_order = ['Scaffold N' , 'Scaffold N50' , 'Scaffold N90' , 'Scaffold nx90' ,
91
+ 'Contig N' , 'Contig N50' , 'Contig N90' , 'Contig nx90' ,
92
+ 'N%' ]
93
+ for key in label_order :
76
94
print (key + ":" , "{:,}" .format (assembly_stats [key ]))
95
+ for key in assembly_stats : # unordered labels
96
+ if key not in label_order :
97
+ print (key + ":" , "{:,}" .format (assembly_stats [key ]))
98
+
99
+
0 commit comments