-
Notifications
You must be signed in to change notification settings - Fork 1
/
rarefaction_kaiju_wmin.py
134 lines (106 loc) · 4.52 KB
/
rarefaction_kaiju_wmin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 31 10:46:54 2022
@author: tina
"""
import random
import sys
def do_rarefaction(input_file, list_of_percentages, iterations, path_to_out,profile):
"""
Parameters
----------
input_file : string
Path to the kaiju.names.out file.
list_of_percentages : list
list of percent values that you want to do rarefactions for.
iterations: integer
number of iterations for rarefaction for each percent value
Returns
-------
None.
Writes output for each iteration in the format: sample.kaiju.rf{percent}.min_abundance.iteration.csv
"""
read_list=open(input_file).read().strip().split('\n')
abundant_taxa=get_abundant_taxa(profile)
smp=input_file.split('/')[-1].split('.')[0]
for p in list_of_percentages:
print('{}%...'.format(p))
sampled_dict=subsample(read_list, p, iterations)
for i in sampled_dict:
taxa=count_up_taxa(sampled_dict[i],abundant_taxa)
# out1='{0}.kaiju.rf{1}.it{2}.csv'.format(smp, p, i)
out2='{0}.kaiju.rf{1}.min{2}_new.it{3}.csv'.format(smp, p, '0.001', i)
# open('{0}{1}/{2}'.format(path_to_out,p,out1), 'w') as outf1,
with open('{0}{1}/{2}'.format('./',p,out2), 'w') as outf2:
for taxon in taxa:
# outf1.write('{},{},{}\n'.format(taxon,
# taxa[taxon]['num_reads'],
# taxa[taxon]['frac_reads']))
# if taxa[taxon]['frac_reads']>0.00001:
outf2.write('{},{},{}\n'.format(taxon,
taxa[taxon]['num_reads'],
taxa[taxon]['frac_reads']))
return
def subsample(read_list, percent, i):
"""
Takes a read dictionary as input and subsamples p percent of reads it i times.
Returns a dictionary with iterations as keys that are subsets of read_dict.
"""
sample_dict={}
for iteration in range(i):
it=str(iteration+1).rjust(3,'0')
number=int((percent/100)*len(read_list))
sample_dict[it]=random.sample(read_list, number)
return sample_dict
def count_up_taxa(read_list, abundant_taxa):
weirdos=0
total=len(read_list)
tax_dict={'u': {'num_reads':0,
'frac_reads':0}}
for r in read_list:
if r.startswith('U'):
tax_dict['u']['num_reads']+=1
tax_dict['u']['frac_reads']+=1/total
else:
try:
taxon=r.split('\t')[7]
if taxon in abundant_taxa:
if taxon not in tax_dict:
tax_dict[taxon]={'num_reads':0,
'frac_reads':0}
tax_dict[taxon]['num_reads']+=1
tax_dict[taxon]['frac_reads']+=1/total
else:
tax_dict['u']['num_reads']+=1
tax_dict['u']['frac_reads']+=1/total
except IndexError:
weirdos+=1
return tax_dict
def get_abundant_taxa(profile):
abundant_taxa=set()
with open(profile) as p:
for line in p:
taxon=line.split(',')[0]
abundant_taxa.add(taxon)
return abundant_taxa
if __name__=='__main__':
path_to_out=('/hosts/linuxhome/phage/mgx/people/tina/RAT/benchmark/CAMI_II_mousegut_w_dm/'
'rarefaction/')
# wells=['19', '22', '23']
# mousegut=['smp6','smp13','smp23','smp25','smp26','smp30','smp33','smp38','smp53']
i=sys.argv[1]
# for i in mousegut:
# for j in range(1,7):
# sample='W{0}-{1}'.format(i,j)
sample=i
print('Working on sample {}...'.format(sample))
input_file=('/hosts/linuxhome/phage/mgx/people/tina/RAT/benchmark/'
'CAMI_II_mousegut_w_dm/kraken_kaiju_centrifuge/{0}/'
'kaiju/{1}.kaiju.names.out'.format(sample,sample))
profile=('/hosts/linuxhome/phage/mgx/people/tina/RAT/benchmark/'
'CAMI_II_mousegut_w_dm/kraken_kaiju_centrifuge/'
'{0}.kaiju.manually_counted.0.001.csv'.format(sample))
print(input_file)
list_of_percentages=[1,5,10,20,30,40,50,60,70,80,90]
iterations=10
do_rarefaction(input_file, list_of_percentages, iterations, path_to_out, profile)