-
Notifications
You must be signed in to change notification settings - Fork 5
/
theseus_10x.py
163 lines (120 loc) · 4.67 KB
/
theseus_10x.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#
# Usage: python theseus_10x.py FULLPATH2H5 OUTPUTPATH TPT_THRESHOLD NUMBER_OF_CELLS
import pandas as pd
import numpy as np
import scipy
from scipy import sparse, io
import h5py
import re
import time
import sys
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def read_10x_h5(path2file):
    """Load the per-record arrays of a 10x HDF5 file into a DataFrame.

    The datasets ``barcode``, ``umi``, ``gene``, ``reads``,
    ``nonconf_mapped_reads``, ``unmapped_reads`` and
    ``conf_mapped_uniq_read_pos`` are read from the file.

    Args:
        path2file: path of the HDF5 file to open read-only.

    Returns:
        A DataFrame with one row per record and the columns

        * ``bc``, ``umi``, ``gene`` -- the raw dataset values,
        * ``bcumi`` -- the ``(bc, umi)`` tuple of the row,
        * ``unique`` -- the constant 1,
        * ``map_logical`` -- ``conf_mapped_uniq_read_pos > 0``,
        * ``read_counts`` -- ``reads + nonconf_mapped_reads + unmapped_reads``.

    Raises:
        KeyError: if any of the required datasets is absent from the file.
    """
    required = ('barcode', 'umi', 'gene', 'reads', 'nonconf_mapped_reads',
                'unmapped_reads', 'conf_mapped_uniq_read_pos')

    with h5py.File(path2file, 'r') as hf:
        print('List of arrays in this file: \n' + str(list(hf.keys())))
        start = time.time()
        # hf.get() would hand back None for an absent dataset and np.array
        # would silently wrap it; fail loudly instead.
        missing = [name for name in required if name not in hf]
        if missing:
            raise KeyError('datasets missing from ' + str(path2file) + ': '
                           + ', '.join(missing))
        arrays = dict((name, np.array(hf[name])) for name in required)
        stop = time.time()
    print('reading file took ' + str(stop - start))

    bc = arrays['barcode']
    umi = arrays['umi']

    start = time.time()
    TABLE = pd.DataFrame()
    TABLE['bc'] = bc
    TABLE['umi'] = umi
    TABLE['gene'] = arrays['gene']
    # zip() is lazy on Python 3; materialise it so every row gets its tuple.
    TABLE['bcumi'] = list(zip(bc, umi))
    TABLE['unique'] = 1
    TABLE['map_logical'] = arrays['conf_mapped_uniq_read_pos'] > 0
    # Widen to int64 before adding so the sum cannot wrap around if the
    # stored dtype happens to be a narrow unsigned integer.
    TABLE['read_counts'] = (arrays['reads'].astype(np.int64)
                            + arrays['nonconf_mapped_reads']
                            + arrays['unmapped_reads'])
    stop = time.time()
    print('pandaifying took ' + str(stop - start))
    return TABLE
#filters to top numcells
def filterT(TABLE,numcells):
    """Keep the mapped rows belonging to the ``numcells`` best barcodes.

    Barcodes are ranked by their summed ``read_counts`` over the rows where
    ``map_logical`` is true. Three ratios are printed on the way:

    * mapping rate -- mapped reads / all reads,
    * top cells mapping rate -- mapped reads in kept barcodes / mapped reads,
    * total mapping rate -- the product of the two.

    Args:
        TABLE: DataFrame with at least the columns ``bc``, ``read_counts``
            and a boolean ``map_logical``.
        numcells: how many top-ranked barcodes to keep. If fewer barcodes
            are available, all of them are kept.

    Returns:
        The rows of ``TABLE`` that are mapped and whose ``bc`` is among the
        selected barcodes (original columns and index labels retained).
    """
    start = time.time()

    TABLE_mapped = TABLE[TABLE['map_logical']]
    mapped_reads = TABLE_mapped['read_counts'].sum()
    map_rate = np.divide(np.double(mapped_reads), TABLE['read_counts'].sum())
    print('mapping rate ' + str(map_rate))

    # Only read_counts is needed for the ranking, so aggregate just that
    # column instead of summing every (possibly non-numeric) column.
    reads_per_bc = (TABLE_mapped.groupby('bc')['read_counts'].sum()
                    .sort_values(ascending=False))

    # head() tolerates numcells larger than the number of barcodes, where
    # positional indexing with range(numcells) would raise IndexError.
    top_bcs = set(reads_per_bc.head(max(numcells, 0)).index)
    TABLE_mapped_filtered = TABLE_mapped[TABLE_mapped['bc'].isin(top_bcs)]

    top_cells_mapping = np.divide(
        np.double(TABLE_mapped_filtered['read_counts'].sum()),
        reads_per_bc.sum())
    print('top cells mapping rate ' + str(top_cells_mapping))
    stop = time.time()

    total_mapping = np.multiply(map_rate, top_cells_mapping)
    print('total mapping rate ' + str(total_mapping))
    print('filtering took ' + str(stop - start))
    return TABLE_mapped_filtered
# ---- command-line driver: load, TPT-filter, keep top cells ----
# Validate and parse every argument before the expensive HDF5 read so that a
# missing or malformed argument fails immediately.
if len(sys.argv) < 5:
    sys.exit('Usage: python theseus_10x.py FULLPATH2H5 OUTPUTPATH '
             'TPT_THRESHOLD NUMBER_OF_CELLS')
path2h5=sys.argv[1]
pathout=sys.argv[2]
tpt_filter=float(sys.argv[3])
number_of_cells=int(sys.argv[4])
print('The input path is '+path2h5)
print('The output path is '+pathout)
print('TPT threshold is '+str(tpt_filter))
print('Number of Cells is '+str(number_of_cells))

T = read_10x_h5(path2h5)
with h5py.File(path2h5,'r') as hf:
    genenames = np.array(hf.get('gene_names'))

#calculate TPT
# TPT of a row = its read_counts divided by the summed read_counts of all rows
# sharing the same (bc, umi) pair. A grouped transform yields that per-row
# total directly, without summing unrelated columns or joining on a tuple index.
T_tot=T.copy()
T_tot['total_reads']=T_tot.groupby(['bc','umi'])['read_counts'].transform('sum')
T_tot['TPT']=T_tot['read_counts'].astype(float)/T_tot['total_reads']

#plot TPT and reads distributions
plt.hist(T_tot['TPT'],20)
plt.xlabel('TPT')
plt.savefig(pathout+'/TPT_hist.pdf')
plt.clf()
plt.hist(np.log2(T_tot['read_counts']+1),20)
plt.xlabel('log2(Reads/UMI+1)')
plt.savefig(pathout+'/reads_hist.pdf')
plt.clf()

# Keep rows whose TPT is strictly above the threshold.
tpt_logical=T_tot['TPT']>tpt_filter
T_tot_filt=T_tot[tpt_logical]
print('Filtered '+str(np.round(100*(1.0-np.mean(tpt_logical)),4))+'%')

# filterT returns a selection of its input; copy it so the column assignment
# below writes to an independent frame.
T_F=filterT(T_tot_filt,number_of_cells).copy()
T_F['unique']=1.0

# Sum 'unique' per (bc, gene) pair. The result is indexed by (bc, gene), so
# iterating T_FF.index yields (bc, gene) tuples; only the 'unique' column is
# aggregated because nothing else is read from T_FF afterwards.
T_FF = T_F.groupby(['bc','gene'])[['unique']].sum()
def str2index(strlist):
    """Encode a sequence of labels as dense integer codes.

    Args:
        strlist: 1-D sequence of sortable labels (strings or numbers).

    Returns:
        A tuple ``(codes, labels)`` where ``labels`` is the list of distinct
        values in sorted order and ``codes`` is an integer array of the same
        length as ``strlist`` with ``labels[codes[i]] == strlist[i]``.
        Empty input yields an empty array and an empty list.
    """
    # np.unique returns the distinct values sorted, so the code assigned to a
    # label is reproducible from run to run (iterating a set of strings is
    # not), and return_inverse gives the per-element codes without a merge.
    labels, codes = np.unique(np.asarray(strlist), return_inverse=True)
    return np.asarray(codes).reshape(-1), labels.tolist()
#convert to full expression matrix
# T_FF is indexed by (barcode, gene) pairs: genes become matrix rows and
# barcodes become matrix columns.
tfrow,gnames=str2index([x[1] for x in T_FF.index])
tfcol,cnames=str2index([x[0] for x in T_FF.index])
tfdata=np.array(T_FF['unique'])

# str2index already returns dense codes 0..k-1, so they can address the matrix
# directly. Re-mapping them through a merge is unnecessary and could misalign
# tfcol with tfrow/tfdata if the merge changed the row order. Sizing the matrix
# from the label lists also works when there are no entries at all.
EXPR_MAT=scipy.sparse.csr_matrix((tfdata,(tfrow,tfcol)),shape=(len(gnames),len(cnames))).toarray()
EXPR_MAT=pd.DataFrame(EXPR_MAT,index=gnames,columns=cnames)

# Replace the gene codes on the rows with entries of gene_names. The lookup
# table is shifted to start at 1, i.e. gene value g is labelled with
# gene_names[g-1].
# NOTE(review): this assumes the 'gene' values are 1-based -- confirm against
# the file format.
gene_labels=pd.DataFrame(genenames)
gene_labels.index=gene_labels.index+1
# .ix was removed from pandas; reindex performs the same label lookup and
# yields NaN for a gene value that has no entry instead of raising.
EXPR_MAT.index=gene_labels[0].reindex(EXPR_MAT.index)
EXPR_MAT.to_csv(pathout+'/UMI_TPTfilt.txt',sep='\t')