-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstat_cluster.py
executable file
·155 lines (131 loc) · 3.69 KB
/
stat_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*
'''
Author: Mei Hou
This script is used stat the tidy_cluster file. it's the output of tidy_intersect.py
'''
import sys
# import numpy as np
# import pdb
# import cPickle
# import copy
import getopt
from collections import defaultdict
# deal with parameters
bioTypeFile = '/lustre/user/houm/projects/AnnoLnc/bio_type.txt'
tidyFile = ''
opts, args = getopt.getopt(sys.argv[1:], 'b:t:h')
for op, value in opts:
if op == '-b':
bioTypeFile = value
elif op == '-t':
tidyFile = value
elif op == '-h':
print '''Usage: stat_cluster.py -t cluster_tidy_file [-b biotype_file] > output_file
\t-t: the output file tidy_intersect.py
\t-b: the biotype classification file that map a biotype to a protein_coding/pseudogene/lncRNA/sncRNA...'''
sys.exit()
else:
print 'Unknown argument!'
sys.exit(1)
if tidyFile == '':
sys.stderr.write('-t must be specified!\n')
sys.exit(1)
# read in the biotype file
dBioType = {}
bioTypeFile = open(bioTypeFile)
for bt in bioTypeFile.readlines():
bt = bt.rstrip('\n').split('\t')
dBioType[bt[0]] = bt[1]
ddFeatureCount = defaultdict(dict)
ddBiotypeCount = defaultdict(dict)
cTrans = {}
cGene = {}
ddChCount = defaultdict(dict)
ddStrandCount = defaultdict(dict)
totalClusters = {}
# read in the cluster tidy file
tidyFile = open(tidyFile)
tidyFile.readline()
for li in tidyFile.readlines():
li = li.rstrip('\n').split('\t')
clusterId, ch, strand = li[0:3]
transId, biotype, feature = li[5:8]
geneId = li[9]
# check
if not biotype in dBioType:
sys.stderr.write(biotype + ' is not in the biotype file!\n')
sys.exit(1)
# count total clusters
totalClusters[clusterId] = 1
# count chromosome distribution
ddChCount[ch][clusterId] = 1
# count strand distribution
ddStrandCount[strand][clusterId] = 1
# count transcript num
cTrans[transId] = 1
# count gene num
cGene[geneId] = 1
# count clusters for every feature
ddFeatureCount[feature][clusterId] = 1
# count clusters for every biotype
ddBiotypeCount[biotype][clusterId] = 1
def getCount(mydd):
'''get count from defaultdict'''
dOut = {}
for k, v in mydd.items():
dOut[k] = len(v)
return(dOut)
def sortDictAndPrint(myDict, myKey = lambda d: d[1], reverse = True):
'''input a dict of count, sort it by key, and print it '''
myDict = sorted(myDict.items(), key = myKey, reverse = reverse)
for i in myDict:
print '%s\t%d' % (i[0], i[1])
def dd2Print(mydd, myKey = lambda d: d[1], reverse = True):
'''conbine getCount() and sortDictAndPrint()'''
myDict = getCount(mydd)
sortDictAndPrint(myDict, myKey, reverse)
def toInt(x):
'''
input a string. If the string is a putative number, turn it to int, else return itself
'''
if x.isdigit():
return(int(x))
else:
return(x)
def printLine():
print '------------------------------------------------------------------'
cBbioType = getCount(ddBiotypeCount)
cBroadBiotype = defaultdict(int)
for k, v in cBbioType.items():
cBroadBiotype[dBioType[k]] += v
print '# Clusters that have annotations\t%d' % len(totalClusters)
print '# Transcripts that overlapped with clusters\t%d' % len(cTrans)
print '# Genes that overlapped with clusters\t%d' % len(cGene)
print
printLine()
print
print 'The statistic of clusters for transcript features:'
dd2Print(ddFeatureCount)
print
printLine()
print
print 'The statictic of clusters for biotype groups:'
sortDictAndPrint(cBroadBiotype)
print
printLine()
print
print 'The statictic of clusters for biotypes:'
sortDictAndPrint(cBbioType)
print
printLine()
print
print 'The cluster distribution for both strands: '
dd2Print(ddStrandCount)
print
printLine()
print
print 'The cluster distribution for each chromosomes: '
dd2Print(ddChCount, lambda d: toInt(d[0].split('chr')[1]), False)
print
printLine()