This repository has been archived by the owner on Apr 17, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHmm.py
executable file
·160 lines (139 loc) · 5.63 KB
/
Hmm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import sys
import os
import numpy
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot
from openFile import openFile
# Represents an HMM model.
# Can come from 2 sources
# 1) A skew file, which hold all the data
# 2) Just the name, and then build up data by adding hammer table lines for data.
class Hmm(object):
def __init__(self, line=None):
l = line.strip().split(",");
if len(l) == 11: # If the model is being built from a skew file.
self.model = l[0];
self.modelLength = int(l[1]);
self.snumpyCoverageCoefficent = l[2];
self.hits = l[3];
self.percentCoverage = l[4];
self.avgerageCoverageDepth = l[5];
self.shannonEntropy = l[6];
self.metricEntropy = l[7];
self.minMax = l[8];
self.l2normDeviation = l[9];
self.vector = [int(x) for x in l[10].split(" ")];
elif len(l) == 1: # If only the model name is passed in.
self.model = l[0];
self.last = 0;
self.vector = []
# Takes a line from a hammer table file and uses it to build coverage
# data for the model.
def addLine(self, l):
if l[2] != self.model:
raise Exception("Line does not go to this model. Expected: " + self.model + " but got: " + l[2])
if int(l[5]) > self.last:
self.vector.extend([0]*(int(l[5]) - self.last))
self.last = int(l[5])
self.modelLength = len(self.vector)
for x in range(int(l[4]), int(l[5]) + 1):
self.vector[x-1] += 1
# Calculates the coverage, and figures out where areas of low covarge are.
def calcBlocks(self, cutoff=0.75):
self.breakpoints = [0] * self.modelLength
self.coverageMean = numpy.average(self.vector)
self.coverageStd = numpy.std(self.vector)
self.coveragePercent = sum([1 for x in self.vector if x])/float(len(self.vector))
self.missingBlocks = [0] * len(self.vector)
for i, value in enumerate(self.vector):
if value < self.coverageMean * cutoff:
self.missingBlocks[i] = 1
else:
self.missingBlocks[i] = 0
self.highCoveragePercent = sum(self.missingBlocks)/float(len(self.missingBlocks))
# Breakpoints are where sequences start and stop. We thought this would be
# helpful, but it was not. Probably won't be used.
def addBreakpoints(self, start, end):
self.breakpoints[start-1] += 1;
self.breakpoints[end-1] += 1;
# Creates and saves a graph that shows the coverage and areas of low
# coverage.
def drawVectorAndBreakpoints(self, location = "graphs/"):
self.calcBlocks()
# Testing to see if this works. It may not.
if self.coverageMean == 0:
return
pyplot.plot(self.vector, c = "b");
pyplot.plot(self.breakpoints, c = "m");
pyplot.plot([x * self.coverageMean * .8 for x in self.missingBlocks], c = "r");
pyplot.axhline(self.coverageMean, c = "g");
pyplot.axhline(self.coverageMean*.75, c = "y");
pyplot.xlabel("Base");
pyplot.ylabel("Count");
pyplot.title(self.model + ": " + str(self.modelLength) + "\n"
+ " mean: " + str(int(self.coverageMean)) + " "
+ " std: " + str(int(self.coverageStd)) + "\n"
+ " coverage: " + str(self.coveragePercent))
pyplot.draw();
if self.coveragePercent > 0.8 and self.highCoveragePercent > 0.4:
location = location + "high_coverage/"
else:
location = location + "low_coverage/"
if not os.path.exists(os.path.dirname(location)):
try:
os.makedirs(os.path.dirname(location))
except OSError as exc:
if exc.errno != errno.EEXIST:
raise
pyplot.savefig(location + "coverage-and-breakpoints-" + self.model + ".png");
pyplot.close()
def __str__(self):
return self.model;
def __repr__(self):
return self.__str__();
# Create a dictionary with HMMs from a skew file.
# Not currently used
# @staticmethod
# def createHmms(fileLoc):
# hmmDic = {};
# with openFile(fileLoc) as hmms:
# next(hmms); # Skip the header
# for line in hmms:
# newHmm = Hmm(line);
# newHmm.calcBlocks();
# hmmDic[newHmm.model] = newHmm;
# return hmmDic;
# Creates and dictionary with HMMs from a hammer file.
# It also does the calculations needed to create graphs.
@staticmethod
def createHmmsFromHammer(fileLoc):
hmmDic = {}
with openFile(fileLoc) as hammer:
# Skip the first two lines.
next(hammer)
next(hammer)
for line in hammer:
if line.strip()[0] == "#": # Skip comment lines.
continue
l = line.strip().split();
hmmName = l[2]
if not hmmDic.has_key(hmmName):
hmmDic[hmmName] = Hmm(hmmName)
hmmDic[hmmName].addLine(l)
for hmm in hmmDic:
hmmDic[hmm].calcBlocks()
return hmmDic;
# I don't think I used this any more. Will remove eventually.
# @staticmethod
# def createHmmSequences(fileLoc):
# hmm = {}
# with openFile(fileLoc) as hmmFile:
# for line in hmmFile:
# l = line.split(" ");
# hmm[l[0]] = l[1:]
# return hmm
def main():
Hmm.createHmmsFromHammer(sys.argv[1])
if __name__ == "__main__":
main()