-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhmmsearch_besthit.py
84 lines (76 loc) · 3.12 KB
/
hmmsearch_besthit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
#Ian Rambo
'''
Created: January 30, 2016
Last modified: January 31, 2016
GOAL: parse HMMSEARCH output to deliver the best hit. This script will
select hits based on the best e-value.
'''
#==============================================================================
# ----------- ------------
# -------- \___________/ ---------
# ------- O O ---------
# -------\ V /--------
# \ _____ /
# \/ \/
#==============================================================================
import os
import re
#==============================================================================
#-----GLOBAL VARIABLES-----
rootDir = '/Users/imrambo/R/ORMP/data'
inputDir = '%s/01-targetGenomes_PfamHMM_select' % rootDir
outDir = '%s/02-targetGenomes_PfamHMM_bestHit' % rootDir
fileCount = 0
#==============================================================================
#-----FUNCTIONS-----
def hmm_besthit(hmmFile) :
'''
Loop through a modified HMMSEARCH results file and select the best hit
for each cds per accession.
'''
hmmDict = dict()
with open(hmmFile, 'r') as hmm :
for h in hmm :
if h.startswith('cds') :
h = h.rstrip()
hFields = h.split()
seqID = hFields[0]
pfam = hFields[1]
seqIDFields = seqID.split(';')
cds = seqIDFields[0]
accession = re.findall(r'((\w+_\d+)\.\d+)', seqIDFields[1])[0][1]
evalue = float(hFields[2])
#If the accession and CDS exist already, choose the hit
#with the best e-value
if cds in hmmDict :
if float(evalue) < float(hmmDict[cds]['evalue']) :
hmmDict[cds] = dict([('accession',accession),('cds',cds),\
('evalue', str(evalue)), ('pfamily', pfam)])
else :
hmmDict[cds] = dict([('accession',accession),('cds',cds),\
('evalue', str(evalue)), ('pfamily', pfam)])
return(hmmDict)
#==============================================================================
#-----MAIN-----
print('-----BEGIN-----')
print('...finding best hits...')
if not os.path.exists(outDir) :
print('\ncreating output folder %s\n' % outDir)
os.makedirs(outDir)
else :
print('\n%s already exists\n' % outDir)
pass
#Get the best hits for each accession
for subdir, dirs, files in os.walk(inputDir) :
for hmmFile in files :
hmmPath = os.path.join(subdir, hmmFile)
hmmDict = hmm_besthit(hmmPath)
fileHandle = hmmFile.replace('hmmsearch.txt', 'bestHit')
#Open the output file to contain best hit results
with open('%s/%s.txt' % (outDir, fileHandle), 'w') as bh :
for c in hmmDict :
bh.write('%s\t%s\t%s\n' % (hmmDict[c]['accession'], hmmDict[c]['cds'], hmmDict[c]['pfamily']))
fileCount += 1
print('Created %d new files' % fileCount)
print('-----DONE-----')