-
Notifications
You must be signed in to change notification settings - Fork 10
/
run-blast.rb
executable file
·206 lines (189 loc) · 8.27 KB
/
run-blast.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#! /usr/bin/env ruby
# Copyright (c) 2007-2012 Michael Specht, Till Bald
#
# This file is part of Proteomatic.
#
# Proteomatic is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Proteomatic is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Proteomatic. If not, see <http://www.gnu.org/licenses/>.
require './include/ruby/proteomatic'
require './include/ruby/externaltools'
require './include/ruby/fasta'
require './include/ruby/ext/fastercsv'
require './include/ruby/formats'
require './include/ruby/misc'
require 'set'
require 'yaml'
require 'fileutils'
class RunBlast < ProteomaticScript
def runBlast(databasePath, queryBatchPath, resultPath)
tempResultPath = tempFilename('run-blast-results')
command = "\"#{ExternalTools::binaryPath('blast.blastall')}\" "
command += @mk_Parameters.commandLineFor("blast.blastall") + " "
# database file
# blastall seperates multiple input files via spaces, therefore the path
# has to be in "", just in case it contains spaces
command += "-d \"\\\"#{databasePath}\\\"\" "
# input query file
command += "-i \"#{queryBatchPath}\" "
# XML output
command += "-m 7 "
# redirect output
command += "-o \"#{tempResultPath}\""
runCommand(command, true)
cache = Hash.new
File::open(resultPath, 'a') do |out|
File::open(tempResultPath, 'r') do |f|
f.each_line do |line|
line.strip!
content = line.gsub(/\<[^\>]+\>/, '')
cache[:iteration] = Hash.new if line.index('<Iteration>') == 0
cache[:hit] = Hash.new if line.index('<Hit>') == 0
cache[:hsp] = Hash.new if line.index('<Hsp>') == 0
cache[:iteration][:queryDef] = content if line.index('<Iteration_query-def>') == 0
cache[:hit][:hitId]= content if line.index('<Hit_id>') == 0
cache[:hit][:hitDef]= content if line.index('<Hit_def>') == 0
cache[:hsp][:hspBitScore]= content if line.index('<Hsp_bit-score>') == 0
cache[:hsp][:hspScore]= content if line.index('<Hsp_score>') == 0
cache[:hsp][:hspEValue]= content if line.index('<Hsp_evalue>') == 0
cache[:hsp][:qSeq]= content if line.index('<Hsp_qseq>') == 0
cache[:hsp][:hSeq]= content if line.index('<Hsp_hseq>') == 0
cache[:hsp][:midLine]= content if line.index('<Hsp_midline>') == 0
if line.index('</Hsp>') == 0
# reached the end of a HSP, print it
#puts cache.to_yaml
list = Array.new
list << cache[:iteration][:queryDef]
list << cache[:hit][:hitId]
list << cache[:hit][:hitDef]
list << cache[:hsp][:hspBitScore]
list << cache[:hsp][:hspScore]
list << cache[:hsp][:hspEValue]
list << cache[:hsp][:qSeq]
list << cache[:hsp][:hSeq]
list << cache[:hsp][:midLine]
out.puts list.to_csv()
end
end
end
end
FileUtils::rm(tempResultPath)
end
def run()
databases = Set.new
@input[:databases].each do |path|
if fileMatchesFormat(path, 'blastdb')
# strip .00.pin / .psq / .phr
strippedPath = path[0, path.size - 4]
if strippedPath.rindex(/\.\d\d/) == strippedPath.size - 3
# strip .00 trailing number
strippedPath = strippedPath[0, strippedPath.size - 3]
end
databases << strippedPath
else
# convert FASTA database to BLAST
end
end
if databases.size != 1
puts "Error: You cannot specify more than one database for BLAST."
exit 1
end
# copy database files to temp dir on Windows
# blast can somehow not handle network drives ...
if RUBY_PLATFORM.downcase.include?("mswin")
print("\rCopying database files to temp directory ...")
tmp_dir = Dir.mktmpdir('proteomatic_blast')
['.pin', '.psq', '.phr', ''].each do |filetype|
src_path = databases.first + filetype
if File.exist?(src_path)
FileUtils.copy(src_path, tmp_dir)
end
end
database_name = databases.first.split('/').last
databases = Set.new
databases << tmp_dir + '/' + database_name
print("\rCopying database files to temp directory done.")
puts
end
unless @output[:csvResults]
puts "Notice: CSV output not activated. Skipping BLAST..."
exit 0
end
resultFile = @output[:csvResults]
File::open(resultFile, 'w') do |f|
list = ['Query Def', 'Hit Id', 'Hit Def', 'Hsp Bit Score', 'Hsp Score', 'HSP E-value', 'Query Sequence', 'Hit Sequence', 'Mid line']
f.puts list.to_csv()
end
queryFile = nil
unless @param[:peptides].empty?
peptides = @param[:peptides].split(%r{[,;\s/]+})
peptides.reject! { |x| x.strip.empty? }
peptides.collect! { |x| x.strip }
peptides.uniq!
unless peptides.empty?
queryFile ||= tempFilename('run-blast')
File::open(queryFile, 'a') do |outFile|
i = 0
peptides.each do |peptide|
i += 1
outFile.puts ">query_#{i} #{peptide}"
outFile.puts peptide
end
end
end
end
if queryFile || (@input[:queries].size > 1)
queryFile ||= tempFilename('run-blast')
File::open(queryFile, 'a') do |out|
@input[:queries].each do |path|
out.puts(File::read(path))
end
end
else
queryFile = @input[:queries].first
end
# now extract small batches from queryFile
# with the nr database, one query produced ~600 KiB of XML,
# so do batches of 20, we'll have about 12 MiB of XML
queryBatchPath = tempFilename('run-blast-query-batch')
File::open(queryFile, 'r') do |queryIn|
batchSize = 0
queryCount = 0
queryOut = File::open(queryBatchPath, 'w')
queryIn.each_line do |line|
if (line.strip[0, 1] == '>')
# a new query starts here
if batchSize >= 20
# we have 20 queries now, run BLAST
queryOut.close
runBlast(databases.to_a.first, queryBatchPath, resultFile)
print("\rRunning BLAST, processed #{queryCount} queries...")
# clear query batch file
queryOut = File::open(queryBatchPath, 'w')
batchSize = 0
end
batchSize += 1
queryCount += 1
end
queryOut.puts(line)
end
queryOut.close
# run BLAST if we have queries left
if batchSize > 0
runBlast(databases.to_a.first, queryBatchPath, resultFile)
print("\rRunning BLAST, processed #{queryCount} queries...")
end
puts
end
end
end
script = RunBlast.new