forked from OpenNMT/OpenNMT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtag.lua
125 lines (97 loc) · 2.86 KB
/
tag.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
require('onmt.init')
-- Command-line parser for this script; 'tag.lua' is the name handed to
-- ExtendedCmdLine.
local cmd = onmt.utils.ExtendedCmdLine.new('tag.lua')

-- Script-specific options, registered below under the 'Data' label.
-- Each entry is positional: flag, default value, help text and, optionally,
-- a table of extra settings.
local options = {
  {
    '-src', '',
    [[Source sequences to tag.]],
    {
      -- NOTE(review): presumably rejects an empty value so that -src is
      -- effectively mandatory; confirm against ExtendedCmdLine.nonEmpty.
      valid = onmt.utils.ExtendedCmdLine.nonEmpty
    }
  },
  {
    '-output', 'pred.txt',
    [[Output file.]]
  },
  {
    '-idx_files', false,
    [[If set, source and target files are 'key value' with key match between source and target.]]
  }
}

cmd:setCmdLineOptions(options, 'Data')

-- Let the tagger, CUDA and logging modules declare their own options on the
-- same parser.
onmt.tagger.Tagger.declareOpts(cmd)
onmt.utils.Cuda.declareOpts(cmd)
onmt.utils.Logger.declareOpts(cmd)

cmd:text('')
cmd:text('Other options')
cmd:text('')

-- The help text says "tagging" to match the timing report printed by main().
cmd:option('-time', false, [[Measure average tagging time.]])
--- Tag every sequence read from `opt.src` and write one prediction per line
-- to `opt.output`.
--
-- Parses the command line, installs the global logger, initialises CUDA and
-- builds the tagger. Sequences are then accumulated into batches of
-- `opt.batch_size` entries; each full batch (and the final partial one) is
-- tagged, logged and written out. When `-time` is set, only the time spent
-- inside `tagger:tag` is measured and the per-sentence averages are logged
-- at the end.
--
-- Raises an error if the output file cannot be opened for writing.
local function main()
  local opt = cmd:parse(arg)

  _G.logger = onmt.utils.Logger.new(opt.log_file, opt.disable_logs, opt.log_level)
  onmt.utils.Cuda.init(opt)

  local tagger = onmt.tagger.Tagger.new(opt)
  local srcReader = onmt.utils.FileReader.new(opt.src, opt.idx_files, tagger:srcFeat())

  -- Inputs of the batch being assembled and, in parallel, their sequence ids.
  local srcBatch = {}
  local srcIdBatch = {}

  -- Fail right away with the system error message instead of crashing on the
  -- first write with "attempt to index a nil value".
  local outFile = assert(io.open(opt.output, 'w'))

  -- 1-based index of the next sentence to report.
  local sentId = 1

  local timer
  if opt.time then
    -- Start from a stopped, zeroed timer so that only the tagging calls
    -- resumed below are accounted for.
    timer = torch.Timer()
    timer:stop()
    timer:reset()
  end

  while true do
    local srcTokens, srcSeqId = srcReader:next()

    -- A nil `srcTokens` is treated as the end of the input.
    if srcTokens ~= nil then
      table.insert(srcBatch, tagger:buildInput(srcTokens))
      table.insert(srcIdBatch, srcSeqId)
    elseif #srcBatch == 0 then
      -- Nothing left to read and nothing pending: done.
      break
    end

    -- Process the batch when it is full, or flush the remainder at end of input.
    if srcTokens == nil or #srcBatch == opt.batch_size then
      if opt.time then
        timer:resume()
      end

      local results = tagger:tag(srcBatch)

      if opt.time then
        timer:stop()
      end

      for b = 1, #results do
        if (srcBatch[b].words and #srcBatch[b].words == 0) then
          -- Keep the output aligned with the input: an empty source line
          -- yields an empty output line.
          _G.logger:warning('Line ' .. sentId .. ' is empty.')
          outFile:write('\n')
        else
          if srcBatch[b].words then
            _G.logger:info('SENT %d: %s', sentId, tagger:buildOutput(srcBatch[b]))
          else
            -- Inputs without a `words` field are reported by their id and
            -- the first dimension of their `vectors` field.
            _G.logger:info('FEATS %d: IDX - %s - SIZE %d', sentId, srcIdBatch[b], srcBatch[b].vectors:size(1))
          end

          local sentence = tagger:buildOutput(results[b])
          outFile:write(sentence .. '\n')
          _G.logger:info("PRED %d: %s", sentId, sentence)
        end

        _G.logger:info('')
        sentId = sentId + 1
      end

      if srcTokens == nil then
        break
      end

      -- Start a fresh batch. The ids must be cleared together with the inputs,
      -- otherwise `srcIdBatch[b]` would keep pointing at the ids of the very
      -- first batch.
      srcBatch = {}
      srcIdBatch = {}
      collectgarbage()
    end
  end

  if opt.time then
    local time = timer:time()
    local sentenceCount = sentId-1
    _G.logger:info("Average sentence tagging time (in seconds):\n")
    _G.logger:info("avg real\t" .. time.real / sentenceCount .. "\n")
    _G.logger:info("avg user\t" .. time.user / sentenceCount .. "\n")
    _G.logger:info("avg sys\t" .. time.sys / sentenceCount .. "\n")
  end

  outFile:close()
  _G.logger:shutDown()
end
-- Script entry point: run the tagging routine defined above.
main()