-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_wordfreq_t.rb
executable file
·298 lines (251 loc) · 6.65 KB
/
parse_wordfreq_t.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#!/usr/bin/env ruby
require 'date'
require 'pp'
require 'find'
require 'csv'
require 'logger'
require 'stanford-core-nlp'
### classes
# A call participant built from one entry of a participant list.
class Participant
  attr_accessor :first_name, :last_name, :affil, :title, :type

  # name_str  - full name; split at the first run of whitespace into
  #             first_name and last_name (last_name is nil for a single word).
  # affil_str - "<affiliation> - <title>", or nil when unknown; split at the
  #             first ' - '.
  # type      - stored as given; parse passes 'C' for corporate participants
  #             and 'A' for conference call participants.
  def initialize(name_str, affil_str, type)
    @first_name, @last_name = name_str.split(' ', 2)
    @affil, @title = affil_str.split(' - ', 2) unless affil_str.nil?
    @type = type
  end

  # Renders the participant the way a speaker heading is written, e.g.
  #   Thomas A. Moore, Biopure Corporation - President, CEO and Director
  # Missing parts are skipped. Always returns a String (an empty one when no
  # name was given) so the result is safe to use as a Hash key or in
  # interpolation.
  def to_s
    str = [@first_name, @last_name].compact.join(' ')
    str += ", #{@affil}" unless @affil.nil?
    str += " - #{@title}" unless @title.nil?
    str
  end
end
# One speaker turn: the participant who spoke and the text they said.
#
# Sentence splitting is delegated to the StanfordCoreNLP pipeline stored in
# the class variable @@pipeline, which the main section of this script
# assigns before any file is parsed.
class Transcript
  attr_accessor :participant, :transcript
  # The reader for sentences is the lazy method defined below; only the
  # writer is generated here.
  attr_writer :sentences

  # Matches a (downcased) sentence that opens with a question word followed
  # by at least one non-word character. This must be a Regexp literal: inside
  # a double-quoted String "\W" collapses to a plain "W", which made the
  # pattern unmatchable.
  QUESTION_OPENER = /\A(?:what|where|when|which|who|whom|would|do|does|doesn't|is|isn't|can|could|to what extent|should|was|has|how|if)\W+/

  def initialize
    @transcript = ''
  end

  # Returns the sentences that look like questions: those containing a '?'
  # anywhere, or those starting with one of the question openers.
  def questions
    sentences.select do |s|
      s =~ /\?/ || s.downcase =~ QUESTION_OPENER
    end
  end

  # Counts whitespace-separated tokens in sents (default: this transcript's
  # sentences), ignoring tokens made up solely of non-word characters.
  def num_of_words(sents=@sentences)
    sents = sentences if sents.nil?
    sents.inject(0) do |total, s|
      total + s.split(' ').count { |w| w !~ /^\W+$/ }
    end
  end

  # Number of sentences classified as questions.
  def num_of_questions
    questions.size
  end

  # Number of words contained in the question sentences.
  def num_of_words_in_questions
    num_of_words(questions)
  end

  # Returns the transcript split into sentence strings. The split is computed
  # on first use and cached in @sentences; assigning sentences= beforehand
  # bypasses the pipeline entirely.
  def sentences
    if @sentences.nil?
      text = StanfordCoreNLP::Annotation.new(@transcript)
      @@pipeline.annotate(text)
      @sentences = []
      text.get(:sentences).each do |s|
        @sentences << s.to_s
      end
    end
    @sentences
  end

  # Adds, for every key of word_freq, the number of whole-word occurrences of
  # that key in the downcased transcript to the key's current value.
  # word_freq is updated in place. Keys are interpolated into the pattern
  # as-is, so regex metacharacters in a key are interpreted as such.
  def cal_word_freq(word_freq)
    text = @transcript.downcase # loop-invariant, so downcase only once
    word_freq.keys.each do |w|
      word_freq[w] += text.scan(/\b#{w}\b/).count
    end
  end
end
### functions
# Prints the command-line synopsis for this script to standard output and
# terminates the process with exit status 1.
def usage
  script_name = File.basename(__FILE__)
  puts "Usage: #{script_name} <directory>"
  exit(1)
end
# Emits msg on standard output when the DEBUG constant is truthy; does
# nothing otherwise. Strings are written with puts, any other object is
# pretty-printed with pp.
def debug(msg)
  return unless DEBUG

  case msg
  when String then puts msg
  else pp msg
  end
end
# Reports msg at error level to both loggers, first the one held in @log and
# then the one held in @stdout. Returns whatever the second logger returns.
def err(msg)
  [@log, @stdout].map { |sink| sink.error(msg) }.last
end
# Writes content (an enumerable of row arrays) to path as CSV, one row per
# element, replacing any existing file. The file is closed even if a row
# fails to serialize. Returns content.
def csv_out(content, path)
  writer = CSV.open(path, 'wb')
  begin
    content.each { |row| writer << row }
  ensure
    writer.close
  end
end
# Builds Participant objects from the lines of one participant section.
#
# entry - Array of lines. A line starting with '*' carries a name; the line
#         right after it carries that person's "<affiliation> - <title>".
# type  - tag handed through to every Participant that is created.
#
# A name line is skipped when the name is blank or when the following line is
# itself a name line (i.e. the affiliation is missing). A name on the very
# last line yields a Participant with a nil affiliation.
#
# Lines are paired by position, so a name that appears more than once keeps
# the affiliation that follows each individual occurrence.
#
# Returns an Array of Participant in input order.
def parse_p(entry, type)
  participants = []
  entry.each_with_index do |line, idx|
    next unless line =~ /^\*/

    name = line[1..-1].strip # drop the leading '*'
    next if name.empty?

    affil = entry[idx + 1]
    next if affil =~ /^\*/

    participants << Participant.new(name, affil, type)
  end
  participants
end
# Reads file and returns its sections as an Array of Strings.
#
# Every line is stripped and has runs of whitespace collapsed; blank lines are
# dropped. Lines consisting only of '=' act as section separators, lines
# consisting only of '-' are normalized to '---'. Inside a section the
# original lines are joined with '|'. The separator pipes at both ends of a
# section are removed without touching any other character.
def read_sections(file)
  lines = []
  # File.foreach closes the file once iteration is finished.
  File.foreach(file) do |line|
    l = line.strip
    l.gsub!(/\s+/, ' ')
    l.gsub!(/^=+$/, '===')
    l.gsub!(/^-+$/, '---')
    lines << l unless l == ''
  end
  txt = lines.map { |l| "|#{l}" }.join
  txt.split('===').map { |s| s.sub(/\A\|/, '').sub(/\|\z/, '') }
end

# Splits one '|---|'-delimited section into speaker turns.
#
# section      - section text as produced by read_sections.
# speakers     - Hash mapping a speaker heading (e.g.
#                "Fred Ziegel, Topeka Capital Markets - Analyst") to the
#                object stored as Transcript#participant.
# stop_markers - entries that end the section; nothing after the first such
#                entry is read.
#
# Entries before the first known speaker heading are ignored. The turn that is
# still open when the section ends (with or without a stop marker) is
# included. Returns an Array of Transcript.
def collect_turns(section, speakers, stop_markers)
  turns = []
  current = nil
  section.split('|---|').each do |entry|
    break if stop_markers.include?(entry)

    # "Fred Ziegel, Topeka Capital Markets - Analyst [31]" -> drop the " [31]"
    text = entry.sub(/\s+\[\d+\]$/, '')
    if speakers.key?(text)
      turns << current unless current.nil?
      current = Transcript.new
      current.participant = speakers[text]
    elsif !current.nil?
      current.transcript += text.gsub('|', ' ')
    end
  end
  turns << current unless current.nil?
  turns
end

# Parses one transcript file and appends a row to @csv:
#   [ticker, date, time, timezone, <one count per entry of @words>]
# The counts cover only Q&A turns whose participant has type 'A' (the people
# listed under the CCP section).
#
# Header layout used here: the third header line starts with the ticker
# followed by '-', the fifth header line holds the date/time with an optional
# trailing upper-case timezone abbreviation.
#
# Returns nil without touching @csv when the file has no Q&A turns. Raises if
# the header is shorter than expected or its date cannot be parsed.
def parse(file)
  sections = read_sections(file)
  header = sections.shift.split('|')

  ticker = header[2].split('-')[0].strip
  datetime_str = header[4]
  datetime = DateTime.parse(datetime_str)
  date_str = datetime.strftime('%Y-%m-%d')
  time_str = datetime.strftime('%H:%M')
  timezone_str = datetime_str[/\W[A-Z]{2,}$/]
  timezone_str = timezone_str.strip unless timezone_str.nil?

  # parse participants: the list lives in the section right after the one
  # whose whole text is the CP / CCP title (last occurrence wins).
  ops = {}
  cp_at = sections.rindex(CP)
  unless cp_at.nil?
    parse_p(sections[cp_at + 1].split('|'), 'C').each { |p| ops[p.to_s] = p }
  end
  ccp_at = sections.rindex(CCP)
  unless ccp_at.nil?
    parse_p(sections[ccp_at + 1].split('|'), 'A').each { |p| ops[p.to_s] = p }
  end
  ops['Operator'] = 'Operator'

  # Presentation sections are currently not analyzed. To include them, run
  #   collect_turns(section, ops, [QNA, 'Definitions', 'Disclaimer'])
  # on the first section matching /^#{PRESENTATION}/.

  # parse questions and answers (first matching section only)
  qna_section = sections.find { |s| s =~ /^#{QNA}/ }
  qnas = qna_section.nil? ? [] : collect_turns(qna_section, ops, ['Definitions', 'Disclaimer'])
  return if qnas.empty?

  word_freq = Hash[@words.collect { |w| [w, 0] }]
  qnas.each do |qna|
    p = qna.participant
    # the 'Operator' entry is a plain String and has no type
    next unless p.respond_to?(:type) && p.type == 'A'

    qna.cal_word_freq(word_freq)
  end

  ## build the csv array
  @csv << [ticker, date_str, time_str, timezone_str] + word_freq.values
end
### main
# Script body: expects exactly one argument, a directory that is searched
# recursively for *.txt transcripts. One CSV row per transcript with Q&A
# content is collected in @csv and written next to the input directory.
usage unless ARGV.length == 1 && File.directory?(ARGV[0])

StanfordCoreNLP.jar_path = '/opt/stanford-core-nlp-minimal/'
StanfordCoreNLP.model_path = '/opt/stanford-core-nlp-minimal/'
StanfordCoreNLP.set_model('pos.model', 'english-left3words-distsim.tagger')
StanfordCoreNLP.use :english
# Transcript#sentences reads @@pipeline. Assigning a class variable directly
# at the top level raises RuntimeError on Ruby >= 3.0, so it is set on Object
# explicitly; Transcript still resolves it through its ancestor chain.
Object.class_variable_set(:@@pipeline, StanfordCoreNLP.load(:tokenize, :ssplit))

DEBUG = false
# Section titles as they appear in the transcripts.
CP = 'Corporate Participants'
CCP = 'Conference Call Participants'
QNA = 'Questions and Answers'
PRESENTATION = 'Presentation'

# Two loggers with the same timestamp format: a file log at INFO level and a
# console log at DEBUG level.
log_dt_format = "%Y-%m-%d %H:%M:%S"
@log = Logger.new('parse.log')
@log.datetime_format = log_dt_format
@log.level = Logger::INFO
@stdout = Logger.new(STDOUT)
@stdout.datetime_format = log_dt_format
@stdout.level = Logger::DEBUG

# Words / phrases to count; each one becomes a CSV column.
@words = [
  'cash',
  'cash constrained',
  'financially constrained',
  'compensation',
  'ceo compensation',
  'executive compensation',
]
@csv = [['ticker','date','time','timezone'] + @words]

# The output file is named after the input directory (non-word characters
# replaced by '_') and placed in the input directory's parent.
input = ARGV[0]
output_dir = File.dirname(input)
output_file = File.basename(input).gsub(/\W+/,'_') + '.csv'
output = File.join(output_dir, output_file)

Find.find(input) do |path|
  next if File.directory?(path)
  # only *.txt files whose name does not start with a dot
  next unless File.extname(path) == '.txt' && File.basename(path) !~ /^\./

  msg = "Parsing [" + path + "]"
  @log.info msg
  @stdout.info msg
  parse(path)
end

## write to csv
csv_out(@csv, output)