forked from sfbrigade/sf-admin-code
-
Notifications
You must be signed in to change notification settings - Fork 0
/
american_legal_scraper.rb
355 lines (260 loc) · 10.8 KB
/
american_legal_scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
require 'rubygems'
require 'nokogiri'
require 'json'
require 'watir-webdriver'
# Collects +first+, +last+ and everything reachable in between by repeatedly
# calling +next+ on the current element.
#
# Implemented as a loop rather than recursion: the recursive form needed one
# stack frame per element (SystemStackError on long runs) and re-copied the
# accumulated array at every level.
#
# @param first [#next] starting element; always the first entry of the result
# @param last [Object] element at which collection stops (compared with ==);
#   it is included as the final entry of the result
# @return [Array] the elements from +first+ through +last+, inclusive
# @raise [NoMethodError] if the chain yields nil before reaching a non-nil +last+
def collect_between(first, last)
  collected = [first]
  current = first
  until current == last
    current = current.next
    collected << current
  end
  collected
end
# Reports whether the given HTML text contains a class="Section" attribute
# (matched case-insensitively).
#
# @param str [String] HTML markup to inspect
# @return [Boolean] true when the attribute text occurs anywhere in +str+
def is_a_section(str)
  match_position = str =~ /class="Section"/i
  !match_position.nil?
end
# Reports whether the given HTML text contains the American Legal Publishing
# disclaimer sentence (matched case-insensitively).
#
# The trailing "." in the pattern is unescaped, so it matches any single
# character that follows "only".
#
# @param str [String] HTML markup to inspect
# @return [Boolean] true when the disclaimer text occurs anywhere in +str+
def is_footer(str)
  match_position = str =~ /American Legal Publishing Corporation provides these documents for informational purposes only./i
  !match_position.nil?
end
# Open the San Francisco code on amlegal.com in a Watir-driven browser.
browser = Watir::Browser.new
browser.goto 'http://www.amlegal.com/nxt/gateway.dll?f=templates&fn=default.htm&vid=amlegal:sanfrancisco_ca'

# Re-parses the current HTML of the "contents" frame and returns the fourth
# <div> found under div#California_c. The tree is re-read after every click
# because the page content changes once a node has been expanded.
fetch_first_level_container = lambda do
  contents_doc = Nokogiri::HTML(browser.frame(:name, "contents").html)
  contents_doc.css("body").first.css('div#California_c').first.css('div')[3]
end

# Click the first image inside the container to expand it.
first_level_container = fetch_first_level_container.call
expand_icon = first_level_container.css('img').first
browser.frame(:name, "contents").element(css: expand_icon.css_path).click
puts "waiting 7 seconds for operation..."
sleep(7)
puts "done waiting"

# Re-read the tree (the expansion takes some time to load -- the pause above
# may need lengthening) and click the image of the "more" node.
first_level_container = fetch_first_level_container.call
more_icon = first_level_container.css("[ct='application/morenode']").first.css('img').first
browser.frame(:name, "contents").element(css: more_icon.css_path).click
puts "waiting 5 seconds for operation..."
sleep(5)
puts "done waiting"

# Re-read once more after pressing the "more" button and collect every tree node.
first_level_container = fetch_first_level_container.call
chapters = first_level_container.css("[class='treenode']")
puts "number of elements:"
puts chapters.length

# Accumulators filled by the chapter loop: [identifier, catch text] pairs for
# sections and [identifier, name] pairs for chapters.
sections_array = []
titles_array = []
# Splits a chapter heading into its identifier and its name.
#
# The divider is the first ':' when it occurs within the first 20 characters,
# otherwise the first '.'. Previously a colon at index 20 or later left the
# divider at 0 and produced garbage slices; such a colon is now treated as part
# of the name and the period is used instead.
#
# The identifier starts two characters after the first 'R' in the heading and
# ends just before the divider; the name starts two characters after the divider.
#
# @param chapter_title [String] heading text of the chapter
# @return [Array(String, String)] the identifier and the name
# @raise [RuntimeError] when the heading has neither a usable ':' nor a '.'
def split_chapter_title(chapter_title)
  colon_index = chapter_title.index(':')
  period_index = chapter_title.index('.')
  divider_index = (colon_index && colon_index < 20) ? colon_index : period_index
  raise "unrecognized chapter title format: #{chapter_title.inspect}" if divider_index.nil?

  chapter_index = chapter_title[(chapter_title.index('R') + 2)..(divider_index - 1)]
  chapter_name = chapter_title[(divider_index + 2)..-1]
  [chapter_index, chapter_name]
end

# Splits a section heading into the raw identifier token and the catch text.
# The text up to and including the first space is discarded; of the remainder,
# everything before the next space is the raw identifier and everything after
# it is the catch text. Trailing punctuation is NOT trimmed here because the
# callers trim differently.
#
# @param section_title [String] heading text of the section
# @return [Array(String, String)] the raw identifier token and the catch text
def split_section_title(section_title)
  remainder = section_title[(section_title.index(' ') + 1)..-1]
  [remainder[0...remainder.index(' ')], remainder[(remainder.index(' ') + 1)..-1]]
end

# Concatenates every element of +nodes+ except the first and the last, i.e.
# the content strictly between a heading and the element that ends its range.
#
# @param nodes [Array] result of collect_between
# @return [String] the concatenated content
def join_inner_text(nodes)
  text = ""
  nodes.each_with_index do |node, position|
    text += node if position != 0 && position < nodes.length - 1
  end
  text
end

# Builds the hash that is serialised to JSON for one section.
# The identifier is usually split at its first period, like '10.100-373';
# identifiers without a period (e.g. 'Appendix A:') are used whole as the title.
#
# @param section_text [String] body text of the section
# @param section_title_index [String] trimmed section identifier
# @param section_title_text [String] catch text of the section
# @return [Hash] hash with :text and :heading keys
def build_section_object(section_text, section_title_index, section_title_text)
  section_heading = {}
  section_period_index = section_title_index.index('.')
  if section_period_index
    section_heading[:title] = section_title_index[0...section_period_index]
    section_heading[:chaptersection] = section_title_index[(section_period_index + 1)..-1]
  else
    section_heading[:title] = section_title_index
  end
  section_heading[:identifier] = section_title_index
  section_heading[:catch_text] = section_title_text
  { :text => section_text, :heading => section_heading }
end

# Writes one section hash as JSON to sections/<identifier>.json.
#
# @param section_title_index [String] trimmed section identifier (file stem)
# @param section_object [Hash] hash produced by build_section_object
def write_section_file(section_title_index, section_object)
  File.open("sections/" + section_title_index + ".json", "w") do |f|
    f.write(section_object.to_json)
  end
end

# The per-section files are written into sections/; create it up front so the
# first write does not fail on a fresh checkout.
Dir.mkdir("sections") unless File.directory?("sections")

chapters.each_with_index do |chapter, i|
  # The first and last tree nodes are not numbered chapters; skip them.
  if i == 0 || i == chapters.length - 1
    puts "skipping non-numbered chapters"
    next
  end

  # Click the chapter's link in the contents frame and give the page time to load.
  chapter_link = chapter.css("[class='nodetext']")[0]
  browser.frame(:name, "contents").element(css: chapter_link.css_path).click
  puts "waiting 3 seconds for operation..."
  sleep(3)
  puts "done waiting"

  # Parse the document that is now shown in the main window.
  doc_body = Nokogiri::HTML(browser.frame(:name, "main").frame(:name, "docbody").html)

  # Chapter heading -> identifier and name.
  chapter_title = doc_body.css("[class='Chapter']")[0].css('span')[0].text
  chapter_index, chapter_name = split_chapter_title(chapter_title)
  puts chapter_index
  puts chapter_name
  # The second-to-last tree node is recorded under the identifier "A".
  chapter_index = "A" if i == chapters.length - 2
  titles_array.push([chapter_index, chapter_name])

  articles = doc_body.css("[class='Article']")
  # The disclaimer element marks the end of the chapter's content.
  footers = doc_body.search "[text()*='American Legal Publishing Corporation provides these documents for informational purposes only.']"
  footer = footers.last

  articles.each_with_index do |article, j|
    histories = article.css("[class='History']")
    # Isolate the run of elements belonging to this article; the last article
    # runs up to the footer.
    if j < articles.length - 1
      paragraphs = collect_between(article, articles[j + 1])
    else
      paragraphs = collect_between(article, footer)
    end

    sections1 = paragraphs.select { |node| is_a_section(node.to_s) }
    sections1.each_with_index do |section1, k|
      # Hard-coded skip carried over unchanged: the entries at positions 91 and
      # 92 of tree node 51 are not processed (reason not recorded in the source).
      next if i == 51 && (k == 91 || k == 92)

      # Most titles sit in a span, a few directly in the heading, so take the
      # element's whole text with newlines flattened.
      section_title = section1.text.gsub("\n", " ")
      puts section_title

      # Body of the section: up to the next section, or for the last section
      # up to the article's last History element / the footer.
      # NOTE(review): an older comment here mentioned replacing histories.last
      # with footer, but histories.last is still what is used -- confirm intent.
      if k < sections1.length - 1
        paragraphs1 = collect_between(section1, sections1[k + 1])
      elsif j < articles.length - 1
        paragraphs1 = collect_between(section1, histories.last)
      else
        paragraphs1 = collect_between(section1, footer)
      end
      section_text1 = join_inner_text(paragraphs1)

      section_title_index, section_title_text = split_section_title(section_title)
      # Drop the last two characters of the raw identifier token.
      section_title_index = section_title_index[0...-2]

      sections_array.push([section_title_index, section_title_text])
      section_object = build_section_object(section_text1, section_title_index, section_title_text)
      write_section_file(section_title_index, section_object)
    end
  end

  # Chapters without articles: take the sections straight from the document.
  if articles.empty?
    sections2 = doc_body.css("[class='Section']")
    sections2.each_with_index do |section2, k|
      section_title2 = section2.css('span')[0].text
      puts section_title2

      # Body of the section: up to the next section, or up to the footer for
      # the last one.
      # MAYBE TODO: need case for 29A, where the last paragraph doesn't have history.
      if k < sections2.length - 1
        paragraphs = collect_between(section2, sections2[k + 1])
      else
        paragraphs = collect_between(section2, footer)
      end
      section_text2 = join_inner_text(paragraphs)

      section_title_index, section_title_text = split_section_title(section_title2)
      puts "then..."
      puts section_title_index
      puts section_title_text
      if i == chapters.length - 2
        # Second-to-last tree node: prefix identifiers with "A." and trim
        # differently from regular chapters.
        section_title_text = section_title_text[2..-1]
        section_title_index = "A." + section_title_index[0...-1]
      else
        section_title_index = section_title_index[0...-2]
      end
      puts "now..."
      puts section_title_index
      puts section_title_text

      sections_array.push([section_title_index, section_title_text])
      section_object = build_section_object(section_text2, section_title_index, section_title_text)
      write_section_file(section_title_index, section_object)
    end
  end
end
# Write the collected [identifier, catch text] section pairs to sids.json.
File.open("sids.json", "w") do |f|
  f.write(sections_array.to_json)
end
# Combine the section pairs and the chapter titles into one index document.
index_object = { :sections => sections_array, :titles => titles_array }
File.open("index.json", "w") do |f|
  f.write(index_object.to_json)
end
# Close the browser now that scraping is finished; it was previously left
# open after the script ended.
browser.close
# TODO: extra. handle the first and last chapters. they are not real chapters so their formatting is funky
#celebrate good times
#come on