-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathbundde-behoerden-scraper.rb
executable file
·50 lines (38 loc) · 1.23 KB
/
bundde-behoerden-scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/ruby
require 'rubygems'
require 'bundler/setup'
require 'mechanize'
require 'csv'
# Values sent in the `cl2Categories_Einordnung` query parameter of the search
# form; they are joined with '+' when URL is built. Frozen so the list cannot
# be mutated by accident after URL has been derived from it.
CATEGORIES = [
  'oberstebundesbehoerde',
  'mittlerebundesbehoerde',
  'verfassungsorgan',
  'bundesgericht',
  'oberebundesbehoerde',
  'rechtsfaehige_anstaltdesoeffentlichenrechts',
  'nichtrechtsfaehigeanstaltdesoeffentlichenrechts',
].freeze

# First result page of the search, restricted to CATEGORIES and requesting
# 100 results per page.
URL = "https://www.bund.de/Content/DE/Behoerden/Suche/Formular.html?nn=4641496&cl2Categories_Einordnung=#{CATEGORIES.join('+')}&resultsPerPage=100".freeze
# Single Mechanize agent shared by scrape_page and the pagination loop below.
@mech = Mechanize.new
# Normalises a homepage URL to a bare, lower-case host string.
#
# Lower-cases the input, then removes a leading "http://" or "https://" and
# a leading "www." prefix. Anything after the host (path, trailing slash) is
# left as it is.
#
# @param domain [String] homepage URL or host name
# @return [String] a new string; the argument is not modified
def clean_domain(domain)
  domain.downcase
        .sub(%r{\Ahttps?://}, '')
        .sub(/\Awww\./, '') # dot is escaped so "wwwx.example.de" keeps its first label
end
# Prints one CSV row ("domain,name") to stdout for every entry of a search
# result page that has a homepage on its detail page.
#
# For each <li> of the result list the linked detail page is fetched through
# the shared @mech agent, the text of the homepage link found there is
# normalised with clean_domain, and the row is written with puts.
# Entries without a link, without a name node, or whose detail page shows no
# homepage are skipped.
#
# @param page [#search] a result page as returned by the @mech agent
# @return [Object] whatever `each` returns for the list of entries
def scrape_page(page)
  page.search('//ul[@class="result-list"]/li').each do |entry|
    link = entry.at_css('a')
    href = link && link['href']
    # The name is taken from the node directly following the <em> in the heading.
    heading_em = entry.at_css('h3 em')
    name_node = heading_em && heading_em.next
    # One malformed entry must not abort the whole run with a NoMethodError.
    next if href.nil? || name_node.nil?

    # Pass the list page as referer so a relative href is resolved against the
    # page the link came from. Mechanize#click on a raw Nokogiri node would
    # resolve it against the agent's current page, i.e. the previous detail page.
    detail_page = @mech.get(href, [], page)
    domain = detail_page.search('//div[@class="orgUnitHomepage"]//a').text.strip
    next if domain.empty?

    name = name_node.text.gsub(/\n/, ' ').strip
    puts [clean_domain(domain), name].to_csv
  end
end
# Entry point: fetch the first result page, print its rows, then follow the
# pager's "next" link until there is none.
page = @mech.get(URL)
loop do
  scrape_page(page)
  next_page_link = page.search('//div[contains(@class,"pager")]//li[@class="next"]/a').first
  next_href = next_page_link && next_page_link['href']
  break if next_href.nil?

  # scrape_page has navigated the agent to detail pages in the meantime, so
  # the list page is passed as referer explicitly; a relative "next" href is
  # then resolved against the page it was found on.
  page = @mech.get(next_href, [], page)
end