#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# scrapeNRW - Scraping the state parliament information system.
# Copyright (C) 2015 Markus Drenger
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import requests
import json
import locale

from bs4 import BeautifulSoup

# A German locale is required; fail early with a clear message if it is missing.
try:
    locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
except locale.Error:
    print("Failed to set the required German locale (de_DE.UTF-8). Please install it!")
    raise

def find_between(s, first, last):
    """Return the substring of s between the markers first and last, or "" if absent."""
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""

def getURLSofPages(soup):
    # The paging line ends with the total number of hits; take its last token.
    results = int(soup.find('p', {'class': 'paging-register'}).text.split(' ')[-1])
    print("total results:", results)
    urls = []
    i = 1
    # The first 50 hits are on the page already fetched; request one extra page
    # of 50 rows for each further block of results. m appears to be the 1-based
    # offset of the first row on a page (51, 101, ...).
    while 50 * i < results:
        urls.append("http://www.parlamentsspiegel.de/ps/suche/Suchergebnisse_Parlamentsspiegel.jsp"
                    "?m={}&w=1%3D1&order=&maxRows=50&view=kurz&db=psakt".format(50 * i + 1))
        i += 1
    return urls
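
# Paging sanity check (hypothetical total, not live data): with results = 120
# the loop yields m = 51 and m = 101 (rows 51-100 and 101-120); rows 1-50 come
# from the result page that was already fetched.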

def getDetailsLinks(soup):
    # Each hit on a result page carries its detail link in a <dd class="link">.
    dds = soup.find_all('dd', {'class': 'link'})
    links = []
    for dd in dds:
        links.append('http://www.parlamentsspiegel.de' + dd.a['href'])
    return links

def parseDetailspage(soup, url):
    print(url)
    detail = {}
    detail['url'] = url
    table = soup.find('table')
    rows = table.find_all('tr')
    # First row: title plus link to the Basisdokument.
    detail['title'] = table.tr.a.strong.text
    detail['basisdoklink'] = 'http://www.parlamentsspiegel.de' + table.tr.a['href']
    # Second row: a text block with "Label: a * b * c" classification lines.
    detail['textblock'] = rows[1].select('td')[1].text
    detail['systematik'] = list(map(str.strip, find_between(detail['textblock'], 'Systematik:', '\n').split('*')))
    detail['schlagworte'] = list(map(str.strip, find_between(detail['textblock'], 'Schlagworte:', '\n').split('*')))
    detail['suchworte'] = list(map(str.strip, find_between(detail['textblock'], 'Suchworte:', '\n').split('*')))
    detail['region'] = list(map(str.strip, find_between(detail['textblock'], 'Region:', '\n').split('*')))
    # Third row, if present: links to the document itself and to its metadata.
    if len(rows) >= 3:
        if rows[2].select('td')[0].find('a') is not None:
            detail['doclink'] = rows[2].select('td')[0].a['href']
        if rows[2].select('td')[1].find('a') is not None:
            detail['doc_meta_link'] = rows[2].select('td')[1].a['href']
    return detail
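
# Shape of the returned dict (values illustrative, not real data):
#   {'url': ..., 'title': ..., 'basisdoklink': ..., 'textblock': ...,
#    'systematik': [...], 'schlagworte': [...], 'suchworte': [...],
#    'region': [...], 'doclink': ... (optional), 'doc_meta_link': ... (optional)}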

def parseBasisDok(soup):
    # The Basisdokument page is a two-column table of label/value pairs.
    basisdok = {}
    table = soup.find('table')
    for tr in table.find_all('tr'):
        basisdok[tr.select('td')[0].text.strip()] = tr.select('td')[1].text.strip()
    return basisdok
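
# The result maps the first-column label to the second-column value, e.g.
# (hypothetical labels): {'Dokumenttyp': 'Antrag', 'Datum': '24.03.2015'}.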

testurl = 'http://www.parlamentsspiegel.de/ps/suche/Suchergebnisse_Parlamentsspiegel.jsp?w=native%28%27%28BEGRIFF_IX+phrase+like+%27%27Gemini%27%27%29%27%29&order=&fm=&db=psakt'
r = requests.get(testurl).text
#r = requests.get("http://www.parlamentsspiegel.de/ps/suche/Suchergebnisse_Parlamentsspiegel.jsp?w=&order=&fm=&db=psakt").text
soup = BeautifulSoup(r, 'html.parser')

# Collect detail-page links from the first result page and every follow-up page.
detaillinks = getDetailsLinks(soup)
for url in getURLSofPages(soup):
    print(url)
    detaillinks.extend(getDetailsLinks(BeautifulSoup(requests.get(url).text, 'html.parser')))
with open("parlamentsspiegel-detaillinks.json", "w", encoding="utf-8") as f:
    json.dump(detaillinks, f)

# Parse every detail page.
details = []
for link in detaillinks:
    details.append(parseDetailspage(BeautifulSoup(requests.get(link).text, 'html.parser'), link))
with open("parlamentsspiegel-details.json", "w", encoding="utf-8") as f:
    json.dump(details, f)

# Fetch and parse the Basisdokument of every detail entry.
basisdoks = []
for detail in details:
    basisdoks.append(parseBasisDok(BeautifulSoup(requests.get(detail['basisdoklink']).text, 'html.parser')))
with open("parlamentsspiegel-basisdoks.json", "w", encoding="utf-8") as f:
    json.dump(basisdoks, f)

# Fetch and parse the document metadata where a detail entry links to it.
metadata = []
for detail in details:
    if "doc_meta_link" in detail:
        metadata.append(parseBasisDok(BeautifulSoup(requests.get(detail['doc_meta_link']).text, 'html.parser')))
with open("parlamentsspiegel-metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f)