-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
126 lines (96 loc) · 2.7 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from bs4 import BeautifulSoup
import os
import requests
import re
import pickle
# parse
with open('all.html','r') as f:
html_doc = f.read()
soup = BeautifulSoup(html_doc)
# get all repos
trs = soup.find_all('tr')
repos = trs[1:-6]
# Icon image URL -> certification type code:
#   c  = CoreTrustSeal certificate
#   d  = DSA (Data Seal of Approval)
#   w  = WDS (World Data System)
#   dw = combined DSA + WDS
# Hoisted out of the loop: it is constant across rows, so there is no reason
# to rebuild the dict once per repository.
ICON_TYPES = {
    'https://www.coretrustseal.org/wp-content/uploads/leaflet-maps-marker-icons/CTS.png': 'c',
    'https://www.coretrustseal.org/wp-content/uploads/leaflet-maps-marker-icons/DSA.png': 'd',
    'https://www.coretrustseal.org/wp-content/uploads/leaflet-maps-marker-icons/WDS.png': 'w',
    'https://www.coretrustseal.org/wp-content/uploads/leaflet-maps-marker-icons/WDSA-3.png': 'dw',
}


def _download_pdf(url, path):
    """Fetch *url* and save it to *path*, skipping files already on disk."""
    if not os.path.exists(path):
        # Timeout so a dead link can't hang the whole scrape indefinitely.
        r = requests.get(url, timeout=30)
        with open(path, 'wb') as f:
            f.write(r.content)


# Make sure the per-type target directories for certificate PDFs exist.
os.makedirs('c', exist_ok=True)
os.makedirs('d', exist_ok=True)

# Build one dict per repository row.
repo_ds = []
for repo in repos:
    repo_d = {}
    # Certification type is encoded by the map-marker icon's image URL.
    icon_p = repo.find('td', {'class': 'lmm-listmarkers-icon'})
    img = next(icon_p.children)
    repo_type = ICON_TYPES[img.attrs['src']]
    print(repo_type)
    repo_d['type'] = repo_type
    # Repository name.
    name = repo.find('span', {'class': 'lmm-listmarkers-markername'}).text
    print(name)
    repo_d['name'] = name
    # Address is optional: find() returns None when the div is absent, and
    # .text then raises AttributeError — catch exactly that, not everything.
    try:
        address = repo.find('div', {'class': 'lmm-listmarkers-hr'}).text
    except AttributeError:
        address = None
    print(address)
    repo_d['address'] = address
    # Type-specific fields. The codes are mutually exclusive, so elif.
    if repo_type == 'c':
        # CoreTrustSeal: homepage text is the first link; the certificate
        # PDF is linked with a fixed anchor text.
        p = repo.find('p')
        repo_d['homepage'] = p.find('a').text
        for a in p.find_all('a'):
            if a.text == 'CoreTrustSeal certification 2017-2019':
                cert_pdf_url = a.attrs['href']
                repo_d['cert_pdf_url'] = cert_pdf_url
                _download_pdf(cert_pdf_url, 'c/%s' % cert_pdf_url.split('/')[-1])
    elif repo_type == 'd':
        p = repo.find('p')
        repo_d['homepage'] = p.find('a').text
        # Certification date sits between two fixed labels in the row text.
        date = re.match(r'.*DSA seal date: (.+?)DSA Seal', p.text).group(1)
        repo_d['cert_date'] = date
        for a in p.find_all('a'):
            if a.text == 'DSA Seal':
                data_seal_pdf = a.attrs['href']
                repo_d['data_seal_pdf'] = data_seal_pdf
                # Seal URLs don't carry unique filenames, so name the file
                # after the (sanitized) repository name instead.
                filename = '%s.pdf' % name.replace(' ', '_').replace('/', '-')
                _download_pdf(data_seal_pdf, 'd/%s' % filename)
    elif repo_type == 'w':
        p = repo.find('p')
        repo_d['homepage'] = p.find('a').text
        # Some WDS rows lack the date label: re.match returns None and
        # .group raises AttributeError — treat that as "no date".
        try:
            date = re.match(
                r'.*WDS Regular Member certification date: (.+?)WDS Regular Members',
                p.text,
            ).group(1)
            repo_d['cert_date'] = date
        except AttributeError:
            repo_d['cert_date'] = None
    elif repo_type == 'dw':
        # Combined DSA+WDS rows carry no extra fields to extract.
        pass
    repo_ds.append(repo_d)
# Persist the parsed repository records for downstream CSV generation.
# Use a context manager: the original pickle.dump(..., open(...)) never
# closed the file handle, risking a truncated pickle on some interpreters.
with open('repos.pickle', 'wb') as f:
    pickle.dump(repo_ds, f)
# write to csvs