-
Notifications
You must be signed in to change notification settings - Fork 0
/
CTSNet-selenium.py
115 lines (89 loc) · 3.25 KB
/
CTSNet-selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from selenium import webdriver # powers the browser interaction
from selenium.webdriver.support.ui import Select # selects menu options
from bs4 import BeautifulSoup # to parse HTML
import csv # to write CSV
import pandas as pd # to see CSV
import time
import os
import random
# Scraper configuration. Every runtime string is the one the original script used.
BASE_URL = "https://www.ctsnet.org"
START_URL = "https://www.ctsnet.org/surgeons/surgeons-advanced-search?ln=&fn=&subspecialty=adult_cardiac_surgery&city=&country=gb&province=&o"
OUTPUT_CSV = "IT-cardi.csv"
CSV_HEADER = [
    "Name",
    "Hospital",
    "Phone",
    "Interests",
    "Practice-Areas",
    "City-Region",
    "Country",
    "Street",
    "URL",
]
# Upper bound on the number of result pages visited in one run.
MAX_PAGES = 1000
# Number of leading result rows ignored on the first result page only.
# NOTE(review): looks like a leftover from resuming an interrupted run -
# confirm before starting a fresh crawl.
FIRST_PAGE_SKIP = 48


def element_text(soup, tag, css_class, default=None):
    """Return the stripped text of the first ``tag`` with class ``css_class``.

    ``default`` is returned when the page contains no such element.
    """
    node = soup.find(tag, {"class": css_class})
    if node is None:
        return default
    return node.text.strip()


def next_page_url(soup):
    """Return the absolute URL of the next result page, or ``None`` if absent."""
    anchor = soup.find("a", {"title": "Go to next page"})
    try:
        return BASE_URL + anchor["href"]
    except (TypeError, KeyError):
        # TypeError: no anchor was found (None is not subscriptable).
        # KeyError: the anchor carries no href attribute.
        return None


def profile_links(soup, skip=0):
    """Return absolute profile URLs from a result page, ignoring ``skip`` rows.

    Result cells that contain no link are skipped instead of aborting the run.
    """
    cells = soup.find_all(
        "td", {"class": "views-field views-field-field-contact-last-name"})
    links = []
    for cell in cells[skip:]:
        anchor = cell.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href:
            links.append(BASE_URL + href)
    return links


def profile_row(soup, url):
    """Build one CSV row (in ``CSV_HEADER`` order) from a parsed profile page.

    Returns ``None`` when the page has no name, institution or phone element;
    the remaining fields fall back to an empty string when missing.
    """
    name = element_text(soup, "h1", "page-title")
    if name is None:
        return None
    print(name)  # progress output, emitted as soon as a name is found
    hospital = element_text(soup, "div", "contact-institution")
    if hospital is None:
        return None
    phone = element_text(soup, "div", "contact-numbers")
    if phone is None:
        return None
    country = element_text(soup, "div", "contact-country", "")
    street = element_text(soup, "div", "contact-street", "")
    city = element_text(soup, "div", "contact-city-province-code", "")
    # Multi-line blocks are flattened to a single "; "-separated cell.
    fields = element_text(
        soup, "div",
        "views-field views-field-field-contact-subspecialty",
        "").replace("\n", "; ")
    interests = element_text(
        soup, "div",
        "field field--name-field-contact-interest field--type-text-long field--label-hidden",
        "").replace("\n", "; ")
    return [name, hospital, phone, interests, fields, city, country, street, url]


def main():
    """Walk the search result pages and append one CSV row per profile.

    Rows are appended to ``OUTPUT_CSV``; the header is written only when the
    file is new or empty, so re-running the script does not insert a second
    header in the middle of the data.
    """
    needs_header = (not os.path.exists(OUTPUT_CSV)
                    or os.path.getsize(OUTPUT_CSV) == 0)
    # NOTE(review): PhantomJS support was deprecated and later removed from
    # Selenium - confirm the installed Selenium version still provides it.
    driver = webdriver.PhantomJS()
    try:
        # newline="" is what the csv module requires to avoid blank rows on
        # Windows; an explicit encoding keeps non-ASCII names writable.
        with open(OUTPUT_CSV, "a", newline="", encoding="utf-8") as out:
            writer = csv.writer(out)
            if needs_header:
                writer.writerow(CSV_HEADER)
                out.flush()
            page_url = START_URL
            for page_index in range(MAX_PAGES):
                driver.get(page_url)
                listing = BeautifulSoup(driver.page_source, "html5lib")
                # Resolve the next page before the driver navigates away.
                page_url = next_page_url(listing)
                skip = FIRST_PAGE_SKIP if page_index == 0 else 0
                for link in profile_links(listing, skip):
                    driver.get(link)
                    profile = BeautifulSoup(driver.page_source, "html5lib")
                    row = profile_row(profile, link)
                    if row is None:
                        # Required element missing: move on without a delay,
                        # as the script always did.
                        continue
                    if row[2]:  # only profiles with a non-empty phone are kept
                        writer.writerow(row)
                        out.flush()  # keep rows on disk if the run is interrupted
                    time.sleep(random.randint(1, 3))
                if page_url is None:
                    # Last result page reached; previously the loop went on
                    # requesting an empty URL.
                    break
                time.sleep(random.randint(1, 3))
    finally:
        # Always shut the browser process down, even after an error.
        driver.quit()


if __name__ == "__main__":
    main()