-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.py
131 lines (106 loc) · 4.53 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import re
import csv
import time
import numpy as np
from selenium import webdriver
from bs4 import BeautifulSoup
from datetime import datetime
# Record wall-clock start time so the script can report how long the
# lazy-load phase and the whole run took (printed at the end).
start_time = datetime.now()
def insert_value(clinician, value):
    """Return a copy of *clinician* (1-D array) with *value* appended.

    If *value* is None or an empty string, the placeholder 'NA' is
    appended instead, so every CSV row keeps the same number of columns.
    Note: np.append returns a new array; callers must rebind the result.
    """
    # PEP 8: compare to None with identity, not equality — '== None'
    # can misbehave on objects overriding __eq__ (e.g. BS4 tags).
    if value is None or value == "":
        return np.append(clinician, 'NA')
    return np.append(clinician, value)
# All CSV column headers, in output order. The first three (URL, image,
# category) are scraped individually below; the rest (attributes[3:]) are
# pulled out of each detail page's attribute table by regex.
attributes = ['Clinician URL', 'Image URL', 'Category', 'First Name',
              'Last Name', 'Title', 'Office Name', 'Address', 'City',
              'State', 'Zip/Postal Code','Country', 'Phone', 'Fax', 'Email',
              'Website', 'Alternate Office Website', 'Alternate Office State',
              'Alternate Office Country', 'Languages Spoken',
              'Type of Facility',
              'Does Your Facility Offer Both LSVT BIG & LSVT LOUD',
              'Do You Accept Insurance', 'LSVT Certification Date',
              'LSVT Certification Renewal Date']
# Seed the 2-D result array with the header row; one scraped row per
# clinician is vstack-ed underneath it later.
clinicians = np.array([attributes])
# Launch Chrome and drive the LSVT clinician search UI.
# NOTE(review): find_element_by_* methods were removed in Selenium 4 —
# this presumably targets Selenium 3.x; confirm before upgrading.
driver = webdriver.Chrome()
# open the browser to the given link
driver.get('https://www.lsvtglobal.com/clinicians')
# Choose the 'LSVT BIG Physical Therapists or Occupational Therapists' option
driver.find_element_by_xpath("//input[@type='radio' and @value='big']").click()
# Choose the first option for all clinicians
driver.find_element_by_xpath("//*[@id='clinician_search_country']/option[1]").click()
# Agree to Terms and Conditions
driver.find_element_by_xpath("//*[@id='clinician_terms_checkbox']").click()
# Submit query
driver.find_element_by_xpath("//*[@id='uniform-undefined']//span//input[@type='submit' and @value='Search']").click()
# Give the results page a moment to render before scrolling.
time.sleep(5)
# lazy-load all clinicians (8641): scrolling the search wrapper into view
# triggers the page's lazy loading.
element = driver.find_element_by_id("advanced_search_wrap")
driver.execute_script("arguments[0].scrollIntoView();", element)
# Fixed 15-minute wait for every result to finish lazy-loading.
# NOTE(review): a fixed sleep is fragile — consider polling the result
# count instead of sleeping a hard-coded duration.
time.sleep(900)
# Selenium hands the fully-loaded page source to Beautiful Soup
all_soup = BeautifulSoup(driver.page_source, 'lxml')
# Collect every clinician detail-page URL; each result item is an <li>
# carrying CSS class 'one', 'two', or 'three', with the link nested in
# its <h2><a href=...>.
clinician_links = []
for clinician_link in all_soup.find_all('li', class_=['one', 'two', 'three']):
    clinician_links.append(clinician_link.h2.a['href'])
print('Number of Clinicians:', len(clinician_links))
print('Time it took to lazy-load:', datetime.now() - start_time)
# scrape info about each clinician
# Pre-compile one regex per table attribute, once, instead of recompiling
# every pattern for every one of the ~8641 clinician pages.
# 'Website' and 'Alternate Office Website' live inside an <a href="...">
# cell, so they need dedicated patterns; every other attribute is plain
# text between <th>label</th> and </td>.
patterns = {}
for attribute in attributes[3:]:
    if attribute == "Website":
        patterns[attribute] = re.compile('(?<=Website</th><td><a href=")(.*?)(?=" target)')
    elif attribute == "Alternate Office Website":
        patterns[attribute] = re.compile('(?<=Office Website</th><td><a href=")(.*?)(?=" target)')
    else:
        patterns[attribute] = re.compile('(?<=%s</th><td>)(.*?)(?=</td>)' % attribute)

# Scrape every clinician detail page, building one row per clinician.
for link in clinician_links:
    # Load the clinician's page and parse it for scraping.
    driver.get(link)
    single_soup = BeautifulSoup(driver.page_source, 'lxml')
    # Row for this clinician; first column is the page URL itself.
    clinician = np.array([])
    clinician = insert_value(clinician, link)
    # Image URL (tag may be absent -> insert_value writes 'NA').
    image_url = single_soup.find('img', class_='flt-right small')
    if image_url is not None:
        image_url = image_url['src']
    clinician = insert_value(clinician, image_url)
    # Category: first paragraph of the main CMS block (may be absent).
    category = single_soup.find('div', class_='cms-main')
    if category is not None:
        category = category.p.text
    clinician = insert_value(clinician, category)
    # Render the page to a string once per page (was once per attribute),
    # then pull each remaining attribute with its pre-compiled pattern.
    page_html = str(single_soup)
    for attribute in attributes[3:]:
        match = patterns[attribute].search(page_html)
        clinician = insert_value(clinician, match.group() if match else None)
    # Append the finished row under the header row.
    clinicians = np.vstack((clinicians, clinician))
# Close the browser now that all pages have been scraped.
driver.quit()
# Write one CSV row per clinician (the first row of `clinicians` is the
# header). The context manager guarantees the file is closed even if a
# write fails — the original left the handle open on error, and had an
# unreachable `continue` after `raise`.
with open('LSVT_BIG.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    for clinician in clinicians:
        try:
            writer.writerow(clinician)
        except Exception:
            # Log the offending row, then propagate the failure.
            print("Write Error: ", clinician)
            raise
print('Time it took to run script:', datetime.now() - start_time)
print("DONE")