otzyvua_review_scraper.py
import math
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


def init_driver(headless=True, adblock=True):
    """
    A utility function that launches Google Chrome under Selenium. It does three things: downloads the driver on
    first launch, optionally installs an ad blocker, and optionally starts the browser in headless mode. Advertising
    covers clickable elements and interferes with scrolling through the page.
    :param headless: If True, runs in headless mode. It is more efficient, but scraping sometimes fails in headless
        mode
    :param adblock: If True, installs and uses uBlock Origin
    :return: Selenium webdriver
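
    Example (a minimal usage sketch)::

        with init_driver(headless=False) as driver:
            driver.get("https://www.otzyvua.net")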
"""
if headless:
# TODO: Throw ValueError, when both set to True, because it's impossible to have adblock in non-headless mode
adblock = False
driver_installer = ChromeDriverManager(log_level=0, print_first_line=False)
service = Service(driver_installer.install())
options = Options()
options.headless = headless
if adblock:
options.add_extension('selenium_supplementary/ublock_1_39_2_0.crx')
return webdriver.Chrome(service=service, options=options)


def reviews(urls):
    """
    Iterates over the URLs of review pages and yields review texts. To do this, launches Selenium via init_driver.
    :param urls: List of URLs of the review pages that should be scraped
    :return: Generator yielding the review text from each URL
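
    Example (a minimal usage sketch; the URL below is a hypothetical placeholder, not a real review page)::

        for text in reviews(["https://www.otzyvua.net/uk/some-review"]):
            print(text)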
"""
with init_driver() as driver:
for url in urls:
driver.get(url)
try:
yield driver.find_element(By.CSS_SELECTOR, "div#comments_container span.comment.description").text
except Exception:
pass


def review_urls_from_feed():
    """
    Uses the scrape_feed helper.
    :return: Generator of links to review pages from the endless feed of all reviews at otzyvua.net
    """
    return scrape_feed("div#container div.otzyv_box_float a.user-review")


def clinic_urls_from_feed():
    """
    Uses the scrape_feed helper.
    :return: Generator of links to clinic pages from the endless feed of all reviews at otzyvua.net
    """
    return scrape_feed("div#container div.otzyv_box_float h2 a")


def review_urls_from_clinic_page(clinic_url, reviews_per_page=30):
    """
    Reads the total number of reviews from a clinic page, computes how many paginated pages exist, and walks
    through them, yielding a link to every individual review.
    :param clinic_url: URL of a clinic page at otzyvua.net
    :param reviews_per_page: How many reviews the site shows per paginated page
    :return: Generator of links to review pages of the given clinic
    """
    with init_driver(adblock=False) as driver:
        driver.get(clinic_url)
        # The XPath locates the review counter next to the "Відгуки" ("Reviews") tab title
        n_reviews = int(driver.find_element(
            By.XPATH, '//div[@class="btn-group"][.//span[@class="inav-title"][text()="Відгуки"]]//div/span'
        ).text)
        n_pages = math.ceil(n_reviews / reviews_per_page)
        for page_idx in range(1, n_pages + 1):
            driver.get(clinic_url + f"?page={page_idx}")
            review_elems = driver.find_elements(By.CSS_SELECTOR, ".commentbox h2 a")
            for elem in review_elems:
                yield elem.get_attribute('href')


def scrape_feed(css_selector, n_batches=20, scroll_pause_time=6):
    """
    Launches Chrome using init_driver, opens the endless feed page, and tries to load more content `n_batches`
    times by scrolling to the very bottom. After each scroll, waits `scroll_pause_time` seconds.
    The default values of `n_batches` and `scroll_pause_time` are hand-picked. If they are decreased, the feed
    will not be loaded completely; if they are increased, the feed gets exhausted and the code runs idle without
    loading anything new.
    :param css_selector: Valid CSS selector for the link elements to collect
    :param n_batches: How many scroll events to send
    :param scroll_pause_time: How long to wait before sending the next scroll event
    :return: Generator yielding the href attribute of every element matching the given selector
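
    Example (a minimal sketch; the selector below is the one used by review_urls_from_feed)::

        for href in scrape_feed("div#container div.otzyv_box_float a.user-review"):
            print(href)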
"""
with init_driver(headless=False) as driver:
driver.get("https://www.otzyvua.net/uk/meditsina/kliniki/")
last_height = driver.execute_script("return document.body.scrollHeight")
for i in range(n_batches):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(scroll_pause_time)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
link_elems = driver.find_elements(By.CSS_SELECTOR, css_selector)
for elem in link_elems:
yield elem.get_attribute('href')
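

if __name__ == '__main__':
    # A minimal usage sketch, not part of the original module: collect a handful of review URLs from
    # the endless feed, then print the text of each review. Slicing to 5 URLs is an arbitrary choice
    # for demonstration; drop the slice to process everything the feed loads.
    import itertools

    sample_urls = list(itertools.islice(review_urls_from_feed(), 5))
    for review_text in reviews(sample_urls):
        print(review_text)

    # Alternatively, scrape the review links of a single clinic (the URL below is a hypothetical
    # placeholder, not a real clinic page):
    # for href in review_urls_from_clinic_page("https://www.otzyvua.net/uk/some-clinic"):
    #     print(href)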