From 048b75fc53c87605916f41f7d947bb9b6c8815a1 Mon Sep 17 00:00:00 2001 From: David Park Date: Thu, 19 Dec 2024 11:28:14 +0000 Subject: [PATCH] feat: #1063 - rewrite Kirklees Council parser for new website --- .../councils/KirkleesCouncil.py | 117 ++++++++++-------- 1 file changed, 64 insertions(+), 53 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py b/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py index fd0a299e64..238c978f6c 100644 --- a/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py @@ -1,12 +1,17 @@ +import time from datetime import datetime from typing import Optional +from bs4 import BeautifulSoup from selenium.common import TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.remote.webdriver import WebDriver from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait +from webdriver_manager.drivers.chrome import ChromeDriver + +from selenium import webdriver from uk_bin_collection.uk_bin_collection.common import create_webdriver from uk_bin_collection.uk_bin_collection.common import date_format @@ -55,78 +60,84 @@ def _parse_data(self, page: str, **kwargs) -> dict: - Extract info from the 'alt' attribute of the images on that page """ - bins = [] + data = {"bins": []} + collections = [] user_paon = kwargs["paon"] user_postcode = kwargs["postcode"] - self._driver = driver = create_webdriver( - web_driver=kwargs["web_driver"], - headless=kwargs.get("headless", True), - session_name=__name__, - ) + self._driver = driver = webdriver.Chrome() + # self._driver = driver = create_webdriver( + # web_driver=kwargs["web_driver"], + # headless=kwargs.get("headless", True), + # session_name=__name__, + # ) driver.implicitly_wait(1) driver.get( - "https://www.kirklees.gov.uk/beta/your-property-bins-recycling/your-bins/default.aspx" + "https://my.kirklees.gov.uk/service/Bins_and_recycling___Manage_your_bins" ) - wait_for_element( - driver, By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoPremises" - ) + time.sleep(5) + + # Switch to iframe + iframe = driver.find_element(By.CSS_SELECTOR, "#fillform-frame-1") + driver.switch_to.frame(iframe) - house_input = driver.find_element( - By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoPremises" + wait_for_element( + driver, By.ID, "mandatory_Postcode", timeout=10 ) - house_input.send_keys(user_paon) postcode_input = driver.find_element( - By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoSearch" + By.ID, "Postcode" ) postcode_input.send_keys(user_postcode) - # submit address search - driver.find_element(By.ID, "butGeoSearch").send_keys(Keys.RETURN) + wait_for_element(driver, By.ID, "List") + time.sleep(2) + + WebDriverWait(driver, 10).until( + EC.element_to_be_clickable( + ( + By.XPATH, + "//select[@name='List']//option[contains(., '" + + user_paon + + "')]", + ) + ) + ).click() - wait_for_element( - driver, - By.ID, - "cphPageBody_cphContent_wtcDomestic240__lnkAccordionAnchor", - # submitting can be slow - timeout=30, - ) + time.sleep(10) - # Open the panel - driver.find_element( - By.ID, "cphPageBody_cphContent_wtcDomestic240__lnkAccordionAnchor" - ).click() + # For whatever reason, the page sometimes automatically goes to the next step + next_button = driver.find_element(By.XPATH, '/html/body/div/div/section/form/div/nav/div[2]/button') + if next_button.is_displayed(): + next_button.click() - # Domestic waste calendar - wait_for_element( - driver, By.ID, "cphPageBody_cphContent_wtcDomestic240__LnkCalendar" - ) - calendar_link = driver.find_element( - By.ID, "cphPageBody_cphContent_wtcDomestic240__LnkCalendar" - ) - driver.execute_script("arguments[0].click();", calendar_link) - #