Skip to content

Commit

Permalink
feat: #1063 - rewrite Kirklees Council parser for new website
Browse files Browse the repository at this point in the history
  • Loading branch information
dp247 committed Dec 19, 2024
1 parent b1d8358 commit 048b75f
Showing 1 changed file with 64 additions and 53 deletions.
117 changes: 64 additions & 53 deletions uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import time
from datetime import datetime
from typing import Optional

from bs4 import BeautifulSoup
from selenium.common import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.drivers.chrome import ChromeDriver

from selenium import webdriver

from uk_bin_collection.uk_bin_collection.common import create_webdriver
from uk_bin_collection.uk_bin_collection.common import date_format
Expand Down Expand Up @@ -55,78 +60,84 @@ def _parse_data(self, page: str, **kwargs) -> dict:
- Extract info from the 'alt' attribute of the images on that page
"""
bins = []
data = {"bins": []}
collections = []

user_paon = kwargs["paon"]
user_postcode = kwargs["postcode"]

self._driver = driver = create_webdriver(
web_driver=kwargs["web_driver"],
headless=kwargs.get("headless", True),
session_name=__name__,
)
self._driver = driver = webdriver.Chrome()
# self._driver = driver = create_webdriver(
# web_driver=kwargs["web_driver"],
# headless=kwargs.get("headless", True),
# session_name=__name__,
# )
driver.implicitly_wait(1)

driver.get(
"https://www.kirklees.gov.uk/beta/your-property-bins-recycling/your-bins/default.aspx"
"https://my.kirklees.gov.uk/service/Bins_and_recycling___Manage_your_bins"
)

wait_for_element(
driver, By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoPremises"
)
time.sleep(5)

# Switch to iframe
iframe = driver.find_element(By.CSS_SELECTOR, "#fillform-frame-1")
driver.switch_to.frame(iframe)

house_input = driver.find_element(
By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoPremises"
wait_for_element(
driver, By.ID, "mandatory_Postcode", timeout=10
)
house_input.send_keys(user_paon)

postcode_input = driver.find_element(
By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoSearch"
By.ID, "Postcode"
)
postcode_input.send_keys(user_postcode)

# submit address search
driver.find_element(By.ID, "butGeoSearch").send_keys(Keys.RETURN)
wait_for_element(driver, By.ID, "List")
time.sleep(2)

WebDriverWait(driver, 10).until(
EC.element_to_be_clickable(
(
By.XPATH,
"//select[@name='List']//option[contains(., '"
+ user_paon
+ "')]",
)
)
).click()

wait_for_element(
driver,
By.ID,
"cphPageBody_cphContent_wtcDomestic240__lnkAccordionAnchor",
# submitting can be slow
timeout=30,
)
time.sleep(10)

# Open the panel
driver.find_element(
By.ID, "cphPageBody_cphContent_wtcDomestic240__lnkAccordionAnchor"
).click()
# For whatever reason, the page sometimes automatically goes to the next step
next_button = driver.find_element(By.XPATH, '/html/body/div/div/section/form/div/nav/div[2]/button')
if next_button.is_displayed():
next_button.click()

# Domestic waste calendar
wait_for_element(
driver, By.ID, "cphPageBody_cphContent_wtcDomestic240__LnkCalendar"
)
calendar_link = driver.find_element(
By.ID, "cphPageBody_cphContent_wtcDomestic240__LnkCalendar"
)
driver.execute_script("arguments[0].click();", calendar_link)

# <img alt="Recycling collection date 14 March 2024"
# <img alt="Domestic collection date 21 March 2024
date_strings = driver.find_elements(
By.CSS_SELECTOR, 'img[alt*="collection date"]'
)
time.sleep(5)

for date in date_strings:
bin_type, _, _, day, month, year = date.get_attribute("alt").split()
collection_date = datetime.strptime(
f"{day} {month} {year}", "%d %B %Y"
).strftime(date_format)

bins.append(
{
"type": bin_type,
"collectionDate": collection_date,
}
)
soup = BeautifulSoup(self._driver.page_source, features="html.parser")
soup.prettify()

radio_button_text = soup.find_all("label", {"class": "radio-label"})
for label in radio_button_text:
parsed_text = label.text.split("x ")
row = parsed_text[1].lower().split("collection date: ")
bin_type = row[0].split("(")[0].strip()
date_text = row[1].strip().replace(")", "")
if date_text == "today":
bin_date = datetime.now()
else:
bin_date = datetime.strptime(date_text, "%A %d %B %Y")
collections.append((bin_type, bin_date))

ordered_data = sorted(collections, key=lambda x: x[1])
for item in ordered_data:
dict_data = {
"type": item[0].replace("standard ", "").capitalize(),
"collectionDate": item[1].strftime(date_format),
}
data["bins"].append(dict_data)

return {"bins": bins}
return data

0 comments on commit 048b75f

Please sign in to comment.