Merge pull request #452 from dandantheitman/bolton-scraper-fix-29112023
Fix scraper for Bolton
dp247 authored Nov 29, 2023
2 parents 7c162b1 + 815ceb6 commit 778e4b0
Showing 2 changed files with 57 additions and 10 deletions.
7 changes: 4 additions & 3 deletions uk_bin_collection/tests/input.json
@@ -40,10 +40,11 @@
     },
     "BoltonCouncil": {
         "skip_get_url": true,
-        "uprn": "100010886949|21 HEATON AVENUE, BOLTON, BL1 5PQ",
-        "url": "https://www.bolton.gov.uk/next-bin-collection",
+        "postcode": "BL1 5PQ",
+        "uprn": "100010886949",
+        "url": "https://carehomes.bolton.gov.uk/bins.aspx",
         "wiki_name": "Bolton Council",
-        "wiki_note": "To get the UPRN, you will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN, and the Bolton Council [website](https://www.bolton.gov.uk/next-bin-collection) to find the full address. See [here](https://github.com/robbrad/UKBinCollectionData/issues/272) for more information."
+        "wiki_note": "To get the UPRN, you will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search). Previously required a single field containing the UPRN and full address; now requires the UPRN and postcode as separate fields."
     },
     "BristolCityCouncil": {
         "skip_get_url": true,
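
For context on how the reshaped entry is consumed: the council scraper below reads uprn and postcode from **kwargs. A minimal sketch of a direct call, assuming the repo's usual CouncilClass entry point (the project normally drives this via its CLI and test harness; page can be empty here because the scraper fetches its own pages when skip_get_url is set):

    from uk_bin_collection.uk_bin_collection.councils.BoltonCouncil import CouncilClass

    # Hypothetical direct invocation, for illustration only
    council = CouncilClass()
    result = council.parse_data(
        "",                   # page body unused: skip_get_url is true
        uprn="100010886949",  # from FindMyAddress
        postcode="BL1 5PQ",   # now supplied separately from the UPRN
    )
    print(result["bins"])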
60 changes: 53 additions & 7 deletions uk_bin_collection/uk_bin_collection/councils/BoltonCouncil.py
@@ -15,8 +15,15 @@ def parse_data(self, page: str, **kwargs) -> dict:
         user_uprn = kwargs.get("uprn")
         check_uprn(user_uprn)
 
+        user_postcode = kwargs.get("postcode")
+        check_postcode(user_postcode)
+
         data = {"bins": []}
 
+        # Start a new session
+        requests.packages.urllib3.disable_warnings()
+        s = requests.session()
+
         headers = {
             'authority': 'www.bolton.gov.uk',
             'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -36,27 +43,65 @@ def parse_data(self, page: str, **kwargs) -> dict:
             'upgrade-insecure-requests': '1',
             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.5563.147 Safari/537.36',
         }
 
+        # Get our initial session running
+        response = s.get("https://carehomes.bolton.gov.uk/bins.aspx", headers=headers)
+
-        req_data = {
-            'uprn': user_uprn,
+        soup = BeautifulSoup(response.text, features="html.parser")
+        soup.prettify()
+
+        # Grab the variables needed to continue
+        payload = {
+            "__VIEWSTATE": (
+                soup.find("input", {"id": "__VIEWSTATE"}).get("value")
+            ),
+            "__VIEWSTATEGENERATOR": (
+                soup.find("input", {"id": "__VIEWSTATEGENERATOR"}).get("value")
+            ),
+            "__EVENTVALIDATION": (
+                soup.find("input", {"id": "__EVENTVALIDATION"}).get("value")
+            ),
+            "txtPostcode": (user_postcode),
+            "btnSubmit": "Submit"
         }
 
-        requests.packages.urllib3.disable_warnings()
-        response = requests.post('https://www.bolton.gov.uk/next-bin-collection', headers=headers, data=req_data)
+        # Get the address selection page
+        response = s.post("https://carehomes.bolton.gov.uk/bins.aspx", data=payload, headers=headers)
 
         soup = BeautifulSoup(response.text, features="html.parser")
         soup.prettify()
 
+        # Grab the variables needed to continue
+        payload = {
+            "__VIEWSTATE": (
+                soup.find("input", {"id": "__VIEWSTATE"}).get("value")
+            ),
+            "__VIEWSTATEGENERATOR": (
+                soup.find("input", {"id": "__VIEWSTATEGENERATOR"}).get("value")
+            ),
+            "__EVENTVALIDATION": (
+                soup.find("input", {"id": "__EVENTVALIDATION"}).get("value")
+            ),
+            "txtPostcode": (user_postcode),
+            "ddlAddresses": (user_uprn)
+        }
+
+        # Get the final page with the actual bin data
+        response = s.post("https://carehomes.bolton.gov.uk/bins.aspx", data=payload, headers=headers)
+
+        soup = BeautifulSoup(response.text, features="html.parser")
+        soup.prettify()
+
         collections = []
 
         # Find section with bins in
-        sections = soup.find_all("div", {"class": "media-body"})
+        sections = soup.find_all("div", {"class": "bin-info"})
 
         # For each bin section, get the text and the list elements
         for item in sections:
-            words = item.find_next("p", {"class": "media-heading"}).text.split()[2:4]
+            words = item.find_next("strong").text.split()[2:4]
             bin_type = ' '.join(words).capitalize()
-            date_list = item.find_all("li")
+            date_list = item.find_all("p")
             for d in date_list:
                 next_collection = datetime.strptime(d.text.strip(), "%A %d %B %Y")
                 collections.append((bin_type, next_collection))
@@ -73,3 +118,4 @@ def parse_data(self, page: str, **kwargs) -> dict:
         data["bins"].append(dict_data)
 
         return data
+
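A note for reviewers on the pattern the new code uses: carehomes.bolton.gov.uk/bins.aspx is an ASP.NET WebForms page, so every POST must echo back the hidden __VIEWSTATE, __VIEWSTATEGENERATOR, and __EVENTVALIDATION values from the response it is replying to, which is why the scraper re-parses them between the postcode and address steps. A minimal sketch of that flow in isolation (the webforms_state helper is hypothetical, not part of this PR):

    import requests
    from bs4 import BeautifulSoup

    URL = "https://carehomes.bolton.gov.uk/bins.aspx"

    def webforms_state(soup: BeautifulSoup) -> dict:
        # Collect the hidden fields every WebForms postback must echo back
        return {
            field: soup.find("input", {"id": field}).get("value")
            for field in ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION")
        }

    s = requests.session()
    soup = BeautifulSoup(s.get(URL).text, features="html.parser")

    # Step 1: submit the postcode, as the new scraper does
    payload = dict(webforms_state(soup), txtPostcode="BL1 5PQ", btnSubmit="Submit")
    soup = BeautifulSoup(s.post(URL, data=payload).text, features="html.parser")

    # Step 2: re-read the fresh state, then select the address by UPRN
    payload = dict(webforms_state(soup), txtPostcode="BL1 5PQ", ddlAddresses="100010886949")
    soup = BeautifulSoup(s.post(URL, data=payload).text, features="html.parser")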