Merge pull request #452 from dandantheitman/bolton-scraper-fix-29112023
Fix scraper for Bolton
dp247 authored Nov 29, 2023
2 parents 7c162b1 + 815ceb6 commit 778e4b0
Showing 2 changed files with 57 additions and 10 deletions.
7 changes: 4 additions & 3 deletions uk_bin_collection/tests/input.json
@@ -40,10 +40,11 @@
     },
     "BoltonCouncil": {
         "skip_get_url": true,
-        "uprn": "100010886949|21 HEATON AVENUE, BOLTON, BL1 5PQ",
-        "url": "https://www.bolton.gov.uk/next-bin-collection",
+        "postcode": "BL1 5PQ",
+        "uprn": "100010886949",
+        "url": "https://carehomes.bolton.gov.uk/bins.aspx",
         "wiki_name": "Bolton Council",
-        "wiki_note": "To get the UPRN, you will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN, and the Bolton Council [website](https://www.bolton.gov.uk/next-bin-collection) to find the full address. See [here](https://github.com/robbrad/UKBinCollectionData/issues/272) for more information."
+        "wiki_note": "To get the UPRN, you will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search). Previously required a single field containing the UPRN and full address; now requires the UPRN and postcode as separate fields."
     },
     "BristolCityCouncil": {
         "skip_get_url": true,
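
For context on how the reshaped entry is consumed: the council scraper below reads uprn and postcode from **kwargs. A minimal sketch of a direct call, assuming the repo's usual CouncilClass entry point (the project normally drives this via its CLI and test harness; page can be empty here because the scraper fetches its own pages when skip_get_url is set):

    from uk_bin_collection.uk_bin_collection.councils.BoltonCouncil import CouncilClass

    # Hypothetical direct invocation, for illustration only
    council = CouncilClass()
    result = council.parse_data(
        "",                   # page body unused: skip_get_url is true
        uprn="100010886949",  # from FindMyAddress
        postcode="BL1 5PQ",   # now supplied separately from the UPRN
    )
    print(result["bins"])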
60 changes: 53 additions & 7 deletions uk_bin_collection/uk_bin_collection/councils/BoltonCouncil.py
@@ -15,8 +15,15 @@ def parse_data(self, page: str, **kwargs) -> dict:
         user_uprn = kwargs.get("uprn")
         check_uprn(user_uprn)
 
+        user_postcode = kwargs.get("postcode")
+        check_postcode(user_postcode)
+
         data = {"bins": []}
 
+        # Start a new session
+        requests.packages.urllib3.disable_warnings()
+        s = requests.session()
+
         headers = {
             'authority': 'www.bolton.gov.uk',
             'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -36,27 +43,65 @@ def parse_data(self, page: str, **kwargs) -> dict:
             'upgrade-insecure-requests': '1',
             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.5563.147 Safari/537.36',
         }
 
+        # Get our initial session running
+        response = s.get("https://carehomes.bolton.gov.uk/bins.aspx", headers=headers)
+
-        req_data = {
-            'uprn': user_uprn,
+        soup = BeautifulSoup(response.text, features="html.parser")
+        soup.prettify()
+
+        # Grab the variables needed to continue
+        payload = {
+            "__VIEWSTATE": (
+                soup.find("input", {"id": "__VIEWSTATE"}).get("value")
+            ),
+            "__VIEWSTATEGENERATOR": (
+                soup.find("input", {"id": "__VIEWSTATEGENERATOR"}).get("value")
+            ),
+            "__EVENTVALIDATION": (
+                soup.find("input", {"id": "__EVENTVALIDATION"}).get("value")
+            ),
+            "txtPostcode": (user_postcode),
+            "btnSubmit": "Submit"
         }
 
-        requests.packages.urllib3.disable_warnings()
-        response = requests.post('https://www.bolton.gov.uk/next-bin-collection', headers=headers, data=req_data)
+        # Get the address selection page
+        response = s.post("https://carehomes.bolton.gov.uk/bins.aspx", data=payload, headers=headers)
 
         soup = BeautifulSoup(response.text, features="html.parser")
         soup.prettify()
 
+        # Grab the variables needed to continue
+        payload = {
+            "__VIEWSTATE": (
+                soup.find("input", {"id": "__VIEWSTATE"}).get("value")
+            ),
+            "__VIEWSTATEGENERATOR": (
+                soup.find("input", {"id": "__VIEWSTATEGENERATOR"}).get("value")
+            ),
+            "__EVENTVALIDATION": (
+                soup.find("input", {"id": "__EVENTVALIDATION"}).get("value")
+            ),
+            "txtPostcode": (user_postcode),
+            "ddlAddresses": (user_uprn)
+        }
+
+        # Get the final page with the actual bin data
+        response = s.post("https://carehomes.bolton.gov.uk/bins.aspx", data=payload, headers=headers)
+
+        soup = BeautifulSoup(response.text, features="html.parser")
+        soup.prettify()
+
         collections = []
 
         # Find section with bins in
-        sections = soup.find_all("div", {"class": "media-body"})
+        sections = soup.find_all("div", {"class": "bin-info"})
 
         # For each bin section, get the text and the list elements
         for item in sections:
-            words = item.find_next("p", {"class": "media-heading"}).text.split()[2:4]
+            words = item.find_next("strong").text.split()[2:4]
             bin_type = ' '.join(words).capitalize()
-            date_list = item.find_all("li")
+            date_list = item.find_all("p")
             for d in date_list:
                 next_collection = datetime.strptime(d.text.strip(), "%A %d %B %Y")
                 collections.append((bin_type, next_collection))
@@ -73,3 +118,4 @@ def parse_data(self, page: str, **kwargs) -> dict:
         data["bins"].append(dict_data)
 
         return data
+
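A note for reviewers on the pattern the new code uses: carehomes.bolton.gov.uk/bins.aspx is an ASP.NET WebForms page, so every POST must echo back the hidden __VIEWSTATE, __VIEWSTATEGENERATOR, and __EVENTVALIDATION values from the response it is replying to, which is why the scraper re-parses them between the postcode and address steps. A minimal sketch of that flow in isolation (the webforms_state helper is hypothetical, not part of this PR):

    import requests
    from bs4 import BeautifulSoup

    URL = "https://carehomes.bolton.gov.uk/bins.aspx"

    def webforms_state(soup: BeautifulSoup) -> dict:
        # Collect the hidden fields every WebForms postback must echo back
        return {
            field: soup.find("input", {"id": field}).get("value")
            for field in ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION")
        }

    s = requests.session()
    soup = BeautifulSoup(s.get(URL).text, features="html.parser")

    # Step 1: submit the postcode, as the new scraper does
    payload = dict(webforms_state(soup), txtPostcode="BL1 5PQ", btnSubmit="Submit")
    soup = BeautifulSoup(s.post(URL, data=payload).text, features="html.parser")

    # Step 2: re-read the fresh state, then select the address by UPRN
    payload = dict(webforms_state(soup), txtPostcode="BL1 5PQ", ddlAddresses="100010886949")
    soup = BeautifulSoup(s.post(URL, data=payload).text, features="html.parser")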