Skip to content

Commit

Permalink
Merge branch 'ca_on_markham_fix'
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Nov 1, 2024
2 parents 361c1e3 + 4853a95 commit 350e477
Showing 1 changed file with 66 additions and 50 deletions.
116 changes: 66 additions & 50 deletions ca_on_markham/people.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,78 @@
import re

from utils import CanadianPerson as Person
from utils import CanadianScraper

# Pages fetched by the scraper: the council roster (regional and ward
# councillors) and the mayor's office page. Only the effective (last-assigned)
# URLs are kept; the earlier "/wps/portal/..." assignments were overwritten
# immediately and never read.
COUNCIL_PAGE = "https://www.markham.ca/about-city-markham/city-hall/regional-ward-councillors"
MAYOR_PAGE = "https://www.markham.ca/about-city-markham/city-hall/mayors-office"


class MarkhamPersonScraper(CanadianScraper):
def scrape(self):
    """Yield the mayor, then one Person for each councillor on COUNCIL_PAGE.

    The council page must contain exactly two card grids. Every child of the
    first grid is emitted as a "Regional Councillor" with a numbered seat in
    the "Markham" district; every child of the second grid is emitted as a
    "Councillor" whose district is the card's first paragraph text with the
    word "Councillor" removed.

    Raises:
        AssertionError: if the page does not contain exactly two grids.
        IndexError: if a card lacks the expected heading, paragraph, image
            or link.
    """
    # The mayor lives on a separate page and is emitted exactly once.
    yield self.scrape_mayor(MAYOR_PAGE)

    # Fetch the council page once; each matched grid holds one group of cards.
    groups = self.lxmlize(COUNCIL_PAGE).xpath(
        '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]'
    )
    assert len(groups) == 2, "No councillors found"

    regional_councillor_seat_number = 1
    for i, group in enumerate(groups):
        for councillor in group:
            name = councillor.xpath(".//h3/text()")[0].strip()
            district = councillor.xpath(".//p/text()")[0].strip()

            if i == 0:
                # First grid: regional councillors share the city-wide
                # district, so they are told apart by a running seat number.
                role = "Regional Councillor"
                district = f"Markham (seat {regional_councillor_seat_number})"
                regional_councillor_seat_number += 1
            else:
                # Second grid: the paragraph text carries the ward label
                # together with the word "Councillor"; keep only the ward.
                role = "Councillor"
                district = district.replace("Councillor", "").strip()

            image = councillor.xpath(".//img/@src")[0]
            url = councillor.xpath(".//a/@href")[0]

            # Contact details come from the councillor's own page.
            address, phone, email, links = self.get_contact(url)

            p = Person(primary_org="legislature", name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = image
            p.add_contact("address", address, "legislature")
            p.add_contact("voice", phone, "legislature")
            p.add_contact("email", email)

            for link in links:
                p.add_link(link)

            yield p

def get_contact(self, url):
page = self.lxmlize(url)

contact_node = page.xpath('//div[@class="vcard col-sm-6"]')[0]
contact_node = page.xpath(
'//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]'
)[0]
links = []

address = contact_node.xpath(".//p/text()")[:2]
if contact_node.xpath('.//span[@class="address-line1"]/text()'):
address = (
contact_node.xpath('.//span[@class="address-line1"]/text()')[0]
+ " "
+ contact_node.xpath('.//span[@class="locality"]/text()')[0]
+ " "
+ contact_node.xpath('.//span[@class="administrative-area"]/text()')[0]
+ " "
+ contact_node.xpath('.//span[@class="postal-code"]/text()')[0]
+ " "
+ contact_node.xpath('.//span[@class="country"]/text()')[0]
)
else:
contact_node = page.xpath(
'//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]'
)[0]
address = contact_node.xpath(".//p/text()")[0] + " " + contact_node.xpath(".//p/text()")[1]

links = get_links(contact_node)
phone = self.get_phone(contact_node)
email = self.get_email(contact_node)
Expand All @@ -68,12 +81,15 @@ def get_contact(self, url):

def scrape_mayor(self, url):
page = self.lxmlize(url)
name = page.xpath('//img/@alt[contains(., "Mayor")]')[0].split(", ", 1)[1]
email = self.get_email(page)
phone = self.get_phone(page)
name = page.xpath(
'.//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()'
)[0]
contact_node = page.xpath('.//div[@class="dept-contact-info--block"]')[0]
email = self.get_email(contact_node)
phone = self.get_phone(contact_node)

p = Person(primary_org="legislature", name=name, district="Markham", role="Mayor")
p.image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0]
p.image = page.xpath('.//div[@class="align-right media--image"]/div/img/@src')[0]
p.add_contact("email", email)
p.add_contact("voice", phone, "legislature")
p.add_source(url)
Expand All @@ -86,6 +102,6 @@ def get_links(elem):
links = elem.xpath(".//a")
for link in links:
link = link.attrib["href"]
if "http://www.markham.ca" not in link and "mail" not in link:
if "http://www.markham.ca" not in link and "mail" not in link and "tel" not in link:
links_r.append(link)
return links_r

0 comments on commit 350e477

Please sign in to comment.