scraping.py
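"""Scrape job listings from Jobindex.dk and The Hub and write them to a CSV file."""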
import requests
import time
import csv
from bs4 import BeautifulSoup
from bs4.element import Comment
from selenium import webdriver
from langdetect import detect
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import TimeoutException
from tqdm import tqdm
def tag_visible(element):
    """Return True if a text node is part of the page's visible content."""
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
# TODO Remove all jobs with company name "Jobindex Kurser"
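# Run Firefox headlessly; a single shared driver instance is reused for every job page.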
options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
def retrieve_jobs(job_keywords, job_tag, job_class, base_url):
    """Fetch all paginated search results for each keyword and return the raw job ads."""
    job_listings = []
    for job_keyword in job_keywords:
        page = 1
        while True:
            search_url = base_url.format(page=page, job=job_keyword)
            response = requests.get(search_url)
            soup = BeautifulSoup(response.text, "html.parser")
            fetched_jobs = soup.find_all(job_tag, job_class)
            # An empty result page means we have run past the last page of results.
            if not fetched_jobs:
                break
            job_listings += fetched_jobs
            page += 1
    return job_listings
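# Shared accumulator for both scrapers; each entry is
# [title, company, location, url, description, language].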
jobs = []
def scrape_jobindex(job_keywords):
    """Scrape Jobindex.dk search results for the given keywords into the global jobs list."""
    base_url = "https://www.jobindex.dk/jobsoegning?page={page}&q={job}%27&subid=1&subid=3&subid=4&subid=6&subid=93"
    job_listings = retrieve_jobs(job_keywords, "div", {"class": ["PaidJob", "jix_robotjob"]}, base_url)
    for job_ad in tqdm(job_listings):
        link = job_ad.find("a", string='Se jobbet')
        if link is None:
            continue
        url = link["href"]
        # Skip ads whose URL has already been collected (the URL is stored at index 3).
        if any(job[3] == url for job in jobs):
            continue
        title = find_element(job_ad, "h4", {"class": ""})
        company = find_element(job_ad, "div", {"class": "jix-toolbar-top__company"})
        location = find_element(job_ad, "span", {"class": "jix_robotjob--area"})
        description = find_description(url, None)
        if description == "Unknown":
            continue
        # Drop ads written in Danish; only non-Danish listings are kept.
        lang = detect(description)
        if lang == "da":
            continue
        jobs.append([title, company, location, url, description, lang])
    return jobs
def scrape_hub(job_keywords):
    """Scrape The Hub search results for the given keywords into the global jobs list."""
    base_url = "https://thehub.io/jobs?search={job}&location=Denmark&countryCode=DK&sorting=mostPopular&page={page}"
    job_listings = retrieve_jobs(job_keywords, "div", {"class": "my-10"}, base_url)
    for job_ad in tqdm(job_listings):
        url = "https://thehub.io" + job_ad.find("a", class_="card-job-find-list__link")["href"]
        if any(job[3] == url for job in jobs):
            continue
        title = find_element(job_ad, "span", {"class": "card-job-find-list__position"})
        # The company name and location are the first two entries of the inline bullet list.
        bullets = job_ad.find("div", {"class": "bullet-inline-list"})
        if bullets is None:
            continue
        company, location = [span.text for span in bullets.find_all("span")][:2]
        description = find_description(url, {"tag": "div", "class": {"class": "text-block"}})
        if description == "Unknown":
            continue
        lang = detect(description)
        if lang == "da":
            continue
        jobs.append([title, company, location, url, description, lang])
    return jobs
def find_element(soup, tag, class_):
    """Return the stripped text of the first matching element, or "Unknown" if absent."""
    element = soup.find(tag, class_)
    return element.text.strip() if element else "Unknown"
def find_description(url, desc_config):
    """Render a job page with Selenium and return its description text.

    With desc_config, only the configured element is extracted;
    otherwise all visible text on the page is joined together.
    """
    driver.set_page_load_timeout(3)
    try:
        driver.get(url)
        time.sleep(1)  # Give client-side rendering a moment to finish.
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        if desc_config:
            return find_element(soup, desc_config['tag'], desc_config['class'])
        else:
            texts = soup.find_all(string=True)
            visible_texts = filter(tag_visible, texts)
            return " ".join(t.strip() for t in visible_texts)
    except TimeoutException:
        print("TimeoutException")
        return "Unknown"
scraped_jobs = scrape_jobindex(["\"Data+Science\"", "\"Machine+Learning\"", "\"Computer+Science\"", "\"Software+Developer\""])
scraped_jobs = scrape_hub(["\"Data Science\"", "\"Machine Learning\""])
print(len(scraped_jobs))
driver.quit()
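# Write the collected listings to disk, one row per job ad.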
csv_file = "job_info.csv"
with open(csv_file, mode="w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Company', 'Location', 'URL', 'Description', 'Language'])
    writer.writerows(scraped_jobs)