# scrape_simplyhired.py
import datetime
import urllib.parse
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from rich import print
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

base_url = "https://www.simplyhired.com"
# Function to build the URL to search
def build_simplyhired_job_search_url(job_title,
                                     location,
                                     radius=None,
                                     fromage=None):
    """
    Builds a URL for searching jobs on SimplyHired with a specified job title,
    location, optional search radius, and optional posting age.

    Args:
        job_title: The job title for the search (string).
        location: The desired location for the job search (city, state, or zip code).
        radius: The search radius in miles (optional, integer).
        fromage: Only include jobs posted within this many days (optional, integer).

    Returns:
        A string containing the formatted SimplyHired job search URL.
    """
    # SimplyHired base URL for job search
    simplyhired_base_url = "https://www.simplyhired.com/search?"

    # Encode job title and location for safe URL inclusion
    encoded_job_title = urllib.parse.quote(job_title)
    encoded_location = urllib.parse.quote(location)

    # Build the URL with job title and location parameters
    search_url = simplyhired_base_url + f"q={encoded_job_title}&l={encoded_location}"

    # Add radius and posting-age parameters if provided
    if radius:
        search_url += f"&sr={radius}"
    if fromage:
        search_url += f"&t={fromage}"
    return search_url
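
# For reference, a call like
#   build_simplyhired_job_search_url("Dental Practices", "Boston, MA", radius=100, fromage=7)
# produces
#   https://www.simplyhired.com/search?q=Dental%20Practices&l=Boston%2C%20MA&sr=100&t=7
# where `t` is assumed (based on how this script uses it) to be SimplyHired's
# posting-age-in-days parameter.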
# List of the searches to be performed (job title # location # radius)
search_params = [
    'Dental Practices#Boston, MA# 100 miles',
    'Dental Practices#Houston, TX# 100 miles',
    'Dental Practices#Greensboro, NC# 50 miles',
    'Dental Practices#High Point, NC# 50 miles',
    'Dental Practices#Winston-Salem, NC# 50 miles',
    'Dental Practices#Los Angeles, CA# 100 miles',
    'Dental Practices#Cleveland, OH# 100 miles',
]
# Preprocess a search param into the pieces needed to build the link.
# Not the most robust format, but it is easy to modify by hand.
def preprocess_links(search_param):
    job, location, distance = search_param.split('#')
    dist = int(distance.strip().split(' ')[0])
    time = 7  # only fetch jobs posted within the last 7 days
    return (job, location, dist, time)
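
# For example, preprocess_links('Dental Practices#Boston, MA# 100 miles')
# returns ('Dental Practices', 'Boston, MA', 100, 7).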
# Build the search links from the parsed params
parsed_output = [preprocess_links(search_param) for search_param in search_params]

links = []
for parsed in parsed_output:
    job, location, distance, fromage = parsed
    link = build_simplyhired_job_search_url(job, location, distance, fromage)
    links.append(link)
# Function to extract the details of a single job card from the loaded page
def extract_job_details(job_card):
    """
    Extracts key features (title, company, location, salary, snippet, date,
    job URL) from a single job-card HTML element.

    Args:
        job_card: A BeautifulSoup element for one job listing card.

    Returns:
        A dictionary containing the extracted key features of the job.
    """
    # Extract job title from the heading's link
    job_title_element = job_card.find('h2', class_='chakra-text')
    job_title_link = job_title_element.find('a') if job_title_element else None
    job_title = job_title_link.text.strip() if job_title_link else None

    # Extract company name (searched within the card, not the whole page)
    company_element = job_card.find('span', class_='css-lvyu5j')
    company_name = company_element.text.strip() if company_element else None

    # Extract location via its data-testid attribute
    location_element = job_card.find('span', attrs={'data-testid': 'searchSerpJobLocation'})
    location = location_element.text.strip() if location_element else None

    # Extract salary
    salary_element = job_card.find('p', class_='chakra-text css-1g1y608')
    salary = salary_element.text.strip() if salary_element else None

    # Extract job snippet
    snippet_element = job_card.find('p', class_='chakra-text css-jhqp7z')
    snippet = snippet_element.text.strip() if snippet_element else None

    # Extract date posted
    date_element = job_card.find('p', class_='chakra-text css-5yilgw')
    date_posted = date_element.text.strip() if date_element else None

    # Extract the job URL, guarding against a missing heading or link
    job_url_element = job_card.find('h2', class_='chakra-text css-8rdtm5')
    job_url_link = job_url_element.find('a') if job_url_element else None
    job_url = (base_url + job_url_link['href']) if job_url_link else None

    # Collect the extracted features into a dictionary
    job_details = {
        'Title': job_title,
        'Company': company_name,
        'Location': location,
        'Salary': salary,
        'Snippet': snippet,
        'Date Posted': date_posted,
        'Job Url': job_url
    }
    return job_details
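
# Note: the chakra-*/css-* class names targeted above are generated by
# SimplyHired's front-end build and are likely to change over time, so treat
# these selectors as a snapshot; any element that goes missing simply comes
# back as None in the resulting dictionary.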
# Chrome options (uncomment --headless to run without a visible browser window)
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Start the driver (webdriver-manager downloads a matching chromedriver)
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
                           options=chrome_options)
# Scrape each search results page
job_list = []
for link in links:
    browser.get(link)
    sleep(3)  # give the page a moment to finish rendering its job cards
    content = browser.page_source
    soup = BeautifulSoup(content, 'html.parser')
    job_cards = soup.find_all('div', class_='css-f8dtpc')
    job_list.extend([extract_job_details(job_card) for job_card in job_cards])
# Convert to a DataFrame
df = pd.DataFrame(job_list)
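
# The three NC searches use overlapping 50-mile radii, so the same posting can
# show up under more than one search; df.drop_duplicates(subset=['Job Url'])
# would be one way to collapse repeats if needed.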
# Timestamp used to name the CSV file to be saved; avoid ':' in the time
# portion so the filename is also valid on Windows
current_datetime = datetime.datetime.now()
formatted_time = current_datetime.strftime('%Y-%m-%d %H-%M-%S')

# Save the dataframe to a CSV file and shut the browser down
df.to_csv(f'scraped datasets/simplyhired {formatted_time}.csv', index=False)
browser.quit()
print('Websites successfully scraped')