# Scraps.py
import itertools
import pandas as pd
from urllib import robotparser
from urllib import request as u
from sys import exit
from bs4 import BeautifulSoup
# Module-level dictionaries and lists that accumulate the parsed results.
Dict, profile, profile_company = {}, {}, {}
# Lists that collect one entry per parsed search-result post
location_list = []
company_list = []
job_title = []
job_urls = []
date_list = []
def allow():
    '''
    Check the site's robots.txt before any parsing begins. If robots.txt denies
    the required access, the caller halts the program and displays a message
    explaining that we do not have the right to parse the site.
    '''
    rp = robotparser.RobotFileParser()
    rp.set_url("http://be.indeed.com/robots.txt")
    rp.read()
    access = rp.can_fetch("*", "http://be.indeed.com/jobs?q=python%20analyst&l=CA")
    return access
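# allow() is consulted by pull_job() before every request; a minimal sketch of that check:
#
#     if not allow():
#         exit('Robot not allowed, quitting')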
class Parser:
    def __init__(self):
        self.df = pd.DataFrame([])
        self.url = ''
        self.jobstats = pd.DataFrame([])
        self.textdata = pd.DataFrame([])
    def pull_job_all(self, job, region='2220', radius=30, date=True):
        '''
        :param job: (str) the job query to search for
        :param region: (str) postal code to centre the search on, Default = '2220'
        :param radius: (int) search radius in km, Default = 30
        :param date: (bool) sort the results by date, Default = True
        :return: (jobstats, textdata) DataFrames holding the job counts and the parsed listings
        '''
        jobstats = self.jobstats
        textdata = self.textdata
        # fetch the first result page and the total number of jobs
        soup = self.pull_job(job, region, radius, date)
        # NOTE: DataFrame.append was removed in pandas 2.0; this script targets the older API
        jobstats = jobstats.append(self.data_parse(soup, job))
        textdata = textdata.append(self.list_jobs(soup, job))
        count, stop = '0', 0  # defaults in case the page has no searchCount element
        for text in soup.find_all("div", id="searchCount"):
            count = str(text.get_text()).split(' ')[-1]  # keep only the last number, the total count
            stop = int(count) // 10
        print('found {count} jobs for {job}'.format(job=job, count=count))
        # walk the remaining result pages, 10 listings at a time
        for page in range(10, stop * 10, 10):
            soup = self.pull_job(job, region, radius, date, page=page)
            jobstats = jobstats.append(self.data_parse(soup, job), ignore_index=True)
            #print(jobstats)
            textdata = textdata.append(self.list_jobs(soup, job), ignore_index=True)
        return jobstats, textdata
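    # pull_job_all() returns a pair of DataFrames: jobstats holds one [query, count] row
    # per fetched result page, textdata the concatenated listings from every page.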
    def pull_job(self, job, region='2220', radius=30, date=True, page=0):
        '''
        :param job: the job query we are searching for
        :param region: the region (preferably a postal code), Default = '2220'
        :param radius: the range we want to search within (km), Default = 30
        :param date: (bool) sort the results by date, Default = True
        :param page: (int) result offset to start from, Default = 0
        :return: the BeautifulSoup object passed on to data_parse
        '''
        # first check whether we are actually allowed to access the site
        if not allow():
            exit('Robot not allowed, quitting')
        # check that a job query was given, else raise an error
        if not job:
            raise ValueError('need a job')
        # build the base search URL
        url = "http://be.indeed.com/jobs?q=" + str(job).replace(' ', '%2B') \
              + "&l=" + str(region) \
              + "&radius=" + str(radius) \
              + "&fromage=last"
        if date:
            url += '&sort=date'
        if page:
            url += '&start=' + str(page)
        response = u.urlopen(url)
        response = response.read()
        soup = BeautifulSoup(response, "html.parser")
        #print(url)
        #print(soup.prettify())
        return soup
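    # For example, pull_job('process engineer', region='2220', radius=30, date=True, page=10)
    # requests a URL of roughly this shape (built from the pieces above):
    #   http://be.indeed.com/jobs?q=process%2Bengineer&l=2220&radius=30&fromage=last&sort=date&start=10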
    def data_parse(self, soup, job):
        '''
        :param soup: bs4 object for one result page, produced by pull_job()
        :param job: (str) the job query, used to label the count so we can later
            determine which query has the largest number of jobs
        :return: a [job, count] pair that pull_job_all() collects into a Pandas DataFrame
            showing the quantity of jobs available per query; for instance,
            Python engineer - 10,000 jobs available, Python Analyst - 5,000 jobs, etc.
        '''
        # Access was already granted by allow(); parse the page and pull out the
        # total number of jobs available for this query.
        for text in soup.find_all("div", id="searchCount"):
            data = str(text.get_text()).split(' ')[-1]  # keep only the last number, the total count
            job2 = job.replace("+", " ")
            return [job2, data]
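    # data_parse() returns a two-element list such as ['python analyst', '243']
    # (count shown is illustrative), or None when the page has no searchCount element.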
    def list_jobs(self, soup, job):
        for post in soup.find_all("div", {"class": " row result"}):
            # job title
            jobs = post.find_all("a", {"class": "turnstileLink"})
            job_contents = (job.get_text(' ', strip=True)[:25] for job in jobs)
            job_url = ('http://be.indeed.com' + job['href'] for job in jobs if job['href'])
            job_urls.append(job_url)
            job_title.append(job_contents)
            # company name
            companies = post.find_all("span", {"itemprop": "name"})
            company_content = (company.get_text(' ', strip=True)[:20] for company in companies)
            company_list.append(company_content)
            # location
            locations = post.find_all("span", {"itemprop": "addressLocality"})
            locality = (location.get_text(' ', strip=True)[:15] for location in locations)
            location_list.append(locality)
            # posting date
            dates = post.find_all("span", {"class": "date"})
            date = (date.get_text(' ', strip=True).split(' ')[0] for date in dates)
            date_list.append(date)
        # return location_list
        profile["Job Title"] = list(itertools.chain.from_iterable(job_title))
        profile["Location"] = list(itertools.chain.from_iterable(location_list))
        profile_company["Company"] = list(itertools.chain.from_iterable(company_list))
        profile_company["Date"] = list(itertools.chain.from_iterable(date_list))
        profile_company["url"] = list(itertools.chain.from_iterable(job_urls))
        # Turn the lists into a pandas DataFrame whose columns include the job title,
        # the job location, and the company.
        df3 = pd.DataFrame(profile_company)
        df4 = pd.DataFrame(profile).join(df3, how='left')
        return df4
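    # The DataFrame returned by list_jobs() carries the columns
    # ['Job Title', 'Location', 'Company', 'Date', 'url'], built from the module-level lists above.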
# main function that drives all the other pieces of the script
def main(jobs):
    # Plotly credentials (currently unused: the graphing calls below are commented out)
    username = input("please enter your Plotly Username: \n")
    api_key = input("please enter your Plotly Api Key: \n")
    local = "2220"
    radius = "35"
    for job in jobs:
        # instantiate the classes that are used sequentially
        parser = Parser()
        #salary = SalaryEstimates()
        # functions that are present in these classes respectively
        #soup = parser.pull_job(job)
        #items = parser.data_parse(soup, job)
        items, final = parser.pull_job_all(job, region=local, radius=radius, date=True)
        # write the listings to an Excel file named after the query
        final.to_excel(job + '.xlsx', index=False)
        #final = parser.list_jobs(soup, job)
        print("-~" * 50)
        print("-~" * 50)
        print("the requested job was", job)
        print("-~" * 50)
        print(final)
        print("-~" * 50)
        print("-~" * 50)
        # we don't have a salary posting so we can't use it
        #print("the requested job salary was "+job+" salary")
        #print("-~"*50)
        #wage_compiled = salary.salary_parser(soup)
        #print("\n")
        #print("-~"*50)
        #print("-~"*50)
        #print("The total number of jobs in each field is")
        #print("-~"*50)
        #print(items)
        # compares the total number of jobs visually on Plotly
        #parser.graph_parsed_data(username, api_key)
        # runs the salary graph on Plotly
        #salary.graphing_salary(username, api_key)


if __name__ == '__main__':
    main(["process engineer", 'maintenance engineer', 'project engineer', 'proces ingenieur'])
    #main(["proces ingenieur"])
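# Running this script writes one Excel file per query (job + '.xlsx'),
# e.g. 'process engineer.xlsx', and prints a summary table of the listings for each query.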