city_job_scraper.py (forked from ChrisDolph/scrape_city_jobs_site)

# -*- coding: utf-8 -*-
"""
Created on Thurs Oct 10 2017
Author: Christopher Dolph

Scrapes the current job postings from the City of Austin jobs site
(https://www.austincityjobs.org) and writes them to jobs.csv.
"""
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd

# Site root; prepended to the relative hrefs found in the search results.
SEARCH_LINK_SUBSTR = 'https://www.austincityjobs.org'


def get_html(url, verify=True):
    """get_html uses requests to return a raw html string from a url"""
    r = requests.get(url, verify=verify)
    raw_html = r.text
    return raw_html
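
# Illustrative usage sketch (not part of the scraper's flow; assumes the site
# is reachable and, like the rest of this script, that certificate
# verification is turned off):
#
#   search_html = get_html('https://www.austincityjobs.org/postings/search',
#                          verify=False)
#   print(search_html[:200])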


def get_pages():
    """get_pages returns a list of urls from the City of Austin's job search site,
    one per page of job postings. The number of pages varies with the total
    number of jobs advertised on the site."""
    search_link = 'https://www.austincityjobs.org/postings/search'
    site_html = get_html(search_link, False)
    # Only parse the pagination div; its links point to the other result pages.
    page_soup = BeautifulSoup(site_html, 'html.parser',
                              parse_only=SoupStrainer('div', {"class": "pagination"}))
    pagination = page_soup.find_all(href=True)
    pages_to_append = list(set([a['href'] for a in pagination]))
    # Start from page 1 explicitly, then add every page linked from the pagination bar.
    pages = ['https://www.austincityjobs.org/postings/search?page=1']
    for i in pages_to_append:
        pages.append(SEARCH_LINK_SUBSTR + i)
    pages.sort()
    return pages
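
# Illustrative sketch of the return value (the number of pages depends on how
# many jobs are currently posted, so the exact list will vary):
#
#   pages = get_pages()
#   print(pages[0])    # 'https://www.austincityjobs.org/postings/search?page=1'
#   print(len(pages))  # one url per results page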


def get_job_links(some_html):
    """get_job_links works on a single results page. It takes a raw html string as an
    argument, parses it with BeautifulSoup, and returns a list of urls to the current
    job postings on the City of Austin's website."""
    job_links = []
    # Only parse the job-title cells; each one contains the link to a posting.
    links_soup = BeautifulSoup(some_html, 'html.parser',
                               parse_only=SoupStrainer('td', {"class": "job-title"}))
    for link in links_soup.find_all('a'):
        job_links.append(SEARCH_LINK_SUBSTR + link.get('href'))
    return job_links
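
# Illustrative sketch (assumes `page_html` holds the raw HTML of one search
# results page, e.g. obtained via get_html(pages[0], False)):
#
#   links = get_job_links(page_html)
#   # links is a list of absolute posting urls built from SEARCH_LINK_SUBSTR
#   # plus each posting's relative href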


def compile_links():
    """compile_links consolidates the lists of urls generated by get_job_links
    into one list."""
    pages_to_parse = get_pages()
    all_links = []
    for i in pages_to_parse:
        html = get_html(i, False)
        all_links += get_job_links(html)
    return all_links
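
# Illustrative sketch (makes one request per results page, so it can take a
# little while when many jobs are posted):
#
#   all_links = compile_links()
#   print(len(all_links))  # total number of postings found across all pages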


def get_tables(link):
    """get_tables collects the table rows (tr tags) from a single posting page."""
    html = get_html(link, False)
    table_soup = BeautifulSoup(html, 'html.parser')
    table = table_soup.find_all("tr")
    return table


def get_headers(table_rows):
    """get_headers gathers only the th tags from an html table to set the headers
    for a csv export"""
    headers = [i.find('th').text for i in table_rows]
    return headers


def get_rows(table):
    """get_rows gathers only the td tags from an html table for a csv export"""
    row = [i.find('td').text for i in table]
    return row
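
# Illustrative sketch of how the three helpers above fit together (assumes
# `link` is one posting url from compile_links() and that each table row on
# the posting page holds a th label and a td value):
#
#   table = get_tables(link)
#   labels = get_headers(table)  # the th text from each row
#   values = get_rows(table)     # the matching td text from each row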


def build_dataframe(links):
    """build_dataframe constructs a pandas dataframe one row at a time, where each
    row is the html table data scraped from one posting url."""
    headers = get_headers(get_tables(links[0]))  # look at the first link only for headers
    df = pd.DataFrame(columns=headers)
    try:
        for i, link in enumerate(links):
            rows = get_rows(get_tables(link))
            # not all tables are of equal length, some are one column shorter
            if len(rows) < 22:
                rows.append('null')
            df.loc[i] = rows
    finally:
        # If a request or parse fails partway through, return whatever rows were
        # collected so far (the return in finally suppresses the exception).
        return df
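
# Quick sanity-check sketch (assumes network access; fetches only the first
# few postings rather than the whole site):
#
#   sample_links = compile_links()[:3]
#   print(build_dataframe(sample_links))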


def main():
    """Scrape every posting and write the results to jobs.csv."""
    the_links = compile_links()
    data_frame = build_dataframe(the_links)
    data_frame.to_csv('jobs.csv')


# call main() when run as a script
if __name__ == "__main__":
    main()