#!/usr/bin/env python
# encoding: utf-8
"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: xinganguo@gmail.com
@site: http://www.xingag.top
@software: PyCharm
@file: spider_lagou.py
@time: 2018/10/10 10:17
@description: Scrape job postings from Lagou.com with Selenium
"""
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import time
import re


# A spider class that encapsulates the whole crawl
class LagouSpider(object):
    driver_path = "/usr/local/bin/chromedriver"

    def __init__(self):
        # Initialize the Chrome driver
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        # Base URL: the Python job listing on Lagou
        self.base_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
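        # Note: the 'list_python' path segment above appears to scope the
        # listing to Python jobs; swap it out to crawl a different keyword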
        # init spider result data
        self.positions = []

    def run(self):
        """
        Crawl the job listings with Selenium.
        :return:
        """
        # 1. Open the base_url
        self.driver.get(self.base_url)
        while True:
            # 2. Wait until the "next page" button is rendered, so the
            #    asynchronously loaded job list is in the page source.
            #    This works the same for page 1, page 2, page 3, ...
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            # 3. Grab and parse the current list page
            source = self.driver.page_source
            self.parse_list_page(source)
            # 4. Find the "next page" control: the last span tag
            #    inside the pager container
            next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
            # 5. Click it
            # Note: make sure we are not already on the last page
            if "pager_next_disabled" in next_btn.get_attribute('class'):
                # Last page: quit the browser and stop
                self.driver.quit()
                break
            else:
                next_btn.click()
            # 6. Sleep 1 second after each page to throttle requests
            time.sleep(1)

    def parse_list_page(self, source):
        """
        Parse one list page and collect the detail-page links.
        :param source: page source HTML
        :return:
        """
        # Approach: every position links out via an <a> tag
        # with class='position_link'
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')
        # Visit each position's detail page
        for link in links:
            self.request_detail_page(link)
            # Note: sleep 1 second after each detail page to throttle requests
            time.sleep(1)

    def request_detail_page(self, detail_url):
        """
        Open a detail page in a new tab.
        :param detail_url: URL of the detail page
        :return:
        """
        # 1. Open the detail page in a new window rather than replacing
        #    the current one, keeping the list page window alive
        # self.driver.get(detail_url)  # would overwrite the list page
        self.driver.execute_script("window.open('%s')" % detail_url)
        self.driver.switch_to.window(self.driver.window_handles[1])
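        # window.open appends the new tab's handle, so index 1 is the
        # detail page while index 0 remains the list page: there are
        # always exactly two windows at this point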
        # 2. Get the detail page's source
        detail_page_source = self.driver.page_source
        # 3. Parse the detail page
        self.parse_detail_page(detail_page_source)
        # 4. Close the detail window (the current one) and switch
        #    the driver back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, detail_page_source):
        """
        Parse a detail page.
        :param detail_page_source: detail page source HTML
        :return:
        """
        html_element = etree.HTML(detail_page_source)
        # 1. Extract each field with XPath
        # [Field] position name
        position_name = html_element.xpath("//div[@class='job-name']/span/text()")[0]
        job_request_spans = html_element.xpath("//dd[@class='job_request']//span")
        # [Field] salary
        salary = job_request_spans[0].xpath('./text()')[0].strip()
        # [Field] city
        # Note: use a regex to strip the separator slashes and whitespace
        city_pre = job_request_spans[1].xpath('./text()')[0].strip()
        city = re.sub(r'[\s/]', '', city_pre)
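        # e.g. a raw span such as "/上海 /" (an assumed sample value)
        # comes out as "上海"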
        # [Field] years of experience
        work_years_pre = job_request_spans[2].xpath('./text()')[0].strip()
        work_years = re.sub(r'[\s/]', '', work_years_pre)
        # [Field] education
        # Strip whitespace and the '/' separator
        education_pre = job_request_spans[3].xpath('./text()')[0].strip()
        education = re.sub(r'[\s/]', '', education_pre)
        # [Field] full-time / part-time
        full_time = job_request_spans[4].xpath('./text()')[0].strip()
        # [Field] job description: a list of text nodes
        desc_pre = html_element.xpath('//dd[@class="job_bt"]//text()')
        # Join the list into one string and strip the leading and
        # trailing whitespace
        desc = ''.join(desc_pre).strip()
        # [Field] company name
        company_name = html_element.xpath('//h2[@class="fl"]/text()')[0].strip()
        position = {
            'position_name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'full_time': full_time,
            'desc': desc,
            'company_name': company_name
        }
        print('==' * 30)
        print('Scraped one position successfully')
        print(position)
        print('==' * 30)
        self.positions.append(position)


if __name__ == '__main__':
    # 1. Create a spider instance
    spider = LagouSpider()
    # 2. Start crawling
    spider.run()
    # 3. Inspect the scraped data
    print(spider.positions)
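
    # A minimal sketch, not part of the original script: persist the
    # results to disk so they survive the run. The filename
    # 'lagou_positions.json' is an assumption, not from the source.
    import json
    with open('lagou_positions.json', 'w', encoding='utf-8') as f:
        json.dump(spider.positions, f, ensure_ascii=False, indent=2)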