#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/11/22 20:37
# @Author : long.zhang
# @Contact : long.zhang@opg.global
# @Site :
# @File : CaptureAirasia.py
# @Software: PyCharm
# @Desc :
'''
Created on June 4, 2016
@author: Administrator
'''
from CaptureBase import CaptureBase
import time
import json
from CrawlingProxy import CrawlingProxy, useragent
from logger import logger
from retrying import retry
from datetime import datetime
from urlparse import urljoin
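
# Note: this module targets Python 2 (the urlparse import above and the
# print statement in main() below are Python 2 only).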
class CaptureAirasia(CaptureBase):
    home_url = 'https://www.airasia.com/en/home.page'
    banner_url = ('https://www.airasia.com/api/MainBannersApi/GetMainBanners'
                  '?currentCulture=en&userCountry=HK&userCity=SHEUNGWAN'
                  '&currency=CNY&destination=')
    HEADER = '''
    accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
    accept-encoding:gzip, deflate, br
    accept-language:zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7
    cache-control:max-age=0
    upgrade-insecure-requests:1
    User-Agent:{}
    '''
    Channel = 'airasia'

    def __init__(self, user_agent, proxy_ip=None):
        super(CaptureAirasia, self).__init__(user_agent, proxy_ip)
        # Fill the User-Agent into the header template and parse it into a
        # dict with the base-class helper.
        self.header = self._getDict4str(self.HEADER.format(self.user_agent))

    def __del__(self):
        super(CaptureAirasia, self).__del__()

    '''
    function: fetch and store the product items from the home-page scrolling banner
    @return: True or raise
    '''
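    # For reference, a minimal sketch of the JSON shape this method assumes
    # the banner endpoint returns; the field names come from the parsing
    # code below, the values are purely illustrative:
    #   {
    #       "Data": {
    #           "Items": [
    #               {"NavigateUrl": "https://...",
    #                "Headline": "Example headline",
    #                "ImageUrl": "https://.../banner.jpg"}
    #           ]
    #       }
    #   }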
    @retry(stop_max_attempt_number=3, wait_fixed=2000)
    def dealHomeGoods(self):
        result_datas = []
        try:
            # The banner endpoint returns JSON; the items live under Data.Items.
            page_source = self.getHtml(self.banner_url, self.header)
            page_source = json.loads(page_source)
            pre_load_data = page_source.get('Data').get('Items')
            for load_data in pre_load_data:
                logger.debug('load_data: {}'.format(load_data))
                resultData = {}
                resultData['CHANNEL'.lower()] = self.Channel
                resultData['STATUS'.lower()] = '01'
                resultData['LINK'.lower()] = load_data.get('NavigateUrl')
                resultData['TITLE'.lower()] = load_data.get('Headline')
                resultData['MAIN_IMAGE'.lower()] = load_data.get('ImageUrl')
                resultData['CREATE_TIME'.lower()] = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
                result_datas.append(resultData)
            # Drop duplicate banners that point at the same link.
            result_datas = self._rm_duplicate(result_datas, 'LINK'.lower())
            if len(result_datas) == 0:
                logger.error('page_source: {}'.format(page_source))
                raise ValueError('not get valid data')
            # The double braces survive the first .format(), leaving
            # "{channel}"/"{link}" for _saveDatas to fill in per row.
            format_select = r'SELECT ID FROM {} WHERE CHANNEL="{{channel}}" and LINK="{{link}}" ORDER BY CREATE_TIME DESC'
            good_datas = result_datas
            select_sql = format_select.format(self.TABLE_NAME_BANNER)
            table = self.TABLE_NAME_BANNER
            replace_insert_columns = ['CHANNEL', 'LINK', 'MAIN_IMAGE', 'CREATE_TIME', 'STATUS', 'TITLE', 'MIN_IMAGE']
            select_columns = ['ID']
            return self._saveDatas(good_datas, table, select_sql, replace_insert_columns, select_columns)
        except Exception as e:
            logger.error('Get home goods info error: {}, retrying'.format(e))
            raise


def main():
    startTime = datetime.now()
    objCaptureAirasia = CaptureAirasia(useragent)
    objCaptureAirasia.dealHomeGoods()
    endTime = datetime.now()
    print 'seconds', (endTime - startTime).seconds


if __name__ == '__main__':
    main()
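
# Usage sketch, assuming a Python 2 interpreter and that CaptureBase,
# CrawlingProxy and logger from this repo are importable:
#   $ python CaptureAirasia.py
# main() performs a single fetch-and-save pass over the current home-page
# banners; @retry above handles up to three attempts within that pass.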