diff --git a/.gitignore b/.gitignore index a0d63bdf..da93f8d6 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ backup datas/* !datas/.keep .env +.venv *.db* *.sqlite3 *.tgz diff --git a/scrapy-package/.gitignore b/scrapy-package/.gitignore index 8b3e3da4..a1c00fc7 100644 --- a/scrapy-package/.gitignore +++ b/scrapy-package/.gitignore @@ -1,4 +1,5 @@ __pycache__ scrapy.log dist +.venv *.egg-info diff --git a/scrapy-package/scrapy_twrh/items.py b/scrapy-package/scrapy_twrh/items.py index 8859aa14..bfe7d180 100644 --- a/scrapy-package/scrapy_twrh/items.py +++ b/scrapy-package/scrapy_twrh/items.py @@ -19,6 +19,7 @@ class GenericHouseItem(Item): vendor_house_url = Field() # price related monthly_price = Field() + min_monthly_price = Field() deposit_type = Field() n_month_deposit = Field() deposit = Field() diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py b/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py index 59c3271d..e630830b 100644 --- a/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py +++ b/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py @@ -1,4 +1,6 @@ +import json import re +from functools import reduce from urllib.parse import urlparse, parse_qs from decimal import Decimal from functools import partial @@ -6,7 +8,18 @@ from scrapy_twrh.spiders.util import clean_number from scrapy_twrh.items import RawHouseItem, GenericHouseItem from .request_generator import RequestGenerator -from .util import DetailRequestMeta, SITE_URL +from .util import parse_price + +# copy from stackoverflow XD +# https://stackoverflow.com/questions/25833613/safe-method-to-get-value-of-nested-dictionary +def get(dictionary, keys, default=None): + return reduce(lambda d, key: d.get(key, default) if isinstance(d, dict) else default, keys.split("."), dictionary) + +def list_to_dict (list, name_field = 'name', value_field = 'value'): + ret = {} + for item in list: + ret[item[name_field]] = item[value_field] + return ret def dict_from_tuple(keys, values): min_length = min(len(keys), len(values)) @@ -42,52 +55,10 @@ class DetailMixin(RequestGenerator): apt_features = { 'n_living_room': '廳', 'n_bed_room': '房', - 'n_balcony': '陽台', 'n_bath_room': '衛' } - def default_parse_detail(self, response): - meta = response.meta['rental'] - if meta.gps: - return self.parse_gps_response(response) - - return self.parse_main_response(response) - - def parse_gps_response(self, response): - house_id = response.meta['rental'].id - - if response.status == 404: - self.logger.info( - 'GPS {} not found by receiving status code {}' - .format(house_id, response.status) - ) - yield True - return - - gmap_url = self.css_first(response, '#main .propMapBarMap iframe::attr(src)') - # example //maps.google.com.tw/maps?f=q&hl=zh-TW&q=25.0268980,121.5542323&z=17&output=embed - - parsed_url = urlparse(gmap_url) - qs = parse_qs(parsed_url.query) - if 'q' not in qs or not qs['q']: - self.logger.info( - 'Invalid GPS page in house: {}' - .format(house_id) - ) - return - - gps_str = qs['q'][0] - coordinate = list(map(Decimal, gps_str.split(','))) - - if len(coordinate) == 2: - yield GenericHouseItem( - vendor=self.vendor, - vendor_house_id=house_id, - rough_coordinate=coordinate - ) - - def parse_main_response(self, response): house_id = response.meta['rental'].id if response.status == 400: @@ -110,52 +81,29 @@ def parse_main_response(self, response): is_list=False, raw=response.body ) - - # sometime we got 200 but it's actually 30x... - browser_title = self.css_first(response, 'title::text') - if browser_title.startswith('等待跳轉'): - yield GenericHouseItem( - vendor=self.vendor, - vendor_house_id=house_id, - deal_status=enums.DealStatusType.NOT_FOUND - ) - else: - detail_dict = self.collect_dict(response) - - yield RawHouseItem( - house_id=house_id, - vendor=self.vendor, - is_list=False, - dict=detail_dict + jsonResp = json.loads(response.text) + if 'data' not in jsonResp: + self.logger.error('Invalid detail response for 591 house: {}' + .format(response.meta['rental'].id) ) + return False - yield GenericHouseItem( - **self.gen_detail_shared_attrs(detail_dict) - ) - - # get gps only when the house existed - yield self.gen_detail_request(DetailRequestMeta( - house_id, - True - )) - - def css_first(self, base, selector, default='', allow_empty=False, deep_text=False): - # Check how to find if there's missing attribute - css = self.css(base, selector, [default], deep_text=deep_text) - if css: - return css[0] + detail_dict = jsonResp['data'] + detail_dict['house_id'] = house_id - if not allow_empty: - self.logger.info( - 'Fail to get css first from {}({})'.format( - base, - selector - ) + yield RawHouseItem( + house_id=house_id, + vendor=self.vendor, + is_list=False, + dict=detail_dict ) - return '' + yield GenericHouseItem( + **self.gen_detail_shared_attrs(detail_dict) + ) def css(self, base, selector, default=None, deep_text=False): + # keep this for now, in case we meet this issue again.. #89 # Issue #30, we may get innerHTML like "some of target string" # deep_text=True retrieve text in the way different from ::text, # which will also get all child text. @@ -176,211 +124,6 @@ def clean_string(self, strings): strings = map(lambda str: str.replace(u'\xa0', '').strip(), strings) return strings - def collect_dict(self, response): - # title - title = self.css_first(response, '.houseInfoTitle', deep_text=True) - - # region 首頁/租屋/xx市/xx區 - breadcromb = self.css(response, '#propNav a', deep_text=True) - if len(breadcromb) >= 4: - if breadcromb[2] == '出租' and len(breadcromb) >= 5: - # 首頁 > 店面 > 出租 > 台北市 > 大安區 > 台北市大安區安和路二段 - top_region = breadcromb[3] - sub_region = breadcromb[4] - else: - # 首頁 > 租屋 > 台北市 > 大安區 > 獨立套房 > 20000-30000元 > 台北市大安區仁愛路四段50號 - top_region = breadcromb[2] - sub_region = breadcromb[3] - else: - top_region = '__UNKNOWN__' - sub_region = '__UNKNOWN__' - - # rough address - address = self.css_first(response, '#propNav .addr', deep_text=True) - - # image, it's in a hidden input - imgs = self.css_first( - response, - '#hid_imgArr::attr(value)', - allow_empty=True - ).replace('"', '').split(',') - - if imgs[0] == "": - imgs.pop(0) - - # top meta, including 押金, 法定用途, etc.. - top_meta_keys = self.css(response, '.labelList-1 .one', deep_text=True) - top_meta_values = self.css(response, '.labelList-1 .two em', deep_text=True) - top_metas = dict_from_tuple(top_meta_keys, top_meta_values) - - if '身份要求' in top_metas: - top_metas['身份要求'] = top_metas['身份要求'].split('、') - - # facilities, including 衣櫃、沙發, etc.. - fa_status = self.css(response, '.facility li span::attr(class)') - fa_text = self.css(response, '.facility li', deep_text=True) - fa = [] - without_fa = [] - for index, key in enumerate(fa_text): - if fa_status[index] != 'no': - fa.append(key) - else: - without_fa.append(key) - - # environment - #

生活機能:近便利商店;傳統市場;夜市

- env_keys = self.css(response, '.lifeBox > p strong', deep_text=True) - env_desps = self.css(response, '.lifeBox > p', deep_text=True) - env_desps = list(map(lambda desp: re.sub('.*:', '', desp).split(';'), env_desps)) - env = dict_from_tuple(env_keys, env_desps) - - # neighbor - nei_selector = response.css('.lifeBox.community') - nei = {} - if nei_selector: - nei['name'] = self.css_first(nei_selector, '.communityName a', deep_text=True) - nei['desp'] = self.css_first( - nei_selector, - '.communityIntroduce::text', - deep_text=True, - allow_empty=True - ) - nei['url'] = SITE_URL +\ - self.css_first(nei_selector, '.communityIntroduce a::attr(href)', allow_empty=True) - nei_keys = self.css(nei_selector, '.communityDetail p::text') - nei_values = self.css(nei_selector, '.communityDetail p > *', deep_text=True) - nei['info'] = dict_from_tuple(nei_keys, nei_values) - - # sublets 分租套房、雅房 - sublets_keys = self.css(response, '.list-title span', deep_text=True) - sublets_list = response.css('.house-list') - sublets = [] - for sublet in sublets_list: - texts = self.css(sublet, 'li', deep_text=True) - sublet_dict = dict_from_tuple(sublets_keys, texts) - if '租金' in sublet_dict: - sublet_dict['租金'] = clean_number(sublet_dict['租金']) - if '坪數' in sublet_dict: - sublet_dict['坪數'] = clean_number(sublet_dict['坪數']) - - sublets.append(sublet_dict) - - # desp - desp = self.css(response, '.houseIntro *', deep_text=True) - - # q and a - # TODO - # TODO: format correct - - # price - #
14,500 元/月
- price = self.css_first(response, '.price i', deep_text=True) - - # built-in facility - price_includes = self.css_first( - response, - '.detailInfo .price+.explain', - deep_text=True, - allow_empty=True - ).split('/') - - # lease status - is_deal = len(response.css('.filled').extract()) > 0 - # house_state = 'OPENED' - # deal_at = None - # if is_deal: - # house_state = 'DEAL' - # deal_at = timezone.localtime() - - # side meta - sides = self.css(response, '.detailInfo .attr li', deep_text=True) - side_metas = {} - for side in sides: - tokens = side.split(':') - if len(tokens) >= 2: - side_metas[tokens[0]] = ':'.join(tokens[1::]) - - # 格局 : 3房2廳2衛2陽台 - if '格局' in side_metas: - # TODO: 開放式格局 - parts = re.findall( - r'(\d)([^\d]+)', - side_metas['格局'] - ) - parts_dict = {} - for part in parts: - parts_dict[part[1]] = part[0] - side_metas['格局'] = parts_dict - if '坪數' in side_metas: - side_metas['坪數'] = clean_number(side_metas['坪數']) - if '權狀坪數' in side_metas: - side_metas['權狀坪數'] = clean_number(side_metas['權狀坪數']) - - # due day - due_day = self.css_first(response, '.explain .ft-rt', deep_text=True) - due_day = due_day.replace('有效期:', '') - - # owner - owner = {} - owner['name'] = self.css_first(response, '.avatarRight i', deep_text=True) - owner['comment'] = self.css_first(response, '.avatarRight div', deep_text=True) - agent_info = self.css(response, '.avatarRight .auatarSonBox p', deep_text=True) - make_agent_info = partial(split_string_to_dict, seperator=':') - agent_info = list(map(make_agent_info, agent_info)) - owner['isAgent'] = len(agent_info) > 0 - owner['agent'] = agent_info - - phone_ext = self.css_first(response, '.phone-hide .num', deep_text=True, allow_empty=True) - phone_url = response.css('.phone-hide .num img').xpath('@src').extract_first() - - if phone_ext: - # phone will be pure text when owner use 591 built-in phone number - # TODO: check is the ext is identical for the same owner - owner['id'] = phone_ext - elif phone_url: - # or it will be an img, the src would be identical for the same owner - # url is sth like - # statics.591.com.tw/tools/showPhone.php?info_data=%2BbRfNLlKoLNhHOKui2zb%2FBxYO6A&type=rLEFMu4XrrpgEw - parsed_url = urlparse(phone_url) - qs = parse_qs(parsed_url.query) - if 'info_data' in qs and qs['info_data']: - owner['id'] = qs['info_data'][0] - else: - # sth strange happened, such as it's already dealt - # let's try if there's avatar - avatar = response.css('.userInfo .avatar img').xpath('@src').extract_first() - if avatar and 'no-photo-new.png' not in avatar: - owner['id'] = avatar - else: - # last try, search description to see if there's phone number - phone = re.search(r'09[0-9]{8}', ' '.join(desp)) - if phone: - phone = phone.group() - owner['id'] = phone - - return { - 'house_id': response.meta['rental'].id, - 'n_views': self.css_first(response, '.pageView b', deep_text=True), - 'top_region': top_region, - 'sub_region': sub_region, - 'address': address, - 'title': title, - 'imgs': imgs, - 'top_metas': top_metas, - 'facilities': fa, - 'without_facilities': without_fa, - 'environment': env, - 'sublets': sublets, - 'neighbor': nei, - 'desp': desp, - 'price': price, - 'price_includes': price_includes, - 'is_deal': is_deal, - 'side_metas': side_metas, - 'due_day': due_day, - 'owner': owner - } - def from_zh_number(self, zh_number): if zh_number in self.zh_number_dict: return self.zh_number_dict[zh_number] @@ -390,9 +133,13 @@ def from_zh_number(self, zh_number): def get_shared_price(self, detail_dict, basic_info): ret = {} + cost_data = list_to_dict( + get(detail_dict, 'costData.data', default=[]) + ) + # deposit_type, n_month_deposit - if '押金' in detail_dict['top_metas']: - deposit = detail_dict['top_metas']['押金'] + if '押金' in cost_data: + deposit = cost_data['押金'] month_deposit = deposit.split('個月') if len(month_deposit) == 2: ret['deposit_type'] = enums.DepositType.月 @@ -412,12 +159,16 @@ def get_shared_price(self, detail_dict, basic_info): ret['n_month_deposit'] = None ret['deposit'] = None - # is_remanagement_fee, monthly_management_fee - if '管理費' in detail_dict['price_includes']: + # is_management_fee, monthly_management_fee + price_includes = [] + if '租金含' in cost_data: + price_includes = cost_data['租金含'].split('、') + + if '管理費' in price_includes: ret['is_require_management_fee'] = False ret['monthly_management_fee'] = 0 - elif '管理費' in detail_dict['top_metas']: - mgmt_fee = detail_dict['top_metas']['管理費'] + elif '管理費' in cost_data: + mgmt_fee = cost_data['管理費'] # could be xxx元/月, --, -, !@$#$%... if '元/月' in mgmt_fee: ret['is_require_management_fee'] = True @@ -427,10 +178,13 @@ def get_shared_price(self, detail_dict, basic_info): ret['monthly_management_fee'] = 0 # *_parking* - if '車 位' in detail_dict['top_metas']: - parking_str = detail_dict['top_metas']['車 位'] + if '車位費' in price_includes: + ret['has_parking'] = True + ret['is_require_parking_fee'] = False + ret['monthly_parking_fee'] = 0 + elif '車位費' in cost_data: + parking_str = cost_data['車位費'] parking = clean_number(parking_str) - ret['has_parking'] = True if parking: ret['is_require_parking_fee'] = True @@ -457,28 +211,35 @@ def get_shared_price(self, detail_dict, basic_info): def get_shared_basic(self, detail_dict): ret = {} - # top_region, sub_region - if 'top_region' in detail_dict: - ret['top_region'] = self.get_enum( - enums.TopRegionType, - detail_dict['house_id'], - detail_dict['top_region'] - ) - - ret['sub_region'] = self.get_enum( - enums.SubRegionType, - detail_dict['house_id'], - '{}{}'.format( - detail_dict['top_region'], - detail_dict['sub_region'] - ) + # region xx市/xx區/物件類型 + breadcrumb = list_to_dict( + get(detail_dict, 'breadcrumb', default=[]), + name_field='query', + value_field='name' + ) + top_region = get(breadcrumb, 'region', default='__UNKNOWN__') + sub_region = get(breadcrumb, 'section', default='__UNKNOWN__') + + ret['top_region'] = self.get_enum( + enums.TopRegionType, + detail_dict['house_id'], + top_region + ) + + ret['sub_region'] = self.get_enum( + enums.SubRegionType, + detail_dict['house_id'], + '{}{}'.format( + top_region, + sub_region ) + ) - if 'address' in detail_dict: - ret['rough_address'] = detail_dict['address'] + ret['rough_address'] = get(detail_dict, 'favData.address') # deal_status - if detail_dict['is_deal']: + dealDay = get(detail_dict, 'dealTime', 0) + if dealDay > 0: # Issue #15, update only deal_status in crawler # let `syncstateful` to update the rest ret['deal_status'] = enums.DealStatusType.DEAL @@ -486,12 +247,14 @@ def get_shared_basic(self, detail_dict): # Issue #14, always update deal status since item may be reopened ret['deal_status'] = enums.DealStatusType.OPENED + infoSection = list_to_dict(get(detail_dict, 'info', default=[])) + # building_type, 公寓 / 電梯大樓 / 透天 - if '型態' in detail_dict['side_metas']: - building_type = detail_dict['side_metas']['型態'] + if '型態' in infoSection: + building_type = infoSection['型態'] if building_type == '別墅' or building_type == '透天厝': ret['building_type'] = enums.BuildingType.透天 - elif building_type == '住宅大樓': + elif building_type == '住宅大樓' or building_type == '電梯大樓': ret['building_type'] = enums.BuildingType.電梯大樓 else: ret['building_type'] = self.get_enum( @@ -501,19 +264,22 @@ def get_shared_basic(self, detail_dict): ) # property type - if '現況' in detail_dict['side_metas']: + if '類型' in infoSection: ret['property_type'] = self.get_enum( enums.PropertyType, detail_dict['house_id'], - detail_dict['side_metas']['現況'] + infoSection['類型'] ) + elif '格局' in infoSection: + ret['property_type'] = enums.PropertyType.整層住家 # is_rooftop, floor, total_floor # TODO: use title to detect rooftop - if '樓層' in detail_dict['side_metas']: + if '樓層' in infoSection: # floor_info = 1F/2F or 頂樓加蓋/2F or 整棟/2F - floor_info = detail_dict['side_metas']['樓層'].split('/') + floor_info = infoSection['樓層'].split('/') floor = clean_number(floor_info[0]) + # mark 整棟 as floor 0 ret['floor'] = 0 ret['total_floor'] = clean_number(floor_info[1]) ret['is_rooftop'] = False @@ -529,12 +295,28 @@ def get_shared_basic(self, detail_dict): ret['dist_to_highest_floor'] = ret['total_floor'] - ret['floor'] - if '坪數' in detail_dict['side_metas']: - ret['floor_ping'] = clean_number( - detail_dict['side_metas']['坪數']) - - if '格局' in detail_dict['side_metas']: - apt_feature = detail_dict['side_metas']['格局'] + if '坪數' in infoSection: + ret['floor_ping'] = clean_number(infoSection['坪數']) + + facilityKeys = list_to_dict( + get(detail_dict, 'service.facility'), + name_field='key', + # For 陽台 only, + # When no 陽台, name is '陽台' + # When there's 陽台, name is 'x陽台'... + value_field='name' + ) + nBalcony = clean_number(get(facilityKeys, 'balcony', default='')) + ret['n_balcony'] = nBalcony or 0 + + if '格局' in infoSection: + apt_parts = re.findall( + r'(\d)([^\d]+)', + infoSection['格局'] + ) + apt_feature = {} + for part in apt_parts: + apt_feature[part[1]] = part[0] for name in self.apt_features: if self.apt_features[name] in apt_feature: @@ -550,8 +332,6 @@ def get_shared_basic(self, detail_dict): ret['n_living_room'] ) - # TODO: rough_address - return ret def count_keyword_in_list(self, haystack, the_list, must_not_match=False): @@ -568,7 +348,10 @@ def count_keyword_in_list(self, haystack, the_list, must_not_match=False): def get_shared_environment(self, detail_dict): # additional fee - price_includes = detail_dict['price_includes'] + cost_data = list_to_dict(get(detail_dict, 'costData.data')) + price_includes = [] + if '租金含' in cost_data: + price_includes = cost_data['租金含'].split('、') additional_fee = { 'eletricity': '電費' not in price_includes, @@ -578,148 +361,121 @@ def get_shared_environment(self, detail_dict): 'cable_tv': '第四台' not in price_includes } - # living_functions - living_functions = {} - if '生活機能' in detail_dict['environment']: - living = detail_dict['environment']['生活機能'] - living_functions = { - 'school': '學校' in living, - 'park': '公園綠地' in living, - 'dept_store': '百貨公司' in living, - 'conv_store': '便利商店' in living, - 'traditional_mkt': '傳統市場' in living, - 'night_mkt': '夜市' in living, - 'hospital': '醫療機構' in living, - # not provided XDDD - 'police_office': False - } - - lower_desp = [] - for line in detail_dict['desp']: - lower_desp.append(line.lower()) - - transportation = {} - if '附近交通' in detail_dict['environment']: - tp_list = detail_dict['environment']['附近交通'] - transportation = { - 'subway': self.count_keyword_in_list('捷運站', tp_list), - 'bus': self.count_keyword_in_list('公車站', tp_list) + - self.count_keyword_in_list('路', tp_list), - 'train': self.count_keyword_in_list('火車站', tp_list), - 'hsr': self.count_keyword_in_list('高速鐵路', tp_list), - 'public_bike': self.count_keyword_in_list('bike', lower_desp) - } + # living_functions & transportation + # remove for now, as 2021 591 API doesn't provide necessary info #89 ret = { - 'additional_fee': additional_fee, - 'living_functions': living_functions, - 'transportation': transportation + 'additional_fee': additional_fee } return ret def get_shared_boolean_info(self, detail_dict): ret = {} + features = list_to_dict( + get(detail_dict, 'tags', default=[]), + name_field='value', + value_field='id' + ) # has_tenant_restriction - ret['has_tenant_restriction'] = False - if '身份要求' in detail_dict['top_metas']: - if detail_dict['top_metas']['身份要求']: - ret['has_tenant_restriction'] = True + rule = get(detail_dict, 'service.rule') + + # 2021 591 API use more soft word, with the same meaning... + # 適合學生 === 限學生 + # 適合上班族及家庭 === 限上班族及家庭 + ret['has_tenant_restriction'] = '適合' in rule # has_gender_restriction + # 2021 591 API use 此房屋限男生租住 / 此房屋限女生租住 / 此房屋男女皆可租住 / None ret['has_gender_restriction'] = False ret['gender_restriction'] = enums.GenderType.不限 - if '性別要求' in detail_dict['top_metas']: - gender = detail_dict['top_metas']['性別要求'] - if gender == '女生': + if '此房屋限' in rule: + if '女生' in rule: ret['has_gender_restriction'] = True ret['gender_restriction'] = enums.GenderType.女 - elif gender == '男生': + elif '男生' in rule: ret['has_gender_restriction'] = True ret['gender_restriction'] = enums.GenderType.男 - elif '不限' not in gender and '男女生皆可' not in gender: - ret['has_gender_restriction'] = True - ret['gender_restriction'] = enums.GenderType.其他 # can_cook - if '開伙' in detail_dict['top_metas']: - ret['can_cook'] = detail_dict['top_metas']['開伙'] == '可以' + if '不可開伙' in rule: + ret['can_cook'] = False + elif '可開伙' in features: + ret['can_cook'] = True else: ret['can_cook'] = None # allow pet - if '養寵物' in detail_dict['top_metas']: - ret['allow_pet'] = detail_dict['top_metas']['養寵物'] == '可以' + if '不可養寵物' in rule: + ret['allow_pet'] = False + elif '可養寵物' in features: + ret['allow_pet'] = True else: ret['allow_pet'] = None # has_perperty_registration - ret['has_perperty_registration'] = detail_dict['top_metas']\ - .get('產權登記', '') == '已辦' + properMetaTitle = get(detail_dict, 'infoData.title') + ret['has_perperty_registration'] = properMetaTitle == '房屋已辦產權登記' return ret def get_shared_misc(self, detail_dict): ret = {} + # rough_coordinate + position = get(detail_dict, 'positionRound') + coordinate = [ + Decimal(position['lat']), + Decimal(position['lng']) + ] + + if (coordinate[0] > 20 and coordinate[0] < 30): + # simple lat validator + # 東沙島 = 20.7036471,116.719958 + # 馬祖 = 26.402385,119.8869727 + ret['rough_coordinate'] = coordinate + # facilities facilities = {} - for item in detail_dict['facilities']: - facilities[item] = True - - for item in detail_dict['without_facilities']: - facilities[item] = False + for item in get(detail_dict, 'service.facility', default=[]): + if item['key'] == 'balcony': + continue + doProvide = item['active'] == 1 + if item['key'] == 'table_chairs': + facilities['桌子'] = doProvide + facilities['椅子'] = doProvide + else: + facilities[item['name']] = doProvide ret['facilities'] = facilities # contact, agent, and author - owner = detail_dict['owner'] - if '代理人' in owner['comment']: - ret['contact'] = enums.ContactType.代理人 - elif owner['isAgent']: + owner = get(detail_dict, 'linkInfo', default={}) + if owner['roleName'] == '仲介': ret['contact'] = enums.ContactType.房仲 else: - ret['contact'] = enums.ContactType.屋主 - - if owner['isAgent']: - agent = {} - for item in owner['agent']: - for key in item: - agent[key] = item[key] - - if '公司名' in agent: - ret['agent_org'] = agent['公司名'] - elif '經濟業' in agent: - ret['agent_org'] = agent['經濟業'] - else: - ret['agent_org'] = '/'.join(agent.values()) + ret['contact'] = self.get_enum( + enums.ContactType, + detail_dict['house_id'], + owner['roleName'] + ) - if 'id' in detail_dict['owner'] and detail_dict['owner']['id']: - ret['author'] = detail_dict['owner']['id'] + if owner['mobile'] != '': + ret['author'] = owner['mobile'].replace('-', '') + else: + ret['author'] = owner['uid'] or owner['imUid'] + + if ret['contact'] == enums.ContactType.房仲: + ret['agent_org'] = owner['roleTxt'] or owner['certificateTxt'] + if ret['agent_org'] == '經紀業: 不動產經紀業': + ret['agent_org'] = '未認證' return ret def gen_detail_shared_attrs(self, detail_dict): - detail_dict['price'] = clean_number(detail_dict['price']) - - detail_dict['price_includes'] = list(map( - lambda x: x.replace('含', ''), - detail_dict['price_includes'] - )) - - if '生活機能' in detail_dict['environment']: - detail_dict['environment']['生活機能'] = list(map( - lambda x: x.replace('近', ''), - detail_dict['environment']['生活機能'] - )) - - if '附近交通' in detail_dict['environment']: - detail_dict['environment']['附近交通'] = list(map( - lambda x: re.sub('[  ]', '', x.replace('近', '')), - detail_dict['environment']['附近交通'] - )) - + price_range = parse_price(detail_dict['price']) + detail_dict['price'] = price_range['monthly_price'] basic_info = self.get_shared_basic(detail_dict) price_info = self.get_shared_price(detail_dict, basic_info) env_info = self.get_shared_environment(detail_dict) @@ -730,12 +486,13 @@ def gen_detail_shared_attrs(self, detail_dict): 'vendor': self.vendor, 'vendor_house_id': detail_dict['house_id'], 'monthly_price': detail_dict['price'], - 'imgs': detail_dict['imgs'], + **price_range, **price_info, **basic_info, **env_info, **boolean_info, - **misc_info + **misc_info, + } return ret diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/list_mixin.py b/scrapy-package/scrapy_twrh/spiders/rental591/list_mixin.py index 90012572..b85a3c77 100644 --- a/scrapy-package/scrapy_twrh/spiders/rental591/list_mixin.py +++ b/scrapy-package/scrapy_twrh/spiders/rental591/list_mixin.py @@ -2,7 +2,7 @@ from scrapy_twrh.items import RawHouseItem, GenericHouseItem from scrapy_twrh.spiders.enums import PropertyType, TopRegionType, SubRegionType from scrapy_twrh.spiders.util import clean_number -from .util import SITE_URL, ListRequestMeta, DetailRequestMeta +from .util import API_URL, ListRequestMeta, DetailRequestMeta, parse_price from .request_generator import RequestGenerator def get_list_val(house, regular_attr, top_attr=None, to_number=False): @@ -52,7 +52,6 @@ def default_parse_list(self, response): houses = data['data']['topData'] + data['data']['data'] for house in houses: - house['is_vip'] = 'id' not in house house_item = self.gen_shared_attrs(house, meta) yield RawHouseItem( house_id=house_item['vendor_house_id'], @@ -61,12 +60,12 @@ def default_parse_list(self, response): raw=json.dumps(house, ensure_ascii=False) ) yield GenericHouseItem(**house_item) - yield self.gen_detail_request(DetailRequestMeta(house_item['vendor_house_id'], False)) + yield self.gen_detail_request(DetailRequestMeta(house_item['vendor_house_id'])) def gen_shared_attrs(self, house, meta: ListRequestMeta): house_id = get_list_val(house, 'id', 'post_id') - url = '{}/rent-detail-{}.html'.format(SITE_URL, house_id) + url = "{}/v1/house/rent/detail?id={}".format(API_URL, house_id) if 'region_name' in house: # topData doesn't contain region_name for some reason.. @@ -85,21 +84,41 @@ def gen_shared_attrs(self, house, meta: ListRequestMeta): ) ) - property_type = self.get_enum( - PropertyType, house_id, get_list_val(house, 'kind_name', 'kind_str')) + property_type = None + if 'kind_name' in house: + self.get_enum(PropertyType, house_id, get_list_val(house, 'kind_name')) + + floor = None + total_floor = None + if 'floor_str' in house: + floor_info = house['floor_str'].split('/') + if len(floor_info) >= 2: + floor = clean_number(floor_info[0]) + total_floor = clean_number(floor_info[1]) + + if floor == '頂樓加蓋': + floor = total_floor +1 + elif 'B' in floor_info[0] and floor: + # basement + floor = -floor + elif floor is None: + # 整棟 + floor = 0 + + price_range = parse_price(get_list_val(house, 'price')) generic_house = { 'vendor': self.vendor, 'vendor_house_id': house_id, 'vendor_house_url': url, - 'imgs': [get_list_val(house, 'cover', 'img_src')], + 'imgs': get_list_val(house, 'photo_list'), 'top_region': top_region, 'sub_region': sub_region, 'property_type': property_type, 'floor_ping': clean_number(house['area']), - 'floor': get_list_val(house, 'floor', to_number=True), - 'total_floor': get_list_val(house, 'allfloor', to_number=True), - 'monthly_price': get_list_val(house, 'price', to_number=True) + 'floor': floor, + 'total_floor': total_floor, + **price_range } # 99 and 100 are magic number in 591... diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/rental591_spider.py b/scrapy-package/scrapy_twrh/spiders/rental591/rental591_spider.py index 5ca04c75..157a15b5 100644 --- a/scrapy-package/scrapy_twrh/spiders/rental591/rental591_spider.py +++ b/scrapy-package/scrapy_twrh/spiders/rental591/rental591_spider.py @@ -6,6 +6,11 @@ class Rental591Spider(ListMixin, DetailMixin): name = 'rental591' + csrf_token = '' + session = { + '591_new_session': None, + 'PHPSESSID': None + } def __init__(self, target_cities=None, **kwargs): super().__init__( @@ -36,9 +41,10 @@ def handle_session_init(self, response): for cookie in response.headers.getlist('Set-Cookie'): cookie_tokens = cookie.decode('utf-8').split('; ') - if cookie_tokens and cookie_tokens[0].startswith('591_new_session='): - self.session_token = cookie_tokens[0].split('=')[1] - break + for cookie in cookie_tokens: + tokens = cookie.split('=') + if len(tokens) is 2 and tokens[0] in self.session: + self.session[tokens[0]] = tokens[1] for item in self.start_list(): yield item diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/request_generator.py b/scrapy-package/scrapy_twrh/spiders/rental591/request_generator.py index cdd10acd..ce017795 100644 --- a/scrapy-package/scrapy_twrh/spiders/rental591/request_generator.py +++ b/scrapy-package/scrapy_twrh/spiders/rental591/request_generator.py @@ -1,14 +1,11 @@ from scrapy.spidermiddlewares.httperror import HttpError from scrapy_twrh.spiders.rental_spider import RentalSpider -from .util import SITE_URL, LIST_ENDPOINT, ListRequestMeta, DetailRequestMeta +from .util import SITE_URL, API_URL, LIST_ENDPOINT, ListRequestMeta, DetailRequestMeta class RequestGenerator(RentalSpider): def __init__(self, **kwargs): super().__init__(**kwargs) - self.csrf_token = None - self.session_token = None - def gen_list_request_args(self, rental_meta: ListRequestMeta): # don't filter as 591 use 30x to indicate house status... ret = { @@ -20,9 +17,10 @@ def gen_list_request_args(self, rental_meta: ListRequestMeta): rental_meta.page * self.N_PAGE ), 'headers': { - 'Cookie': 'urlJumpIp={}; 591_new_session={};'.format( + 'Cookie': 'urlJumpIp={}; 591_new_session={}; PHPSESSID={}'.format( rental_meta.id, - self.session_token + self.session['591_new_session'], + self.session['PHPSESSID'] ), 'X-CSRF-TOKEN': self.csrf_token } @@ -30,38 +28,23 @@ def gen_list_request_args(self, rental_meta: ListRequestMeta): return ret def gen_detail_request_args(self, rental_meta: DetailRequestMeta): - if rental_meta.gps: - # https://rent.591.com.tw/map-houseRound.html?type=1&detail=detail&version=1&post_id=6635655 - url = "{}/map-houseRound.html?type=1&detail=detail&version=1&post_id={}".format( - SITE_URL, rental_meta.id) + # https://bff.591.com.tw/v1/house/rent/detail?id=11501075 + url = "{}/v1/house/rent/detail?id={}".format(API_URL, rental_meta.id) - #19, the house may be closed in 3 hours when we found it.... - # retrieve gps in lowest priority - # don't filter as 591 use 30x to indicate house status... - return { - 'dont_filter': True, - 'url': url, - 'priority': -1, - 'errback': self.error_handler, - 'meta': { - 'rental': rental_meta, - 'handle_httpstatus_list': [404] - } - } - else: - # https://rent.591.com.tw/rent-detail-6635655.html - url = "{}/rent-detail-{}.html".format(SITE_URL, rental_meta.id) - - # don't filter as 591 use 30x to indicate house status... - return { - 'dont_filter': True, - 'url': url, - 'errback': self.error_handler, - 'meta': { - 'rental': rental_meta, - 'handle_httpstatus_list': [400, 404, 302, 301] - } + # don't filter as 591 use 30x to indicate house status... + return { + 'dont_filter': True, + 'url': url, + 'errback': self.error_handler, + 'meta': { + 'rental': rental_meta, + 'handle_httpstatus_list': [400, 404, 302, 301] + }, + 'headers': { + 'device': 'pc', + 'deviceid': self.session['PHPSESSID'] } + } def error_handler(self, failure): if failure.check(HttpError): diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/util.py b/scrapy-package/scrapy_twrh/spiders/rental591/util.py index e59ef034..9ec3ad37 100644 --- a/scrapy-package/scrapy_twrh/spiders/rental591/util.py +++ b/scrapy-package/scrapy_twrh/spiders/rental591/util.py @@ -1,9 +1,21 @@ from collections import namedtuple +from scrapy_twrh.spiders.util import clean_number SITE_URL = 'https://rent.591.com.tw' -LIST_ENDPOINT = '{}/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1'.format(SITE_URL) +API_URL = 'https://bff.591.com.tw' +LIST_ENDPOINT = '{}/home/search/rsList?is_new_list=1&type=1&is_format_data=1'.format(SITE_URL) SESSION_ENDPOINT = '{}/?kind=0®ion=6'.format(SITE_URL) ListRequestMeta = namedtuple('ListRequestMeta', ['id', 'name', 'page']) -DetailRequestMeta = namedtuple('DetailRequestMeta', ['id', 'gps']) +DetailRequestMeta = namedtuple('DetailRequestMeta', ['id']) + +def parse_price(number_string: str): + #87, 社會住宅's monthly_price is a range + tokens = number_string.split('~') + price = clean_number(tokens[0]) + ret = { 'monthly_price': price } + if len(tokens) >= 2: + ret['min_monthly_price'] = clean_number(tokens[1]) + + return ret diff --git a/scrapy-package/setup.py b/scrapy-package/setup.py index 70c582b3..fbdf7c1e 100644 --- a/scrapy-package/setup.py +++ b/scrapy-package/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="scrapy-tw-rental-house", - version="0.1.2", + version="1.0.0", author="ddio", author_email="ddio@ddio.io", description="Scrapy spider for TW Rental House",