diff --git a/.gitignore b/.gitignore
index a0d63bdf..da93f8d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ backup
datas/*
!datas/.keep
.env
+.venv
*.db*
*.sqlite3
*.tgz
diff --git a/scrapy-package/.gitignore b/scrapy-package/.gitignore
index 8b3e3da4..a1c00fc7 100644
--- a/scrapy-package/.gitignore
+++ b/scrapy-package/.gitignore
@@ -1,4 +1,5 @@
__pycache__
scrapy.log
dist
+.venv
*.egg-info
diff --git a/scrapy-package/scrapy_twrh/items.py b/scrapy-package/scrapy_twrh/items.py
index 8859aa14..bfe7d180 100644
--- a/scrapy-package/scrapy_twrh/items.py
+++ b/scrapy-package/scrapy_twrh/items.py
@@ -19,6 +19,7 @@ class GenericHouseItem(Item):
vendor_house_url = Field()
# price related
monthly_price = Field()
+ min_monthly_price = Field()
deposit_type = Field()
n_month_deposit = Field()
deposit = Field()
diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py b/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
index 59c3271d..e630830b 100644
--- a/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
+++ b/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
@@ -1,4 +1,6 @@
+import json
import re
+from functools import reduce
from urllib.parse import urlparse, parse_qs
from decimal import Decimal
from functools import partial
@@ -6,7 +8,18 @@
from scrapy_twrh.spiders.util import clean_number
from scrapy_twrh.items import RawHouseItem, GenericHouseItem
from .request_generator import RequestGenerator
-from .util import DetailRequestMeta, SITE_URL
+from .util import parse_price
+
+# Adapted from
+# https://stackoverflow.com/questions/25833613/safe-method-to-get-value-of-nested-dictionary
+def get(dictionary, keys, default=None):
+    """Look up a dot-separated key path inside nested dicts.
+
+    :param dictionary: the (possibly nested) dict to read from.
+    :param keys: dot-separated path, e.g. ``'costData.data'``.
+    :param default: returned as soon as a level of the path is missing or
+        the value reached so far is not a dict.
+    :return: the value stored at the end of the path, or ``default``.
+        A key that exists with the value ``None`` yields ``None``.
+    """
+    current = dictionary
+    for key in keys.split('.'):
+        # Return at the first miss so that a dict-valued ``default`` is handed
+        # back untouched rather than being indexed by the remaining keys.
+        if not isinstance(current, dict) or key not in current:
+            return default
+        current = current[key]
+    return current
+
+def list_to_dict(list, name_field='name', value_field='value'):
+    """Turn a list of records into a ``{name: value}`` mapping.
+
+    :param list: iterable of dict-like records; ``None`` is treated as an
+        empty list so a missing section does not raise. The parameter
+        shadows the builtin but keeps its name for existing callers.
+    :param name_field: key of each record used as the resulting dict key.
+    :param value_field: key of each record used as the resulting dict value.
+    :return: dict mapping ``record[name_field]`` to ``record[value_field]``;
+        when a name repeats, the last record wins.
+    :raises KeyError: if a record lacks either field.
+    """
+    return {item[name_field]: item[value_field] for item in list or []}
def dict_from_tuple(keys, values):
min_length = min(len(keys), len(values))
@@ -42,52 +55,10 @@ class DetailMixin(RequestGenerator):
apt_features = {
'n_living_room': '廳',
'n_bed_room': '房',
- 'n_balcony': '陽台',
'n_bath_room': '衛'
}
-
def default_parse_detail(self, response):
- meta = response.meta['rental']
- if meta.gps:
- return self.parse_gps_response(response)
-
- return self.parse_main_response(response)
-
- def parse_gps_response(self, response):
- house_id = response.meta['rental'].id
-
- if response.status == 404:
- self.logger.info(
- 'GPS {} not found by receiving status code {}'
- .format(house_id, response.status)
- )
- yield True
- return
-
- gmap_url = self.css_first(response, '#main .propMapBarMap iframe::attr(src)')
- # example //maps.google.com.tw/maps?f=q&hl=zh-TW&q=25.0268980,121.5542323&z=17&output=embed
-
- parsed_url = urlparse(gmap_url)
- qs = parse_qs(parsed_url.query)
- if 'q' not in qs or not qs['q']:
- self.logger.info(
- 'Invalid GPS page in house: {}'
- .format(house_id)
- )
- return
-
- gps_str = qs['q'][0]
- coordinate = list(map(Decimal, gps_str.split(',')))
-
- if len(coordinate) == 2:
- yield GenericHouseItem(
- vendor=self.vendor,
- vendor_house_id=house_id,
- rough_coordinate=coordinate
- )
-
- def parse_main_response(self, response):
house_id = response.meta['rental'].id
if response.status == 400:
@@ -110,52 +81,29 @@ def parse_main_response(self, response):
is_list=False,
raw=response.body
)
-
- # sometime we got 200 but it's actually 30x...
- browser_title = self.css_first(response, 'title::text')
- if browser_title.startswith('等待跳轉'):
- yield GenericHouseItem(
- vendor=self.vendor,
- vendor_house_id=house_id,
- deal_status=enums.DealStatusType.NOT_FOUND
- )
- else:
- detail_dict = self.collect_dict(response)
-
- yield RawHouseItem(
- house_id=house_id,
- vendor=self.vendor,
- is_list=False,
- dict=detail_dict
+        try:
+            jsonResp = json.loads(response.text)
+        except ValueError:
+            # body is not JSON (json.JSONDecodeError is a ValueError)
+            jsonResp = None
+        # 'data' must be a dict: it is used as the detail dict right below
+        if not isinstance(jsonResp, dict) or not isinstance(jsonResp.get('data'), dict):
+            self.logger.error('Invalid detail response for 591 house: {}'
+                .format(response.meta['rental'].id)
+            )
+            return False
- yield GenericHouseItem(
- **self.gen_detail_shared_attrs(detail_dict)
- )
-
- # get gps only when the house existed
- yield self.gen_detail_request(DetailRequestMeta(
- house_id,
- True
- ))
-
- def css_first(self, base, selector, default='', allow_empty=False, deep_text=False):
- # Check how to find if there's missing attribute
- css = self.css(base, selector, [default], deep_text=deep_text)
- if css:
- return css[0]
+ detail_dict = jsonResp['data']
+ detail_dict['house_id'] = house_id
- if not allow_empty:
- self.logger.info(
- 'Fail to get css first from {}({})'.format(
- base,
- selector
- )
+ yield RawHouseItem(
+ house_id=house_id,
+ vendor=self.vendor,
+ is_list=False,
+ dict=detail_dict
)
- return ''
+ yield GenericHouseItem(
+ **self.gen_detail_shared_attrs(detail_dict)
+ )
def css(self, base, selector, default=None, deep_text=False):
+ # keep this for now, in case we meet this issue again.. #89
# Issue #30, we may get innerHTML like "some of target string"
# deep_text=True retrieve text in the way different from ::text,
# which will also get all child text.
@@ -176,211 +124,6 @@ def clean_string(self, strings):
strings = map(lambda str: str.replace(u'\xa0', '').strip(), strings)
return strings
- def collect_dict(self, response):
- # title
- title = self.css_first(response, '.houseInfoTitle', deep_text=True)
-
- # region 首頁/租屋/xx市/xx區
- breadcromb = self.css(response, '#propNav a', deep_text=True)
- if len(breadcromb) >= 4:
- if breadcromb[2] == '出租' and len(breadcromb) >= 5:
- # 首頁 > 店面 > 出租 > 台北市 > 大安區 > 台北市大安區安和路二段
- top_region = breadcromb[3]
- sub_region = breadcromb[4]
- else:
- # 首頁 > 租屋 > 台北市 > 大安區 > 獨立套房 > 20000-30000元 > 台北市大安區仁愛路四段50號
- top_region = breadcromb[2]
- sub_region = breadcromb[3]
- else:
- top_region = '__UNKNOWN__'
- sub_region = '__UNKNOWN__'
-
- # rough address
- address = self.css_first(response, '#propNav .addr', deep_text=True)
-
- # image, it's in a hidden input
- imgs = self.css_first(
- response,
- '#hid_imgArr::attr(value)',
- allow_empty=True
- ).replace('"', '').split(',')
-
- if imgs[0] == "":
- imgs.pop(0)
-
- # top meta, including 押金, 法定用途, etc..
- top_meta_keys = self.css(response, '.labelList-1 .one', deep_text=True)
- top_meta_values = self.css(response, '.labelList-1 .two em', deep_text=True)
- top_metas = dict_from_tuple(top_meta_keys, top_meta_values)
-
- if '身份要求' in top_metas:
- top_metas['身份要求'] = top_metas['身份要求'].split('、')
-
- # facilities, including 衣櫃、沙發, etc..
- fa_status = self.css(response, '.facility li span::attr(class)')
- fa_text = self.css(response, '.facility li', deep_text=True)
- fa = []
- without_fa = []
- for index, key in enumerate(fa_text):
- if fa_status[index] != 'no':
- fa.append(key)
- else:
- without_fa.append(key)
-
- # environment
- # 生活機能:近便利商店;傳統市場;夜市
- env_keys = self.css(response, '.lifeBox > p strong', deep_text=True)
- env_desps = self.css(response, '.lifeBox > p', deep_text=True)
- env_desps = list(map(lambda desp: re.sub('.*:', '', desp).split(';'), env_desps))
- env = dict_from_tuple(env_keys, env_desps)
-
- # neighbor
- nei_selector = response.css('.lifeBox.community')
- nei = {}
- if nei_selector:
- nei['name'] = self.css_first(nei_selector, '.communityName a', deep_text=True)
- nei['desp'] = self.css_first(
- nei_selector,
- '.communityIntroduce::text',
- deep_text=True,
- allow_empty=True
- )
- nei['url'] = SITE_URL +\
- self.css_first(nei_selector, '.communityIntroduce a::attr(href)', allow_empty=True)
- nei_keys = self.css(nei_selector, '.communityDetail p::text')
- nei_values = self.css(nei_selector, '.communityDetail p > *', deep_text=True)
- nei['info'] = dict_from_tuple(nei_keys, nei_values)
-
- # sublets 分租套房、雅房
- sublets_keys = self.css(response, '.list-title span', deep_text=True)
- sublets_list = response.css('.house-list')
- sublets = []
- for sublet in sublets_list:
- texts = self.css(sublet, 'li', deep_text=True)
- sublet_dict = dict_from_tuple(sublets_keys, texts)
- if '租金' in sublet_dict:
- sublet_dict['租金'] = clean_number(sublet_dict['租金'])
- if '坪數' in sublet_dict:
- sublet_dict['坪數'] = clean_number(sublet_dict['坪數'])
-
- sublets.append(sublet_dict)
-
- # desp
- desp = self.css(response, '.houseIntro *', deep_text=True)
-
- # q and a
- # TODO
- # TODO: format correct
-
- # price
- # 14,500 元/月
- price = self.css_first(response, '.price i', deep_text=True)
-
- # built-in facility
- price_includes = self.css_first(
- response,
- '.detailInfo .price+.explain',
- deep_text=True,
- allow_empty=True
- ).split('/')
-
- # lease status
- is_deal = len(response.css('.filled').extract()) > 0
- # house_state = 'OPENED'
- # deal_at = None
- # if is_deal:
- # house_state = 'DEAL'
- # deal_at = timezone.localtime()
-
- # side meta
- sides = self.css(response, '.detailInfo .attr li', deep_text=True)
- side_metas = {}
- for side in sides:
- tokens = side.split(':')
- if len(tokens) >= 2:
- side_metas[tokens[0]] = ':'.join(tokens[1::])
-
- # 格局 : 3房2廳2衛2陽台
- if '格局' in side_metas:
- # TODO: 開放式格局
- parts = re.findall(
- r'(\d)([^\d]+)',
- side_metas['格局']
- )
- parts_dict = {}
- for part in parts:
- parts_dict[part[1]] = part[0]
- side_metas['格局'] = parts_dict
- if '坪數' in side_metas:
- side_metas['坪數'] = clean_number(side_metas['坪數'])
- if '權狀坪數' in side_metas:
- side_metas['權狀坪數'] = clean_number(side_metas['權狀坪數'])
-
- # due day
- due_day = self.css_first(response, '.explain .ft-rt', deep_text=True)
- due_day = due_day.replace('有效期:', '')
-
- # owner
- owner = {}
- owner['name'] = self.css_first(response, '.avatarRight i', deep_text=True)
- owner['comment'] = self.css_first(response, '.avatarRight div', deep_text=True)
- agent_info = self.css(response, '.avatarRight .auatarSonBox p', deep_text=True)
- make_agent_info = partial(split_string_to_dict, seperator=':')
- agent_info = list(map(make_agent_info, agent_info))
- owner['isAgent'] = len(agent_info) > 0
- owner['agent'] = agent_info
-
- phone_ext = self.css_first(response, '.phone-hide .num', deep_text=True, allow_empty=True)
- phone_url = response.css('.phone-hide .num img').xpath('@src').extract_first()
-
- if phone_ext:
- # phone will be pure text when owner use 591 built-in phone number
- # TODO: check is the ext is identical for the same owner
- owner['id'] = phone_ext
- elif phone_url:
- # or it will be an img, the src would be identical for the same owner
- # url is sth like
- # statics.591.com.tw/tools/showPhone.php?info_data=%2BbRfNLlKoLNhHOKui2zb%2FBxYO6A&type=rLEFMu4XrrpgEw
- parsed_url = urlparse(phone_url)
- qs = parse_qs(parsed_url.query)
- if 'info_data' in qs and qs['info_data']:
- owner['id'] = qs['info_data'][0]
- else:
- # sth strange happened, such as it's already dealt
- # let's try if there's avatar
- avatar = response.css('.userInfo .avatar img').xpath('@src').extract_first()
- if avatar and 'no-photo-new.png' not in avatar:
- owner['id'] = avatar
- else:
- # last try, search description to see if there's phone number
- phone = re.search(r'09[0-9]{8}', ' '.join(desp))
- if phone:
- phone = phone.group()
- owner['id'] = phone
-
- return {
- 'house_id': response.meta['rental'].id,
- 'n_views': self.css_first(response, '.pageView b', deep_text=True),
- 'top_region': top_region,
- 'sub_region': sub_region,
- 'address': address,
- 'title': title,
- 'imgs': imgs,
- 'top_metas': top_metas,
- 'facilities': fa,
- 'without_facilities': without_fa,
- 'environment': env,
- 'sublets': sublets,
- 'neighbor': nei,
- 'desp': desp,
- 'price': price,
- 'price_includes': price_includes,
- 'is_deal': is_deal,
- 'side_metas': side_metas,
- 'due_day': due_day,
- 'owner': owner
- }
-
def from_zh_number(self, zh_number):
if zh_number in self.zh_number_dict:
return self.zh_number_dict[zh_number]
@@ -390,9 +133,13 @@ def from_zh_number(self, zh_number):
def get_shared_price(self, detail_dict, basic_info):
ret = {}
+ cost_data = list_to_dict(
+ get(detail_dict, 'costData.data', default=[])
+ )
+
# deposit_type, n_month_deposit
- if '押金' in detail_dict['top_metas']:
- deposit = detail_dict['top_metas']['押金']
+ if '押金' in cost_data:
+ deposit = cost_data['押金']
month_deposit = deposit.split('個月')
if len(month_deposit) == 2:
ret['deposit_type'] = enums.DepositType.月
@@ -412,12 +159,16 @@ def get_shared_price(self, detail_dict, basic_info):
ret['n_month_deposit'] = None
ret['deposit'] = None
- # is_remanagement_fee, monthly_management_fee
- if '管理費' in detail_dict['price_includes']:
+ # is_management_fee, monthly_management_fee
+ price_includes = []
+ if '租金含' in cost_data:
+ price_includes = cost_data['租金含'].split('、')
+
+ if '管理費' in price_includes:
ret['is_require_management_fee'] = False
ret['monthly_management_fee'] = 0
- elif '管理費' in detail_dict['top_metas']:
- mgmt_fee = detail_dict['top_metas']['管理費']
+ elif '管理費' in cost_data:
+ mgmt_fee = cost_data['管理費']
# could be xxx元/月, --, -, !@$#$%...
if '元/月' in mgmt_fee:
ret['is_require_management_fee'] = True
@@ -427,10 +178,13 @@ def get_shared_price(self, detail_dict, basic_info):
ret['monthly_management_fee'] = 0
# *_parking*
- if '車 位' in detail_dict['top_metas']:
- parking_str = detail_dict['top_metas']['車 位']
+ if '車位費' in price_includes:
+ ret['has_parking'] = True
+ ret['is_require_parking_fee'] = False
+ ret['monthly_parking_fee'] = 0
+ elif '車位費' in cost_data:
+ parking_str = cost_data['車位費']
parking = clean_number(parking_str)
-
ret['has_parking'] = True
if parking:
ret['is_require_parking_fee'] = True
@@ -457,28 +211,35 @@ def get_shared_price(self, detail_dict, basic_info):
def get_shared_basic(self, detail_dict):
ret = {}
- # top_region, sub_region
- if 'top_region' in detail_dict:
- ret['top_region'] = self.get_enum(
- enums.TopRegionType,
- detail_dict['house_id'],
- detail_dict['top_region']
- )
-
- ret['sub_region'] = self.get_enum(
- enums.SubRegionType,
- detail_dict['house_id'],
- '{}{}'.format(
- detail_dict['top_region'],
- detail_dict['sub_region']
- )
+ # region xx市/xx區/物件類型
+ breadcrumb = list_to_dict(
+ get(detail_dict, 'breadcrumb', default=[]),
+ name_field='query',
+ value_field='name'
+ )
+ top_region = get(breadcrumb, 'region', default='__UNKNOWN__')
+ sub_region = get(breadcrumb, 'section', default='__UNKNOWN__')
+
+ ret['top_region'] = self.get_enum(
+ enums.TopRegionType,
+ detail_dict['house_id'],
+ top_region
+ )
+
+ ret['sub_region'] = self.get_enum(
+ enums.SubRegionType,
+ detail_dict['house_id'],
+ '{}{}'.format(
+ top_region,
+ sub_region
)
+ )
- if 'address' in detail_dict:
- ret['rough_address'] = detail_dict['address']
+ ret['rough_address'] = get(detail_dict, 'favData.address')
# deal_status
- if detail_dict['is_deal']:
+ dealDay = get(detail_dict, 'dealTime', 0)
+ if dealDay > 0:
# Issue #15, update only deal_status in crawler
# let `syncstateful` to update the rest
ret['deal_status'] = enums.DealStatusType.DEAL
@@ -486,12 +247,14 @@ def get_shared_basic(self, detail_dict):
# Issue #14, always update deal status since item may be reopened
ret['deal_status'] = enums.DealStatusType.OPENED
+ infoSection = list_to_dict(get(detail_dict, 'info', default=[]))
+
# building_type, 公寓 / 電梯大樓 / 透天
- if '型態' in detail_dict['side_metas']:
- building_type = detail_dict['side_metas']['型態']
+ if '型態' in infoSection:
+ building_type = infoSection['型態']
if building_type == '別墅' or building_type == '透天厝':
ret['building_type'] = enums.BuildingType.透天
- elif building_type == '住宅大樓':
+ elif building_type == '住宅大樓' or building_type == '電梯大樓':
ret['building_type'] = enums.BuildingType.電梯大樓
else:
ret['building_type'] = self.get_enum(
@@ -501,19 +264,22 @@ def get_shared_basic(self, detail_dict):
)
# property type
- if '現況' in detail_dict['side_metas']:
+ if '類型' in infoSection:
ret['property_type'] = self.get_enum(
enums.PropertyType,
detail_dict['house_id'],
- detail_dict['side_metas']['現況']
+ infoSection['類型']
)
+ elif '格局' in infoSection:
+ ret['property_type'] = enums.PropertyType.整層住家
# is_rooftop, floor, total_floor
# TODO: use title to detect rooftop
- if '樓層' in detail_dict['side_metas']:
+ if '樓層' in infoSection:
# floor_info = 1F/2F or 頂樓加蓋/2F or 整棟/2F
- floor_info = detail_dict['side_metas']['樓層'].split('/')
+ floor_info = infoSection['樓層'].split('/')
floor = clean_number(floor_info[0])
+ # mark 整棟 as floor 0
ret['floor'] = 0
ret['total_floor'] = clean_number(floor_info[1])
ret['is_rooftop'] = False
@@ -529,12 +295,28 @@ def get_shared_basic(self, detail_dict):
ret['dist_to_highest_floor'] = ret['total_floor'] - ret['floor']
- if '坪數' in detail_dict['side_metas']:
- ret['floor_ping'] = clean_number(
- detail_dict['side_metas']['坪數'])
-
- if '格局' in detail_dict['side_metas']:
- apt_feature = detail_dict['side_metas']['格局']
+ if '坪數' in infoSection:
+ ret['floor_ping'] = clean_number(infoSection['坪數'])
+
+        # service.facility may be missing or null; fall back to an empty list
+        # so list_to_dict() always receives something iterable
+        facilityKeys = list_to_dict(
+            get(detail_dict, 'service.facility') or [],
+            name_field='key',
+            # For 陽台 only:
+            # without a balcony the name is '陽台',
+            # with balconies the name is 'x陽台'
+            value_field='name'
+        )
+        nBalcony = clean_number(get(facilityKeys, 'balcony', default=''))
+        ret['n_balcony'] = nBalcony or 0
+
+ if '格局' in infoSection:
+ apt_parts = re.findall(
+ r'(\d)([^\d]+)',
+ infoSection['格局']
+ )
+ apt_feature = {}
+ for part in apt_parts:
+ apt_feature[part[1]] = part[0]
for name in self.apt_features:
if self.apt_features[name] in apt_feature:
@@ -550,8 +332,6 @@ def get_shared_basic(self, detail_dict):
ret['n_living_room']
)
- # TODO: rough_address
-
return ret
def count_keyword_in_list(self, haystack, the_list, must_not_match=False):
@@ -568,7 +348,10 @@ def count_keyword_in_list(self, haystack, the_list, must_not_match=False):
def get_shared_environment(self, detail_dict):
# additional fee
- price_includes = detail_dict['price_includes']
+        # costData.data may be missing or null; fall back to an empty list
+        cost_data = list_to_dict(
+            get(detail_dict, 'costData.data') or []
+        )
+        # '租金含' lists what the rent already covers, separated by '、'
+        price_includes = []
+        if '租金含' in cost_data:
+            price_includes = cost_data['租金含'].split('、')
additional_fee = {
'eletricity': '電費' not in price_includes,
@@ -578,148 +361,121 @@ def get_shared_environment(self, detail_dict):
'cable_tv': '第四台' not in price_includes
}
- # living_functions
- living_functions = {}
- if '生活機能' in detail_dict['environment']:
- living = detail_dict['environment']['生活機能']
- living_functions = {
- 'school': '學校' in living,
- 'park': '公園綠地' in living,
- 'dept_store': '百貨公司' in living,
- 'conv_store': '便利商店' in living,
- 'traditional_mkt': '傳統市場' in living,
- 'night_mkt': '夜市' in living,
- 'hospital': '醫療機構' in living,
- # not provided XDDD
- 'police_office': False
- }
-
- lower_desp = []
- for line in detail_dict['desp']:
- lower_desp.append(line.lower())
-
- transportation = {}
- if '附近交通' in detail_dict['environment']:
- tp_list = detail_dict['environment']['附近交通']
- transportation = {
- 'subway': self.count_keyword_in_list('捷運站', tp_list),
- 'bus': self.count_keyword_in_list('公車站', tp_list) +
- self.count_keyword_in_list('路', tp_list),
- 'train': self.count_keyword_in_list('火車站', tp_list),
- 'hsr': self.count_keyword_in_list('高速鐵路', tp_list),
- 'public_bike': self.count_keyword_in_list('bike', lower_desp)
- }
+ # living_functions & transportation
+ # remove for now, as 2021 591 API doesn't provide necessary info #89
ret = {
- 'additional_fee': additional_fee,
- 'living_functions': living_functions,
- 'transportation': transportation
+ 'additional_fee': additional_fee
}
return ret
def get_shared_boolean_info(self, detail_dict):
ret = {}
+ features = list_to_dict(
+ get(detail_dict, 'tags', default=[]),
+ name_field='value',
+ value_field='id'
+ )
# has_tenant_restriction
- ret['has_tenant_restriction'] = False
- if '身份要求' in detail_dict['top_metas']:
- if detail_dict['top_metas']['身份要求']:
- ret['has_tenant_restriction'] = True
+        # service.rule is text that may be absent or null; use '' so the
+        # substring checks below never run against None
+        rule = get(detail_dict, 'service.rule') or ''
+
+        # The 2021 591 API uses softer wording with the same meaning:
+        # 適合學生 === 限學生
+        # 適合上班族及家庭 === 限上班族及家庭
+        ret['has_tenant_restriction'] = '適合' in rule
# has_gender_restriction
+ # 2021 591 API use 此房屋限男生租住 / 此房屋限女生租住 / 此房屋男女皆可租住 / None
ret['has_gender_restriction'] = False
ret['gender_restriction'] = enums.GenderType.不限
- if '性別要求' in detail_dict['top_metas']:
- gender = detail_dict['top_metas']['性別要求']
- if gender == '女生':
+ if '此房屋限' in rule:
+ if '女生' in rule:
ret['has_gender_restriction'] = True
ret['gender_restriction'] = enums.GenderType.女
- elif gender == '男生':
+ elif '男生' in rule:
ret['has_gender_restriction'] = True
ret['gender_restriction'] = enums.GenderType.男
- elif '不限' not in gender and '男女生皆可' not in gender:
- ret['has_gender_restriction'] = True
- ret['gender_restriction'] = enums.GenderType.其他
# can_cook
- if '開伙' in detail_dict['top_metas']:
- ret['can_cook'] = detail_dict['top_metas']['開伙'] == '可以'
+ if '不可開伙' in rule:
+ ret['can_cook'] = False
+ elif '可開伙' in features:
+ ret['can_cook'] = True
else:
ret['can_cook'] = None
# allow pet
- if '養寵物' in detail_dict['top_metas']:
- ret['allow_pet'] = detail_dict['top_metas']['養寵物'] == '可以'
+ if '不可養寵物' in rule:
+ ret['allow_pet'] = False
+ elif '可養寵物' in features:
+ ret['allow_pet'] = True
else:
ret['allow_pet'] = None
# has_perperty_registration
- ret['has_perperty_registration'] = detail_dict['top_metas']\
- .get('產權登記', '') == '已辦'
+ properMetaTitle = get(detail_dict, 'infoData.title')
+ ret['has_perperty_registration'] = properMetaTitle == '房屋已辦產權登記'
return ret
def get_shared_misc(self, detail_dict):
ret = {}
+        # rough_coordinate, only when the response carries both lat and lng
+        position = get(detail_dict, 'positionRound') or {}
+        if 'lat' in position and 'lng' in position:
+            # str() first: Decimal(float) would carry the float's binary
+            # expansion instead of the short decimal form
+            coordinate = [
+                Decimal(str(position['lat'])),
+                Decimal(str(position['lng']))
+            ]
+
+            if 20 < coordinate[0] < 30:
+                # simple lat validator
+                # 東沙島 = 20.7036471,116.719958
+                # 馬祖 = 26.402385,119.8869727
+                ret['rough_coordinate'] = coordinate
+
# facilities
facilities = {}
- for item in detail_dict['facilities']:
- facilities[item] = True
-
- for item in detail_dict['without_facilities']:
- facilities[item] = False
+ for item in get(detail_dict, 'service.facility', default=[]):
+ if item['key'] == 'balcony':
+ continue
+ doProvide = item['active'] == 1
+ if item['key'] == 'table_chairs':
+ facilities['桌子'] = doProvide
+ facilities['椅子'] = doProvide
+ else:
+ facilities[item['name']] = doProvide
ret['facilities'] = facilities
# contact, agent, and author
- owner = detail_dict['owner']
- if '代理人' in owner['comment']:
- ret['contact'] = enums.ContactType.代理人
- elif owner['isAgent']:
+ owner = get(detail_dict, 'linkInfo', default={})
+ if owner['roleName'] == '仲介':
ret['contact'] = enums.ContactType.房仲
else:
- ret['contact'] = enums.ContactType.屋主
-
- if owner['isAgent']:
- agent = {}
- for item in owner['agent']:
- for key in item:
- agent[key] = item[key]
-
- if '公司名' in agent:
- ret['agent_org'] = agent['公司名']
- elif '經濟業' in agent:
- ret['agent_org'] = agent['經濟業']
- else:
- ret['agent_org'] = '/'.join(agent.values())
+ ret['contact'] = self.get_enum(
+ enums.ContactType,
+ detail_dict['house_id'],
+ owner['roleName']
+ )
- if 'id' in detail_dict['owner'] and detail_dict['owner']['id']:
- ret['author'] = detail_dict['owner']['id']
+ if owner['mobile'] != '':
+ ret['author'] = owner['mobile'].replace('-', '')
+ else:
+ ret['author'] = owner['uid'] or owner['imUid']
+
+ if ret['contact'] == enums.ContactType.房仲:
+ ret['agent_org'] = owner['roleTxt'] or owner['certificateTxt']
+ if ret['agent_org'] == '經紀業: 不動產經紀業':
+ ret['agent_org'] = '未認證'
return ret
def gen_detail_shared_attrs(self, detail_dict):
- detail_dict['price'] = clean_number(detail_dict['price'])
-
- detail_dict['price_includes'] = list(map(
- lambda x: x.replace('含', ''),
- detail_dict['price_includes']
- ))
-
- if '生活機能' in detail_dict['environment']:
- detail_dict['environment']['生活機能'] = list(map(
- lambda x: x.replace('近', ''),
- detail_dict['environment']['生活機能']
- ))
-
- if '附近交通' in detail_dict['environment']:
- detail_dict['environment']['附近交通'] = list(map(
- lambda x: re.sub('[ ]', '', x.replace('近', '')),
- detail_dict['environment']['附近交通']
- ))
-
+ price_range = parse_price(detail_dict['price'])
+ detail_dict['price'] = price_range['monthly_price']
basic_info = self.get_shared_basic(detail_dict)
price_info = self.get_shared_price(detail_dict, basic_info)
env_info = self.get_shared_environment(detail_dict)
@@ -730,12 +486,13 @@ def gen_detail_shared_attrs(self, detail_dict):
'vendor': self.vendor,
'vendor_house_id': detail_dict['house_id'],
'monthly_price': detail_dict['price'],
- 'imgs': detail_dict['imgs'],
+ **price_range,
**price_info,
**basic_info,
**env_info,
**boolean_info,
- **misc_info
+ **misc_info,
+
}
return ret
diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/list_mixin.py b/scrapy-package/scrapy_twrh/spiders/rental591/list_mixin.py
index 90012572..b85a3c77 100644
--- a/scrapy-package/scrapy_twrh/spiders/rental591/list_mixin.py
+++ b/scrapy-package/scrapy_twrh/spiders/rental591/list_mixin.py
@@ -2,7 +2,7 @@
from scrapy_twrh.items import RawHouseItem, GenericHouseItem
from scrapy_twrh.spiders.enums import PropertyType, TopRegionType, SubRegionType
from scrapy_twrh.spiders.util import clean_number
-from .util import SITE_URL, ListRequestMeta, DetailRequestMeta
+from .util import API_URL, ListRequestMeta, DetailRequestMeta, parse_price
from .request_generator import RequestGenerator
def get_list_val(house, regular_attr, top_attr=None, to_number=False):
@@ -52,7 +52,6 @@ def default_parse_list(self, response):
houses = data['data']['topData'] + data['data']['data']
for house in houses:
- house['is_vip'] = 'id' not in house
house_item = self.gen_shared_attrs(house, meta)
yield RawHouseItem(
house_id=house_item['vendor_house_id'],
@@ -61,12 +60,12 @@ def default_parse_list(self, response):
raw=json.dumps(house, ensure_ascii=False)
)
yield GenericHouseItem(**house_item)
- yield self.gen_detail_request(DetailRequestMeta(house_item['vendor_house_id'], False))
+ yield self.gen_detail_request(DetailRequestMeta(house_item['vendor_house_id']))
def gen_shared_attrs(self, house, meta: ListRequestMeta):
house_id = get_list_val(house, 'id', 'post_id')
- url = '{}/rent-detail-{}.html'.format(SITE_URL, house_id)
+ url = "{}/v1/house/rent/detail?id={}".format(API_URL, house_id)
if 'region_name' in house:
# topData doesn't contain region_name for some reason..
@@ -85,21 +84,41 @@ def gen_shared_attrs(self, house, meta: ListRequestMeta):
)
)
- property_type = self.get_enum(
- PropertyType, house_id, get_list_val(house, 'kind_name', 'kind_str'))
+        # property_type stays None when the entry has no kind_name
+        property_type = None
+        if 'kind_name' in house:
+            # keep the enum: the call result was previously discarded
+            property_type = self.get_enum(PropertyType, house_id, get_list_val(house, 'kind_name'))
+
+ floor = None
+ total_floor = None
+ if 'floor_str' in house:
+ floor_info = house['floor_str'].split('/')
+ if len(floor_info) >= 2:
+ floor = clean_number(floor_info[0])
+ total_floor = clean_number(floor_info[1])
+
+                # floor has already been through clean_number(), so match the
+                # rooftop label on the raw text; skip when total_floor is unknown
+                if floor_info[0] == '頂樓加蓋' and total_floor is not None:
+                    floor = total_floor + 1
+ elif 'B' in floor_info[0] and floor:
+ # basement
+ floor = -floor
+ elif floor is None:
+ # 整棟
+ floor = 0
+
+ price_range = parse_price(get_list_val(house, 'price'))
generic_house = {
'vendor': self.vendor,
'vendor_house_id': house_id,
'vendor_house_url': url,
- 'imgs': [get_list_val(house, 'cover', 'img_src')],
+ 'imgs': get_list_val(house, 'photo_list'),
'top_region': top_region,
'sub_region': sub_region,
'property_type': property_type,
'floor_ping': clean_number(house['area']),
- 'floor': get_list_val(house, 'floor', to_number=True),
- 'total_floor': get_list_val(house, 'allfloor', to_number=True),
- 'monthly_price': get_list_val(house, 'price', to_number=True)
+ 'floor': floor,
+ 'total_floor': total_floor,
+ **price_range
}
# 99 and 100 are magic number in 591...
diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/rental591_spider.py b/scrapy-package/scrapy_twrh/spiders/rental591/rental591_spider.py
index 5ca04c75..157a15b5 100644
--- a/scrapy-package/scrapy_twrh/spiders/rental591/rental591_spider.py
+++ b/scrapy-package/scrapy_twrh/spiders/rental591/rental591_spider.py
@@ -6,6 +6,11 @@
class Rental591Spider(ListMixin, DetailMixin):
name = 'rental591'
+ csrf_token = ''
+ session = {
+ '591_new_session': None,
+ 'PHPSESSID': None
+ }
def __init__(self, target_cities=None, **kwargs):
super().__init__(
@@ -36,9 +41,10 @@ def handle_session_init(self, response):
for cookie in response.headers.getlist('Set-Cookie'):
cookie_tokens = cookie.decode('utf-8').split('; ')
- if cookie_tokens and cookie_tokens[0].startswith('591_new_session='):
- self.session_token = cookie_tokens[0].split('=')[1]
- break
+            for cookie_attr in cookie_tokens:
+                # split on the first '=' only: a cookie value may itself
+                # contain '=' and must not be discarded for that reason
+                name, sep, value = cookie_attr.partition('=')
+                if sep and name in self.session:
+                    self.session[name] = value
for item in self.start_list():
yield item
diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/request_generator.py b/scrapy-package/scrapy_twrh/spiders/rental591/request_generator.py
index cdd10acd..ce017795 100644
--- a/scrapy-package/scrapy_twrh/spiders/rental591/request_generator.py
+++ b/scrapy-package/scrapy_twrh/spiders/rental591/request_generator.py
@@ -1,14 +1,11 @@
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy_twrh.spiders.rental_spider import RentalSpider
-from .util import SITE_URL, LIST_ENDPOINT, ListRequestMeta, DetailRequestMeta
+from .util import SITE_URL, API_URL, LIST_ENDPOINT, ListRequestMeta, DetailRequestMeta
class RequestGenerator(RentalSpider):
def __init__(self, **kwargs):
super().__init__(**kwargs)
- self.csrf_token = None
- self.session_token = None
-
def gen_list_request_args(self, rental_meta: ListRequestMeta):
# don't filter as 591 use 30x to indicate house status...
ret = {
@@ -20,9 +17,10 @@ def gen_list_request_args(self, rental_meta: ListRequestMeta):
rental_meta.page * self.N_PAGE
),
'headers': {
- 'Cookie': 'urlJumpIp={}; 591_new_session={};'.format(
+ 'Cookie': 'urlJumpIp={}; 591_new_session={}; PHPSESSID={}'.format(
rental_meta.id,
- self.session_token
+ self.session['591_new_session'],
+ self.session['PHPSESSID']
),
'X-CSRF-TOKEN': self.csrf_token
}
@@ -30,38 +28,23 @@ def gen_list_request_args(self, rental_meta: ListRequestMeta):
return ret
def gen_detail_request_args(self, rental_meta: DetailRequestMeta):
- if rental_meta.gps:
- # https://rent.591.com.tw/map-houseRound.html?type=1&detail=detail&version=1&post_id=6635655
- url = "{}/map-houseRound.html?type=1&detail=detail&version=1&post_id={}".format(
- SITE_URL, rental_meta.id)
+ # https://bff.591.com.tw/v1/house/rent/detail?id=11501075
+ url = "{}/v1/house/rent/detail?id={}".format(API_URL, rental_meta.id)
- #19, the house may be closed in 3 hours when we found it....
- # retrieve gps in lowest priority
- # don't filter as 591 use 30x to indicate house status...
- return {
- 'dont_filter': True,
- 'url': url,
- 'priority': -1,
- 'errback': self.error_handler,
- 'meta': {
- 'rental': rental_meta,
- 'handle_httpstatus_list': [404]
- }
- }
- else:
- # https://rent.591.com.tw/rent-detail-6635655.html
- url = "{}/rent-detail-{}.html".format(SITE_URL, rental_meta.id)
-
- # don't filter as 591 use 30x to indicate house status...
- return {
- 'dont_filter': True,
- 'url': url,
- 'errback': self.error_handler,
- 'meta': {
- 'rental': rental_meta,
- 'handle_httpstatus_list': [400, 404, 302, 301]
- }
+ # don't filter as 591 use 30x to indicate house status...
+ return {
+ 'dont_filter': True,
+ 'url': url,
+ 'errback': self.error_handler,
+ 'meta': {
+ 'rental': rental_meta,
+ 'handle_httpstatus_list': [400, 404, 302, 301]
+ },
+ 'headers': {
+ 'device': 'pc',
+ 'deviceid': self.session['PHPSESSID']
}
+ }
def error_handler(self, failure):
if failure.check(HttpError):
diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/util.py b/scrapy-package/scrapy_twrh/spiders/rental591/util.py
index e59ef034..9ec3ad37 100644
--- a/scrapy-package/scrapy_twrh/spiders/rental591/util.py
+++ b/scrapy-package/scrapy_twrh/spiders/rental591/util.py
@@ -1,9 +1,21 @@
from collections import namedtuple
+from scrapy_twrh.spiders.util import clean_number
SITE_URL = 'https://rent.591.com.tw'
-LIST_ENDPOINT = '{}/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1'.format(SITE_URL)
+API_URL = 'https://bff.591.com.tw'
+LIST_ENDPOINT = '{}/home/search/rsList?is_new_list=1&type=1&is_format_data=1'.format(SITE_URL)
SESSION_ENDPOINT = '{}/?kind=0&region=6'.format(SITE_URL)
ListRequestMeta = namedtuple('ListRequestMeta', ['id', 'name', 'page'])
-DetailRequestMeta = namedtuple('DetailRequestMeta', ['id', 'gps'])
+DetailRequestMeta = namedtuple('DetailRequestMeta', ['id'])
+
+def parse_price(number_string: str):
+    """Split a 591 price string into the price fields of a house item.
+
+    #87: 社會住宅 give their monthly price as a range written ``a~b``.
+
+    :param number_string: raw price text, a single amount or ``a~b``.
+    :return: dict with ``monthly_price`` (first amount) and, for a range,
+        ``min_monthly_price`` (second amount), each passed through
+        ``clean_number``.
+    """
+    # NOTE(review): the second amount is stored as the *min* price, which
+    # presumes 591 writes the higher bound first -- confirm with real data.
+    first, *rest = number_string.split('~')
+    parsed = {'monthly_price': clean_number(first)}
+    if rest:
+        parsed['min_monthly_price'] = clean_number(rest[0])
+
+    return parsed
diff --git a/scrapy-package/setup.py b/scrapy-package/setup.py
index 70c582b3..fbdf7c1e 100644
--- a/scrapy-package/setup.py
+++ b/scrapy-package/setup.py
@@ -5,7 +5,7 @@
setuptools.setup(
name="scrapy-tw-rental-house",
- version="0.1.2",
+ version="1.0.0",
author="ddio",
author_email="ddio@ddio.io",
description="Scrapy spider for TW Rental House",