Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Apply scrapy package #93

Merged
merged 4 commits into from
Oct 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 6 additions & 36 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,47 +4,17 @@ url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
pylint-django = "*"
pylint = "*"
pylint-plugin-utils = "*"

[packages]
asn1crypto = "==0.24.0"
astroid = "==2.0"
attrs = "==17.4.0"
beautifulsoup4 = "==4.6.3"
cffi = "==1.11.5"
constantly = "==15.1.0"
cryptography = "==2.3"
cssselect = "==1.0.3"
cffi = "==1.13.2"
django = "==2.1.15"
hyperlink = "==18.0.0"
idna = "==2.6"
incremental = "==17.5.0"
isort = "==4.3.4"
jsonfield = "==2.0.2"
lazy-object-proxy = "==1.3.1"
lxml = "==4.2.0"
mccabe = "==0.6.1"
parsel = "==1.4.0"
pyasn1 = "==0.4.2"
pyasn1-modules = "==0.2.1"
pycparser = "==2.18"
pylint = "==2.0.0"
pylint-django = "==2.0.2"
pylint-plugin-utils = "==0.4"
pytz = "==2018.5"
queuelib = "==1.5.0"
raven = "==6.9.0"
service-identity = "==17.0.0"
six = "==1.11.0"
typed-ast = "==1.1.0"
w3lib = "==1.19.0"
wrapt = "==1.10.11"
Automat = "==0.6.0"
PyDispatcher = "==2.0.5"
pyOpenSSL = "==17.5.0"
Scrapy = "==1.5.0"
Twisted = "==17.9.0"
"zope.interface" = "==4.4.3"
psycopg2-binary = "==2.7.5"
psycopg2-binary = "==2.8.6"
scrapy-tw-rental-house = "==1.1.2"

[requires]
python_version = "3"
853 changes: 506 additions & 347 deletions Pipfile.lock

Large diffs are not rendered by default.

12 changes: 4 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@

#### 環境需求

1. Python3.5+
1. Python3.8+
2. pip
3. pipenv (選用)
3. pipenv
4. [PostgreSQL](https://www.postgresql.org) 9.5+
- 使用 PostgreSQL 以外的資料庫時,爬蟲可以順利執行,但使用內建的匯出指令時無法用 `-u --unique` 去除重複物件
5. GeoDjango ,目前[主要的關聯式資料庫都有支援](https://docs.djangoproject.com/en/2.1/ref/contrib/gis/db-api/)
Expand All @@ -39,13 +39,9 @@
#### 資料庫設定

```sh
# 使用 virtualenv 安裝相關套件
virtualenv -p python3 .
pip install -r requirements.txt
. ./bin/activate

# 也可使用 pipenv 安裝相關套件
# 使用 pipenv 安裝相關套件
pipenv install
pipenv shell

cd backend
# 設定資料庫(預設使用 sqlite)
Expand Down
16 changes: 8 additions & 8 deletions backend/rental/libs/export/field.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,13 @@ def to_human(self, val, use_tf=True):
if self.fn:
val = self.fn(val)

if type(val) is datetime:
if isinstance(val, datetime):
val = timezone.localtime(val).strftime('%Y-%m-%d %H:%M:%S %Z')
elif val is '' or val is None:
elif val == '' or val is None:
val = '-'
elif val is True or val == 'true':
elif val == True or val == 'true':
val = 'T' if use_tf else 1
elif val is False or val == 'false':
elif val == False or val == 'false':
val = 'F' if use_tf else 0

return val
Expand All @@ -51,13 +51,13 @@ def to_machine(self, val):
if self.fn:
val = self.fn(val)

if type(val) is datetime:
if isinstance(val, datetime):
pass
elif val is '' or val is None:
elif val == '' or val is None:
val = None
elif val is True or val == 'true':
elif val == True or val == 'true':
val = True
elif val is False or val == 'false':
elif val == False or val == 'false':
val = False

return val
23 changes: 23 additions & 0 deletions backend/rental/migrations/0008_support_price_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 2.1.15 on 2021-10-26 04:49

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add a nullable ``min_monthly_price`` integer column to House and HouseTS.

    Migration name is ``0008_support_price_range``: presumably listings can
    now advertise a rent range, with this column holding the lower bound and
    the existing ``monthly_price`` the upper — confirm against models.py.
    Auto-generated by Django 2.1.15; do not hand-edit the operations.
    """

    dependencies = [
        ('rental', '0007_more_property_type'),
    ]

    operations = [
        # Nullable so existing rows need no default/backfill at migrate time.
        migrations.AddField(
            model_name='house',
            name='min_monthly_price',
            field=models.IntegerField(null=True),
        ),
        # Mirror the field on the time-series snapshot model.
        migrations.AddField(
            model_name='housets',
            name='min_monthly_price',
            field=models.IntegerField(null=True),
        ),
    ]
1 change: 1 addition & 0 deletions backend/rental/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class BaseHouse(models.Model):
vendor_house_url = models.URLField(null=True)
# price related
monthly_price = models.IntegerField(null=True)
min_monthly_price = models.IntegerField(null=True)
deposit_type = models.IntegerField(
choices = [(tag, tag.value) for tag in DepositType],
null=True
Expand Down
73 changes: 0 additions & 73 deletions crawler/crawler/items.py

This file was deleted.

28 changes: 22 additions & 6 deletions crawler/crawler/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,24 @@
import logging
import traceback
from django.utils import timezone
from rental.models import HouseTS, House, HouseEtc
from rental.models import HouseTS, House, HouseEtc, Vendor, Author
from rental.enums import DealStatusType
from .items import GenericHouseItem, RawHouseItem
from scrapy_twrh.items import GenericHouseItem, RawHouseItem
from django.contrib.gis.geos import Point
from crawler.utils import now_tuple


class CrawlerPipeline(object):

def __init__(self) -> None:
super().__init__()
self.vendorMap = {}
for vendor in Vendor.objects.all():
self.vendorMap[vendor.name] = vendor

def item_vendor (self, item):
return self.vendorMap[item['vendor']]

def process_item(self, item, spider):
y, m, d, h = now_tuple()

Expand All @@ -24,13 +34,13 @@ def process_item(self, item, spider):

house, created = House.objects.get_or_create(
vendor_house_id=item['house_id'],
vendor=item['vendor']
vendor=self.item_vendor(item)
)

house_etc, created = HouseEtc.objects.get_or_create(
house=house,
vendor_house_id=item['house_id'],
vendor=item['vendor']
vendor=self.item_vendor(item)
)

if 'raw' in item:
Expand All @@ -49,12 +59,12 @@ def process_item(self, item, spider):
house_ts, created = HouseTS.objects.get_or_create(
year=y, month=m, day=d, hour=h,
vendor_house_id=item['vendor_house_id'],
vendor=item['vendor']
vendor=self.item_vendor(item)
)

house, created = House.objects.get_or_create(
vendor_house_id=item['vendor_house_id'],
vendor=item['vendor']
vendor=self.item_vendor(item)
)

to_db = item.copy()
Expand All @@ -69,6 +79,12 @@ def process_item(self, item, spider):
house.deal_status == DealStatusType.DEAL:
should_rollback_house_deal_status = True

if 'rough_coordinate' in to_db:
to_db['rough_coordinate'] = Point(to_db['rough_coordinate'], srid=4326)
if 'author' in to_db:
author_info, created = Author.objects.get_or_create(truth=to_db['author'])
to_db['author'] = author_info

for attr in to_db:
setattr(house_ts, attr, to_db[attr])
setattr(house, attr, to_db[attr])
Expand Down
86 changes: 0 additions & 86 deletions crawler/crawler/spiders/all_591_cities.py

This file was deleted.

Loading