-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample.py
71 lines (52 loc) · 1.93 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# coding: utf-8
from splinter_model import BaseFetcherModel, CSSField, XPathField, RedisCache
class TestFetcher(BaseFetcherModel):
    """Example fetcher declaring a few scraped fields as class attributes.

    Each field is bound to an XPath or CSS selector.
    NOTE(review): the ``parse_<field>`` methods are presumably called by
    ``BaseFetcherModel`` with the selector matched for that field -- confirm
    against splinter_model.
    """

    # Anchor inside the second table row of the first div under #content;
    # parse_photo_url reads the @href of the selector it receives.
    photo_url = XPathField('//*[@id="content"]/div[1]/table/tr[2]/td/a')

    nationality = CSSField(
        '#content > div:nth-child(1) > table > tr:nth-child(4) > td > a::text',
        takes_first=True,
        processor=lambda value: value.upper()  # it could be a list of funcs
    )

    links = CSSField(
        '#content > div:nth-child(11) > ul > li > a.external::attr(href)',
        auto_extract=True
    )

    def parse_photo_url(self, selector):
        """Build a full en.m.wikipedia.org URL from the selector's ``@href``.

        Only the first extracted href is used; indexing ``[0]`` raises
        ``IndexError`` when nothing was extracted.
        """
        href = selector.xpath("@href").extract()[0]
        # The base already ends with "/". A root-relative href ("/wiki/...")
        # would otherwise produce "http://en.m.wikipedia.org//wiki/...", so
        # leading slashes are dropped before joining.
        return "http://en.m.wikipedia.org/{}".format(href.lstrip("/"))

    def parse_name(self, selector):
        """Return the first value extracted from ``selector``.

        Raises ``IndexError`` when the extraction result is empty.
        """
        return selector.extract()[0]

    def post_parse(self):
        """Hook that copies the fetcher's URL onto the parsed data."""
        # executed after all parsers
        # you can load any data on to self._data
        # access self._data and self._fields for current data
        # self.selector contains original page
        # self.fetch() returns original html
        self._data.url = self.url
class DummyModel(object):
    """Empty placeholder class used only by the example/tests.

    It stands in for whatever model class (for instance a model from your
    database ORM) the fetched data would be populated into.
    """
if __name__ == "__main__":
    # Demo script: configure a TestFetcher, parse one page, print the parsed
    # data, then copy selected fields onto a plain object.
    from pprint import pprint

    fetcher = TestFetcher(cache_fetch=True,
                          cache=RedisCache,
                          cache_expire=1800)
    fetcher.url = "http://en.m.wikipedia.org/wiki/Guido_van_Rossum"

    # Mappings can be loaded from a json file
    # fetcher.load_mappings_from_file('path/to/file')
    fetcher.mappings['name'] = {
        "css": "#section_0::text"
    }

    fetcher.parse()

    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("Fetcher holds the data")
    print(fetcher._data.name)
    pprint(fetcher._data)

    # How to populate an object
    print("Populating an object")
    dummy = DummyModel()
    fetcher.populate(dummy, fields=["name", "nationality"])
    # fields attr is optional
    print(dummy.nationality)
    pprint(dummy.__dict__)