Scrapy-Twitter-API-Spider.py
import re
import json
import types
import codecs
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from mscraper.items import SocialItem
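
# Crawls the Twitter REST API v1.1 documentation: parse() gathers every
# endpoint page linked from the index, parseEndpoint() extracts the HTTP
# method, path and parameter metadata, and each endpoint is written out as
# a Perl hash entry in the style of the Net::Twitter API definition tables.
#
# NOTE: this file targets the old (pre-1.0) Scrapy API; BaseSpider and
# HtmlXPathSelector were removed in later releases, and a modern spider
# would subclass scrapy.Spider and use response.xpath() instead.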
class DocsTwitter(BaseSpider):
    name = 'docstwitter'

    # index of the next end_urls entry to fetch; end_urls is filled with
    # the endpoint documentation pages linked from the index page.
    index = 0
    end_urls = []

    start_urls = [
        'https://dev.twitter.com/docs/api/1.1'
    ]

    # Output file for the generated Perl API definitions (docs_twitter.pm).
    tab = 'twitter'
    perl = codecs.open('output/docs_' + tab + '.pm', 'w', encoding='utf-8')

    def parse(self, response):
        # Collect the link to every endpoint documentation page listed in
        # the index table, then request the first one.
        hxs = HtmlXPathSelector(response)
        endpoints = hxs.select(
            "id('content-main')/div/div/table/tbody/tr/td/a//@href").extract()
        for item in endpoints:
            itemUrl = 'https://dev.twitter.com' + item
            DocsTwitter.end_urls.append(itemUrl)
        if not DocsTwitter.end_urls:
            return None
        return Request(url=DocsTwitter.end_urls[DocsTwitter.index],
                       callback=self.parseEndpoint)
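    # The endpoint pages are fetched one at a time: parse() returns a single
    # Request and parseEndpoint() chains the next one, so the Perl entries
    # come out in the same order as the index page, at the cost of
    # serialising the crawl.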
    def parseEndpoint(self, response):
        # Move index forward: it now points at the URL to request once this
        # page has been processed.
        DocsTwitter.index += 1
        hxs = HtmlXPathSelector(response)
        title = hxs.select("id('title')//text()").extract()
        params = hxs.select(
            "id('content-main')/div/div[@class='field text field-doc-params']/div/div")

        optional = []   # every documented parameter (the Perl `params` list)
        required = []   # the subset labelled "required"
        bools = []      # parameters whose example value is true/false

        # Page titles look like "GET statuses/home_timeline": the first word
        # is the HTTP method, the second the endpoint path.
        title = ''.join(title).split(' ')
        method = title[0]
        function = title[1]

        for p in params:
            # Example value shown for the parameter; "true"/"false" marks a
            # boolean flag.
            boolean = p.select(".//p[2]/tt//text()").extract()
            boolean = boolean[0] if boolean else ''

            # "required" / "optional" label.
            essence = p.select(".//span/span//text()").extract()
            essence = essence[0] if essence else ''

            # Parameter name; skip the entry entirely if there is none.
            name = p.select(".//span//text()").extract()
            if not name:
                continue
            name = name[0]

            boolean = boolean.strip()
            essence = essence.strip()
            name = name.strip()

            if boolean == 'true' or boolean == 'false':
                bools.append(name)
            optional.append(name)
            if essence == 'required':
                required.append(name)

        optionalParams = ' '.join(optional)
        requiredParams = ' '.join(required)
        boolParams = ' '.join(bools)

        output = """
{
    aliases => [ qw// ],
    path => '%s',
    method => '%s',
    params => [ qw/%s/ ],
    required => [ qw/%s/ ],
    add_source => 0,
    deprecated => 0,
    authenticate => 1,
    booleans => [ qw/%s/ ],
    base_url_method => 'apiurl',
}
""" % (function, method, optionalParams, requiredParams, boolParams)
        DocsTwitter.perl.write(output)

        # Chain the next endpoint page, or stop once all have been written.
        if DocsTwitter.index < len(DocsTwitter.end_urls):
            return Request(url=DocsTwitter.end_urls[DocsTwitter.index],
                           callback=self.parseEndpoint)
        return None
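
# Usage sketch (assumption: the file lives in the `mscraper` Scrapy project
# that provides mscraper.items, and an output/ directory already exists,
# since docs_twitter.pm is opened as soon as the class body runs):
#
#   scrapy crawl docstwitter
#
# The generated Perl fragments are written to output/docs_twitter.pm.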