-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeyi_spider.rb
88 lines (77 loc) · 3.51 KB
/
deyi_spider.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
require 'yaml'
require 'mechanize'
require 'mail'
# Sends a plain-text mail from the ff4415@qq.com account to itself through
# smtp.qq.com.
#
# The SMTP password is read on every call from the :ps key of ps.yaml, which
# is resolved relative to the current working directory. Mail.defaults is
# also re-applied on every call.
#
# @param qqSubject [String] subject line of the message
# @param qqBody [String] body text of the message
# @return whatever Mail.deliver returns
def mailToQQ(qqSubject, qqBody)
  secrets = YAML.load_file('ps.yaml')
  smtp_settings = {
    address: "smtp.qq.com",
    port: 587,
    user_name: 'ff4415@qq.com',
    password: secrets[:ps],
    enable_ssl: true
  }

  # Both blocks below are evaluated by the mail gem's DSL; the locals and
  # method arguments remain visible inside them as ordinary closure variables.
  Mail.defaults { delivery_method :smtp, smtp_settings }

  Mail.deliver do
    from    'ff4415@qq.com'
    to      'ff4415@qq.com'
    subject qqSubject
    body    qqBody
  end
end
# Browser-like request headers attached to every request the crawler makes.
# Keys are Symbols (the `'Name': value` literal form produces symbols too).

# User-Agent string identifying as Chrome 54 on Windows.
browser_user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36a'

# Hard-coded cookie string sent verbatim.
# NOTE(review): looks like a captured browser session — presumably it goes
# stale over time; confirm whether the site still accepts it.
session_cookie = 'oPgi_7ff9_saltkey=1734c3a998a3012a7864aa0978bc0c9d; oPgi_7ff9_lastvisit=1479391045; adv%5Fsy-zydl-1_4696=Y; oPgi_7ff9_lastact=1480680007%09forum.php%09viewthread; js-qrcode; Hm_lvt_b051b14789f4b43a39316713ae0ac54a=1480669866; Hm_lpvt_b051b14789f4b43a39316713ae0ac54a=1480680033'

headers = {
  Host: 'www.deyi.com',
  Connection: 'keep-alive',
  'Cache-Control': 'max-age=0',
  'Upgrade-Insecure-Requests': '1',
  'User-Agent': browser_user_agent,
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  DNT: '1',
  'Accept-Encoding': 'gzip, deflate, sdch',
  'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
  Cookie: session_cookie
}
# Mails a short description of a failed fetch so the crawl can move on.
# Mechanize::ResponseCodeError and Mechanize::ResponseReadError report their
# own fields; any other error falls back to its message.
#
# @param error [StandardError] the exception that was rescued
# @param agent [Mechanize] the crawling agent (used for the current page URI)
def report_crawl_error(error, agent)
  details =
    case error
    when Mechanize::ResponseCodeError
      "code = #{error.response_code}, current_page = #{agent.page.uri}"
    when Mechanize::ResponseReadError
      "code = #{error.response} error = #{error.error} uri = #{error.uri}"
    else
      "message = #{error.message}, current_page = #{agent.page.uri}"
    end
  mailToQQ error.class.to_s, details
end

# Appends the text of every post (td.t_f) of one thread to +out+, one
# stripped line per row, following the thread's "next page" link (a.nxt)
# until there is none. Fetch errors propagate to the caller.
#
# @param agent [Mechanize] agent used to fetch the pages
# @param thread_url [String] href of the thread's first page
# @param out [IO] destination the lines are written to
def dump_thread(agent, thread_url, out)
  page = agent.get(thread_url)
  loop do
    page.parser.css('td.t_f').text.split("\n").each { |line| out.puts line.strip }
    next_link = page.parser.css('a.nxt')[0]
    break unless next_link
    page = agent.get(next_link['href'])
  end
end

agent = Mechanize.new
agent.request_headers = headers
index_page = agent.get('http://www.deyi.com/forum.php')
# Dots are escaped so only the literal host name and ".html" suffix match.
board_links = index_page.links_with(href: %r{www\.deyi\.com/forum-\d+-\d+\.html})

# Running count of threads visited across all boards; written to each dump
# under the original "totalPageNumber" label.
thread_total = 0
mailers = []

board_links.each do |board_link|
  # e.g. "http://www.deyi.com/forum-2-1.html" -> "forum-2-1"
  dump_name = File.basename(board_link.href, '.html')

  File.open(dump_name, 'w') do |f|
    begin
      board_page = board_link.click
      loop do
        board_page.parser.css('a.xst').each do |thread_link|
          begin
            dump_thread(agent, thread_link['href'], f)
          rescue StandardError => e
            # Report and abandon this thread; retrying the same request in a
            # tight loop would re-write its posts and send a mail per attempt.
            report_crawl_error(e, agent)
          end
          sleep rand # random pause of under a second between threads
          thread_total += 1
        end

        # Recorded for every listing page, including the last one.
        f.puts "totalPageNumber = #{thread_total}"

        next_board_link = board_page.parser.css('a.nxt')[0]
        break unless next_board_link
        # Must rebind the page the loop iterates over, otherwise the first
        # listing page is scraped again on every pass.
        board_page = agent.get(next_board_link['href'])
      end
    rescue StandardError => e
      # A listing page could not be loaded: stop this board, keep what was
      # collected so far, and continue with the next board.
      report_crawl_error(e, agent)
    end
  end

  # The name is passed as a thread argument so each mailer keeps its own copy.
  mailers << Thread.new(dump_name) { |name| mailToQQ name, File.read(name) }
end

# Wait for outstanding mails; otherwise the process may exit before the last
# dumps have been sent.
mailers.each do |mailer|
  begin
    mailer.join
  rescue StandardError => e
    warn "mail delivery failed: #{e.class}: #{e.message}"
  end
end