-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.js
executable file
·121 lines (110 loc) · 3.51 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env casperjs
var fs = require('fs');
var utils = require('utils');
// Shared CasperJS instance used by every step defined in this script.
var casper = require('casper').create({
    // Only error-level log messages are reported.
    logLevel : 'error',
    verbose : true,
    // Timeout applied to the wait* steps below.
    waitTimeout : 15000,
    pageSettings : {
        // Present ourselves as desktop Safari.
        userAgent : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.6.3 (KHTML, like Gecko) Version/7.1.6 Safari/537.85.15',
        // Only text is scraped, so images and plugins are not loaded.
        loadImages : false,
        loadPlugins : false,
        webSecurityEnabled : false,
        // NOTE(review): confirm this key is honoured as a page setting in the
        // CasperJS/PhantomJS version in use.
        ignoreSslErrors : true,
    },
});
// Positional command-line arguments: first the group name, then the topic count.
var positional = casper.cli.args;
var group = positional[0];
var count = positional[1];
/**
 * Queue a step that sends a single "keypress" event to the page.
 *
 * @param {string} key - key to send to the page.
 * @returns whatever `this.then` returns (the casper instance, for chaining).
 */
casper.thenPress = function(key) {
    function pressKey() {
        this.page.sendEvent("keypress", key);
    }
    return this.then(pressKey);
};
// Title of the most recently accepted topic; used to detect that the page
// has actually moved on to a different topic.
var lastTitle;

/**
 * Queue steps that wait until the "#t-t" title shows a topic different from
 * the previously accepted one, with a 100 ms pause before and after.
 *
 * The polling predicate has side effects: while waiting it re-sends
 * navigation key presses to nudge the page along.
 *
 * @param then - unused.
 * @returns the result of the final `wait` call (for chaining).
 */
casper.waitTopic = function(then) {
    function topicChanged() {
        var current = this.fetchText("#t-t");

        if (!current) {
            // No title on the page: if the '.MV0LWFC-fb-f' element is
            // present, press "j" then "o"; either way keep waiting.
            if (this.exists('.MV0LWFC-fb-f')) {
                this.page.sendEvent("keypress", "j");
                this.page.sendEvent("keypress", "o");
            }
            return false;
        }

        if (current !== lastTitle) {
            // A new topic is showing: remember it and stop waiting.
            lastTitle = current;
            return true;
        }

        // Still on the previous topic: press "j" again and keep waiting.
        this.page.sendEvent("keypress", "j");
        return false;
    }

    return this.wait(100).waitFor(topicChanged).wait(100);
};
/**
 * Press "j", wait for a different topic to show, then queue `then` as a step.
 *
 * @param {Function} then - step to run once the topic is showing.
 * @returns the result of the final `then` call (for chaining).
 */
casper.nextTopic = function(then) {
    var afterKey = this.thenPress("j");
    var afterWait = afterKey.waitTopic();
    return afterWait.then(then);
};
/**
 * Press "o", wait for a topic to show, then queue `then` as a step.
 *
 * @param {Function} then - step to run once the topic is showing.
 * @returns the result of the final `then` call (for chaining).
 */
casper.firstTopic = function(then) {
    var afterKey = this.thenPress("o");
    var afterWait = afterKey.waitTopic();
    return afterWait.then(then);
};
// Scraped topics, keyed by topic id.
var topics = {};

/**
 * Scrape the topic currently shown in the page and store it in `topics`.
 *
 * The topic id is the last path segment of the current URL; when the URL has
 * no such segment (e.g. it ends with "/"), the whole URL is used as the id
 * instead of throwing. Posts whose text contains "This message has been
 * deleted" are skipped. A post that lacks the user, date or body element gets
 * `null` for that field rather than aborting the extraction of every post.
 *
 * Also queues a step that echoes the topic title.
 *
 * @returns the casper instance (for chaining).
 */
casper.processTopic = function() {
    var url = this.getCurrentUrl();
    var idMatch = url.match(/[^\/]+$/);
    var id = idMatch ? idMatch[0] : url;
    var title = this.fetchText('#t-t');
    topics[id] = {
        id : id,
        url : url,
        title : title,
        // This callback is evaluated inside the page, so it must be
        // self-contained: helpers are declared within it.
        posts : this.evaluate(function() {
            // innerText of the first match under `node`, or null if absent.
            function textOf(node, selector) {
                var el = node.querySelector(selector);
                return el ? el.innerText : null;
            }
            // title attribute of the first match under `node`, or null if absent.
            function titleOf(node, selector) {
                var el = node.querySelector(selector);
                return el ? el.title : null;
            }
            return __utils__
                .findAll("#tm-tl > div")
                .filter(function(post) {
                    return !post.innerText.match(/This message has been deleted/);
                })
                .map(function(node, i) {
                    // querySelectorAll always returns a list (possibly empty),
                    // so `links` is always present on the post.
                    var anchors = node.querySelectorAll('.MV0LWFC-fd-a a');
                    return {
                        i : i,
                        user : textOf(node, '._username'),
                        date : titleOf(node, '.MV0LWFC-nb-Q.MV0LWFC-b-Cb'),
                        body : textOf(node, '.MV0LWFC-nb-P'),
                        links : Array.prototype.map.call(anchors, function(link) {
                            return { href : link.href, label : link.innerText };
                        }),
                    };
                });
        }),
    };
    this.then(function() { this.echo(title) });
    return this;
};
/**
 * Queue a step that writes every collected topic, as 2-space indented JSON,
 * to "<count>-<group>.json".
 *
 * @returns the result of `this.then` (for chaining).
 */
casper.dumpTopics = function() {
    function writeTopics() {
        var path = count + "-" + group + ".json";
        var payload = JSON.stringify(topics, null, 2);
        fs.write(path, payload, 'w');
    }
    return this.then(writeTopics);
};
/**
 * Queue the steps that open the first topic and then advance through the
 * following ones, processing each topic as it is shown.
 *
 * The first topic is always queued; `n - 1` further topics follow.
 *
 * @param n - total number of topics to process.
 * @returns the casper instance (for chaining).
 */
casper.fetchTopics = function(n) {
    // Declared locally: the loop counter used to leak as an implicit global.
    var i;
    this.firstTopic(function() { this.processTopic() });
    for (i = 1; i < n; i++) {
        this.nextTopic(function() { this.processTopic() });
    }
    return this;
};
// Entry point: open the group's forum page, announce the run, scrape `count`
// topics, write them to disk, then exit.
casper
    .start('https://groups.google.com/forum/#!forum/' + group)
    // Announce as a regular step. A callback passed to run() is the
    // onComplete handler, which fires only after every step has finished.
    .then(function() { this.echo("Scraping " + group ) })
    .fetchTopics(count)
    .dumpTopics()
    // run() goes last, once all steps are queued. Because an onComplete
    // callback is supplied, the script must exit explicitly.
    .run(function() { this.exit() });