forked from simplecrawler/simplecrawler
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsavetodisk.js
75 lines (56 loc) · 2.11 KB
/
savetodisk.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
// Example use of simplecrawler, courtesy of @breck7! Thanks mate. :)
/**
 * Crawls a site with simplecrawler and mirrors every fetched resource to disk.
 *
 * Files are written beneath a directory named after the site's hostname,
 * located next to this script. Any pathname ending in "/" is stored as
 * "index.html" inside the corresponding directory.
 *
 * Note: file writes are asynchronous and are not awaited before `callback`
 * fires; the crawler's "complete" event only signals that crawling finished.
 *
 * @param {String} initialURL URL (or bare domain) to start crawling from.
 * @param {Function} [callback] Invoked with no arguments once the crawler
 *     emits "complete".
 */
var downloadSite = function(initialURL, callback) {
    var fs = require("node-fs"),
        url = require("url"),
        path = require("path"),
        Crawler = require("simplecrawler").Crawler;

    // Written via parseInt rather than the legacy literal 0755, which is a
    // SyntaxError in strict mode / ES modules.
    var DIRECTORY_MODE = parseInt("755", 8);

    var myCrawler = new Crawler(initialURL),
        // url.parse() yields no hostname for a bare domain such as
        // "mysite.com" (the form shown in the usage message), so retry with
        // a scheme before falling back to the raw argument.
        domain = url.parse(initialURL).hostname ||
            url.parse("http://" + initialURL).hostname ||
            initialURL,
        // Where to save downloaded data (the same for every fetched resource)
        outputDirectory = path.join(__dirname, domain);

    // Reports a failed filesystem operation instead of silently dropping it.
    var reportFailure = function(action, target, error) {
        console.error("Failed to %s %s: %s", action, target, error.message);
    };

    myCrawler.interval = 250;
    myCrawler.maxConcurrency = 5;

    myCrawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
        console.log("I just received %s (%d bytes)", queueItem.url, responseBuffer.length);
        console.log("It was a resource of type %s", response.headers["content-type"]);

        var pathname = url.parse(queueItem.url).pathname || "/";

        // A pathname ending in "/" names a directory, not a file: save its
        // content as index.html inside that directory.
        if (pathname.charAt(pathname.length - 1) === "/") {
            pathname += "index.html";
        }

        // path.join() normalizes "." and ".." segments, so the prefix check
        // below rejects any URL that would resolve outside outputDirectory.
        var filepath = path.join(outputDirectory, pathname);
        if (filepath.indexOf(outputDirectory + path.sep) !== 0) {
            console.error("Refusing to save %s outside of %s", queueItem.url, outputDirectory);
            return;
        }

        var dirname = path.dirname(filepath);

        // Recursively create the directory using node-fs, then write the file.
        // No prior existence check: with several fetches in flight it would be
        // racy, so an already-existing directory (EEXIST) is tolerated instead.
        fs.mkdir(dirname, DIRECTORY_MODE, true, function(mkdirError) {
            if (mkdirError && mkdirError.code !== "EEXIST") {
                reportFailure("create directory", dirname, mkdirError);
                return;
            }

            fs.writeFile(filepath, responseBuffer, function(writeError) {
                if (writeError) {
                    reportFailure("write", filepath, writeError);
                }
            });
        });
    });

    // Fire callback
    myCrawler.on("complete", function() {
        if (typeof callback === "function") {
            callback();
        }
    });

    // Start Crawl
    myCrawler.start();
};
// Command-line entry point: the site to mirror is the first argument after
// the script name; without it, print the usage line and exit with status 1.
var siteArgument = process.argv[2];
var reportDone = function() {
    console.log("Done!");
};

if (process.argv.length <= 2) {
    console.log("Usage: node savetodisk.js mysite.com");
    process.exit(1);
}

downloadSite(siteArgument, reportDone);