crawler.js
// Used to check whether or not links point to external URLs
const URL = require('url')
// Used for the actual crawling with full browser capabilities
const puppeteer = require('puppeteer')
// Color debugging in the terminal
const chalk = require('chalk')
// Gratuitous use of streams to write report
const fs = require('fs')
const Stream = require('stream')
// The Worker class represents the workers doing the actual crawling,
// which report crawled URLs to the Crawler
const { Worker } = require('./worker.js')
// The Report class turns the crawl results into the written report file
const { Report } = require('./report.js')
class Crawler {
  /**
   * @description Creates a new Crawler instance
   * @param {String} rootUrl - The URL we start crawling from
   * @param {Integer} poolSize - The number of concurrent Promise-based workers we'll use to crawl pages
   */
  constructor({ rootUrl, poolSize }) {
    this.poolSize = poolSize
    this.host = URL.parse(rootUrl).host
    this.pendingUrls = new Set([rootUrl])
    this.locked = new Set()
    this.visitedUrls = new Set()
    this.count = {}
    this.errors = new Set()
  }
  /**
   * @returns {Array} A collection of crawlable URLs
   */
  get pending() {
    return [...this.pendingUrls]
  }
  /**
   * @description Indicates whether or not we're done crawling the website
   */
  get done() {
    return this.workers.every(worker => worker.done) && this.pending.length === 0
  }
  /**
   * @description Initializer for the crawler since constructors cannot be async and we need to
   * initialize the Puppeteer browser instance
   * @returns {Promise} A promise for the initialized crawler
   */
  async init() {
    this.browser = await puppeteer.launch()
    const workers = []
    for (let i = 0; i < this.poolSize; i++) {
      const worker = new Worker(this.browser, this)
      workers.push(worker.init(i))
    }
    this.workers = await Promise.all(workers)
    return this
  }
  /**
   * @description Streams the URLs that errored and were never successfully visited to report.txt
   * Kept as a stream-based alternative to the Report-based writeReport below
   * @returns {Promise}
   */
  async writeErrorReport() {
    return new Promise((resolve, reject) => {
      // A Readable needs a read() implementation (even a no-op one) before it can be piped
      const readStream = new Stream.Readable({ read() {} })
      const writeStream = fs.createWriteStream('report.txt')
      writeStream.on('error', error => {
        reject(error)
      })
      writeStream.on('close', () => {
        resolve(true)
      })
      readStream.pipe(writeStream)
      for (let line of [...this.errors].filter(url => !this.visitedUrls.has(url))) {
        readStream.push(`${line}\n`)
      }
      // Indicating the end of data
      readStream.push(null)
    })
  }
  /**
   * @description Used by the workers to take a job from the pending queue
   * @returns {String} URL to crawl
   */
  get nextLocation() {
    const [location, ...urls] = [...this.pendingUrls]
    this.pendingUrls = new Set(urls)
    return location
  }
  /**
   * @description Writes the crawl report by delegating to the Report class
   * @param {String} path - Where the report file is written
   * @returns {Promise}
   */
  async writeReport(path = './report.txt') {
    const { visitedUrls, errors, count, mixedContentLocations } = this
    const report = new Report({ visitedUrls, errors, count, mixedContentLocations, path })
    return report.generate()
  }
  /**
   * @description Orchestrator for the crawler
   * Starts the workers and waits for them all to be done
   * When everything is settled, writes a report to a file
   * @returns {Promise}
   */
  async crawl() {
    const promises = this.workers.map(worker => worker.run())
    const workers = await Promise.all(promises)
    // Collect the mixed content findings from every worker into a single list
    this.mixedContentLocations = workers.reduce((memo, worker) => {
      return [...memo, ...worker.mixedContentLocations]
    }, [])
    await this.writeReport()
    return true
  }
  /**
   * @description Each time we see a link to some URL on a page, we increment a counter.
   * This is then used in the report to indicate what the website links to the most internally.
   * @param {String} url - The URL the link points to
   */
  incrementCount(url) {
    this.count[url] = (this.count[url] || 0) + 1
  }
  /**
   * @description Indicates whether or not a link is an external link
   * @param {String} url - The URL we're checking
   * @returns {Boolean}
   */
  outbound(url) {
    return URL.parse(url).host !== this.host
  }
  /**
   * @description A link is crawlable if it's never been visited before and if it's an internal link
   * @param {String} url
   * @returns {Boolean}
   */
  crawlable(url) {
    return !this.visitedUrls.has(url) && !this.outbound(url)
  }
  /**
   * @description Adds a bunch of URLs to the pending queue
   * @param {Set} urls - The URLs we encountered while crawling
   * @returns {void}
   */
  push(urls) {
    // Strip query strings so the same page isn't queued once per query-string variation
    const cleanUrl = url => {
      const { protocol, host, path: fullPath } = URL.parse(url)
      const [path] = fullPath.split('?')
      return `${protocol}//${host}${path}`
    }
    for (let url of urls) {
      this.incrementCount(url)
    }
    const crawlableUrls = [...urls]
      .filter(url => this.crawlable(url))
      .map(url => cleanUrl(url))
    for (let url of crawlableUrls) {
      this.pendingUrls.add(url)
    }
  }
}
// Imported in app.js
module.exports = { Crawler }
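// A minimal usage sketch (assumes app.js wires things up roughly like this;
// the rootUrl and poolSize values below are illustrative, not taken from app.js):
//
//   const { Crawler } = require('./crawler.js')
//
//   ;(async () => {
//     const crawler = new Crawler({ rootUrl: 'https://example.com', poolSize: 4 })
//     await crawler.init()   // launches Puppeteer and spins up the worker pool
//     await crawler.crawl()  // crawls until the pending queue is empty, then writes the report
//   })()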