-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.js
113 lines (96 loc) · 2.75 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/**
 * Crawl Thing Descriptions level by level, starting at `startUri`.
 *
 * Each level fetches every pending URI (in batches of at most
 * `parallelCount` concurrent `fetchTD` calls) and collects the links the
 * fetched documents report; those links form the next level.
 *
 * @param {string} startUri - URI of the first Thing Description to fetch.
 * @param {number} maxDepth - Maximum number of levels to process. Crawling
 *   stops once this many levels were fetched or no URIs are left.
 * @param {number} [parallelCount=800] - Maximum number of `fetchTD` calls
 *   started per batch.
 * @returns {Promise<Set<object>>} All Thing Descriptions that were fetched.
 * @throws {RangeError} If `parallelCount` is not a positive integer.
 * @throws Rejects with whatever `fetchTD` rejects with (fail-fast via
 *   `Promise.all`).
 */
async function crawl(startUri, maxDepth, parallelCount = 800) {
  if (!Number.isInteger(parallelCount) || parallelCount < 1) {
    // A batch size of 0 would never drain the queue and loop forever.
    throw new RangeError("parallelCount must be a positive integer");
  }

  // Found Thing Descriptions
  const allCrawledTDs = new Set();
  // URIs that were already scheduled for fetching; each URI is fetched once
  const allVisitedUris = new Set();
  // URIs to visit on the current level
  let urisToVisit = [startUri];
  let depthCounter = 0;

  while (urisToVisit.length > 0) {
    // Same comparison as before so an omitted maxDepth still means "no limit".
    if (depthCounter >= maxDepth) {
      break;
    }

    const newlyFoundURIs = [];

    // Visit all URIs of this level, one batch at a time
    while (urisToVisit.length > 0) {
      const batch = [];
      while (batch.length < parallelCount && urisToVisit.length > 0) {
        const uri = urisToVisit.pop();
        // Skip duplicates and link cycles instead of fetching them again.
        if (!allVisitedUris.has(uri)) {
          allVisitedUris.add(uri);
          batch.push(uri);
        }
      }

      const results = await Promise.all(batch.map((uri) => fetchTD(uri)));

      for (const result of results) {
        // fetchTD yields no result for non-OK responses; nothing to record.
        if (!result) {
          continue;
        }
        const [td, links] = result;
        allCrawledTDs.add(td);
        // Element-wise push: spreading a very large array into push() can
        // exceed the engine's argument limit.
        for (const link of links) {
          newlyFoundURIs.push(link);
        }
      }
    }

    urisToVisit = newlyFoundURIs;
    depthCounter++;
  }

  return allCrawledTDs;
}
/**
 * Fetch a Thing Description and collect the links that point to further
 * Thing Descriptions.
 *
 * A link counts as a Thing Description link when its `type` is exactly
 * "application/td+json", or, if not, when a HEAD request to its `href`
 * reports a Content-Type containing "application/td+json".
 *
 * @param {string} uri - URI of the Thing Description to fetch.
 * @returns {Promise<[object, string[]] | undefined>} A `[td, nextTDLinks]`
 *   pair, or `undefined` when the response status is not OK.
 * @throws Rejects when a request fails or the body is not valid JSON.
 */
async function fetchTD(uri) {
  const res = await fetch(uri);
  if (!res.ok) {
    return undefined;
  }

  // Get TD
  const td = await res.json();
  const nextTDLinks = [];
  const links = td && td.links ? Object.values(td.links) : [];

  for (const link of links) {
    // Without a usable href there is nothing to follow.
    if (!link || typeof link.href !== "string") {
      continue;
    }

    // Declared type is enough; no request needed.
    if (link.type === "application/td+json") {
      nextTDLinks.push(link.href);
      continue;
    }

    // Use a HEAD request to learn the content type of the link target.
    // No `agent` option is passed: `httpsAgent` is not defined in this file,
    // so referencing it threw a ReferenceError.
    const response = await fetch(link.href, { method: "HEAD" });
    const contentType = response.headers.get("Content-Type");
    // headers.get() yields null when the header is absent.
    if (contentType && contentType.includes("application/td+json")) {
      nextTDLinks.push(link.href);
    }
  }

  return [td, nextTDLinks];
}
// Public API of this module
module.exports = { crawl };