-
Notifications
You must be signed in to change notification settings - Fork 8
/
brokenLinks.js
83 lines (72 loc) · 2.35 KB
/
brokenLinks.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* eslint-disable import/no-extraneous-dependencies */
/* eslint-disable no-console */
/* eslint-disable import/prefer-default-export */
/* eslint-disable no-await-in-loop */
import fs from "fs/promises";
import path from "path";
import fetch from "node-fetch";
import jsdom from "jsdom";
/**
 * Reads an HTML file and collects the `href` of every anchor nested in a
 * `dd.col-12.col-sm-8.col-lg-9` element.
 *
 * @param {string} filePath - Path of the HTML file to parse.
 * @returns {Promise<string[]>} The anchors' `href` values, in document order.
 */
const parseCitationLinks = async (filePath) => {
  const markup = await fs.readFile(filePath, "utf8");
  const { document } = new jsdom.JSDOM(markup).window;
  const anchors = document.querySelectorAll("dd.col-12.col-sm-8.col-lg-9 a");
  return Array.from(anchors, (anchor) => anchor.href);
};
/**
 * Builds a lookup from each city's public page URL to the citation links
 * found in its local HTML file under `city_detail`.
 *
 * Directory entries are kept when their name contains ".html" and is not
 * "attachment_images". All kept files are parsed concurrently.
 *
 * @returns {Promise<Object<string, string[]>>} Object keyed by city URL whose
 *   values are the links returned by `parseCitationLinks` for that file.
 */
const mapCityUrlsToCitationLinks = async () => {
  const htmlFiles = [];
  for (const entry of await fs.readdir("city_detail")) {
    if (entry === "attachment_images" || !entry.includes(".html")) {
      continue;
    }
    htmlFiles.push(entry);
  }

  const urlLinkPairs = await Promise.all(
    htmlFiles.map(async (name) => {
      const localPath = path.join("city_detail", name);
      return [
        `https://parkingreform.org/mandates-map/city_detail/${name}`,
        await parseCitationLinks(localPath),
      ];
    })
  );

  return Object.fromEntries(urlLinkPairs);
};
/**
 * Requests every link concurrently and reports the ones that look broken.
 *
 * Each reported entry is a `[link, code]` pair where `code` is:
 *   - `0`  when the link is empty/falsy (it is reported without being fetched),
 *   - `-1` when the request itself throws (the error message is logged),
 *   - the HTTP status when the response status is 300 or above.
 * Links whose response status is below 300 are omitted.
 *
 * @param {string[]} links - Links to check.
 * @returns {Promise<Array<[string, number]>>} Pairs for the broken links, in
 *   the same order as `links`.
 */
const findDeadLinks = async (links) => {
  const probe = async (url) => {
    // Empty links cannot be fetched, but they are still worth reporting.
    if (!url) {
      return [url, 0];
    }

    let status;
    try {
      ({ status } = await fetch(url, {
        headers: { "User-Agent": "prn-broken-links-finder" },
      }));
    } catch (err) {
      console.error(`Failed to fetch ${url}: ${err.message}`);
      return [url, -1];
    }

    return status >= 300 ? [url, status] : null;
  };

  const outcomes = await Promise.all(links.map((url) => probe(url)));
  return outcomes.filter((outcome) => Boolean(outcome));
};
/**
 * Checks the citation links of every city page and prints a report object
 * mapping each city URL to its `[link, code]` dead-link pairs.
 *
 * Only cities that have at least one dead link appear in the report.
 *
 * @returns {Promise<void>} Resolves once the report has been logged.
 */
const main = async () => {
  const cityUrlsToCitationLinks = await mapCityUrlsToCitationLinks();
  // Cities are processed one at a time (not all in parallel) to avoid making
  // too many network calls at once and getting rate limited.
  const result = {};
  for (const [cityUrl, links] of Object.entries(cityUrlsToCitationLinks)) {
    const deadLinks = await findDeadLinks(links);
    // findDeadLinks always returns an array, and an empty array is truthy, so
    // the length must be checked to leave healthy cities out of the report.
    if (deadLinks.length > 0) {
      result[cityUrl] = deadLinks;
    }
  }
  console.log(result);
};
// Start the scan when this module is loaded, except when NODE_ENV is "test",
// so that importing the module in tests does not trigger it.
const runningUnderTest = process.env.NODE_ENV === "test";
if (!runningUnderTest) {
  main().catch((err) => {
    console.error(err);
  });
}
export { parseCitationLinks };