withastro · tony-sull · Apr 22, 2022 · Apr 20, 2022 · Apr 21, 2022 · Apr 21, 2022
diff --git a/package.json b/package.json
@@ -12,9 +12,8 @@
     "lint:a11y": "start-test 'yarn build && yarn preview' 3000 'yarn lint:a11y:local'",
     "lint:a11y:local": "pa11y-ci --sitemap 'http://localhost:3000/sitemap.xml' --sitemap-find 'https://docs.astro.build' --sitemap-replace 'http://localhost:3000'",
     "lint:a11y:remote": "pa11y-ci --sitemap 'https://docs.astro.build/sitemap.xml'",
-    "lint:linkcheck": "start-test 'yarn dev --silent' 3000  'yarn lint:linkcheck:local'",
-    "lint:linkcheck:local": "blc -roe --user-agent 'broken-link-checker/0.7.8' 'http://localhost:3000/en/getting-started'",
-    "lint:linkcheck:remote": "blc -ro --user-agent 'broken-link-checker/0.7.8' 'https://docs.astro.build/'"
+    "lint:linkcheck": "astro build && node ./scripts/lint-linkcheck.mjs",
+    "lint:linkcheck:nobuild": "node ./scripts/lint-linkcheck.mjs"
   },
   "devDependencies": {
     "@algolia/client-search": "^4.13.0",
@@ -24,6 +23,8 @@
     "@babel/core": "^7.17.9",
     "@types/react": "^17.0.43",
     "astro": "^1.0.0-beta.5",
+    "chalk": "^5.0.1",
+    "htmlparser2": "^7.2.0",
     "node-fetch": "^3.2.3",
     "preact": "^10.7.1",
     "prettier": "^2.6.2",

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/scripts/lint-linkcheck.mjs b/scripts/lint-linkcheck.mjs
@@ -0,0 +1,192 @@
+import path from 'path';
+import fs from 'fs';
+import chalk from 'chalk';
+import htmlparser2 from 'htmlparser2';
+
+/**
+ * Contains all link checking logic.
+ */
+class BrokenLinkChecker {
+	/**
+	 * @param {object} options
+	 * @param {string} options.baseUrl Base URL that sitemap entries and internal links are matched against.
+	 * @param {string} options.buildOutputDir Directory containing the build output (`sitemap.xml` and pages).
+	 */
+	constructor ({ baseUrl, buildOutputDir }) {
+		this.baseUrl = baseUrl;
+		this.buildOutputDir = buildOutputDir;
+	}
+
+	/**
+	 * Checks all pages referenced by the sitemap for broken links
+	 * and outputs the result to the console.
+	 */
+	run () {
+		const pagePathnames = this.getPagePathnamesFromSitemap();
+		const pages = this.parsePages(pagePathnames);
+		const brokenLinks = this.findBrokenLinks(pages);
+		this.outputResult(brokenLinks);
+	}
+
+	/**
+	 * Reads the `sitemap.xml` from the build output and extracts all unique pathnames.
+	 *
+	 * @returns {string[]} Unique pathnames of all `<loc>` entries that start with `baseUrl`.
+	 */
+	getPagePathnamesFromSitemap () {
+		const sitemapFilePath = path.join(this.buildOutputDir, 'sitemap.xml');
+		const sitemap = fs.readFileSync(sitemapFilePath, 'utf8');
+		// Escape regex metacharacters so that e.g. the dots of the host name only match literal dots
+		const escapedBaseUrl = this.baseUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+		const sitemapRegex = new RegExp(`<loc>${escapedBaseUrl}(/.*?)</loc>`, 'ig');
+
+		return [...new Set(Array.from(sitemap.matchAll(sitemapRegex), m => m[1]))];
+	}
+
+	/**
+	 * Parses multiple HTML pages based on their pathnames and builds an index of their contents.
+	 *
+	 * @param {string[]} pathnames Pathnames of the pages to parse.
+	 * @returns {Object<string, object>} Page indexes (see `parsePage`), keyed by pathname.
+	 */
+	parsePages (pathnames) {
+		const pages = {};
+		for (const pathname of pathnames) {
+			pages[pathname] = this.parsePage(pathname);
+		}
+
+		return pages;
+	}
+
+	/**
+	 * Parses an HTML page based on its pathname and builds an index of its contents.
+	 *
+	 * @param {string} pathname Pathname of the page as listed in the sitemap.
+	 * @returns {{ pathname: string, href: string, htmlFilePath: string, linkHrefs: string[], hashes: string[] }}
+	 * @throws {Error} If the HTML file of the page does not exist in the build output.
+	 */
+	parsePage (pathname) {
+		const href = this.pathnameToHref(pathname);
+		const htmlFilePath = this.pathnameToHtmlFilePath(pathname);
+
+		if (!fs.existsSync(htmlFilePath)) {
+			throw new Error(`Failed to find HTML file referenced by sitemap: ${htmlFilePath}`);
+		}
+
+		// Read the file as text, as `parseDocument` is documented to take a string
+		const dom = htmlparser2.parseDocument(fs.readFileSync(htmlFilePath, 'utf8'));
+		const anchors = htmlparser2.DomUtils.getElementsByTagName('a', dom, true);
+
+		// Build a list of unique link hrefs on the page. Anchors without `href` (e.g. `<a name>`
+		// scroll targets) are not links and would otherwise be resolved as ".../undefined"
+		const linkHrefs = [...new Set(anchors
+			.map(el => el.attribs.href)
+			.filter(linkHref => linkHref !== undefined)
+		)];
+
+		// Build a list of hashes provided by the page (mostly used as scroll targets)
+		const anchorNames = anchors
+			.map(el => el.attribs.name)
+			.filter(name => name !== undefined);
+		const ids = htmlparser2.DomUtils
+			.getElements({ id: id => id !== undefined }, dom, true)
+			.map(el => el.attribs.id);
+		const hashes = [...anchorNames, ...ids].map(name => `#${name}`);
+
+		return { pathname, href, htmlFilePath, linkHrefs, hashes };
+	}
+
+	/**
+	 * Goes through all pre-parsed and indexed pages, checks their links,
+	 * and returns an array containing all broken links (if any).
+	 *
+	 * A link is broken if its URL starts with `baseUrl` and it points to a page that is not
+	 * part of `pages`, or to a hash the linked page does not provide. Other URLs are ignored.
+	 *
+	 * @param {Object<string, object>} pages Page indexes as returned by `parsePages`.
+	 * @returns {{ page: object, href: string, isMissingPage: boolean, isMissingHash: boolean }[]}
+	 */
+	findBrokenLinks (pages) {
+		const brokenLinks = [];
+
+		for (const page of Object.values(pages)) {
+			for (const linkHref of page.linkHrefs) {
+				// Skip missing hrefs, as `new URL(undefined, …)` would resolve to ".../undefined"
+				if (linkHref === undefined) continue;
+				const url = new URL(linkHref, page.href);
+
+				// Ignore external URLs
+				if (!url.href.startsWith(this.baseUrl)) continue;
+
+				// Normalize to a trailing slash before the lookup (assumes sitemap pathnames end with `/`)
+				const linkPathname = url.pathname.endsWith('/') ? url.pathname : `${url.pathname}/`;
+				const linkedPage = pages[linkPathname];
+				const isMissingPage = !linkedPage;
+				const decodedHash = url.hash && decodeURIComponent(url.hash);
+				const isMissingHash = Boolean(linkedPage && decodedHash && !linkedPage.hashes.includes(decodedHash));
+
+				if (isMissingPage || isMissingHash) {
+					brokenLinks.push({ page, href: url.href, isMissingPage, isMissingHash });
+				}
+			}
+		}
+
+		return brokenLinks;
+	}
+
+	/**
+	 * Outputs the result of the broken link check to the console.
+	 *
+	 * @param {{ page: object, href: string, isMissingHash: boolean }[]} brokenLinks
+	 */
+	outputResult (brokenLinks) {
+		const totalBroken = brokenLinks.length;
+
+		if (totalBroken > 0) {
+			const brokenHashCount = brokenLinks.filter(brokenLink => brokenLink.isMissingHash).length;
+			const brokenPageCount = totalBroken - brokenHashCount;
+			const prefixPage = chalk.gray(`[${chalk.redBright('404')}]`);
+			const prefixHash = chalk.gray(`[${chalk.yellowBright(' # ')}]`);
+
+			// Print the pathname of a page whenever it differs from the previous entry's page
+			let lastPage;
+			brokenLinks.forEach(brokenLink => {
+				if (lastPage !== brokenLink.page) {
+					console.log(`\n${brokenLink.page.pathname}`);
+					lastPage = brokenLink.page;
+				}
+				console.log(`  ${brokenLink.isMissingHash ? prefixHash : prefixPage} ${brokenLink.href}`);
+			});
+			console.log();
+
+			const summary = [
+				`*** Found ${totalBroken} broken ${totalBroken === 1 ? 'link' : 'links'} in total:`,
+				`  ${prefixPage} ${brokenPageCount} broken page ${brokenPageCount === 1 ? 'link' : 'links'}`,
+				`  ${prefixHash} ${brokenHashCount} broken fragment ${brokenHashCount === 1 ? 'link' : 'links'}`,
+			];
+			console.log(chalk.whiteBright.bold(summary.join('\n')));
+		} else {
+			console.log(chalk.greenBright('*** Found no broken links. Great job!'));
+		}
+		console.log();
+	}
+
+	/** Resolves a pathname against `baseUrl` and returns the resulting absolute URL. */
+	pathnameToHref (pathname) {
+		const url = new URL(pathname, this.baseUrl);
+		return url.href;
+	}
+
+	/** Returns the path of the `index.html` file for a pathname inside the build output directory. */
+	pathnameToHtmlFilePath (pathname) {
+		return path.join(this.buildOutputDir, pathname, 'index.html');
+	}
+}
+
+// Check the build output in `./dist` for broken links.
+// Sitemap entries and link URLs are matched against `baseUrl`.
+const baseUrl = 'https://docs.astro.build';
+const buildOutputDir = './dist';
+const linkChecker = new BrokenLinkChecker({ baseUrl, buildOutputDir });
+
+linkChecker.run();