diff --git a/package.json b/package.json index 2eea212b3d2dd..77d6c5e740ece 100644 --- a/package.json +++ b/package.json @@ -12,9 +12,8 @@ "lint:a11y": "start-test 'yarn build && yarn preview' 3000 'yarn lint:a11y:local'", "lint:a11y:local": "pa11y-ci --sitemap 'http://localhost:3000/sitemap.xml' --sitemap-find 'https://docs.astro.build' --sitemap-replace 'http://localhost:3000'", "lint:a11y:remote": "pa11y-ci --sitemap 'https://docs.astro.build/sitemap.xml'", - "lint:linkcheck": "start-test 'yarn dev --silent' 3000 'yarn lint:linkcheck:local'", - "lint:linkcheck:local": "blc -roe --user-agent 'broken-link-checker/0.7.8' 'http://localhost:3000/en/getting-started'", - "lint:linkcheck:remote": "blc -ro --user-agent 'broken-link-checker/0.7.8' 'https://docs.astro.build/'" + "lint:linkcheck": "astro build && node ./scripts/lint-linkcheck.mjs", + "lint:linkcheck:nobuild": "node ./scripts/lint-linkcheck.mjs" }, "devDependencies": { "@algolia/client-search": "^4.13.0", @@ -24,6 +23,8 @@ "@babel/core": "^7.17.9", "@types/react": "^17.0.43", "astro": "^1.0.0-beta.5", + "htmlparser2": "^7.2.0", + "kleur": "^4.1.4", "node-fetch": "^3.2.3", "preact": "^10.7.1", "prettier": "^2.6.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7fa6741d84a66..990e5ea7ab3ac 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -10,7 +10,9 @@ specifiers: '@docsearch/react': ^3.0.0 '@types/react': ^17.0.43 astro: ^1.0.0-beta.5 + htmlparser2: ^7.2.0 jsdoc-api: ^7.1.1 + kleur: ^4.1.4 node-fetch: ^3.2.3 preact: ^10.7.1 prettier: ^2.6.2 @@ -34,6 +36,8 @@ devDependencies: '@babel/core': 7.17.9 '@types/react': 17.0.43 astro: 1.0.0-beta.5_sass@1.50.0 + htmlparser2: 7.2.0 + kleur: 4.1.4 node-fetch: 3.2.3 preact: 10.7.1 prettier: 2.6.2 diff --git a/scripts/lint-linkcheck.mjs b/scripts/lint-linkcheck.mjs new file mode 100644 index 0000000000000..634b68568e3dd --- /dev/null +++ b/scripts/lint-linkcheck.mjs @@ -0,0 +1,192 @@ +import path from 'path'; +import fs from 'fs'; +import kleur from 'kleur'; 
+import htmlparser2 from 'htmlparser2';
+
+/**
+ * Checks the pages of a static site build for broken internal links.
+ *
+ * Works purely on files in the build output directory: the page list is taken
+ * from `sitemap.xml`, and every listed page is read from its `index.html`.
+ * Links that do not start with the base URL are treated as external and skipped.
+ */
+class BrokenLinkChecker {
+	/**
+	 * @param {object} options
+	 * @param {string} options.baseUrl URL prefix used in the sitemap; links starting with it count as internal.
+	 * @param {string} options.buildOutputDir Directory containing `sitemap.xml` and the built HTML pages.
+	 */
+	constructor ({ baseUrl, buildOutputDir }) {
+		this.baseUrl = baseUrl;
+		this.buildOutputDir = buildOutputDir;
+	}
+
+	/**
+	 * Checks all pages referenced by the sitemap for broken links
+	 * and outputs the result to the console.
+	 *
+	 * @returns {object[]} The broken links that were found (empty if none), so callers can react to the result.
+	 * @throws {Error} If the sitemap or an HTML file referenced by it cannot be read.
+	 */
+	run () {
+		// Get the pathnames of all content pages from the sitemap contained in the build output
+		const pagePathnames = this.getPagePathnamesFromSitemap();
+
+		// Parse all pages referenced by the sitemap and build an index of their contents
+		const pages = this.parsePages(pagePathnames);
+
+		// Find all broken links
+		const brokenLinks = this.findBrokenLinks(pages);
+
+		// Output the result
+		this.outputResult(brokenLinks);
+
+		return brokenLinks;
+	}
+
+	/**
+	 * Reads the `sitemap.xml` from the build output and extracts all unique pathnames.
+	 *
+	 * @returns {string[]} Unique pathnames (each starting with `/`) of the `<loc>` entries below the base URL.
+	 */
+	getPagePathnamesFromSitemap () {
+		const sitemapFilePath = path.join(this.buildOutputDir, 'sitemap.xml');
+		const sitemap = fs.readFileSync(sitemapFilePath, 'utf8');
+
+		// Escape regex metacharacters so that e.g. the dots in the base URL only match literal dots
+		const escapedBaseUrl = this.baseUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+
+		// The capture group must be delimited by the closing `</loc>` tag:
+		// a lazy quantifier at the very end of a pattern matches as little as possible,
+		// which would reduce every pathname to a single `/`
+		const sitemapRegex = new RegExp(`<loc>\\s*${escapedBaseUrl}(/[^<]*?)\\s*</loc>`, 'ig');
+		const uniquePagePaths = [...new Set(Array.from(
+			sitemap.matchAll(sitemapRegex),
+			m => m[1]
+		))];
+
+		return uniquePagePaths;
+	}
+
+	/**
+	 * Parses multiple HTML pages based on their pathnames and builds an index of their contents.
+	 *
+	 * @param {string[]} pathnames Pathnames of the pages to parse.
+	 * @returns {Object<string, object>} The parsed pages (see `parsePage`), keyed by pathname.
+	 */
+	parsePages (pathnames) {
+		const pages = {};
+		pathnames.forEach(pathname => {
+			pages[pathname] = this.parsePage(pathname);
+		});
+
+		return pages;
+	}
+
+	/**
+	 * Parses an HTML page based on its pathname and builds an index of its contents.
+	 *
+	 * @param {string} pathname Pathname of the page, as found in the sitemap.
+	 * @returns {{ pathname: string, href: string, htmlFilePath: string, linkHrefs: string[], hashes: string[] }}
+	 *   The unique link hrefs found on the page and the hashes (`#name` / `#id`) the page provides.
+	 * @throws {Error} If the HTML file belonging to the pathname does not exist.
+	 */
+	parsePage (pathname) {
+		const href = this.pathnameToHref(pathname);
+		const htmlFilePath = this.pathnameToHtmlFilePath(pathname);
+
+		if (!fs.existsSync(htmlFilePath)) {
+			throw new Error('Failed to find HTML file referenced by sitemap: ' + htmlFilePath);
+		}
+
+		const dom = htmlparser2.parseDocument(fs.readFileSync(htmlFilePath, 'utf8'));
+		const anchors = htmlparser2.DomUtils
+			.getElementsByTagName('a', dom, true);
+
+		// Build a list of unique link hrefs on the page.
+		// Anchors without an href (e.g. pure `name`/`id` scroll targets) are not links;
+		// keeping their `undefined` would later resolve to a bogus ".../undefined" URL
+		const linkHrefs = [...new Set(anchors
+			.map(el => el.attribs.href)
+			.filter(linkHref => linkHref !== undefined)
+		)];
+
+		// Build a list of hashes provided by the page (mostly used as scroll targets)
+		const anchorNames = anchors
+			.map(el => el.attribs.name)
+			.filter(name => name !== undefined);
+		const ids = htmlparser2.DomUtils
+			.getElements({ id: id => id !== undefined }, dom, true)
+			.map(el => el.attribs.id);
+		const hashes = [...anchorNames, ...ids]
+			.map(name => `#${name}`);
+
+		return {
+			pathname,
+			href,
+			htmlFilePath,
+			linkHrefs,
+			hashes,
+		};
+	}
+
+	/**
+	 * Goes through all pre-parsed and indexed pages, checks their links,
+	 * and returns an array containing all broken links (if any).
+	 *
+	 * @param {Object<string, object>} pages Parsed pages keyed by pathname, as returned by `parsePages`.
+	 * @returns {{ page: object, href: string, isMissingPage: boolean, isMissingHash: boolean }[]}
+	 *   One entry per broken link, in page order.
+	 */
+	findBrokenLinks (pages) {
+		const brokenLinks = [];
+
+		Object.values(pages).forEach(page => {
+			// Go through all link hrefs on the page
+			page.linkHrefs.forEach(linkHref => {
+				// Resolve relative hrefs against the URL of the page containing them
+				const url = new URL(linkHref, page.href);
+
+				// Ignore external URLs
+				if (!url.href.startsWith(this.baseUrl)) {
+					return;
+				}
+
+				// Pages are indexed by pathnames with a trailing slash
+				let linkPathname = url.pathname;
+				if (!linkPathname.endsWith('/')) {
+					linkPathname += '/';
+				}
+				const linkedPage = pages[linkPathname];
+				const isMissingPage = !linkedPage;
+
+				// A hash can only be checked if the page it points to exists
+				const decodedHash = url.hash && decodeURIComponent(url.hash);
+				const isMissingHash = (
+					!isMissingPage &&
+					Boolean(decodedHash) &&
+					!linkedPage.hashes.includes(decodedHash)
+				);
+
+				if (isMissingPage || isMissingHash) {
+					brokenLinks.push({
+						page,
+						href: url.href,
+						isMissingPage,
+						isMissingHash,
+					});
+				}
+			});
+		});
+
+		return brokenLinks;
+	}
+
+	/**
+	 * Outputs the result of the broken link check to the console.
+	 *
+	 * @param {object[]} brokenLinks Broken links as returned by `findBrokenLinks`.
+	 */
+	outputResult (brokenLinks) {
+		const totalBroken = brokenLinks.length;
+
+		if (totalBroken > 0) {
+			const brokenHashCount = brokenLinks.filter(brokenLink => brokenLink.isMissingHash).length;
+			const brokenPageCount = totalBroken - brokenHashCount;
+			const prefixPage = kleur.gray(`[${kleur.red().bold('404')}]`);
+			const prefixHash = kleur.gray(`[${kleur.yellow().bold(' # ')}]`);
+
+			// Print the pathname of the containing page once, followed by all of its broken links
+			let lastPage;
+			brokenLinks.forEach(brokenLink => {
+				if (lastPage !== brokenLink.page) {
+					console.log(`\n${brokenLink.page.pathname}`);
+					lastPage = brokenLink.page;
+				}
+				console.log(` ${brokenLink.isMissingHash ? prefixHash : prefixPage} ${brokenLink.href}`);
+			});
+			console.log();
+
+			const summary = [
+				`*** Found ${totalBroken} broken ${totalBroken === 1 ? 'link' : 'links'} in total:`,
+				` ${prefixPage} ${brokenPageCount} broken page ${brokenPageCount === 1 ? 'link' : 'links'}`,
+				` ${prefixHash} ${brokenHashCount} broken fragment ${brokenHashCount === 1 ? 'link' : 'links'}`,
+			];
+			console.log(kleur.white().bold(summary.join('\n')));
+		} else {
+			console.log(kleur.green().bold('*** Found no broken links. Great job!'));
+		}
+		console.log();
+	}
+
+	/**
+	 * Converts a pathname to an absolute URL string below the base URL.
+	 *
+	 * @param {string} pathname
+	 * @returns {string}
+	 */
+	pathnameToHref (pathname) {
+		const url = new URL(pathname, this.baseUrl);
+		return url.href;
+	}
+
+	/**
+	 * Converts a pathname to the path of its `index.html` file in the build output directory.
+	 *
+	 * @param {string} pathname
+	 * @returns {string}
+	 */
+	pathnameToHtmlFilePath (pathname) {
+		return path.join(this.buildOutputDir, pathname, 'index.html');
+	}
+}
+
+// Use our class to check for broken links
+const brokenLinkChecker = new BrokenLinkChecker({
+	baseUrl: 'https://docs.astro.build',
+	buildOutputDir: './dist',
+});
+
+const brokenLinks = brokenLinkChecker.run();
+
+// Signal failure to the calling shell / CI job so that broken links do not go unnoticed
+if (brokenLinks.length > 0) {
+	process.exitCode = 1;
+}