Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for i18n docs to lint:linkcheck script #361

Merged
merged 3 commits into from
Apr 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@
"lint:a11y": "start-test 'yarn build && yarn preview' 3000 'yarn lint:a11y:local'",
"lint:a11y:local": "pa11y-ci --sitemap 'http://localhost:3000/sitemap.xml' --sitemap-find 'https://docs.astro.build' --sitemap-replace 'http://localhost:3000'",
"lint:a11y:remote": "pa11y-ci --sitemap 'https://docs.astro.build/sitemap.xml'",
"lint:linkcheck": "start-test 'yarn dev --silent' 3000 'yarn lint:linkcheck:local'",
"lint:linkcheck:local": "blc -roe --user-agent 'broken-link-checker/0.7.8' 'http://localhost:3000/en/getting-started'",
"lint:linkcheck:remote": "blc -ro --user-agent 'broken-link-checker/0.7.8' 'https://docs.astro.build/'"
"lint:linkcheck": "astro build && node ./scripts/lint-linkcheck.mjs",
"lint:linkcheck:nobuild": "node ./scripts/lint-linkcheck.mjs"
},
"devDependencies": {
"@algolia/client-search": "^4.13.0",
Expand All @@ -24,6 +23,8 @@
"@babel/core": "^7.17.9",
"@types/react": "^17.0.43",
"astro": "^1.0.0-beta.5",
"htmlparser2": "^7.2.0",
"kleur": "^4.1.4",
"node-fetch": "^3.2.3",
"preact": "^10.7.1",
"prettier": "^2.6.2",
Expand Down
4 changes: 4 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

192 changes: 192 additions & 0 deletions scripts/lint-linkcheck.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import path from 'path';
import fs from 'fs';
import kleur from 'kleur';
import htmlparser2 from 'htmlparser2';

/**
 * Checks the internal links of a statically built site for broken targets.
 *
 * The checker reads `sitemap.xml` from the build output, parses the `index.html`
 * of every page listed there, and verifies that each link pointing at `baseUrl`
 * resolves to one of those pages and, if it carries a fragment, to an `id` or
 * anchor `name` found on that page. Only files on disk are read.
 */
class BrokenLinkChecker {
	/**
	 * @param {object} options
	 * @param {string} options.baseUrl - Absolute URL prefix used by the `<loc>` entries of the sitemap.
	 * @param {string} options.buildOutputDir - Directory containing `sitemap.xml` and the built pages.
	 */
	constructor ({ baseUrl, buildOutputDir }) {
		this.baseUrl = baseUrl;
		this.buildOutputDir = buildOutputDir;
	}

	/**
	 * Checks all pages referenced by the sitemap for broken links
	 * and outputs the result to the console.
	 *
	 * @returns {object[]} The broken links that were found (empty if there are none).
	 */
	run () {
		// Get the pathnames of all content pages from the sitemap contained in the build output
		const pagePathnames = this.getPagePathnamesFromSitemap();

		// Parse all pages referenced by the sitemap and build an index of their contents
		const pages = this.parsePages(pagePathnames);

		// Find all broken links
		const brokenLinks = this.findBrokenLinks(pages);

		// Output the result
		this.outputResult(brokenLinks);

		return brokenLinks;
	}

	/**
	 * Reads the `sitemap.xml` from the build output and extracts all unique pathnames.
	 *
	 * @returns {string[]} Pathnames (starting with `/`) in order of first appearance.
	 */
	getPagePathnamesFromSitemap () {
		const sitemapFilePath = path.join(this.buildOutputDir, 'sitemap.xml');
		const sitemap = fs.readFileSync(sitemapFilePath, 'utf8');

		// Escape regex metacharacters so that e.g. the dots of the host name
		// only match literal dots instead of any character
		const escapedBaseUrl = this.baseUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
		const sitemapRegex = new RegExp(`<loc>${escapedBaseUrl}(/.*?)</loc>`, 'ig');
		const uniquePagePaths = [...new Set(Array.from(
			sitemap.matchAll(sitemapRegex),
			m => m[1]
		))];

		return uniquePagePaths;
	}

	/**
	 * Parses multiple HTML pages based on their pathnames and builds an index of their contents.
	 *
	 * @param {string[]} pathnames
	 * @returns {object} Parsed pages (see `parsePage`), keyed by pathname.
	 */
	parsePages (pathnames) {
		const pages = {};
		pathnames.forEach(pathname => {
			pages[pathname] = this.parsePage(pathname);
		});

		return pages;
	}

	/**
	 * Parses an HTML page based on its pathname and builds an index of its contents.
	 *
	 * @param {string} pathname
	 * @returns {{ pathname: string, href: string, htmlFilePath: string, linkHrefs: string[], hashes: string[] }}
	 * @throws {Error} If the HTML file belonging to the pathname does not exist.
	 */
	parsePage (pathname) {
		const href = this.pathnameToHref(pathname);
		const htmlFilePath = this.pathnameToHtmlFilePath(pathname);

		if (!fs.existsSync(htmlFilePath)) {
			throw new Error(`Failed to find HTML file referenced by sitemap: ${htmlFilePath}`);
		}

		// Read as text explicitly instead of relying on Buffer-to-string coercion in the parser
		const dom = htmlparser2.parseDocument(fs.readFileSync(htmlFilePath, 'utf8'));
		const anchors = htmlparser2.DomUtils
			.getElementsByTagName('a', dom, true);

		// Build a list of unique link hrefs on the page.
		// Anchors without an `href` attribute (e.g. named anchors) are not links and are skipped.
		const linkHrefs = [...new Set(anchors
			.map(el => el.attribs.href)
			.filter(linkHref => linkHref !== undefined)
		)];

		// Build a list of hashes provided by the page (mostly used as scroll targets)
		const anchorNames = anchors
			.map(el => el.attribs.name)
			.filter(name => name !== undefined);
		const ids = htmlparser2.DomUtils
			.getElements({ id: id => id !== undefined }, dom, true)
			.map(el => el.attribs.id);
		const hashes = [...anchorNames, ...ids]
			.map(name => `#${name}`);

		return {
			pathname,
			href,
			htmlFilePath,
			linkHrefs,
			hashes,
		};
	}

	/**
	 * Goes through all pre-parsed and indexed pages, checks their links,
	 * and returns an array containing all broken links (if any).
	 *
	 * @param {object} pages - Parsed pages keyed by pathname, as returned by `parsePages`.
	 * @returns {{ page: object, href: string, isMissingPage: boolean, isMissingHash: boolean }[]}
	 */
	findBrokenLinks (pages) {
		const brokenLinks = [];
		const baseOrigin = new URL(this.baseUrl).origin;

		Object.values(pages).forEach(page => {
			// Go through all link hrefs on the page
			page.linkHrefs.forEach(linkHref => {
				// A missing href would be stringified to "undefined" by the URL constructor
				// and then be reported as a broken page link, so skip it
				if (typeof linkHref !== 'string') {
					return;
				}

				const url = new URL(linkHref, page.href);

				// Ignore external URLs. Comparing the origin as well prevents hosts that merely
				// start with our host name (e.g. `docs.astro.build.example`) from counting as internal.
				if (url.origin !== baseOrigin || !url.href.startsWith(this.baseUrl)) {
					return;
				}

				// Sitemap pathnames end with a slash, so normalize the link before the lookup
				const linkPathname = url.pathname.endsWith('/') ? url.pathname : `${url.pathname}/`;
				const linkedPage = pages[linkPathname];
				const isMissingPage = !linkedPage;

				const decodedHash = url.hash && decodeURIComponent(url.hash);
				const isMissingHash = Boolean(
					!isMissingPage &&
					decodedHash &&
					!linkedPage.hashes.includes(decodedHash)
				);

				if (isMissingPage || isMissingHash) {
					brokenLinks.push({
						page,
						href: url.href,
						isMissingPage,
						isMissingHash,
					});
				}
			});
		});

		return brokenLinks;
	}

	/**
	 * Outputs the result of the broken link check to the console.
	 *
	 * @param {object[]} brokenLinks - Broken links as returned by `findBrokenLinks`.
	 */
	outputResult (brokenLinks) {
		const totalBroken = brokenLinks.length;

		if (totalBroken > 0) {
			const brokenHashCount = brokenLinks.filter(brokenLink => brokenLink.isMissingHash).length;
			const brokenPageCount = totalBroken - brokenHashCount;
			const prefixPage = kleur.gray(`[${kleur.red().bold('404')}]`);
			const prefixHash = kleur.gray(`[${kleur.yellow().bold(' # ')}]`);

			// Broken links arrive grouped by page, so print the page heading only when it changes
			let lastPage;
			brokenLinks.forEach(brokenLink => {
				if (lastPage !== brokenLink.page) {
					console.log(`\n${brokenLink.page.pathname}`);
					lastPage = brokenLink.page;
				}
				console.log(` ${brokenLink.isMissingHash ? prefixHash : prefixPage} ${brokenLink.href}`);
			});
			console.log();

			const summary = [
				`*** Found ${totalBroken} broken ${totalBroken === 1 ? 'link' : 'links'} in total:`,
				` ${prefixPage} ${brokenPageCount} broken page ${brokenPageCount === 1 ? 'link' : 'links'}`,
				` ${prefixHash} ${brokenHashCount} broken fragment ${brokenHashCount === 1 ? 'link' : 'links'}`,
			];
			console.log(kleur.white().bold(summary.join('\n')));
		} else {
			console.log(kleur.green().bold('*** Found no broken links. Great job!'));
		}
		console.log();
	}

	/**
	 * Resolves a pathname against `baseUrl` and returns the absolute URL as a string.
	 *
	 * @param {string} pathname
	 * @returns {string}
	 */
	pathnameToHref (pathname) {
		const url = new URL(pathname, this.baseUrl);
		return url.href;
	}

	/**
	 * Returns the path of the `index.html` file inside the build output that belongs to a pathname.
	 *
	 * @param {string} pathname
	 * @returns {string}
	 */
	pathnameToHtmlFilePath (pathname) {
		return path.join(this.buildOutputDir, pathname, 'index.html');
	}
}

// Script entry point: check the build output in `./dist` for broken links,
// treating every URL below the given base URL as internal to the site
const linkCheckOptions = {
	buildOutputDir: './dist',
	baseUrl: 'https://docs.astro.build',
};

new BrokenLinkChecker(linkCheckOptions).run();