Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(core): optimize broken links checker #9778

Merged
merged 4 commits into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions packages/docusaurus/src/server/__tests__/brokenLinks.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/

import {jest} from '@jest/globals';
import reactRouterConfig from 'react-router-config';
import {handleBrokenLinks} from '../brokenLinks';
import type {RouteConfig} from '@docusaurus/types';

Expand Down Expand Up @@ -718,4 +719,60 @@ describe('handleBrokenLinks', () => {
"
`);
});

it('is performant and minimize calls to matchRoutes', async () => {
const matchRoutesMock = jest.spyOn(reactRouterConfig, 'matchRoutes');

const scale = 100;

const routes: SimpleRoute[] = [
...Array.from<SimpleRoute>({length: scale}).map((_, i) => ({
path: `/page${i}`,
})),
...Array.from<SimpleRoute>({length: scale}).fill({
path: '/pageDynamic/:subpath1',
}),
];

const collectedLinks: Params['collectedLinks'] = Object.fromEntries(
Array.from<SimpleRoute>({length: scale}).map((_, i) => [
`/page${i}`,
{
links: [
...Array.from<SimpleRoute>({length: scale}).flatMap((_2, j) => [
`/page${j}`,
`/page${j}?age=42`,
`/page${j}#anchor${j}`,
`/page${j}?age=42#anchor${j}`,
`/pageDynamic/subPath${j}`,
`/pageDynamic/subPath${j}?age=42`,
// `/pageDynamic/subPath${j}#anchor${j}`,
// `/pageDynamic/subPath${j}?age=42#anchor${j}`,
]),
],
anchors: Array.from<SimpleRoute>({length: scale}).map(
(_2, j) => `anchor${j}`,
),
},
]),
);

// console.time('testBrokenLinks');
await testBrokenLinks({
routes,
collectedLinks,
});
// console.timeEnd('testBrokenLinks');

// Idiomatic code calling matchRoutes multiple times is not performant
// We try to minimize the calls to this expensive function
// Otherwise large sites will have super long execution times
// See https://github.com/facebook/docusaurus/issues/9754
// See https://twitter.com/sebastienlorber/status/1749392773415858587
// We expect no more matchRoutes calls than number of dynamic route links
expect(matchRoutesMock).toHaveBeenCalledTimes(scale);
// We expect matchRoutes to be called with a reduced number of routes
expect(routes).toHaveLength(scale * 2);
expect(matchRoutesMock.mock.calls[0]![0]).toHaveLength(scale);
});
});
183 changes: 117 additions & 66 deletions packages/docusaurus/src/server/brokenLinks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,18 @@

import _ from 'lodash';
import logger from '@docusaurus/logger';
import {matchRoutes} from 'react-router-config';
import {matchRoutes as reactRouterMatchRoutes} from 'react-router-config';
import {parseURLPath, serializeURLPath, type URLPath} from '@docusaurus/utils';
import {getAllFinalRoutes} from './utils';
import type {RouteConfig, ReportingSeverity} from '@docusaurus/types';

function matchRoutes(routeConfig: RouteConfig[], pathname: string) {
// @ts-expect-error: React router types RouteConfig with an actual React
// component, but we load route components with string paths.
// We don't actually access component here, so it's fine.
return reactRouterMatchRoutes(routeConfig, pathname);
}

type BrokenLink = {
link: string;
resolvedLink: string;
Expand All @@ -26,88 +33,121 @@ type CollectedLinks = {
[pathname: string]: {links: string[]; anchors: string[]};
};

function getBrokenLinksForPage({
// We use efficient data structures for performance reasons
// See https://github.com/facebook/docusaurus/issues/9754
type CollectedLinksNormalized = Map<
string,
{links: Set<string>; anchors: Set<string>}
>;

type BrokenLinksHelper = {
collectedLinks: CollectedLinksNormalized;
isPathBrokenLink: (linkPath: URLPath) => boolean;
isAnchorBrokenLink: (linkPath: URLPath) => boolean;
};

function createBrokenLinksHelper({
collectedLinks,
pagePath,
pageLinks,
routes,
}: {
collectedLinks: CollectedLinks;
pagePath: string;
pageLinks: string[];
pageAnchors: string[];
collectedLinks: CollectedLinksNormalized;
routes: RouteConfig[];
}): BrokenLink[] {
const allCollectedPaths = new Set(Object.keys(collectedLinks));
}): BrokenLinksHelper {
const validPathnames = new Set(collectedLinks.keys());

// Matching against the route array can be expensive
// If the route is already in the valid pathnames,
// we can avoid matching against it as an optimization
const remainingRoutes = routes.filter(
(route) => !validPathnames.has(route.path),
);

function isPathnameMatchingAnyRoute(pathname: string): boolean {
if (matchRoutes(remainingRoutes, pathname).length > 0) {
// IMPORTANT: this is an optimization here
// See https://github.com/facebook/docusaurus/issues/9754
// Large Docusaurus sites have many routes!
// We try to minimize calls to a possibly expensive matchRoutes function
validPathnames.add(pathname);
return true;
}

return false;
}

function isPathBrokenLink(linkPath: URLPath) {
const pathnames = [linkPath.pathname, decodeURI(linkPath.pathname)];
const matchedRoutes = pathnames
// @ts-expect-error: React router types RouteConfig with an actual React
// component, but we load route components with string paths.
// We don't actually access component here, so it's fine.
.map((l) => matchRoutes(routes, l))
.flat();
// The link path is broken if:
// - it doesn't match any route
// - it doesn't match any collected path
return (
matchedRoutes.length === 0 &&
!pathnames.some((p) => allCollectedPaths.has(p))
);
if (pathnames.some((p) => validPathnames.has(p))) {
return false;
}
if (pathnames.some(isPathnameMatchingAnyRoute)) {
return false;
}
return true;
}

function isAnchorBrokenLink(linkPath: URLPath) {
const {pathname, hash} = linkPath;

// Link has no hash: it can't be a broken anchor link
if (hash === undefined) {
return false;
}

// Link has empty hash ("#", "/page#"...): we do not report it as broken
// Empty hashes are used for various weird reasons, by us and other users...
// See for example: https://github.com/facebook/docusaurus/pull/6003
if (hash === '') {
return false;
}

const targetPage =
collectedLinks[pathname] || collectedLinks[decodeURI(pathname)];

collectedLinks.get(pathname) || collectedLinks.get(decodeURI(pathname));
// link with anchor to a page that does not exist (or did not collect any
// link/anchor) is considered as a broken anchor
if (!targetPage) {
return true;
}

// it's a broken anchor if the target page exists
// but the anchor does not exist on that page
const hashes = [hash, decodeURIComponent(hash)];
return !targetPage.anchors.some((anchor) => hashes.includes(anchor));
// it's a not broken anchor if the anchor exists on the target page
if (
targetPage.anchors.has(hash) ||
targetPage.anchors.has(decodeURIComponent(hash))
) {
return false;
}
return true;
}

const brokenLinks = pageLinks.flatMap((link) => {
return {
collectedLinks,
isPathBrokenLink,
isAnchorBrokenLink,
};
}

function getBrokenLinksForPage({
pagePath,
helper,
}: {
pagePath: string;
helper: BrokenLinksHelper;
}): BrokenLink[] {
const pageData = helper.collectedLinks.get(pagePath)!;

const brokenLinks: BrokenLink[] = [];

pageData.links.forEach((link) => {
const linkPath = parseURLPath(link, pagePath);
if (isPathBrokenLink(linkPath)) {
return [
{
link,
resolvedLink: serializeURLPath(linkPath),
anchor: false,
},
];
}
if (isAnchorBrokenLink(linkPath)) {
return [
{
link,
resolvedLink: serializeURLPath(linkPath),
anchor: true,
},
];
if (helper.isPathBrokenLink(linkPath)) {
brokenLinks.push({
link,
resolvedLink: serializeURLPath(linkPath),
anchor: false,
});
} else if (helper.isAnchorBrokenLink(linkPath)) {
brokenLinks.push({
link,
resolvedLink: serializeURLPath(linkPath),
anchor: true,
});
}
return [];
});

return brokenLinks;
Expand All @@ -128,26 +168,30 @@ function getBrokenLinks({
collectedLinks,
routes,
}: {
collectedLinks: CollectedLinks;
collectedLinks: CollectedLinksNormalized;
routes: RouteConfig[];
}): BrokenLinksMap {
const filteredRoutes = filterIntermediateRoutes(routes);

return _.mapValues(collectedLinks, (pageCollectedData, pagePath) => {
const helper = createBrokenLinksHelper({
collectedLinks,
routes: filteredRoutes,
});

const result: BrokenLinksMap = {};
collectedLinks.forEach((_unused, pagePath) => {
try {
return getBrokenLinksForPage({
collectedLinks,
pageLinks: pageCollectedData.links,
pageAnchors: pageCollectedData.anchors,
result[pagePath] = getBrokenLinksForPage({
pagePath,
routes: filteredRoutes,
helper,
});
} catch (e) {
throw new Error(`Unable to get broken links for page ${pagePath}.`, {
cause: e,
});
}
});
return result;
}

function brokenLinkMessage(brokenLink: BrokenLink): string {
Expand Down Expand Up @@ -303,15 +347,22 @@ function reportBrokenLinks({
// JS users might call "collectLink(undefined)" for example
// TS users might call "collectAnchor('#hash')" with/without #
// We clean/normalize the collected data to avoid obscure errors being thrown
// We also use optimized data structures for a faster algorithm
function normalizeCollectedLinks(
collectedLinks: CollectedLinks,
): CollectedLinks {
return _.mapValues(collectedLinks, (pageCollectedData) => ({
links: pageCollectedData.links.filter(_.isString),
anchors: pageCollectedData.anchors
.filter(_.isString)
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
}));
): CollectedLinksNormalized {
const result: CollectedLinksNormalized = new Map();
Object.entries(collectedLinks).forEach(([pathname, pageCollectedData]) => {
result.set(pathname, {
links: new Set(pageCollectedData.links.filter(_.isString)),
anchors: new Set(
pageCollectedData.anchors
.filter(_.isString)
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
),
});
});
return result;
}

export async function handleBrokenLinks({
Expand Down