perf(core): optimize broken links checker (#9778)

facebook · Jan 26, 2024 · 43883f8 · 43883f8
1 parent 47985e7
commit 43883f8
Show file tree

Hide file tree

Showing 2 changed files with 174 additions and 66 deletions.
diff --git a/packages/docusaurus/src/server/__tests__/brokenLinks.test.ts b/packages/docusaurus/src/server/__tests__/brokenLinks.test.ts
@@ -6,6 +6,7 @@
  */
 
 import {jest} from '@jest/globals';
+import reactRouterConfig from 'react-router-config';
 import {handleBrokenLinks} from '../brokenLinks';
 import type {RouteConfig} from '@docusaurus/types';
 
@@ -718,4 +719,60 @@ describe('handleBrokenLinks', () => {
       "
     `);
   });
+
+  it('is performant and minimize calls to matchRoutes', async () => {
+    const matchRoutesMock = jest.spyOn(reactRouterConfig, 'matchRoutes');
+
+    const scale = 100;
+
+    const routes: SimpleRoute[] = [
+      ...Array.from<SimpleRoute>({length: scale}).map((_, i) => ({
+        path: `/page${i}`,
+      })),
+      ...Array.from<SimpleRoute>({length: scale}).fill({
+        path: '/pageDynamic/:subpath1',
+      }),
+    ];
+
+    const collectedLinks: Params['collectedLinks'] = Object.fromEntries(
+      Array.from<SimpleRoute>({length: scale}).map((_, i) => [
+        `/page${i}`,
+        {
+          links: [
+            ...Array.from<SimpleRoute>({length: scale}).flatMap((_2, j) => [
+              `/page${j}`,
+              `/page${j}?age=42`,
+              `/page${j}#anchor${j}`,
+              `/page${j}?age=42#anchor${j}`,
+              `/pageDynamic/subPath${j}`,
+              `/pageDynamic/subPath${j}?age=42`,
+              // `/pageDynamic/subPath${j}#anchor${j}`,
+              // `/pageDynamic/subPath${j}?age=42#anchor${j}`,
+            ]),
+          ],
+          anchors: Array.from<SimpleRoute>({length: scale}).map(
+            (_2, j) => `anchor${j}`,
+          ),
+        },
+      ]),
+    );
+
+    // console.time('testBrokenLinks');
+    await testBrokenLinks({
+      routes,
+      collectedLinks,
+    });
+    // console.timeEnd('testBrokenLinks');
+
+    // Idiomatic code calling matchRoutes multiple times is not performant
+    // We try to minimize the calls to this expensive function
+    // Otherwise large sites will have super long execution times
+    // See https://github.com/facebook/docusaurus/issues/9754
+    // See https://twitter.com/sebastienlorber/status/1749392773415858587
+    // We expect no more matchRoutes calls than number of dynamic route links
+    expect(matchRoutesMock).toHaveBeenCalledTimes(scale);
+    // We expect matchRoutes to be called with a reduced number of routes
+    expect(routes).toHaveLength(scale * 2);
+    expect(matchRoutesMock.mock.calls[0]![0]).toHaveLength(scale);
+  });
 });
diff --git a/packages/docusaurus/src/server/brokenLinks.ts b/packages/docusaurus/src/server/brokenLinks.ts
@@ -7,11 +7,18 @@
 
 import _ from 'lodash';
 import logger from '@docusaurus/logger';
-import {matchRoutes} from 'react-router-config';
+import {matchRoutes as reactRouterMatchRoutes} from 'react-router-config';
 import {parseURLPath, serializeURLPath, type URLPath} from '@docusaurus/utils';
 import {getAllFinalRoutes} from './utils';
 import type {RouteConfig, ReportingSeverity} from '@docusaurus/types';
 
+function matchRoutes(routeConfig: RouteConfig[], pathname: string) {
+  // @ts-expect-error: React router types RouteConfig with an actual React
+  // component, but we load route components with string paths.
+  // We don't actually access component here, so it's fine.
+  return reactRouterMatchRoutes(routeConfig, pathname);
+}
+
 type BrokenLink = {
   link: string;
   resolvedLink: string;
@@ -26,88 +33,121 @@ type CollectedLinks = {
   [pathname: string]: {links: string[]; anchors: string[]};
 };
 
-function getBrokenLinksForPage({
+// We use efficient data structures for performance reasons
+// See https://github.com/facebook/docusaurus/issues/9754
+type CollectedLinksNormalized = Map<
+  string,
+  {links: Set<string>; anchors: Set<string>}
+>;
+
+type BrokenLinksHelper = {
+  collectedLinks: CollectedLinksNormalized;
+  isPathBrokenLink: (linkPath: URLPath) => boolean;
+  isAnchorBrokenLink: (linkPath: URLPath) => boolean;
+};
+
+function createBrokenLinksHelper({
   collectedLinks,
-  pagePath,
-  pageLinks,
   routes,
 }: {
-  collectedLinks: CollectedLinks;
-  pagePath: string;
-  pageLinks: string[];
-  pageAnchors: string[];
+  collectedLinks: CollectedLinksNormalized;
   routes: RouteConfig[];
-}): BrokenLink[] {
-  const allCollectedPaths = new Set(Object.keys(collectedLinks));
+}): BrokenLinksHelper {
+  const validPathnames = new Set(collectedLinks.keys());
+
+  // Matching against the route array can be expensive
+  // If the route is already in the valid pathnames,
+  // we can avoid matching against it as an optimization
+  const remainingRoutes = routes.filter(
+    (route) => !validPathnames.has(route.path),
+  );
+
+  function isPathnameMatchingAnyRoute(pathname: string): boolean {
+    if (matchRoutes(remainingRoutes, pathname).length > 0) {
+      // IMPORTANT: this is an optimization here
+      // See https://github.com/facebook/docusaurus/issues/9754
+      // Large Docusaurus sites have many routes!
+      // We try to minimize calls to a possibly expensive matchRoutes function
+      validPathnames.add(pathname);
+      return true;
+    }
+
+    return false;
+  }
 
   function isPathBrokenLink(linkPath: URLPath) {
     const pathnames = [linkPath.pathname, decodeURI(linkPath.pathname)];
-    const matchedRoutes = pathnames
-      // @ts-expect-error: React router types RouteConfig with an actual React
-      // component, but we load route components with string paths.
-      // We don't actually access component here, so it's fine.
-      .map((l) => matchRoutes(routes, l))
-      .flat();
-    // The link path is broken if:
-    // - it doesn't match any route
-    // - it doesn't match any collected path
-    return (
-      matchedRoutes.length === 0 &&
-      !pathnames.some((p) => allCollectedPaths.has(p))
-    );
+    if (pathnames.some((p) => validPathnames.has(p))) {
+      return false;
+    }
+    if (pathnames.some(isPathnameMatchingAnyRoute)) {
+      return false;
+    }
+    return true;
   }
 
   function isAnchorBrokenLink(linkPath: URLPath) {
     const {pathname, hash} = linkPath;
-
     // Link has no hash: it can't be a broken anchor link
     if (hash === undefined) {
       return false;
     }
-
     // Link has empty hash ("#", "/page#"...): we do not report it as broken
     // Empty hashes are used for various weird reasons, by us and other users...
     // See for example: https://github.com/facebook/docusaurus/pull/6003
     if (hash === '') {
       return false;
     }
-
     const targetPage =
-      collectedLinks[pathname] || collectedLinks[decodeURI(pathname)];
-
+      collectedLinks.get(pathname) || collectedLinks.get(decodeURI(pathname));
     // link with anchor to a page that does not exist (or did not collect any
     // link/anchor) is considered as a broken anchor
     if (!targetPage) {
       return true;
     }
-
-    // it's a broken anchor if the target page exists
-    // but the anchor does not exist on that page
-    const hashes = [hash, decodeURIComponent(hash)];
-    return !targetPage.anchors.some((anchor) => hashes.includes(anchor));
+    // it's a not broken anchor if the anchor exists on the target page
+    if (
+      targetPage.anchors.has(hash) ||
+      targetPage.anchors.has(decodeURIComponent(hash))
+    ) {
+      return false;
+    }
+    return true;
   }
 
-  const brokenLinks = pageLinks.flatMap((link) => {
+  return {
+    collectedLinks,
+    isPathBrokenLink,
+    isAnchorBrokenLink,
+  };
+}
+
+function getBrokenLinksForPage({
+  pagePath,
+  helper,
+}: {
+  pagePath: string;
+  helper: BrokenLinksHelper;
+}): BrokenLink[] {
+  const pageData = helper.collectedLinks.get(pagePath)!;
+
+  const brokenLinks: BrokenLink[] = [];
+
+  pageData.links.forEach((link) => {
     const linkPath = parseURLPath(link, pagePath);
-    if (isPathBrokenLink(linkPath)) {
-      return [
-        {
-          link,
-          resolvedLink: serializeURLPath(linkPath),
-          anchor: false,
-        },
-      ];
-    }
-    if (isAnchorBrokenLink(linkPath)) {
-      return [
-        {
-          link,
-          resolvedLink: serializeURLPath(linkPath),
-          anchor: true,
-        },
-      ];
+    if (helper.isPathBrokenLink(linkPath)) {
+      brokenLinks.push({
+        link,
+        resolvedLink: serializeURLPath(linkPath),
+        anchor: false,
+      });
+    } else if (helper.isAnchorBrokenLink(linkPath)) {
+      brokenLinks.push({
+        link,
+        resolvedLink: serializeURLPath(linkPath),
+        anchor: true,
+      });
     }
-    return [];
   });
 
   return brokenLinks;
@@ -128,26 +168,30 @@ function getBrokenLinks({
   collectedLinks,
   routes,
 }: {
-  collectedLinks: CollectedLinks;
+  collectedLinks: CollectedLinksNormalized;
   routes: RouteConfig[];
 }): BrokenLinksMap {
   const filteredRoutes = filterIntermediateRoutes(routes);
 
-  return _.mapValues(collectedLinks, (pageCollectedData, pagePath) => {
+  const helper = createBrokenLinksHelper({
+    collectedLinks,
+    routes: filteredRoutes,
+  });
+
+  const result: BrokenLinksMap = {};
+  collectedLinks.forEach((_unused, pagePath) => {
     try {
-      return getBrokenLinksForPage({
-        collectedLinks,
-        pageLinks: pageCollectedData.links,
-        pageAnchors: pageCollectedData.anchors,
+      result[pagePath] = getBrokenLinksForPage({
         pagePath,
-        routes: filteredRoutes,
+        helper,
       });
     } catch (e) {
       throw new Error(`Unable to get broken links for page ${pagePath}.`, {
         cause: e,
       });
     }
   });
+  return result;
 }
 
 function brokenLinkMessage(brokenLink: BrokenLink): string {
@@ -303,15 +347,22 @@ function reportBrokenLinks({
 // JS users might call "collectLink(undefined)" for example
 // TS users might call "collectAnchor('#hash')" with/without #
 // We clean/normalize the collected data to avoid obscure errors being thrown
+// We also use optimized data structures for a faster algorithm
 function normalizeCollectedLinks(
   collectedLinks: CollectedLinks,
-): CollectedLinks {
-  return _.mapValues(collectedLinks, (pageCollectedData) => ({
-    links: pageCollectedData.links.filter(_.isString),
-    anchors: pageCollectedData.anchors
-      .filter(_.isString)
-      .map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
-  }));
+): CollectedLinksNormalized {
+  const result: CollectedLinksNormalized = new Map();
+  Object.entries(collectedLinks).forEach(([pathname, pageCollectedData]) => {
+    result.set(pathname, {
+      links: new Set(pageCollectedData.links.filter(_.isString)),
+      anchors: new Set(
+        pageCollectedData.anchors
+          .filter(_.isString)
+          .map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
+      ),
+    });
+  });
+  return result;
 }
 
 export async function handleBrokenLinks({