Make link extraction from markdown more explicit (Qiskit#628)

This is prework for Qiskit#522, which adds `objects.inv`. We want to check the links contained in `objects.inv`, but we don't expect anyone to have links pointing to `objects.inv`. Our original abstractions made it hard to model `objects.inv` correctly with the link checker. Now, `markdown.ts` has been renamed to `extractLinks.ts` and deals solely with parsing files. It is no longer tightly coupled to the complex `linksToOriginFiles` variable from `FileBatch.ts`. This should make it much more obvious how to handle `objects.inv`. --------- Co-authored-by: Frank Harkins <frankharkins@hotmail.co.uk>
frankharkins · Jan 15, 2024 · a2d7b9a · a2d7b9a
1 parent 65497cc
commit a2d7b9a
Show file tree

Hide file tree

Showing 6 changed files with 120 additions and 67 deletions.
diff --git a/scripts/lib/api/specialCaseResults.test.ts b/scripts/lib/api/specialCaseResults.test.ts
@@ -14,6 +14,7 @@ import { expect, test } from "@jest/globals";
 
 import {
   specialCaseResults,
+  transformSpecialCaseUrl,
   PROVIDER_INDEX_META,
   RUNTIME_INDEX_META,
 } from "./specialCaseResults";
@@ -55,3 +56,19 @@ test("specialCaseResults()", () => {
     },
   ]);
 });
+
+test("transformSpecialCaseUrl()", () => {
+  const urls = [
+    "release_notes",
+    "release_notes#release-notes-0-2-1-bug-fixes",
+    "ibm-provider#qiskit-ibm-provider",
+  ];
+  const transformedUrls = urls.map((x) => transformSpecialCaseUrl(x));
+  expect(transformedUrls).toMatchInlineSnapshot(`
+    [
+      "release-notes",
+      "release-notes#release-notes-0-2-1-bug-fixes",
+      "index#qiskit-ibm-provider",
+    ]
+  `);
+});
diff --git a/scripts/lib/api/specialCaseResults.ts b/scripts/lib/api/specialCaseResults.ts
@@ -18,30 +18,31 @@ description: API documentation for qiskit-ibm-runtime`;
 export const PROVIDER_INDEX_META = `title: Qiskit IBM Provider API Docs
 description: API documentation for qiskit-ibm-provider`;
 
+export function transformSpecialCaseUrl(url: string): string {
+  return (
+    url
+      // We use `-` rather than `_` as our delimiter.
+      .replace(/(?<=^|\/)release_notes(?=#|$)/g, "release-notes")
+      .replace(/(?<=^|\/)terra(?=#|$)/g, "index")
+      .replace(/(?<=^|\/)ibm-provider(?=#|$)/g, "index")
+      .replace(/(?<=^|\/)ibm-runtime(?=#|$)/g, "index")
+  );
+}
+
 export function specialCaseResults(results: HtmlToMdResultWithUrl[]): void {
   for (let result of results) {
-    // We use `-` rather than `_` as our delimiter.
-    if (result.url.endsWith("/release_notes")) {
-      result.url = result.url.replace(/\/release_notes$/g, "/release-notes");
-    }
-
-    // Before Qiskit 0.44, the API index page was called terra.html.
-    if (result.url.endsWith("/terra")) {
-      result.url = result.url.replace(/\/terra$/g, "/index");
-    }
-
     if (result.url.endsWith("/ibm-provider")) {
-      result.url = result.url.replace(/\/ibm-provider$/g, "/index");
       result.meta = {
         hardcodedFrontmatter: PROVIDER_INDEX_META,
       };
     }
 
     if (result.url.endsWith("/ibm-runtime")) {
-      result.url = result.url.replace(/\/ibm-runtime$/g, "/index");
       result.meta = {
         hardcodedFrontmatter: RUNTIME_INDEX_META,
       };
     }
+
+    result.url = transformSpecialCaseUrl(result.url);
   }
 }
diff --git a/scripts/lib/links/FileBatch.test.ts b/scripts/lib/links/FileBatch.test.ts
@@ -0,0 +1,35 @@
+// This code is a Qiskit project.
+//
+// (C) Copyright IBM 2024.
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+
+import { expect, test } from "@jest/globals";
+import { addLinksToMap } from "./FileBatch";
+
+test("addLinksToMap()", () => {
+  const linksToMap = new Map();
+
+  addLinksToMap("file1.md", ["https://ibm.com", "./relative"], linksToMap);
+  expect(linksToMap).toEqual(
+    new Map([
+      ["https://ibm.com", ["file1.md"]],
+      ["./relative", ["file1.md"]],
+    ]),
+  );
+
+  addLinksToMap("file2.md", ["./relative", "/images/my_image.png"], linksToMap);
+  expect(linksToMap).toEqual(
+    new Map([
+      ["https://ibm.com", ["file1.md"]],
+      ["./relative", ["file1.md", "file2.md"]],
+      ["/images/my_image.png", ["file2.md"]],
+    ]),
+  );
+});
diff --git a/scripts/lib/links/FileBatch.ts b/scripts/lib/links/FileBatch.ts
@@ -14,7 +14,7 @@ import { globby } from "globby";
 
 import { Link, File } from "./LinkChecker";
 import FILES_TO_IGNORES from "./ignores";
-import { getMarkdownAndAnchors, addLinksToMap } from "./markdown";
+import { parseFile } from "./extractLinks";
 
 export class FileBatch {
   /**
@@ -64,15 +64,15 @@ export class FileBatch {
   async load(): Promise<[File[], Link[], Link[]]> {
     const files: File[] = [];
     for (let filePath of this.toLoad) {
-      const [_, anchors] = await getMarkdownAndAnchors(filePath);
-      files.push(new File(filePath, anchors));
+      const parsed = await parseFile(filePath);
+      files.push(new File(filePath, parsed.anchors));
     }
 
     const linksToOriginFiles = new Map<string, string[]>();
     for (const filePath of this.toCheck) {
-      const [markdown, anchors] = await getMarkdownAndAnchors(filePath);
-      files.push(new File(filePath, anchors));
-      await addLinksToMap(filePath, markdown, linksToOriginFiles);
+      const parsed = await parseFile(filePath);
+      files.push(new File(filePath, parsed.anchors));
+      addLinksToMap(filePath, parsed.links, linksToOriginFiles);
     }
 
     const internalLinks: Link[] = [];
@@ -125,3 +125,18 @@ export class FileBatch {
     return allGood;
   }
 }
+
+export function addLinksToMap(
+  filePath: string,
+  links: string[],
+  linksToOriginFiles: Map<string, string[]>,
+): void {
+  links.forEach((link) => {
+    const entry = linksToOriginFiles.get(link);
+    if (entry === undefined) {
+      linksToOriginFiles.set(link, [filePath]);
+    } else {
+      entry.push(filePath);
+    }
+  });
+}
diff --git a/scripts/lib/links/markdown.test.ts → scripts/lib/links/extractLinks.test.ts b/scripts/lib/links/markdown.test.ts → scripts/lib/links/extractLinks.test.ts
@@ -11,7 +11,7 @@
 // that they have been altered from the originals.
 
 import { expect, test } from "@jest/globals";
-import { markdownFromNotebook, parseAnchors, addLinksToMap } from "./markdown";
+import { markdownFromNotebook, parseAnchors, parseLinks } from "./extractLinks";
 
 test("markdownFromNotebook()", () => {
   const result = markdownFromNotebook(`
@@ -72,8 +72,7 @@ test("parseAnchors()", () => {
   ]);
 });
 
-test("addLinksToMap()", async () => {
-  const linksToMap = new Map();
+test("parseLinks()", async () => {
   const markdown = `
     # A header
     Our [first link!](https://ibm.com) and, look, [another](./relative)!
@@ -82,23 +81,11 @@ test("addLinksToMap()", async () => {
 
     <a href="./explicit-anchor">Explicit anchor</a>
     `;
-  await addLinksToMap("file1.md", markdown, linksToMap);
-  expect(linksToMap).toEqual(
-    new Map([
-      ["https://ibm.com", ["file1.md"]],
-      ["./explicit-anchor", ["file1.md"]],
-      ["./relative", ["file1.md"]],
-      ["/images/my_image.png", ["file1.md"]],
-    ]),
-  );
-
-  await addLinksToMap("file2.md", markdown, linksToMap);
-  expect(linksToMap).toEqual(
-    new Map([
-      ["https://ibm.com", ["file1.md", "file2.md"]],
-      ["./explicit-anchor", ["file1.md", "file2.md"]],
-      ["./relative", ["file1.md", "file2.md"]],
-      ["/images/my_image.png", ["file1.md", "file2.md"]],
-    ]),
-  );
+  const result = await parseLinks(markdown);
+  expect(result).toEqual([
+    "https://ibm.com",
+    "./relative",
+    "/images/my_image.png",
+    "./explicit-anchor",
+  ]);
 });
diff --git a/scripts/lib/links/markdown.ts → scripts/lib/links/extractLinks.ts b/scripts/lib/links/markdown.ts → scripts/lib/links/extractLinks.ts
@@ -22,6 +22,13 @@ import rehypeRemark from "rehype-remark";
 import rehypeParse from "rehype-parse";
 import remarkGfm from "remark-gfm";
 
+export type ParsedFile = {
+  /** Anchors that the file defines. These can be linked to from other files. */
+  anchors: string[];
+  /** Links that this file has to other places. These need to be validated. */
+  links: string[];
+};
+
 interface JupyterCell {
   cell_type: string;
   source: string[];
@@ -36,47 +43,38 @@ export function markdownFromNotebook(rawContent: string): string {
 }
 
 export function parseAnchors(markdown: string): string[] {
+  // Anchors generated from markdown titles.
   const mdAnchors = markdownLinkExtractor(markdown).anchors;
+  // Anchors from HTML id tags.
   const idAnchors = markdown.match(/(?<=id=")(.*)(?=")/gm) || [];
   return [...mdAnchors, ...idAnchors.map((id) => `#${id}`)];
 }
 
-export async function getMarkdownAndAnchors(
-  filePath: string,
-): Promise<[string, string[]]> {
-  const source = await readFile(filePath, { encoding: "utf8" });
-  const markdown =
-    path.extname(filePath) === ".ipynb" ? markdownFromNotebook(source) : source;
-  return [markdown, parseAnchors(markdown)];
-}
-
-export async function addLinksToMap(
-  filePath: string,
-  markdown: string,
-  linksToOriginFiles: Map<string, string[]>,
-): Promise<void> {
-  const addLink = (link: string): void => {
-    const entry = linksToOriginFiles.get(link);
-    if (entry === undefined) {
-      linksToOriginFiles.set(link, [filePath]);
-    } else {
-      entry.push(filePath);
-    }
-  };
-
-  unified()
+export async function parseLinks(markdown: string): Promise<string[]> {
+  const result: string[] = [];
+  await unified()
     .use(rehypeParse)
     .use(remarkGfm)
     .use(rehypeRemark)
     .use(() => (tree: Root) => {
       visit(tree, "text", (TreeNode) => {
         markdownLinkExtractor(String(TreeNode.value)).links.forEach((url) =>
-          addLink(url),
+          result.push(url),
         );
       });
-      visit(tree, "link", (TreeNode) => addLink(TreeNode.url));
-      visit(tree, "image", (TreeNode) => addLink(TreeNode.url));
+      visit(tree, "link", (TreeNode) => result.push(TreeNode.url));
+      visit(tree, "image", (TreeNode) => result.push(TreeNode.url));
     })
     .use(remarkStringify)
     .process(markdown);
+
+  return result;
+}
+
+export async function parseFile(filePath: string): Promise<ParsedFile> {
+  const source = await readFile(filePath, { encoding: "utf8" });
+  const markdown =
+    path.extname(filePath) === ".ipynb" ? markdownFromNotebook(source) : source;
+  const links = await parseLinks(markdown);
+  return { anchors: parseAnchors(markdown), links };
 }