Skip to content

Commit

Permalink
Make link extraction from markdown more explicit (Qiskit#628)
Browse files Browse the repository at this point in the history
This is prework for Qiskit#522,
which adds `objects.inv`. We want to check the links contained in
`objects.inv`, but we don't expect anyone to have links pointing to
`objects.inv`.

Our original abstractions made it hard to model `objects.inv` correctly
with the link checker. Now, `markdown.ts` is renamed to
`extractLinks.ts` and it solely deals with parsing files. It no longer
has a bad coupling to the complex `linksToOriginFiles` variable from
`FileBatch.ts`.

This should make it much more obvious how to handle `objects.inv`.

---------

Co-authored-by: Frank Harkins <frankharkins@hotmail.co.uk>
  • Loading branch information
Eric-Arellano and frankharkins authored Jan 15, 2024
1 parent 65497cc commit a2d7b9a
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 67 deletions.
17 changes: 17 additions & 0 deletions scripts/lib/api/specialCaseResults.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { expect, test } from "@jest/globals";

import {
specialCaseResults,
transformSpecialCaseUrl,
PROVIDER_INDEX_META,
RUNTIME_INDEX_META,
} from "./specialCaseResults";
Expand Down Expand Up @@ -55,3 +56,19 @@ test("specialCaseResults()", () => {
},
]);
});

// Special-case page names must be rewritten both when they end the URL and
// when an anchor follows them.
test("transformSpecialCaseUrl()", () => {
  const transformedUrls = [
    "release_notes",
    "release_notes#release-notes-0-2-1-bug-fixes",
    "ibm-provider#qiskit-ibm-provider",
  ].map((originalUrl) => transformSpecialCaseUrl(originalUrl));

  expect(transformedUrls).toMatchInlineSnapshot(`
[
"release-notes",
"release-notes#release-notes-0-2-1-bug-fixes",
"index#qiskit-ibm-provider",
]
`);
});
25 changes: 13 additions & 12 deletions scripts/lib/api/specialCaseResults.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,31 @@ description: API documentation for qiskit-ibm-runtime`;
export const PROVIDER_INDEX_META = `title: Qiskit IBM Provider API Docs
description: API documentation for qiskit-ibm-provider`;

/**
 * Rewrites the special-case page names in a URL to the names we publish.
 *
 * A page name is only rewritten when it is a whole path segment: it must sit
 * at the start of the URL or right after a `/`, and be followed by either a
 * `#` anchor or the end of the URL.
 *
 * @param url - The URL to transform; it may include a `#anchor`.
 * @returns The URL with every special-case page name replaced.
 */
export function transformSpecialCaseUrl(url: string): string {
  const rewrites: Array<[RegExp, string]> = [
    // We use `-` rather than `_` as our delimiter.
    [/(?<=^|\/)release_notes(?=#|$)/g, "release-notes"],
    // Before Qiskit 0.44, the API index page was called terra.html.
    [/(?<=^|\/)terra(?=#|$)/g, "index"],
    [/(?<=^|\/)ibm-provider(?=#|$)/g, "index"],
    [/(?<=^|\/)ibm-runtime(?=#|$)/g, "index"],
  ];

  let transformed = url;
  for (const [pattern, replacement] of rewrites) {
    transformed = transformed.replace(pattern, replacement);
  }
  return transformed;
}

/**
 * Applies the special-case fixes to every result, mutating each one in place:
 * certain URL endings are renamed, and the provider/runtime pages get their
 * `meta` replaced with hardcoded frontmatter.
 */
export function specialCaseResults(results: HtmlToMdResultWithUrl[]): void {
for (let result of results) {
// We use `-` rather than `_` as our delimiter.
if (result.url.endsWith("/release_notes")) {
result.url = result.url.replace(/\/release_notes$/g, "/release-notes");
}

// Before Qiskit 0.44, the API index page was called terra.html.
if (result.url.endsWith("/terra")) {
result.url = result.url.replace(/\/terra$/g, "/index");
}

// The provider page becomes the index page and gets fixed frontmatter.
if (result.url.endsWith("/ibm-provider")) {
result.url = result.url.replace(/\/ibm-provider$/g, "/index");
result.meta = {
hardcodedFrontmatter: PROVIDER_INDEX_META,
};
}

// Same treatment for the runtime page.
if (result.url.endsWith("/ibm-runtime")) {
result.url = result.url.replace(/\/ibm-runtime$/g, "/index");
result.meta = {
hardcodedFrontmatter: RUNTIME_INDEX_META,
};
}

// transformSpecialCaseUrl performs the same renames as the checks above.
// NOTE(review): the explicit `result.url` rewrites above look redundant with
// this call — confirm before removing either.
result.url = transformSpecialCaseUrl(result.url);
}
}
35 changes: 35 additions & 0 deletions scripts/lib/links/FileBatch.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// This code is a Qiskit project.
//
// (C) Copyright IBM 2024.
//
// This code is licensed under the Apache License, Version 2.0. You may
// obtain a copy of this license in the LICENSE file in the root directory
// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
//
// Any modifications or derivative works of this code must retain this
// copyright notice, and modified files need to carry a notice indicating
// that they have been altered from the originals.

import { expect, test } from "@jest/globals";
import { addLinksToMap } from "./FileBatch";

// The map is keyed by link; each value lists the files the link was added
// from, in the order they were added.
test("addLinksToMap()", () => {
  const originsByLink = new Map();

  // Every link from the first file is new, so each gets its own entry.
  addLinksToMap("file1.md", ["https://ibm.com", "./relative"], originsByLink);
  const expectedAfterFirstFile = new Map([
    ["https://ibm.com", ["file1.md"]],
    ["./relative", ["file1.md"]],
  ]);
  expect(originsByLink).toEqual(expectedAfterFirstFile);

  // The second file shares one link with the first and introduces a new one.
  addLinksToMap(
    "file2.md",
    ["./relative", "/images/my_image.png"],
    originsByLink,
  );
  const expectedAfterSecondFile = new Map([
    ["https://ibm.com", ["file1.md"]],
    ["./relative", ["file1.md", "file2.md"]],
    ["/images/my_image.png", ["file2.md"]],
  ]);
  expect(originsByLink).toEqual(expectedAfterSecondFile);
});
27 changes: 21 additions & 6 deletions scripts/lib/links/FileBatch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import { globby } from "globby";

import { Link, File } from "./LinkChecker";
import FILES_TO_IGNORES from "./ignores";
import { getMarkdownAndAnchors, addLinksToMap } from "./markdown";
import { parseFile } from "./extractLinks";

export class FileBatch {
/**
Expand Down Expand Up @@ -64,15 +64,15 @@ export class FileBatch {
async load(): Promise<[File[], Link[], Link[]]> {
const files: File[] = [];
for (let filePath of this.toLoad) {
const [_, anchors] = await getMarkdownAndAnchors(filePath);
files.push(new File(filePath, anchors));
const parsed = await parseFile(filePath);
files.push(new File(filePath, parsed.anchors));
}

const linksToOriginFiles = new Map<string, string[]>();
for (const filePath of this.toCheck) {
const [markdown, anchors] = await getMarkdownAndAnchors(filePath);
files.push(new File(filePath, anchors));
await addLinksToMap(filePath, markdown, linksToOriginFiles);
const parsed = await parseFile(filePath);
files.push(new File(filePath, parsed.anchors));
addLinksToMap(filePath, parsed.links, linksToOriginFiles);
}

const internalLinks: Link[] = [];
Expand Down Expand Up @@ -125,3 +125,18 @@ export class FileBatch {
return allGood;
}
}

/**
 * Records that `filePath` contains each of the given links.
 *
 * @param filePath - The file the links were found in.
 * @param links - The links found in that file.
 * @param linksToOriginFiles - Map from link to the files containing it;
 *   mutated in place. A link seen for the first time gets a new entry,
 *   otherwise `filePath` is appended to the existing list.
 */
export function addLinksToMap(
  filePath: string,
  links: string[],
  linksToOriginFiles: Map<string, string[]>,
): void {
  for (const link of links) {
    const originFiles = linksToOriginFiles.get(link);
    if (originFiles !== undefined) {
      originFiles.push(filePath);
      continue;
    }
    linksToOriginFiles.set(link, [filePath]);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
// that they have been altered from the originals.

import { expect, test } from "@jest/globals";
import { markdownFromNotebook, parseAnchors, addLinksToMap } from "./markdown";
import { markdownFromNotebook, parseAnchors, parseLinks } from "./extractLinks";

test("markdownFromNotebook()", () => {
const result = markdownFromNotebook(`
Expand Down Expand Up @@ -72,8 +72,7 @@ test("parseAnchors()", () => {
]);
});

test("addLinksToMap()", async () => {
const linksToMap = new Map();
test("parseLinks()", async () => {
const markdown = `
# A header
Our [first link!](https://ibm.com) and, look, [another](./relative)!
Expand All @@ -82,23 +81,11 @@ test("addLinksToMap()", async () => {
<a href="./explicit-anchor">Explicit anchor</a>
`;
await addLinksToMap("file1.md", markdown, linksToMap);
expect(linksToMap).toEqual(
new Map([
["https://ibm.com", ["file1.md"]],
["./explicit-anchor", ["file1.md"]],
["./relative", ["file1.md"]],
["/images/my_image.png", ["file1.md"]],
]),
);

await addLinksToMap("file2.md", markdown, linksToMap);
expect(linksToMap).toEqual(
new Map([
["https://ibm.com", ["file1.md", "file2.md"]],
["./explicit-anchor", ["file1.md", "file2.md"]],
["./relative", ["file1.md", "file2.md"]],
["/images/my_image.png", ["file1.md", "file2.md"]],
]),
);
const result = await parseLinks(markdown);
expect(result).toEqual([
"https://ibm.com",
"./relative",
"/images/my_image.png",
"./explicit-anchor",
]);
});
52 changes: 25 additions & 27 deletions scripts/lib/links/markdown.ts → scripts/lib/links/extractLinks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ import rehypeRemark from "rehype-remark";
import rehypeParse from "rehype-parse";
import remarkGfm from "remark-gfm";

/** What was extracted from a single parsed file. */
export type ParsedFile = {
  /** Anchors defined by this file, i.e. the targets other files may link to. */
  anchors: string[];
  /** Outgoing links found in this file; these are the ones to validate. */
  links: string[];
};

interface JupyterCell {
cell_type: string;
source: string[];
Expand All @@ -36,47 +43,38 @@ export function markdownFromNotebook(rawContent: string): string {
}

/**
 * Collects the anchors that a markdown document defines.
 *
 * The result is the anchors reported by `markdownLinkExtractor`, followed by
 * the value of every HTML `id="..."` attribute prefixed with `#`.
 *
 * @param markdown - The markdown source to scan.
 * @returns The anchors found, extractor anchors first.
 */
export function parseAnchors(markdown: string): string[] {
  // Anchors generated from markdown titles.
  const mdAnchors = markdownLinkExtractor(markdown).anchors;
  // Anchors from HTML id tags. The quantifier is lazy so the match stops at
  // the id's own closing quote; a greedy `.*` would run to the last `"` on the
  // line and swallow any attributes or further tags that follow the id.
  const idAnchors = markdown.match(/(?<=id=")(.*?)(?=")/gm) || [];
  return [...mdAnchors, ...idAnchors.map((id) => `#${id}`)];
}

/**
 * Reads a file as UTF-8 and returns its markdown together with the anchors
 * parsed from that markdown.
 *
 * Files with the `.ipynb` extension are first converted with
 * `markdownFromNotebook`; any other file's content is used as-is.
 *
 * @param filePath - Path of the file to read.
 * @returns A `[markdown, anchors]` tuple.
 */
export async function getMarkdownAndAnchors(
  filePath: string,
): Promise<[string, string[]]> {
  const rawContent = await readFile(filePath, { encoding: "utf8" });

  let markdown = rawContent;
  if (path.extname(filePath) === ".ipynb") {
    markdown = markdownFromNotebook(rawContent);
  }

  const anchors = parseAnchors(markdown);
  return [markdown, anchors];
}

export async function addLinksToMap(
filePath: string,
markdown: string,
linksToOriginFiles: Map<string, string[]>,
): Promise<void> {
const addLink = (link: string): void => {
const entry = linksToOriginFiles.get(link);
if (entry === undefined) {
linksToOriginFiles.set(link, [filePath]);
} else {
entry.push(filePath);
}
};

unified()
export async function parseLinks(markdown: string): Promise<string[]> {
const result: string[] = [];
await unified()
.use(rehypeParse)
.use(remarkGfm)
.use(rehypeRemark)
.use(() => (tree: Root) => {
visit(tree, "text", (TreeNode) => {
markdownLinkExtractor(String(TreeNode.value)).links.forEach((url) =>
addLink(url),
result.push(url),
);
});
visit(tree, "link", (TreeNode) => addLink(TreeNode.url));
visit(tree, "image", (TreeNode) => addLink(TreeNode.url));
visit(tree, "link", (TreeNode) => result.push(TreeNode.url));
visit(tree, "image", (TreeNode) => result.push(TreeNode.url));
})
.use(remarkStringify)
.process(markdown);

return result;
}

/**
 * Reads a file as UTF-8 and extracts both the anchors it defines and the
 * links it contains.
 *
 * Files with the `.ipynb` extension are first converted with
 * `markdownFromNotebook`; any other file's content is used as-is.
 *
 * @param filePath - Path of the file to read.
 * @returns The anchors and links parsed from the file.
 */
export async function parseFile(filePath: string): Promise<ParsedFile> {
  const rawContent = await readFile(filePath, { encoding: "utf8" });
  const isNotebook = path.extname(filePath) === ".ipynb";
  const markdown = isNotebook ? markdownFromNotebook(rawContent) : rawContent;

  const links = await parseLinks(markdown);
  const anchors = parseAnchors(markdown);
  return { anchors, links };
}

0 comments on commit a2d7b9a

Please sign in to comment.