-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #105 from w3c/export-minutes-tool
Add tool to convert Google Docs to Markdown
- Loading branch information
Showing
1 changed file
with
242 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,242 @@ | ||
<!DOCTYPE html> | ||
<!-- | ||
This is a tool to convert the minutes from Google Docs to GitHub-flavored markdown. | ||
It is designed for use with https://github.com/w3c/webextensions | ||
and only supports the (standard Google Docs) syntax from | ||
https://docs.google.com/document/d/1QkwhEMtMS67JBUkl_WVPZ4lRSKoWcQNlLJSf_GwSXg8/edit | ||
Questions? Ask rob@robwu.nl | ||
--> | ||
<head> | ||
<meta charset="utf-8"> | ||
<title>WECG minutes converter - from Google Docs to Markdown</title> | ||
<style> | ||
html, body { | ||
height: 100%; | ||
margin: 0; | ||
padding: 0; | ||
} | ||
body { | ||
display: flex; | ||
flex-direction: column; | ||
} | ||
#extraInfoOutput { | ||
white-space: pre-wrap; | ||
height: 7em; | ||
} | ||
#input, #output { | ||
flex: 1; | ||
overflow: auto; | ||
background: lightgrey; | ||
} | ||
</style> | ||
</head> | ||
<body> | ||
<div>Select the text in Google Docs and Paste the contents below:</div> | ||
<div id="input" contenteditable></div> | ||
<div> | ||
<input type="button" id="convert" value="Convert above paste from Google Doc to (Github-flavored) markdown"> | ||
</div> | ||
<div id="extraInfoOutput"></div> | ||
<textarea id="output" placeholder="Markdown output appears here"></textarea> | ||
<script> | ||
// Page elements used by the converter UI.
var input = document.getElementById("input");
var output = document.getElementById("output");
var extraInfoOutput = document.getElementById("extraInfoOutput");
var convert = document.getElementById("convert");

/**
 * Click handler of the convert button.
 *
 * Converts the pasted content to markdown, shows it in the output textarea and
 * lists the issue/PR numbers found in the generated markdown:
 * - numbers that appear in links to w3c/webextensions issues or pull requests;
 * - numbers that are mentioned as "issue N" but never appear in such a link.
 */
convert.onclick = function() {
  const markdownText = convertToMarkdown(input);
  output.value = markdownText;

  // Sets keep the numbers unique and in order of first appearance.
  const issues = new Set();
  const prs = new Set();
  const linkPattern = /https:\/\/github\.com\/w3c\/webextensions\/(issues|pull)\/(\d+)/g;
  for (const [, kind, number] of markdownText.matchAll(linkPattern)) {
    const target = kind === "pull" ? prs : issues;
    target.add(number);
  }

  // Plain-text mentions whose number was not seen in any issue or PR link.
  const mentionedWithoutLink = new Set();
  for (const [, number] of markdownText.matchAll(/\sissue (\d+)/gi)) {
    if (issues.has(number) || prs.has(number)) {
      continue;
    }
    mentionedWithoutLink.add(number);
  }

  // Formats a collection of numbers as "#1, #2", or "-" when it is empty.
  const serializeIssues = (issueNrs) =>
    [...issueNrs].map((issueNr) => `#${issueNr}`).join(", ") || "-";

  extraInfoOutput.textContent = `
List of issues/PRs in order of appearance in the input:
- Issues: ${serializeIssues(issues)}
- PRs: ${serializeIssues(prs)}
- Mentioned issues without link to issue: ${serializeIssues(mentionedWithoutLink)}`;
};
|
||
/**
 * Converts content pasted from Google Docs to GitHub-flavored markdown.
 *
 * This formatter does the following:
 * - Apply code formatting.
 * - Replace < with &lt;
 * - Replace * and _ with \* and \_.
 * - Replace boldfaced with **xx**
 * - Replace italic with _xx_
 * - Replace links with [text](anchor)
 * - Replace h1, h2, h3 with #, ## and ###
 * - Format h1 header for consistency.
 * - Replace ol,ul and li with correctly indented list items.
 * - Fixup whitespace.
 *
 * @param {Element} elemRootInput - Element containing the pasted content. It is
 *   cloned before any modification, so the element itself is left untouched.
 * @returns {string} The markdown text.
 */
function convertToMarkdown(elemRootInput) {
  // All markers below are inserted into this copy; the final markdown is its textContent.
  let root = elemRootInput.cloneNode(true);

  // Apply code formatting first, before escaping characters.
  for (let c of root.querySelectorAll(`span[style*="font-family:'Courier New'"]`)) {
    c.prepend("`");
    c.append("`");
    // replaceAllInTextNodes skips ` only if they are in the same text node.
    c.normalize();
  }

  // Escape < to avoid rendering as HTML.
  replaceAllInTextNodes(root, "<", "&lt;");

  // Replace all unescaped _ and * with escaped ones to avoid undesired formatting.
  replaceAllInTextNodes(root, /(?<=\s|^)[*_]|[*_](?=\s|$)/g, "\\$&");

  // Apply boldfaced appearance.
  for (let b of root.querySelectorAll(`span[style*="font-weight:700"]`)) {
    b.prepend("**");
    b.append("**");
  }

  // Apply italic appearance.
  for (let i of root.querySelectorAll(`span[style*="font-style:italic"]`)) {
    i.prepend("_");
    i.append("_");
  }

  // Render links.
  for (let a of root.querySelectorAll("a[href]")) {
    // A link whose text is its own URL is left as-is.
    if (a.href === a.textContent.trim()) {
      continue;
    }
    // A literal ")" would terminate the markdown link target early.
    let href = a.href.replaceAll(")", "%29");
    a.prepend("[");
    a.append(`](${href})`);
  }

  // Format headers
  for (let h of root.querySelectorAll("h1")) {
    // Replace header of the form:
    //   WECG Meetings 2021, Public Notes—Oct 28, 2021
    // with:
    //   WECG Meetings 2021, Public Notes, Oct 28
    replaceAllInTextNodes(
      h,
      /(WECG Meetings \d{4}, Public Notes)—([A-Za-z]+ \d{1,2}), \d{4}/g,
      "$1, $2"
    );
    h.prepend(`\n# `);
  }
  for (let h of root.querySelectorAll("h2")) {
    h.prepend(`\n## `);
  }
  for (let h of root.querySelectorAll("h3")) {
    h.prepend(`\n### `);
  }

  for (let li of root.querySelectorAll("li")) {
    // Nesting depth = number of ol/ul ancestors between the item and the root.
    let level = 0;
    for (let parentNode = li.parentNode; parentNode !== root; parentNode = parentNode.parentNode) {
      if (parentNode.tagName === "OL" || parentNode.tagName === "UL") {
        ++level;
      }
    }
    let listItems = Array.from(li.parentNode.children).filter(e => e.tagName === "LI");
    let listIndex = listItems.indexOf(li) + 1;

    // Top-level (level 1) has no extra indentation, other levels 2 spaces per level.
    // Math.max guards against an li without any list ancestor (level 0), for
    // which repeat() would otherwise throw a RangeError.
    let prefix = "  ".repeat(Math.max(level - 1, 0));
    if (li.parentNode.tagName === "OL") {
      prefix += ` ${listIndex}. `;
    } else {
      prefix += " * ";
    }
    li.prepend(prefix);
    let isNewList = li.parentNode.previousElementSibling?.tagName !== li.parentNode.tagName;
    if (level === 1 && listIndex === 1 && isNewList) {
      // Insert blank line before top-level list.
      li.before("\n");
    }
  }

  // Forced line break after every paragraph and br.
  for (let elem of root.querySelectorAll("p, br")) {
    elem.after("\n");
  }
  // Blank line after every header.
  for (let elem of root.querySelectorAll("h1,h2,h3")) {
    elem.after("\n\n");
  }

  let textContent = root.textContent;

  // Normalize ’ to '.
  textContent = textContent.replaceAll("’", "'");

  // Normalize non-breaking whitespace to regular whitespace.
  textContent = textContent.replaceAll("\xA0", " ");

  // Docs sometimes appends a space to a link even if not in the source text. Strip it
  textContent = textContent.replaceAll(/ +(\]\([^)\n]+\)) */g, "$1 ");

  // Trim trailing whitespace.
  textContent = textContent.replaceAll(/ +$/gm, "");

  // Remove consecutive line breaks to at most one empty line.
  // May happen if header is followed by enumeration.
  textContent = textContent.replaceAll(/(\n\n)\n+/g, "$1");

  // Each section header has two blank lines in front of it.
  textContent = textContent.replaceAll(/^(?=#+ )/gm, "\n");

  // Trim leading whitespace.
  textContent = textContent.trim();

  return textContent;
}
|
||
/**
 * Applies a replacement to every text node below `root`, leaving text between
 * a pair of backticks (inline code) untouched.
 *
 * If a text node contains an odd number of backticks, code spans cannot be
 * told apart from regular text and the replacement is applied to the whole node.
 *
 * @param {Node} root - Node whose descendant text nodes are processed.
 * @param {string|RegExp} pattern - Pattern as accepted by String.prototype.replaceAll;
 *   a RegExp must have the global (g) flag, otherwise replaceAll throws a TypeError.
 * @param {string} replacement - Replacement as accepted by String.prototype.replaceAll
 *   (may use patterns such as $& or $1).
 */
function replaceAllInTextNodes(root, pattern, replacement) {
  let treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_TEXT);
  // Collect the changes first and apply them after the walk, so that the tree
  // is not modified while it is being traversed.
  let updatedNodes = [];
  for (let node = treeWalker.nextNode(); node; node = treeWalker.nextNode()) {
    let orig = node.nodeValue;
    let proposed;
    let origParts = orig.split("`");
    if (origParts.length % 2) {
      // Contains an even number of `; skip over code blocks.
      proposed = origParts.map((str, i) => {
        if (i % 2) {
          // Odd parts are enclosed by a pair of backticks: keep code as-is.
          return str;
        }
        return str.replaceAll(pattern, replacement);
      }).join("`");
    } else {
      // Unbalanced backticks: treat the whole node as regular text.
      proposed = orig.replaceAll(pattern, replacement);
    }
    if (orig !== proposed) {
      updatedNodes.push([node, proposed]);
    }
  }
  for (let [node, proposed] of updatedNodes) {
    node.parentNode.replaceChild(document.createTextNode(proposed), node);
  }
}
</script> | ||
</body> | ||
</html> |