Fix to match GitHub’s algorithm on unicode

I reverse engineered GitHub’s slugging algorithm. Somewhat based on #25 and #35.

To do that, I created two scripts:

* `generate-fixtures.mjs`, which generates a markdown file, in part from manual fixtures and in part from the Unicode General Categories, creates a gist, crawls the gist, removes it, and saves fixtures annotated with the expected result from GitHub
* `generate-regex.mjs`, which generates the regex that GitHub uses for characters to ignore.

The regex is about 2.5kb minzipped. This increases the file size of this project a bit. But matching GitHub is worth it in my opinion.

I also investigated regex `\p{}` classes in `/u` regexes. They work mostly fine, with two caveats: a) they don’t work everywhere, so using them would require a major release, b) GitHub does not implement the same Unicode version as browsers. I tested with Unicode 13 and 14, and they include characters that GitHub handles differently.

In the end, GitHub’s algorithm is mostly fine: strip non-alphanumeric characters, allow `-`, and turn ` ` (space) into `-`.

Finally, I removed the trim functionality, because it is not implemented by GitHub. To assert this, make a heading like so in a readme: `# &#x20;`. This is a space encoded as a character reference, meaning that markdown does not see it as the whitespace between the `#` and the content. In fact, this makes it the content. And GitHub creates a slug of `-` for it.

Closes GH-22. Closes GH-25. Closes GH-35. Closes GH-38.

Co-authored-by: Dan Flettre <flettre@gmail.com>
Co-authored-by: Jack Bates <jack@nottheoilrig.com>
Flet · Aug 24, 2021 · af59f34 · af59f34
1 parent 156591b
commit af59f34
Show file tree

Hide file tree

Showing 17 changed files with 627 additions and 290 deletions.
diff --git a/index.js b/index.js
@@ -1,10 +1,8 @@
-const emoji = require('emoji-regex')
+const regex = require('./regex.js')
 
 module.exports = BananaSlug
 
 const own = Object.hasOwnProperty
-const whitespace = /\s/g
-const specials = /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,./:;<=>?@[\]^`{|}~’]/g
 
 function BananaSlug () {
   const self = this
@@ -46,11 +44,7 @@ BananaSlug.prototype.reset = function () {
 function slugger (string, maintainCase) {
   if (typeof string !== 'string') return '' // non-string input yields an empty slug
   if (!maintainCase) string = string.toLowerCase() // lowercase unless the caller asks to keep the case
-
-  return string.trim()
-    .replace(specials, '')
-    .replace(emoji(), '')
-    .replace(whitespace, '-')
+  return string.replace(regex, '').replace(/ /g, '-') // drop every match of `regex` (from ./regex.js), then turn each space into `-`; the input is no longer trimmed
 }
 
 BananaSlug.slug = slugger
diff --git a/package.json b/package.json
@@ -11,16 +11,23 @@
     "url": "https://github.com/Flet/github-slugger/issues"
   },
   "files": [
-    "index.js"
+    "index.js",
+    "regex.js"
   ],
-  "dependencies": {
-    "emoji-regex": ">=6.0.0 <=6.1.1"
-  },
   "devDependencies": {
+    "@octokit/rest": "^18.0.0",
+    "@unicode/unicode-12.1.0": "^1.0.0",
+    "hast-util-select": "^5.0.0",
+    "mdast-util-gfm": "^1.0.0",
+    "mdast-util-to-markdown": "^1.0.0",
+    "node-fetch": "^2.0.0",
     "nyc": "^15.0.0",
+    "regenerate": "^1.0.0",
+    "rehype-parse": "^8.0.0",
     "standard": "*",
     "tap-spec": "^5.0.0",
-    "tape": "^4.0.0"
+    "tape": "^4.0.0",
+    "unified": "^10.0.0"
   },
   "homepage": "https://github.com/Flet/github-slugger",
   "keywords": [

diff --git a/regex.js b/regex.js
diff --git a/script/generate-fixtures.mjs b/script/generate-fixtures.mjs
@@ -0,0 +1,145 @@
+import { promises as fs } from 'node:fs'
+import { Octokit } from '@octokit/rest'
+import fetch from 'node-fetch'
+import { unified } from 'unified'
+import rehypeParse from 'rehype-parse'
+import { select, selectAll } from 'hast-util-select'
+import { toMarkdown } from 'mdast-util-to-markdown'
+import { gfmToMarkdown } from 'mdast-util-gfm'
+
+// Note: the GH token needs `gists` access!
+const ghToken = process.env.GH_TOKEN || process.env.GITHUB_TOKEN
+
+if (!ghToken) {
+  throw new Error('Missing GitHub token: expected `GH_TOKEN` in env')
+}
+
+const octo = new Octokit({ auth: 'token ' + ghToken })
+const categoryBase = new URL('../node_modules/@unicode/unicode-12.1.0/General_Category/', import.meta.url)
+
+// Take up to N samples from each category.
+const samples = 400
+
+const otherTests = [ // hand-written cases; when `markdownOverwrite` is set it is used verbatim as the heading markdown instead of serializing `input`
+  { name: 'Basic usage', input: 'alpha' },
+  { name: 'Basic usage (again)', input: 'alpha' },
+  { name: 'Camelcase', input: 'bravoCharlieDelta' },
+  { name: 'Prototypal injection: proto', input: '__proto__' },
+  { name: 'Prototypal injection: proto (again)', input: '__proto__' },
+  { name: 'Prototypal injection: has own', input: 'hasOwnProperty' },
+  { name: 'Repetition (1)', input: 'echo' },
+  { name: 'Repetition (2)', input: 'echo' },
+  { name: 'Repetition (3)', input: 'echo 1' },
+  { name: 'Repetition (4)', input: 'echo-1' },
+  { name: 'Repetition (5)', input: 'echo' },
+  { name: 'More repetition (1)', input: 'foxtrot-1' },
+  { name: 'More repetition (2)', input: 'foxtrot' },
+  { name: 'More repetition (3)', input: 'foxtrot' },
+  { name: 'Characters: dash', input: 'heading with a - dash' },
+  { name: 'Characters: underscore', input: 'heading with an _ underscore' },
+  { name: 'Characters: dot', input: 'heading with a period.txt' },
+  { name: 'Characters: dots, parents, brackets', input: 'exchange.bind_headers(exchange, routing [, bindCallback])' },
+  { name: 'Characters: space', input: ' ', markdownOverwrite: '# &#x20;' }, // the character reference keeps the space as heading content rather than markdown whitespace
+  { name: 'Characters: initial space', input: ' a', markdownOverwrite: '# &#x20;a' },
+  { name: 'Characters: final space', input: 'a ', markdownOverwrite: '# a&#x20;' },
+  { name: 'Characters: initial and final spaces', input: ' a ', markdownOverwrite: '# &#x20;a&#x20;' },
+  { name: 'Characters: initial and final dashes', input: '-a-' },
+  { name: 'Characters: apostrophe', input: 'apostrophe’s should be trimmed' },
+  { name: 'Some more duplicates (1)', input: 'golf' },
+  { name: 'Some more duplicates (2)', input: 'golf' },
+  { name: 'Some more duplicates (3)', input: 'golf' },
+  { name: 'Non-ascii: ♥', input: 'I ♥ unicode' },
+  { name: 'Non-ascii: -', input: 'dash-dash' },
+  { name: 'Non-ascii: –', input: 'en–dash' },
+  { name: 'Non-ascii: –', input: 'em–dash' }, // NOTE(review): appears to use the same dash character as the en-dash case above; confirm whether an em dash (U+2014) was intended
+  { name: 'Non-ascii: 😄', input: '😄 unicode emoji' },
+  { name: 'Non-ascii: 😄-😄', input: '😄-😄 unicode emoji' },
+  { name: 'Non-ascii: 😄_😄', input: '😄_😄 unicode emoji' },
+  { name: 'Non-ascii: 😄', input: '😄 - an emoji' },
+  { name: 'Non-ascii: :smile:', input: ':smile: - a gemoji' },
+  { name: 'Non-ascii: Cyrillic (1)', input: 'Привет' },
+  { name: 'Non-ascii: Cyrillic (2)', input: 'Профили пользователей' },
+  { name: 'Non-ascii: Cyrillic + Han', input: 'Привет non-latin 你好' },
+  { name: 'Gemoji (1)', input: ':ok: No underscore' },
+  { name: 'Gemoji (2)', input: ':ok_hand: Single' },
+  { name: 'Gemoji (3)', input: ':ok_hand::hatched_chick: Two in a row with no spaces' },
+  { name: 'Gemoji (4)', input: ':ok_hand: :hatched_chick: Two in a row' }
+]
+
+main()
+
+async function main () {
+  const files = await fs.readdir(categoryBase)
+  const tests = [...otherTests]
+  let index = -1
+
+  // Create a test case with a bunch of examples.
+  while (++index < files.length) {
+    const name = files[index]
+
+    if (name === 'index.js') continue
+
+    // These result in Git(Hub) thinking it’s a binary file.
+    if (name === 'Control' || name === 'Surrogate') continue
+
+    // This prevents GH from rendering markdown to HTML.
+    if (name === 'Other') continue
+
+    const fp = `./${name}/code-points.js`
+    const { default: codePoints } = await import(new URL(fp, categoryBase))
+    const subs = []
+
+    let n = -1
+
+    while (++n < samples) {
+      subs.push(codePoints[Math.floor(codePoints.length / samples * n)])
+    }
+
+    subs.push(codePoints[codePoints.length - 1])
+
+    tests.push({ name, input: 'a' + [...new Set(subs)].map(d => String.fromCodePoint(d)).join(' ') + 'b' })
+  }
+
+  // Create a Gist.
+  const filename = 'readme.md'
+  const gistResult = await octo.gists.create({
+    files: {
+      [filename]: {
+        content: tests.map(d => {
+          return d.markdownOverwrite || toMarkdown({ type: 'heading', depth: 1, children: [{ type: 'text', value: d.input }] }, { extensions: [gfmToMarkdown()] })
+        }).join('\n\n')
+      }
+    }
+  })
+
+  const file = gistResult.data.files[filename]
+
+  if (!file.language) {
+    throw new Error('The generated markdown was seen as binary data instead of text by GitHub. This is likely because there are weird characters (such as control characters or lone surrogates) in it')
+  }
+
+  // Fetch the rendered page.
+  const response = await fetch(gistResult.data.html_url, {
+    headers: { Authorization: 'token ' + ghToken }
+  })
+
+  const doc = await response.text()
+
+  // Remove the Gist.
+  await octo.gists.delete({ gist_id: gistResult.data.id })
+
+  const tree = unified().use(rehypeParse).parse(doc)
+  const markdownBody = select('.markdown-body', tree)
+
+  if (!markdownBody) {
+    throw new Error('The generated markdown could not be rendered by GitHub as HTML. This is likely because there are weird characters in it')
+  }
+
+  const anchors = selectAll('h1 .anchor', markdownBody)
+
+  anchors.forEach((node, i) => {
+    tests[i].expected = node.properties.href.slice(1)
+  })
+
+  await fs.writeFile(new URL('../test/fixtures.json', import.meta.url), JSON.stringify(tests, null, 2) + '\n')
+}
diff --git a/script/generate-regex.mjs b/script/generate-regex.mjs
@@ -0,0 +1,62 @@
+import { promises as fs } from 'node:fs'
+import regenerate from 'regenerate'
+import alphabetics from '@unicode/unicode-12.1.0/Binary_Property/Alphabetic/code-points.js'
+
+const categoryBase = new URL('../node_modules/@unicode/unicode-12.1.0/General_Category/', import.meta.url)
+
+// Unicode General Categories whose code points go into the generated strip regex (alphabetics, the space, and the dash are taken out again in `main`).
+const ranges = [
+  // Some numbers (only this number category is listed):
+  'Other_Number',
+
+  // Some punctuation (`Connector_Punctuation` is not listed):
+  'Close_Punctuation',
+  'Final_Punctuation',
+  'Initial_Punctuation',
+  'Open_Punctuation',
+  'Other_Punctuation',
+  // All dashes except a normal `-` (dash), which `main` removes from the set again
+  'Dash_Punctuation',
+
+  // All of these (minus any alphabetic code points, which `main` removes):
+  'Symbol',
+  'Control',
+  'Private_Use',
+  'Format',
+  'Unassigned',
+
+  // All separators except a normal ` ` (space), which `main` removes from the set again
+  'Separator'
+]
+
+main()
+
+async function main () {
+  const generator = regenerate()
+
+  let index = -1
+
+  // Add code points to strip.
+  while (++index < ranges.length) {
+    const name = ranges[index]
+    const fp = `./${name}/code-points.js`
+    const { default: codePoints } = await import(new URL(fp, categoryBase))
+
+    generator.add(codePoints)
+  }
+
+  generator
+    // Some overlap between letters and Other Symbol.
+    .remove(alphabetics)
+    // Spaces are turned to `-`
+    .remove(' ')
+    // Dash is kept.
+    .remove('-')
+
+  await fs.writeFile('regex.js', [
+    '// This module is generated by `script/`.',
+    '/* eslint-disable no-control-regex, no-misleading-character-class, no-useless-escape */',
+    'module.exports = ' + generator.toRegExp() + 'g',
+    ''
+  ].join('\n'))
+}
diff --git a/test/1-basic-usage.md b/test/1-basic-usage.md
diff --git a/test/2-camel-case.md b/test/2-camel-case.md
diff --git a/test/3-prototype.md b/test/3-prototype.md
diff --git a/test/4-matching-slugs-basic.md b/test/4-matching-slugs-basic.md
diff --git a/test/5-matching-slugs-again.md b/test/5-matching-slugs-again.md
diff --git a/test/6-characters.md b/test/6-characters.md
diff --git a/test/7-duplicates.md b/test/7-duplicates.md
diff --git a/test/8-non-ascii.md b/test/8-non-ascii.md
diff --git a/test/9-emoji.md b/test/9-emoji.md