Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix to match GitHub’s algorithm on unicode #38

Merged
merged 1 commit into from
Aug 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
const emoji = require('emoji-regex')
const regex = require('./regex.js')

module.exports = BananaSlug

const own = Object.hasOwnProperty
const whitespace = /\s/g
const specials = /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,./:;<=>?@[\]^`{|}~’]/g

function BananaSlug () {
const self = this
Expand Down Expand Up @@ -46,11 +44,7 @@ BananaSlug.prototype.reset = function () {
// Turn `string` into a slug.
// Returns '' when `string` is not a string; lowercases the input unless
// `maintainCase` is truthy.
// NOTE(review): this hunk looks like a flattened diff (the header above reports
// "2 additions & 8 deletions" for index.js) — the `.trim()` chain is presumably
// the removed implementation and the final `return` its replacement. Read
// literally, the first `return` wins and the last one is unreachable; confirm
// against the merged file before relying on either.
function slugger (string, maintainCase) {
if (typeof string !== 'string') return ''
if (!maintainCase) string = string.toLowerCase()

return string.trim()
.replace(specials, '')
.replace(emoji(), '')
.replace(whitespace, '-')
return string.replace(regex, '').replace(/ /g, '-')
}

BananaSlug.slug = slugger
17 changes: 12 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,23 @@
"url": "https://github.com/Flet/github-slugger/issues"
},
"files": [
"index.js"
"index.js",
"regex.js"
],
"dependencies": {
"emoji-regex": ">=6.0.0 <=6.1.1"
},
"devDependencies": {
"@octokit/rest": "^18.0.0",
"@unicode/unicode-12.1.0": "^1.0.0",
"hast-util-select": "^5.0.0",
"mdast-util-gfm": "^1.0.0",
"mdast-util-to-markdown": "^1.0.0",
"node-fetch": "^2.0.0",
"nyc": "^15.0.0",
"regenerate": "^1.0.0",
"rehype-parse": "^8.0.0",
"standard": "*",
"tap-spec": "^5.0.0",
"tape": "^4.0.0"
"tape": "^4.0.0",
"unified": "^10.0.0"
},
"homepage": "https://github.com/Flet/github-slugger",
"keywords": [
Expand Down
3 changes: 3 additions & 0 deletions regex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

145 changes: 145 additions & 0 deletions script/generate-fixtures.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import { promises as fs } from 'node:fs'
import { Octokit } from '@octokit/rest'
import fetch from 'node-fetch'
import { unified } from 'unified'
import rehypeParse from 'rehype-parse'
import { select, selectAll } from 'hast-util-select'
import { toMarkdown } from 'mdast-util-to-markdown'
import { gfmToMarkdown } from 'mdast-util-gfm'

// Note: the GH token needs `gists` access!
// Either `GH_TOKEN` or `GITHUB_TOKEN` may hold it; a non-empty `GH_TOKEN` takes
// precedence.
const ghToken = process.env.GH_TOKEN || process.env.GITHUB_TOKEN

if (!ghToken) {
  // Name both accepted variables so the message matches the lookup above.
  throw new Error('Missing GitHub token: expected `GH_TOKEN` or `GITHUB_TOKEN` in env')
}

// Authenticated GitHub client, used to create and delete the temporary Gist.
const octo = new Octokit({ auth: 'token ' + ghToken })

// Folder with one directory per Unicode General Category, resolved relative to
// this script.
const categoryBase = new URL('../node_modules/@unicode/unicode-12.1.0/General_Category/', import.meta.url)

// Take up to N samples from each category.
const samples = 400

// Handwritten test cases, rendered in this order before the generated
// per-category cases.
// Each case has a `name` (label) and an `input` (the heading text). When
// `markdownOverwrite` is present, it is used as the raw markdown for the
// heading instead of serializing `input` (needed for leading/trailing spaces,
// which have to be written as `&#x20;`).
// Several inputs are repeated on purpose: the cases are rendered as headings of
// one document, so repeats exercise how duplicate slugs are numbered.
const otherTests = [
  { name: 'Basic usage', input: 'alpha' },
  { name: 'Basic usage (again)', input: 'alpha' },
  { name: 'Camelcase', input: 'bravoCharlieDelta' },
  { name: 'Prototypal injection: proto', input: '__proto__' },
  { name: 'Prototypal injection: proto (again)', input: '__proto__' },
  { name: 'Prototypal injection: has own', input: 'hasOwnProperty' },
  { name: 'Repetition (1)', input: 'echo' },
  { name: 'Repetition (2)', input: 'echo' },
  { name: 'Repetition (3)', input: 'echo 1' },
  { name: 'Repetition (4)', input: 'echo-1' },
  { name: 'Repetition (5)', input: 'echo' },
  { name: 'More repetition (1)', input: 'foxtrot-1' },
  { name: 'More repetition (2)', input: 'foxtrot' },
  { name: 'More repetition (3)', input: 'foxtrot' },
  { name: 'Characters: dash', input: 'heading with a - dash' },
  { name: 'Characters: underscore', input: 'heading with an _ underscore' },
  { name: 'Characters: dot', input: 'heading with a period.txt' },
  { name: 'Characters: dots, parents, brackets', input: 'exchange.bind_headers(exchange, routing [, bindCallback])' },
  { name: 'Characters: space', input: ' ', markdownOverwrite: '# &#x20;' },
  { name: 'Characters: initial space', input: ' a', markdownOverwrite: '# &#x20;a' },
  { name: 'Characters: final space', input: 'a ', markdownOverwrite: '# a&#x20;' },
  { name: 'Characters: initial and final spaces', input: ' a ', markdownOverwrite: '# &#x20;a&#x20;' },
  { name: 'Characters: initial and final dashes', input: '-a-' },
  { name: 'Characters: apostrophe', input: 'apostrophe’s should be trimmed' },
  { name: 'Some more duplicates (1)', input: 'golf' },
  { name: 'Some more duplicates (2)', input: 'golf' },
  { name: 'Some more duplicates (3)', input: 'golf' },
  { name: 'Non-ascii: ♥', input: 'I ♥ unicode' },
  { name: 'Non-ascii: -', input: 'dash-dash' },
  { name: 'Non-ascii: –', input: 'en–dash' },
  // An actual em dash (U+2014): this case previously repeated the en dash
  // (U+2013) of the case above, so the em dash was never exercised.
  { name: 'Non-ascii: —', input: 'em—dash' },
  { name: 'Non-ascii: 😄', input: '😄 unicode emoji' },
  { name: 'Non-ascii: 😄-😄', input: '😄-😄 unicode emoji' },
  { name: 'Non-ascii: 😄_😄', input: '😄_😄 unicode emoji' },
  { name: 'Non-ascii: 😄', input: '😄 - an emoji' },
  { name: 'Non-ascii: :smile:', input: ':smile: - a gemoji' },
  { name: 'Non-ascii: Cyrillic (1)', input: 'Привет' },
  { name: 'Non-ascii: Cyrillic (2)', input: 'Профили пользователей' },
  { name: 'Non-ascii: Cyrillic + Han', input: 'Привет non-latin 你好' },
  { name: 'Gemoji (1)', input: ':ok: No underscore' },
  { name: 'Gemoji (2)', input: ':ok_hand: Single' },
  { name: 'Gemoji (3)', input: ':ok_hand::hatched_chick: Two in a row with no spaces' },
  { name: 'Gemoji (4)', input: ':ok_hand: :hatched_chick: Two in a row' }
]

// Start generating fixtures. `main` is async; the returned promise is not
// awaited or caught here.
main()

/**
 * Generate `test/fixtures.json` from what GitHub itself renders.
 *
 * Builds a markdown document with one heading per test case (the handwritten
 * `otherTests` plus one sampled case per Unicode General Category), uploads it
 * as a temporary Gist, fetches the rendered page, and stores the anchor GitHub
 * generated for each heading as that case’s `expected` slug.
 *
 * The Gist is always deleted again, also when rendering or fetching fails.
 *
 * @returns {Promise<void>}
 * @throws {Error}
 *   When GitHub treats the markdown as binary, the rendered page cannot be
 *   fetched, the page has no rendered markdown, or the number of anchors does
 *   not match the number of test cases.
 */
async function main () {
  const files = await fs.readdir(categoryBase)
  const tests = [...otherTests]

  // Create a test case with a bunch of examples.
  for (const name of files) {
    if (name === 'index.js') continue

    // These result in Git(Hub) thinking it’s a binary file.
    if (name === 'Control' || name === 'Surrogate') continue

    // This prevents GH from rendering markdown to HTML.
    if (name === 'Other') continue

    const fp = `./${name}/code-points.js`
    const { default: codePoints } = await import(new URL(fp, categoryBase))
    const subs = []

    // Evenly spaced samples across the category…
    for (let n = 0; n < samples; n++) {
      subs.push(codePoints[Math.floor(codePoints.length / samples * n)])
    }

    // …and always its last code point.
    subs.push(codePoints[codePoints.length - 1])

    // Categories smaller than `samples` yield repeats: dedupe with a `Set`.
    tests.push({ name, input: 'a' + [...new Set(subs)].map(d => String.fromCodePoint(d)).join(' ') + 'b' })
  }

  // Create a Gist.
  const filename = 'readme.md'
  const gistResult = await octo.gists.create({
    files: {
      [filename]: {
        content: tests.map(d => {
          return d.markdownOverwrite || toMarkdown({ type: 'heading', depth: 1, children: [{ type: 'text', value: d.input }] }, { extensions: [gfmToMarkdown()] })
        }).join('\n\n')
      }
    }
  })

  let doc

  try {
    const file = gistResult.data.files[filename]

    if (!file.language) {
      throw new Error('The generated markdown was seen as binary data instead of text by GitHub. This is likely because there are weird characters (such as control characters or lone surrogates) in it')
    }

    // Fetch the rendered page.
    const response = await fetch(gistResult.data.html_url, {
      headers: { Authorization: 'token ' + ghToken }
    })

    // Fail on an error page here, instead of later with a misleading
    // “could not be rendered” message.
    if (!response.ok) {
      throw new Error('Could not fetch the rendered Gist: ' + response.status + ' ' + response.statusText)
    }

    doc = await response.text()
  } finally {
    // Remove the Gist, also when something above threw, so that failed runs do
    // not leave Gists behind.
    await octo.gists.delete({ gist_id: gistResult.data.id })
  }

  const tree = unified().use(rehypeParse).parse(doc)
  const markdownBody = select('.markdown-body', tree)

  if (!markdownBody) {
    throw new Error('The generated markdown could not be rendered by GitHub as HTML. This is likely because there are weird characters in it')
  }

  const anchors = selectAll('h1 .anchor', markdownBody)

  // Anchors are matched to tests by position: a different count means every
  // following fixture would get the slug of another heading.
  if (anchors.length !== tests.length) {
    throw new Error('Expected one anchor per test case (' + tests.length + '), but GitHub rendered ' + anchors.length)
  }

  for (const [i, node] of anchors.entries()) {
    // Drop the initial `#` of the anchor’s `href`.
    tests[i].expected = node.properties.href.slice(1)
  }

  await fs.writeFile(new URL('../test/fixtures.json', import.meta.url), JSON.stringify(tests, null, 2) + '\n')
}
62 changes: 62 additions & 0 deletions script/generate-regex.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { promises as fs } from 'node:fs'
import regenerate from 'regenerate'
import alphabetics from '@unicode/unicode-12.1.0/Binary_Property/Alphabetic/code-points.js'

// Folder with one directory per Unicode General Category, resolved relative to
// this script.
const categoryBase = new URL(
  '../node_modules/@unicode/unicode-12.1.0/General_Category/',
  import.meta.url
)

// Unicode General Categories to remove, grouped by kind and flattened into one
// list of category names.
const ranges = [
  // Some numbers:
  ['Other_Number'],

  // Some punctuation; of `Dash_Punctuation`, everything except a normal `-`
  // (dash) goes.
  [
    'Close_Punctuation',
    'Final_Punctuation',
    'Initial_Punctuation',
    'Open_Punctuation',
    'Other_Punctuation',
    'Dash_Punctuation'
  ],

  // All of these:
  ['Symbol', 'Control', 'Private_Use', 'Format', 'Unassigned'],

  // All except a normal ` ` (space):
  ['Separator']
].flat()

// Start generating the regex module. `main` is async; the returned promise is
// not awaited or caught here.
main()

/**
 * Generate `regex.js`: a module exporting one global regex that matches every
 * character to strip.
 *
 * The regex covers all code points of the General Categories listed in
 * `ranges`, minus alphabetic code points, the space, and the dash.
 *
 * @returns {Promise<void>}
 */
async function main () {
  const generator = regenerate()

  // Add code points to strip.
  for (const name of ranges) {
    const fp = `./${name}/code-points.js`
    const { default: codePoints } = await import(new URL(fp, categoryBase))

    generator.add(codePoints)
  }

  generator
    // Some overlap between letters and Other Symbol.
    .remove(alphabetics)
    // Spaces are turned to `-`
    .remove(' ')
    // Dash is kept.
    .remove('-')

  // Resolve the output against this script instead of the current working
  // directory, so the file ends up in the project root wherever the script is
  // started from.
  await fs.writeFile(new URL('../regex.js', import.meta.url), [
    '// This module is generated by `script/`.',
    '/* eslint-disable no-control-regex, no-misleading-character-class, no-useless-escape */',
    // Stringifying the regex gives `/…/`: appending `g` makes it global.
    'module.exports = ' + generator.toRegExp() + 'g',
    ''
  ].join('\n'))
}
5 changes: 0 additions & 5 deletions test/1-basic-usage.md

This file was deleted.

5 changes: 0 additions & 5 deletions test/2-camel-case.md

This file was deleted.

7 changes: 0 additions & 7 deletions test/3-prototype.md

This file was deleted.

9 changes: 0 additions & 9 deletions test/4-matching-slugs-basic.md

This file was deleted.

5 changes: 0 additions & 5 deletions test/5-matching-slugs-again.md

This file was deleted.

17 changes: 0 additions & 17 deletions test/6-characters.md

This file was deleted.

5 changes: 0 additions & 5 deletions test/7-duplicates.md

This file was deleted.

23 changes: 0 additions & 23 deletions test/8-non-ascii.md

This file was deleted.

7 changes: 0 additions & 7 deletions test/9-emoji.md

This file was deleted.

Loading