Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to strip Unicode from entry filenames #1135

Merged
merged 13 commits into from
Mar 27, 2018
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@
"dependencies": {
"classnames": "^2.2.5",
"create-react-class": "^15.6.0",
"diacritics": "^1.3.0",
"fuzzy": "^0.1.1",
"gotrue-js": "^0.9.15",
"gray-matter": "^3.0.6",
Expand Down
9 changes: 8 additions & 1 deletion src/actions/config.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import yaml from "js-yaml";
import { Map, List, fromJS } from "immutable";
import { trimStart, flow } from "lodash";
import { trimStart, flow, isBoolean } from "lodash";
import { authenticateUser } from "Actions/auth";
import * as publishModes from "Constants/publishModes";

Expand Down Expand Up @@ -43,6 +43,13 @@ export function validateConfig(config) {
if (typeof config.get('media_folder') !== 'string') {
throw new Error("Error in configuration file: Your `media_folder` must be a string. Check your config.yml file.");
}
const slug_encoding = config.getIn(['slug', 'encoding'], "unicode");
if (slug_encoding !== "unicode" && slug_encoding !== "ascii") {
throw new Error("Error in configuration file: Your `slug.encoding` must be either `unicode` or `ascii`. Check your config.yml file.")
}
if (!isBoolean(config.getIn(['slug', 'clean_accents'], false))) {
throw new Error("Error in configuration file: Your `slug.clean_accents` must be a boolean. Check your config.yml file.");
}
if (!config.get('collections')) {
throw new Error("Error in configuration file: A `collections` wasn\'t found. Check your config.yml file.");
}
Expand Down
6 changes: 4 additions & 2 deletions src/actions/mediaLibrary.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { createAssetProxy } from 'ValueObjects/AssetProxy';
import { getAsset, selectIntegration } from 'Reducers';
import { getIntegrationProvider } from 'Integrations';
import { addAsset } from './media';
import { sanitizeSlug } from "Lib/urlHelper";

const { notifSend } = notifActions;

Expand Down Expand Up @@ -79,7 +80,8 @@ export function persistMedia(file, opts = {}) {
const backend = currentBackend(state.config);
const integration = selectIntegration(state, null, 'assetStore');
const files = state.mediaLibrary.get('files');
const existingFile = files.find(existingFile => existingFile.name.toLowerCase() === file.name.toLowerCase());
const fileName = sanitizeSlug(file.name.toLowerCase(), state.config.get('slug'));
const existingFile = files.find(existingFile => existingFile.name.toLowerCase() === fileName);

/**
* Check for existing files of the same name before persisting. If no asset
Expand All @@ -98,7 +100,7 @@ export function persistMedia(file, opts = {}) {
dispatch(mediaPersisting());

try {
const assetProxy = await createAssetProxy(file.name.toLowerCase(), file, false, privateUpload);
const assetProxy = await createAssetProxy(fileName, file, false, privateUpload);
dispatch(addAsset(assetProxy));
if (!integration) {
const asset = await backend.persistMedia(assetProxy);
Expand Down
10 changes: 5 additions & 5 deletions src/backends/backend.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class LocalStorageAuthStore {
}
}

const slugFormatter = (template = "{{slug}}", entryData) => {
const slugFormatter = (template = "{{slug}}", entryData, slugConfig) => {
const date = new Date();

const getIdentifier = (entryData) => {
Expand Down Expand Up @@ -76,10 +76,10 @@ const slugFormatter = (template = "{{slug}}", entryData) => {
// Convert slug to lower-case
.toLocaleLowerCase()

// Replace periods and spaces with dashes.
.replace(/[.\s]/g, '-');
// Replace periods with dashes.
.replace(/[.]/g, '-');

return sanitizeSlug(slug);
return sanitizeSlug(slug, slugConfig);
};

class Backend {
Expand Down Expand Up @@ -242,7 +242,7 @@ class Backend {
if (!selectAllowNewEntries(collection)) {
throw (new Error("Not allowed to create new entries in this collection"));
}
const slug = slugFormatter(collection.get("slug"), entryDraft.getIn(["entry", "data"]));
const slug = slugFormatter(collection.get("slug"), entryDraft.getIn(["entry", "data"]), config.get("slug"));
const path = selectEntryPath(collection, slug);
entryObj = {
path,
Expand Down
67 changes: 46 additions & 21 deletions src/lib/__tests__/urlHelper.spec.js
Original file line number Diff line number Diff line change
@@ -1,52 +1,59 @@
import { sanitizeIRI, sanitizeSlug } from '../urlHelper';
import { Map } from 'immutable';
import { sanitizeURI, sanitizeSlug } from '../urlHelper';

describe('sanitizeIRI', () => {
// `sanitizeIRI` tests from RFC 3987
describe('sanitizeURI', () => {
// `sanitizeURI` tests from RFC 3987
it('should keep valid URI chars (letters digits _ - . ~)', () => {
expect(
sanitizeIRI("This, that-one_or.the~other 123!")
sanitizeURI("This, that-one_or.the~other 123!")
).toEqual('Thisthat-one_or.the~other123');
});

it('should not remove accents', () => {
expect(
sanitizeIRI("ěščřžý")
sanitizeURI("ěščřžý")
).toEqual('ěščřžý');
});

it('should keep valid non-latin chars (ucschars in RFC 3987)', () => {
expect(
sanitizeIRI("日本語のタイトル")
sanitizeURI("日本語のタイトル")
).toEqual('日本語のタイトル');
});

it('should not keep valid non-latin chars (ucschars in RFC 3987) if set to ASCII mode', () => {
expect(
sanitizeURI("ěščřžý日本語のタイトル", { encoding: 'ascii' })
).toEqual('');
});

it('should not normalize Unicode strings', () => {
expect(
sanitizeIRI('\u017F\u0323\u0307')
sanitizeURI('\u017F\u0323\u0307')
).toEqual('\u017F\u0323\u0307');
expect(
sanitizeIRI('\u017F\u0323\u0307')
sanitizeURI('\u017F\u0323\u0307')
).not.toEqual('\u1E9B\u0323');
});

it('should allow a custom replacement character', () => {
expect(
sanitizeIRI("duck\\goose.elephant", { replacement: '-' })
sanitizeURI("duck\\goose.elephant", { replacement: '-' })
).toEqual('duck-goose.elephant');
});

it('should not allow an improper replacement character', () => {
expect(() => {
sanitizeIRI("I! like! dollars!", { replacement: '$' });
sanitizeURI("I! like! dollars!", { replacement: '$' });
}).toThrow();
});

it('should not actually URI-encode the characters', () => {
expect(
sanitizeIRI("🎉")
sanitizeURI("🎉")
).toEqual('🎉');
expect(
sanitizeIRI("🎉")
sanitizeURI("🎉")
).not.toEqual("%F0%9F%8E%89");
});
});
Expand All @@ -65,14 +72,14 @@ describe('sanitizeSlug', ()=> {
});

it('throws an error for non-string replacements', () => {
expect(() => sanitizeSlug('test', { replacement: {} })).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', { replacement: [] })).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', { replacement: false })).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', { replacement: null } )).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', { replacement: 11232 })).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', Map({ sanitize_replacement: {} }))).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', Map({ sanitize_replacement: [] }))).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', Map({ sanitize_replacement: false }))).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', Map({ sanitize_replacement: null } ))).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', Map({ sanitize_replacement: 11232 }))).toThrowError("`options.replacement` must be a string.");
// do not test undefined for this variant since a default is set in the constructor.
//expect(() => sanitizeSlug('test', { replacement: undefined })).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', { replacement: ()=>{} })).toThrowError("`options.replacement` must be a string.");
//expect(() => sanitizeSlug('test', { sanitize_replacement: undefined })).toThrowError("`options.replacement` must be a string.");
expect(() => sanitizeSlug('test', Map({ sanitize_replacement: ()=>{} }))).toThrowError("`options.replacement` must be a string.");
});

it('should keep valid URI chars (letters digits _ - . ~)', () => {
Expand All @@ -81,6 +88,24 @@ describe('sanitizeSlug', ()=> {
).toEqual('This-that-one_or.the~other-123');
});

it('should remove accents with `clean_accents` set', () => {
expect(
sanitizeSlug("ěščřžý", Map({ clean_accents: true }))
).toEqual('escrzy');
});

it('should remove non-latin chars in "ascii" mode', () => {
expect(
sanitizeSlug("ěščřžý日本語のタイトル", Map({ encoding: 'ascii' }))
).toEqual('');
});

it('should clean accents and strip non-latin chars in "ascii" mode with `clean_accents` set', () => {
expect(
sanitizeSlug("ěščřžý日本語のタイトル", Map({ encoding: 'ascii', clean_accents: true }))
).toEqual('escrzy');
});

it('removes double replacements', () => {
expect(sanitizeSlug('test--test')).toEqual('test-test');
expect(sanitizeSlug('test test')).toEqual('test-test');
Expand All @@ -91,7 +116,7 @@ describe('sanitizeSlug', ()=> {
});

it('uses alternate replacements', () => {
expect(sanitizeSlug('test test ', { replacement: '_' })).toEqual('test_test');
expect(sanitizeSlug('test test ', Map({ sanitize_replacement: '_' }))).toEqual('test_test');
});

});
});
66 changes: 43 additions & 23 deletions src/lib/urlHelper.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import url from 'url';
import diacritics from 'diacritics';
import sanitizeFilename from 'sanitize-filename';
import { isString, escapeRegExp, flow, partialRight } from 'lodash';
import { Map } from 'immutable';

function getUrl(url, direct) {
return `${ direct ? '/#' : '' }${ url }`;
function getUrl(urlString, direct) {
return `${ direct ? '/#' : '' }${ urlString }`;
}

export function getCollectionUrl(collectionName, direct) {
Expand All @@ -20,9 +22,9 @@ export function addParams(urlString, params) {
return url.format(parsedUrl);
}

export function stripProtocol(url) {
const protocolEndIndex = url.indexOf('//');
return protocolEndIndex > -1 ? url.slice(protocolEndIndex + 2) : url;
export function stripProtocol(urlString) {
const protocolEndIndex = urlString.indexOf('//');
return protocolEndIndex > -1 ? urlString.slice(protocolEndIndex + 2) : urlString;
}

/* See https://www.w3.org/International/articles/idn-and-iri/#path.
Expand All @@ -34,34 +36,52 @@ export function stripProtocol(url) {
*/
const uriChars = /[\w\-.~]/i;
const ucsChars = /[\xA0-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/u;
const validIRIChar = (char) => (uriChars.test(char) || ucsChars.test(char));
// `sanitizeIRI` does not actually URI-encode the chars (that is the browser's and server's job), just removes the ones that are not allowed.
export function sanitizeIRI(str, { replacement = "" } = {}) {
if (!isString(str)) throw "The input slug must be a string.";
if (!isString(replacement)) throw "`options.replacement` must be a string.";
const validURIChar = char => uriChars.test(char);
const validIRIChar = char => uriChars.test(char) || ucsChars.test(char);
// `sanitizeURI` does not actually URI-encode the chars (that is the browser's and server's job), just removes the ones that are not allowed.
export function sanitizeURI(str, { replacement = "", encoding = "unicode" } = {}) {
if (!isString(str)) {
throw new Error("The input slug must be a string.");
}
if (!isString(replacement)) {
throw new Error("`options.replacement` must be a string.");
}

let validChar;
if (encoding === "unicode") {
validChar = validIRIChar;
} else if (encoding === "ascii") {
validChar = validURIChar;
} else {
throw new Error('`options.encoding` must be "unicode" or "ascii".');
}

// Check and make sure the replacement character is actually a safe char itself.
if (!Array.from(replacement).every(validIRIChar)) throw "The replacement character(s) (options.replacement) is itself unsafe.";
if (!Array.from(replacement).every(validChar)) {
throw new Error("The replacement character(s) (options.replacement) is itself unsafe.");
}

// `Array.from` must be used instead of `String.split` because
// `split` breaks things like emojis apart into their individual UTF-16 surrogate halves.
return Array.from(str).map(char => (validIRIChar(char) ? char : replacement)).join('');
return Array.from(str).map(char => (validChar(char) ? char : replacement)).join('');
}

export function sanitizeSlug(str, { replacement = '-' } = {}) {
if (!isString(str)) throw "The input slug must be a string.";
if (!isString(replacement)) throw "`options.replacement` must be a string.";
export function sanitizeSlug(str, options = Map()) {
const encoding = options.get('encoding', 'unicode');
const stripDiacritics = options.get('clean_accents', false);
const replacement = options.get('sanitize_replacement', '-');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentionally undocumented?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't sure if we wanted to wait until someone actually had a valid use case for it -- validating it in src/actions/config.js would take a bit of effort. Thoughts?

Copy link
Contributor

@erquhart erquhart Mar 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nah, I'm fine with it as is, was just curious.


if (!isString(str)) { throw new Error("The input slug must be a string."); }

// Sanitize as IRI (i18n URI) and as filename.
const sanitize = flow([
partialRight(sanitizeIRI, { replacement }),
const sanitizedSlug = flow([
...(stripDiacritics ? [diacritics.remove] : []),
partialRight(sanitizeURI, { replacement, encoding }),
partialRight(sanitizeFilename, { replacement }),
]);
const sanitizedSlug = sanitize(str);

])(str);

// Remove any doubled or trailing replacement characters (that were added in the sanitizers).
const doubleReplacement = new RegExp('(?:' + escapeRegExp(replacement) + ')+', 'g');
const trailingReplacment = new RegExp(escapeRegExp(replacement) + '$');
const doubleReplacement = new RegExp(`(?:${ escapeRegExp(replacement) })+`, 'g');
const trailingReplacment = new RegExp(`${ escapeRegExp(replacement) }$`);
const normalizedSlug = sanitizedSlug
.replace(doubleReplacement, replacement)
.replace(trailingReplacment, '');
Expand Down
18 changes: 18 additions & 0 deletions website/site/content/docs/configuration-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,24 @@ public_folder: "/images/uploads"

Based on the settings above, if a user used an image widget field called `avatar` to upload and select an image called `philosoraptor.png`, the image would be saved to the repository at `/static/images/uploads/philosoraptor.png`, and the `avatar` field for the file would be set to `/images/uploads/philosoraptor.png`.

## Slug Type

The `slug` option allows you to change how filenames for entries are created and sanitized. For modifying the actual data in a slug, see the per-collection option below.

`slug` accepts multiple options:

- `encoding`
- `unicode` (default): Sanitize filenames (slugs) according to [RFC3987](https://tools.ietf.org/html/rfc3987) and the [WHATWG URL spec](https://url.spec.whatwg.org/). These specs allow non-ASCII (or non-Latin) characters to exist in URLs.
- `ascii`: Sanitize filenames (slugs) according to [RFC3986](https://tools.ietf.org/html/rfc3986). The only allowed characters are (0-9, a-z, A-Z, `_`, `-`, `.`, `~`).
- `clean_accents`: Set to `true` to remove diacritics from slug characters before sanitizing. This is often helpful when using `ascii` encoding.

**Example**

``` yaml
slug:
encoding: "ascii"
clean_accents: true
```

## Collections

Expand Down
4 changes: 4 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2333,6 +2333,10 @@ detect-node@^2.0.3:
version "2.0.3"
resolved "https://registry.yarnpkg.com/detect-node/-/detect-node-2.0.3.tgz#a2033c09cc8e158d37748fbde7507832bd6ce127"

diacritics@^1.3.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/diacritics/-/diacritics-1.3.0.tgz#3efa87323ebb863e6696cebb0082d48ff3d6f7a1"

diff@^3.2.0:
version "3.4.0"
resolved "https://registry.yarnpkg.com/diff/-/diff-3.4.0.tgz#b1d85507daf3964828de54b37d0d73ba67dda56c"
Expand Down