From 8f7371cebfbd4283d96f94f3256c4fc772eb1c7b Mon Sep 17 00:00:00 2001 From: Blade Barringer Date: Tue, 1 Mar 2022 13:40:13 -0600 Subject: [PATCH] feat: decode html entities before sanitizing (#40) --- CHANGELOG.md | 20 ++++++++++++++++++++ README.md | 7 +++++++ src/__tests__/test.ts | 24 ++++++++++++++++++++++++ src/index.ts | 16 +++++++++++++--- 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e44628c..546a1b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,23 @@ +# unreleased + +**Breaking Changes** + +- Decode HTML characters automatically that would result in an XSS vulnerability when rendering links via a server rendered HTML file + +```js +// decodes to javascript:alert('XSS') +const vulnerableUrl = + "javascript:alert('XSS')"; + +sanitizeUrl(vulnerableUrl); // 'about:blank' + +const okUrl = "https://example.com/" + vulnerableUrl; + +// since the javascript bit is in the path instead of the protocol +// this is successfully sanitized +sanitizeUrl(okUrl); // 'https://example.com/javascript:alert('XSS')' +``` + # 5.0.2 - Fix issue where certain invisible white space characters were not being sanitized (#35) diff --git a/README.md b/README.md index b3fbe2a..20b4359 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,15 @@ sanitizeUrl("https://example.com"); // 'https://example.com' sanitizeUrl("http://example.com"); // 'http://example.com' sanitizeUrl("www.example.com"); // 'www.example.com' sanitizeUrl("mailto:hello@example.com"); // 'mailto:hello@example.com' +sanitizeUrl( + "https://example.com" +); // https://example.com sanitizeUrl("javascript:alert(document.domain)"); // 'about:blank' sanitizeUrl("jAvasCrIPT:alert(document.domain)"); // 'about:blank' sanitizeUrl(decodeURIComponent("JaVaScRiP%0at:alert(document.domain)")); // 'about:blank' +// HTML encoded javascript:alert('XSS') +sanitizeUrl( + "javascript:alert('XSS')" +); // 'about:blank' ``` diff --git a/src/__tests__/test.ts 
b/src/__tests__/test.ts index a70a445..51cb224 100644 --- a/src/__tests__/test.ts +++ b/src/__tests__/test.ts @@ -92,6 +92,30 @@ describe("sanitizeUrl", () => { ); }); + it("decodes html entities", () => { + // all these decode to javascript:alert('xss'); + const attackVectors = [ + "javascript:alert('XSS')", + "javascript:alert('XSS')", + "javascript:alert('XSS')", + "jav ascript:alert('XSS');", + "  javascript:alert('XSS');", + ]; + + attackVectors.forEach((vector) => { + expect(sanitizeUrl(vector)).toBe("about:blank"); + }); + + // https://example.com/javascript:alert('XSS') + // since the javascript is the url path, and not the protocol, + // this url is technically sanitized + expect( + sanitizeUrl( + "https://example.com/javascript:alert('XSS')" + ) + ).toBe("https://example.com/javascript:alert('XSS')"); + }); + describe("invalid protocols", () => { describe.each(["javascript", "data", "vbscript"])("%s", (protocol) => { it(`replaces ${protocol} urls with about:blank`, () => { diff --git a/src/index.ts b/src/index.ts index ae98174..801dfae 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,5 @@ const invalidProtocolRegex = /^([^\w]*)(javascript|data|vbscript)/im; +const htmlEntitiesRegex = /&#(\w+)(^\w|;)?/g; const ctrlCharactersRegex = /[\u0000-\u001F\u007F-\u009F\u2000-\u200D\uFEFF]/gim; const urlSchemeRegex = /^([^:]+):/gm; @@ -8,13 +9,22 @@ function isRelativeUrlWithoutProtocol(url: string): boolean { return relativeFirstCharacters.indexOf(url[0]) > -1; } +// adapted from https://stackoverflow.com/a/29824550/2601552 +function decodeHtmlCharacters(str: string) { + return str.replace(htmlEntitiesRegex, (match, dec) => { + return String.fromCharCode(dec); + }); +} + export function sanitizeUrl(url?: string): string { - if (!url) { + const sanitizedUrl = decodeHtmlCharacters(url || "") + .replace(ctrlCharactersRegex, "") + .trim(); + + if (!sanitizedUrl) { return "about:blank"; } - const sanitizedUrl = url.replace(ctrlCharactersRegex, "").trim(); - if 
(isRelativeUrlWithoutProtocol(sanitizedUrl)) { return sanitizedUrl; }