diff --git a/actions/setup/js/sanitize_content.cjs b/actions/setup/js/sanitize_content.cjs index 76d5733a80..a68bb2744a 100644 --- a/actions/setup/js/sanitize_content.cjs +++ b/actions/setup/js/sanitize_content.cjs @@ -21,6 +21,7 @@ const { convertXmlTags, neutralizeBotTriggers, applyTruncation, + hardenUnicodeText, } = require("./sanitize_content_core.cjs"); const { balanceCodeRegions } = require("./markdown_code_region_balancer.cjs"); @@ -72,6 +73,9 @@ function sanitizeContent(content, maxLengthOrOptions) { let sanitized = content; + // Apply Unicode hardening first to normalize text representation + sanitized = hardenUnicodeText(sanitized); + // Remove ANSI escape sequences and control characters early sanitized = sanitized.replace(/\x1b\[[0-9;]*[mGKH]/g, ""); sanitized = sanitized.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, ""); diff --git a/actions/setup/js/sanitize_content.test.cjs b/actions/setup/js/sanitize_content.test.cjs index 5ef9dec249..d6f920d3f5 100644 --- a/actions/setup/js/sanitize_content.test.cjs +++ b/actions/setup/js/sanitize_content.test.cjs @@ -1069,4 +1069,253 @@ describe("sanitize_content.cjs", () => { }); }); }); + + describe("Unicode hardening transformations", () => { + describe("zero-width character removal", () => { + it("should remove zero-width space (U+200B)", () => { + const input = "Hello\u200BWorld"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove zero-width non-joiner (U+200C)", () => { + const input = "Test\u200CText"; + const expected = "TestText"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove zero-width joiner (U+200D)", () => { + const input = "Hello\u200DWorld"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove word joiner (U+2060)", () => { + const input = "Word\u2060Joiner"; + const expected = "WordJoiner"; + expect(sanitizeContent(input)).toBe(expected); + }); + + 
it("should remove byte order mark (U+FEFF)", () => { + const input = "\uFEFFHello World"; + const expected = "Hello World"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove multiple zero-width characters", () => { + const input = "A\u200BB\u200CC\u200DD\u2060E\uFEFFF"; + const expected = "ABCDEF"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should handle text with only zero-width characters", () => { + const input = "\u200B\u200C\u200D"; + const expected = ""; + expect(sanitizeContent(input)).toBe(expected); + }); + }); + + describe("Unicode normalization (NFC)", () => { + it("should normalize composed characters", () => { + // e + combining acute accent -> precomposed é + const input = "cafe\u0301"; // café with combining accent + const result = sanitizeContent(input); + // After NFC normalization, should be composed form + expect(result).toBe("café"); + // Verify it's the precomposed character (U+00E9) + expect(result.charCodeAt(3)).toBe(0x00e9); + }); + + it("should normalize multiple combining characters", () => { + const input = "n\u0303"; // ñ with combining tilde + const result = sanitizeContent(input); + expect(result).toBe("ñ"); + }); + + it("should handle already normalized text", () => { + const input = "Hello World"; + const expected = "Hello World"; + expect(sanitizeContent(input)).toBe(expected); + }); + }); + + describe("full-width ASCII conversion", () => { + it("should convert full-width exclamation mark", () => { + const input = "Hello\uFF01"; // Full-width ! 
+ const expected = "Hello!"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should convert full-width letters", () => { + const input = "\uFF21\uFF22\uFF23"; // Full-width ABC + const expected = "ABC"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should convert full-width digits", () => { + const input = "\uFF11\uFF12\uFF13"; // Full-width 123 + const expected = "123"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should convert full-width parentheses", () => { + const input = "\uFF08test\uFF09"; // Full-width (test) + const expected = "(test)"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should convert mixed full-width and normal text", () => { + const input = "Hello\uFF01 \uFF37orld"; // Hello! World with full-width ! and W + const expected = "Hello! World"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should convert full-width at sign", () => { + const input = "\uFF20user"; // Full-width @user + // Note: @ mention will also be neutralized + const result = sanitizeContent(input); + expect(result).toBe("`@user`"); + }); + + it("should handle entire sentence in full-width", () => { + const input = "\uFF28\uFF45\uFF4C\uFF4C\uFF4F"; // Full-width Hello + const expected = "Hello"; + expect(sanitizeContent(input)).toBe(expected); + }); + }); + + describe("directional override removal", () => { + it("should remove left-to-right embedding (U+202A)", () => { + const input = "Hello\u202AWorld"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove right-to-left embedding (U+202B)", () => { + const input = "Hello\u202BWorld"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove pop directional formatting (U+202C)", () => { + const input = "Hello\u202CWorld"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove 
left-to-right override (U+202D)", () => { + const input = "Hello\u202DWorld"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove right-to-left override (U+202E)", () => { + const input = "Hello\u202EWorld"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove left-to-right isolate (U+2066)", () => { + const input = "Hello\u2066World"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove right-to-left isolate (U+2067)", () => { + const input = "Hello\u2067World"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove first strong isolate (U+2068)", () => { + const input = "Hello\u2068World"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove pop directional isolate (U+2069)", () => { + const input = "Hello\u2069World"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should remove multiple directional controls", () => { + const input = "A\u202AB\u202BC\u202CD\u202DE\u202EF\u2066G\u2067H\u2068I\u2069J"; + const expected = "ABCDEFGHIJ"; + expect(sanitizeContent(input)).toBe(expected); + }); + }); + + describe("combined Unicode attacks", () => { + it("should handle combination of zero-width and directional controls", () => { + const input = "Hello\u200B\u202EWorld\u200C"; + const expected = "HelloWorld"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should handle combination of full-width and zero-width", () => { + const input = "\uFF28\u200Bello"; // Full-width H + zero-width space + ello + const expected = "Hello"; + expect(sanitizeContent(input)).toBe(expected); + }); + + it("should handle all transformations together", () => { + // Full-width H, zero-width space, combining accent, RTL override, normal text + const input 
= "\uFF28\u200Be\u0301\u202Ello";
+        const expected = "Héllo";
+        expect(sanitizeContent(input)).toBe(expected);
+      });
+
+      it("should prevent visual spoofing with mixed scripts", () => {
+        // Example: trying to hide malicious text with RTL override
+        const input = "filename\u202E.txt.exe";
+        // Should remove the RTL override
+        const expected = "filename.txt.exe";
+        expect(sanitizeContent(input)).toBe(expected);
+      });
+
+      it("should handle deeply nested Unicode attacks", () => {
+        const input = "\uFEFF\u200B\uFF21\u202E\u0301\u200C";
+        // BOM + ZWS + full-width A + RTL + combining + ZWNJ
+        const result = sanitizeContent(input);
+        // Stripping and folding leave "A" next to the accent, so NFC composes U+00C1
+        expect(result).toBe("\u00C1");
+      });
+    });
+
+    describe("edge cases and boundary conditions", () => {
+      it("should handle empty string", () => {
+        expect(sanitizeContent("")).toBe("");
+      });
+
+      it("should handle string with only invisible characters", () => {
+        const input = "\u200B\u202E\uFEFF";
+        expect(sanitizeContent(input)).toBe("");
+      });
+
+      it("should preserve regular whitespace", () => {
+        const input = "Hello World\t\nTest";
+        const result = sanitizeContent(input);
+        // Should preserve spaces, tabs, and newlines (though trimmed at end)
+        expect(result).toContain("Hello");
+        expect(result).toContain("World");
+      });
+
+      it("should not affect emoji", () => {
+        const input = "Hello 👋 World 🌍";
+        const result = sanitizeContent(input);
+        expect(result).toContain("👋");
+        expect(result).toContain("🌍");
+      });
+
+      it("should handle long text with scattered Unicode attacks", () => {
+        const longText = "A".repeat(100) + "\u200B" + "B".repeat(100) + "\u202E" + "C".repeat(100);
+        const result = sanitizeContent(longText);
+        // Should remove the invisible characters
+        expect(result.length).toBe(300); // 100 + 100 + 100
+        expect(result.includes("\u200B")).toBe(false);
+        expect(result.includes("\u202E")).toBe(false);
+      });
+    });
+  });
 });
diff --git a/actions/setup/js/sanitize_content_core.cjs b/actions/setup/js/sanitize_content_core.cjs
index a33dbab942..42c135f47a 100644
--- a/actions/setup/js/sanitize_content_core.cjs
+++ b/actions/setup/js/sanitize_content_core.cjs
@@ -485,6 +485,46 @@ function applyTruncation(content, maxLength) {
   return content;
 }
 
+/**
+ * Hardens text against Unicode-based obfuscation and visual spoofing.
+ * Invisible characters are stripped and full-width ASCII is folded BEFORE the
+ * final NFC pass: a base letter and a combining mark that were separated by a
+ * stripped character (e.g. "e\u200B\u0301"), or that follow a full-width
+ * letter, only become adjacent after those steps, so normalizing first would
+ * return non-NFC output and make the function non-idempotent.
+ * NOTE(review): U+200D is removed everywhere, which also splits emoji ZWJ
+ * sequences into their component emoji.
+ *
+ * @param {string} text - Input text to harden
+ * @returns {string} Hardened text in NFC form; "" for empty or non-string input
+ */
+function hardenUnicodeText(text) {
+  if (!text || typeof text !== "string") {
+    return "";
+  }
+
+  let result = text;
+
+  // Step 1: Strip invisible zero-width characters that can hide content:
+  // zero-width space, zero-width non-joiner, zero-width joiner, word joiner,
+  // and byte order mark
+  result = result.replace(/[\u200B\u200C\u200D\u2060\uFEFF]/g, "");
+
+  // Step 2: Remove bidirectional embedding, override and isolate controls,
+  // which can reorder displayed text to create visual spoofs
+  result = result.replace(/[\u202A-\u202E\u2066-\u2069]/g, "");
+
+  // Step 3: Convert full-width ASCII (U+FF01-FF5E) to standard ASCII; each
+  // full-width code point sits exactly 0xFEE0 above its ASCII counterpart
+  result = result.replace(/[\uFF01-\uFF5E]/g, char => String.fromCharCode(char.charCodeAt(0) - 0xfee0));
+
+  // Step 4: Normalize to canonical composition (NFC) last, so marks left
+  // adjacent to their base by the steps above are composed
+  result = result.normalize("NFC");
+
+  return result;
+}
+
 /**
  * Core sanitization function without mention filtering
  * @param {string} content - The content to sanitize
@@ -504,6 +544,10 @@ function
sanitizeContentCore(content, maxLength) { let sanitized = content; + // Apply Unicode hardening first to normalize text representation + // This prevents Unicode-based attacks and ensures consistent processing + sanitized = hardenUnicodeText(sanitized); + // Remove ANSI escape sequences and control characters early // This must happen before mention neutralization to avoid creating bare mentions // when control characters are removed between @ and username @@ -566,4 +610,5 @@ module.exports = { convertXmlTags, neutralizeBotTriggers, applyTruncation, + hardenUnicodeText, }; diff --git a/actions/setup/js/sanitize_label_content.cjs b/actions/setup/js/sanitize_label_content.cjs index 938c1994df..5fdd6ef608 100644 --- a/actions/setup/js/sanitize_label_content.cjs +++ b/actions/setup/js/sanitize_label_content.cjs @@ -5,6 +5,8 @@ * @module sanitize_label_content */ +const { hardenUnicodeText } = require("./sanitize_content_core.cjs"); + /** * Sanitizes label content by removing control characters, ANSI escape codes, * and neutralizing @mentions to prevent unintended notifications. 
@@ -17,6 +19,10 @@ function sanitizeLabelContent(content) { return ""; } let sanitized = content.trim(); + + // Apply Unicode hardening first + sanitized = hardenUnicodeText(sanitized); + // Remove ANSI escape sequences FIRST (before removing control chars) sanitized = sanitized.replace(/\x1b\[[0-9;]*[mGKH]/g, ""); // Then remove control characters (except newlines and tabs) diff --git a/actions/setup/js/sanitize_label_content.test.cjs b/actions/setup/js/sanitize_label_content.test.cjs index b07e343d1d..289a5fb5b4 100644 --- a/actions/setup/js/sanitize_label_content.test.cjs +++ b/actions/setup/js/sanitize_label_content.test.cjs @@ -87,4 +87,38 @@ describe("sanitize_label_content.cjs", () => { expect(sanitizeLabelContent(' @user says & "goodbye" ')).toBe("`@user` says hello goodbye"); })); }); + + describe("Unicode hardening for labels", () => { + it("should remove zero-width characters", () => { + expect(sanitizeLabelContent("bug\u200Blabel")).toBe("buglabel"); + expect(sanitizeLabelContent("test\u200C\u200D\u2060label")).toBe("testlabel"); + }); + + it("should convert full-width ASCII to normal ASCII", () => { + expect(sanitizeLabelContent("\uFF21\uFF22\uFF23")).toBe("ABC"); + expect(sanitizeLabelContent("bug\uFF01")).toBe("bug!"); + }); + + it("should remove directional override characters", () => { + expect(sanitizeLabelContent("label\u202Etest")).toBe("labeltest"); + expect(sanitizeLabelContent("bug\u202A\u202B\u202Cfix")).toBe("bugfix"); + }); + + it("should normalize Unicode characters (NFC)", () => { + const labelWithCombining = "cafe\u0301"; // café with combining accent + const result = sanitizeLabelContent(labelWithCombining); + expect(result).toBe("café"); + expect(result.charCodeAt(3)).toBe(0x00e9); // Precomposed é + }); + + it("should handle combination of Unicode attacks in labels", () => { + const maliciousLabel = "\uFF42\u200Bug\u202E\uFEFF"; + expect(sanitizeLabelContent(maliciousLabel)).toBe("bug"); + }); + + it("should preserve emoji in 
labels", () => { + expect(sanitizeLabelContent("🐛 bug")).toBe("🐛 bug"); + expect(sanitizeLabelContent("✨ enhancement")).toBe("✨ enhancement"); + }); + }); });