-
Notifications
You must be signed in to change notification settings - Fork 227
Add Unicode hardening to markdown sanitization functions #14795
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
258b9ab
0865ff9
3fe4857
9141fe9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -485,6 +485,46 @@ function applyTruncation(content, maxLength) { | |||||||||||
| return content; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /** | ||||||||||||
| * Performs text hardening to protect against Unicode-based attacks. | ||||||||||||
| * This applies multiple layers of character normalization and filtering | ||||||||||||
| * to ensure consistent text processing and prevent visual spoofing. | ||||||||||||
| * | ||||||||||||
| * @param {string} text - Input text to harden | ||||||||||||
| * @returns {string} Hardened text with Unicode security applied | ||||||||||||
| */ | ||||||||||||
| function hardenUnicodeText(text) { | ||||||||||||
| if (!text || typeof text !== "string") { | ||||||||||||
| return ""; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| let result = text; | ||||||||||||
|
|
||||||||||||
| // Step 1: Normalize Unicode to canonical composition (NFC) | ||||||||||||
| // This ensures consistent character representation across different encodings | ||||||||||||
| result = result.normalize("NFC"); | ||||||||||||
|
|
||||||||||||
| // Step 2: Strip invisible zero-width characters that can hide content | ||||||||||||
| // These include: zero-width space, zero-width non-joiner, zero-width joiner, | ||||||||||||
| // word joiner, and byte order mark | ||||||||||||
| result = result.replace(/[\u200B\u200C\u200D\u2060\uFEFF]/g, ""); | ||||||||||||
|
||||||||||||
|
|
||||||||||||
| // Step 3: Remove bidirectional text override controls | ||||||||||||
| // These can be used to reverse text direction and create visual spoofs | ||||||||||||
| result = result.replace(/[\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]/g, ""); | ||||||||||||
|
|
||||||||||||
| // Step 4: Convert full-width ASCII characters to standard ASCII | ||||||||||||
| // Full-width characters (U+FF01-FF5E) can be used to bypass filters | ||||||||||||
| result = result.replace(/[\uFF01-\uFF5E]/g, char => { | ||||||||||||
| const code = char.charCodeAt(0); | ||||||||||||
| // Map full-width to half-width by subtracting offset | ||||||||||||
| const standardCode = code - 0xfee0; | ||||||||||||
| return String.fromCharCode(standardCode); | ||||||||||||
| }); | ||||||||||||
|
|
||||||||||||
|
||||||||||||
| // Step 5: Convert ideographic/full-width space (U+3000) to ASCII space | |
| // This prevents use of U+3000 to bypass space-based filters | |
| result = result.replace(/\u3000/g, " "); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -87,4 +87,38 @@ describe("sanitize_label_content.cjs", () => { | |
| expect(sanitizeLabelContent(' [31m@user[0m says <hello> & "goodbye" ')).toBe("`@user` says hello goodbye"); | ||
| })); | ||
| }); | ||
|
|
||
| describe("Unicode hardening for labels", () => { | ||
| it("should remove zero-width characters", () => { | ||
| expect(sanitizeLabelContent("bug\u200Blabel")).toBe("buglabel"); | ||
| expect(sanitizeLabelContent("test\u200C\u200D\u2060label")).toBe("testlabel"); | ||
| }); | ||
|
|
||
| it("should convert full-width ASCII to normal ASCII", () => { | ||
| expect(sanitizeLabelContent("\uFF21\uFF22\uFF23")).toBe("ABC"); | ||
| expect(sanitizeLabelContent("bug\uFF01")).toBe("bug!"); | ||
| }); | ||
|
|
||
| it("should remove directional override characters", () => { | ||
| expect(sanitizeLabelContent("label\u202Etest")).toBe("labeltest"); | ||
| expect(sanitizeLabelContent("bug\u202A\u202B\u202Cfix")).toBe("bugfix"); | ||
| }); | ||
|
|
||
| it("should normalize Unicode characters (NFC)", () => { | ||
| const labelWithCombining = "cafe\u0301"; // café with combining accent | ||
| const result = sanitizeLabelContent(labelWithCombining); | ||
| expect(result).toBe("café"); | ||
| expect(result.charCodeAt(3)).toBe(0x00e9); // Precomposed é | ||
| }); | ||
|
|
||
| it("should handle combination of Unicode attacks in labels", () => { | ||
| const maliciousLabel = "\uFF42\u200Bug\u202E\uFEFF"; | ||
| expect(sanitizeLabelContent(maliciousLabel)).toBe("bug"); | ||
| }); | ||
|
|
||
| it("should preserve emoji in labels", () => { | ||
| expect(sanitizeLabelContent("🐛 bug")).toBe("🐛 bug"); | ||
| expect(sanitizeLabelContent("✨ enhancement")).toBe("✨ enhancement"); | ||
| }); | ||
|
Comment on lines +119 to +122
|
||
| }); | ||
| }); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The emoji preservation test should also cover ZWJ (zero-width joiner) emoji sequences, because U+200D is removed by the Unicode hardening function at line 510 of sanitize_content_core.cjs. Examples to test: 👨‍👩‍👧 (family: U+1F468 U+200D U+1F469 U+200D U+1F467) and 👩‍💻 (woman technologist: U+1F469 U+200D U+1F4BB).
This would verify whether the intended behavior is to break these sequences (as a security tradeoff) or whether the implementation needs adjustment.