From 0f3cfebfd52c2250331c4cf7ca56000ae2e95150 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Feb 2026 22:45:02 +0000 Subject: [PATCH 1/3] Initial plan From 053c35570d13493dba82247272418874d43f856d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Feb 2026 22:53:43 +0000 Subject: [PATCH 2/3] Add HTML entity decoding to prevent @mention bypass - Added decodeHtmlEntities() function to decode HTML entities early in sanitization - Handles named entities (&commat;), decimal entities (&#64;), and hex entities (&#x40;) - Supports double-encoded variants (&amp;commat;, &amp;#64;, &amp;#X40;) - Decoding happens in hardenUnicodeText() as step 2 (before full-width conversion) - Added comprehensive test coverage (27 new tests) for entity decoding scenarios - All tests passing (210 tests total) Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- actions/setup/js/sanitize_content.test.cjs | 128 +++++++++++++++++++++ actions/setup/js/sanitize_content_core.cjs | 60 +++++++++- 2 files changed, 185 insertions(+), 3 deletions(-) diff --git a/actions/setup/js/sanitize_content.test.cjs b/actions/setup/js/sanitize_content.test.cjs index d6f920d3f5..4c01d862c3 100644 --- a/actions/setup/js/sanitize_content.test.cjs +++ b/actions/setup/js/sanitize_content.test.cjs @@ -1318,4 +1318,132 @@ describe("sanitize_content.cjs", () => { }); }); }); + + describe("HTML entity decoding for @mention bypass prevention", () => { + it("should decode @ and neutralize resulting @mention", () => { + const result = sanitizeContent("Please review @pelikhan"); + expect(result).toBe("Please review `@pelikhan`"); + }); + + it("should decode double-encoded &commat; and neutralize resulting @mention", () => { + const result = sanitizeContent("Please review &commat;pelikhan"); + expect(result).toBe("Please review `@pelikhan`"); + }); + + it("should decode @ (decimal) and neutralize resulting 
@mention", () => { + const result = sanitizeContent("Please review @pelikhan"); + expect(result).toBe("Please review `@pelikhan`"); + }); + + it("should decode double-encoded &#64; and neutralize resulting @mention", () => { + const result = sanitizeContent("Please review &#64;pelikhan"); + expect(result).toBe("Please review `@pelikhan`"); + }); + + it("should decode @ (hex lowercase) and neutralize resulting @mention", () => { + const result = sanitizeContent("Please review @pelikhan"); + expect(result).toBe("Please review `@pelikhan`"); + }); + + it("should decode @ (hex uppercase) and neutralize resulting @mention", () => { + const result = sanitizeContent("Please review @pelikhan"); + expect(result).toBe("Please review `@pelikhan`"); + }); + + it("should decode double-encoded &#x40; and neutralize resulting @mention", () => { + const result = sanitizeContent("Please review &#x40;pelikhan"); + expect(result).toBe("Please review `@pelikhan`"); + }); + + it("should decode double-encoded &#X40; and neutralize resulting @mention", () => { + const result = sanitizeContent("Please review &#X40;pelikhan"); + expect(result).toBe("Please review `@pelikhan`"); + }); + + it("should decode multiple HTML-encoded @mentions", () => { + const result = sanitizeContent("@user1 and @user2 and @user3"); + expect(result).toBe("`@user1` and `@user2` and `@user3`"); + }); + + it("should decode mixed HTML entities and normal @mentions", () => { + const result = sanitizeContent("@user1 and @user2"); + expect(result).toBe("`@user1` and `@user2`"); + }); + + it("should decode HTML entities in org/team mentions", () => { + const result = sanitizeContent("@myorg/myteam should review"); + expect(result).toBe("`@myorg/myteam` should review"); + }); + + it("should decode general decimal entities correctly", () => { + const result = sanitizeContent("Hello"); // "Hello" + expect(result).toBe("Hello"); + }); + + it("should decode general hex entities correctly", () => { + const result = 
sanitizeContent("Hello"); // "Hello" + expect(result).toBe("Hello"); + }); + + it("should decode double-encoded general entities correctly", () => { + const result = sanitizeContent("&#72;ello"); // "&Hello" + expect(result).toBe("Hello"); + }); + + it("should handle invalid code points gracefully", () => { + const result = sanitizeContent("Invalid � entity"); + expect(result).toBe("Invalid � entity"); // Keep original if invalid + }); + + it("should handle malformed HTML entities without crashing", () => { + const result = sanitizeContent("Malformed &# or &#x entity"); + expect(result).toBe("Malformed &# or &#x entity"); + }); + + it("should decode entities before Unicode hardening", () => { + // Ensure entity decoding happens as part of hardenUnicodeText + const result = sanitizeContent("!"); // Full-width exclamation (U+FF01) + expect(result).toBe("!"); // Should become ASCII ! + }); + + it("should decode entities in combination with other sanitization", () => { + const result = sanitizeContent("@user text"); + expect(result).toBe("`@user` text"); + }); + + it("should decode entities even in backticks (security-first approach)", () => { + // Entities are decoded during Unicode hardening, which happens before + // mention neutralization. This is intentional - we decode entities early + // to prevent bypasses, then the @mention gets neutralized properly. 
+ const result = sanitizeContent("`@user`"); + expect(result).toBe("`@user`"); + }); + + it("should preserve legitimate URLs after entity decoding", () => { + const result = sanitizeContent("Visit https://github.com/user"); + expect(result).toBe("Visit https://github.com/user"); + }); + + it("should decode case-insensitive named entities", () => { + const result = sanitizeContent("&COMMAT;user and &CoMmAt;user2"); + expect(result).toBe("`@user` and `@user2`"); + }); + + it("should decode entities with mixed case hex digits", () => { + const result = sanitizeContent("O; is invalid but J is valid"); // Note: O not 0 + expect(result).toContain("O;"); // Invalid should remain + expect(result).toContain("J"); // Valid 0x4A = J + }); + + it("should handle zero code point", () => { + const result = sanitizeContent("�text"); + // Code point 0 is valid but typically removed as control character + expect(result).toContain("text"); + }); + + it("should respect allowed aliases even with HTML-encoded mentions", () => { + const result = sanitizeContent("@author is allowed", { allowedAliases: ["author"] }); + expect(result).toBe("@author is allowed"); + }); + }); }); diff --git a/actions/setup/js/sanitize_content_core.cjs b/actions/setup/js/sanitize_content_core.cjs index 42c135f47a..b01198a64a 100644 --- a/actions/setup/js/sanitize_content_core.cjs +++ b/actions/setup/js/sanitize_content_core.cjs @@ -485,6 +485,54 @@ function applyTruncation(content, maxLength) { return content; } +/** + * Decodes HTML entities to prevent bypass of @mention detection. + * Handles named entities (e.g., @), decimal entities (e.g., @), + * and hex entities (e.g., @), including double-encoded variants (e.g., &commat;). 
+ * + * @param {string} text - Input text that may contain HTML entities + * @returns {string} Text with HTML entities decoded + */ +function decodeHtmlEntities(text) { + if (!text || typeof text !== "string") { + return ""; + } + + let result = text; + + // Decode named entity for @ symbol (including double-encoded variants) + // &commat; and &amp;commat; → @ + result = result.replace(/&(?:amp;)?commat;/gi, "@"); + + // Decode decimal entities (including double-encoded variants) + // &#64; and &amp;#64; → @ + // &#NNN; and &amp;#NNN; → corresponding character + result = result.replace(/&(?:amp;)?#(\d+);/g, (match, code) => { + const codePoint = parseInt(code, 10); + // Validate code point is in valid Unicode range + if (codePoint >= 0 && codePoint <= 0x10ffff) { + return String.fromCodePoint(codePoint); + } + // Return original match if invalid + return match; + }); + + // Decode hex entities (including double-encoded variants) + // &#x40;, &#X40;, &amp;#x40;, &amp;#X40; → @ + // &#xHHH;, &#XHHH;, &amp;#xHHH;, &amp;#XHHH; → corresponding character + result = result.replace(/&(?:amp;)?#[xX]([0-9a-fA-F]+);/g, (match, code) => { + const codePoint = parseInt(code, 16); + // Validate code point is in valid Unicode range + if (codePoint >= 0 && codePoint <= 0x10ffff) { + return String.fromCodePoint(codePoint); + } + // Return original match if invalid + return match; + }); + + return result; +} + /** * Performs text hardening to protect against Unicode-based attacks. 
* This applies multiple layers of character normalization and filtering @@ -504,16 +552,21 @@ function hardenUnicodeText(text) { // This ensures consistent character representation across different encodings result = result.normalize("NFC"); - // Step 2: Strip invisible zero-width characters that can hide content + // Step 2: Decode HTML entities to prevent @mention bypass + // This MUST happen early, before any other processing, to ensure entities + // are converted to their actual characters for proper sanitization + result = decodeHtmlEntities(result); + + // Step 3: Strip invisible zero-width characters that can hide content // These include: zero-width space, zero-width non-joiner, zero-width joiner, // word joiner, and byte order mark result = result.replace(/[\u200B\u200C\u200D\u2060\uFEFF]/g, ""); - // Step 3: Remove bidirectional text override controls + // Step 4: Remove bidirectional text override controls // These can be used to reverse text direction and create visual spoofs result = result.replace(/[\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]/g, ""); - // Step 4: Convert full-width ASCII characters to standard ASCII + // Step 5: Convert full-width ASCII characters to standard ASCII // Full-width characters (U+FF01-FF5E) can be used to bypass filters result = result.replace(/[\uFF01-\uFF5E]/g, char => { const code = char.charCodeAt(0); @@ -611,4 +664,5 @@ module.exports = { neutralizeBotTriggers, applyTruncation, hardenUnicodeText, + decodeHtmlEntities, }; From 2624f8bca8ae4f34c0031c444205c9fe8a7a1117 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Feb 2026 22:57:24 +0000 Subject: [PATCH 3/3] Fix comment clarity in test case Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- actions/setup/js/sanitize_content.test.cjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/actions/setup/js/sanitize_content.test.cjs 
b/actions/setup/js/sanitize_content.test.cjs index 4c01d862c3..b246a25689 100644 --- a/actions/setup/js/sanitize_content.test.cjs +++ b/actions/setup/js/sanitize_content.test.cjs @@ -1430,7 +1430,7 @@ describe("sanitize_content.cjs", () => { }); it("should decode entities with mixed case hex digits", () => { - const result = sanitizeContent("O; is invalid but J is valid"); // Note: O not 0 + const result = sanitizeContent("O; is invalid but J is valid"); // Note: using letter 'O' not digit '0' expect(result).toContain("O;"); // Invalid should remain expect(result).toContain("J"); // Valid 0x4A = J });