Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions actions/setup/js/sanitize_content.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -1318,4 +1318,132 @@ describe("sanitize_content.cjs", () => {
});
});
});

// Verifies that HTML-encoded "@" characters (named, decimal, hex, and their
// &amp;-double-encoded forms) are decoded before @mention neutralization, so an
// encoded mention cannot slip past the sanitizer.
describe("HTML entity decoding for @mention bypass prevention", () => {
  it("should decode &commat; and neutralize resulting @mention", () => {
    const result = sanitizeContent("Please review &commat;pelikhan");
    expect(result).toBe("Please review `@pelikhan`");
  });

  it("should decode double-encoded &amp;commat; and neutralize resulting @mention", () => {
    const result = sanitizeContent("Please review &amp;commat;pelikhan");
    expect(result).toBe("Please review `@pelikhan`");
  });

  it("should decode &#64; (decimal) and neutralize resulting @mention", () => {
    const result = sanitizeContent("Please review &#64;pelikhan");
    expect(result).toBe("Please review `@pelikhan`");
  });

  it("should decode double-encoded &amp;#64; and neutralize resulting @mention", () => {
    const result = sanitizeContent("Please review &amp;#64;pelikhan");
    expect(result).toBe("Please review `@pelikhan`");
  });

  it("should decode &#x40; (hex lowercase) and neutralize resulting @mention", () => {
    const result = sanitizeContent("Please review &#x40;pelikhan");
    expect(result).toBe("Please review `@pelikhan`");
  });

  it("should decode &#X40; (hex uppercase) and neutralize resulting @mention", () => {
    const result = sanitizeContent("Please review &#X40;pelikhan");
    expect(result).toBe("Please review `@pelikhan`");
  });

  it("should decode double-encoded &amp;#x40; and neutralize resulting @mention", () => {
    const result = sanitizeContent("Please review &amp;#x40;pelikhan");
    expect(result).toBe("Please review `@pelikhan`");
  });

  it("should decode double-encoded &amp;#X40; and neutralize resulting @mention", () => {
    const result = sanitizeContent("Please review &amp;#X40;pelikhan");
    expect(result).toBe("Please review `@pelikhan`");
  });

  it("should decode multiple HTML-encoded @mentions", () => {
    const result = sanitizeContent("&commat;user1 and &#64;user2 and &#x40;user3");
    expect(result).toBe("`@user1` and `@user2` and `@user3`");
  });

  it("should decode mixed HTML entities and normal @mentions", () => {
    const result = sanitizeContent("&commat;user1 and @user2");
    expect(result).toBe("`@user1` and `@user2`");
  });

  it("should decode HTML entities in org/team mentions", () => {
    const result = sanitizeContent("&commat;myorg/myteam should review");
    expect(result).toBe("`@myorg/myteam` should review");
  });

  it("should decode general decimal entities correctly", () => {
    const result = sanitizeContent("&#72;&#101;&#108;&#108;&#111;"); // "Hello"
    expect(result).toBe("Hello");
  });

  it("should decode general hex entities correctly", () => {
    const result = sanitizeContent("&#x48;&#x65;&#x6c;&#x6c;&#x6f;"); // "Hello"
    expect(result).toBe("Hello");
  });

  it("should decode double-encoded general entities correctly", () => {
    const result = sanitizeContent("&amp;#72;&amp;#101;&amp;#108;&amp;#108;&amp;#111;"); // double-encoded "Hello"
    expect(result).toBe("Hello");
  });

  it("should handle invalid code points gracefully", () => {
    // 99999999 is above the Unicode maximum (0x10FFFF), so it must not be decoded.
    const result = sanitizeContent("Invalid &#99999999; entity");
    expect(result).toBe("Invalid &#99999999; entity"); // Keep original if invalid
  });

  it("should handle malformed HTML entities without crashing", () => {
    const result = sanitizeContent("Malformed &# or &#x entity");
    expect(result).toBe("Malformed &# or &#x entity");
  });

  it("should decode entities before Unicode hardening", () => {
    // Ensure entity decoding happens as part of hardenUnicodeText
    const result = sanitizeContent("&#xFF01;"); // Full-width exclamation (U+FF01)
    expect(result).toBe("!"); // Should become ASCII !
  });

  it("should decode entities in combination with other sanitization", () => {
    const result = sanitizeContent("&commat;user <!-- comment --> text");
    expect(result).toBe("`@user` text");
  });

  it("should decode entities even in backticks (security-first approach)", () => {
    // Entities are decoded during Unicode hardening, which happens before
    // mention neutralization. This is intentional - we decode entities early
    // to prevent bypasses, then the @mention gets neutralized properly.
    const result = sanitizeContent("`&commat;user`");
    expect(result).toBe("`@user`");
  });

  it("should preserve legitimate URLs after entity decoding", () => {
    const result = sanitizeContent("Visit https://github.com/user");
    expect(result).toBe("Visit https://github.com/user");
  });

  it("should decode case-insensitive named entities", () => {
    const result = sanitizeContent("&COMMAT;user and &CoMmAt;user2");
    expect(result).toBe("`@user` and `@user2`");
  });

  it("should decode entities with mixed case hex digits", () => {
    const result = sanitizeContent("&#x4O; is invalid but &#x4A; is valid"); // Note: using letter 'O' not digit '0'
    expect(result).toContain("&#x4O;"); // Invalid should remain
    expect(result).toContain("J"); // Valid 0x4A = J
  });

  it("should handle zero code point", () => {
    const result = sanitizeContent("&#0;text");
    // Code point 0 is valid but typically removed as control character
    expect(result).toContain("text");
  });

  it("should respect allowed aliases even with HTML-encoded mentions", () => {
    const result = sanitizeContent("&commat;author is allowed", { allowedAliases: ["author"] });
    expect(result).toBe("@author is allowed");
  });
});
});
60 changes: 57 additions & 3 deletions actions/setup/js/sanitize_content_core.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,54 @@ function applyTruncation(content, maxLength) {
return content;
}

/**
 * Decodes HTML entities to prevent bypass of @mention detection.
 * Handles the named entity &commat;, decimal entities (e.g., &#64;),
 * and hex entities (e.g., &#x40;), including double-encoded variants (e.g., &amp;commat;).
 *
 * Decoding is applied until no decodable entity remains. An entity that only
 * comes into existence after another one is decoded (e.g. "&#38;commat;" →
 * "&commat;" → "@", or "&#64&#59;" → "&#64;" → "@") is therefore decoded as
 * well, instead of being left in the output where it would still render as "@".
 *
 * Numeric entities whose value is above U+10FFFF, malformed entities, and all
 * other named entities (including a bare "&amp;") are left untouched.
 *
 * @param {string} text - Input text that may contain HTML entities
 * @returns {string} Text with HTML entities decoded; "" for empty or non-string input
 */
function decodeHtmlEntities(text) {
  if (!text || typeof text !== "string") {
    return "";
  }

  // Every entity starts with "&"; without one there is nothing to decode.
  if (!text.includes("&")) {
    return text;
  }

  const MAX_CODE_POINT = 0x10ffff;
  const NAMED_AT_ENTITY = /^&(?:amp;)?commat;$/i;
  const NUMERIC_ENTITY = /^&(?:amp;)?#(?:(\d+)|[xX]([0-9a-fA-F]+));$/;

  // Returns the decoded character for a complete entity, or null when the
  // candidate is not an entity this function decodes.
  const decodeEntity = candidate => {
    if (NAMED_AT_ENTITY.test(candidate)) {
      return "@";
    }
    const numeric = NUMERIC_ENTITY.exec(candidate);
    if (!numeric) {
      return null;
    }
    const codePoint = numeric[1] !== undefined ? parseInt(numeric[1], 10) : parseInt(numeric[2], 16);
    // Out-of-range values (including Infinity from absurdly long digit runs) stay as-is.
    return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : null;
  };

  // Single left-to-right scan. `output` never contains a decodable entity:
  // an entity always ends in ";" and starts at the most recent "&", so it is
  // enough to test the tail of `output` each time a ";" is appended. A decoded
  // character is fed back through the same path, which is what resolves
  // nested encodings without re-scanning the whole string.
  const output = [];
  // Indexes in `output` of "&" characters that may still start an entity.
  const ampersandIndexes = [];

  for (const char of text) {
    let pending = char;
    while (pending !== null) {
      const current = pending;
      pending = null;
      output.push(current);

      if (current === "&") {
        ampersandIndexes.push(output.length - 1);
        continue;
      }
      if (current !== ";" || ampersandIndexes.length === 0) {
        continue;
      }

      const start = ampersandIndexes[ampersandIndexes.length - 1];
      const candidate = output.slice(start).join("");
      const decoded = decodeEntity(candidate);

      if (decoded !== null) {
        // Replace the entity with its character and re-process that character,
        // since it may itself complete or start another entity.
        output.length = start;
        ampersandIndexes.pop();
        pending = decoded;
      } else if (!(candidate.length === 5 && candidate.toLowerCase() === "&amp;")) {
        // The only ";" allowed inside an entity is the one ending an "&amp;"
        // prefix. Anything else means this "&" (and every earlier one, which
        // would have to span it) can no longer start an entity. Dropping them
        // keeps the scan linear on adversarial input.
        ampersandIndexes.length = 0;
      }
    }
  }

  return output.join("");
}

/**
* Performs text hardening to protect against Unicode-based attacks.
* This applies multiple layers of character normalization and filtering
Expand All @@ -504,16 +552,21 @@ function hardenUnicodeText(text) {
// This ensures consistent character representation across different encodings
result = result.normalize("NFC");

// Step 2: Strip invisible zero-width characters that can hide content
// Step 2: Decode HTML entities to prevent @mention bypass
// This MUST happen early, before any other processing, to ensure entities
// are converted to their actual characters for proper sanitization
result = decodeHtmlEntities(result);

// Step 3: Strip invisible zero-width characters that can hide content
// These include: zero-width space, zero-width non-joiner, zero-width joiner,
// word joiner, and byte order mark
result = result.replace(/[\u200B\u200C\u200D\u2060\uFEFF]/g, "");

// Step 3: Remove bidirectional text override controls
// Step 4: Remove bidirectional text override controls
// These can be used to reverse text direction and create visual spoofs
result = result.replace(/[\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]/g, "");

// Step 4: Convert full-width ASCII characters to standard ASCII
// Step 5: Convert full-width ASCII characters to standard ASCII
// Full-width characters (U+FF01-FF5E) can be used to bypass filters
result = result.replace(/[\uFF01-\uFF5E]/g, char => {
const code = char.charCodeAt(0);
Expand Down Expand Up @@ -611,4 +664,5 @@ module.exports = {
neutralizeBotTriggers,
applyTruncation,
hardenUnicodeText,
decodeHtmlEntities,
};
Loading