Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions actions/setup/js/sanitize_content.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ const {
convertXmlTags,
neutralizeBotTriggers,
applyTruncation,
hardenUnicodeText,
} = require("./sanitize_content_core.cjs");

const { balanceCodeRegions } = require("./markdown_code_region_balancer.cjs");
Expand Down Expand Up @@ -72,6 +73,9 @@ function sanitizeContent(content, maxLengthOrOptions) {

let sanitized = content;

// Apply Unicode hardening first to normalize text representation
sanitized = hardenUnicodeText(sanitized);

// Remove ANSI escape sequences and control characters early
sanitized = sanitized.replace(/\x1b\[[0-9;]*[mGKH]/g, "");
sanitized = sanitized.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
Expand Down
249 changes: 249 additions & 0 deletions actions/setup/js/sanitize_content.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -1069,4 +1069,253 @@ describe("sanitize_content.cjs", () => {
});
});
});

describe("Unicode hardening transformations", () => {
describe("zero-width character removal", () => {
  // Every case plants one or more invisible code points in otherwise plain
  // text and expects sanitizeContent to return the text without them.

  it("should remove zero-width space (U+200B)", () => {
    const sanitized = sanitizeContent("Hello\u200BWorld");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove zero-width non-joiner (U+200C)", () => {
    const sanitized = sanitizeContent("Test\u200CText");
    expect(sanitized).toBe("TestText");
  });

  it("should remove zero-width joiner (U+200D)", () => {
    const sanitized = sanitizeContent("Hello\u200DWorld");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove word joiner (U+2060)", () => {
    const sanitized = sanitizeContent("Word\u2060Joiner");
    expect(sanitized).toBe("WordJoiner");
  });

  it("should remove byte order mark (U+FEFF)", () => {
    // The BOM sits at the very start of the string here.
    const sanitized = sanitizeContent("\uFEFFHello World");
    expect(sanitized).toBe("Hello World");
  });

  it("should remove multiple zero-width characters", () => {
    // One of each zero-width code point, interleaved with the letters A-F.
    const sanitized = sanitizeContent("A\u200BB\u200CC\u200DD\u2060E\uFEFFF");
    expect(sanitized).toBe("ABCDEF");
  });

  it("should handle text with only zero-width characters", () => {
    // Nothing visible is left once the invisible characters are gone.
    const sanitized = sanitizeContent("\u200B\u200C\u200D");
    expect(sanitized).toBe("");
  });
});

describe("Unicode normalization (NFC)", () => {
  // Expected values are written as \u escapes so the source file itself
  // cannot accidentally hold a decomposed form of the expected character.

  it("should normalize composed characters", () => {
    // "e" followed by COMBINING ACUTE ACCENT (U+0301) must come back as the
    // single precomposed code point U+00E9.
    const result = sanitizeContent("cafe\u0301");
    expect(result).toBe("caf\u00E9");
    // Index 3 holds one code unit, which rules out a base letter plus mark.
    expect(result.charCodeAt(3)).toBe(0x00e9);
  });

  it("should normalize multiple combining characters", () => {
    // Single mark: "n" + COMBINING TILDE (U+0303) composes to U+00F1.
    expect(sanitizeContent("n\u0303")).toBe("\u00F1");

    // Two stacked marks: "e" + COMBINING CIRCUMFLEX (U+0302) + COMBINING
    // ACUTE (U+0301) composes to U+1EBF, a single code point.
    const stacked = sanitizeContent("e\u0302\u0301");
    expect(stacked).toBe("\u1EBF");
    expect(stacked.length).toBe(1);
  });

  it("should handle already normalized text", () => {
    // Plain ASCII has nothing to compose and must pass through unchanged.
    expect(sanitizeContent("Hello World")).toBe("Hello World");
  });
});

describe("full-width ASCII conversion", () => {
  // Full-width forms (U+FF01-U+FF5E) are expected to come back as their
  // ordinary ASCII counterparts.

  it("should convert full-width exclamation mark", () => {
    // U+FF01 is the full-width "!".
    expect(sanitizeContent("Hello\uFF01")).toBe("Hello!");
  });

  it("should convert full-width letters", () => {
    // U+FF21..U+FF23 are the full-width "A", "B", "C".
    expect(sanitizeContent("\uFF21\uFF22\uFF23")).toBe("ABC");
  });

  it("should convert full-width digits", () => {
    // U+FF11..U+FF13 are the full-width "1", "2", "3".
    expect(sanitizeContent("\uFF11\uFF12\uFF13")).toBe("123");
  });

  it("should convert full-width parentheses", () => {
    // U+FF08 / U+FF09 wrap an ASCII word.
    expect(sanitizeContent("\uFF08test\uFF09")).toBe("(test)");
  });

  it("should convert mixed full-width and normal text", () => {
    // Only the "!" and the "W" are full-width; the rest is already ASCII.
    expect(sanitizeContent("Hello\uFF01 \uFF37orld")).toBe("Hello! World");
  });

  it("should convert full-width at sign", () => {
    // U+FF20 becomes "@"; the expected value shows the resulting mention
    // wrapped in backticks.
    const sanitized = sanitizeContent("\uFF20user");
    expect(sanitized).toBe("`@user`");
  });

  it("should handle entire sentence in full-width", () => {
    // The whole word "Hello" spelled with full-width letters.
    expect(sanitizeContent("\uFF28\uFF45\uFF4C\uFF4C\uFF4F")).toBe("Hello");
  });
});

describe("directional override removal", () => {
  // Each case places one bidirectional control between "Hello" and "World"
  // and expects the two words to come back joined with nothing in between.

  it("should remove left-to-right embedding (U+202A)", () => {
    const sanitized = sanitizeContent("Hello\u202AWorld");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove right-to-left embedding (U+202B)", () => {
    const sanitized = sanitizeContent("Hello\u202BWorld");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove pop directional formatting (U+202C)", () => {
    const sanitized = sanitizeContent("Hello\u202CWorld");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove left-to-right override (U+202D)", () => {
    const sanitized = sanitizeContent("Hello\u202DWorld");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove right-to-left override (U+202E)", () => {
    const sanitized = sanitizeContent("Hello\u202EWorld");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove left-to-right isolate (U+2066)", () => {
    const sanitized = sanitizeContent("Hello\u2066World");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove right-to-left isolate (U+2067)", () => {
    const sanitized = sanitizeContent("Hello\u2067World");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove first strong isolate (U+2068)", () => {
    const sanitized = sanitizeContent("Hello\u2068World");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove pop directional isolate (U+2069)", () => {
    const sanitized = sanitizeContent("Hello\u2069World");
    expect(sanitized).toBe("HelloWorld");
  });

  it("should remove multiple directional controls", () => {
    // All nine controls at once, one between each pair of letters A-J.
    const sanitized = sanitizeContent("A\u202AB\u202BC\u202CD\u202DE\u202EF\u2066G\u2067H\u2068I\u2069J");
    expect(sanitized).toBe("ABCDEFGHIJ");
  });
});

describe("combined Unicode attacks", () => {
  // Inputs here mix several kinds of problematic code points in one string.

  it("should handle combination of zero-width and directional controls", () => {
    // ZWSP + RTL override in the middle, ZWNJ at the end.
    expect(sanitizeContent("Hello\u200B\u202EWorld\u200C")).toBe("HelloWorld");
  });

  it("should handle combination of full-width and zero-width", () => {
    // Full-width "H", then a zero-width space, then ASCII "ello".
    expect(sanitizeContent("\uFF28\u200Bello")).toBe("Hello");
  });

  it("should handle all transformations together", () => {
    // Full-width "H", zero-width space, "e" + combining acute, RTL override,
    // then plain "llo".
    const sanitized = sanitizeContent("\uFF28\u200Be\u0301\u202Ello");
    expect(sanitized).toBe("Héllo");
  });

  it("should prevent visual spoofing with mixed scripts", () => {
    // An RTL override placed before the extensions can make a file name
    // render differently from its real character order; it must be removed.
    const sanitized = sanitizeContent("filename\u202E.txt.exe");
    expect(sanitized).toBe("filename.txt.exe");
  });

  it("should handle deeply nested Unicode attacks", () => {
    // BOM + ZWSP + full-width "A" + RTL override + combining acute + ZWNJ.
    const sanitized = sanitizeContent("\uFEFF\u200B\uFF21\u202E\u0301\u200C");
    // Any combining acute still present is dropped before comparing, so the
    // assertion only pins the base letter.
    const withoutAcute = sanitized.replace(/\u0301/g, "");
    expect(withoutAcute).toBe("A");
  });
});

describe("edge cases and boundary conditions", () => {
it("should handle empty string", () => {
  const sanitized = sanitizeContent("");
  expect(sanitized).toBe("");
});

it("should handle string with only invisible characters", () => {
  // ZWSP + RTL override + BOM: nothing visible should be left.
  const sanitized = sanitizeContent("\u200B\u202E\uFEFF");
  expect(sanitized).toBe("");
});

it("should preserve regular whitespace", () => {
  const sanitized = sanitizeContent("Hello World\t\nTest");
  // Only the surrounding words are asserted; the exact whitespace in the
  // output is not pinned by this test.
  for (const word of ["Hello", "World"]) {
    expect(sanitized).toContain(word);
  }
});

it("should not affect emoji", () => {
  const sanitized = sanitizeContent("Hello 👋 World 🌍");
  // Both stand-alone emoji must still be present in the output.
  for (const emoji of ["👋", "🌍"]) {
    expect(sanitized).toContain(emoji);
  }
});
Comment on lines +1304 to +1309
Copy link

Copilot AI Feb 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The emoji preservation test should include examples of ZWJ (zero-width joiner) emoji sequences, since U+200D is removed by the Unicode hardening function at line 510 of sanitize_content_core.cjs. Examples to test:

  • Family emoji: "👨‍👩‍👧‍👦" (uses ZWJ between each member)
  • Profession emoji: "👨‍⚕️" (man + ZWJ + medical symbol)
  • Flag sequences that use ZWJ

This would verify whether the intended behavior is to break these sequences (as a security tradeoff) or whether the implementation needs adjustment.

Copilot uses AI. Check for mistakes.

it("should handle long text with scattered Unicode attacks", () => {
  // Three runs of 100 letters, separated by a zero-width space and an
  // RTL override respectively.
  const segments = ["A".repeat(100), "\u200B", "B".repeat(100), "\u202E", "C".repeat(100)];
  const sanitized = sanitizeContent(segments.join(""));

  // Only the 300 visible letters may remain (100 + 100 + 100).
  expect(sanitized.length).toBe(300);
  for (const hidden of ["\u200B", "\u202E"]) {
    expect(sanitized.includes(hidden)).toBe(false);
  }
});
});
});
});
45 changes: 45 additions & 0 deletions actions/setup/js/sanitize_content_core.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,46 @@ function applyTruncation(content, maxLength) {
return content;
}

/**
 * Performs text hardening to protect against Unicode-based attacks.
 * This applies multiple layers of character normalization and filtering
 * to ensure consistent text processing and prevent visual spoofing.
 *
 * Steps, in order:
 *   1. Normalize to NFC.
 *   2. Strip zero-width characters (U+200B, U+200C, U+200D, U+2060, U+FEFF).
 *      A zero-width joiner is kept only when it joins two pictographic emoji,
 *      so ZWJ emoji sequences (family, profession, rainbow flag, ...) survive.
 *      A ZWJ anywhere else is removed.
 *   3. Strip bidirectional embedding/override/isolate controls.
 *   4. Map full-width ASCII forms (U+FF01-U+FF5E) to plain ASCII.
 *
 * NOTE(review): normalization runs before the stripping steps, so a base
 * letter and a combining mark that only become adjacent after an invisible
 * character between them is removed are left decomposed in the output.
 *
 * @param {string} text - Input text to harden
 * @returns {string} Hardened text with Unicode security applied; "" when the
 *   input is falsy or not a string
 */
function hardenUnicodeText(text) {
  if (!text || typeof text !== "string") {
    return "";
  }

  let result = text;

  // Step 1: Normalize Unicode to canonical composition (NFC)
  // This ensures consistent character representation across different encodings
  result = result.normalize("NFC");

  // Step 2: Strip invisible zero-width characters that can hide content.
  // The first alternative captures "<emoji><VS16/skin-tone>*<ZWJ>" when another
  // pictographic emoji follows, and "$1" writes that capture back unchanged.
  // The second alternative matches every other zero-width character; for those
  // the capture group is unset, so "$1" expands to the empty string.
  result = result.replace(/(\p{Extended_Pictographic}[\uFE0F\u{1F3FB}-\u{1F3FF}]*\u200D(?=\p{Extended_Pictographic}))|[\u200B\u200C\u200D\u2060\uFEFF]/gu, "$1");

  // Step 3: Remove bidirectional text override controls
  // These can be used to reverse text direction and create visual spoofs
  result = result.replace(/[\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]/g, "");

  // Step 4: Convert full-width ASCII characters to standard ASCII
  // Full-width characters (U+FF01-FF5E) can be used to bypass filters.
  // Each full-width form sits exactly 0xFEE0 above its ASCII counterpart.
  result = result.replace(/[\uFF01-\uFF5E]/g, char => String.fromCharCode(char.charCodeAt(0) - 0xfee0));

  return result;
}

/**
* Core sanitization function without mention filtering
* @param {string} content - The content to sanitize
Expand All @@ -504,6 +544,10 @@ function sanitizeContentCore(content, maxLength) {

let sanitized = content;

// Apply Unicode hardening first to normalize text representation
// This prevents Unicode-based attacks and ensures consistent processing
sanitized = hardenUnicodeText(sanitized);

// Remove ANSI escape sequences and control characters early
// This must happen before mention neutralization to avoid creating bare mentions
// when control characters are removed between @ and username
Expand Down Expand Up @@ -566,4 +610,5 @@ module.exports = {
convertXmlTags,
neutralizeBotTriggers,
applyTruncation,
hardenUnicodeText,
};
6 changes: 6 additions & 0 deletions actions/setup/js/sanitize_label_content.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
* @module sanitize_label_content
*/

const { hardenUnicodeText } = require("./sanitize_content_core.cjs");

/**
* Sanitizes label content by removing control characters, ANSI escape codes,
* and neutralizing @mentions to prevent unintended notifications.
Expand All @@ -17,6 +19,10 @@ function sanitizeLabelContent(content) {
return "";
}
let sanitized = content.trim();

// Apply Unicode hardening first
sanitized = hardenUnicodeText(sanitized);

// Remove ANSI escape sequences FIRST (before removing control chars)
sanitized = sanitized.replace(/\x1b\[[0-9;]*[mGKH]/g, "");
// Then remove control characters (except newlines and tabs)
Expand Down
34 changes: 34 additions & 0 deletions actions/setup/js/sanitize_label_content.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,38 @@ describe("sanitize_label_content.cjs", () => {
expect(sanitizeLabelContent(' @user says <hello> & "goodbye" ')).toBe("`@user` says hello goodbye");
}));
});

describe("Unicode hardening for labels", () => {
it("should remove zero-width characters", () => {
  // [raw label, expected label] pairs; checked in order.
  const cases = [
    ["bug\u200Blabel", "buglabel"],
    ["test\u200C\u200D\u2060label", "testlabel"],
  ];
  for (const [raw, cleaned] of cases) {
    expect(sanitizeLabelContent(raw)).toBe(cleaned);
  }
});

it("should convert full-width ASCII to normal ASCII", () => {
  const cases = [
    ["\uFF21\uFF22\uFF23", "ABC"],
    ["bug\uFF01", "bug!"],
  ];
  for (const [raw, cleaned] of cases) {
    expect(sanitizeLabelContent(raw)).toBe(cleaned);
  }
});

it("should remove directional override characters", () => {
  const cases = [
    ["label\u202Etest", "labeltest"],
    ["bug\u202A\u202B\u202Cfix", "bugfix"],
  ];
  for (const [raw, cleaned] of cases) {
    expect(sanitizeLabelContent(raw)).toBe(cleaned);
  }
});

it("should normalize Unicode characters (NFC)", () => {
  // "e" + combining acute accent must come back as one precomposed letter.
  const normalized = sanitizeLabelContent("cafe\u0301");
  expect(normalized).toBe("café");
  // A single code unit at index 3 confirms the precomposed form (U+00E9).
  expect(normalized.charCodeAt(3)).toBe(0x00e9);
});

it("should handle combination of Unicode attacks in labels", () => {
  // Full-width "b", zero-width space, "ug", RTL override, BOM.
  const sanitized = sanitizeLabelContent("\uFF42\u200Bug\u202E\uFEFF");
  expect(sanitized).toBe("bug");
});

it("should preserve emoji in labels", () => {
  // Stand-alone emoji labels must round-trip unchanged.
  for (const label of ["🐛 bug", "✨ enhancement"]) {
    expect(sanitizeLabelContent(label)).toBe(label);
  }
});
Comment on lines +119 to +122
Copy link

Copilot AI Feb 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to the sanitize_content.test.cjs tests, the emoji preservation test should include examples of ZWJ (zero-width joiner) emoji sequences to verify whether they are intentionally broken or should be preserved. Since U+200D is removed by hardenUnicodeText(), complex emoji like "👨‍👩‍👧‍👦" (family) or "👨‍⚕️" (profession) will be broken into their component parts.

Copilot uses AI. Check for mistakes.
});
});
Loading