Skip to content

Commit

Permalink
Fix encoding of special chars
Browse files Browse the repository at this point in the history
  • Loading branch information
jrockwar committed Nov 24, 2024
1 parent 98bc967 commit f10c7c6
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 8 deletions.
163 changes: 163 additions & 0 deletions src/mhtml/__tests__/generator.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import { generateMHTML } from '../generator';
import { describe, expect, it } from 'bun:test';

describe('MHTML Generator', () => {
// Helper to create a mock HTML collection
function createMockHTMLCollection(): HTMLCollectionOf<HTMLImageElement> {
const collection = [] as unknown as HTMLCollectionOf<HTMLImageElement>;
collection.item = (index: number) => collection[index] || null;
collection.namedItem = (name: string) => {
const found = Array.from(collection).find(item => item.id === name);
return found || null;
};
return collection;
}

// Helper to create a mock StyleSheetList
function createMockStyleSheetList(): StyleSheetList {
const list = [] as unknown as StyleSheetList;
list.item = (index: number) => list[index] || null;
return list;
}

// Basic character encoding test
const mockDoc = {
title: "Test Page with Special Characters: Don't Encode Me!",
documentElement: {
outerHTML: `
<html>
<head>
<title>Test Page</title>
</head>
<body>
<h1>Testing Special Characters</h1>
<p>Here's a paragraph with an apostrophe and some dots...</p>
<p>Special characters like = should be encoded</p>
<p>Non-ASCII like é and ñ should be encoded</p>
<p>Line breaks should
be encoded properly</p>
</body>
</html>
`
},
location: {
href: 'https://example.com/test'
},
getElementsByTagName: () => createMockHTMLCollection(),
styleSheets: createMockStyleSheetList()
};

it('should properly handle special characters in MHTML generation', async () => {
const result = await generateMHTML(mockDoc);
const content = new TextDecoder().decode(result);

// Check that apostrophes are not encoded
expect(content).toContain("Here's");
expect(content).not.toContain("Here=27s");

// Check that dots are not encoded
expect(content).toContain("...");
expect(content).not.toContain("=2E=2E=2E");

// Check that equals signs are encoded
expect(content).toContain("=3D");

// Check that non-ASCII characters are encoded
expect(content).toContain("=E9"); // é
expect(content).toContain("=F1"); // ñ

// Check that line breaks are properly encoded
expect(content).toContain("=0D=0A");
});

it('should handle real-world article content correctly', async () => {
const realWorldDoc = {
title: "The AI Revolution: What's Next?",
documentElement: {
outerHTML: `
<html>
<head>
<title>The AI Revolution: What's Next?</title>
<meta charset="utf-8">
<style>
body { font-family: 'Arial', sans-serif; }
.article { max-width: 800px; margin: 0 auto; }
blockquote { border-left: 3px solid #ccc; padding-left: 1em; }
</style>
</head>
<body>
<article class="article">
<h1>The AI Revolution: What's Next?</h1>
<div class="metadata">
By Sarah O'Connor | Published: January 15th, 2024
</div>
<p>
As we enter 2024, artificial intelligence continues to reshape our world.
Let's explore what's on the horizon...
</p>
<blockquote>
"AI isn't just about automation — it's about augmentation of human capabilities"
— Dr. María González
</blockquote>
<h2>Key Developments in 2023</h2>
<ul>
<li>GPT-4's impact on natural language processing</li>
<li>Breakthrough in protein folding prediction</li>
<li>Advanced robotics integration with AI</li>
</ul>
<p>
The field of AI ethics has also seen significant progress, with researchers
tackling questions about bias, fairness, and transparency. Companies like
DeepMind and OpenAI have implemented new guidelines for responsible AI development.
</p>
<figure>
<img src="" alt="AI Development Timeline">
<figcaption>Fig.1 - AI Development Timeline (2020-2024)</figcaption>
</figure>
</article>
</body>
</html>
`
},
location: {
href: 'https://example.com/ai-revolution'
},
getElementsByTagName: () => createMockHTMLCollection(),
styleSheets: createMockStyleSheetList()
};

const result = await generateMHTML(realWorldDoc);
const content = new TextDecoder().decode(result);

// Test proper handling of various content types
expect(content).toContain("What's Next?"); // Apostrophes in title
expect(content).toContain("O'Connor"); // Apostrophes in names

// Test that non-ASCII characters are properly encoded in quoted-printable format
// These will be decoded back to Unicode when parsing the MHTML for EPUB conversion
const decodedContent = content.replace(/=([0-9A-F]{2})/g, (_, hex) =>
String.fromCharCode(parseInt(hex, 16))
);
expect(decodedContent).toContain("María"); // Non-ASCII characters should decode correctly
expect(decodedContent).toContain("González"); // Non-ASCII characters should decode correctly

// Test structural elements
expect(content).toContain('<blockquote>'); // HTML tags
expect(content).toContain('class=3D"article"'); // CSS classes with encoded equals
expect(content).toContain('max-width: 800px'); // CSS styles

// Test proper MIME structure
expect(content).toContain('Content-Type: multipart/related');
expect(content).toContain('Content-Transfer-Encoding: quoted-printable');
expect(content).toContain('MIME-Version: 1.0');

// Test proper encoding of special characters in metadata
expect(content).toContain('charset=3D"utf-8"'); // Encoded equals sign
expect(content).not.toContain('charset="utf-8"'); // Raw equals sign should not exist

// Verify line breaks are properly encoded
const lineBreaks = content.match(/=0D=0A/g);
expect(lineBreaks).toBeTruthy();
expect(lineBreaks!.length).toBeGreaterThan(0);
});
});
21 changes: 13 additions & 8 deletions src/mhtml/generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,19 @@ function createMHTMLHeader(doc: MHTMLDocument): string {
* Encodes text as quoted-printable
*/
function encodeQuotedPrintable(text: string): string {
return text.split('').map(char => {
const code = char.charCodeAt(0);
// If character is non-ASCII or special MIME character, encode it
if (code > 127 || char === '=' || char === '.' || char === '\r' || char === '\n') {
return '=' + code.toString(16).toUpperCase().padStart(2, '0');
}
return char;
}).join('');
return text
// First normalize all line endings to CRLF
.replace(/\r\n|\n|\r/g, '\r\n')
.split('')
.map(char => {
const code = char.charCodeAt(0);
// Only encode non-ASCII and essential MIME special characters
if (code > 127 || char === '=' || char === '\r' || char === '\n') {
return '=' + code.toString(16).toUpperCase().padStart(2, '0');
}
return char;
})
.join('');
}

/**
Expand Down

0 comments on commit f10c7c6

Please sign in to comment.