🐛 Fix crash when email headers contained unencoded unicode

okfde · Nov 15, 2024 · f7779cf · f7779cf
1 parent b276924
commit f7779cf
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 4 deletions.
diff --git a/froide/helper/email_parsing.py b/froide/helper/email_parsing.py
@@ -12,8 +12,8 @@
 from dataclasses import dataclass, field
 from datetime import datetime
 from datetime import timezone as dt_timezone
-from email.header import decode_header
-from email.message import EmailMessage
+from email.header import Header, decode_header
+from email.message import EmailMessage, Message
 from email.parser import BytesParser as Parser
 from email.utils import getaddresses, parseaddr, parsedate_to_datetime
 from io import BytesIO
@@ -87,7 +87,7 @@ class EmailAttachment(BytesIO):
 
 
 def parse_email_body(
-    msgobj: EmailMessage,
+    msgobj: Message,
 ) -> Tuple[List[str], List[str], List[EmailAttachment]]:
     body = []
     html = []
@@ -141,7 +141,7 @@ def parse_dispositions(dispo):
     return dispo_name, dispo_dict
 
 
-def parse_attachment(message_part, ignore_content_types=None):
+def parse_attachment(message_part: Message, ignore_content_types=None):
     # this gives more info then message_part.get_content_disposition()
     content_disposition = message_part.get("Content-Disposition", None)
     content_type = message_part.get_content_type()
@@ -245,6 +245,8 @@ def parse_header_field(field):
         # But encoded words may have been split up!
         # Let's remove newlines that are not preceded by
         # encoded word terminator and try again
+        if isinstance(field, Header):
+            field = str(field)
         field = re.sub(r"(?<!\?\=)\n ", "=20", field)
         decodefrag = decode_header(field)
 

diff --git a/froide/helper/tests/test_email_parsing.py b/froide/helper/tests/test_email_parsing.py
@@ -0,0 +1,14 @@
+from io import BytesIO
+from pathlib import Path
+
+from ..email_parsing import parse_email
+from .test_email_log_parsing import TEST_DATA_ROOT
+
+
+def test_parse_utf8_subject():
+    with open(Path(TEST_DATA_ROOT) / "email_utf8-subject.eml", "rb") as f:
+        email = parse_email(BytesIO(f.read()))
+
+    assert email.subject.startswith(
+        "utf8-subject - äöüß - "
+    )  # the trailing emoji is raw, unencoded unicode in a header; how it gets decoded is undefined, so we only check the well-formed start of the subject
diff --git a/froide/helper/tests/testdata/email_utf8-subject.eml b/froide/helper/tests/testdata/email_utf8-subject.eml
@@ -0,0 +1,10 @@
+Return-Path: <MAILER-DAEMON>
+Message-Id: <12345678@localhost>
+From:  Mitgliederservice  <1234@localhost>
+Subject: utf8-subject - =?utf-8?Q?=C3=A4=C3=B6=C3=BC=C3=9F?= - 🥲
+To: mail@mail.example.com
+Content-Transfer-Encoding: 7bit
+Content-Type: text/html; charset=UTF-8
+Date: Wed, 13 Nov 2024 20:49:29 +0100
+
+EMail-Body