From 024ce88b9060621cb3d29f8d7b50c69d18551aec Mon Sep 17 00:00:00 2001 From: Kara Engelhardt Date: Fri, 15 Nov 2024 12:23:17 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20crash=20when=20email=20hea?= =?UTF-8?q?ders=20contained=20unencoded=20unicode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- froide/helper/email_parsing.py | 10 ++++++---- froide/helper/tests/test_email_parsing.py | 14 ++++++++++++++ .../helper/tests/testdata/email_utf8-subject.eml | 10 ++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 froide/helper/tests/test_email_parsing.py create mode 100644 froide/helper/tests/testdata/email_utf8-subject.eml diff --git a/froide/helper/email_parsing.py b/froide/helper/email_parsing.py index f349d9597..5b3b4eac8 100644 --- a/froide/helper/email_parsing.py +++ b/froide/helper/email_parsing.py @@ -12,8 +12,8 @@ from dataclasses import dataclass, field from datetime import datetime from datetime import timezone as dt_timezone -from email.header import decode_header -from email.message import EmailMessage +from email.header import Header, decode_header +from email.message import EmailMessage, Message from email.parser import BytesParser as Parser from email.utils import getaddresses, parseaddr, parsedate_to_datetime from io import BytesIO @@ -87,7 +87,7 @@ class EmailAttachment(BytesIO): def parse_email_body( - msgobj: EmailMessage, + msgobj: Message, ) -> Tuple[List[str], List[str], List[EmailAttachment]]: body = [] html = [] @@ -141,7 +141,7 @@ def parse_dispositions(dispo): return dispo_name, dispo_dict -def parse_attachment(message_part, ignore_content_types=None): +def parse_attachment(message_part: Message, ignore_content_types=None): # this gives more info then message_part.get_content_disposition() content_disposition = message_part.get("Content-Disposition", None) content_type = message_part.get_content_type() @@ -245,6 +245,8 @@ def parse_header_field(field): # But encoded words may have been split up! # Let's remove newlines that are not preceded by # encoded word terminator and try again + if isinstance(field, Header): + field = str(field) field = re.sub(r"(? +Message-Id: <12345678@localhost> +From: Mitgliederservice <1234@localhost> +Subject: utf8-subject - =?utf-8?Q?=C3=A4=C3=B6=C3=BC=C3=9F?= - 🥲 +To: mail@mail.example.com +Content-Transfer-Encoding: 7bit +Content-Type: text/html; charset=UTF-8 +Date: Wed, 13 Nov 2024 20:49:29 +0100 + +EMail-Body