Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Fix crash when email headers contained unencoded unicode #897

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions froide/helper/email_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from dataclasses import dataclass, field
from datetime import datetime
from datetime import timezone as dt_timezone
from email.header import decode_header
from email.message import EmailMessage
from email.header import Header, decode_header
from email.message import EmailMessage, Message
from email.parser import BytesParser as Parser
from email.utils import getaddresses, parseaddr, parsedate_to_datetime
from io import BytesIO
Expand Down Expand Up @@ -87,7 +87,7 @@ class EmailAttachment(BytesIO):


def parse_email_body(
msgobj: EmailMessage,
msgobj: Message,
) -> Tuple[List[str], List[str], List[EmailAttachment]]:
body = []
html = []
Expand Down Expand Up @@ -141,7 +141,7 @@ def parse_dispositions(dispo):
return dispo_name, dispo_dict


def parse_attachment(message_part, ignore_content_types=None):
def parse_attachment(message_part: Message, ignore_content_types=None):
# this gives more info than message_part.get_content_disposition()
content_disposition = message_part.get("Content-Disposition", None)
content_type = message_part.get_content_type()
Expand Down Expand Up @@ -245,6 +245,8 @@ def parse_header_field(field):
# But encoded words may have been split up!
# Let's remove newlines that are not preceded by
# encoded word terminator and try again
if isinstance(field, Header):
field = str(field)
field = re.sub(r"(?<!\?\=)\n ", "=20", field)
decodefrag = decode_header(field)

Expand Down
14 changes: 14 additions & 0 deletions froide/helper/tests/test_email_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from io import BytesIO
from pathlib import Path

from ..email_parsing import parse_email
from .test_email_log_parsing import TEST_DATA_ROOT


def test_parse_utf8_subject():
    """Parse a fixture whose subject mixes an encoded word with raw unicode."""
    fixture_path = Path(TEST_DATA_ROOT) / "email_utf8-subject.eml"
    with fixture_path.open("rb") as fixture:
        raw_message = fixture.read()
    parsed = parse_email(BytesIO(raw_message))

    # Putting unicode directly into an email header is undefined, so only
    # the well-formatted leading part of the subject is checked.
    expected_prefix = "utf8-subject - äöüß - "
    assert parsed.subject.startswith(expected_prefix)
10 changes: 10 additions & 0 deletions froide/helper/tests/testdata/email_utf8-subject.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Return-Path: <MAILER-DAEMON>
Message-Id: <12345678@localhost>
From: Mitgliederservice <1234@localhost>
Subject: utf8-subject - =?utf-8?Q?=C3=A4=C3=B6=C3=BC=C3=9F?= - 🥲
To: mail@mail.example.com
Content-Transfer-Encoding: 7bit
Content-Type: text/html; charset=UTF-8
Date: Wed, 13 Nov 2024 20:49:29 +0100

EMail-Body