Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure UTF-8 surrogates are escaped on save - fix #159 #164

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ include/*
dummy_project/*
.cache/
.tox/
messages
/messages

9 changes: 6 additions & 3 deletions django_mailbox/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,6 @@ def _get_dehydrated_message(self, msg, record):
def _process_message(self, message):
msg = Message()
settings = utils.get_settings()

if settings['store_original_message']:
self._process_save_original_message(message, msg)
msg.mailbox = self
Expand Down Expand Up @@ -386,10 +385,14 @@ def _process_message(self, message):

def _process_save_original_message(self, message, msg):
settings = utils.get_settings()
if six.PY3:
content = message.as_string().encode('ascii', 'surrogateescape')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm feeling a little conflicted about this -- we'd be fundamentally changing the method of encoding stored files if we do this, no? I might be misunderstanding what we're doing, here, though.

else:
content = message.as_string()
if settings['compress_original_message']:
with NamedTemporaryFile(suffix=".eml.gz") as fp_tmp:
with gzip.GzipFile(fileobj=fp_tmp, mode="w") as fp:
fp.write(message.as_string().encode('utf-8'))
fp.write(content)
msg.eml.save(
"%s.eml.gz" % (uuid.uuid4(), ),
File(fp_tmp),
Expand All @@ -399,7 +402,7 @@ def _process_save_original_message(self, message, msg):
else:
msg.eml.save(
'%s.eml' % uuid.uuid4(),
ContentFile(message.as_string()),
ContentFile(content),
save=False
)

Expand Down
46 changes: 46 additions & 0 deletions django_mailbox/tests/messages/message_with_utf8_surrogates.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
Return-path: <SRS0=HZOvtU=DM=REDACTED@porady.REDACTED>
Envelope-to: sprawa-1418@porady.REDACTED
Delivery-date: Sat, 16 Dec 2017 16:22:42 +0100
Received: from mx1.wp.pl ([212.77.101.6])
by s50.hekko.net.pl with esmtps (TLSv1.2:ECDHE-RSA-AES256-GCM-SHA384:256)
(Exim 4.89) (envelope-from <REDACTED@wp.pl>) id 1eQEII-0005Fu-Cs
for sprawa-1418@porady.REDACTED; Sat, 16 Dec 2017 16:22:42 +0100
Received: (wp-smtpd smtp.wp.pl 33592 invoked from network);
16 Dec 2017 16:22:11 +0100
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=wp.pl; s=1024a;
t=1513437731; bh=6Ox0FVna7vMxBu5CbF0s6HvpkNalENwSSgDxNJ/Rsdc=;
h=From:To:Subject;
b=QnrdNDBDNLuENN9QS0Pvn85/bDE1Fc6jJrvKUdnApFrykwzbHxXxCG4qX7g3sS3Qj
xxHGNf8UXZh3zyCln2EZpUD03LgkppMpTbv3tLKA4HSnaT7txr6AWHq2y8A/YQo7EY
2806CYWtFCKYoVolzDN9lctM2nEoZpD5jOVZqYsM=
Received: from public-gprs394416.centertel.pl (HELO REDACTED)
(REDACTED@wp.pl@[37.47.171.241]) (envelope-sender <REDACTED@wp.pl>)
by smtp.wp.pl (WP-SMTPD) with SMTP
for <REDACTED@sejm.pl>; 16 Dec 2017 16:22:11 +0100
Message-ID: <BD6D8EECA3A74E9A9ABD7A1EEA593F76@REDACTED>
From: <REDACTED@wp.pl>
To: <sprawa-1418@porady.REDACTED>
Subject: =?windows-1250?Q?Do_czego_te=BF_s=B9_zdolni_Polscy_s=EAdziowie_..._?=
Date: Sat, 16 Dec 2017 16:21:04 +0100
MIME-Version: 1.0
Content-Type: multipart/mixed;
boundary="----=_NextPart_000_0018_01D37689.E058CEF0"
X-Spam-Status: No, message_size=7331926 larger than 200K

To jest wielocz�ciowa wiadomo�� w formacie MIME.

------=_NextPart_000_0018_01D37689.E058CEF0
Content-Type: multipart/alternative;
boundary="----=_NextPart_001_0019_01D37689.E058CEF0"


------=_NextPart_001_0019_01D37689.E058CEF0
Content-Type: text/plain;
charset="windows-1250"
Content-Transfer-Encoding: quoted-printable

REDACTED

------=_NextPart_001_0019_01D37689.E058CEF0--

------=_NextPart_000_0018_01D37689.E058CEF0--
36 changes: 35 additions & 1 deletion django_mailbox/tests/test_process_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,40 @@ def test_message_with_utf8_attachment_header(self):
u'odpowied\u017a Burmistrza.jpg'
)

def test_message_with_utf8_surrogates(self):
"""Ensure that we properly handle UTF-8 surrogates

The problem was observed in Python 3.5. This test guards against a regression of #159.
"""

email_object = self._get_email_object(
'message_with_utf8_surrogates.eml',
)
mailbox = Mailbox.objects.create()
default_settings = utils.get_settings()
with mock.patch('django_mailbox.utils.get_settings') as get_settings:
altered = copy.deepcopy(default_settings)
altered['store_original_message'] = True

get_settings.return_value = altered

# Before the fix for #159, this call raised UnicodeEncodeError.
msg = mailbox.process_incoming_message(email_object)

self.assertEqual(
msg.subject,
u'Do czego te\u017c s\u0105 zdolni Polscy s\u0119dziowie ... '
)

self.assertEqual(
msg.attachments.count(),
0
)

with open(msg.eml.name, 'rb') as f:
self.assertEqual(f.read(),
self._get_email_as_text('message_with_utf8_surrogates.eml'))

def test_message_get_text_body(self):
message = self._get_email_object('multipart_text.eml')

Expand Down Expand Up @@ -468,4 +502,4 @@ def test_message_compressed(self):

with gzip.open(msg.eml.name, 'rb') as f:
self.assertEqual(f.read(),
self._get_email_as_text('generic_message.eml'))
self._get_email_as_text('generic_message.eml'))