Skip to content

Commit 68858f0

Browse files
Merge pull request #463 from TeamMsgExtractor/next-release
Version 0.54.1
2 parents 5585364 + 5931d1d commit 68858f0

File tree

7 files changed

+105
-8
lines changed

7 files changed

+105
-8
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
**v0.54.1**
2+
* [[TeamMsgExtractor #462](https://github.com/TeamMsgExtractor/msg-extractor/issues/462)] Fix potential issue where child MSG might have incompatible encoding to parent MSG when trying to grab a stream from the parent.
3+
* Added code to attempt to significantly improve RTF deencapsulation times. This tries to strip away unneeded data before passing it to `RTFDE`. This shows improvements on all files that take more than one second. Currently, this also fixes some files that previously produced incorrect output from `RTFDE` when deencapsulating the HTML body, specifically around non-breaking spaces sometimes not transferring over.
4+
15
**v0.54.0**
26
* [[TeamMsgExtractor #456](https://github.com/TeamMsgExtractor/msg-extractor/issues/456)] Changed the prepared html output to use plainly encoded HTML instead of prettified, since current prettification options used mangles the output and causes the output to sometimes be very large.
37

README.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,8 @@ your access to the newest major version of extract-msg.
260260
.. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
261261
:target: LICENSE.txt
262262

263-
.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.0-blue.svg
264-
:target: https://pypi.org/project/extract-msg/0.54.0/
263+
.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.1-blue.svg
264+
:target: https://pypi.org/project/extract-msg/0.54.1/
265265

266266
.. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg
267267
:target: https://www.python.org/downloads/release/python-3810/

extract_msg/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
# along with this program. If not, see <http://www.gnu.org/licenses/>.
2828

2929
__author__ = 'Destiny Peterson & Matthew Walker'
30-
__date__ = '2025-03-23'
31-
__version__ = '0.54.0'
30+
__date__ = '2025-04-10'
31+
__version__ = '0.54.1'
3232

3333
__all__ = [
3434
# Modules:

extract_msg/constants/re.py

+12
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
'HTML_SAN_SPACE',
99
'INVALID_FILENAME_CHARS',
1010
'INVALID_OLE_PATH',
11+
'RTF_BODY_STRIP_INIT',
12+
'RTF_BODY_STRIP_PRE_CLOSE',
13+
'RTF_BODY_STRIP_PRE_OPEN',
1114
'RTF_ENC_BODY_START',
1215
]
1316

@@ -40,3 +43,12 @@
4043
# invalid.
4144
INVALID_OLE_PATH: Final[_RE_STR_TYPE] = re.compile(r'[:/\\!]')
4245

46+
# Used as the initial step in stripping RTF files for deencapsulation. Finds
47+
# ignored sections that do not contain groups *and* finds HTML tag sections
48+
# that are entirely empty. It also then finds sections of data that can be
49+
# merged together without affecting the results
50+
# Bytes pattern with five alternatives, in order:
#   1. ``\htmlrtf`` followed by a character other than ``0``, ``{`` or ``}``,
#      then brace-free text (matched lazily) up to ``\htmlrtf0`` and an
#      optional trailing space.
#   2. A group of the form ``{\*\htmltagN}`` (N being one or more digits) with
#      nothing else inside it.
#   3. ``\htmlrtf0`` directly followed by ``\htmlrtf`` or ``\htmlrtf1`` (each
#      with an optional trailing space).
#   4. ``\htmlrtf``/``\htmlrtf1``, an empty ``{}`` group, then ``\htmlrtf0``.
#   5. ``\htmlrtf``/``\htmlrtf1``, a single ``\'xx`` hex escape, then
#      ``\htmlrtf0``.
# ``stripRtf`` in ``utils.py`` replaces every match with an empty byte string.
RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf[^0{}][^{}]*?\\htmlrtf0 ?)|(\{\\\*\\htmltag[0-9]+\})|(\\htmlrtf0 ?\\htmlrtf1? ?)|(\\htmlrtf1? ?\{\}\\htmlrtf0 ?)|(\\htmlrtf1? ?\\\'[a-fA-F0-9]{2}\\htmlrtf0 ?)')
51+
52+
# Preprocessing steps to simplify the RTF.
53+
# Bytes pattern with two alternatives:
#   1. ``\htmlrtf``/``\htmlrtf1`` immediately followed by ``}`` and then
#      ``\htmlrtf0``.
#   2. ``\htmlrtf``/``\htmlrtf1``, a character other than ``0``, ``{`` or
#      ``}``, brace-free text (matched lazily), then ``}`` and ``\htmlrtf0``.
# Matches are passed to ``_stripRtfCloseHelper`` by ``stripRtf`` in
# ``utils.py``, which decides whether to rewrite them.
RTF_BODY_STRIP_PRE_CLOSE: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf1? ?}\\htmlrtf0 ?)|(\\htmlrtf1? ?[^0{}][^{}]*?} ?\\htmlrtf0 ?)')
54+
# Bytes pattern matching ``\htmlrtf``/``\htmlrtf1`` followed by ``{``, then
# brace-free text (matched lazily) up to ``\htmlrtf0`` and an optional trailing
# space. Matches are passed to ``_stripRtfOpenHelper`` by ``stripRtf`` in
# ``utils.py``, which decides whether to rewrite them.
RTF_BODY_STRIP_PRE_OPEN: Final[_RE_BYTES_TYPE] = re.compile(rb'\\htmlrtf1? ?{[^{}]*?\\htmlrtf0 ?')

extract_msg/msg_classes/message_base.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@
4848
from ..utils import (
4949
addNumToDir, addNumToZipDir, createZipOpen, decodeRfc2047, findWk,
5050
htmlSanitize, inputToBytes, inputToString, isEncapsulatedRtf,
51-
prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, validateHtml
51+
prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, stripRtf,
52+
validateHtml
5253
)
5354

5455

@@ -1012,6 +1013,11 @@ def deencapsulatedRtf(self) -> Optional[RTFDE.DeEncapsulator]:
10121013
while body and body[-1] != 125:
10131014
body = body[:-1]
10141015

1016+
# Some files take a long time due to how they are structured and
1017+
# how RTFDE works. The longer a file would normally take, the
1018+
# better this fix works:
1019+
body = stripRtf(body)
1020+
10151021
try:
10161022
deencapsultor = RTFDE.DeEncapsulator(body)
10171023
deencapsultor.deencapsulate()

extract_msg/msg_classes/msg.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,23 @@ def __init__(self, path, **kwargs):
203203
self.__overrideEncoding = overrideEncoding
204204

205205
if prefix and not filename:
206-
filename = self.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False)
206+
# We actually need to get this from the parent.
207+
msg = None
208+
parentNeedsClose = False
209+
if self.__parentMsg:
210+
msg = self.__parentMsg()
211+
if msg is None:
212+
# We *NEED* the parent here, so we're going to do something
213+
# dumb and just generate it *manually*, grab what we need, and then immediately close it.
214+
#
215+
# We don't need anything more advanced than MSGFile.
216+
msg = MSGFile(path, prefix = prefixl[:-2], delayAttachments = True)
217+
parentNeedsClose = True
218+
# Now that we know we have the parent, grab the stream.
219+
filename = msg.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False)
220+
# Now if we opened the parent, close it.
221+
if parentNeedsClose:
222+
msg.close()
207223
if filename:
208224
self.filename = filename
209225
elif hasattr(path, '__len__'):
@@ -492,7 +508,7 @@ def export(self, path, allowBadEmbed: bool = False) -> None:
492508
493509
:param path: A path-like object (including strings and ``pathlib.Path``
494510
objects) or an IO device with a write method which accepts bytes.
495-
:param allowBadEmbed: If True, attempts to skip steps that will fail if
511+
:param allowBadEmbed: If True, attempts to skip steps that will fail if
496512
the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
497513
"""
498514
from ..ole_writer import OleWriter
@@ -507,7 +523,7 @@ def exportBytes(self, allowBadEmbed: bool = False) -> bytes:
507523
"""
508524
Saves a new copy of the MSG file, returning the bytes.
509525
510-
:param allowBadEmbed: If True, attempts to skip steps that will fail if
526+
:param allowBadEmbed: If True, attempts to skip steps that will fail if
511527
the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
512528
"""
513529
out = io.BytesIO()

extract_msg/utils.py

+59
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
'rtfSanitizeHtml',
3838
'rtfSanitizePlain',
3939
'setupLogging',
40+
'stripRtf',
4041
'tryGetMimetype',
4142
'unsignedToSignedInt',
4243
'unwrapMsg',
@@ -61,6 +62,7 @@
6162
import logging.config
6263
import os
6364
import pathlib
65+
import re
6466
import shutil
6567
import struct
6668
import sys
@@ -1012,6 +1014,63 @@ def setupLogging(defaultPath = None, defaultLevel = logging.WARN, logfile = None
10121014
return True
10131015

10141016

1017+
def stripRtf(rtfBody: bytes) -> bytes:
    """
    Cleans up RTF before sending it to RTFDE.

    Applies the ``RTF_BODY_STRIP_PRE_OPEN`` and ``RTF_BODY_STRIP_PRE_CLOSE``
    substitutions (each through its helper callback) and then removes, in two
    passes, everything matched by ``RTF_BODY_STRIP_INIT``.

    :param rtfBody: The RTF data to clean up.

    :returns: The RTF data after all substitutions have been applied.
    """
    patterns = constants.re

    # Pre-strip: simplify ignored sections as much as possible before the
    # main removal step.
    stripped = patterns.RTF_BODY_STRIP_PRE_OPEN.sub(_stripRtfOpenHelper, rtfBody)
    stripped = patterns.RTF_BODY_STRIP_PRE_CLOSE.sub(_stripRtfCloseHelper, stripped)

    # Main strip, run twice: the first pass can leave behind data that only
    # becomes a match once the surrounding sections have been removed.
    for _ in range(2):
        stripped = patterns.RTF_BODY_STRIP_INIT.sub(b'', stripped)

    # TODO: Further processing...

    return stripped
1036+
1037+
def _stripRtfCloseHelper(match: re.Match) -> bytes:
    r"""
    Substitution callback for ``RTF_BODY_STRIP_PRE_CLOSE``.

    The matched text is kept as-is when it contains ``\htmlrtf0`` more than
    once or contains the byte sequence ``\f`` anywhere (presumably to avoid
    discarding font-related control words -- TODO confirm). Otherwise the
    match is collapsed to the minimal ``\htmlrtf}\htmlrtf0 `` form.

    :param match: A match over ``bytes`` data.

    :returns: The replacement bytes for the match.
    """
    # group(0) is the whole match; no need to go through template expansion.
    matched = match.group(0)

    if matched.count(b'\\htmlrtf0') > 1 or b'\\f' in matched:
        return matched

    return b'\\htmlrtf}\\htmlrtf0 '
1045+
1046+
1047+
def _stripRtfOpenHelper(match: re.Match) -> bytes:
    r"""
    Substitution callback for ``RTF_BODY_STRIP_PRE_OPEN``.

    The matched text is kept as-is when it contains the byte sequence ``\f``
    anywhere (presumably to avoid discarding font-related control words --
    TODO confirm). Otherwise the match is collapsed to the minimal
    ``\htmlrtf{\htmlrtf0 `` form.

    :param match: A match over ``bytes`` data.

    :returns: The replacement bytes for the match.
    """
    # group(0) is the whole match; no need to go through template expansion.
    matched = match.group(0)

    if b'\\f' in matched:
        return matched

    return b'\\htmlrtf{\\htmlrtf0 '
1052+
1053+
1054+
def _stripRtfHelper(match: re.Match) -> bytes:
    r"""
    Validates a matched section of RTF before it is processed further.

    Works on the matched text only (``match.group(0)``). ``match.string`` is
    the entire buffer that was searched, so returning it from a substitution
    callback would insert the whole document in place of every match.

    The processing step itself is not implemented yet, so every path
    currently returns the matched text unchanged.

    :param match: A match over ``bytes`` data.

    :returns: The matched bytes.
    """
    res = match.group(0)

    # If the group markers are absent or unbalanced, don't even try.
    openCount = res.count(b'{')
    if openCount == 0 or openCount != res.count(b'}'):
        return res

    # If any group markers are prefixed by a backslash, give up.
    if b'\\{' in res or b'\\}' in res:
        return res

    # Last little bit of processing to validate everything. We know the {}
    # match, but let's be *absolutely* sure.
    # TODO

    return res
1070+
1071+
1072+
1073+
10151074
def tryGetMimetype(att: AttachmentBase, mimetype: Union[str, None]) -> Union[str, None]:
10161075
"""
10171076
Uses an optional dependency to try and get the mimetype of an attachment.

0 commit comments

Comments
 (0)