Merge pull request #463 from TeamMsgExtractor/next-release

TheElementalOfDestruction · web-flow · commit 68858f016d01 · 2025-04-10T14:02:24.000-07:00
Version 0.54.1
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+**v0.54.1**
+* [[TeamMsgExtractor #462](https://github.com/TeamMsgExtractor/msg-extractor/issues/462)] Fix potential issue where a child MSG might have an encoding incompatible with its parent MSG when trying to grab a stream from the parent.
+* Added code to attempt to significantly improve RTF deencapsulation times. This tries to strip away unneeded data before passing it to `RTFDE`. This shows improvements on all files that take more than one second. Currently, this also fixes some files for which `RTFDE` previously produced incorrect output when deencapsulating the HTML body, specifically where non-breaking spaces were sometimes not carried over.
+
 **v0.54.0**
 * [[TeamMsgExtractor #456](https://github.com/TeamMsgExtractor/msg-extractor/issues/456)] Changed the prepared html output to use plainly encoded HTML instead of prettified, since current prettification options used mangles the output and causes the output to sometimes be very large.
 
diff --git a/README.rst b/README.rst
@@ -260,8 +260,8 @@ your access to the newest major version of extract-msg.
 .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
    :target: LICENSE.txt
 
-.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.0-blue.svg
-   :target: https://pypi.org/project/extract-msg/0.54.0/
+.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.1-blue.svg
+   :target: https://pypi.org/project/extract-msg/0.54.1/
 
 .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg
    :target: https://www.python.org/downloads/release/python-3810/
diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py
@@ -27,8 +27,8 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 __author__ = 'Destiny Peterson & Matthew Walker'
-__date__ = '2025-03-23'
-__version__ = '0.54.0'
+__date__ = '2025-04-10'
+__version__ = '0.54.1'
 
 __all__ = [
     # Modules:
diff --git a/extract_msg/constants/re.py b/extract_msg/constants/re.py
@@ -8,6 +8,9 @@
     'HTML_SAN_SPACE',
     'INVALID_FILENAME_CHARS',
     'INVALID_OLE_PATH',
+    'RTF_BODY_STRIP_INIT',
+    'RTF_BODY_STRIP_PRE_CLOSE',
+    'RTF_BODY_STRIP_PRE_OPEN',
     'RTF_ENC_BODY_START',
 ]
 
@@ -40,3 +43,12 @@
 # invalid.
 INVALID_OLE_PATH: Final[_RE_STR_TYPE] = re.compile(r'[:/\\!]')
 
+# Used as the initial step in stripping RTF files for deencapsulation. Finds
+# ignored sections that do not contain groups *and* finds HTML tag sections
+# that are entirely empty. It also then finds sections of data that can be
+# merged together without affecting the results.
+RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf[^0{}][^{}]*?\\htmlrtf0 ?)|(\{\\\*\\htmltag[0-9]+\})|(\\htmlrtf0 ?\\htmlrtf1? ?)|(\\htmlrtf1? ?\{\}\\htmlrtf0 ?)|(\\htmlrtf1? ?\\\'[a-fA-F0-9]{2}\\htmlrtf0 ?)')
+
+# Preprocessing steps to simplify the RTF.
+RTF_BODY_STRIP_PRE_CLOSE: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf1? ?}\\htmlrtf0 ?)|(\\htmlrtf1? ?[^0{}][^{}]*?} ?\\htmlrtf0 ?)')
+RTF_BODY_STRIP_PRE_OPEN: Final[_RE_BYTES_TYPE] = re.compile(rb'\\htmlrtf1? ?{[^{}]*?\\htmlrtf0 ?')
diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py
@@ -48,7 +48,8 @@
 from ..utils import (
         addNumToDir, addNumToZipDir, createZipOpen, decodeRfc2047, findWk,
         htmlSanitize, inputToBytes, inputToString, isEncapsulatedRtf,
-        prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, validateHtml
+        prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, stripRtf,
+        validateHtml
     )
 
 
@@ -1012,6 +1013,11 @@ def deencapsulatedRtf(self) -> Optional[RTFDE.DeEncapsulator]:
             while body and body[-1] != 125:
                 body = body[:-1]
 
+            # Some files take a long time due to how they are structured and
+            # how RTFDE works. The longer a file would normally take, the
+            # better this fix works:
+            body = stripRtf(body)
+
             try:
                 deencapsultor = RTFDE.DeEncapsulator(body)
                 deencapsultor.deencapsulate()
diff --git a/extract_msg/msg_classes/msg.py b/extract_msg/msg_classes/msg.py
@@ -203,7 +203,23 @@ def __init__(self, path, **kwargs):
             self.__overrideEncoding = overrideEncoding
 
             if prefix and not filename:
-                filename = self.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False)
+                # We actually need to get this from the parent.
+                msg = None
+                parentNeedsClose = False
+                if self.__parentMsg:
+                    msg = self.__parentMsg()
+                if msg is None:
+                    # We *NEED* the parent here, so we're going to do something
+                    # dumb and just generate it *manually*, grab what we need,
+                    # and then immediately close it. We don't need anything
+                    # more advanced than MSGFile.
+                    msg = MSGFile(path, prefix = prefixl[:-2], delayAttachments = True)
+                    parentNeedsClose = True
+                # Now that we know we have the parent, grab the stream.
+                filename = msg.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False)
+                # Now if we opened the parent, close it.
+                if parentNeedsClose:
+                    msg.close()
             if filename:
                 self.filename = filename
             elif hasattr(path, '__len__'):
@@ -492,7 +508,7 @@ def export(self, path, allowBadEmbed: bool = False) -> None:
 
         :param path: A path-like object (including strings and ``pathlib.Path``
             objects) or an IO device with a write method which accepts bytes.
-        :param allowBadEmbed: If True, attempts to skip steps that will fail if 
+        :param allowBadEmbed: If True, attempts to skip steps that will fail if
             the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
         """
         from ..ole_writer import OleWriter
@@ -507,7 +523,7 @@ def exportBytes(self, allowBadEmbed: bool = False) -> bytes:
         """
         Saves a new copy of the MSG file, returning the bytes.
 
-        :param allowBadEmbed: If True, attempts to skip steps that will fail if 
+        :param allowBadEmbed: If True, attempts to skip steps that will fail if
             the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
         """
         out = io.BytesIO()
diff --git a/extract_msg/utils.py b/extract_msg/utils.py
@@ -37,6 +37,7 @@
     'rtfSanitizeHtml',
     'rtfSanitizePlain',
     'setupLogging',
+    'stripRtf',
     'tryGetMimetype',
     'unsignedToSignedInt',
     'unwrapMsg',
@@ -61,6 +62,7 @@
 import logging.config
 import os
 import pathlib
+import re
 import shutil
 import struct
 import sys
@@ -1012,6 +1014,63 @@ def setupLogging(defaultPath = None, defaultLevel = logging.WARN, logfile = None
     return True
 
 
+def stripRtf(rtfBody: bytes) -> bytes:
+    """
+    Cleans up RTF before sending it to RTFDE.
+
+    Attempts to find and strip common sections of RTF data that are unneeded.
+    """
+    # First, do a pre-strip to try and simplify ignored sections as much as possible.
+    rtfBody = constants.re.RTF_BODY_STRIP_PRE_OPEN.sub(_stripRtfOpenHelper, rtfBody)
+    rtfBody = constants.re.RTF_BODY_STRIP_PRE_CLOSE.sub(_stripRtfCloseHelper, rtfBody)
+    # Second, do an initial strip to simplify our data stream.
+    rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody)
+    # Do it one more time to help with some things that might not have gotten
+    # caught the first time, perhaps because something now exists after
+    # stripping.
+    rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody)
+
+    # TODO: Further processing...
+
+    return rtfBody
+
+def _stripRtfCloseHelper(match: re.Match) -> bytes:
+    if (ret := match.expand(b'\\g<0>')).count(b'\\htmlrtf0') > 1:
+        return ret
+
+    if b'\\f' in ret:
+        return ret
+
+    return b'\\htmlrtf}\\htmlrtf0 '
+
+
+def _stripRtfOpenHelper(match: re.Match) -> bytes:
+    if b'\\f' in (ret := match.expand(b'\\g<0>')):
+        return ret
+
+    return b'\\htmlrtf{\\htmlrtf0 '
+
+
+def _stripRtfHelper(match: re.Match) -> bytes:
+    res = match.string
+
+    # If these don't match, don't even try.
+    if res.count(b'{') != res.count(b'}') or res.count(b'{') == 0:
+        return res
+
+    # If any group markers are prefixed by a backslash, give up.
+    if res.find(b'\\{') != -1 or res.find(b'\\}') != -1:
+        return res
+
+    # Last little bit of processing to validate everything. We know the {}
+    # match, but let's be *absolutely* sure.
+    # TODO
+
+    return res
+
+
+
+
 def tryGetMimetype(att: AttachmentBase, mimetype: Union[str, None]) -> Union[str, None]:
     """
     Uses an optional dependency to try and get the mimetype of an attachment.