Transforms to text/xxx should accept both bytes and text as input; their output is always text.

plone · May 17, 2018 · 5e297b6 · ale-rt · Oct 19, 2018 · petschki
1 parent fdcef5d
commit 5e297b6
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 7 deletions.
diff --git a/Products/PortalTransforms/libtransforms/retransform.py b/Products/PortalTransforms/libtransforms/retransform.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from Products.CMFPlone.utils import safe_unicode
 from Products.PortalTransforms.interfaces import ITransform
 from zope.interface import implementer
 
@@ -25,6 +26,7 @@ def addRegex(self, pat, repl):
         self.regexes.append((r, repl))
 
     def convert(self, orig, data, **kwargs):
+        orig = safe_unicode(orig)
         for r, repl in self.regexes:
             orig = r.sub(repl, orig)
         data.setData(orig)

diff --git a/Products/PortalTransforms/libtransforms/utils.py b/Products/PortalTransforms/libtransforms/utils.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from Products.CMFPlone.utils import safe_unicode
 from Products.PortalTransforms.utils import log
 
 import os
@@ -76,7 +77,9 @@ def bodyfinder(text):
     """ Return body or unchanged text if no body tags found.
 
     Always use html_headcheck() first.
+    Accepts bytes or text. Returns text.
     """
+    text = safe_unicode(text)
     lowertext = text.lower()
     bodystart = lowertext.find('<body')
     if bodystart == -1:

diff --git a/Products/PortalTransforms/transforms/html_to_text.py b/Products/PortalTransforms/transforms/html_to_text.py
@@ -29,10 +29,10 @@ def sub_func(matchobj):
         return res.encode('utf-8')
 
     return html_to_text("html_to_text",
-                        ('<script [^>]>.*</script>(?im)', ' '),
-                        ('<style [^>]>.*</style>(?im)', ' '),
-                        ('<head [^>]>.*</head>(?im)', ' '),
-                        ('(?im)</?(font|em|i|strong|b)(?=\W)[^>]*>', ''),
-                        ('<[^>]*>(?i)(?m)', ' '),
-                        (r'&([a-zA-Z0-9#]*?);', sub_func),
+                        (u'<script [^>]>.*</script>(?im)', ' '),
+                        (u'<style [^>]>.*</style>(?im)', ' '),
+                        (u'<head [^>]>.*</head>(?im)', ' '),
+                        (u'(?im)</?(font|em|i|strong|b)(?=\W)[^>]*>', ''),
+                        (u'<[^>]*>(?i)(?m)', ' '),
+                        (ur'&([a-zA-Z0-9#]*?);', sub_func),
                         )
diff --git a/Products/PortalTransforms/transforms/pdf_to_html.py b/Products/PortalTransforms/transforms/pdf_to_html.py
@@ -54,7 +54,7 @@ def invokeCommand(self, tmpdir, fullname):
             subprocess.run(cmd, shell=True)
         try:
             htmlfilename = os.path.join(tmpdir, sansext(fullname) + '.html')
-            htmlfile = open(htmlfilename, 'r')
+            htmlfile = open(htmlfilename, 'rb')
             html = htmlfile.read()
             htmlfile.close()
         except: