diff --git a/Products/PortalTransforms/libtransforms/retransform.py b/Products/PortalTransforms/libtransforms/retransform.py index ad5699b..aae9eba 100644 --- a/Products/PortalTransforms/libtransforms/retransform.py +++ b/Products/PortalTransforms/libtransforms/retransform.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from Products.CMFPlone.utils import safe_unicode from Products.PortalTransforms.interfaces import ITransform from zope.interface import implementer @@ -25,6 +26,7 @@ def addRegex(self, pat, repl): self.regexes.append((r, repl)) def convert(self, orig, data, **kwargs): + orig = safe_unicode(orig) for r, repl in self.regexes: orig = r.sub(repl, orig) data.setData(orig) diff --git a/Products/PortalTransforms/libtransforms/utils.py b/Products/PortalTransforms/libtransforms/utils.py index 664a438..8272130 100644 --- a/Products/PortalTransforms/libtransforms/utils.py +++ b/Products/PortalTransforms/libtransforms/utils.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from Products.CMFPlone.utils import safe_unicode from Products.PortalTransforms.utils import log import os @@ -76,7 +77,9 @@ def bodyfinder(text): """ Return body or unchanged text if no body tags found. Always use html_headcheck() first. + Accepts bytes or text. Returns text. """ + text = safe_unicode(text) lowertext = text.lower() bodystart = lowertext.find('
]>.*(?im)', ' '), - ('(?im)', ' '), - (']>.*(?im)', ' '), - ('(?im)?(font|em|i|strong|b)(?=\W)[^>]*>', ''), - ('<[^>]*>(?i)(?m)', ' '), - (r'&([a-zA-Z0-9#]*?);', sub_func), + (u'(?im)', ' '), + (u'(?im)', ' '), + (u']>.*(?im)', ' '), + (u'(?im)?(font|em|i|strong|b)(?=\W)[^>]*>', ''), + (u'<[^>]*>(?i)(?m)', ' '), + (ur'&([a-zA-Z0-9#]*?);', sub_func), ) diff --git a/Products/PortalTransforms/transforms/pdf_to_html.py b/Products/PortalTransforms/transforms/pdf_to_html.py index da24a85..03993ac 100644 --- a/Products/PortalTransforms/transforms/pdf_to_html.py +++ b/Products/PortalTransforms/transforms/pdf_to_html.py @@ -54,7 +54,7 @@ def invokeCommand(self, tmpdir, fullname): subprocess.run(cmd, shell=True) try: htmlfilename = os.path.join(tmpdir, sansext(fullname) + '.html') - htmlfile = open(htmlfilename, 'r') + htmlfile = open(htmlfilename, 'rb') html = htmlfile.read() htmlfile.close() except: