Skip to content

Commit

Permalink
Disable htmllaundry by default. Fix some annoying issues with br tags
Browse files Browse the repository at this point in the history
  • Loading branch information
glutanimate committed May 27, 2017
1 parent 88eff17 commit 4135b96
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 46 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,18 @@ The add-on's HTML processing is highly configurable. All options can be accessed

## Platform Support

HTML processing is provided by the Bleach library on all platforms. Additionally, the add-on utilizes the [`htmllaundry` library](https://github.com/wichert/htmllaundry), which can improve the cleaning results under some circumstances.
HTML processing is provided by the Bleach library on all platforms. The add-on can also be configured to use the [`htmllaundry` library](https://github.com/wichert/htmllaundry), which can improve the cleaning results under some circumstances.

`htmllaundry` depends on `lxml`, which Anki unfortunately does not ship with. In contrast to the other libraries included in this add-on, `lxml` cannot easily be packaged for all platforms because it needs to be compiled. For that reason, `htmllaundry` support is only available on Windows and Linux right now.

## License and Credits

*Cloze Overlapper* is *Copyright © 2016-2017 [Aristotelis P.](https://github.com/Glutanimate)*

This add-on was developed on a commission by a fellow Anki user. All credit for the original idea goes to them.

I'm always happy to take on new add-on commissions. If you'd like to hire my services to work on an add-on or new feature, please feel free to reach out to me through *ankiglutanimate [αt] gmail . com*.

Licensed under the [GNU AGPL v3](https://www.gnu.org/licenses/agpl.html).

This add-on would not have been possible without the following open-source libraries:
Expand Down
54 changes: 34 additions & 20 deletions html_cleaner/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,12 @@
keep_styles = ["color", "background", "font-weight", "font-family",
"font-style", "font-size", "text-decoration", "margin-left"]

# Whether or not to also process HTML with htmllaundry (if available)
use_html_laundry = False

### USER CONFIGURATION END ###


import sys
import os
import re
Expand All @@ -57,15 +61,16 @@
sys.path.insert(0, os.path.dirname(__file__))
import bleach

from htmllaundry import cleaners, sanitize
# Htmllaundry depends on lxml which we cannot ship with this add-on

# Htmllaundry depends on lxml which we cannot ship on all platforms
# If we can't import htmllaundry we will skip using it further down below
try:
from htmllaundry import cleaners, sanitize
LAUNDROMAT = True
except ImportError:
LAUNDROMAT = False


from aqt.qt import *
from aqt.editor import Editor
from anki.hooks import wrap
Expand All @@ -74,35 +79,37 @@

# insert linebreak after regex match
brtags = (r"(</(div|p|br|li|ul|ol|blockquote|tr|"
"table|thead|tfoot|tbody|h[1-9]|)>)([^\n])")
"table|thead|tfoot|tbody|h[1-9])>|<br>)([^\n])")


def laundryHtml(html):
"""Clean using htmllaundry/lxml"""
# docs: http://lxml.de/api/lxml.html.clean.Cleaner-class.html

cleaner = cleaners.LaundryCleaner(
allow_tags = keep_tags,
safe_attrs = keep_attrs,
page_structure = False,
remove_unknown_tags = False,
safe_attrs_only = True,
add_nofollow = False,
processing_instructions = True,
meta = True,
scripts = True,
javascript = True,
comments = True,
style = True,
javascript = True,
annoying_tags = True,
page_structure=False,
remove_unknown_tags=False,
safe_attrs_only = False,
add_nofollow = False,
style = False,
links = False,
meta = False,
processing_instructions = True,
frames = False,
annoying_tags = True)
frames = False)

return sanitize(html, cleaner)


def bleachHtml(html):
"""Clean using bleach/html5lib"""
# docs: https://bleach.readthedocs.io/en/latest/clean.html

cleaned = bleach.clean(html,
tags = keep_tags,
attributes = keep_attrs,
Expand All @@ -115,16 +122,23 @@ def bleachHtml(html):
def cleanHtml(html):
"""Clean HTML with cleaners and custom regexes"""
html = html.replace("\n", " ")
if LAUNDROMAT:
# both bleach and htmllaundry eat "<br />"...
html = html.replace("<br />", "<br>")

if use_html_laundry and LAUNDROMAT:
# lxml.clean will munch <br> tags for some reason, even though
# they're whitelisted. This is an ugly workaround, but it works.
html = html.replace("<br>", "|||LBR|||").replace("</br>", "|||LBR|||")
html = laundryHtml(html)
cleaned = bleachHtml(html)
html = html.replace("|||LBR|||", "<br>")
html = bleachHtml(html)

# remove empty style attributes, try to pretty-format tags
cleaned = cleaned.replace('<div><br></div>', '<br>')
cleaned = cleaned.replace(' style=""', '').replace("\n", "")
cleaned = re.sub(brtags, r"\1\n\3", cleaned)
html = html.replace('<div><br></div>', '<br>')
html = html.replace(' style=""', '')
html = re.sub(brtags, r"\1\n\3", html)

return cleaned
return html


def onHtmlClean(self):
Expand All @@ -151,7 +165,7 @@ def onHtmlClean(self):


def onFieldUndo(self):
"""Executued on undo toggle"""
"""Executed on undo toggle"""
if not hasattr(self, "_fieldUndo") or not self._fieldUndo:
return
n, html = self._fieldUndo
Expand Down
92 changes: 67 additions & 25 deletions tools/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import bleach


# Htmllaundry depends on lxml which we cannot ship with this add-on
# Htmllaundry depends on lxml which we cannot ship on all platforms
# for that reason we have to check if we can import htmllaundry.
# If we can't we will skip using htmllaundry further down below
try:
Expand All @@ -17,11 +17,34 @@
except ImportError:
LAUNDROMAT = False


html = u"""<div><b>EXAMPLE 1</b></div>
<!--StartFragment--><span style='font-size:15.0pt;mso-bidi-font-size:12.0pt;font-family:""Times New Roman"",""serif"";color:black;mso-ansi-language:EN;mso-fareast-language:EN;mso-bidi-language: SW-AS'>LOREM DOLOR <b style=""mso-bidi-font-weight:normal"">SIT</b> <u>AMET</u></span><!--EndFragment-->
<div><b>EXAMPLE 2</b></div>
<span style="color: rgb(51, 51, 51);font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif;font-size: 18px">Lorem ipsum doler sit amet&nbsp;</span><a href="http://www.spiegel.de/" title="Lorem" class="text-link-int lp-text-link-int" style="color: rgb(153, 0, 0); font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif; font-size: 18px;">Lorem ipsum</a><span style="color: rgb(51, 51, 51); font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif; font-size: 18px;">&nbsp;ipsum dolore sit amet.</span>"""
<span style="color: rgb(51, 51, 51);font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif;font-size: 18px">Lorem ipsum doler sit amet&nbsp;</span><a href="http://www.spiegel.de/" title="Lorem" class="text-link-int lp-text-link-int" style="color: rgb(153, 0, 0); font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif; font-size: 18px;">Lorem ipsum</a><span style="color: rgb(51, 51, 51); font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif; font-size: 18px;">&nbsp;ipsum dolore sit amet.</span>
<div><b>EXAMPLE 3</b></div>
<!--StartFragment--><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:AR-SA">Überschrift</span><div><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:AR-SA"><br></span></div>
<div>Das ist der <b>erste</b>&nbsp;Beispielsatz an
dieser Stelle.</div>
<p class="MsoNormal" align="left" style="margin:0cm;margin-bottom:.0001pt;
text-align:left;text-indent:0cm;line-height:normal"><o:p></o:p></p>
<span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:
AR-SA">Das ist der zweite
Beispielsatz</span><!--EndFragment--><div><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:
AR-SA"><br></span></div>
<div><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:
AR-SA"><br></span></div>
<div><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:
AR-SA"><br></span></div>
<div><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:
AR-SA"><br></span></div>
<div><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:
AR-SA"><!--StartFragment--><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:AR-SA">Das ist ein
dritter Satz.&nbsp;</span><!--EndFragment--></span></div>
<div><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:
AR-SA"><br></span></div>
<div><span style="font-size:12.0pt;line-height:100%;font-family:&quot;Times New Roman&quot;,&quot;serif&quot;;mso-fareast-font-family:&quot;Times New Roman&quot;;mso-fareast-theme-font:minor-fareast;color:black;mso-ansi-language:DE;mso-fareast-language:DE;mso-bidi-language:
AR-SA"><br></span></div>
<div><img src="paste-921834426494981.jpg"></div>"""


# Html tags to preserve
Expand All @@ -31,6 +54,7 @@
'dft', 'br', 'table', 'tr', 'td', 'th', 'thead',
'tbody', 'tfoot', 'div', 'u', 'i']


# Tag attributes to preserve
keep_attrs = [ 'style', 'rev', 'prompt', 'color', 'colspan',
'usemap', 'cols', 'accept', 'datetime', 'char',
Expand All @@ -45,57 +69,75 @@
'value', 'longdesc', 'headers', 'vspace', 'noshade', 'coords', 'width',
'maxlength', 'cellpadding', 'title', 'dir', 'tabindex']


# Styles to preserve in the style attribute
keep_styles = ["margin-left"]


# insert linebreak after regex match
brtags = (r"(</(div|p|br|li|ul|ol|blockquote|tr|"
"table|thead|tfoot|tbody|h[1-9]|)>)([^\n])")
"table|thead|tfoot|tbody|h[1-9])>|<br>)([^\n])")

use_html_laundry = False


def laundryHtml(html):
"""Clean using htmllaundry/lxml"""
# docs: http://lxml.de/api/lxml.html.clean.Cleaner-class.html

cleaner = cleaners.LaundryCleaner(
page_structure=False,
remove_unknown_tags=False,
allow_tags = keep_tags,
safe_attrs = keep_attrs,
safe_attrs_only=True,
add_nofollow=False,
scripts=True,
javascript=True,
comments=True,
style=True,
links=False,
meta=False,
processing_instructions=True,
frames=False,
annoying_tags=True)
processing_instructions = True,
meta = True,
scripts = True,
comments = True,
javascript = True,
annoying_tags = True,
page_structure=False,
remove_unknown_tags=False,
safe_attrs_only = False,
add_nofollow = False,
style = False,
links = False,
frames = False)

return sanitize(html, cleaner)


def bleachHtml(html):
"""Clean using bleach/html5lib"""
# docs: https://bleach.readthedocs.io/en/latest/clean.html

cleaned = bleach.clean(html,
tags = keep_tags,
attributes = keep_attrs,
styles = keep_styles,
strip = True
)
strip = True)

return cleaned


def cleanHtml(html):
"""Clean HTML with cleaners and custom regexes"""
html = html.replace("\n", " ")
if LAUNDROMAT:
# both bleach and htmllaundry eat "<br />"...
html = html.replace("<br />", "<br>")

if use_html_laundry and LAUNDROMAT:
# lxml.clean will munch <br> tags for some reason, even though
# they're whitelisted. This is an ugly workaround, but it works.
html = html.replace("<br>", "|||LBR|||").replace("</br>", "|||LBR|||")
html = laundryHtml(html)
cleaned = bleachHtml(html)
html = html.replace("|||LBR|||", "<br>")
html = bleachHtml(html)

# remove empty style attributes, try to pretty-format tags
cleaned = cleaned.replace('<div><br></div>', '<br>')
cleaned = cleaned.replace(' style=""', '').replace("\n", "")
cleaned = re.sub(brtags, r"\1\n\3", cleaned)
html = html.replace('<div><br></div>', '<br>')
html = html.replace(' style=""', '')
html = re.sub(brtags, r"\1\n\3", html)

return cleaned
return html

cleaned = cleanHtml(html)

Expand Down

0 comments on commit 4135b96

Please sign in to comment.