Implement support for htmllaundry/lxml.cleaner
By passing the HTML code through both htmllaundry and bleach, we can achieve a much better result than with either of these libraries individually.
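A minimal sketch of the two-pass cleanup the message describes, assuming bleach is installed; the helper name clean_fragment and the tag/attribute whitelists are illustrative placeholders, not values taken from this commit:

import bleach
from htmllaundry import sanitize


def clean_fragment(fragment):
    # Pass 1: htmllaundry's lxml-based cleanup handles document structure
    # (scripts, empty tags, stray <br/> runs, unwrapped toplevel text).
    laundered = sanitize(fragment)
    # Pass 2: bleach enforces a strict whitelist as a safety net.
    return bleach.clean(
        laundered,
        tags={'p', 'a', 'em', 'strong', 'ul', 'ol', 'li', 'br'},
        attributes={'a': ['href', 'rel', 'target']},
        strip=True)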
1 parent 05ec38c · commit 88eff17 · 177 changed files with 79,353 additions and 25 deletions.
@@ -0,0 +1,26 @@
Copyright (c) 2010-2016, Wichert Akkerman
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
@@ -0,0 +1,29 @@
lxml is copyright Infrae and distributed under the BSD license (see
doc/licenses/BSD.txt), with the following exceptions:

Some code, such as selftest.py, selftest2.py and
src/lxml/_elementpath.py, is derived from ElementTree and
cElementTree. See doc/licenses/elementtree.txt for the license text.

lxml.cssselect and lxml.html are copyright Ian Bicking and distributed
under the BSD license (see doc/licenses/BSD.txt).

test.py, the test-runner script, is GPL and copyright Shuttleworth
Foundation. See doc/licenses/GPL.txt. It is believed the unchanged
inclusion of test.py to run the unit test suite falls under the
"aggregation" clause of the GPL and thus does not affect the license
of the rest of the package.

The isoschematron implementation uses several XSL and RelaxNG resources:
* The (XML syntax) RelaxNG schema for schematron, copyright International
  Organization for Standardization (see
  src/lxml/isoschematron/resources/rng/iso-schematron.rng for the license
  text)
* The skeleton iso-schematron-xslt1 pure-xslt schematron implementation
  xsl stylesheets, copyright Rick Jelliffe and Academia Sinica Computing
  Center, Taiwan (see the xsl files here for the license text:
  src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/)
* The xsd/rng schema schematron extraction xsl transformations are unlicensed
  and copyright the respective authors as noted (see
  src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl and
  src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl)
@@ -0,0 +1,6 @@
from htmllaundry.utils import sanitize
from htmllaundry.utils import strip_markup
from htmllaundry.utils import StripMarkup  # BBB alias for htmllaundry <2.0


__all__ = ['sanitize', 'strip_markup', 'StripMarkup']
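Thanks to these re-exports, callers can use the helpers straight off the package; the results in the comments follow from the utils implementation shown further below:

from htmllaundry import sanitize, strip_markup

sanitize(u'Hello, <strong>world</strong>')
# -> u'<p>Hello, <strong>world</strong></p>'

strip_markup(u'Hello, <strong>world</strong>')
# -> u'Hello, world'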
@@ -0,0 +1,77 @@
from lxml.html.clean import Cleaner
from lxml.html.clean import _find_external_links


# Unique sentinel object: lets us tell "link_target was never set" apart
# from an explicit link_target=None (which removes target attributes).
marker = []


class LaundryCleaner(Cleaner):
    """Cleaner that can also force a target attribute on external links."""

    link_target = marker

    def __call__(self, doc):
        super(LaundryCleaner, self).__call__(doc)
        if self.link_target is not marker:
            self.force_link_target(doc, self.link_target)

    def force_link_target(self, doc, target):
        for el in _find_external_links(doc):
            if target is None:
                if 'target' in el.attrib:
                    del el.attrib['target']
            else:
                el.set('target', target)


DocumentCleaner = LaundryCleaner(
    page_structure=False,
    remove_unknown_tags=False,
    allow_tags=['blockquote', 'a', 'img', 'em', 'p', 'strong',
                'h3', 'h4', 'h5', 'ul', 'ol', 'li', 'sub', 'sup',
                'abbr', 'acronym', 'dl', 'dt', 'dd', 'cite',
                'dfn', 'br', 'table', 'tr', 'td', 'th', 'thead',
                'tbody', 'tfoot'],
    safe_attrs_only=True,
    add_nofollow=True,
    scripts=True,
    javascript=True,
    comments=False,
    style=True,
    links=False,
    meta=False,
    processing_instructions=False,
    frames=False,
    annoying_tags=False)


# Useful for single-line fields such as titles.
LineCleaner = LaundryCleaner(
    page_structure=False,
    safe_attrs_only=True,
    remove_unknown_tags=False,  # Weird API..
    allow_tags=['em', 'strong'],
    add_nofollow=True,
    scripts=True,
    javascript=True,
    comments=False,
    style=True,
    processing_instructions=False,
    frames=False,
    annoying_tags=False)

CommentCleaner = LaundryCleaner(
    page_structure=False,
    safe_attrs_only=True,
    remove_unknown_tags=False,  # Weird API..
    allow_tags=['blockquote', 'a', 'em', 'p', 'strong'],
    add_nofollow=True,
    scripts=False,
    javascript=True,
    comments=False,
    style=True,
    processing_instructions=False,
    frames=False,
    annoying_tags=False,
    link_target="_blank")


__all__ = ['DocumentCleaner', 'LineCleaner', 'CommentCleaner']
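Each preset trades off how much markup survives; any of them can be passed to the sanitize() function defined in the utils module below. A short usage sketch (outputs are approximate):

from htmllaundry import sanitize
from htmllaundry.cleaners import CommentCleaner, LineCleaner

# Single-line fields: only <em>/<strong> survive; wrap=None skips <p> wrapping.
sanitize(u'<h1>Hello <em>world</em></h1>', LineCleaner, wrap=None)
# -> u'Hello <em>world</em>'

# Comments: small tag set; external links get rel="nofollow" and target="_blank".
sanitize(u'See <a href="http://example.com/">this</a>', CommentCleaner)
# -> roughly u'<p>See <a href="http://example.com/" rel="nofollow" target="_blank">this</a></p>'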
@@ -0,0 +1,178 @@
import re
import six
from lxml import etree
from lxml import html
from lxml.html import defs
from htmllaundry.cleaners import DocumentCleaner


INLINE_TAGS = defs.special_inline_tags | defs.phrase_tags | defs.font_style_tags
TAG = re.compile(six.u('<.*?>'))
ANCHORS = etree.XPath('descendant-or-self::a | descendant-or-self::x:a',
                      namespaces={'x': html.XHTML_NAMESPACE})
ALL_WHITESPACE = re.compile(r'^\s*$', re.UNICODE)


def is_whitespace(txt):
    """Utility method to test if txt is all whitespace or None."""
    return txt is None or bool(ALL_WHITESPACE.match(txt))


def strip_markup(markup):
    """Strip all markup from an HTML fragment."""
    return TAG.sub(six.u(""), markup)


StripMarkup = strip_markup  # BBB for htmllaundry <2.0


def remove_element(el):
    parent = el.getparent()
    if el.tail:
        previous = el.getprevious()
        if previous is not None:
            if previous.tail:
                previous.tail += el.tail
            else:
                previous.tail = el.tail
        else:
            if parent.text:
                parent.text += el.tail
            else:
                parent.text = el.tail

    parent.remove(el)


def remove_empty_tags(doc, extra_empty_tags=[]):
    """Remove all empty tags from an HTML document. JavaScript editors
    and browsers have a nasty habit of leaving stray tags around after
    their contents have been removed. This function removes all such
    empty tags, leaving only valid empty tags.

    In addition, consecutive <br/> tags are folded into a single tag.
    This forces whitespace styling to be done using CSS instead of via an
    editor, which almost always produces better and more consistent results.
    """
    empty_tags = set(['br', 'hr', 'img', 'input'])
    empty_tags.update(set(extra_empty_tags))
    legal_empty_tags = frozenset(empty_tags)

    if hasattr(doc, 'getroot'):
        doc = doc.getroot()

    def clean(doc):
        victims = []
        for el in doc.iter():
            if el.tag == 'br':
                preceding = el.getprevious()
                parent = el.getparent()

                if (preceding is None and not parent.text) or \
                        (preceding is not None and preceding.tag == el.tag
                         and not preceding.tail) or \
                        (not el.tail and el.getnext() is None):
                    victims.append(el)
                    continue

            if el.tag in legal_empty_tags:
                continue

            # Empty <a> can be used as anchor.
            if (el.tag == 'a') and (('name' in el.attrib) or ('id' in el.attrib)):
                continue

            if len(el) == 0 and is_whitespace(el.text):
                victims.append(el)
                continue

        if victims and victims[0] == doc:
            doc.clear()
            return 0
        else:
            for victim in victims:
                remove_element(victim)

        return len(victims)

    while clean(doc):
        pass

    return doc


def strip_outer_breaks(doc):
    """Remove any toplevel break elements."""
    victims = []

    for i in range(len(doc)):
        el = doc[i]
        if el.tag == 'br':
            victims.append(el)

    for victim in victims:
        remove_element(victim)


MARKER = 'LAUNDRY-INSERT'


def wrap_text(doc, element='p'):
    """Make sure there is no unwrapped text at the top level. Any bare text
    found is wrapped in a `<p>` element.
    """
    def par(text):
        el = etree.Element(element, {MARKER: ''})
        el.text = text
        return el

    if doc.text:
        doc.insert(0, par(doc.text))
        doc.text = None

    while True:
        for (i, el) in enumerate(doc):
            if html._nons(el.tag) in INLINE_TAGS and i and MARKER in doc[i - 1].attrib:
                doc[i - 1].append(el)
                break
            if not is_whitespace(el.tail):
                doc.insert(i + 1, par(el.tail))
                el.tail = None
                break
        else:
            break

    for el in doc:
        if MARKER in el.attrib:
            del el.attrib[MARKER]


def sanitize(input, cleaner=DocumentCleaner, wrap='p'):
    """Clean up markup using a given cleaner configuration.

    Unwrapped text will be wrapped with the `wrap` element.
    """
    if 'body' not in cleaner.allow_tags:
        cleaner.allow_tags.append('body')

    input = six.u("<html><body>%s</body></html>") % input
    document = html.document_fromstring(input)
    bodies = [e for e in document if html._nons(e.tag) == 'body']
    body = bodies[0]

    cleaned = cleaner.clean_html(body)
    remove_empty_tags(cleaned)
    strip_outer_breaks(cleaned)

    if wrap is not None:
        if wrap in html.defs.tags:
            wrap_text(cleaned, wrap)
        else:
            raise ValueError(
                'Invalid html tag provided for wrapping the sanitized text')

    output = six.u('').join([etree.tostring(fragment, encoding=six.text_type)
                             for fragment in cleaned.iterchildren()])
    if wrap is None and cleaned.text:
        output = cleaned.text + output

    return output
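A few behaviors worth noting, traced from the helpers above (outputs approximate):

from htmllaundry import sanitize

# remove_empty_tags() folds the <br/> run and drops the empty <p>:
sanitize(u'<p>Hello<br/><br/><br/>world</p><p></p>')
# -> u'<p>Hello<br/>world</p>'

# wrap_text() wraps bare toplevel text in the requested element:
sanitize(u'plain text', wrap='div')
# -> u'<div>plain text</div>'

# A wrapper that is not a known HTML tag is rejected:
sanitize(u'plain text', wrap='banana')
# raises ValueError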