-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathsanitize_test.py
156 lines (132 loc) · 7.95 KB
/
sanitize_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""unit tests for sanitize"""
__author__ = "Aaron Swartz <http://www.aaronsw.com/>"
__version__ = "0.33"
__license__ = "public domain"
_adaptedfrom_ = [
"http://www.iamcal.com/publish/articles/php/processing_html_part_2/",
"http://feedparser.org/tests/wellformed/sanitize/",
"http://alpha-geek.com/example/crazy_html2.html"]
import sanitize
def HTML(a, b, baseuri=None, addnofollow=False):
    """Check that sanitizing ``a`` produces exactly ``b``.

    Parameters:
        a: markup handed to ``sanitize.HTML``.
        b: the exact string the sanitizer is expected to return.
        baseuri: forwarded unchanged to ``sanitize.HTML``.
        addnofollow: forwarded unchanged to ``sanitize.HTML``.

    Raises:
        AssertionError: when the sanitized output differs from ``b``. The
            input, actual output and expected output are printed first.
    """
    # Sanitize exactly once so the "out:" line below shows the very value that
    # was compared (re-running without addnofollow could print something else).
    result = sanitize.HTML(a, baseuri=baseuri, addnofollow=addnofollow)
    if result == b:
        return
    # Single-argument print(...) behaves the same under Python 2 and Python 3.
    print(' in: ' + repr(a))
    print('out: ' + repr(result))
    print('exp: ' + repr(b))
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    raise AssertionError(
        'sanitize.HTML(%r) returned %r, expected %r' % (a, result, b))
## basics, balancing tags, trailing slashes
# Each pair is (raw markup, expected sanitized markup). Pairs are checked in
# order and the first mismatch aborts the script.
for _markup, _expected in [
    # basics
    ("", ""),
    ("hello", "hello"),
    # balancing tags
    ("<b>hello", "<b>hello</b>"),
    ("hello<b>", "hello<b></b>"),
    ("hello</b>", "hello"),
    ("hello<b/>", "hello<b></b>"),
    ("<b><b><b>hello", "<b><b><b>hello</b></b></b>"),
    ("</b><b>", "<b></b>"),
    # trailing slashes
    ('<img>', '<img />'),
    ('<img/>', '<img />'),
    ('<b/></b>', '<b></b>'),
]:
    HTML(_markup, _expected)
## balancing angle brackets
# Tags that are cut off before their closing `>` are expected to vanish from
# the output, while a stray `>` with no opening `<` is expected to remain.
# NOTE(review): elsewhere in this file HTML entity references appear to have
# been decoded during extraction; confirm against the upstream copy whether
# the expected values below were originally written with `&gt;`.
HTML('<img src="foo"','')
HTML('b>','b>')
HTML('<img src="foo"/','')
HTML('>','>')
HTML('foo<b','foo')
HTML('b>foo','b>foo')
HTML('><b','>')
HTML('b><','b>')
HTML('><b>','><b></b>')
## attributes
# (raw markup, expected sanitized markup): values get quoted, unknown
# attributes are dropped, and valueless attributes are expanded.
for _markup, _expected in [
    ('<img src=foo>', '<img src="foo" />'),
    ('<img asrc=foo>', '<img />'),
    ('<img src=test test>', '<img src="test" />'),
    ('<input type="checkbox" checked>', '<input type="checkbox" checked="checked" />'),
]:
    HTML(_markup, _expected)
## dangerous tags (a small sample)
def sHTML(x):
    """Check that ``x`` sanitizes to ``'safe <b>description</b>'``."""
    return HTML(x, 'safe <b>description</b>')


# Every sample wraps or follows the word "safe" with a tag that must be removed.
for _unsafe in [
    'safe<applet code="foo.class" codebase="http://example.com/"></applet> <b>description</b>',
    '<notinventedyet>safe</notinventedyet> <b>description</b>',
    '<blink>safe</blink> <b>description</b>',
    'safe<embed src="http://example.com/"> <b>description</b>',
    'safe<frameset rows="*"><frame src="http://example.com/"></frameset> <b>description</b>',
    'safe<iframe src="http://example.com/"> <b>description</b></iframe>',
    'safe<link rel="stylesheet" type="text/css" href="http://example.com/evil.css"> <b>description</b>',
    'safe<meta http-equiv="Refresh" content="0; URL=http://example.com/"> <b>description</b>',
    'safe<object classid="clsid:C932BA85-4374-101B-A56C-00AA003668DC"> <b>description</b>',
    'safe<script type="text/javascript">location.href=\'http:/\'+\'/example.com/\';</script> <b>description</b>',
]:
    sHTML(_unsafe)
# Inline event-handler attributes must be stripped, leaving only the src.
# ('onresize' replaces the former 'resize' entry, which was not an event
# handler name like every other item in the list.)
for handler in ['onabort', 'onblur', 'onchange', 'onclick', 'ondblclick',
                'onerror', 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup',
                'onload', 'onmousedown', 'onmouseout', 'onmouseover',
                'onmouseup', 'onreset', 'onresize', 'onsubmit', 'onunload']:
    HTML('<img src="http://www.ragingplatypus.com/i/cam-full.jpg" %s="location.href=\'http://www.ragingplatypus.com/\';" />' % handler,
         '<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />')
# An inline style attribute must be stripped while the href is kept.
HTML('<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>', '<a href="http://www.ragingplatypus.com/">never trust your upstream platypus</a>')
## ignorables
# A <style> element is expected to disappear together with its content.
for _markup in ['foo<style>bar', 'foo<style>bar</style>']:
    HTML(_markup, 'foo')

## non-allowed tags
# Every variation on a <script> tag is expected to sanitize to nothing.
for _markup in [
    '<script>',
    '<script',
    '<script/>',
    '</script>',
    '<script woo=yay>',
    '<script woo="yay">',
    '<script woo="yay>',
    '<script woo="yay<b>',
    '<script<script>>',
    '<<script>script<script>>',
    '<<script><script>>',
    '<<script>script>>',
    '<<script<script>>',
    '<scr\0ipt>',
]:
    HTML(_markup, '')
## bad protocols (a small sample)
# Links with these targets are expected to come back unchanged.
for _target in ['http://foo', 'ftp://foo', 'mailto:foo']:
    _link = '<a href="%s">bar</a>' % _target
    HTML(_link, _link)

# Links using any of these schemes are expected to be rewritten to "#foo".
for _scheme in [
    'javascript',
    'java script',
    'java\tscript',
    'java\nscript',
    'java' + chr(1) + 'script',
    'jscript',
    'vbscript',
    'view-source',
    'notinventedyet',
]:
    HTML('<a href="%s:foo">bar</a>' % _scheme, '<a href="#foo">bar</a>')
## base uris
# The relative href "foo" is expected to resolve to the same absolute URL for
# each of these base URIs.
for _base in ['http://baz.net', 'http://baz.net/', 'http://baz.net/goo']:
    HTML('<a href="foo">bar</a>', '<a href="http://baz.net/foo">bar</a>', baseuri=_base)

# The same resolution is expected for an image src.
HTML('<img src="foo" />', '<img src="http://baz.net/foo" />', baseuri='http://baz.net')
## auto closers and comments
# Each pair is (raw markup, expected sanitized markup).
for _markup, _expected in [
    # auto closers: <img> is emitted self-closed and a stray </img> is dropped
    ('<img src="a">', '<img src="a" />'),
    ('<img src="a">foo</img>', '<img src="a" />foo'),
    ('</img>', ''),
    # comments: a complete comment is kept, an unterminated one is dropped
    ('<!--foo-->', '<!--foo-->'),
    ('doo<!--', 'doo'),
]:
    HTML(_markup, _expected)
## crazy: http://alpha-geek.com/example/crazy_html2.html
# Stress case: a complete XHTML document with <script> tags split across
# lines, script-like text inside HTML and JavaScript comments, and attributes
# (including onload/onmouseover handlers) jammed together without whitespace.
# The expected string keeps the title text, both HTML comments, the script
# text up to the "/* " marker, the <h1>, an attribute-less <div>, the <br />
# and the trailing prose, with "\r\n" line endings turned into "\n".
# NOTE(review): other literals in this file look entity-decoded by extraction;
# confirm whether "script>style>this." in the expected value was originally
# written with `&gt;`.
HTML('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n<head>\r\n<title>Crazy HTML -- Can Your Regex Parse This?</title>\r\n</head>\r\n<body notRealAttribute="value"onload="executeMe();"foo="bar"\r\n\r\n>\r\n<!-- <script> -->\r\n\r\n<!-- \r\n\t<script> \r\n-->\r\n\r\n</script>\r\n\r\n\r\n<script\r\n\r\n\r\n>\r\n\r\nfunction executeMe()\r\n{\r\n\r\n\r\n\r\n\r\n/* <script> \r\nfunction am_i_javascript()\r\n{\r\n\tvar str = "Some innocuously commented out stuff";\r\n}\r\n< /script>\r\n*/\r\n\r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\talert("Executed");\r\n}\r\n\r\n </script\r\n\r\n\r\n\r\n>\r\n<h1>Did The Javascript Execute?</h1>\r\n<div notRealAttribute="value\r\n"onmouseover="\r\nexecuteMe();\r\n"foo="bar">\r\nI will execute here, too, if you mouse over me\r\n</div>\r\nThis is to keep you guys honest...<br />\r\nI like ontonology. I like to script ontology. Though, script>style>this.\r\n</body>\r\n</html>', 'Crazy HTML -- Can Your Regex Parse This?\n\n\n<!-- <script> -->\n\n<!-- \n\t<script> \n-->\n\n\n\nfunction executeMe()\n{\n\n\n\n\n/* \n<h1>Did The Javascript Execute?</h1>\n<div>\nI will execute here, too, if you mouse over me\n</div>\nThis is to keep you guys honest...<br />\nI like ontonology. I like to script ontology. Though, script>style>this.')
## valid entity references
# NOTE(review): the four calls below are textually identical, which suggests
# that distinct entity spellings (named / decimal / hex forms, presumably) were
# decoded to the same character when this file was extracted. Restore the
# literals from the upstream copy before relying on this section.
HTML(" "," ")
HTML(" "," ")
HTML(" "," ")
HTML(" "," ")
## unescaped ampersands
# NOTE(review): input and expected value are identical here although the
# heading is about unescaped ampersands; the expected values presumably
# carried an escaped ampersand originally -- confirm against upstream.
HTML("AT&T","AT&T")
HTML("http://example.org?a=1&b=2","http://example.org?a=1&b=2")
## attribute values
# NOTE(review): several expected values below contain a bare double quote
# inside a double-quoted attribute, and the "foo bar" / "foo<bar" calls are
# repeated verbatim; both look like decoded entity references -- confirm
# against upstream.
HTML('<a title="&">amp</a>','<a title="&">amp</a>')
HTML('<a title="AT&T">test</a>', '<a title="AT&T">test</a>')
HTML('<a title=\'"foo"\'>test</a>', '<a title=""foo"">test</a>')
HTML('<a title=""foo"">test</a>', '<a title=""foo"">test</a>')
HTML('<a title="foo bar">test</a>', '<a title="foo bar">test</a>')
HTML('<a title="foo bar">test</a>', '<a title="foo bar">test</a>')
HTML('<a title="foo bar">test</a>', '<a title="foo bar">test</a>')
HTML('<a title="foo bar">test</a>', '<a title="foo bar">test</a>')
HTML('<a title="foo<bar">test</a>', '<a title="foo<bar">test</a>')
HTML('<a title="foo<bar">test</a>', '<a title="foo<bar">test</a>')
## quote characters
# NOTE(review): the bare double quote inside the attribute below looks like a
# decoded entity reference (presumably an escaped quote); confirm the original
# literals against the upstream copy.
HTML('<a title=""">quote</a>','<a title=""">quote</a>')
# The apostrophe in the input is backslash-escaped: left bare it terminated the
# string literal early and made the whole file a SyntaxError.
# NOTE(review): the input may originally have spelled the apostrophe as an
# entity reference; the expected output is unchanged.
HTML('<a title="\'">quote</a>','<a title="\'">quote</a>')
## adding nofollow
# With addnofollow=True both links are expected to come back carrying
# rel="nofollow"; the second one starts out with a different rel value.
_nofollowed = '<a href="http://spam.example.org/" rel="nofollow">viagra</a>'
for _link in [
    '<a href="http://spam.example.org/">viagra</a>',
    '<a rel="foo" href="http://spam.example.org/">viagra</a>',
]:
    HTML(_link, _nofollowed, addnofollow=True)