-
-
Notifications
You must be signed in to change notification settings - Fork 382
/
sanitization_filter.rb
135 lines (126 loc) · 5.98 KB
/
sanitization_filter.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
begin
require "sanitize"
rescue LoadError => _
raise HTML::Pipeline::MissingDependencyError, "Missing dependency 'sanitize' for SanitizationFilter. See README.md for details."
end
module HTML
class Pipeline
# HTML filter with sanization routines and whitelists. This module defines
# what HTML is allowed in user provided content and fixes up issues with
# unbalanced tags and whatnot.
#
# See the Sanitize docs for more information on the underlying library:
#
# https://github.com/rgrove/sanitize/#readme
#
# Context options:
# :whitelist - The sanitizer whitelist configuration to use. This
# can be one of the options constants defined in this
# class or a custom sanitize options hash.
# :anchor_schemes - The URL schemes to allow in <a href> attributes. The
# default set is provided in the ANCHOR_SCHEMES
# constant in this class. If passed, this overrides any
# schemes specified in the whitelist configuration.
#
# This filter does not write additional information to the context.
class SanitizationFilter < Filter
LISTS = Set.new(%w(ul ol).freeze)
LIST_ITEM = 'li'.freeze
# List of table child elements. These must be contained by a <table> element
# or they are not allowed through. Otherwise they can be used to break out
# of places we're using tables to contain formatted user content (like pull
# request review comments).
TABLE_ITEMS = Set.new(%w(tr td th).freeze)
TABLE = 'table'.freeze
TABLE_SECTIONS = Set.new(%w(thead tbody tfoot).freeze)
# These schemes are the only ones allowed in <a href> attributes by default.
ANCHOR_SCHEMES = ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac'].freeze
# The main sanitization whitelist. Only these elements and attributes are
# allowed through by default.
WHITELIST = {
:elements => %w(
h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
div ins del sup sub p ol ul table thead tbody tfoot blockquote
dl dt dd kbd q samp var hr ruby rt rp li tr td th s strike summary details
),
:remove_contents => ['script'],
:attributes => {
'a' => ['href'],
'img' => ['src', 'longdesc'],
'div' => ['itemscope', 'itemtype'],
'blockquote' => ['cite'],
'del' => ['cite'],
'ins' => ['cite'],
'q' => ['cite'],
:all => ['abbr', 'accept', 'accept-charset',
'accesskey', 'action', 'align', 'alt', 'axis',
'border', 'cellpadding', 'cellspacing', 'char',
'charoff', 'charset', 'checked',
'clear', 'cols', 'colspan', 'color',
'compact', 'coords', 'datetime', 'dir',
'disabled', 'enctype', 'for', 'frame',
'headers', 'height', 'hreflang',
'hspace', 'ismap', 'label', 'lang',
'maxlength', 'media', 'method',
'multiple', 'name', 'nohref', 'noshade',
'nowrap', 'open', 'prompt', 'readonly', 'rel', 'rev',
'rows', 'rowspan', 'rules', 'scope',
'selected', 'shape', 'size', 'span',
'start', 'summary', 'tabindex', 'target',
'title', 'type', 'usemap', 'valign', 'value',
'vspace', 'width', 'itemprop']
},
:protocols => {
'a' => {'href' => ANCHOR_SCHEMES},
'blockquote' => {'cite' => ['http', 'https', :relative]},
'del' => {'cite' => ['http', 'https', :relative]},
'ins' => {'cite' => ['http', 'https', :relative]},
'q' => {'cite' => ['http', 'https', :relative]},
'img' => {
'src' => ['http', 'https', :relative],
'longdesc' => ['http', 'https', :relative]
}
},
:transformers => [
# Top-level <li> elements are removed because they can break out of
# containing markup.
lambda { |env|
name, node = env[:node_name], env[:node]
if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
node.replace(node.children)
end
},
# Table child elements that are not contained by a <table> are removed.
lambda { |env|
name, node = env[:node_name], env[:node]
if (TABLE_SECTIONS.include?(name) || TABLE_ITEMS.include?(name)) && !node.ancestors.any? { |n| n.name == TABLE }
node.replace(node.children)
end
}
]
}
# A more limited sanitization whitelist. This includes all attributes,
# protocols, and transformers from WHITELIST but with a more locked down
# set of allowed elements.
LIMITED = WHITELIST.merge(
:elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))
# Strip all HTML tags from the document.
FULL = { :elements => [] }
# Sanitize markup using the Sanitize library.
def call
Sanitize.clean_node!(doc, whitelist)
end
# The whitelist to use when sanitizing. This can be passed in the context
# hash to the filter but defaults to WHITELIST constant value above.
def whitelist
whitelist = context[:whitelist] || WHITELIST
anchor_schemes = context[:anchor_schemes]
return whitelist unless anchor_schemes
whitelist = whitelist.dup
whitelist[:protocols] = (whitelist[:protocols] || {}).dup
whitelist[:protocols]['a'] = (whitelist[:protocols]['a'] || {}).merge('href' => anchor_schemes)
whitelist
end
end
end
end