Skip to content

Commit

Permalink
Various minor efficiency tweaks to improve perf on large documents.
Browse files Browse the repository at this point in the history
In my tests parsing the 7-megabyte single-page WHATWG HTML spec, these
tweaks resulted in a 1.3x performance improvement.
  • Loading branch information
rgrove committed May 19, 2014
1 parent e4def09 commit 93feeb3
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 43 deletions.
4 changes: 1 addition & 3 deletions lib/sanitize/transformers/clean_cdata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
class Sanitize; module Transformers

# Transformer that swaps a CDATA section node for a Nokogiri::XML::Text node
# built from the same text and owning document.
#
# env - Hash passed to the transformer. Only two keys are read here:
#       :node           - the node currently being visited.
#       :is_whitelisted - when truthy, the node is left untouched.
CleanCDATA = lambda do |env|
  return if env[:is_whitelisted]

  node = env[:node]

  # A single node-type comparison decides whether this is a CDATA section.
  if node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
    node.replace(Nokogiri::XML::Text.new(node.text, node.document))
  end
end
Expand Down
7 changes: 5 additions & 2 deletions lib/sanitize/transformers/clean_comment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
class Sanitize; module Transformers

  # Transformer that removes comment nodes.
  #
  # env - Hash passed to the transformer. Only two keys are read here:
  #       :node           - the node currently being visited.
  #       :is_whitelisted - when truthy, the node is left in place.
  CleanComment = lambda do |env|
    node = env[:node]

    # Test the node type first, so the whitelist flag is only consulted for
    # comment nodes and the node is unlinked at most once.
    if node.type == Nokogiri::XML::Node::COMMENT_NODE
      node.unlink unless env[:is_whitelisted]
    end
  end

end; end
7 changes: 5 additions & 2 deletions lib/sanitize/transformers/clean_doctype.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
class Sanitize; module Transformers

  # Transformer that removes DTD (doctype) nodes.
  #
  # env - Hash passed to the transformer. Only two keys are read here:
  #       :node           - the node currently being visited.
  #       :is_whitelisted - when truthy, the node is left in place.
  CleanDoctype = lambda do |env|
    node = env[:node]

    # Test the node type first, so the whitelist flag is only consulted for
    # DTD nodes and the node is unlinked at most once.
    if node.type == Nokogiri::XML::Node::DTD_NODE
      node.unlink unless env[:is_whitelisted]
    end
  end

end; end
75 changes: 39 additions & 36 deletions lib/sanitize/transformers/clean_element.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# encoding: utf-8

require 'set'

class Sanitize; module Transformers; class CleanElement

# Matches a valid HTML5 data attribute name. The unicode ranges included here
Expand All @@ -24,21 +26,28 @@ class Sanitize; module Transformers; class CleanElement
REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i

def initialize(config)
@config = config

# For faster lookups.
@add_attributes = config[:add_attributes]
@allowed_elements = Set.new(config[:elements])
@attributes = config[:attributes]
@attributes = config[:attributes].dup
@elements = Set.new(config[:elements])
@protocols = config[:protocols]
@remove_all_contents = false
@remove_element_contents = Set.new
@whitespace_elements = Hash.new
@whitespace_elements = {}

if @attributes.include?(:all)
@attributes[:all] = Set.new(@attributes[:all])
end

@attributes.each do |element_name, attrs|
unless element_name == :all
@attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
end
end

# Converting :whitespace_element into a Hash for backwards compatibility.
# Backcompat: if :whitespace_elements is an array, convert it to a hash.
if config[:whitespace_elements].is_a?(Array)
config[:whitespace_elements].each do |element|
@whitespace_elements[element] = { :before => ' ', :after => ' ' }
@whitespace_elements[element] = {:before => ' ', :after => ' '}
end
else
@whitespace_elements = config[:whitespace_elements]
Expand All @@ -55,10 +64,10 @@ def call(env)
name = env[:node_name]
node = env[:node]

return if env[:is_whitelisted] || !node.element?
return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]

# Delete any element that isn't in the config whitelist.
unless @allowed_elements.include?(name)
unless @elements.include?(name)
# Elements like br, div, p, etc. need to be replaced with whitespace in
# order to preserve readability.
if @whitespace_elements.include?(name)
Expand All @@ -77,21 +86,33 @@ def call(env)
return
end

attr_whitelist = Set.new((@attributes[name] || []) +
(@attributes[:all] || []))

allow_data_attributes = attr_whitelist.include?(:data)
attr_whitelist = @attributes[name] || @attributes[:all]

if attr_whitelist.empty?
if attr_whitelist.nil?
# Delete all attributes from elements with no whitelisted attributes.
node.attribute_nodes.each {|attr| attr.unlink }
else
allow_data_attributes = attr_whitelist.include?(:data)

# Delete any attribute that isn't allowed on this element.
node.attribute_nodes.each do |attr|
attr_name = attr.name.downcase

unless attr_whitelist.include?(attr_name)
# The attribute isn't explicitly whitelisted.
if attr_whitelist.include?(attr_name)
# The attribute is whitelisted.

# Remove any attributes that use unacceptable protocols.
if @protocols.include?(name) && @protocols[name].include?(attr_name)
attr_protocols = @protocols[name][attr_name]

if attr.value.to_s.downcase =~ REGEX_PROTOCOL
attr.unlink unless attr_protocols.include?($1.downcase)
else
attr.unlink unless attr_protocols.include?(:relative)
end
end
else
# The attribute isn't whitelisted.

if allow_data_attributes && attr_name.start_with?('data-')
# Arbitrary data attributes are allowed. Verify that the attribute
Expand All @@ -104,28 +125,10 @@ def call(env)
end
end
end

# Delete remaining attributes that use unacceptable protocols.
if @protocols.has_key?(name)
protocol = @protocols[name]

node.attribute_nodes.each do |attr|
attr_name = attr.name.downcase
next false unless protocol.has_key?(attr_name)

del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
!protocol[attr_name].include?($1.downcase)
else
!protocol[attr_name].include?(:relative)
end

attr.unlink if del
end
end
end

# Add required attributes.
if @add_attributes.has_key?(name)
if @add_attributes.include?(name)
@add_attributes[name].each {|key, val| node[key] = val }
end
end
Expand Down

0 comments on commit 93feeb3

Please sign in to comment.