From e011676535661534fe0bb73b9f7d6b864d2437c3 Mon Sep 17 00:00:00 2001 From: Jose Colella Date: Wed, 8 May 2024 12:01:59 -0400 Subject: [PATCH 1/4] test: Get basic test case for scrubber --- test/integration/test_scrubbers.rb | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/integration/test_scrubbers.rb b/test/integration/test_scrubbers.rb index b9401b6..7d987de 100644 --- a/test/integration/test_scrubbers.rb +++ b/test/integration/test_scrubbers.rb @@ -47,6 +47,9 @@ class IntegrationTestScrubbers < Loofah::TestCase ENTITY_HACK_ATTACK_TEXT_SCRUB = "Hack attack!<script>alert('evil')</script>" ENTITY_HACK_ATTACK_TEXT_SCRUB_UNESC = "Hack attack!" + BREAKPOINT_FRAGMENT = "<p>Some text here in a logical paragraph.<br><br>Some more text, apparently a second paragraph.<br><br>Et cetera...</p>" + BREAKPOINT_RESULT = "<p>Some text here in a logical paragraph.</p><p>Some more text, apparently a second paragraph.</p><p>Et cetera...</p>" + context "scrubbing shortcuts" do context "#scrub_document" do it "is a shortcut for parse-and-scrub" do @@ -225,6 +228,16 @@ def html5? assert_equal doc, result end end + + context ":double_breakpoint" do + it "replaces double line breaks with paragraph tags" do + doc = klass.parse(BREAKPOINT_FRAGMENT) + result = doc.scrub!(:double_breakpoint) + + assert_equal BREAKPOINT_RESULT, doc.xpath("/html/body").inner_html + assert_equal doc, result + end + end end context "#text" do From 26dd83794c26296bc0f9b682188b400c36eec278 Mon Sep 17 00:00:00 2001 From: Jose Colella Date: Wed, 8 May 2024 12:23:17 -0400 Subject: [PATCH 2/4] initial scaffold --- lib/loofah/scrubbers.rb | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/loofah/scrubbers.rb b/lib/loofah/scrubbers.rb index 9dd8869..2435a7b 100644 --- a/lib/loofah/scrubbers.rb +++ b/lib/loofah/scrubbers.rb @@ -348,6 +348,24 @@ def scrub(node) end end + # + # === scrub!(:double_breakpoint) + # + # +:double_breakpoint+ replaces double-break tags with closing/opening paragraph tags. + # + # double_breakpoint_markup = "

<p>Some text here in a logical paragraph.<br><br>Some more text, apparently a second paragraph.</p>" + # Loofah.html5_fragment(double_breakpoint_markup).scrub!(:double_breakpoint) + # => "<p>Some text here in a logical paragraph.</p><p>Some more text, apparently a second paragraph.</p>" + # + class DoubleBreakpoint < Scrubber + def initialize # rubocop:disable Lint/MissingSuper + @direction = :top_down + end + + def scrub(node) + + end + end # # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune). # @@ -362,6 +380,7 @@ def scrub(node) targetblank: TargetBlank, newline_block_elements: NewlineBlockElements, unprintable: Unprintable, + double_breakpoint: DoubleBreakpoint, } class << self From 9955c245272bd1b398dc9067316dda66ddb7e10a Mon Sep 17 00:00:00 2001 From: Jose Colella Date: Wed, 8 May 2024 14:55:40 -0400 Subject: [PATCH 3/4] feat: Add implementation for method --- lib/loofah/scrubbers.rb | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/lib/loofah/scrubbers.rb b/lib/loofah/scrubbers.rb index 2435a7b..d58847c 100644 --- a/lib/loofah/scrubbers.rb +++ b/lib/loofah/scrubbers.rb @@ -363,7 +363,40 @@ def initialize # rubocop:disable Lint/MissingSuper end def scrub(node) - + return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "p") + + paragraph_with_break_point_nodes = node.xpath("self::p[br[following-sibling::br]]") + + paragraph_with_break_point_nodes.each do |paragraph_node| + new_paragraph = paragraph_node.add_previous_sibling("<p></p>").first + + paragraph_node.children.each do |child| + remove_blank_text_nodes(child) + end + + paragraph_node.children.each do |child| + # already unlinked as the second <br> of a pair + next if child.parent.nil? + + if child.name == "br" && child.next_sibling&.name == "br" # a trailing <br> has no next sibling + new_paragraph = paragraph_node.add_previous_sibling("<p></p>").first + child.next_sibling.unlink + child.unlink + else + child.parent = new_paragraph + end + end + + paragraph_node.unlink + end + + CONTINUE + end + + private + + def remove_blank_text_nodes(node) + node.unlink if node.text? && node.blank? + end end # From 59aac527d90f35d70e84349dfb4c85a504c0c369 Mon Sep 17 00:00:00 2001 From: Jose Colella Date: Wed, 8 May 2024 14:59:56 -0400 Subject: [PATCH 4/4] docs: Update documentation --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ac669bd..9088f65 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Active Record extensions for HTML sanitization are available in the [`loofah-act * Add the _nofollow_ attribute to all hyperlinks. * Add the _target=\_blank_ attribute to all hyperlinks. * Remove _unprintable_ characters from text nodes. + * Replace _double line breaks_ with paragraph nodes. * Format markup as plain text, with (or without) sensible whitespace handling around block elements. * Replace Rails's `strip_tags` and `sanitize` view helper methods. @@ -235,6 +236,7 @@ doc.scrub!(:noopener) # adds rel="noopener" attribute to links doc.scrub!(:noreferrer) # adds rel="noreferrer" attribute to links doc.scrub!(:unprintable) # removes unprintable characters from text nodes doc.scrub!(:targetblank) # adds target="_blank" attribute to links +doc.scrub!(:double_breakpoint) # replaces double line breaks with paragraph tags ``` See `Loofah::Scrubbers` for more details and example usage.