diff --git a/bench.cr b/bench.cr index d738b77c0..fc92e2bcf 100644 --- a/bench.cr +++ b/bench.cr @@ -1,21 +1,86 @@ require "benchmark" -text = <<-TEXT -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua -TEXT +class String + # Original + def squish_regex : String + gsub(/[[:space:]]+/, " ").strip + end -string_text = text * 40_000 -io_text = IO::Memory.new(string_text) + def squish_new : String + if ascii_only? + squish_ascii + else + squish_unicode + end + end -Benchmark.ips do |x| - x.report("string") do - io = IO::Memory.new + def squish_simplified : String + squish_unicode + end - io.print(io_text.to_s) + private def squish_ascii : String + String.build(size) do |str| + print_blank = false + each_char do |chr| + if chr.ascii_whitespace? + if print_blank + str << ' ' + print_blank = false + end + else + print_blank = true + str << chr + end + end + end.strip end - x.report("io") do - io = IO::Memory.new - io.print(io_text) + private def squish_unicode : String + String.build(size) do |str| + print_blank = false + each_char do |chr| + if chr.whitespace? + if print_blank + str << ' ' + print_blank = false + end + else + print_blank = true + str << chr + end + end + end.strip end end + +puts "Sanity check the return output is consistent:" +example = " f f\u00A0\u00A0\u00A0f f \n \t \v\v \f\f 11111 a l0* あ\u00A0\u00A0\u00A0 " +puts "String to squish " + example.inspect +puts "regex: " + example.squish_regex.inspect +puts "new: " + example.squish_new.inspect +puts "simplified: " + example.squish_simplified.inspect + +# Original regex doesn't seem to work correctly with trailing unicode +if example.squish_regex != example.squish_new + puts "WARN: regex version does not match:" + puts "Regex: #{example.squish_regex.inspect}".ljust(50) + puts "Ours: #{example.squish_new.inspect}".ljust(50) +end + +puts +puts "Benchmarking String#squish ..." 
+puts +example = " f f f f \n \t\r\r 11111 a l0* " * 20 +Benchmark.ips(warmup: 5, calculation: 10) do |x| + x.report("squish regex ascii-only whitespace (#{example.bytesize} bytes)") { example.squish_regex } + x.report("squish optimized ascii-only whitespace (#{example.bytesize} bytes)") { example.squish_new } + x.report("squish simplified ascii-only whitespace (#{example.bytesize} bytes)") { example.squish_simplified } +end + +puts +example = "あ\u00A0あ\u00A0\u00A0 \t z \n \r \r \t \v \f zzzz XXX asdf k ; ;, \u1680\u1680 " * 10 +Benchmark.ips(warmup: 5, calculation: 10) do |x| + x.report("squish regex w/ unicode whitespace (#{example.bytesize} bytes)") { example.squish_regex } + x.report("squish optimized w/ unicode whitespace (#{example.bytesize} bytes)") { example.squish_new } + x.report("squish simplified w/ unicode whitespace (#{example.bytesize} bytes)") { example.squish_simplified } +end diff --git a/spec/charms/string_spec.cr b/spec/charms/string_spec.cr index ffc8c3745..882536820 100644 --- a/spec/charms/string_spec.cr +++ b/spec/charms/string_spec.cr @@ -8,5 +8,11 @@ describe "String charm" do og_string.squish.should eq("foo bar boo") og_string.should eq(" foo bar \n \t boo") end + + it "squishes the text by removing ascii/unicode whitespace" do + og_string = "\u1680 \v\v\v\v\v\r\r\r\r hello foo bar\n\u00A0\t\t\u00A0\u1680\u1680 " + + og_string.squish.should eq("hello foo bar") + end end end diff --git a/src/charms/string_extensions.cr b/src/charms/string_extensions.cr index 4c6f039d9..1eb866a3e 100644 --- a/src/charms/string_extensions.cr +++ b/src/charms/string_extensions.cr @@ -10,6 +10,45 @@ class String # replace newlines with a single space and convert mutiple spaces to just one # space. def squish : String - gsub(/[[:space:]]+/, " ").strip + if ascii_only? + squish_ascii + else + squish_unicode + end + end + + # Optimized for ASCII using Char#ascii_whitespace? 
+ private def squish_ascii : String + String.build(size) do |str| + print_blank = false + each_char do |chr| + if chr.ascii_whitespace? + if print_blank + str << ' ' + print_blank = false + end + else + print_blank = true + str << chr + end + end + end.strip + end + + private def squish_unicode : String + String.build(size) do |str| + print_blank = false + each_char do |chr| + if chr.whitespace? + if print_blank + str << ' ' + print_blank = false + end + else + print_blank = true + str << chr + end + end + end.strip end end