Skip to content

Commit

Permalink
Faster String#squish (#1159)
Browse files Browse the repository at this point in the history
* Faster String#squish

  squish original ascii-only whitespace (600 bytes)  42.92k ( 23.30µs) (± 0.55%)  5.92kB/op   8.58× slower
 squish optimized ascii-only whitespace (600 bytes) 368.48k (  2.71µs) (± 3.49%)  1.19kB/op        fastest

  squish original w/ unicode whitespace (60 bytes) 528.12k (  1.89µs) (± 2.31%)  624B/op   3.39× slower
squish optimized w/ unicode whitespace (60 bytes)   1.79M (557.76ns) (± 2.61%)  225B/op        fastest

* Replace existing benchmark with one for String#squish per comment in

* Include original spec for String#squish
  • Loading branch information
jamescook authored May 30, 2020
1 parent 60a7077 commit d1a2352
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 13 deletions.
89 changes: 77 additions & 12 deletions bench.cr
Original file line number Diff line number Diff line change
@@ -1,21 +1,86 @@
require "benchmark"

text = <<-TEXT
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua
TEXT
class String
# Original
def squish_regex : String
gsub(/[[:space:]]+/, " ").strip
end

string_text = text * 40_000
io_text = IO::Memory.new(string_text)
def squish_new : String
if ascii_only?
squish_ascii
else
squish_unicode
end
end

Benchmark.ips do |x|
x.report("string") do
io = IO::Memory.new
def squish_simplified : String
squish_unicode
end

io.print(io_text.to_s)
# Benchmark copy of the ASCII fast path: collapses each run of ASCII
# whitespace into a single space and drops leading/trailing whitespace.
#
# The separator is written lazily (just before the next non-whitespace
# character), so no trailing blank is ever emitted and the result does not
# need a second pass through `strip`.
private def squish_ascii : String
  # Capacity is expressed in bytes; the squished text is never longer than
  # the receiver.
  String.build(bytesize) do |str|
    pending_blank = false
    seen_text = false
    each_char do |chr|
      if chr.ascii_whitespace?
        # Whitespace before the first visible character is dropped outright.
        pending_blank = seen_text
      else
        str << ' ' if pending_blank
        str << chr
        pending_blank = false
        seen_text = true
      end
    end
  end
end
x.report("io") do
io = IO::Memory.new

io.print(io_text)
# Benchmark copy of the Unicode-aware path: collapses each run of whitespace
# (as reported by `Char#whitespace?`) into a single space and drops
# leading/trailing whitespace.
#
# The separator is written lazily (just before the next non-whitespace
# character), so no trailing blank is ever emitted and the result does not
# need a second pass through `strip`.
private def squish_unicode : String
  # Capacity is expressed in bytes, so reserve `bytesize` rather than the
  # character count; multibyte text would otherwise force a reallocation.
  String.build(bytesize) do |str|
    pending_blank = false
    seen_text = false
    each_char do |chr|
      if chr.whitespace?
        # Whitespace before the first visible character is dropped outright.
        pending_blank = seen_text
      else
        str << ' ' if pending_blank
        str << chr
        pending_blank = false
        seen_text = true
      end
    end
  end
end
end

puts "Sanity check the return output is consistent:"
example = " f f\u00A0\u00A0\u00A0f f \n \t \v\v \f\f 11111 a l0* あ\u00A0\u00A0\u00A0 "
puts "String to squish " + example.inspect
puts "regex: " + example.squish_regex.inspect
puts "new: " + example.squish_new.inspect
puts "simplified: " + example.squish_simplified.inspect

# Original regex doesn't seem to work correctly with trailing unicode
if example.squish_regex != example.squish_new
puts "WARN: regex version does not match:"
puts "Regex: #{example.squish_regex.inspect}".ljust(50)
puts "Ours: #{example.squish_new.inspect}".ljust(50)
end

puts
puts "Benchmarking String#squish ..."
puts
example = " f f f f \n \t\r\r 11111 a l0* " * 20
Benchmark.ips(warmup: 5, calculation: 10) do |x|
x.report("squish regex ascii-only whitespace (#{example.bytesize} bytes)") { example.squish_regex }
x.report("squish optimized ascii-only whitespace (#{example.bytesize} bytes)") { example.squish_new }
x.report("squish simplified ascii-only whitespace (#{example.bytesize} bytes)") { example.squish_simplified }
end

puts
example = "\u00A0あ\u00A0\u00A0 \t z \n \r \r \t \v \f zzzz XXX asdf k ; ;, \u1680\u1680 " * 10
Benchmark.ips(warmup: 5, calculation: 10) do |x|
x.report("squish regex w/ unicode whitespace (#{example.bytesize} bytes)") { example.squish_regex }
x.report("squish optimized w/ unicode whitespace (#{example.bytesize} bytes)") { example.squish_new }
x.report("squish simplified w/ unicode whitespace (#{example.bytesize} bytes)") { example.squish_simplified }
end
6 changes: 6 additions & 0 deletions spec/charms/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,11 @@ describe "String charm" do
og_string.squish.should eq("foo bar boo")
og_string.should eq(" foo bar \n \t boo")
end

it "squishes the text by removing ascii/unicode whitespace" do
og_string = "\u1680 \v\v\v\v\v\r\r\r\r hello foo bar\n\u00A0\t\t\u00A0\u1680\u1680 "

og_string.squish.should eq("hello foo bar")
end
end
end
41 changes: 40 additions & 1 deletion src/charms/string_extensions.cr
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,45 @@ class String
# replace newlines with a single space and convert multiple spaces to just one
# space.
def squish : String
gsub(/[[:space:]]+/, " ").strip
if ascii_only?
squish_ascii
else
squish_unicode
end
end

# Optimized for ASCII-only strings using Char#ascii_whitespace?
#
# Collapses each run of ASCII whitespace into a single space and drops
# leading/trailing whitespace. The separator is written lazily (just before
# the next non-whitespace character), so no trailing blank is ever emitted
# and the result does not need a second pass through `strip`.
private def squish_ascii : String
  # Capacity is expressed in bytes; the squished text is never longer than
  # the receiver.
  String.build(bytesize) do |str|
    pending_blank = false
    seen_text = false
    each_char do |chr|
      if chr.ascii_whitespace?
        # Whitespace before the first visible character is dropped outright.
        pending_blank = seen_text
      else
        str << ' ' if pending_blank
        str << chr
        pending_blank = false
        seen_text = true
      end
    end
  end
end

# Unicode-aware variant: collapses each run of whitespace (as reported by
# `Char#whitespace?`) into a single space and drops leading/trailing
# whitespace.
#
# The separator is written lazily (just before the next non-whitespace
# character), so no trailing blank is ever emitted and the result does not
# need a second pass through `strip`.
private def squish_unicode : String
  # Capacity is expressed in bytes, so reserve `bytesize` rather than the
  # character count; multibyte text would otherwise force a reallocation.
  String.build(bytesize) do |str|
    pending_blank = false
    seen_text = false
    each_char do |chr|
      if chr.whitespace?
        # Whitespace before the first visible character is dropped outright.
        pending_blank = seen_text
      else
        str << ' ' if pending_blank
        str << chr
        pending_blank = false
        seen_text = true
      end
    end
  end
end
end

0 comments on commit d1a2352

Please sign in to comment.