Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set is much faster than array for checking uniqueness #52

Merged
merged 1 commit into from
Apr 29, 2017

Conversation

jrafanie
Copy link
Contributor

Specifically, when we call Array#include? to check if we've already
visited a node, Set is much faster especially as the number of nodes
visited increases.

It's probably neglible for most use cases but there is a difference.

Benchmark results: Array is between 1.42x and 3.57x slower than Set
in DeepDup with ruby 2.3.4.

$ ruby benchmark.rb
Warming up --------------------------------------
Small hash - Array way
                        33.169k i/100ms
Small hash - Set way    25.883k i/100ms
Calculating -------------------------------------
Small hash - Array way
                        379.253k (± 3.3%) i/s -      1.924M in   5.078544s
Small hash - Set way    266.919k (± 7.9%) i/s -      1.346M in   5.077567s

Comparison:
Small hash - Array way:   379253.4 i/s
Small hash - Set way:   266919.4 i/s - 1.42x  slower

Warming up --------------------------------------
Large hash - Array way
                        20.000  i/100ms
Large hash - Set way    67.000  i/100ms
Calculating -------------------------------------
Large hash - Array way
                        218.159  (± 4.6%) i/s -      1.100k in   5.052188s
Large hash - Set way    779.445  (± 8.0%) i/s -      3.886k in   5.019390s

Comparison:
Large hash - Set way:      779.4 i/s
Large hash - Array way:      218.2 i/s - 3.57x  slower

Benchmark script:

require 'yaml'
require "net/http"
require "uri"

uri = URI.parse("https://raw.githubusercontent.com/ManageIQ/manageiq/95130f3360a4abd82128ad2c23550d837799bb45/locale/en.yml")
response = Net::HTTP.get_response(uri)

large_hash = YAML.load(response.body)
small_hash = { :a => { :b => 'c' } }

require "benchmark/ips"
require 'set'

class DeepDupOriginal
  def initialize(opts={})
    @recurse_over_arrays = opts.fetch(:recurse_over_arrays, false)
    @preserve_original_keys = opts.fetch(:preserve_original_keys, false)
  end

  def call(obj)
    deep_dup(obj)
  end

  private

  def deep_dup(obj, visited=[])
    if obj.is_a?(Hash)
      obj.each_with_object({}) do |(key, value), h|
        h[@preserve_original_keys ? key : key.to_sym] = value_or_deep_dup(value, visited)
      end
    elsif obj.is_a?(Array) && @recurse_over_arrays
      obj.each_with_object([]) do |value, arr|
        value = value.is_a?(RecursiveOpenStruct) ? value.to_h : value
        arr << value_or_deep_dup(value, visited)
      end
    else
      obj
    end
  end

  def value_or_deep_dup(value, visited)
    obj_id = value.object_id
    visited.include?(obj_id) ? value : deep_dup(value, visited << obj_id)
  end
end

class DeepDupSet
  def initialize(opts={})
    @recurse_over_arrays = opts.fetch(:recurse_over_arrays, false)
    @preserve_original_keys = opts.fetch(:preserve_original_keys, false)
  end

  def call(obj)
    deep_dup(obj)
  end

  private

  def deep_dup(obj, visited=Set.new)
    if obj.is_a?(Hash)
      obj.each_with_object({}) do |(key, value), h|
        h[@preserve_original_keys ? key : key.to_sym] = value_or_deep_dup(value, visited)
      end
    elsif obj.is_a?(Array) && @recurse_over_arrays
      obj.each_with_object([]) do |value, arr|
        value = value.is_a?(RecursiveOpenStruct) ? value.to_h : value
        arr << value_or_deep_dup(value, visited)
      end
    else
      obj
    end
  end

  def value_or_deep_dup(value, visited)
    obj_id = value.object_id
    visited.include?(obj_id) ? value : deep_dup(value, visited << obj_id)
  end
end

Benchmark.ips do |x|
  x.report("Small hash - Array way") { DeepDupOriginal.new.call(small_hash)}
  x.report("Small hash - Set way")   { DeepDupSet.new.call(small_hash)}
  x.compare!
end

Benchmark.ips do |x|
  x.report("Large hash - Array way") { DeepDupOriginal.new.call(large_hash)}
  x.report("Large hash - Set way")   { DeepDupSet.new.call(large_hash)}
  x.compare!
end

Specifically, when we call Array#include? to check if we've already
visited a node, Set is much faster especially as the number of nodes
visited increases.

It's probably neglible for most use cases but there is a difference.

Benchmark results:  Array is between 1.42x and 3.57x slower than Set
in DeepDup with ruby 2.3.4.

```
$ ruby benchmark.rb
Warming up --------------------------------------
Small hash - Array way
                        33.169k i/100ms
Small hash - Set way    25.883k i/100ms
Calculating -------------------------------------
Small hash - Array way
                        379.253k (± 3.3%) i/s -      1.924M in   5.078544s
Small hash - Set way    266.919k (± 7.9%) i/s -      1.346M in   5.077567s

Comparison:
Small hash - Array way:   379253.4 i/s
Small hash - Set way:   266919.4 i/s - 1.42x  slower

Warming up --------------------------------------
Large hash - Array way
                        20.000  i/100ms
Large hash - Set way    67.000  i/100ms
Calculating -------------------------------------
Large hash - Array way
                        218.159  (± 4.6%) i/s -      1.100k in   5.052188s
Large hash - Set way    779.445  (± 8.0%) i/s -      3.886k in   5.019390s

Comparison:
Large hash - Set way:      779.4 i/s
Large hash - Array way:      218.2 i/s - 3.57x  slower
```

Benchmark script:

```
require 'yaml'
require "net/http"
require "uri"

uri = URI.parse("https://raw.githubusercontent.com/ManageIQ/manageiq/95130f3360a4abd82128ad2c23550d837799bb45/locale/en.yml")
response = Net::HTTP.get_response(uri)

large_hash = YAML.load(response.body)
small_hash = { :a => { :b => 'c' } }

require "benchmark/ips"
require 'set'

class DeepDupOriginal
  def initialize(opts={})
    @recurse_over_arrays = opts.fetch(:recurse_over_arrays, false)
    @preserve_original_keys = opts.fetch(:preserve_original_keys, false)
  end

  def call(obj)
    deep_dup(obj)
  end

  private

  def deep_dup(obj, visited=[])
    if obj.is_a?(Hash)
      obj.each_with_object({}) do |(key, value), h|
        h[@preserve_original_keys ? key : key.to_sym] = value_or_deep_dup(value, visited)
      end
    elsif obj.is_a?(Array) && @recurse_over_arrays
      obj.each_with_object([]) do |value, arr|
        value = value.is_a?(RecursiveOpenStruct) ? value.to_h : value
        arr << value_or_deep_dup(value, visited)
      end
    else
      obj
    end
  end

  def value_or_deep_dup(value, visited)
    obj_id = value.object_id
    visited.include?(obj_id) ? value : deep_dup(value, visited << obj_id)
  end
end

class DeepDupSet
  def initialize(opts={})
    @recurse_over_arrays = opts.fetch(:recurse_over_arrays, false)
    @preserve_original_keys = opts.fetch(:preserve_original_keys, false)
  end

  def call(obj)
    deep_dup(obj)
  end

  private

  def deep_dup(obj, visited=Set.new)
    if obj.is_a?(Hash)
      obj.each_with_object({}) do |(key, value), h|
        h[@preserve_original_keys ? key : key.to_sym] = value_or_deep_dup(value, visited)
      end
    elsif obj.is_a?(Array) && @recurse_over_arrays
      obj.each_with_object([]) do |value, arr|
        value = value.is_a?(RecursiveOpenStruct) ? value.to_h : value
        arr << value_or_deep_dup(value, visited)
      end
    else
      obj
    end
  end

  def value_or_deep_dup(value, visited)
    obj_id = value.object_id
    visited.include?(obj_id) ? value : deep_dup(value, visited << obj_id)
  end
end

Benchmark.ips do |x|
  x.report("Small hash - Array way") { DeepDupOriginal.new.call(small_hash)}
  x.report("Small hash - Set way")   { DeepDupSet.new.call(small_hash)}
  x.compare!
end

Benchmark.ips do |x|
  x.report("Large hash - Array way") { DeepDupOriginal.new.call(large_hash)}
  x.report("Large hash - Set way")   { DeepDupSet.new.call(large_hash)}
  x.compare!
end
```
@aetherknight
Copy link
Owner

Looks good. I'll get it merged in and cut a new release in a moment.

@aetherknight aetherknight merged commit e5ad1bb into aetherknight:master Apr 29, 2017
@jrafanie jrafanie deleted the use_set_for_uniqueness branch May 8, 2017 18:32
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants