Skip to content

Commit

Permalink
Add bucketize summarizer.
Browse files Browse the repository at this point in the history
  • Loading branch information
blambeau committed Jun 26, 2024
1 parent a8a57f9 commit da8f4d2
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
`Database#to_data_folder` and `Database#to_xlsx` dump methods.
See README for details.

* Add `Summarizer.bucketize` to distribute attribute values across a number of
buckets. We support the `:size`, `:boundaries`, `:value_length` and `:distinct` options.

## 0.22.0 - 2024-05-17

* Add the `minus` operation (also known as set difference, or EXCEPT in SQL).
Expand Down
1 change: 1 addition & 0 deletions lib/bmg/summarizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,4 @@ def extract_value(tuple)
require_relative 'summarizer/positional'
require_relative 'summarizer/first'
require_relative 'summarizer/last'
require_relative 'summarizer/bucketize'
82 changes: 82 additions & 0 deletions lib/bmg/summarizer/bucketize.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
module Bmg
  class Summarizer
    #
    # Bucketize summarizer.
    #
    # Collects every value seen for the summarized attribute, sorts them and
    # distributes them into at most `:size` buckets, each one being a Range
    # going from the smallest to the largest value it holds.
    #
    # Options read by this class:
    #
    # - :size          number of buckets to aim for (a positive Integer).
    # - :boundaries    when :touching, every bucket starts where the previous
    #                  one ends; ranges are then end-exclusive, except the
    #                  last one. Any other value keeps separate, inclusive
    #                  ranges.
    # - :value_length  when set, values are converted with `to_s` and cut to
    #                  that many characters (after sorting).
    # - :distinct      when truthy, duplicate values are removed before the
    #                  buckets are computed.
    #
    # Example:
    #
    #   # direct ruby usage
    #   Bmg::Summarizer.bucketize(:qty, :size => 2).summarize(...)
    #
    class Bucketize < Summarizer

      # Default options: aim for 10 buckets.
      # NOTE(review): presumably merged with the user options by Summarizer,
      # which is not visible from this file -- confirm.
      def default_options
        { :size => 10 }
      end

      # Returns the initial memo: a pair of arrays. Only the first one is
      # used by this class; it accumulates the values seen so far.
      def least()
        [[], []]
      end

      # Appends `val` to the values collected in `memo`, then returns `memo`.
      def _happens(memo, val)
        memo.first << val
        memo
      end

      # Finalizes computation: turns the collected values into an Array of
      # Range, rewritten as touching ranges when options[:boundaries] is
      # :touching. Returns an empty Array when no value was collected.
      def finalize(memo)
        buckets = compute_buckets(memo.first, options[:size])
        buckets = touching_buckets(buckets) if options[:boundaries] == :touching
        buckets
      end

    private

      # Sorts `values` (then applies :value_length and :distinct) and slices
      # them in consecutive groups of equal size, at most `num_buckets` of
      # them. Each group becomes the inclusive Range first..last.
      #
      # Raises ArgumentError unless `num_buckets` is a positive Integer.
      def compute_buckets(values, num_buckets = 10)
        unless num_buckets.is_a?(Integer) && num_buckets > 0
          raise ArgumentError, "Bucketize :size must be a positive integer, got #{num_buckets.inspect}"
        end

        sorted_values = values.sort
        if (length = options[:value_length])
          sorted_values = sorted_values.map { |v| v.to_s[0...length] }
        end
        sorted_values = sorted_values.uniq if options[:distinct]

        # Nothing to distribute (also avoids slicing by a size of 0 below)
        return [] if sorted_values.empty?

        # Rounding up guarantees that we never need more than num_buckets
        # slices to cover all values.
        bucket_size = (sorted_values.length / num_buckets.to_f).ceil
        sorted_values.each_slice(bucket_size).map { |slice| (slice.first..slice.last) }
      end

      # Rewrites `buckets` so that each range starts at the end of the
      # previous one. All ranges are end-exclusive, except the last one,
      # which stays inclusive so that the greatest value is covered.
      def touching_buckets(buckets)
        # Without this guard, `result.last` below would be nil
        return [] if buckets.empty?

        previous_end = buckets.first.begin
        result = buckets.map do |bucket|
          range = (previous_end...bucket.end)
          previous_end = bucket.end
          range
        end
        result[-1] = (result.last.begin..result.last.end)

        result
      end

    end # class Bucketize

    # Factors a bucketize summarizer
    def self.bucketize(*args, &bl)
      Bucketize.new(*args, &bl)
    end

  end # class Summarizer
end # module Bmg
2 changes: 1 addition & 1 deletion lib/bmg/summarizer/concat.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def least()
end

# Concatenates current memo with val.to_s
def _happens(memo, val)
# Appends val.to_s to memo and returns memo. When memo already holds
# something, options[:between].to_s is appended first as a separator.
def _happens(memo, val)
  if memo.empty?
    memo << val.to_s
  else
    memo << options[:between].to_s << val.to_s
  end
end
Expand Down
159 changes: 159 additions & 0 deletions spec/unit/summarizer/test_bucketize.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
require 'spec_helper'
module Bmg
  class Summarizer
    describe Bucketize do

      # Options shared by all examples of a context. Nested contexts override
      # them to select a boundaries mode.
      let(:options) { {} }

      # Builds a Bucketize on `attr`, using the context options merged with
      # the example-specific `opts`.
      def bucketizer(attr, opts)
        Bucketize.new(attr, options.merge(opts))
      end

      # For each [opts, expected] pair, in order, summarizes `rel` on `attr`
      # with those options and checks the buckets obtained.
      def expect_buckets(attr, expectations)
        expectations.each do |opts, expected|
          got = bucketizer(attr, opts).summarize(rel)
          expect(got).to eql(expected)
        end
      end

      context 'with distinct values' do
        let(:rel) do
          [['S1', 10], ['S2', 20], ['S3', 30], ['S4', 40]].map do |sid, qty|
            { sid: sid, qty: qty }
          end
        end

        context 'with separate boundaries' do
          let(:options) { { boundaries: :separate } }

          it 'should work when used standalone' do
            expect_buckets(:qty, [
              [{ size: 1 }, [10..40]],
              [{ size: 2 }, [10..20, 30..40]],
              [{ size: 3 }, [10..20, 30..40]],
              [{ size: 4 }, [10..10, 20..20, 30..30, 40..40]]
            ])
          end
        end

        context 'with touching boundaries' do
          let(:options) { { boundaries: :touching } }

          it 'should work when used standalone' do
            expect_buckets(:qty, [
              [{ size: 1 }, [10..40]],
              [{ size: 2 }, [10...20, 20..40]],
              [{ size: 3 }, [10...20, 20..40]],
              [{ size: 4 }, [10...10, 10...20, 20...30, 30..40]]
            ])
          end
        end
      end

      context 'with non distinct values' do
        let(:rel) do
          [['S1', 10], ['S2', 20], ['S3', 30], ['S4', 40], ['S5', 40]].map do |sid, qty|
            { sid: sid, qty: qty }
          end
        end

        context 'with separate boundaries' do
          let(:options) { { boundaries: :separate } }

          it 'should work when used standalone' do
            expect_buckets(:qty, [
              [{ size: 1 }, [10..40]],
              [{ size: 2 }, [10..30, 40..40]],
              [{ size: 3 }, [10..20, 30..40, 40..40]],
              [{ size: 4 }, [10..20, 30..40, 40..40]]
            ])
          end

          it 'supports distinct' do
            expect_buckets(:qty, [
              [{ size: 2 }, [10..30, 40..40]],
              [{ size: 2, distinct: true }, [10..20, 30..40]]
            ])
          end
        end

        context 'with touching boundaries' do
          let(:options) { { boundaries: :touching } }

          it 'should work when used standalone' do
            expect_buckets(:qty, [
              [{ size: 1 }, [10..40]],
              [{ size: 2 }, [10...30, 30..40]],
              [{ size: 3 }, [10...20, 20...40, 40..40]],
              [{ size: 4 }, [10...20, 20...40, 40..40]]
            ])
          end

          it 'supports distinct' do
            expect_buckets(:qty, [
              [{ size: 2 }, [10...30, 30..40]],
              [{ size: 2, distinct: true }, [10...20, 20..40]]
            ])
          end
        end
      end

      context 'with string values and touching boundaries' do
        let(:rel) do
          cities = [
            "Denver", "Austin", "Chicago", "Boston", "Dallas",
            "Atlanta", "Detroit", "Houston", "San Francisco", "Los Angeles",
            "New York", "Seattle", "Miami", "Phoenix", "Las Vegas"
          ]
          cities.map { |city| { :city => city } }
        end

        it 'should work as expected with touching' do
          expect_buckets(:city, [
            [
              { size: 5, boundaries: :touching },
              ["Atlanta"..."Boston", "Boston"..."Denver", "Denver"..."Las Vegas", "Las Vegas"..."New York", "New York".."Seattle"]
            ]
          ])
        end

        it 'should work as expected with touching and value_length' do
          expect_buckets(:city, [
            [
              { size: 5, boundaries: :touching, value_length: 3 },
              ["Atl"..."Bos", "Bos"..."Den", "Den"..."Las", "Las"..."New", "New".."Sea"]
            ]
          ])
        end

        it 'should work as expected with separate' do
          expect_buckets(:city, [
            [
              { size: 5, boundaries: :separate },
              ["Atlanta".."Boston", "Chicago".."Denver", "Detroit".."Las Vegas", "Los Angeles".."New York", "Phoenix".."Seattle"]
            ]
          ])
        end

        it 'should work as expected with separate and value_length' do
          expect_buckets(:city, [
            [
              { size: 5, boundaries: :separate, value_length: 3 },
              ["Atl".."Bos", "Chi".."Den", "Det".."Las", "Los".."New", "Pho".."Sea"]
            ]
          ])
        end
      end

    end
  end
end

0 comments on commit da8f4d2

Please sign in to comment.