Skip to content

Commit

Permalink
Issue-17: Spearman's rank correlation & Kolmogorov–Smirnov test (#27)
Browse files Browse the repository at this point in the history
* spearman-rank-coefficient: Implement coefficient calcluations.

* spearman-rank-coefficient: Extend unit test cases.

* distribution: Implement empirical distribution.

This change implements the Empirical distribution that is used
by the KS-Test. It only implements the CDF, also known as ECDF.

* statistical-test: Implement the two-samples kolmogorov-smirnof test.

This change implements the ks-test for two groups of samples, using the
empirical distribution and calculates the D statistic, which is
accessible on the response of the class method.

* ks-test: Try to implementfunction that tries to find critical values for test.

* version: Codename Random 2.1.0.
  • Loading branch information
estebanz01 authored Dec 31, 2018
1 parent 5fcdee0 commit 87a49b5
Show file tree
Hide file tree
Showing 8 changed files with 410 additions and 2 deletions.
26 changes: 26 additions & 0 deletions lib/statistics/distribution/empirical.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
module Statistics
module Distribution
class Empirical
attr_accessor :samples

def initialize(samples:)
self.samples = samples
end

# Formula grabbed from here: https://statlect.com/asymptotic-theory/empirical-distribution
def cumulative_function(x:)
cumulative_sum = samples.reduce(0) do |summation, sample|
summation += if sample <= x
1
else
0
end

summation
end

cumulative_sum / samples.size.to_f
end
end
end
end
2 changes: 1 addition & 1 deletion lib/statistics/distribution/weibull.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def variance
# Using the inverse CDF function, also called quantile, we can calculate
# a random sample that follows a weibull distribution.
#
# Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
# Formula extracted from https://www.taygeta.com/random/weibull.html
def random(elements: 1, seed: Random.new_seed)
results = []

Expand Down
71 changes: 71 additions & 0 deletions lib/statistics/spearman_rank_coefficient.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
module Statistics
class SpearmanRankCoefficient
def self.rank(data:, return_ranks_only: true)
descending_order_data = data.sort { |a, b| b <=> a }
rankings = {}

data.each do |value|
# If we have ties, the find_index method will only retrieve the index of the
# first element in the list (i.e, the most close to the left of the array),
# so when a tie is detected, we increase the temporal ranking by the number of
# counted elements at that particular time and then we increase the counter.
temporal_ranking = descending_order_data.find_index(value) + 1 # 0-index

if rankings.fetch(value, false)
rankings[value][:rank] += (temporal_ranking + rankings[value][:counter])
rankings[value][:counter] += 1
rankings[value][:tie_rank] = rankings[value][:rank] / rankings[value][:counter].to_f
else
rankings[value] = { counter: 1, rank: temporal_ranking, tie_rank: temporal_ranking }
end
end

if return_ranks_only
data.map do |value|
rankings[value][:tie_rank]
end
else
rankings
end
end

# Formulas extracted from: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide.php
def self.coefficient(set_one, set_two)
raise 'Both group sets must have the same number of cases.' if set_one.size != set_two.size
return if set_one.size == 0 && set_two.size == 0

set_one_mean, set_two_mean = set_one.mean, set_two.mean
have_tie_ranks = (set_one + set_two).any? { |rank| rank.is_a?(Float) }

if have_tie_ranks
numerator = 0
squared_differences_set_one = 0
squared_differences_set_two = 0

set_one.size.times do |idx|
local_diff_one = (set_one[idx] - set_one_mean)
local_diff_two = (set_two[idx] - set_two_mean)

squared_differences_set_one += local_diff_one ** 2
squared_differences_set_two += local_diff_two ** 2

numerator += local_diff_one * local_diff_two
end

denominator = Math.sqrt(squared_differences_set_one * squared_differences_set_two)

numerator / denominator.to_f # This is rho or spearman's coefficient.
else
sum_squared_differences = set_one.each_with_index.reduce(0) do |memo, (rank_one, index)|
memo += ((rank_one - set_two[index]) ** 2)
memo
end

numerator = 6 * sum_squared_differences
denominator = ((set_one.size ** 3) - set_one.size)

1.0 - (numerator / denominator.to_f) # This is rho or spearman's coefficient.
end
end
end
end
70 changes: 70 additions & 0 deletions lib/statistics/statistical_test/kolmogorov_smirnov_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
module Statistics
module StatisticalTest
class KolmogorovSmirnovTest
# Common alpha, and critical D are calculated following formulas from: https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test#Two-sample_Kolmogorov%E2%80%93Smirnov_test
def self.two_samples(group_one:, group_two:, alpha: 0.05)
samples = group_one + group_two # We can use unbalaced group samples

ecdf_one = Distribution::Empirical.new(samples: group_one)
ecdf_two = Distribution::Empirical.new(samples: group_two)

d_max = samples.sort.map do |sample|
d1 = ecdf_one.cumulative_function(x: sample)
d2 = ecdf_two.cumulative_function(x: sample)

(d1 - d2).abs
end.max

# TODO: Validate calculation of Common alpha.
common_alpha = Math.sqrt((-0.5 * Math.log(alpha)))
radicand = (group_one.size + group_two.size) / (group_one.size * group_two.size).to_f

critical_d = common_alpha * Math.sqrt(radicand)
# critical_d = self.critical_d(alpha: alpha, n: samples.size)

# We are unable to calculate the p_value, because we don't have the Kolmogorov distribution
# defined. We reject the null hypotesis if Dmax is > than Dcritical.
{ d_max: d_max,
d_critical: critical_d,
total_samples: samples.size,
alpha: alpha,
null: d_max <= critical_d,
alternative: d_max > critical_d,
confidence_level: 1.0 - alpha }
end

# This is an implementation of the formula presented by Paul Molin and Hervé Abdi in a paper,
# called "New Table and numerical approximations for Kolmogorov-Smirnov / Lilliefors / Van Soest
# normality test".
# In this paper, the authors defines a couple of 6th-degree polynomial functions that allow us
# to find an aproximation of the real critical value. This is based in the conclusions made by
# Dagnelie (1968), where indicates that critical values given by Lilliefors can be approximated
# numerically.
#
# In general, the formula found is:
# C(N, alpha) ^ -2 = A(alpha) * N + B(alpha).
#
# Where A(alpha), B(alpha) are two 6th degree polynomial functions computed using the principle
# of Monte Carlo simulations.
#
# paper can be found here: https://utdallas.edu/~herve/MolinAbdi1998-LillieforsTechReport.pdf
# def self.critical_d(alpha:, n:)
# confidence = 1.0 - alpha

# a_alpha = 6.32207539843126 -17.1398870006148 * confidence +
# 38.42812675101057 * (confidence ** 2) - 45.93241384693391 * (confidence ** 3) +
# 7.88697700041829 * (confidence ** 4) + 29.79317711037858 * (confidence ** 5) -
# 18.48090137098585 * (confidence ** 6)

# b_alpha = 12.940399038404 - 53.458334259532 * confidence +
# 186.923866119699 * (confidence ** 2) - 410.582178349305 * (confidence ** 3) +
# 517.377862566267 * (confidence ** 4) - 343.581476222384 * (confidence ** 5) +
# 92.123451358715 * (confidence ** 6)

# Math.sqrt(1.0 / (a_alpha * n + b_alpha))
# end
end

KSTest = KolmogorovSmirnovTest # Alias
end
end
2 changes: 1 addition & 1 deletion lib/statistics/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module Statistics
VERSION = "2.0.5"
VERSION = "2.1.0"
end
17 changes: 17 additions & 0 deletions spec/statistics/distribution/empirical_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
require 'spec_helper'

describe Statistics::Distribution::Empirical do
describe '#cumulative_function' do
it 'calculates the CDF for the specified value using a group of samples' do
# Result in R
# > cdf <- ecdf(c(1,2,3,4,5,6,7,8,9,0))
# > cdf(7)
# [1] 0.8
samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
x = 7
x_prob = 0.8

expect(described_class.new(samples: samples).cumulative_function(x: x)).to eq x_prob
end
end
end
154 changes: 154 additions & 0 deletions spec/statistics/spearman_rank_coefficient_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
require 'spec_helper'

describe Statistics::SpearmanRankCoefficient do
describe '.rank' do
context 'when only ranks are needed' do
it 'returns an array of elements corresponding to the expected ranks wihout altering order' do
expected_ranks = [4, 1, 3, 2, 5]

result = described_class.rank(data: [10, 30, 12, 15, 3], return_ranks_only: true)

expect(result).to eq expected_ranks
end
end

context 'when ranks and passed elements are needed' do
it 'returns a hash composed by the elements and ranking information' do
expected_ranks = {
30 => { counter: 1, rank: 1, tie_rank: 1 },
15 => { counter: 1, rank: 2, tie_rank: 2 },
12 => { counter: 1, rank: 3, tie_rank: 3 },
10 => { counter: 1, rank: 4, tie_rank: 4 },
3 => { counter: 1, rank: 5, tie_rank: 5 }
}

result = described_class.rank(data: [10, 30, 12, 15, 3], return_ranks_only: false)

expect(result).to eq expected_ranks
end
end

context 'when there are ties' do
it 'returns a ranking list with solved ties when ranks only are needed' do
expected_ranking = [9, 3, 10, 4, 6.5, 5, 8, 1, 2, 6.5]
data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]

result = described_class.rank(data: data, return_ranks_only: true)

expect(result).to eq expected_ranking
end

it 'returns a hash composed by the elements and some ranking information' do
expected_ranks = {
80 => { counter: 1, rank: 1, tie_rank: 1 },
76 => { counter: 1, rank: 2, tie_rank: 2 },
75 => { counter: 1, rank: 3, tie_rank: 3 },
71 => { counter: 1, rank: 4, tie_rank: 4 },
64 => { counter: 1, rank: 5, tie_rank: 5 },
61 => { counter: 2, rank: 13, tie_rank: 6.5 },
58 => { counter: 1, rank: 8, tie_rank: 8 },
56 => { counter: 1, rank: 9, tie_rank: 9 },
45 => { counter: 1, rank: 10, tie_rank: 10 }
}
data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]

result = described_class.rank(data: data, return_ranks_only: false)

expect(result).to include(expected_ranks)
end

it 'returns a hash containing information about the existing ties' do
tie_rank = { 61 => { counter: 2, tie_rank: 6.5, rank: 13 } }
data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]

result = described_class.rank(data: data, return_ranks_only: false)

expect(result).to include(tie_rank)
end
end
end

describe '.coefficient' do
it 'raises an error when the groups have different number of cases' do
expect do
described_class.coefficient([1, 2, 3], [1, 2, 3, 4])
end.to raise_error(StandardError, 'Both group sets must have the same number of cases.')
end

it 'returns nothing when both groups have a size of zero cases' do
expect(described_class.coefficient([], [])).to be_nil
end

context 'when there are ties in the data' do
it 'calculates the spearman rank coefficient for example one' do
# Example taken from http://www.biostathandbook.com/spearman.html
volume = [1760, 2040, 2440, 2550, 2730, 2740, 3010, 3080, 3370, 3740, 4910, 5090, 5090, 5380, 5850, 6730, 6990, 7960]
frequency = [529, 566, 473, 461, 465, 532, 484, 527, 488, 485, 478, 434, 468, 449, 425, 389, 421, 416]

volume_rank = described_class.rank(data: volume)
frequency_rank = described_class.rank(data: frequency)

rho = described_class.coefficient(volume_rank, frequency_rank)
expect(rho.round(3)).to eq -0.763
end

it 'calcultes the spearman rank coefficient for example two' do
# Example taken from https://geographyfieldwork.com/SpearmansRank.htm
# Results from R:
# cor(c(50, 175, 270, 375, 425, 580, 710, 790, 890, 980), c(1.80, 1.20, 2.0, 1.0, 1.0, 1.20, 0.80, 0.60, 1.0, 0.85), method = 'spearman')
# [1] -0.7570127
distance = [50, 175, 270, 375, 425, 580, 710, 790, 890, 980]
price = [1.80, 1.20, 2.0, 1.0, 1.0, 1.20, 0.80, 0.60, 1.0, 0.85]

distance_rank = described_class.rank(data: distance)
price_rank = described_class.rank(data: price)

rho = described_class.coefficient(distance_rank, price_rank)

expect(rho.round(7)).to eq -0.7570127
end

it 'calculates the spearman rank coefficient for example three' do
# Example taken from http://www.real-statistics.com/correlation/spearmans-rank-correlation/spearmans-rank-correlation-detailed/

life_exp = [80, 78, 60, 53, 85, 84, 73, 79, 81, 75, 68, 72, 58, 92, 65]
cigarretes = [5, 23, 25, 48, 17, 8, 4, 26, 11, 19, 14, 35, 29, 4, 23]

life_rank = described_class.rank(data: life_exp)
cigarretes_rank = described_class.rank(data: cigarretes)

rho = described_class.coefficient(life_rank, cigarretes_rank)

expect(rho.round(5)).to eq -0.67442
end
end

context 'when there are no ties in the data' do
it 'calculates the spearman rank coefficient for example one' do
# Example taken from here: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
english_data = [56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
math_data = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]

english_rank = described_class.rank(data: english_data)
math_rank = described_class.rank(data: math_data)

rho = described_class.coefficient(english_rank, math_rank)

expect(rho.round(2)).to eq 0.67
end

it 'calculates the spearman rank coefficient for example two' do
# Example taken from here: https://www.statisticshowto.datasciencecentral.com/spearman-rank-correlation-definition-calculate/
physics = [35, 23, 47, 17, 10, 43, 9, 6, 28]
math = [30, 33, 45, 23, 8, 49, 12, 4, 31]

physics_rank = described_class.rank(data: physics)
math_rank = described_class.rank(data: math)

rho = described_class.coefficient(physics_rank, math_rank)

expect(rho).to eq 0.9
end
end
end
end
Loading

0 comments on commit 87a49b5

Please sign in to comment.