Issue-17: Spearman's rank correlation & Kolmogorov–Smirnov test (#27)

* spearman-rank-coefficient: Implement coefficient calcluations. * spearman-rank-coefficient: Extend unit test cases. * distribution: Implement empirical distribution. This change implements the Empirical distribution that is used by the KS-Test. It only implements the CDF, also known as ECDF. * statistical-test: Implement the two-samples kolmogorov-smirnof test. This change implements the ks-test for two groups of samples, using the empirical distribution and calculates the D statistic, which is accessible on the response of the class method. * ks-test: Try to implementfunction that tries to find critical values for test. * version: Codename Random 2.1.0.
estebanz01 · Dec 31, 2018 · 87a49b5 · 87a49b5
1 parent 5fcdee0
commit 87a49b5
Show file tree

Hide file tree

Showing 8 changed files with 410 additions and 2 deletions.
diff --git a/lib/statistics/distribution/empirical.rb b/lib/statistics/distribution/empirical.rb
@@ -0,0 +1,26 @@
+module Statistics
+ module Distribution
+ class Empirical
+ attr_accessor :samples
+
+ def initialize(samples:)
+ self.samples = samples
+ end
+
+ # Formula grabbed from here: https://statlect.com/asymptotic-theory/empirical-distribution
+ def cumulative_function(x:)
+ cumulative_sum = samples.reduce(0) do |summation, sample|
+ summation += if sample <= x
+ 1
+ else
+ 0
+ end
+
+ summation
+ end
+
+ cumulative_sum / samples.size.to_f
+ end
+ end
+ end
+end
diff --git a/lib/statistics/distribution/weibull.rb b/lib/statistics/distribution/weibull.rb
@@ -45,7 +45,7 @@ def variance
  # Using the inverse CDF function, also called quantile, we can calculate
  # a random sample that follows a weibull distribution.
  #
- # Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
+ # Formula extracted from https://www.taygeta.com/random/weibull.html
  def random(elements: 1, seed: Random.new_seed)
  results = []
 

diff --git a/lib/statistics/spearman_rank_coefficient.rb b/lib/statistics/spearman_rank_coefficient.rb
@@ -0,0 +1,71 @@
+module Statistics
+ class SpearmanRankCoefficient
+ def self.rank(data:, return_ranks_only: true)
+ descending_order_data = data.sort { |a, b| b <=> a }
+ rankings = {}
+
+ data.each do |value|
+ # If we have ties, the find_index method will only retrieve the index of the
+ # first element in the list (i.e, the most close to the left of the array),
+ # so when a tie is detected, we increase the temporal ranking by the number of
+ # counted elements at that particular time and then we increase the counter.
+ temporal_ranking = descending_order_data.find_index(value) + 1 # 0-index
+
+ if rankings.fetch(value, false)
+ rankings[value][:rank] += (temporal_ranking + rankings[value][:counter])
+ rankings[value][:counter] += 1
+ rankings[value][:tie_rank] = rankings[value][:rank] / rankings[value][:counter].to_f
+ else
+ rankings[value] = { counter: 1, rank: temporal_ranking, tie_rank: temporal_ranking }
+ end
+ end
+
+ if return_ranks_only
+ data.map do |value|
+ rankings[value][:tie_rank]
+ end
+ else
+ rankings
+ end
+ end
+
+ # Formulas extracted from: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide.php
+ def self.coefficient(set_one, set_two)
+ raise 'Both group sets must have the same number of cases.' if set_one.size != set_two.size
+ return if set_one.size == 0 && set_two.size == 0
+
+ set_one_mean, set_two_mean = set_one.mean, set_two.mean
+ have_tie_ranks = (set_one + set_two).any? { |rank| rank.is_a?(Float) }
+
+ if have_tie_ranks
+ numerator = 0
+ squared_differences_set_one = 0
+ squared_differences_set_two = 0
+
+ set_one.size.times do |idx|
+ local_diff_one = (set_one[idx] - set_one_mean)
+ local_diff_two = (set_two[idx] - set_two_mean)
+
+ squared_differences_set_one += local_diff_one ** 2
+ squared_differences_set_two += local_diff_two ** 2
+
+ numerator += local_diff_one * local_diff_two
+ end
+
+ denominator = Math.sqrt(squared_differences_set_one * squared_differences_set_two)
+
+ numerator / denominator.to_f # This is rho or spearman's coefficient.
+ else
+ sum_squared_differences = set_one.each_with_index.reduce(0) do |memo, (rank_one, index)|
+ memo += ((rank_one - set_two[index]) ** 2)
+ memo
+ end
+
+ numerator = 6 * sum_squared_differences
+ denominator = ((set_one.size ** 3) - set_one.size)
+
+ 1.0 - (numerator / denominator.to_f) # This is rho or spearman's coefficient.
+ end
+ end
+ end
+end
diff --git a/lib/statistics/statistical_test/kolmogorov_smirnov_test.rb b/lib/statistics/statistical_test/kolmogorov_smirnov_test.rb
@@ -0,0 +1,70 @@
+module Statistics
+ module StatisticalTest
+ class KolmogorovSmirnovTest
+ # Common alpha, and critical D are calculated following formulas from: https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test#Two-sample_Kolmogorov%E2%80%93Smirnov_test
+ def self.two_samples(group_one:, group_two:, alpha: 0.05)
+ samples = group_one + group_two # We can use unbalaced group samples
+
+ ecdf_one = Distribution::Empirical.new(samples: group_one)
+ ecdf_two = Distribution::Empirical.new(samples: group_two)
+
+ d_max = samples.sort.map do |sample|
+ d1 = ecdf_one.cumulative_function(x: sample)
+ d2 = ecdf_two.cumulative_function(x: sample)
+
+ (d1 - d2).abs
+ end.max
+
+ # TODO: Validate calculation of Common alpha.
+ common_alpha = Math.sqrt((-0.5 * Math.log(alpha)))
+ radicand = (group_one.size + group_two.size) / (group_one.size * group_two.size).to_f
+
+ critical_d = common_alpha * Math.sqrt(radicand)
+ # critical_d = self.critical_d(alpha: alpha, n: samples.size)
+
+ # We are unable to calculate the p_value, because we don't have the Kolmogorov distribution
+ # defined. We reject the null hypotesis if Dmax is > than Dcritical.
+ { d_max: d_max,
+ d_critical: critical_d,
+ total_samples: samples.size,
+ alpha: alpha,
+ null: d_max <= critical_d,
+ alternative: d_max > critical_d,
+ confidence_level: 1.0 - alpha }
+ end
+
+ # This is an implementation of the formula presented by Paul Molin and Hervé Abdi in a paper,
+ # called "New Table and numerical approximations for Kolmogorov-Smirnov / Lilliefors / Van Soest
+ # normality test".
+ # In this paper, the authors defines a couple of 6th-degree polynomial functions that allow us
+ # to find an aproximation of the real critical value. This is based in the conclusions made by
+ # Dagnelie (1968), where indicates that critical values given by Lilliefors can be approximated
+ # numerically.
+ #
+ # In general, the formula found is:
+ # C(N, alpha) ^ -2 = A(alpha) * N + B(alpha).
+ #
+ # Where A(alpha), B(alpha) are two 6th degree polynomial functions computed using the principle
+ # of Monte Carlo simulations.
+ #
+ # paper can be found here: https://utdallas.edu/~herve/MolinAbdi1998-LillieforsTechReport.pdf
+ # def self.critical_d(alpha:, n:)
+ # confidence = 1.0 - alpha
+
+ # a_alpha = 6.32207539843126 -17.1398870006148 * confidence +
+ # 38.42812675101057 * (confidence ** 2) - 45.93241384693391 * (confidence ** 3) +
+ # 7.88697700041829 * (confidence ** 4) + 29.79317711037858 * (confidence ** 5) -
+ # 18.48090137098585 * (confidence ** 6)
+
+ # b_alpha = 12.940399038404 - 53.458334259532 * confidence +
+ # 186.923866119699 * (confidence ** 2) - 410.582178349305 * (confidence ** 3) +
+ # 517.377862566267 * (confidence ** 4) - 343.581476222384 * (confidence ** 5) +
+ # 92.123451358715 * (confidence ** 6)
+
+ # Math.sqrt(1.0 / (a_alpha * n + b_alpha))
+ # end
+ end
+
+ KSTest = KolmogorovSmirnovTest # Alias
+ end
+end
diff --git a/lib/statistics/version.rb b/lib/statistics/version.rb
@@ -1,3 +1,3 @@
 module Statistics
- VERSION = "2.0.5"
+ VERSION = "2.1.0"
 end
diff --git a/spec/statistics/distribution/empirical_spec.rb b/spec/statistics/distribution/empirical_spec.rb
@@ -0,0 +1,17 @@
+require 'spec_helper'
+
+describe Statistics::Distribution::Empirical do
+ describe '#cumulative_function' do
+ it 'calculates the CDF for the specified value using a group of samples' do
+ # Result in R
+ # > cdf <- ecdf(c(1,2,3,4,5,6,7,8,9,0))
+ # > cdf(7)
+ # [1] 0.8
+ samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
+ x = 7
+ x_prob = 0.8
+
+ expect(described_class.new(samples: samples).cumulative_function(x: x)).to eq x_prob
+ end
+ end
+end
diff --git a/spec/statistics/spearman_rank_coefficient_spec.rb b/spec/statistics/spearman_rank_coefficient_spec.rb
@@ -0,0 +1,154 @@
+require 'spec_helper'
+
+describe Statistics::SpearmanRankCoefficient do
+ describe '.rank' do
+ context 'when only ranks are needed' do
+ it 'returns an array of elements corresponding to the expected ranks wihout altering order' do
+ expected_ranks = [4, 1, 3, 2, 5]
+
+ result = described_class.rank(data: [10, 30, 12, 15, 3], return_ranks_only: true)
+
+ expect(result).to eq expected_ranks
+ end
+ end
+
+ context 'when ranks and passed elements are needed' do
+ it 'returns a hash composed by the elements and ranking information' do
+ expected_ranks = {
+ 30 => { counter: 1, rank: 1, tie_rank: 1 },
+ 15 => { counter: 1, rank: 2, tie_rank: 2 },
+ 12 => { counter: 1, rank: 3, tie_rank: 3 },
+ 10 => { counter: 1, rank: 4, tie_rank: 4 },
+ 3 => { counter: 1, rank: 5, tie_rank: 5 }
+ }
+
+ result = described_class.rank(data: [10, 30, 12, 15, 3], return_ranks_only: false)
+
+ expect(result).to eq expected_ranks
+ end
+ end
+
+ context 'when there are ties' do
+ it 'returns a ranking list with solved ties when ranks only are needed' do
+ expected_ranking = [9, 3, 10, 4, 6.5, 5, 8, 1, 2, 6.5]
+ data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
+
+ result = described_class.rank(data: data, return_ranks_only: true)
+
+ expect(result).to eq expected_ranking
+ end
+
+ it 'returns a hash composed by the elements and some ranking information' do
+ expected_ranks = {
+ 80 => { counter: 1, rank: 1, tie_rank: 1 },
+ 76 => { counter: 1, rank: 2, tie_rank: 2 },
+ 75 => { counter: 1, rank: 3, tie_rank: 3 },
+ 71 => { counter: 1, rank: 4, tie_rank: 4 },
+ 64 => { counter: 1, rank: 5, tie_rank: 5 },
+ 61 => { counter: 2, rank: 13, tie_rank: 6.5 },
+ 58 => { counter: 1, rank: 8, tie_rank: 8 },
+ 56 => { counter: 1, rank: 9, tie_rank: 9 },
+ 45 => { counter: 1, rank: 10, tie_rank: 10 }
+ }
+ data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
+
+ result = described_class.rank(data: data, return_ranks_only: false)
+
+ expect(result).to include(expected_ranks)
+ end
+
+ it 'returns a hash containing information about the existing ties' do
+ tie_rank = { 61 => { counter: 2, tie_rank: 6.5, rank: 13 } }
+ data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
+
+ result = described_class.rank(data: data, return_ranks_only: false)
+
+ expect(result).to include(tie_rank)
+ end
+ end
+ end
+
+ describe '.coefficient' do
+ it 'raises an error when the groups have different number of cases' do
+ expect do
+ described_class.coefficient([1, 2, 3], [1, 2, 3, 4])
+ end.to raise_error(StandardError, 'Both group sets must have the same number of cases.')
+ end
+
+ it 'returns nothing when both groups have a size of zero cases' do
+ expect(described_class.coefficient([], [])).to be_nil
+ end
+
+ context 'when there are ties in the data' do
+ it 'calculates the spearman rank coefficient for example one' do
+ # Example taken from http://www.biostathandbook.com/spearman.html
+ volume = [1760, 2040, 2440, 2550, 2730, 2740, 3010, 3080, 3370, 3740, 4910, 5090, 5090, 5380, 5850, 6730, 6990, 7960]
+ frequency = [529, 566, 473, 461, 465, 532, 484, 527, 488, 485, 478, 434, 468, 449, 425, 389, 421, 416]
+
+ volume_rank = described_class.rank(data: volume)
+ frequency_rank = described_class.rank(data: frequency)
+
+ rho = described_class.coefficient(volume_rank, frequency_rank)
+ expect(rho.round(3)).to eq -0.763
+ end
+
+ it 'calcultes the spearman rank coefficient for example two' do
+ # Example taken from https://geographyfieldwork.com/SpearmansRank.htm
+ # Results from R:
+ # cor(c(50, 175, 270, 375, 425, 580, 710, 790, 890, 980), c(1.80, 1.20, 2.0, 1.0, 1.0, 1.20, 0.80, 0.60, 1.0, 0.85), method = 'spearman')
+ # [1] -0.7570127
+ distance = [50, 175, 270, 375, 425, 580, 710, 790, 890, 980]
+ price = [1.80, 1.20, 2.0, 1.0, 1.0, 1.20, 0.80, 0.60, 1.0, 0.85]
+
+ distance_rank = described_class.rank(data: distance)
+ price_rank = described_class.rank(data: price)
+
+ rho = described_class.coefficient(distance_rank, price_rank)
+
+ expect(rho.round(7)).to eq -0.7570127
+ end
+
+ it 'calculates the spearman rank coefficient for example three' do
+ # Example taken from http://www.real-statistics.com/correlation/spearmans-rank-correlation/spearmans-rank-correlation-detailed/
+
+ life_exp = [80, 78, 60, 53, 85, 84, 73, 79, 81, 75, 68, 72, 58, 92, 65]
+ cigarretes = [5, 23, 25, 48, 17, 8, 4, 26, 11, 19, 14, 35, 29, 4, 23]
+
+ life_rank = described_class.rank(data: life_exp)
+ cigarretes_rank = described_class.rank(data: cigarretes)
+
+ rho = described_class.coefficient(life_rank, cigarretes_rank)
+
+ expect(rho.round(5)).to eq -0.67442
+ end
+ end
+
+ context 'when there are no ties in the data' do
+ it 'calculates the spearman rank coefficient for example one' do
+ # Example taken from here: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
+ english_data = [56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
+ math_data = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
+
+ english_rank = described_class.rank(data: english_data)
+ math_rank = described_class.rank(data: math_data)
+
+ rho = described_class.coefficient(english_rank, math_rank)
+
+ expect(rho.round(2)).to eq 0.67
+ end
+
+ it 'calculates the spearman rank coefficient for example two' do
+ # Example taken from here: https://www.statisticshowto.datasciencecentral.com/spearman-rank-correlation-definition-calculate/
+ physics = [35, 23, 47, 17, 10, 43, 9, 6, 28]
+ math = [30, 33, 45, 23, 8, 49, 12, 4, 31]
+
+ physics_rank = described_class.rank(data: physics)
+ math_rank = described_class.rank(data: math)
+
+ rho = described_class.coefficient(physics_rank, math_rank)
+
+ expect(rho).to eq 0.9
+ end
+ end
+ end
+end