From e5eae18f14e40e8eae453457cc257d316c7fffc4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 25 May 2022 08:13:24 -0700 Subject: [PATCH] Chatterjee Correlation Coefficient (#770) * Implement rank vector [ci skip] * Add documentation. Admittedly terrible. * Add unit tests. * Cleanup method of detecting if execution policies are valid or not [ci skip] * Implement and test chatterjee correlation [ci skip] * Add spot checks and special handling for constant Y [ci skip] * Add performance file [ci skip] * Add execution policy support to rank [ci skip] * Remove duplicates from v when generating the order vector [ci skip] * Fix macro error for use of [ci skip] * Use explicit types instead of auto to avoid warnings [ci skip] * Add execution policy testing to rank [ci skip] * Add threaded implementation [ci skip] * Added threaded testing * Fix formatting and ASCII issues in test * Fix more ASCII issues * refactoring * Fix threaded impl * Remove non-ASCII apostrophe [ci skip] * Doc fixes and add test comparing generally to paper values * Significantly tighten tolerance around expected values from paper * Change tolerance for sin comparison Co-authored-by: Nick Thompson --- doc/statistics/chatterjee_correlation.qbk | 74 ++++++++ .../math/statistics/bivariate_statistics.hpp | 15 +- .../statistics/chatterjee_correlation.hpp | 159 ++++++++++++++++++ include/boost/math/statistics/detail/rank.hpp | 140 +++++++++++++++ .../math/statistics/univariate_statistics.hpp | 6 +- include/boost/math/tools/config.hpp | 6 + include/boost/math/tools/random_vector.hpp | 26 ++- .../chatterjee_correlation_performance.cpp | 34 ++++ test/Jamfile.v2 | 2 + .../stats_chaterjee_incl_test.cpp | 9 + test/math_unit_test.hpp | 2 + test/test_chatterjee_correlation.cpp | 159 ++++++++++++++++++ test/test_rank.cpp | 102 +++++++++++ 13 files changed, 719 insertions(+), 15 deletions(-) create mode 100644 doc/statistics/chatterjee_correlation.qbk create mode 100644 include/boost/math/statistics/chatterjee_correlation.hpp create mode 100644 include/boost/math/statistics/detail/rank.hpp create mode 100644 reporting/performance/chatterjee_correlation_performance.cpp create mode 100644 test/compile_test/stats_chaterjee_incl_test.cpp create mode 100644 test/test_chatterjee_correlation.cpp create mode 100644 test/test_rank.cpp diff --git a/doc/statistics/chatterjee_correlation.qbk b/doc/statistics/chatterjee_correlation.qbk new file mode 100644 index 0000000000..520ab0cc6b --- /dev/null +++ b/doc/statistics/chatterjee_correlation.qbk @@ -0,0 +1,74 @@ +[/ + Copyright 2022 Matt Borland + + Distributed under the Boost Software License, Version 1.0. + (See accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt). +] + +[section:chatterjee_correlation Chatterjee Correlation] + +[heading Synopsis] + +`` +#include + +namespace boost::math::statistics { + + C++17: + template + auto chatterjee_correlation(ExecutionPolicy&& exec, const Container& u, const Container& v); + + C++11: + template + auto chatterjee_correlation(const Container& u, const Container& v); +} +`` + +[heading Description] + +The classical correlation coefficients like the Pearson's correlation are useful primarily for distinguishing when one dataset depends linearly on another. +However, Pearson's correlation coefficient has a known weakness in that when the dependent variable has an obvious functional relationship with the independent variable, the value of the correlation coefficient can take on any value. +As Chatterjee says: + +> Ideally, one would like a coefficient that approaches +its maximum value if and only if one variable looks more and more like a +noiseless function of the other, just as Pearson correlation is close to its maximum value if and only if one variable is close to being a noiseless linear function of the other. + +This is the problem Chatterjee's coefficient solves. +Let X and Y be random variables, where Y is not constant, and let (X_i, Y_i) be samples from this distribution. +Rearrange these samples so that X_(0) < X_{(1)} < ... X_{(n-1)} and create (R(X_{(i)}), R(Y_{(i)})). +The Chatterjee correlation is then given by + +[$../equations/chatterjee_correlation.svg] + +In the limit of an infinite amount of i.i.d data, the statistic lies in [0, 1]. +However, if the data is not infinite, the statistic may be negative. +If X and Y are independent, the value is zero, and if Y is a measurable function of X, then the statistic is unity. +The complexity is O(n log n). + +An example is given below: + + std::vector X{1,2,3,4,5}; + std::vector Y{1,2,3,4,5}; + using boost::math::statistics::chatterjee_correlation; + double coeff = chatterjee_correlation(X, Y); + +The implementation follows [@https://arxiv.org/pdf/1909.10140.pdf Chatterjee's paper]. + +/Nota bene:/ If the input is an integer type the output will be a double precision type. + +[heading Invariants] + +The function expects at least two samples, a non-constant vector Y, and the same number of X's as Y's. +If Y is constant, the result is a quiet NaN. +The data set must be sorted by X values. +If there are ties in the values of X, then the statistic is random due to the random breaking of ties. +Of course, random numbers are not used internally, but the result is not guaranteed to be identical on different systems. + +[heading References] + +* Chatterjee, Sourav. "A new coefficient of correlation." Journal of the American Statistical Association 116.536 (2021): 2009-2022. + +[endsect] +[/section:chatterjee_correlation Chatterjee Correlation] diff --git a/include/boost/math/statistics/bivariate_statistics.hpp b/include/boost/math/statistics/bivariate_statistics.hpp index 1854898139..d6f91faa93 100644 --- a/include/boost/math/statistics/bivariate_statistics.hpp +++ b/include/boost/math/statistics/bivariate_statistics.hpp @@ -18,13 +18,10 @@ #include #include -// Support compilers with P0024R2 implemented without linking TBB -// https://en.cppreference.com/w/cpp/compiler_support -#if !defined(BOOST_NO_CXX17_HDR_EXECUTION) && defined(BOOST_HAS_THREADS) +#ifdef BOOST_MATH_EXEC_COMPATIBLE #include #include #include -#define EXEC_COMPATIBLE #endif namespace boost{ namespace math{ namespace statistics { namespace detail { @@ -60,7 +57,7 @@ ReturnType means_and_covariance_seq_impl(ForwardIterator u_begin, ForwardIterato return std::make_tuple(mu_u, mu_v, cov/i, Real(i)); } -#ifdef EXEC_COMPATIBLE +#ifdef BOOST_MATH_EXEC_COMPATIBLE // Numerically stable parallel computation of (co-)variance // https://dl.acm.org/doi/10.1145/3221269.3223036 @@ -154,7 +151,7 @@ ReturnType means_and_covariance_parallel_impl(ForwardIterator u_begin, ForwardIt return std::make_tuple(mu_u_a, mu_v_a, cov_a, n_a); } -#endif // EXEC_COMPATIBLE +#endif // BOOST_MATH_EXEC_COMPATIBLE template ReturnType correlation_coefficient_seq_impl(ForwardIterator u_begin, ForwardIterator u_end, ForwardIterator v_begin, ForwardIterator v_end) @@ -204,7 +201,7 @@ ReturnType correlation_coefficient_seq_impl(ForwardIterator u_begin, ForwardIter return std::make_tuple(mu_u, Qu, mu_v, Qv, cov, rho, Real(i)); } -#ifdef EXEC_COMPATIBLE +#ifdef BOOST_MATH_EXEC_COMPATIBLE // Numerically stable parallel computation of (co-)variance: // https://dl.acm.org/doi/10.1145/3221269.3223036 @@ -324,11 +321,11 @@ ReturnType correlation_coefficient_parallel_impl(ForwardIterator u_begin, Forwar return std::make_tuple(mu_u_a, Qu_a, mu_v_a, Qv_a, cov_a, rho, n_a); } -#endif // EXEC_COMPATIBLE +#endif // BOOST_MATH_EXEC_COMPATIBLE } // namespace detail -#ifdef EXEC_COMPATIBLE +#ifdef BOOST_MATH_EXEC_COMPATIBLE template inline auto means_and_covariance(ExecutionPolicy&& exec, Container const & u, Container const & v) diff --git a/include/boost/math/statistics/chatterjee_correlation.hpp b/include/boost/math/statistics/chatterjee_correlation.hpp new file mode 100644 index 0000000000..ad0b33a429 --- /dev/null +++ b/include/boost/math/statistics/chatterjee_correlation.hpp @@ -0,0 +1,159 @@ +// (C) Copyright Matt Borland 2022. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_STATISTICS_CHATTERJEE_CORRELATION_HPP +#define BOOST_MATH_STATISTICS_CHATTERJEE_CORRELATION_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef BOOST_MATH_EXEC_COMPATIBLE +#include +#include +#include +#endif + +namespace boost { namespace math { namespace statistics { + +namespace detail { + +template +std::size_t chatterjee_transform(BDIter begin, BDIter end) +{ + std::size_t sum = 0; + + while(++begin != end) + { + if(*begin > *std::prev(begin)) + { + sum += *begin - *std::prev(begin); + } + else + { + sum += *std::prev(begin) - *begin; + } + } + + return sum; +} + +template +ReturnType chatterjee_correlation_seq_impl(ForwardIterator u_begin, ForwardIterator u_end, ForwardIterator v_begin, ForwardIterator v_end) +{ + using std::abs; + + BOOST_MATH_ASSERT_MSG(std::is_sorted(u_begin, u_end), "The x values must be sorted in order to use this functionality"); + + const std::vector rank_vector = rank(v_begin, v_end); + + std::size_t sum = chatterjee_transform(rank_vector.begin(), rank_vector.end()); + + ReturnType result = static_cast(1) - (static_cast(3 * sum) / static_cast(rank_vector.size() * rank_vector.size() - 1)); + + // If the result is 1 then Y is constant and all the elements must be ties + if (abs(result - static_cast(1)) < std::numeric_limits::epsilon()) + { + return std::numeric_limits::quiet_NaN(); + } + + return result; +} + +} // Namespace detail + +template ::value, double, Real>::type> +inline ReturnType chatterjee_correlation(const Container& u, const Container& v) +{ + return detail::chatterjee_correlation_seq_impl(std::begin(u), std::end(u), std::begin(v), std::end(v)); +} + +}}} // Namespace boost::math::statistics + +#ifdef BOOST_MATH_EXEC_COMPATIBLE + +namespace boost::math::statistics { + +namespace detail { + +template +ReturnType chatterjee_correlation_par_impl(ExecutionPolicy&& exec, ForwardIterator u_begin, ForwardIterator u_end, + ForwardIterator v_begin, ForwardIterator v_end) +{ + using std::abs; + BOOST_MATH_ASSERT_MSG(std::is_sorted(std::forward(exec), u_begin, u_end), "The x values must be sorted in order to use this functionality"); + + auto rank_vector = rank(std::forward(exec), v_begin, v_end); + + const auto num_threads = std::thread::hardware_concurrency() == 0 ? 2u : std::thread::hardware_concurrency(); + std::vector> future_manager {}; + const auto elements_per_thread = std::ceil(static_cast(rank_vector.size()) / num_threads); + + auto it = rank_vector.begin(); + auto end = rank_vector.end(); + for(std::size_t i {}; i < num_threads - 1; ++i) + { + future_manager.emplace_back(std::async(std::launch::async | std::launch::deferred, [it, elements_per_thread]() -> std::size_t + { + return chatterjee_transform(it, std::next(it, elements_per_thread)); + })); + it = std::next(it, elements_per_thread - 1); + } + + future_manager.emplace_back(std::async(std::launch::async | std::launch::deferred, [it, end]() -> std::size_t + { + return chatterjee_transform(it, end); + })); + + std::size_t sum {}; + for(std::size_t i {}; i < future_manager.size(); ++i) + { + sum += future_manager[i].get(); + } + + ReturnType result = static_cast(1) - (static_cast(3 * sum) / static_cast(rank_vector.size() * rank_vector.size() - 1)); + + // If the result is 1 then Y is constant and all the elements must be ties + if (abs(result - static_cast(1)) < std::numeric_limits::epsilon()) + { + return std::numeric_limits::quiet_NaN(); + } + + return result; +} + +} // Namespace detail + +template , double, Real>> +inline ReturnType chatterjee_correlation(ExecutionPolicy&& exec, const Container& u, const Container& v) +{ + if constexpr (std::is_same_v, decltype(std::execution::seq)>) + { + return detail::chatterjee_correlation_seq_impl(std::cbegin(u), std::cend(u), + std::cbegin(v), std::cend(v)); + } + else + { + return detail::chatterjee_correlation_par_impl(std::forward(exec), + std::cbegin(u), std::cend(u), + std::cbegin(v), std::cend(v)); + } +} + +} // Namespace boost::math::statistics + +#endif + +#endif // BOOST_MATH_STATISTICS_CHATTERJEE_CORRELATION_HPP diff --git a/include/boost/math/statistics/detail/rank.hpp b/include/boost/math/statistics/detail/rank.hpp new file mode 100644 index 0000000000..4e5211607e --- /dev/null +++ b/include/boost/math/statistics/detail/rank.hpp @@ -0,0 +1,140 @@ +// (C) Copyright Matt Borland 2022 +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_STATISTICS_DETAIL_RANK_HPP +#define BOOST_MATH_STATISTICS_DETAIL_RANK_HPP + +#include +#include +#include +#include +#include +#include +#include + +#ifdef BOOST_MATH_EXEC_COMPATIBLE +#include +#endif + +namespace boost { namespace math { namespace statistics { namespace detail { + +struct pair_equal +{ + template + bool operator()(const std::pair& a, const std::pair& b) const + { + return a.first == b.first; + } +}; + +}}}} // Namespaces + +#ifndef BOOST_MATH_EXEC_COMPATIBLE + +namespace boost { namespace math { namespace statistics { namespace detail { + +template ::value_type> +auto rank(ForwardIterator first, ForwardIterator last) -> std::vector +{ + std::size_t elements = std::distance(first, last); + + std::vector> rank_vector(elements); + std::size_t i = 0; + while (first != last) + { + rank_vector[i] = std::make_pair(*first, i); + ++i; + ++first; + } + + std::sort(rank_vector.begin(), rank_vector.end()); + + // Remove duplicates + rank_vector.erase(std::unique(rank_vector.begin(), rank_vector.end(), pair_equal()), rank_vector.end()); + elements = rank_vector.size(); + + std::pair rank; + std::vector result(elements); + for (i = 0; i < elements; ++i) + { + if (rank_vector[i].first != rank.first) + { + rank = std::make_pair(rank_vector[i].first, i); + } + result[rank_vector[i].second] = rank.second; + } + + return result; +} + +template +inline auto rank(const Container& c) -> std::vector +{ + return rank(std::begin(c), std::end(c)); +} + +}}}} // Namespaces + +#else + +namespace boost::math::statistics::detail { + +template ::value_type> +auto rank(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) +{ + std::size_t elements = std::distance(first, last); + + std::vector> rank_vector(elements); + std::size_t i = 0; + while (first != last) + { + rank_vector[i] = std::make_pair(*first, i); + ++i; + ++first; + } + + std::sort(exec, rank_vector.begin(), rank_vector.end()); + + // Remove duplicates + rank_vector.erase(std::unique(exec, rank_vector.begin(), rank_vector.end(), pair_equal()), rank_vector.end()); + elements = rank_vector.size(); + + std::pair rank; + std::vector result(elements); + for (i = 0; i < elements; ++i) + { + if (rank_vector[i].first != rank.first) + { + rank = std::make_pair(rank_vector[i].first, i); + } + result[rank_vector[i].second] = rank.second; + } + + return result; +} + +template +inline auto rank(ExecutionPolicy&& exec, const Container& c) +{ + return rank(exec, std::cbegin(c), std::cend(c)); +} + +template ::value_type> +inline auto rank(ForwardIterator first, ForwardIterator last) +{ + return rank(std::execution::seq, first, last); +} + +template +inline auto rank(const Container& c) +{ + return rank(std::execution::seq, std::cbegin(c), std::cend(c)); +} + +} // Namespaces + +#endif // BOOST_MATH_EXEC_COMPATIBLE + +#endif // BOOST_MATH_STATISTICS_DETAIL_RANK_HPP diff --git a/include/boost/math/statistics/univariate_statistics.hpp b/include/boost/math/statistics/univariate_statistics.hpp index 63b0bd7701..711d807c79 100644 --- a/include/boost/math/statistics/univariate_statistics.hpp +++ b/include/boost/math/statistics/univariate_statistics.hpp @@ -20,9 +20,7 @@ #include #include -// Support compilers with P0024R2 implemented without linking TBB -// https://en.cppreference.com/w/cpp/compiler_support -#if !defined(BOOST_NO_CXX17_HDR_EXECUTION) && defined(BOOST_HAS_THREADS) +#ifdef BOOST_MATH_EXEC_COMPATIBLE #include namespace boost::math::statistics { @@ -672,7 +670,7 @@ inline auto mode(Container & v) } // Namespace boost::math::statistics -#else // Backwards compatible bindings for C++11 +#else // Backwards compatible bindings for C++11 or execution is not implemented namespace boost { namespace math { namespace statistics { diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 2a054fc261..d809338cc6 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -79,6 +79,12 @@ #endif // BOOST_MATH_STANDALONE +// Support compilers with P0024R2 implemented without linking TBB +// https://en.cppreference.com/w/cpp/compiler_support +#if !defined(BOOST_NO_CXX17_HDR_EXECUTION) && defined(BOOST_HAS_THREADS) +# define BOOST_MATH_EXEC_COMPATIBLE +#endif + #include // for min and max #include #include diff --git a/include/boost/math/tools/random_vector.hpp b/include/boost/math/tools/random_vector.hpp index 8e3a3289aa..d1954ee3b5 100644 --- a/include/boost/math/tools/random_vector.hpp +++ b/include/boost/math/tools/random_vector.hpp @@ -12,8 +12,8 @@ namespace boost { namespace math { // To stress test, set global_seed = 0, global_size = huge. -static const std::size_t global_seed = 0; -static const std::size_t global_size = 128; +static constexpr std::size_t global_seed = 0; +static constexpr std::size_t global_size = 128; template::value, bool>::type = true> std::vector generate_random_vector(std::size_t size, std::size_t seed) @@ -35,6 +35,28 @@ std::vector generate_random_vector(std::size_t size, std::size_t seed) return v; } +template::value, bool>::type = true> +std::vector generate_random_uniform_vector(std::size_t size, std::size_t seed, T lower_bound = T(0), T upper_bound = T(1)) +{ + if (seed == 0) + { + std::random_device rd; + seed = rd(); + } + std::vector v(size); + + std::mt19937 gen(seed); + + std::uniform_real_distribution dis(lower_bound, upper_bound); + + for (auto& i : v) + { + i = dis(gen); + } + + return v; +} + template::value, bool>::type = true> std::vector generate_random_vector(std::size_t size, std::size_t seed, T mean, T stddev) { diff --git a/reporting/performance/chatterjee_correlation_performance.cpp b/reporting/performance/chatterjee_correlation_performance.cpp new file mode 100644 index 0000000000..866edb1048 --- /dev/null +++ b/reporting/performance/chatterjee_correlation_performance.cpp @@ -0,0 +1,34 @@ +// (C) Copyright Matt Borland 2022. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include + +using boost::math::generate_random_vector; + +template +void chatterjee_correlation(benchmark::State& state) +{ + constexpr std::size_t seed {}; + const std::size_t size = state.range(0); + std::vector u = generate_random_vector(size, seed); + std::vector v = generate_random_vector(size, seed); + + std::sort(u.begin(), u.end()); + + for (auto _ : state) + { + benchmark::DoNotOptimize(boost::math::statistics::chatterjee_correlation(u, v)); + } + state.SetComplexityN(state.range(0)); +} + +BENCHMARK_TEMPLATE(chatterjee_correlation, float)->RangeMultiplier(2)->Range(1 << 6, 1 << 20)->Complexity()->UseRealTime(); +BENCHMARK_TEMPLATE(chatterjee_correlation, double)->RangeMultiplier(2)->Range(1 << 6, 1 << 20)->Complexity()->UseRealTime(); + +BENCHMARK_MAIN(); diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 5deced8cb0..f3df8c2c4f 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -1012,6 +1012,8 @@ test-suite misc : [ run bivariate_statistics_test.cpp : : : [ requires cxx11_hdr_forward_list cxx11_hdr_atomic cxx11_hdr_thread cxx11_hdr_tuple cxx11_hdr_future cxx11_sfinae_expr ] [ check-target-builds ../config//is_cygwin_run "Cygwin CI run" : no ] ] [ run linear_regression_test.cpp : : : [ requires cxx11_hdr_forward_list cxx11_hdr_atomic cxx11_hdr_thread cxx11_hdr_tuple cxx11_hdr_future cxx11_sfinae_expr ] ] [ run test_runs_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] + [ run test_chatterjee_correlation.cpp ../../test/build//boost_unit_test_framework ] + [ run test_rank.cpp ../../test/build//boost_unit_test_framework ] [ run lanczos_smoothing_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run condition_number_test.cpp ../../test/build//boost_unit_test_framework : : : [ check-target-builds ../config//has_float128 "GCC libquadmath and __float128 support" : "-Bstatic -lquadmath -Bdynamic" ] [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run test_real_concept.cpp ../../test/build//boost_unit_test_framework ] diff --git a/test/compile_test/stats_chaterjee_incl_test.cpp b/test/compile_test/stats_chaterjee_incl_test.cpp new file mode 100644 index 0000000000..84c235c244 --- /dev/null +++ b/test/compile_test/stats_chaterjee_incl_test.cpp @@ -0,0 +1,9 @@ +// Copyright Matt Borland 2021. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Basic sanity check that header +// #includes all the files that it needs to. +// +#include diff --git a/test/math_unit_test.hpp b/test/math_unit_test.hpp index 6a6af9fc15..0a526a15a3 100644 --- a/test/math_unit_test.hpp +++ b/test/math_unit_test.hpp @@ -384,6 +384,8 @@ int report_errors() #define CHECK_ULP_CLOSE(X, Y, Z) boost::math::test::check_ulp_close((X), (Y), (Z), __FILE__, __func__, __LINE__) +#define CHECK_GE(X, Y) boost::math::test::check_le((Y), (X), __FILE__, __func__, __LINE__) + #define CHECK_LE(X, Y) boost::math::test::check_le((X), (Y), __FILE__, __func__, __LINE__) #define CHECK_NAN(X) boost::math::test::check_nan((X), __FILE__, __func__, __LINE__) diff --git a/test/test_chatterjee_correlation.cpp b/test/test_chatterjee_correlation.cpp new file mode 100644 index 0000000000..be91a925e6 --- /dev/null +++ b/test/test_chatterjee_correlation.cpp @@ -0,0 +1,159 @@ +// (C) Copyright Matt Borland 2022. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "math_unit_test.hpp" + +// The Chatterjee correlation is invariant under: +// - Shuffles. (X_i, Y_i) -> (X_sigma(i), Y_sigma(i)), where sigma is a permutation. +// - Strictly monotone transformations: (X_i, Y_i) -> (f(X_i), g(Y_i)) where f' > 0 and g' > 0. +// + +using boost::math::statistics::chatterjee_correlation; + +template +void properties() +{ + std::size_t vector_size = 256; + std::mt19937_64 mt(123521); + std::uniform_real_distribution unif(-1, 1); + std::vector X(vector_size); + std::vector Y(vector_size); + + for (std::size_t i = 0; i < vector_size; ++i) + { + X[i] = unif(mt); + Y[i] = unif(mt); + } + std::sort(X.begin(), X.end()); + Real coeff1 = chatterjee_correlation(X, Y); + // The minimum possible value of En(X, Y) is -1/2 + O(1/n) + CHECK_GE(coeff1, Real(-0.5)); + CHECK_LE(coeff1, Real(1)); + + // Now apply a monotone function to the data + for (std::size_t i = 0; i < vector_size; ++i) + { + X[i] = Real(2.3)*X[i] - Real(7.3); + Y[i] = Real(7.6)*Y[i] - Real(8.6); + } + auto coeff3 = chatterjee_correlation(X, Y); + CHECK_EQUAL(coeff1, coeff3); + + // If there are no ties among the Yis, the maximum possible value of Xi(X, Y) is (n - 2)/(n + 1), which is attained if Yi = Xi for all i + auto coeff = chatterjee_correlation(X, X); + // These floating point numbers are computed by two different methods, so we can expect some floating point error: + const auto n = X.size(); + CHECK_ULP_CLOSE(coeff, Real(n-2)/Real(n+1), 1); + std::sort(Y.begin(), Y.end()); + coeff = chatterjee_correlation(Y, Y); + CHECK_ULP_CLOSE(coeff, Real(n-2)/Real(n+1), 1); +} + +template +void test_spots() +{ + // Rank Order: Result will be 1 - 3*3 / (4^2 - 1) = 1 - 9/15 = 0.6 + std::vector x = {1, 2, 3, 4}; + std::vector y = {1, 2, 3, 4}; + CHECK_ULP_CLOSE(chatterjee_correlation(x, y), 1 - Real(9)/15, 1); + + // Reverse rank order should be the same as above + y = {4, 3, 2, 1}; + CHECK_ULP_CLOSE(chatterjee_correlation(x, y), 1 - Real(9)/15, 1); + + // Alternating order: 1 - 3*5 / (4^2 - 1) = 1 - 15/15 = 0 + y = {1, 3, 2, 4}; + CHECK_ULP_CLOSE(chatterjee_correlation(x, y), Real(0), 1); + + // All ties will yield quiet NaN + y = {1, 1, 1, 1}; + CHECK_NAN(chatterjee_correlation(x, y)); +} + +#ifdef BOOST_MATH_EXEC_COMPATIBLE + +template +void test_threaded(ExecutionPolicy&& exec) +{ + std::vector x = boost::math::generate_random_vector(1024, 0); + std::vector y = boost::math::generate_random_vector(1024, 1); + + std::sort(std::forward(exec), x.begin(), x.end()); + + auto seq_ans = chatterjee_correlation(x, y); + auto par_ans = chatterjee_correlation(exec, x, y); + + CHECK_ULP_CLOSE(seq_ans, par_ans, 1); +}; + +#endif // BOOST_MATH_EXEC_COMPATIBLE + +template +void test_paper() +{ + constexpr Real two_pi = boost::math::constants::two_pi(); + + // Page 9 figure (a) y = x + std::vector x = boost::math::generate_random_uniform_vector(100, 0, -two_pi, two_pi); + std::sort(x.begin(), x.end()); + auto result = chatterjee_correlation(x, x); + CHECK_MOLLIFIED_CLOSE(result, Real(0.970), 0.005); + + // Page 9 figure (d) y = x^2 + std::vector y = x; + for (auto& i : y) + { + i *= i; + } + + result = chatterjee_correlation(x, y); + CHECK_MOLLIFIED_CLOSE(result, Real(0.941), 0.005); + + // Page 9 figure (g) y = sin(x) + for (std::size_t i {}; i < x.size(); ++i) + { + y[i] = std::sin(x[i]); + } + + result = chatterjee_correlation(x, y); + CHECK_MOLLIFIED_CLOSE(result, Real(0.885), 0.01); +} + +int main(void) +{ + properties(); + properties(); + properties(); + + test_spots(); + test_spots(); + test_spots(); + + #ifdef BOOST_MATH_EXEC_COMPATIBLE + + test_threaded(std::execution::par); + test_threaded(std::execution::par); + test_threaded(std::execution::par); + test_threaded(std::execution::par_unseq); + test_threaded(std::execution::par_unseq); + test_threaded(std::execution::par_unseq); + + #endif // BOOST_MATH_EXEC_COMPATIBLE + + test_paper(); + test_paper(); + test_paper(); + + return boost::math::test::report_errors(); +} diff --git a/test/test_rank.cpp b/test/test_rank.cpp new file mode 100644 index 0000000000..73c022121d --- /dev/null +++ b/test/test_rank.cpp @@ -0,0 +1,102 @@ +// (C) Copyright Matt Borland 2022. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "math_unit_test.hpp" + +template +void test() +{ + std::vector test_vals {T(1.0), T(3.2), T(2.4), T(5.6), T(4.1)}; + auto rank_vector = boost::math::statistics::detail::rank(test_vals.begin(), test_vals.end()); + + CHECK_EQUAL(static_cast(0), rank_vector[0]); + CHECK_EQUAL(static_cast(2), rank_vector[1]); + CHECK_EQUAL(static_cast(1), rank_vector[2]); + CHECK_EQUAL(static_cast(4), rank_vector[3]); + CHECK_EQUAL(static_cast(3), rank_vector[4]); + + // Remove duplicates + test_vals.push_back(T(4.1)); + test_vals.push_back(T(2.4)); + rank_vector = boost::math::statistics::detail::rank(test_vals.begin(), test_vals.end()); + + // Check the size is correct and the ordering is not disrupted + CHECK_EQUAL(static_cast(5), rank_vector.size()); + CHECK_EQUAL(static_cast(0), rank_vector[0]); + CHECK_EQUAL(static_cast(2), rank_vector[1]); + CHECK_EQUAL(static_cast(1), rank_vector[2]); + CHECK_EQUAL(static_cast(4), rank_vector[3]); + CHECK_EQUAL(static_cast(3), rank_vector[4]); +} + +template +void container_test() +{ + std::vector test_vals {T(1.0), T(3.2), T(2.4), T(5.6), T(4.1)}; + auto rank_vector = boost::math::statistics::detail::rank(test_vals); + + CHECK_EQUAL(static_cast(0), rank_vector[0]); + CHECK_EQUAL(static_cast(2), rank_vector[1]); + CHECK_EQUAL(static_cast(1), rank_vector[2]); + CHECK_EQUAL(static_cast(4), rank_vector[3]); + CHECK_EQUAL(static_cast(3), rank_vector[4]); +} + +#ifdef BOOST_MATH_EXEC_COMPATIBLE + +#include + +template +void execution_test(ExecutionPolicy&& exec) +{ + std::vector test_vals {T(1.0), T(3.2), T(2.4), T(5.6), T(4.1)}; + auto rank_vector = boost::math::statistics::detail::rank(exec, test_vals.begin(), test_vals.end()); + + CHECK_EQUAL(static_cast(0), rank_vector[0]); + CHECK_EQUAL(static_cast(2), rank_vector[1]); + CHECK_EQUAL(static_cast(1), rank_vector[2]); + CHECK_EQUAL(static_cast(4), rank_vector[3]); + CHECK_EQUAL(static_cast(3), rank_vector[4]); + + // Remove duplicates + test_vals.push_back(T(4.1)); + test_vals.push_back(T(2.4)); + rank_vector = boost::math::statistics::detail::rank(exec, test_vals.begin(), test_vals.end()); + + // Check the size is correct and the ordering is not disrupted + CHECK_EQUAL(static_cast(5), rank_vector.size()); + CHECK_EQUAL(static_cast(0), rank_vector[0]); + CHECK_EQUAL(static_cast(2), rank_vector[1]); + CHECK_EQUAL(static_cast(1), rank_vector[2]); + CHECK_EQUAL(static_cast(4), rank_vector[3]); + CHECK_EQUAL(static_cast(3), rank_vector[4]); +} + +#endif // BOOST_MATH_EXEC_COMPATIBLE + +int main(void) +{ + test(); + test(); + test(); + + container_test(); + container_test(); + container_test(); + + #ifdef BOOST_MATH_EXEC_COMPATIBLE + + execution_test(std::execution::par); + execution_test(std::execution::par); + execution_test(std::execution::par); + + #endif // BOOST_MATH_EXEC_COMPATIBLE + + return boost::math::test::report_errors(); +}