From cb7cf44e7839ee74a0ae3c62505ed0307a51409b Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 22 Feb 2019 14:15:18 +0100 Subject: [PATCH] Add pairwise convenience method for tables Also add docstrings for pairwise and pairwise!. --- Project.toml | 4 ++++ README.md | 9 +++++++++ REQUIRE | 3 ++- src/Distances.jl | 1 + src/generic.jl | 33 +++++++++++++++++++++++++++++++++ test/test_dists.jl | 8 ++++++++ 6 files changed, 57 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 41a4e7a..2e7cf7f 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [extras] Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -11,3 +12,6 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Random", "Test"] + +[compat] +Tables = ">= 0.1.15" \ No newline at end of file diff --git a/README.md b/README.md index 03363f3..8211f5f 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,15 @@ For performance reasons, it is recommended to use matrices with observations in the ``Array`` type in Julia is column-major, making it more efficient to access memory column by column. However, matrices with observations stored in rows are also supported via the argument ``dims=1``. +A convenience method is provided to compute pairwise distances between observations stored as rows in +any type of tabular data structure supported by the [Tables.jl](https://github.com/JuliaData/Tables.jl) +interface. 
Here is an example using a [`DataFrame`](https://github.com/JuliaData/DataFrames.jl): +```julia +using DataFrames +df = DataFrame(x = [1, 2, 3], y = [2, 5, 3]) +pairwise(Euclidean(), df) +``` + #### Computing column-wise and pairwise distances inplace If the vector/matrix to store the results are pre-allocated, you may use the storage (without creating a new array) using the following syntax (``i`` being either ``1`` or ``2``): diff --git a/REQUIRE b/REQUIRE index bc15a3a..e7c5ecd 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1 +1,2 @@ -julia 0.7- \ No newline at end of file +julia 0.7- +Tables 0.1.15 \ No newline at end of file diff --git a/src/Distances.jl b/src/Distances.jl index 4df5b29..dee4985 100644 --- a/src/Distances.jl +++ b/src/Distances.jl @@ -4,6 +4,7 @@ module Distances using LinearAlgebra using Statistics +using Tables export # generic types/functions diff --git a/src/generic.jl b/src/generic.jl index 1e23a31..78209fe 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -122,6 +122,17 @@ function deprecated_dims(dims::Union{Nothing,Integer}) end end +""" + pairwise!(r::AbstractMatrix, metric::PreMetric, + a::AbstractMatrix, b::AbstractMatrix=a; dims) + +Compute distances between each pair of rows (if `dims=1`) or columns (if `dims=2`) +in `a` and `b` according to distance `metric`, and store the result in `r`. +If a single matrix `a` is provided, compute distances between its rows or columns. + +`a` and `b` must have the same number of columns if `dims=1`, or of rows if `dims=2`. +`r` must be a matrix of size `(size(a, dims), size(b, dims))`. 
+""" function pairwise!(r::AbstractMatrix, metric::PreMetric, a::AbstractMatrix, b::AbstractMatrix; dims::Union{Nothing,Integer}=nothing) @@ -165,6 +176,15 @@ function pairwise!(r::AbstractMatrix, metric::PreMetric, a::AbstractMatrix; end end +""" + pairwise(metric::PreMetric, a::AbstractMatrix, b::AbstractMatrix=a; dims) + +Compute distances between each pair of rows (if `dims=1`) or columns (if `dims=2`) +in `a` and `b` according to distance `metric`. If a single matrix `a` is provided, +compute distances between its rows or columns. + +`a` and `b` must have the same number of columns if `dims=1`, or of rows if `dims=2`. +""" function pairwise(metric::PreMetric, a::AbstractMatrix, b::AbstractMatrix; dims::Union{Nothing,Integer}=nothing) dims = deprecated_dims(dims) @@ -183,3 +203,16 @@ function pairwise(metric::PreMetric, a::AbstractMatrix; r = Matrix{result_type(metric, a, a)}(undef, n, n) pairwise!(r, metric, a, dims=dims) end + +""" + pairwise(metric::PreMetric, t) + +Compute distances between each pair of observations (i.e. rows) in table `t` +according to distance `metric`. `t` can be any type of table supported by +the [Tables.jl](https://github.com/JuliaData/Tables.jl) interface. +""" +function pairwise(metric::PreMetric, t::Any) + # TODO: avoid permuting using https://github.com/JuliaData/Tables.jl/pull/66 + a = permutedims(Tables.matrix(t)) + pairwise(metric, a, dims=2) +end \ No newline at end of file diff --git a/test/test_dists.jl b/test/test_dists.jl index 7dcc25e..38ebe90 100644 --- a/test/test_dists.jl +++ b/test/test_dists.jl @@ -429,6 +429,14 @@ end test_colwise(Mahalanobis(Q), X, Y, T) end +@testset "pairwise Tables.jl interface" begin + t = [(a=1, b=2), (a=2, b=3), (a=0, b=5)] + a = [1 2; 2 3; 0 5] + @test pairwise(Euclidean(), t) == pairwise(Euclidean(), a, dims=1) + + @test_throws ArgumentError pairwise(Euclidean(), [1]) +end + function test_pairwise(dist, x, y, T) @testset "Pairwise test for $(typeof(dist))" begin nx = size(x, 2)