From 4b3c4f2c72048822a57eb335bf0701899ba6826e Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sat, 26 May 2018 20:12:08 -0700 Subject: [PATCH 01/14] prep for consolidation of density/violin plot code --- src/geom/{violin.jl => density.jl} | 0 src/geometry.jl | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename src/geom/{violin.jl => density.jl} (100%) diff --git a/src/geom/violin.jl b/src/geom/density.jl similarity index 100% rename from src/geom/violin.jl rename to src/geom/density.jl diff --git a/src/geometry.jl b/src/geometry.jl index a814e6c23..fe953534e 100644 --- a/src/geometry.jl +++ b/src/geometry.jl @@ -62,7 +62,7 @@ include("geom/point.jl") include("geom/rectbin.jl") include("geom/subplot.jl") include("geom/ribbon.jl") -include("geom/violin.jl") +include("geom/density.jl") include("geom/polygon.jl") include("geom/beeswarm.jl") include("geom/segment.jl") From 944fc295efb82afd7bff585e631520ad07cd6c92 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 27 May 2018 21:26:11 -0700 Subject: [PATCH 02/14] [WIP] prelim version of the density geometry revamp working on #1152. Note: This is a WIP and currently completely breaks `Geom.density` and `Geom.violin` has several regressions. 
--- src/aesthetics.jl | 71 +++++++++++++++++ src/geom/density.jl | 70 ++++++++++++----- src/geom/line.jl | 10 --- src/statistics.jl | 187 ++++++++++++++++++++++++++++++++++++-------- 4 files changed, 277 insertions(+), 61 deletions(-) diff --git a/src/aesthetics.jl b/src/aesthetics.jl index 6aa67d9bc..c1d955849 100755 --- a/src/aesthetics.jl +++ b/src/aesthetics.jl @@ -1,3 +1,5 @@ +using IterTools + const NumericalOrCategoricalAesthetic = Union{(Void), Vector, DataArray, IndirectArray} @@ -413,3 +415,72 @@ function inherit!(a::Aesthetics, b::Aesthetics; end nothing end + +""" + Given aesthetics to group with, `by`, and an aesthetic to group `togroupvar` +this function constructs a dictionary that maps each given combination of the +`by` aesthetics to the positions which they apply to. Thus the output is a +dictionary of tuples of each unique combination of `by` mapped to a boolean +array of length `n` where `n` is the length of the aesthetics (they have to all +have the same length). If the provided aesthetics are missing, a placeholder +`nothing` is return instead of the unique value. 
+ +## Examples + +```jldoctest +aes = Gadfly.Aesthetics() +aes.x = repeat([1, 2], inner=3) +aes.y = collect(1:6) + +groupby(aes, [:x, :color], :y) + +# output + +Dict((2, nothing)=>Bool[false, false, false, true, true, true],(1, nothing)=>Bool[true, true, true, false, false, false]) +``` + +```jldoctest +aes = Gadfly.Aesthetics() +aes.x = repeat([:a, :b], inner=2) +aes.y = collect(1:4) +aes.color = repeat([colorant"red", colorant"blue"], inner=2) + +groupby(aes, [:x, :color], :y) + +# output + +Dict((:a, RGB{N0f8}(1.0,0.0,0.0))=>Bool[true, true, false, false],(:b, RGB{N0f8}(0.0,0.0,1.0))=>Bool[false, false, true, true]) +``` + +""" +function groupby(aes::Gadfly.Aesthetics, by::Vector{Symbol}, togroupvar::Symbol) + types = fill(Nothing, length(by)) + isconcrete = fill(false, length(by)) + for i in 1:length(by) + isconcrete[i] = getfield(aes, by[i]) != nothing + (!isconcrete[i]) && continue + types[i] = eltype(getfield(aes, by[i])) + @assert length(getfield(aes, togroupvar)) == length(getfield(aes, by[i])) "$togroupvar and $(by[i]) aesthetics must have same length" + end + + T = Tuple{types...} + grouped = Dict{T, Vector{Bool}}() + + # gather options for each `by` aesthetic + opt = [if isconcrete[i] unique(getfield(aes, by[i])) else [nothing] end for i in 1:length(by)] + + # The approach is to identify positions were multiple by aesthetics overlap + # and thus grouping the data positions. We first assume that all positions + # belong to a combination of aesthetics and then whittle it down + for combo in product(opt...) 
+ belongs = fill(true, length(getfield(aes, togroupvar))) + for i in 1:length(combo) + (combo[i] == nothing) && continue + belongs .&= getfield(aes, by[i]) .== combo[i] + end + # for multiple by variables we need to check whether there is any overlap + # between this specific combo before adding it to the dict + (any(belongs)) && (grouped[combo] = belongs) + end + grouped +end diff --git a/src/geom/density.jl b/src/geom/density.jl index 21539287b..c35e82a8b 100644 --- a/src/geom/density.jl +++ b/src/geom/density.jl @@ -1,8 +1,29 @@ +struct DensityGeometry <: Gadfly.GeometryElement + stat::Gadfly.StatisticElement + order::Int + tag::Symbol +end + +function DensityGeometry(; order=1, tag=empty_tag, kwargs...) + DensityGeometry(Gadfly.Stat.DensityStatistic(; kwargs...), order, tag) +end + +DensityGeometry(stat; order=1, tag=empty_tag) = DensityGeometry(stat, order, tag) + +const density = DensityGeometry + +element_aesthetics(::DensityGeometry) = Symbol[] +default_statistic(geom::DensityGeometry) = Gadfly.Stat.DensityStatistic(geom.stat) + struct ViolinGeometry <: Gadfly.GeometryElement + stat::Gadfly.StatisticElement + split::Bool order::Int tag::Symbol end -ViolinGeometry(; order=1, tag=empty_tag) = ViolinGeometry(order, tag) +function ViolinGeometry(; order=1, tag=empty_tag, split=false, kwargs...) + ViolinGeometry(Gadfly.Stat.DensityStatistic(; kwargs...), split, order, tag) +end """ Geom.violin[(; order=1)] @@ -15,29 +36,42 @@ const violin = ViolinGeometry element_aesthetics(::ViolinGeometry) = [:x, :y, :color] -default_statistic(::ViolinGeometry) = Gadfly.Stat.violin() +default_statistic(geom::ViolinGeometry) = Gadfly.Stat.DensityStatistic(geom.stat) function render(geom::ViolinGeometry, theme::Gadfly.Theme, aes::Gadfly.Aesthetics) - # TODO: What should we do with the color aesthetic? 
Gadfly.assert_aesthetics_defined("Geom.violin", aes, :y, :width) Gadfly.assert_aesthetics_equal_length("Geom.violin", aes, :y, :width) - default_aes = Gadfly.Aesthetics() - default_aes.color = fill(theme.default_color, length(aes.y)) - aes = Gadfly.inherit(aes, default_aes) - - # Group y, width and color by x - ux = unique(aes.x) - grouped_color = Dict(x => first(aes.color[aes.x.==x]) for x in ux) - grouped_y = Dict(x => aes.y[aes.x.==x] for x in ux) - grouped_width = Dict(x => aes.width[aes.x.==x] for x in ux) - - kgy = keys(grouped_y) - violins = [vcat([(x - w/2, y) for (y, w) in zip(grouped_y[x], grouped_width[x])], - reverse!([(x + w/2, y) for (y, w) in zip(grouped_y[x], grouped_width[x])])) - for x in kgy] - colors = [grouped_color[x] for x in kgy] + grouped_data = Gadfly.groupby(aes, [:x, :color], :y) + violins = Array{NTuple{2, Float64}}[] + + colors = [] + (aes.color == nothing) && (aes.color = fill(theme.default_color, length(aes.x))) + color_opts = unique(aes.color) + if geom.split && length(color_opts) > 2 + error("Split violins require 2 colors, not more") + end + + for (keys, belongs) in grouped_data + x, color = keys + ys = aes.y[belongs] + ws = aes.width[belongs] + + if geom.split + pos = findfirst(color_opts, color) + if pos == 1 + push!(violins, [(x - w/2, y) for (y, w) in zip(ys, ws)]) + else + push!(violins, reverse!([(x + w/2, y) for (y, w) in zip(ys, ws)])) + end + push!(colors, color) + else + push!(violins, vcat([(x - w/2, y) for (y, w) in zip(ys, ws)], + reverse!([(x + w/2, y) for (y, w) in zip(ys, ws)]))) + push!(colors, color != nothing ? color : theme.default_color) + end + end ctx = context(order=geom.order) compose!(ctx, Compose.polygon(violins, geom.tag), fill(colors)) diff --git a/src/geom/line.jl b/src/geom/line.jl index fe10e24fc..b2ac03aae 100644 --- a/src/geom/line.jl +++ b/src/geom/line.jl @@ -51,16 +51,6 @@ geometry is equivalent to [`Geom.line`](@ref) with `preserve_order=true`. 
""" path() = LineGeometry(preserve_order=true) -""" - Geom.density[(; bandwidth=-Inf)] - -Draw a line showing the density estimate of the `x` aesthetic. -This geometry is equivalent to [`Geom.line`](@ref) with -[`Stat.density`](@ref); see the latter for more information. -""" -density(; bandwidth::Real=-Inf) = - LineGeometry(Gadfly.Stat.density(bandwidth=bandwidth)) - """ Geom.density2d[(; bandwidth=(-Inf,-Inf), levels=15)] diff --git a/src/statistics.jl b/src/statistics.jl index 2be810105..3be0de33d 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -17,8 +17,12 @@ using IndirectArrays import Gadfly: Scale, Coord, input_aesthetics, output_aesthetics, default_scales, isconcrete, setfield!, discretize_make_ia, aes2str import KernelDensity +<<<<<<< HEAD # import Distributions: Uniform, Distribution, qqbuild import IterTools: distinct +======= +import IterTools: chain, distinct +>>>>>>> [WIP] prelim version of the density geometry revamp import Compat.Iterators: cycle, product include("bincount.jl") @@ -502,13 +506,57 @@ end struct DensityStatistic <: Gadfly.StatisticElement - # Number of points sampled + """ + Number of points sampled for estimate. Powers of two yields better + performance. + """ n::Int - # Bandwidth used for the kernel density estimation + + """ + Bandwidth used for the kernel density estimation. This corresponds to the + standard deviation of the `kernel`. + """ bw::Real + + """ + Multiplicative adjustment of the computed optimal bandwidth. This is a + relative adjustment, see `bw` to enforce a specific numerical bandwidth. + """ + adjust::Float64 + + """ + Kernel used for density estimation, see `KernelDensity.jl` for more details. + Default is the Normal Distribution. + """ + kernel + + """ + If set to `true` (default), trim the tails of the estimate to fit the range + of the data. + """ + trim::Bool + + """ + Method for scaling across multiple estimates. 
If `:area` (default), all + density estimates will have the same area under the curve (prior to trimming + ). If `:count`, the areas are scaled proportionally to the total number of + observations for each density estimate. If `:peak`, then all densities will + have the same maximum peak height. + """ + scale::Symbol end -DensityStatistic(; n=256, bandwidth=-Inf) = DensityStatistic(n, bandwidth) +function DensityStatistic(; n=256, + bandwidth=-Inf, + adjust=1.0, + kernel=Normal, + trim=true, + scale=:area, + ) + DensityStatistic(n, bandwidth, adjust, kernel, trim, scale) +end + +<<<<<<< HEAD input_aesthetics(stat::DensityStatistic) = [:x, :color] output_aesthetics(stat::DensityStatistic) = [:x, :y, :color] default_scales(::DensityStatistic) = [Gadfly.Scale.y_continuous()] @@ -520,47 +568,117 @@ Estimate the density of `x` at `n` points, and put the result in `x` and `y`. Smoothing is controlled by `bandwidth`. Used by [`Geom.density`](@ref Gadfly.Geom.density). """ const density = DensityStatistic +======= + +input_aesthetics(stat::DensityStatistic) = [:x, :y, :color] +output_aesthetics(stat::DensityStatistic) = [:x, :y, :color] + +""" +Given a scale and aesthetic, figures out which variable will be fed into the KDE, +which aesthetics will be overridden with what and provides a nested dictionary +mapping category to dictionaries of color mapped to a boolean array. The array +corresponds to whether a certain datapoint belongs to a given (category, color) +group. The length of the boolean array is equal to `length(aes.x)`, etc. +""" +function determine_density_grouping(scales::Dict{Symbol, Gadfly.ScaleElement}, + aes::Gadfly.Aesthetics) + + # TODO: This general approach should work for any statistic that has a + # categorical component and color. 
So this should be generalized to bar and + # boxplot + defined = intersect(Set([:x, :y]), Gadfly.defined_aesthetics(aes)) + (length(defined) == 0) && error("DensityStatistic requires either the x or y aesthetics to be defined") + densityvar = :x # variable on which the KDE will be run + + # violin and density have different `element_aesthetics` functions and + # therefore both x and y will only be defined if `Geom.violin()` is used + if length(defined) == 1 && :y in defined + densityvar = :y + end + + # what aes variable to store the category info, points, and densities + outputs = (nothing, :x, :y) + + unqcat = [1] + # densities grouped by the categorical variable and color + catgroup = Dict(1 => fill(true, length(getfield(aes, densityvar)))) + + if :x in defined && :y in defined # is violin? + # We first need to establish whether this is a horizontal or vertical violin + xcat, ycat = Scale.iscategorical(scales, :x), Scale.iscategorical(scales, :y) + catvar = :x + if xcat && ycat + error("Either the x or y aesthetics must be Real for kernel density estimation") + elseif xcat + densityvar = :y + elseif ycat + catvar = :y + densityvar = :x + else # neither x or y is categorical so we'll assume x is meant to be categorical, see #968 + new_scale = Scale.x_discrete(order=sortperm(unique(aes.x))) + Scale.apply_scale(new_scale, [aes], Gadfly.Data(x=aes.x)) + scales[:x] = new_scale + densityvar = :y + warn( + """ + Both x and y aesthetics are continuous, violin plots require a + categorical variable. Transforming x to be categorical. 
+ """) + end + + outputs = (catvar, densityvar, :width) + + end + + densityvar, outputs, Gadfly.groupby(aes, [catvar, :color], densityvar) +end +>>>>>>> [WIP] prelim version of the density geometry revamp function apply_statistic(stat::DensityStatistic, scales::Dict{Symbol, Gadfly.ScaleElement}, coord::Gadfly.CoordinateElement, aes::Gadfly.Aesthetics) - Gadfly.assert_aesthetics_defined("DensityStatistic", aes, :x) - if aes.color === nothing - isa(aes.x[1], Real) || error("Kernel density estimation only works on Real types.") - x_f64 = collect(Float64, aes.x) + densityvar, outputs, grouped_data = determine_density_grouping(scales, aes) - window = stat.bw <= 0.0 ? KernelDensity.default_bandwidth(x_f64) : stat.bw - f = KernelDensity.kde(x_f64, bandwidth=window, npoints=stat.n) - aes.x = collect(Float64, f.x) - aes.y = f.density - else - groups = Dict() - for (x, c) in zip(aes.x, cycle(aes.color)) - if !haskey(groups, c) - groups[c] = Float64[x] - else - push!(groups[c], x) - end - end + densityinput = getfield(aes, densityvar) - colors = Array{RGB{Float32}}(0) - aes.x = Array{Float64}(0) - aes.y = Array{Float64}(0) - for (c, xs) in groups - window = stat.bw <= 0.0 ? KernelDensity.default_bandwidth(xs) : stat.bw - f = KernelDensity.kde(xs, bandwidth=window, npoints=stat.n) - append!(aes.x, f.x) - append!(aes.y, f.density) - for _ in 1:length(f.x) - push!(colors, c) - end + aes.x = Array{Float64}(0) + aes.y = Array{Float64}(0) + aes.width = Array{Float64}(0) + colors = eltype(aes.color)[] + + for ((cat, color), belongs) in grouped_data + input = densityinput[belongs] + window = stat.n > 1 ? 
KernelDensity.default_bandwidth(input)*stat.adjust : 0.1 + if stat.trim + f = KernelDensity.kde(input, kernel=stat.kernel, + boundary=extrema(input), + bandwidth=window, + npoints=stat.n) + else + f = KernelDensity.kde(input, kernel=stat.kernel, bandwidth=window, npoints=stat.n) end - aes.color = discretize_make_ia(colors) + # only store category information if this is a violin plot and we need it + (outputs[1] != nothing) && append!(getfield(aes, outputs[1]), fill(cat, length(f.density))) + append!(getfield(aes, outputs[2]), f.x) + + if stat.scale == :area + append!(getfield(aes, outputs[3]), f.density) + elseif stat.scale == :count + append!(getfield(aes, outputs[3]), f.density.*sum(input)) + else + append!(getfield(aes, outputs[3]), f.density ./ maximum(f.density)) + end + append!(colors, fill(color, length(f.density))) end - aes.y_label = Gadfly.Scale.identity_formatter + (aes.color != nothing) && (aes.color = colors) + + pad = 0.1 + maxwidth = maximum(aes.width) + broadcast!(*, aes.width, aes.width, 1 - pad) + broadcast!(/, aes.width, aes.width, maxwidth) end @@ -1621,6 +1739,7 @@ function apply_statistic(stat::QQStatistic, end end +<<<<<<< HEAD struct ViolinStatistic <: Gadfly.StatisticElement n::Int # Number of points sampled @@ -1680,6 +1799,8 @@ function apply_statistic(stat::ViolinStatistic, end +======= +>>>>>>> [WIP] prelim version of the density geometry revamp struct JitterStatistic <: Gadfly.StatisticElement vars::Vector{Symbol} range::Float64 From ca168c36243fdc23d5b8165b2b4492a350ca7f30 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Thu, 31 May 2018 16:39:21 -0700 Subject: [PATCH 03/14] preserve insertion order when grouping this is necessary for allowing user control over ordering in stacked density plots --- src/aesthetics.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/aesthetics.jl b/src/aesthetics.jl index c1d955849..9d1e018d3 100755 --- a/src/aesthetics.jl +++ b/src/aesthetics.jl @@ -1,5 +1,3 @@ -using 
IterTools - const NumericalOrCategoricalAesthetic = Union{(Void), Vector, DataArray, IndirectArray} @@ -464,7 +462,7 @@ function groupby(aes::Gadfly.Aesthetics, by::Vector{Symbol}, togroupvar::Symbol) end T = Tuple{types...} - grouped = Dict{T, Vector{Bool}}() + grouped = DataStructures.OrderedDict{T, Vector{Bool}}() # gather options for each `by` aesthetic opt = [if isconcrete[i] unique(getfield(aes, by[i])) else [nothing] end for i in 1:length(by)] @@ -472,7 +470,7 @@ function groupby(aes::Gadfly.Aesthetics, by::Vector{Symbol}, togroupvar::Symbol) # The approach is to identify positions were multiple by aesthetics overlap # and thus grouping the data positions. We first assume that all positions # belong to a combination of aesthetics and then whittle it down - for combo in product(opt...) + for combo in IterTools.product(opt...) belongs = fill(true, length(getfield(aes, togroupvar))) for i in 1:length(combo) (combo[i] == nothing) && continue From 6b2462e9dc06d5c9e3a64d267c72211157ab8408 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Thu, 31 May 2018 16:44:07 -0700 Subject: [PATCH 04/14] update doctests for groupby --- src/aesthetics.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/aesthetics.jl b/src/aesthetics.jl index 9d1e018d3..aa79c66a3 100755 --- a/src/aesthetics.jl +++ b/src/aesthetics.jl @@ -434,7 +434,7 @@ groupby(aes, [:x, :color], :y) # output -Dict((2, nothing)=>Bool[false, false, false, true, true, true],(1, nothing)=>Bool[true, true, true, false, false, false]) +DataStructures.OrderedDict((2, nothing)=>Bool[false, false, false, true, true, true],(1, nothing)=>Bool[true, true, true, false, false, false]) ``` ```jldoctest @@ -447,7 +447,7 @@ groupby(aes, [:x, :color], :y) # output -Dict((:a, RGB{N0f8}(1.0,0.0,0.0))=>Bool[true, true, false, false],(:b, RGB{N0f8}(0.0,0.0,1.0))=>Bool[false, false, true, true]) +DataStructures.OrderedDict((:a, RGB{N0f8}(1.0,0.0,0.0))=>Bool[true, true, false, false],(:b, 
RGB{N0f8}(0.0,0.0,1.0))=>Bool[false, false, true, true]) ``` """ @@ -461,8 +461,7 @@ function groupby(aes::Gadfly.Aesthetics, by::Vector{Symbol}, togroupvar::Symbol) @assert length(getfield(aes, togroupvar)) == length(getfield(aes, by[i])) "$togroupvar and $(by[i]) aesthetics must have same length" end - T = Tuple{types...} - grouped = DataStructures.OrderedDict{T, Vector{Bool}}() + grouped = DataStructures.OrderedDict{Tuple{types...}, Vector{Bool}}() # gather options for each `by` aesthetic opt = [if isconcrete[i] unique(getfield(aes, by[i])) else [nothing] end for i in 1:length(by)] From d278888743d72ee0283e53cfe0c82ee24ab0e821 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Thu, 31 May 2018 17:56:17 -0700 Subject: [PATCH 05/14] add a render() to Geom.density --- src/geom/density.jl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/geom/density.jl b/src/geom/density.jl index c35e82a8b..ebc1a20c0 100644 --- a/src/geom/density.jl +++ b/src/geom/density.jl @@ -15,6 +15,32 @@ const density = DensityGeometry element_aesthetics(::DensityGeometry) = Symbol[] default_statistic(geom::DensityGeometry) = Gadfly.Stat.DensityStatistic(geom.stat) +function render(geom::DensityGeometry, theme::Gadfly.Theme, aes::Gadfly.Aesthetics) + Gadfly.assert_aesthetics_defined("Geom.density", aes, :x, :y) + Gadfly.assert_aesthetics_equal_length("Geom.density", aes, :x, :y) + + grouped_data = Gadfly.groupby(aes, [:color], :y) + densities = Array{NTuple{2, Float64}}[] + colors = [] + + for (keys, belongs) in grouped_data + xs = aes.x[belongs] + ys = aes.y[belongs] + + push!(densities, [(x, y) for (x, y) in zip(xs, ys)]) + push!(colors, keys[1]) + end + + ctx = context(order=geom.order) + if geom.stat.position == :dodge + compose!(ctx, Compose.polygon(densities, geom.tag), stroke(colors), fill(nothing)) + else + compose!(ctx, Compose.polygon(densities, geom.tag), fill(colors)) + end + + compose!(ctx, svgclass("geometry")) +end + struct ViolinGeometry <: 
Gadfly.GeometryElement stat::Gadfly.StatisticElement split::Bool @@ -22,6 +48,9 @@ struct ViolinGeometry <: Gadfly.GeometryElement tag::Symbol end function ViolinGeometry(; order=1, tag=empty_tag, split=false, kwargs...) + if findfirst(x->x[1] == :trim, kwargs) == 0 + push!(kwargs, (:trim, true)) + end ViolinGeometry(Gadfly.Stat.DensityStatistic(; kwargs...), split, order, tag) end From 978bc4a61da80cdc6d10a1bc7a2ae0474fb70498 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Thu, 31 May 2018 17:57:36 -0700 Subject: [PATCH 06/14] try to be less clever and add ability to stack densities My original implementation was too clever in that it figured out the orientation of density and violin plots automatically. The logic ended up being quite convoluted and so I switched back to using the standard `orientation` logic and an explicit flag for whether a density plot is a violin or not. This commit also adds the ability to stack either raw densities and to create conditional density distributions. --- src/statistics.jl | 203 +++++++++++++++++++++++++++------------------- 1 file changed, 118 insertions(+), 85 deletions(-) diff --git a/src/statistics.jl b/src/statistics.jl index 3be0de33d..20cff731b 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -531,8 +531,8 @@ struct DensityStatistic <: Gadfly.StatisticElement kernel """ - If set to `true` (default), trim the tails of the estimate to fit the range - of the data. + If set to `true`, trim the tails of the estimate to fit the range of the + data. (default is `false`) """ trim::Bool @@ -544,16 +544,25 @@ struct DensityStatistic <: Gadfly.StatisticElement have the same maximum peak height. 
""" scale::Symbol + + position::Symbol + + orientation::Symbol + + isviolin::Bool end function DensityStatistic(; n=256, bandwidth=-Inf, adjust=1.0, kernel=Normal, - trim=true, + trim=false, scale=:area, + position=:dodge, + orientation=:horizontal, + isviolin=false ) - DensityStatistic(n, bandwidth, adjust, kernel, trim, scale) + DensityStatistic(n, bandwidth, adjust, kernel, trim, scale, position, orientation, isviolin) end <<<<<<< HEAD @@ -570,67 +579,28 @@ Smoothing is controlled by `bandwidth`. Used by [`Geom.density`](@ref Gadfly.Ge const density = DensityStatistic ======= -input_aesthetics(stat::DensityStatistic) = [:x, :y, :color] +function input_aesthetics(stat::DensityStatistic) + if stat.isviolin + return [:x, :y, :color] + elseif stat.orientation == :horizontal + return [:x, :color] + else + return [:y, :color] + end +end + output_aesthetics(stat::DensityStatistic) = [:x, :y, :color] -""" -Given a scale and aesthetic, figures out which variable will be fed into the KDE, -which aesthetics will be overridden with what and provides a nested dictionary -mapping category to dictionaries of color mapped to a boolean array. The array -corresponds to whether a certain datapoint belongs to a given (category, color) -group. The length of the boolean array is equal to `length(aes.x)`, etc. -""" -function determine_density_grouping(scales::Dict{Symbol, Gadfly.ScaleElement}, - aes::Gadfly.Aesthetics) - - # TODO: This general approach should work for any statistic that has a - # categorical component and color. 
So this should be generalized to bar and - # boxplot - defined = intersect(Set([:x, :y]), Gadfly.defined_aesthetics(aes)) - (length(defined) == 0) && error("DensityStatistic requires either the x or y aesthetics to be defined") - densityvar = :x # variable on which the KDE will be run - - # violin and density have different `element_aesthetics` functions and - # therefore both x and y will only be defined if `Geom.violin()` is used - if length(defined) == 1 && :y in defined - densityvar = :y - end - - # what aes variable to store the category info, points, and densities - outputs = (nothing, :x, :y) - - unqcat = [1] - # densities grouped by the categorical variable and color - catgroup = Dict(1 => fill(true, length(getfield(aes, densityvar)))) - - if :x in defined && :y in defined # is violin? - # We first need to establish whether this is a horizontal or vertical violin - xcat, ycat = Scale.iscategorical(scales, :x), Scale.iscategorical(scales, :y) - catvar = :x - if xcat && ycat - error("Either the x or y aesthetics must be Real for kernel density estimation") - elseif xcat - densityvar = :y - elseif ycat - catvar = :y - densityvar = :x - else # neither x or y is categorical so we'll assume x is meant to be categorical, see #968 - new_scale = Scale.x_discrete(order=sortperm(unique(aes.x))) - Scale.apply_scale(new_scale, [aes], Gadfly.Data(x=aes.x)) - scales[:x] = new_scale - densityvar = :y - warn( - """ - Both x and y aesthetics are continuous, violin plots require a - categorical variable. Transforming x to be categorical. 
- """) +function default_scales(stat::DensityStatistic) + if stat.isviolin + if stat.orientation == :vertical + [Gadfly.Scale.x_discrete(), Gadfly.Scale.y_continuous()] + else + [Gadfly.Scale.x_continuous(), Gadfly.Scale.y_discrete()] end - - outputs = (catvar, densityvar, :width) - + else + [Gadfly.Scale.x_continuous(), Gadfly.Scale.x_continuous()] end - - densityvar, outputs, Gadfly.groupby(aes, [catvar, :color], densityvar) end >>>>>>> [WIP] prelim version of the density geometry revamp @@ -639,46 +609,109 @@ function apply_statistic(stat::DensityStatistic, coord::Gadfly.CoordinateElement, aes::Gadfly.Aesthetics) + # For all density/violin plots we're computing a new dimension, the density + # dimension. We're also overwriting the other dimensions and part of the + # trickiness is tracking which dimension refers to what. + # Three output dimensions: (1) grouping (2) evaluation points (i.e where + # we're evaluating the KDE) (3) density values + output_dims = Union{Symbol, Nothing}[:x, :y] + (stat.orientation == :vertical) && (output_dims = reverse(output_dims)) + + Gadfly.assert_aesthetics_defined("DensityStatistic", aes, output_dims[1]) + groupon = [:color] + if stat.isviolin + # For violin plots we need an additional dimension for the density data + # so we add an additional dimension on the end + push!(output_dims, :width) + insert!(groupon, 1, output_dims[1]) + else + # For simple density plots there are no categories so we'll insert a + # placeholder value into the first dimension + insert!(output_dims, 1, nothing) + end - densityvar, outputs, grouped_data = determine_density_grouping(scales, aes) + grouped_data = Gadfly.groupby(aes, groupon, output_dims[2]) - densityinput = getfield(aes, densityvar) + n_pts = stat.trim ? 
stat.n + 2 : stat.n + n_groups = length(grouped_data) + boundary = (-Inf, Inf) - aes.x = Array{Float64}(0) - aes.y = Array{Float64}(0) - aes.width = Array{Float64}(0) - colors = eltype(aes.color)[] + groups = Array{Float64}(n_groups) + eval_points = fill(0.0, n_groups, n_pts) + densities = fill(0.0, n_groups, n_pts) + colors = Array{eltype(aes.color)}(n_groups) + + # if the densities are stacked then we'll need to clamp them so that they + # share the same evaluation points (e.g. x values) + (stat.position != :dodge) && (boundary = extrema(getfield(aes, densityvar))) - for ((cat, color), belongs) in grouped_data - input = densityinput[belongs] + for (idx, (keys, belongs)) in enumerate(grouped_data) + input = getfield(aes, output_dims[2])[belongs] window = stat.n > 1 ? KernelDensity.default_bandwidth(input)*stat.adjust : 0.1 - if stat.trim + (stat.trim) && (boundary = extrema(input)) + if boundary != (-Inf, Inf) f = KernelDensity.kde(input, kernel=stat.kernel, - boundary=extrema(input), + boundary=boundary, bandwidth=window, npoints=stat.n) else f = KernelDensity.kde(input, kernel=stat.kernel, bandwidth=window, npoints=stat.n) end - # only store category information if this is a violin plot and we need it - (outputs[1] != nothing) && append!(getfield(aes, outputs[1]), fill(cat, length(f.density))) - append!(getfield(aes, outputs[2]), f.x) - if stat.scale == :area - append!(getfield(aes, outputs[3]), f.density) - elseif stat.scale == :count - append!(getfield(aes, outputs[3]), f.density.*sum(input)) + # only store category information if this is a violin plot and we need it + if stat.isviolin + groups[idx] = keys[1] + colors[idx] = keys[2] + elseif length(keys) == 1 + colors[idx] = keys[1] else - append!(getfield(aes, outputs[3]), f.density ./ maximum(f.density)) + error("Density plots do not support grouping by more than two dimensions.") end - append!(colors, fill(color, length(f.density))) + + # scale density output depending on `scale` flag + scaled_density = 
stat.trim ? vcat(0.0, f.density, 0.0) : f.density + if stat.scale == :count + scaled_density = f.density.*sum(input) + elseif stat.scale == :peak + scaled_density = f.density ./ maximum(f.density) + end + + minval, maxval = extrema(input) + eval_points[idx, :] = stat.trim ? vcat(minval, f.x, maxval) : f.x + densities[idx, :] = scaled_density end - (aes.color != nothing) && (aes.color = colors) - pad = 0.1 - maxwidth = maximum(aes.width) - broadcast!(*, aes.width, aes.width, 1 - pad) - broadcast!(/, aes.width, aes.width, maxwidth) + if stat.position == :dodge + # if this is a violin plot, make sure to set the grouping + (stat.isviolin) && setfield!(aes, output_dims[1], repeat(groups, outer=n_pts)) + setfield!(aes, output_dims[2], vec(eval_points)) + setfield!(aes, output_dims[3], vec(densities)) + (aes.color != nothing) && (aes.color = repeat(colors, outer=n_pts)) + elseif stat.position == :stack || stat.position == :fill + if stat.position == :fill + densities ./= sum(densities, 1) + end + stacked_densities = hcat(copy(densities), fill(0.0, size(densities)...)) + for i in 1:n_groups + for j in 1:i-1 + stacked_densities[i, 1:n_pts] .+= densities[j, :] + stacked_densities[i, n_pts+1:2*n_pts] .+= densities[j, end:-1:1] + end + end + setfield!(aes, output_dims[2], vec(hcat(eval_points, eval_points[:, end:-1:1]))) + setfield!(aes, output_dims[3], vec(stacked_densities)) + (aes.color != nothing) && (aes.color = repeat(colors, outer=n_pts*2)) + end + + if stat.isviolin + pad = 0.1 + maxwidth = maximum(aes.width) + broadcast!(*, aes.width, aes.width, 1 - pad) + broadcast!(/, aes.width, aes.width, maxwidth) + else + scales[:y] = Scale.y_continuous() + aes.y_label = Gadfly.Scale.identity_formatter + end end From 084bb2981850c8b98026ff0d4202db8de54f01e9 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Thu, 31 May 2018 18:02:41 -0700 Subject: [PATCH 07/14] fix for missed renaming --- src/statistics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/statistics.jl b/src/statistics.jl index 20cff731b..f4d1a0266 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -643,7 +643,7 @@ function apply_statistic(stat::DensityStatistic, # if the densities are stacked then we'll need to clamp them so that they # share the same evaluation points (e.g. x values) - (stat.position != :dodge) && (boundary = extrema(getfield(aes, densityvar))) + (stat.position != :dodge) && (boundary = extrema(getfield(aes, output_dims[2]))) for (idx, (keys, belongs)) in enumerate(grouped_data) input = getfield(aes, output_dims[2])[belongs] From 29022ded4cc68ce8345454d50b6d34abfaf179f6 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 3 Jun 2018 15:07:51 -0700 Subject: [PATCH 08/14] move default values from densitystat to respective geoms There are no set of defaults that apply to both density and violin geometries so it's better if they each have their respective defaults --- src/geom/density.jl | 34 +++++++++++++++++++------ src/statistics.jl | 61 +++++++++++++++++++-------------------------- 2 files changed, 52 insertions(+), 43 deletions(-) diff --git a/src/geom/density.jl b/src/geom/density.jl index ebc1a20c0..e390370e6 100644 --- a/src/geom/density.jl +++ b/src/geom/density.jl @@ -4,8 +4,19 @@ struct DensityGeometry <: Gadfly.GeometryElement tag::Symbol end -function DensityGeometry(; order=1, tag=empty_tag, kwargs...) 
- DensityGeometry(Gadfly.Stat.DensityStatistic(; kwargs...), order, tag) +function DensityGeometry(; n=256, + bandwidth=-Inf, + adjust=1.0, + kernel=Normal, + trim=false, + scale=:area, + position=:dodge, + orientation=:horizontal, + order=1, + tag=empty_tag) + stat = Gadfly.Stat.DensityStatistic(n, bandwidth, adjust, kernel, trim, + scale, position, orientation, false) + DensityGeometry(stat, order, tag) end DensityGeometry(stat; order=1, tag=empty_tag) = DensityGeometry(stat, order, tag) @@ -47,11 +58,20 @@ struct ViolinGeometry <: Gadfly.GeometryElement order::Int tag::Symbol end -function ViolinGeometry(; order=1, tag=empty_tag, split=false, kwargs...) - if findfirst(x->x[1] == :trim, kwargs) == 0 - push!(kwargs, (:trim, true)) - end - ViolinGeometry(Gadfly.Stat.DensityStatistic(; kwargs...), split, order, tag) + +function ViolinGeometry(; n=256, + bandwidth=-Inf, + adjust=1.0, + kernel=Normal, + trim=true, + scale=:area, + orientation=:vertical, + split=false, + order=1, + tag=empty_tag) + stat = Gadfly.Stat.DensityStatistic(n, bandwidth, adjust, kernel, trim, + scale, :dodge, orientation, true) + ViolinGeometry(stat, split, order, tag) end """ diff --git a/src/statistics.jl b/src/statistics.jl index f4d1a0266..218f321b6 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -17,12 +17,8 @@ using IndirectArrays import Gadfly: Scale, Coord, input_aesthetics, output_aesthetics, default_scales, isconcrete, setfield!, discretize_make_ia, aes2str import KernelDensity -<<<<<<< HEAD # import Distributions: Uniform, Distribution, qqbuild import IterTools: distinct -======= -import IterTools: chain, distinct ->>>>>>> [WIP] prelim version of the density geometry revamp import Compat.Iterators: cycle, product include("bincount.jl") @@ -504,7 +500,10 @@ function apply_statistic(stat::Density2DStatistic, apply_statistic(ContourStatistic(levels=stat.levels), scales, coord, aes) end - +""" + A general statistic for density plots (e.g. KDE plots and violin plots). 
+See [`Geom.density`](@ref) or [`Geom.violin`](@ref) for more details. +""" struct DensityStatistic <: Gadfly.StatisticElement """ Number of points sampled for estimate. Powers of two yields better @@ -513,8 +512,8 @@ struct DensityStatistic <: Gadfly.StatisticElement n::Int """ - Bandwidth used for the kernel density estimation. This corresponds to the - standard deviation of the `kernel`. + Smoothing bandwidth used for the kernel density estimation. This + corresponds to the standard deviation of the `kernel`. """ bw::Real @@ -531,8 +530,11 @@ struct DensityStatistic <: Gadfly.StatisticElement kernel """ - If set to `true`, trim the tails of the estimate to fit the range of the - data. (default is `false`) + This parameter only applies in the context of multiple densities. If set to + `false` (the default), the densities are computed over the full range of + data. If `true`, then each density's range will be computed only over the + range of data belonging to that group. This option is incompatible with + stacked densities since the ranges might not line up any more. """ trim::Bool @@ -545,40 +547,27 @@ struct DensityStatistic <: Gadfly.StatisticElement """ scale::Symbol + """ + Control handling of multiple overlapping densities. The `:dodge` option + (default) just overlays each density such that they are in front of each + other. The `:stack` option places the densities on top of each other. The + `:fill` option is similar to `:stack`, but the stacks are all normalized to + a constant height of 1.0. This last option is useful for generating + conditional density estimates. 
+ """ position::Symbol + """ + Whether the plot is `:horizontal` or `:vertical` + """ orientation::Symbol + """ + Internal flag that is `true` if this density statistic is a violin plot + """ isviolin::Bool end -function DensityStatistic(; n=256, - bandwidth=-Inf, - adjust=1.0, - kernel=Normal, - trim=false, - scale=:area, - position=:dodge, - orientation=:horizontal, - isviolin=false - ) - DensityStatistic(n, bandwidth, adjust, kernel, trim, scale, position, orientation, isviolin) -end - -<<<<<<< HEAD -input_aesthetics(stat::DensityStatistic) = [:x, :color] -output_aesthetics(stat::DensityStatistic) = [:x, :y, :color] -default_scales(::DensityStatistic) = [Gadfly.Scale.y_continuous()] - -""" - Stat.density[(; n=256, bandwidth=-Inf)] - -Estimate the density of `x` at `n` points, and put the result in `x` and `y`. -Smoothing is controlled by `bandwidth`. Used by [`Geom.density`](@ref Gadfly.Geom.density). -""" -const density = DensityStatistic -======= - function input_aesthetics(stat::DensityStatistic) if stat.isviolin return [:x, :y, :color] From 0f90c441984ecdecbebd4c4bda25106c2d85a0a1 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 3 Jun 2018 15:13:31 -0700 Subject: [PATCH 09/14] get density estimation to a functional state Simple KDEs should now be working --- src/geom/density.jl | 3 +- src/statistics.jl | 154 +++++++++++++++----------------------------- 2 files changed, 54 insertions(+), 103 deletions(-) diff --git a/src/geom/density.jl b/src/geom/density.jl index e390370e6..009b76dd7 100644 --- a/src/geom/density.jl +++ b/src/geom/density.jl @@ -39,10 +39,11 @@ function render(geom::DensityGeometry, theme::Gadfly.Theme, aes::Gadfly.Aestheti ys = aes.y[belongs] push!(densities, [(x, y) for (x, y) in zip(xs, ys)]) - push!(colors, keys[1]) + push!(colors, keys[1] != nothing ? 
keys[1] : theme.default_color) end ctx = context(order=geom.order) + # TODO: This should be user controllable if geom.stat.position == :dodge compose!(ctx, Compose.polygon(densities, geom.tag), stroke(colors), fill(nothing)) else diff --git a/src/statistics.jl b/src/statistics.jl index 218f321b6..ce2789f1f 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -580,18 +580,24 @@ end output_aesthetics(stat::DensityStatistic) = [:x, :y, :color] -function default_scales(stat::DensityStatistic) +function _find_output_dims(stat::DensityStatistic) + output_dims = Union{Symbol, Nothing}[:x, :y] + (stat.orientation == :vertical) && reverse!(output_dims) + + groupon = [:color] if stat.isviolin - if stat.orientation == :vertical - [Gadfly.Scale.x_discrete(), Gadfly.Scale.y_continuous()] - else - [Gadfly.Scale.x_continuous(), Gadfly.Scale.y_discrete()] - end + reverse!(output_dims) + # For violin plots we need an additional dimension for the density data + # so we add an additional dimension on the end + push!(output_dims, :width) + insert!(groupon, 1, output_dims[1]) else - [Gadfly.Scale.x_continuous(), Gadfly.Scale.x_continuous()] + # For simple density plots there are no categories so we'll insert a + # placeholder value into the first dimension + insert!(output_dims, 1, nothing) end + output_dims, groupon end ->>>>>>> [WIP] prelim version of the density geometry revamp function apply_statistic(stat::DensityStatistic, scales::Dict{Symbol, Gadfly.ScaleElement}, @@ -603,27 +609,38 @@ function apply_statistic(stat::DensityStatistic, # trickiness is tracking which dimension refers to what. 
# Three output dimensions: (1) grouping (2) evaluation points (i.e where # we're evaluating the KDE) (3) density values - output_dims = Union{Symbol, Nothing}[:x, :y] - (stat.orientation == :vertical) && (output_dims = reverse(output_dims)) + output_dims, groupon = _find_output_dims(stat) - Gadfly.assert_aesthetics_defined("DensityStatistic", aes, output_dims[1]) - groupon = [:color] if stat.isviolin - # For violin plots we need an additional dimension for the density data - # so we add an additional dimension on the end - push!(output_dims, :width) - insert!(groupon, 1, output_dims[1]) - else - # For simple density plots there are no categories so we'll insert a - # placeholder value into the first dimension - insert!(output_dims, 1, nothing) + xcat, ycat = Scale.iscategorical(scales, :x), Scale.iscategorical(scales, :y) + if xcat && ycat + error("Either the x or y aesthetics must be Real for kernel density estimation") + elseif xcat && stat.orientation == :horizontal + error("Horizontal violins require a continuous x axis for kernel density estimation") + elseif ycat && stat.orientation == :vertical + error("Vertical violins require a continuous y axis for kernel density estimation") + elseif !xcat && !ycat # neither x or y is categorical so we'll assume x is meant to be categorical, see #968 + new_scale = Scale.x_discrete(order=sortperm(unique(aes.x))) + Scale.apply_scale(new_scale, [aes], Gadfly.Data(x=aes.x)) + scales[:x] = new_scale + warn( + """ + Both x and y aesthetics are continuous, violin plots require a + categorical variable. Transforming x to be categorical. + """) + end + if getfield(aes, output_dims[1]) == nothing + setfield!(aes, output_dims[1], fill(1.0, length(getfield(aes, output_dims[2])))) + end + elseif getfield(aes, output_dims[2]) == nothing + error("The $(output_dims[2]) aesthetic is required for $(stat.orientation) density plots") end grouped_data = Gadfly.groupby(aes, groupon, output_dims[2]) - n_pts = stat.trim ? 
stat.n + 2 : stat.n + + n_pts = stat.position == :fill ? stat.n : stat.n + 2 n_groups = length(grouped_data) - boundary = (-Inf, Inf) groups = Array{Float64}(n_groups) eval_points = fill(0.0, n_groups, n_pts) @@ -632,21 +649,18 @@ function apply_statistic(stat::DensityStatistic, # if the densities are stacked then we'll need to clamp them so that they # share the same evaluation points (e.g. x values) - (stat.position != :dodge) && (boundary = extrema(getfield(aes, output_dims[2]))) + boundary = extrema(getfield(aes, output_dims[2])) for (idx, (keys, belongs)) in enumerate(grouped_data) input = getfield(aes, output_dims[2])[belongs] - window = stat.n > 1 ? KernelDensity.default_bandwidth(input)*stat.adjust : 0.1 + window = stat.n > 1 ? KernelDensity.default_bandwidth(input)*stat.adjust : stat.bw (stat.trim) && (boundary = extrema(input)) - if boundary != (-Inf, Inf) - f = KernelDensity.kde(input, kernel=stat.kernel, - boundary=boundary, - bandwidth=window, - npoints=stat.n) - else - f = KernelDensity.kde(input, kernel=stat.kernel, bandwidth=window, npoints=stat.n) - end - + kde_est = KernelDensity.kde(input, kernel=stat.kernel, + boundary=boundary, + npoints=stat.n, + bandwidth=window) + evalpts = kde_est.x + density = kde_est.density # only store category information if this is a violin plot and we need it if stat.isviolin groups[idx] = keys[1] @@ -656,17 +670,15 @@ function apply_statistic(stat::DensityStatistic, else error("Density plots do not support grouping by more than two dimensions.") end - # scale density output depending on `scale` flag - scaled_density = stat.trim ? vcat(0.0, f.density, 0.0) : f.density + scaled_density = stat.position == :fill ? 
density : vcat(0.0, density, 0.0) if stat.scale == :count - scaled_density = f.density.*sum(input) + scaled_density .*= sum(input) elseif stat.scale == :peak - scaled_density = f.density ./ maximum(f.density) + scaled_density ./= maximum(density) end - minval, maxval = extrema(input) - eval_points[idx, :] = stat.trim ? vcat(minval, f.x, maxval) : f.x + eval_points[idx, :] = stat.position == :fill ? evalpts : vcat(boundary[1], evalpts, boundary[2]) densities[idx, :] = scaled_density end @@ -898,7 +910,7 @@ data with `coverage_weight`; and of having a nice numbering with granularity_weight=1/4, simplicity_weight=1/6, coverage_weight=1/3, - niceness_weight=1/4) = + niceness_weight=1/4) = TickStatistic("x", granularity_weight, simplicity_weight, coverage_weight, niceness_weight, ticks) @@ -1761,68 +1773,6 @@ function apply_statistic(stat::QQStatistic, end end -<<<<<<< HEAD - -struct ViolinStatistic <: Gadfly.StatisticElement - n::Int # Number of points sampled -end -ViolinStatistic() = ViolinStatistic(300) - -input_aesthetics(::ViolinStatistic) = [:x, :y, :color] -output_aesthetics(::ViolinStatistic) = [:x, :y, :width, :color] -default_scales(stat::ViolinStatistic) = [Gadfly.Scale.x_discrete(), Gadfly.Scale.y_continuous()] - -### very similar to Stat.density; Geom.violin could be refactored to us it instead -""" - Stat.violin[(n=300)] - -Transform $(aes2str(input_aesthetics(violin()))). -""" -const violin = ViolinStatistic - -function apply_statistic(stat::ViolinStatistic, - scales::Dict{Symbol, Gadfly.ScaleElement}, - coord::Gadfly.CoordinateElement, - aes::Gadfly.Aesthetics) - - isa(aes.y[1], Real) || error("Kernel density estimation only works on Real types.") - - grouped_y = Dict(1=>aes.y) - grouped_color = Dict{Int, Gadfly.ColorOrNothing}(1=>nothing) - ux = unique(aes.x) - uxflag = length(ux) < length(aes.x) - colorflag = aes.color != nothing - - uxflag && (grouped_y = Dict(x=>aes.y[aes.x.==x] for x in ux)) - - grouped_color = (colorflag ? 
Dict(x=>first(aes.color[aes.x.==x]) for x in ux) : - uxflag && Dict(x=>nothing for x in ux) ) - - aes.x = Array{Float64}(0) - aes.y = Array{Float64}(0) - aes.width = Array{Float64}(0) - colors = eltype(aes.color)[] - - for (x, ys) in grouped_y - window = stat.n > 1 ? KernelDensity.default_bandwidth(ys) : 0.1 - f = KernelDensity.kde(ys, bandwidth=window, npoints=stat.n) - append!(aes.x, fill(x, length(f.x))) - append!(aes.y, f.x) - append!(aes.width, f.density) - append!(colors, fill(grouped_color[x], length(f.x))) - end - - colorflag && (aes.color = colors) - - pad = 0.1 - maxwidth = maximum(aes.width) - broadcast!(*, aes.width, aes.width, 1 - pad) - broadcast!(/, aes.width, aes.width, maxwidth) -end - - -======= ->>>>>>> [WIP] prelim version of the density geometry revamp struct JitterStatistic <: Gadfly.StatisticElement vars::Vector{Symbol} range::Float64 From 3e9617c9d832481b6ab37784374a7bc679190af9 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 3 Jun 2018 16:29:49 -0700 Subject: [PATCH 10/14] fix up violin render() - adds support for horizontal violins - removes manual control over splitting (temp until position code is rewritten) --- src/geom/density.jl | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/geom/density.jl b/src/geom/density.jl index 009b76dd7..c6d27fcb3 100644 --- a/src/geom/density.jl +++ b/src/geom/density.jl @@ -55,7 +55,6 @@ end struct ViolinGeometry <: Gadfly.GeometryElement stat::Gadfly.StatisticElement - split::Bool order::Int tag::Symbol end @@ -67,12 +66,11 @@ function ViolinGeometry(; n=256, trim=true, scale=:area, orientation=:vertical, - split=false, order=1, tag=empty_tag) stat = Gadfly.Stat.DensityStatistic(n, bandwidth, adjust, kernel, trim, scale, :dodge, orientation, true) - ViolinGeometry(stat, split, order, tag) + ViolinGeometry(stat, order, tag) end """ @@ -80,11 +78,11 @@ end Draw `y` versus `width`, optionally grouping categorically by `x` and coloring with 
`color`. Alternatively, if `width` is not supplied, the data in `y` will -be transformed to a density estimate using [`Stat.violin`](@ref) +be transformed to a density estimate using [`Stat.density`](@ref) """ const violin = ViolinGeometry -element_aesthetics(::ViolinGeometry) = [:x, :y, :color] +element_aesthetics(::ViolinGeometry) = [] default_statistic(geom::ViolinGeometry) = Gadfly.Stat.DensityStatistic(geom.stat) @@ -93,22 +91,29 @@ function render(geom::ViolinGeometry, theme::Gadfly.Theme, aes::Gadfly.Aesthetic Gadfly.assert_aesthetics_defined("Geom.violin", aes, :y, :width) Gadfly.assert_aesthetics_equal_length("Geom.violin", aes, :y, :width) - grouped_data = Gadfly.groupby(aes, [:x, :color], :y) + output_dims, groupon = Gadfly.Stat._find_output_dims(geom.stat) + grouped_data = Gadfly.groupby(aes, groupon, output_dims[2]) violins = Array{NTuple{2, Float64}}[] - colors = [] (aes.color == nothing) && (aes.color = fill(theme.default_color, length(aes.x))) + colors = eltype(aes.color)[] color_opts = unique(aes.color) - if geom.split && length(color_opts) > 2 - error("Split violins require 2 colors, not more") + split = false + # TODO: Add support for dodging violins (i.e. having more than two colors + # per major category). 
Also splitting should not happen automatically, but + # as an optional keyword to Geom.violin + if length(keys(grouped_data)) > 2*length(unique(getfield(aes, output_dims[1]))) + error("Violin plots do not currently support having more than 2 colors per $(output_dims[1]) category") + elseif length(color_opts) == 2 + split = true end for (keys, belongs) in grouped_data x, color = keys - ys = aes.y[belongs] + ys = getfield(aes, output_dims[2])[belongs] ws = aes.width[belongs] - if geom.split + if split pos = findfirst(color_opts, color) if pos == 1 push!(violins, [(x - w/2, y) for (y, w) in zip(ys, ws)]) @@ -123,6 +128,14 @@ function render(geom::ViolinGeometry, theme::Gadfly.Theme, aes::Gadfly.Aesthetic end end + if geom.stat.orientation == :horizontal + for violin in violins + for i in 1:length(violin) + violin[i] = reverse(violin[i]) + end + end + end + ctx = context(order=geom.order) compose!(ctx, Compose.polygon(violins, geom.tag), fill(colors)) From d3c77ac2dea5a40c8fb6e9546bc3fad594126dbd Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 3 Jun 2018 18:33:13 -0700 Subject: [PATCH 11/14] fix bandwidth setting example --- src/statistics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/statistics.jl b/src/statistics.jl index ce2789f1f..516bb7414 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -653,7 +653,7 @@ function apply_statistic(stat::DensityStatistic, for (idx, (keys, belongs)) in enumerate(grouped_data) input = getfield(aes, output_dims[2])[belongs] - window = stat.n > 1 ? KernelDensity.default_bandwidth(input)*stat.adjust : stat.bw + window = stat.bw <= 0.0 ? 
KernelDensity.default_bandwidth(input)*stat.adjust : stat.bw (stat.trim) && (boundary = extrema(input)) kde_est = KernelDensity.kde(input, kernel=stat.kernel, boundary=boundary, From 55e4ff83d61164cca6859ad1396f014b39f27d61 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 3 Jun 2018 18:33:54 -0700 Subject: [PATCH 12/14] flesh out docs and fix doctests --- src/aesthetics.jl | 18 +++++++++++++----- src/geom/density.jl | 35 ++++++++++++++++++++++++++++++----- src/statistics.jl | 3 ++- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/src/aesthetics.jl b/src/aesthetics.jl index aa79c66a3..e952a4753 100755 --- a/src/aesthetics.jl +++ b/src/aesthetics.jl @@ -415,7 +415,9 @@ function inherit!(a::Aesthetics, b::Aesthetics; end """ - Given aesthetics to group with, `by`, and an aesthetic to group `togroupvar` + groupby(aes, by, togroupvar) + +Given aesthetics to group with, `by`, and an aesthetic to group `togroupvar` this function constructs a dictionary that maps each given combination of the `by` aesthetics to the positions which they apply to. Thus the output is a dictionary of tuples of each unique combination of `by` mapped to a boolean @@ -426,28 +428,34 @@ have the same length). 
If the provided aesthetics are missing, a placeholder ## Examples ```jldoctest +using Gadfly aes = Gadfly.Aesthetics() aes.x = repeat([1, 2], inner=3) aes.y = collect(1:6) -groupby(aes, [:x, :color], :y) +Gadfly.groupby(aes, [:x, :color], :y) # output -DataStructures.OrderedDict((2, nothing)=>Bool[false, false, false, true, true, true],(1, nothing)=>Bool[true, true, true, false, false, false]) +DataStructures.OrderedDict{Tuple{Int64,Void},Array{Bool,1}} with 2 entries: + (1, nothing) => Bool[true, true, true, false, false, false] + (2, nothing) => Bool[false, false, false, true, true, true] ``` ```jldoctest +using Gadfly aes = Gadfly.Aesthetics() aes.x = repeat([:a, :b], inner=2) aes.y = collect(1:4) aes.color = repeat([colorant"red", colorant"blue"], inner=2) -groupby(aes, [:x, :color], :y) +Gadfly.groupby(aes, [:x, :color], :y) # output -DataStructures.OrderedDict((:a, RGB{N0f8}(1.0,0.0,0.0))=>Bool[true, true, false, false],(:b, RGB{N0f8}(0.0,0.0,1.0))=>Bool[false, false, true, true]) +DataStructures.OrderedDict{Tuple{Symbol,ColorTypes.RGB{FixedPointNumbers.Normed{UInt8,8}}},Array{Bool,1}} with 2 entries: + (:a, RGB{N0f8}(1.0,0.0,0.0)) => Bool[true, true, false, false] + (:b, RGB{N0f8}(0.0,0.0,1.0)) => Bool[false, false, true, true] ``` """ diff --git a/src/geom/density.jl b/src/geom/density.jl index c6d27fcb3..7c51aa23d 100644 --- a/src/geom/density.jl +++ b/src/geom/density.jl @@ -74,11 +74,36 @@ function ViolinGeometry(; n=256, end """ - Geom.violin[(; order=1)] - -Draw `y` versus `width`, optionally grouping categorically by `x` and coloring -with `color`. Alternatively, if `width` is not supplied, the data in `y` will -be transformed to a density estimate using [`Stat.density`](@ref) + Geom.violin[(; bandwidth, adjust, kernel, trim, order)] + +Draws a violin plot which is a combination of [`Geom.density`](@ref) and +[`Geom.boxplot`](@ref). 
This plot type is useful for comparing differences in +the distribution of quantitative data between categories, especially when the +data is non-normally distributed. As with [`Geom.density`](@ref) plots, there +are a couple caveats: + +1) Plot components do not necessarily correspond to the raw datapoints, but + instead to the kernel density estimation of the underlying distribution +2) Density estimation improves as a function of the number of data points and + can be misleadingly smooth when the number of datapoints is small. + +In the case of standard vertical violins, `Geom.violin` draws the density +estimate of `y` optionally grouped categorically by `x` and colored +with `color`. Alternatively, `width` can be supplied directly and will be +used instead. See [`Stat.DensityStatistic`](@ref Gadfly.Stat.DensityStatistic) +for details on optional parameters that can control the `bandwidth`, `kernel`, +etc used. + +```@example +using RDatasets, Gadfly + +df = dataset("ggplot2", "diamonds") + +p = plot(df, x=:Cut, y=:Carat, color=:Cut, Geom.violin()) +draw(SVG("diamonds_violin1.svg", 10cm, 8cm), p) # hide +nothing # hide +``` +![](diamonds_violin1.svg) """ const violin = ViolinGeometry diff --git a/src/statistics.jl b/src/statistics.jl index 516bb7414..7366a9062 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -502,7 +502,8 @@ end """ A general statistic for density plots (e.g. KDE plots and violin plots). -See [`Geom.density`](@ref) or [`Geom.violin`](@ref) for more details. +See [`Geom.density`](@ref Gadfly.Geom.density) or [`Geom.violin`](@ref +Gadfly.Geom.violin) for more details. 
""" struct DensityStatistic <: Gadfly.StatisticElement """ From c6b21044d6b8ca8ff10687f01140ff252d43e202 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 3 Jun 2018 19:35:36 -0700 Subject: [PATCH 13/14] update docs and examples for new density features --- docs/src/gallery/geometries.md | 41 +++++++++++++++++++++++++++++----- src/geom/density.jl | 40 +++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/docs/src/gallery/geometries.md b/docs/src/gallery/geometries.md index 98295d8c4..010d42c53 100644 --- a/docs/src/gallery/geometries.md +++ b/docs/src/gallery/geometries.md @@ -92,8 +92,9 @@ gridstack([p1 p2; p3 p4]) ```@example using Gadfly, RDatasets, Distributions set_default_plot_size(21cm, 8cm) -p1 = plot(dataset("ggplot2", "diamonds"), x="Price", Geom.density) -p2 = plot(dataset("ggplot2", "diamonds"), x="Price", color="Cut", Geom.density) +data = dataset("ggplot2", "diamonds") +p1 = plot(data, x="Price", Geom.density) +p2 = plot(data, x="Price", color="Cut", Geom.density) hstack(p1,p2) ``` @@ -102,13 +103,27 @@ using Gadfly, RDatasets, Distributions set_default_plot_size(14cm, 8cm) dist = MixtureModel(Normal, [(0.5, 0.2), (1, 0.1)]) xs = rand(dist, 10^5) -plot(layer(x=xs, Geom.density, Theme(default_color="orange")), +plot(layer(x=xs, Geom.density, Theme(default_color="orange")), layer(x=xs, Geom.density(bandwidth=0.0003), Theme(default_color="green")), layer(x=xs, Geom.density(bandwidth=0.25), Theme(default_color="purple")), Guide.manual_color_key("bandwidth", ["auto", "bw=0.0003", "bw=0.25"], ["orange", "green", "purple"])) ``` +```@example +using Gadfly, RDatasets +set_default_plot_size(21cm, 8cm) +data = dataset("ggplot2", "diamonds") +p1 = plot(data, x=:Carat, color=:Cut, Geom.density(position=:stack), Guide.title("Loses marginal densities")) +p2 = plot(data, x=:Carat, color=:Cut, Geom.density(position=:stack, scale=:count), Guide.title("Preserve marginal densities")) +hstack(p1, p2) +``` + +```@example 
+using Gadfly, RDatasets +plot(dataset("ggplot2", "diamonds"), x=:Carat, color=:Cut, Geom.density(position=:fill), Guide.title("Conditional density estimate"), Coord.cartesian(ymax=1.0, xmax=5)) +``` + ## [`Geom.density2d`](@ref) @@ -464,8 +479,8 @@ using Gadfly, RDatasets set_default_plot_size(21cm, 8cm) coord = Coord.cartesian(xmin=-2, xmax=2, ymin=-2, ymax=2) -p1 = plot(coord, z=(x,y)->x*exp(-(x^2+y^2)), - xmin=[-2], xmax=[2], ymin=[-2], ymax=[2], +p1 = plot(coord, z=(x,y)->x*exp(-(x^2+y^2)), + xmin=[-2], xmax=[2], ymin=[-2], ymax=[2], # or: x=-2:0.25:2.0, y=-2:0.25:2.0, Geom.vectorfield(scale=0.4, samples=17), Geom.contour(levels=6), Scale.x_continuous(minvalue=-2.0, maxvalue=2.0), @@ -473,7 +488,7 @@ p1 = plot(coord, z=(x,y)->x*exp(-(x^2+y^2)), Guide.xlabel("x"), Guide.ylabel("y"), Guide.colorkey(title="z")) volcano = Matrix{Float64}(dataset("datasets", "volcano")) -volc = volcano[1:4:end, 1:4:end] +volc = volcano[1:4:end, 1:4:end] coord = Coord.cartesian(xmin=1, xmax=22, ymin=1, ymax=16) p2 = plot(coord, z=volc, x=1.0:22, y=1.0:16, Geom.vectorfield(scale=0.05), Geom.contour(levels=7), @@ -495,3 +510,17 @@ Dsing = dataset("lattice","singer") Dsing[:Voice] = [x[1:5] for x in Dsing[:VoicePart]] plot(Dsing, x=:VoicePart, y=:Height, color=:Voice, Geom.violin) ``` + +```@example +using Gadfly, RDatasets +set_default_plot_size(14cm, 8cm) +tips = dataset("reshape2", "tips") +plot(tips, x=:Day, y=:TotalBill, color=:Sex, Geom.violin(scale=:count), Scale.x_discrete(order=[3,4,2,1])) +``` + +```@example +using Gadfly, RDatasets +set_default_plot_size(12cm, 16cm) +melanoma = dataset("mlmRev", "Mmmec") +plot(melanoma, y=:Nation, x=:Deaths, color=:Nation, Geom.violin(orientation=:horizontal, scale=:count)) +``` diff --git a/src/geom/density.jl b/src/geom/density.jl index 7c51aa23d..c6136ab22 100644 --- a/src/geom/density.jl +++ b/src/geom/density.jl @@ -21,6 +21,35 @@ end DensityGeometry(stat; order=1, tag=empty_tag) = DensityGeometry(stat, order, tag) +""" + Geom.density(; 
bandwidth, adjust, kernel, trim, scale, position, orientation, order) + +Draws a kernel density estimate. This is a cousin of [`Geom.histogram`](@ref) +that is especially useful when the datapoints originate from an underlying smooth +distribution. Unlike histograms, density estimates do not suffer from edge +effects from incorrect bin choices. Some caveats do apply: + +1) Plot components do not necessarily correspond to the raw datapoints, but + instead to the kernel density estimation of the underlying distribution +2) Density estimation improves as a function of the number of data points and + can be misleadingly smooth when the number of datapoints is small. +3) Results can be sensitive to the choice of `kernel` and `bandwidth` + +For horizontal densities (default), `Geom.density` draws the kernel density +estimate of `x` optionally grouped by `color`. If the `orientation=:vertical` +flag is passed to the function, then densities will be computed along `y`. The +estimates are normalized by default to have areas equal to 1, but this can +be changed by passing `scale=:count` to scale by the raw number of datapoints or +`scale=:peak` to scale by the max height of the estimate. Additionally, multiple +densities can be stacked using the `position=:stack` flag or the conditional +density estimate can be drawn using `position=:fill`. See +[`Stat.DensityStatistic`](@ref Gadfly.Stat.DensityStatistic) for details on +optional parameters that can control the `bandwidth`, `kernel`, etc used. + +External links + +* [Kernel Density Estimation on Wikipedia](https://en.wikipedia.org/wiki/Kernel_density_estimation) +""" const density = DensityGeometry element_aesthetics(::DensityGeometry) = Symbol[] @@ -79,18 +108,11 @@ end Draws a violin plot which is a combination of [`Geom.density`](@ref) and [`Geom.boxplot`](@ref). 
This plot type is useful for comparing differences in the distribution of quantitative data between categories, especially when the -data is non-normally distributed. As with [`Geom.density`](@ref) plots, there -are a couple caveats: - -1) Plot components do not necessarily correspond to the raw datapoints, but - instead to the kernel density estimation of the underlying distribution -2) Density estimation improves as a function of the number of data points and - can be misleadingly smooth when the number of datapoints is small. +data is non-normally distributed. See [`Geom.density`](@ref) for some caveats. In the case of standard vertical violins, `Geom.violin` draws the density estimate of `y` optionally grouped categorically by `x` and colored -with `color`. Alternatively, `width` can be supplied directly and will be -used instead. See [`Stat.DensityStatistic`](@ref Gadfly.Stat.DensityStatistic) +with `color`. See [`Stat.DensityStatistic`](@ref Gadfly.Stat.DensityStatistic) for details on optional parameters that can control the `bandwidth`, `kernel`, etc used. 
From 229b9fd5eab7fd6296dfeec72cbb3fc1dd352ee1 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 3 Jun 2018 19:42:54 -0700 Subject: [PATCH 14/14] update news [ci skip] --- NEWS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index e1b8dd49b..8c066a180 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ Each release typically has a number of minor bug fixes beyond what is listed her # Version 0.7.1 * `Geom.contour`: add support for `DataFrame` (#1150) + * `Geom.density`: add ability to use custom kernels and adds support for scaling, stacking, vertical orientation ([#1157](https://github.com/GiovineItalia/Gadfly.jl/pull/1157)) + * `Geom.violin`: add ability to adjust scaling and bandwidth and support for horizontal and split violins ([#1157](https://github.com/GiovineItalia/Gadfly.jl/pull/1157)) # Version 0.7.0 @@ -23,7 +25,7 @@ Each release typically has a number of minor bug fixes beyond what is listed her # Version 0.6.4 * Regression testing tools (#1020) - + # Version 0.6.3 * Wide format data (#1013) @@ -214,5 +216,3 @@ Each release typically has a number of minor bug fixes beyond what is listed her keys are wrapped automatically. * Default Theme changes. - -