From 6102f894f6d70f425293d39ddb67835072d0e717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 2 Aug 2019 20:12:46 +0200 Subject: [PATCH] add disallowmissing, allowmissing and categorical; fix a bug in categorical! --- docs/src/lib/functions.md | 3 + src/abstractdataframe/abstractdataframe.jl | 168 ++++++++++++++++++ src/dataframe/dataframe.jl | 4 +- test/dataframe.jl | 188 ++++++++++++++++++++- 4 files changed, 360 insertions(+), 3 deletions(-) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index d80ab5f2e7..c0c2d5bffc 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -29,13 +29,16 @@ meltdf ## Basics ```@docs +allowmissing allowmissing! +categorical categorical! completecases copy DataFrame! deleterows! describe +disallowmissing disallowmissing! dropmissing dropmissing! diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 1d3824536b..b5306763de 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -31,6 +31,9 @@ The following are normally implemented for AbstractDataFrames: * [`dropmissing!`](@ref) : remove rows with missing values in-place * [`nonunique`](@ref) : indexes of duplicate rows * [`unique!`](@ref) : remove duplicate rows +* [`disallowmissing`](@ref) : drop support for missing values in columns +* [`allowmissing`](@ref) : add support for missing values in columns +* [`categorical`](@ref) : change column types to categorical * `similar` : a DataFrame with similar columns as `d` * `filter` : remove rows * `filter!` : remove rows in-place @@ -1280,3 +1283,168 @@ julia> ncol(df) """ (nrow, ncol) + +""" + disallowmissing(df::AbstractDataFrame, + cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:) + +Return a copy of data frame `df` with columns `cols` converted +from element type `Union{T, Missing}` to `T` to drop support for missing values. 
+ +If `cols` is omitted, all columns in the data frame are converted. Columns not selected by `cols` are copied unchanged, and an error is thrown if a selected column contains `missing`. + +**Examples** + +```jldoctest +julia> df = DataFrame(a=Union{Int,Missing}[1,2]) +2×1 DataFrame +│ Row │ a │ +│ │ Int64⍰ │ +├─────┼────────┤ +│ 1 │ 1 │ +│ 2 │ 2 │ + +julia> disallowmissing(df) +2×1 DataFrame +│ Row │ a │ +│ │ Int64 │ +├─────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 2 │ +``` +""" +function Missings.disallowmissing(df::AbstractDataFrame, + cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:) + idxcols = Set(index(df)[cols]) + newcols = AbstractVector[] + for i in axes(df, 2) + x = df[!, i] + if i in idxcols + y = disallowmissing(x) + push!(newcols, y === x ? copy(y) : y) + else + push!(newcols, copy(x)) + end + end + DataFrame(newcols, _names(df), copycols=false) +end + +""" + allowmissing(df::AbstractDataFrame, + cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:) + +Return a copy of data frame `df` with columns `cols` converted +from element type `T` to `Union{T, Missing}` to add support for missing values. + +If `cols` is omitted, all columns in the data frame are converted. Columns not selected by `cols` are copied unchanged. + +**Examples** + +```jldoctest +julia> df = DataFrame(a=[1,2]) +2×1 DataFrame +│ Row │ a │ +│ │ Int64 │ +├─────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 2 │ + +julia> allowmissing(df) +2×1 DataFrame +│ Row │ a │ +│ │ Int64⍰ │ +├─────┼────────┤ +│ 1 │ 1 │ +│ 2 │ 2 │ +``` +""" +function Missings.allowmissing(df::AbstractDataFrame, + cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:) + idxcols = Set(index(df)[cols]) + newcols = AbstractVector[] + for i in axes(df, 2) + x = df[!, i] + if i in idxcols + y = allowmissing(x) + push!(newcols, y === x ? 
copy(y) : y) + else + push!(newcols, copy(x)) + end + end + DataFrame(newcols, _names(df), copycols=false) +end + +""" + categorical(df::AbstractDataFrame; compress::Bool=false) + categorical(df::AbstractDataFrame, + cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}; + compress::Bool=false) + +Return a copy of data frame `df` with columns `cols` converted to `CategoricalVector`. +If the `cols` argument is not passed, only columns whose element type +is a subtype of `Union{AbstractString, Missing}` are converted to categorical. + +If the `compress` keyword argument is set to `true`, the created `CategoricalVector`s +are compressed. + +All created `CategoricalVector`s are unordered. Columns that are not converted are copied unchanged. + +**Examples** + +```jldoctest +julia> df = DataFrame(a=[1,2], b=["a","b"]) +2×2 DataFrame +│ Row │ a │ b │ +│ │ Int64 │ String │ +├─────┼───────┼────────┤ +│ 1 │ 1 │ a │ +│ 2 │ 2 │ b │ + +julia> categorical(df) +2×2 DataFrame +│ Row │ a │ b │ +│ │ Int64 │ Categorical… │ +├─────┼───────┼──────────────┤ +│ 1 │ 1 │ a │ +│ 2 │ 2 │ b │ + +julia> categorical(df, :) +2×2 DataFrame +│ Row │ a │ b │ +│ │ Categorical… │ Categorical… │ +├─────┼──────────────┼──────────────┤ +│ 1 │ 1 │ a │ +│ 2 │ 2 │ b │ +``` + +""" +function CategoricalArrays.categorical(df::AbstractDataFrame, + cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}; + compress::Bool=false) + idxcols = Set(index(df)[cols]) + newcols = AbstractVector[] + for i in axes(df, 2) + x = df[!, i] + if i in idxcols + # categorical always copies, so no explicit copy is needed + push!(newcols, categorical(x, compress)) + else + push!(newcols, copy(x)) + end + end + DataFrame(newcols, _names(df), copycols=false) +end + +function CategoricalArrays.categorical(df::AbstractDataFrame; compress::Bool=false) + newcols = AbstractVector[] + for i in axes(df, 2) + x = df[!, i] + if eltype(x) <: Union{AbstractString, Missing} + # categorical always copies, so no explicit copy is needed + 
push!(newcols, categorical(x, compress)) + else + push!(newcols, copy(x)) + 
end + end + DataFrame(newcols, _names(df), copycols=false) +end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index f0d1b9e361..2b26db8eab 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1047,10 +1047,10 @@ function categorical!(df::DataFrame, cnames::AbstractVector{<:ColumnIndex}; df end -categorical!(df::DataFrame, cnames::Union{Regex, Not}; compress::Bool=false) = +categorical!(df::DataFrame, cnames::Union{Regex, Not, Colon}; compress::Bool=false) = categorical!(df, index(df)[cnames], compress=compress) -function categorical!(df::DataFrame, cnames::Colon=:; compress::Bool=false) +function categorical!(df::DataFrame; compress::Bool=false) for i in 1:size(df, 2) if eltype(df[!, i]) <: Union{AbstractString, Missing} df[!, i] = categorical(df[!, i], compress) diff --git a/test/dataframe.jl b/test/dataframe.jl index b2cc06973e..7a0bd4a1c9 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1067,7 +1067,9 @@ end CategoricalArrays.CategoricalString{UInt32}])) @test all(map(<:, eltypes(categorical!(deepcopy(df), :)), [CategoricalArrays.CategoricalString{UInt32}, - Char, Bool, Int, + CategoricalArrays.CategoricalValue{Char,UInt32}, + CategoricalArrays.CategoricalValue{Bool,UInt32}, + CategoricalArrays.CategoricalValue{Int,UInt32}, CategoricalArrays.CategoricalString{UInt32}])) @test all(map(<:, eltypes(categorical!(deepcopy(df), compress=true)), [CategoricalArrays.CategoricalString{UInt8}, @@ -1101,6 +1103,12 @@ end df = DataFrame([["a", missing]]) categorical!(df) @test df.x1 isa CategoricalVector{Union{Missing, String}} + + df = DataFrame(x1=[1, 2]) + categorical!(df) + @test df.x1 isa Vector{Int} + categorical!(df, :) + @test df.x1 isa CategoricalVector{Int} end @testset "unstack promotion to support missing values" begin @@ -1332,6 +1340,184 @@ end @test eltype(df.d) == Union{Int, Missing} end +@testset "test disallowmissing" begin + df = DataFrame(x=Union{Int,Missing}[1,2,3], + y=Union{Int,Missing}[1,2,3], + 
z=[1,2,3]) + for x in [df, view(df, :, :)] + y = disallowmissing(x) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == [Int, Int, Int] + + for colsel in [:, names(x), [1,2,3], [true,true,true], r"", Not(r"a")] + y = disallowmissing(x, colsel) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == [Int, Int, Int] + end + + for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)] + y = disallowmissing(x, colsel) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == [Int, Union{Missing, Int}, Int] + end + + for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)] + y = disallowmissing(x, colsel) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == [Union{Int, Missing}, Union{Int, Missing}, Int] + end + + for colsel in [Int[], Symbol[], [false, false, false], r"a", Not(:)] + y = disallowmissing(x, colsel) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == [Union{Int, Missing}, Union{Int, Missing}, Int] + end + end + + @test_throws MethodError disallowmissing(DataFrame(x=[missing])) + @test_throws MethodError disallowmissing(DataFrame(x=[1, missing])) +end + +@testset "test allowmissing" begin + df = DataFrame(x=Union{Int,Missing}[1,2,3], + y=[1,2,3], + z=[1,2,3]) + for x in [df, view(df, :, :)] + y = allowmissing(x) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == fill(Union{Missing, Int}, 3) + + for colsel in [:, names(x), [1,2,3], [true,true,true], r"", Not(r"a")] + y = allowmissing(x, colsel) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == fill(Union{Missing, 
Int}, 3) + end + + for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)] + y = allowmissing(x, colsel) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == [Union{Missing, Int}, Int, Int] + end + + for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)] + y = allowmissing(x, colsel) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == [Union{Int, Missing}, Int, Union{Missing, Int}] + end + + for colsel in [Int[], Symbol[], [false, false, false], r"a", Not(:)] + y = allowmissing(x, colsel) + @test y isa DataFrame + @test x == y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test eltypes(y) == [Union{Int, Missing}, Int, Int] + end + end +end + +@testset "test categorical" begin + df = DataFrame(x=["a", "b", "c"], + y=["a", "b", missing], + z=[1,2,3]) + for x in [df, view(df, :, :)] + y = categorical(x) + @test y isa DataFrame + @test x ≅ y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test y.x isa CategoricalVector{String} + @test y.y isa CategoricalVector{Union{Missing, String}} + @test y.z isa Vector{Int} + + for colsel in [:, names(x), [1,2,3], [true,true,true], r"", Not(r"a")] + y = categorical(x, colsel) + @test y isa DataFrame + @test x ≅ y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test y.x isa CategoricalVector{String} + @test y.y isa CategoricalVector{Union{Missing, String}} + @test y.z isa CategoricalVector{Int} + end + + for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)] + y = categorical(x, colsel) + @test y isa DataFrame + @test x ≅ y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test y.x isa CategoricalVector{String} + @test y.y isa Vector{Union{Missing, String}} + @test y.z isa Vector{Int} + end + + for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)] + y = categorical(x, colsel) + 
@test y isa DataFrame + @test x ≅ y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test y.x isa Vector{String} + @test y.y isa Vector{Union{Missing, String}} + @test y.z isa CategoricalVector{Int} + end + + for colsel in [Int[], Symbol[], [false, false, false], r"a", Not(:)] + y = categorical(x, colsel) + @test y isa DataFrame + @test x ≅ y + @test x.x !== y.x + @test x.y !== y.y + @test x.z !== y.z + @test y.x isa Vector{String} + @test y.y isa Vector{Union{Missing, String}} + @test y.z isa Vector{Int} + end + end +end + @testset "similar" begin df = DataFrame(a = ["foo"], b = CategoricalArray(["foo"]),