From 6102f894f6d70f425293d39ddb67835072d0e717 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 2 Aug 2019 20:12:46 +0200
Subject: [PATCH] add disallowmissing, allowmissing and categorical; fix a bug
 in categorical!

---
 docs/src/lib/functions.md                  |   3 +
 src/abstractdataframe/abstractdataframe.jl | 168 ++++++++++++++++++
 src/dataframe/dataframe.jl                 |   4 +-
 test/dataframe.jl                          | 188 ++++++++++++++++++++-
 4 files changed, 360 insertions(+), 3 deletions(-)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index d80ab5f2e7..c0c2d5bffc 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -29,13 +29,16 @@ meltdf
 ## Basics
 
 ```@docs
+allowmissing
 allowmissing!
+categorical
 categorical!
 completecases
 copy
 DataFrame!
 deleterows!
 describe
+disallowmissing
 disallowmissing!
 dropmissing
 dropmissing!
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 1d3824536b..b5306763de 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -31,6 +31,9 @@ The following are normally implemented for AbstractDataFrames:
 * [`dropmissing!`](@ref) : remove rows with missing values in-place
 * [`nonunique`](@ref) : indexes of duplicate rows
 * [`unique!`](@ref) : remove duplicate rows
+* [`disallowmissing`](@ref) : drop support for missing values in columns
+* [`allowmissing`](@ref) : add support for missing values in columns
+* [`categorical`](@ref) : change column types to categorical
 * `similar` : a DataFrame with similar columns as `d`
 * `filter` : remove rows
 * `filter!` : remove rows in-place
@@ -1280,3 +1283,168 @@ julia> ncol(df)
 
 """
 (nrow, ncol)
+
+"""
+    disallowmissing(df::AbstractDataFrame,
+                    cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:)
+
+Return a copy of data frame `df` with columns `cols` converted
+from element type `Union{T, Missing}` to `T` to drop support for missing values.
+
+If `cols` is omitted, all columns in the data frame are converted.
+
+**Examples**
+
+```jldoctest
+julia> df = DataFrame(a=Union{Int,Missing}[1,2])
+2×1 DataFrame
+│ Row │ a      │
+│     │ Int64⍰ │
+├─────┼────────┤
+│ 1   │ 1      │
+│ 2   │ 2      │
+
+julia> disallowmissing(df)
+2×1 DataFrame
+│ Row │ a     │
+│     │ Int64 │
+├─────┼───────┤
+│ 1   │ 1     │
+│ 2   │ 2     │
+```
+"""
+function Missings.disallowmissing(df::AbstractDataFrame,
+                                  cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:)
+    # Positions of the columns to convert, kept in a set for membership tests.
+    selected = Set(index(df)[cols])
+    converted = AbstractVector[]
+    for pos in axes(df, 2)
+        col = df[!, pos]
+        # Unselected columns stay as `col`; a selected one may also come back as `col`
+        # itself. Either way the identity check below copies it, so the source vector
+        # object is never reused in the result.
+        newcol = pos in selected ? disallowmissing(col) : col
+        push!(converted, newcol === col ? copy(col) : newcol)
+    end
+    DataFrame(converted, _names(df), copycols=false)
+end
+
+"""
+    allowmissing(df::AbstractDataFrame,
+                 cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:)
+
+Return a copy of data frame `df` with columns `cols` converted
+to element type `Union{T, Missing}` from `T` to allow support for missing values.
+
+If `cols` is omitted, all columns in the data frame are converted.
+
+**Examples**
+
+```jldoctest
+julia> df = DataFrame(a=[1,2])
+2×1 DataFrame
+│ Row │ a     │
+│     │ Int64 │
+├─────┼───────┤
+│ 1   │ 1     │
+│ 2   │ 2     │
+
+julia> allowmissing(df)
+2×1 DataFrame
+│ Row │ a      │
+│     │ Int64⍰ │
+├─────┼────────┤
+│ 1   │ 1      │
+│ 2   │ 2      │
+```
+"""
+function Missings.allowmissing(df::AbstractDataFrame,
+                               cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:)
+    # Positions of the columns to convert, kept in a set for membership tests.
+    selected = Set(index(df)[cols])
+    converted = AbstractVector[]
+    for pos in axes(df, 2)
+        col = df[!, pos]
+        # Unselected columns stay as `col`; a selected one may also come back as `col`
+        # itself. Either way the identity check below copies it, so the source vector
+        # object is never reused in the result.
+        newcol = pos in selected ? allowmissing(col) : col
+        push!(converted, newcol === col ? copy(col) : newcol)
+    end
+    DataFrame(converted, _names(df), copycols=false)
+end
+
+"""
+    categorical(df::AbstractDataFrame; compress::Bool=false)
+    categorical(df::AbstractDataFrame,
+                cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon};
+                compress::Bool=false)
+
+Return a copy of data frame `df` with columns `cols` converted to `CategoricalVector`.
+If the function is called without passing the `cols` argument, all columns whose element type
+is a subtype of `Union{AbstractString, Missing}` will be converted to categorical.
+
+If the `compress` keyword argument is set to `true` then the created `CategoricalVector`s
+will be compressed.
+
+All created `CategoricalVector`s are unordered.
+
+**Examples**
+
+```jldoctest
+julia> df = DataFrame(a=[1,2], b=["a","b"])
+2×2 DataFrame
+│ Row │ a     │ b      │
+│     │ Int64 │ String │
+├─────┼───────┼────────┤
+│ 1   │ 1     │ a      │
+│ 2   │ 2     │ b      │
+
+julia> categorical(df)
+2×2 DataFrame
+│ Row │ a     │ b            │
+│     │ Int64 │ Categorical… │
+├─────┼───────┼──────────────┤
+│ 1   │ 1     │ a            │
+│ 2   │ 2     │ b            │
+
+julia> categorical(df, :)
+2×2 DataFrame
+│ Row │ a            │ b            │
+│     │ Categorical… │ Categorical… │
+├─────┼──────────────┼──────────────┤
+│ 1   │ 1            │ a            │
+│ 2   │ 2            │ b            │
+```
+
+"""
+function CategoricalArrays.categorical(df::AbstractDataFrame,
+                                       cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon};
+                                       compress::Bool=false)
+    # Positions of the columns to convert, kept in a set for membership tests.
+    selected = Set(index(df)[cols])
+    converted = AbstractVector[]
+    for pos in axes(df, 2)
+        col = df[!, pos]
+        # Selected columns go through `categorical`, which always copies; the others
+        # are copied as they are.
+        newcol = pos in selected ? categorical(col, compress) : copy(col)
+        push!(converted, newcol)
+    end
+    # Columns were copied or converted above, so the constructor need not copy again.
+    DataFrame(converted, _names(df), copycols=false)
+end
+
+function CategoricalArrays.categorical(df::AbstractDataFrame; compress::Bool=false)
+    converted = AbstractVector[]
+    for pos in axes(df, 2)
+        col = df[!, pos]
+        # Only columns whose element type is a subtype of `Union{AbstractString, Missing}`
+        # are converted; `categorical` always copies, and the rest are copied as they are.
+        stringlike = eltype(col) <: Union{AbstractString, Missing}
+        newcol = stringlike ? categorical(col, compress) : copy(col)
+        push!(converted, newcol)
+    end
+    # Columns were copied or converted above, so the constructor need not copy again.
+    DataFrame(converted, _names(df), copycols=false)
+end
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index f0d1b9e361..2b26db8eab 100644
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -1047,10 +1047,10 @@ function categorical!(df::DataFrame, cnames::AbstractVector{<:ColumnIndex};
     df
 end
 
-categorical!(df::DataFrame, cnames::Union{Regex, Not}; compress::Bool=false) =
+categorical!(df::DataFrame, cnames::Union{Regex, Not, Colon}; compress::Bool=false) =
     categorical!(df, index(df)[cnames], compress=compress)
 
-function categorical!(df::DataFrame, cnames::Colon=:; compress::Bool=false)
+function categorical!(df::DataFrame; compress::Bool=false)
     for i in 1:size(df, 2)
         if eltype(df[!, i]) <: Union{AbstractString, Missing}
             df[!, i] = categorical(df[!, i], compress)
diff --git a/test/dataframe.jl b/test/dataframe.jl
index b2cc06973e..7a0bd4a1c9 100644
--- a/test/dataframe.jl
+++ b/test/dataframe.jl
@@ -1067,7 +1067,9 @@ end
                    CategoricalArrays.CategoricalString{UInt32}]))
     @test all(map(<:, eltypes(categorical!(deepcopy(df), :)),
                   [CategoricalArrays.CategoricalString{UInt32},
-                   Char, Bool, Int,
+                   CategoricalArrays.CategoricalValue{Char,UInt32},
+                   CategoricalArrays.CategoricalValue{Bool,UInt32},
+                   CategoricalArrays.CategoricalValue{Int,UInt32},
                    CategoricalArrays.CategoricalString{UInt32}]))
     @test all(map(<:, eltypes(categorical!(deepcopy(df), compress=true)),
                   [CategoricalArrays.CategoricalString{UInt8},
@@ -1101,6 +1103,12 @@ end
     df = DataFrame([["a", missing]])
     categorical!(df)
     @test df.x1 isa CategoricalVector{Union{Missing, String}}
+
+    df = DataFrame(x1=[1, 2])
+    categorical!(df)
+    @test df.x1 isa Vector{Int}
+    categorical!(df, :)
+    @test df.x1 isa CategoricalVector{Int}
 end
 
 @testset "unstack promotion to support missing values" begin
@@ -1332,6 +1340,184 @@ end
     @test eltype(df.d) == Union{Int, Missing}
 end
 
+@testset "test disallowmissing" begin  # checks the non-mutating disallowmissing on data frames
+    df = DataFrame(x=Union{Int,Missing}[1,2,3],
+                   y=Union{Int,Missing}[1,2,3],
+                   z=[1,2,3])  # z is declared without Missing in its element type
+    for x in [df, view(df, :, :)]  # same checks on the data frame and on a full view of it
+        y = disallowmissing(x)  # no selector given: every column is converted
+        @test y isa DataFrame
+        @test x == y  # values are unchanged
+        @test x.x !== y.x  # each result column must be a different object than the source
+        @test x.y !== y.y
+        @test x.z !== y.z
+        @test eltypes(y) == [Int, Int, Int]
+
+        for colsel in [:, names(x), [1,2,3], [true,true,true], r"", Not(r"a")]  # selectors for all columns
+            y = disallowmissing(x, colsel)
+            @test y isa DataFrame
+            @test x == y
+            @test x.x !== y.x
+            @test x.y !== y.y
+            @test x.z !== y.z
+            @test eltypes(y) == [Int, Int, Int]
+        end
+
+        for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)]  # selectors for column x only
+            y = disallowmissing(x, colsel)
+            @test y isa DataFrame
+            @test x == y
+            @test x.x !== y.x
+            @test x.y !== y.y  # unselected columns must still not be the source objects
+            @test x.z !== y.z
+            @test eltypes(y) == [Int, Union{Missing, Int}, Int]  # y keeps its Union element type
+        end
+
+        for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)]  # selectors for column z only
+            y = disallowmissing(x, colsel)
+            @test y isa DataFrame
+            @test x == y
+            @test x.x !== y.x
+            @test x.y !== y.y
+            @test x.z !== y.z  # z is already Int, yet the result must not be the source object
+            @test eltypes(y) == [Union{Int, Missing}, Union{Int, Missing}, Int]
+        end
+
+        for colsel in [Int[], Symbol[], [false, false, false], r"a", Not(:)]  # selectors matching no column
+            y = disallowmissing(x, colsel)
+            @test y isa DataFrame
+            @test x == y
+            @test x.x !== y.x
+            @test x.y !== y.y
+            @test x.z !== y.z
+            @test eltypes(y) == [Union{Int, Missing}, Union{Int, Missing}, Int]  # element types unchanged
+        end
+    end
+
+    @test_throws MethodError disallowmissing(DataFrame(x=[missing]))  # column holding only missing
+    @test_throws MethodError disallowmissing(DataFrame(x=[1, missing]))  # column holding a missing value
+end
+
+@testset "test allowmissing" begin  # checks the non-mutating allowmissing on data frames
+    df = DataFrame(x=Union{Int,Missing}[1,2,3],  # x already allows missing
+                   y=[1,2,3],
+                   z=[1,2,3])
+    for x in [df, view(df, :, :)]  # same checks on the data frame and on a full view of it
+        y = allowmissing(x)  # no selector given: every column is converted
+        @test y isa DataFrame
+        @test x == y  # values are unchanged
+        @test x.x !== y.x  # each result column must be a different object than the source
+        @test x.y !== y.y
+        @test x.z !== y.z
+        @test eltypes(y) == fill(Union{Missing, Int}, 3)
+
+        for colsel in [:, names(x), [1,2,3], [true,true,true], r"", Not(r"a")]  # selectors for all columns
+            y = allowmissing(x, colsel)
+            @test y isa DataFrame
+            @test x == y
+            @test x.x !== y.x
+            @test x.y !== y.y
+            @test x.z !== y.z
+            @test eltypes(y) == fill(Union{Missing, Int}, 3)
+        end
+
+        for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)]  # selectors for column x only
+            y = allowmissing(x, colsel)
+            @test y isa DataFrame
+            @test x == y
+            @test x.x !== y.x  # x already allows missing, yet the result must not be the source object
+            @test x.y !== y.y
+            @test x.z !== y.z
+            @test eltypes(y) == [Union{Missing, Int}, Int, Int]
+        end
+
+        for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)]  # selectors for column z only
+            y = allowmissing(x, colsel)
+            @test y isa DataFrame
+            @test x == y
+            @test x.x !== y.x
+            @test x.y !== y.y  # unselected columns must still not be the source objects
+            @test x.z !== y.z
+            @test eltypes(y) == [Union{Int, Missing}, Int, Union{Missing, Int}]  # only z gains Missing
+        end
+
+        for colsel in [Int[], Symbol[], [false, false, false], r"a", Not(:)]  # selectors matching no column
+            y = allowmissing(x, colsel)
+            @test y isa DataFrame
+            @test x == y
+            @test x.x !== y.x
+            @test x.y !== y.y
+            @test x.z !== y.z
+            @test eltypes(y) == [Union{Int, Missing}, Int, Int]  # element types unchanged
+        end
+    end
+end
+
+@testset "test categorical" begin  # checks the non-mutating categorical on data frames
+    df = DataFrame(x=["a", "b", "c"],
+                   y=["a", "b", missing],  # string column that also holds a missing value
+                   z=[1,2,3])
+    for x in [df, view(df, :, :)]  # same checks on the data frame and on a full view of it
+        y = categorical(x)  # no selector given: only the string-like columns x and y are converted
+        @test y isa DataFrame
+        @test x ≅ y  # NOTE(review): ≅ rather than ==, presumably because column y holds a missing
+        @test x.x !== y.x  # each result column must be a different object than the source
+        @test x.y !== y.y
+        @test x.z !== y.z
+        @test y.x isa CategoricalVector{String}
+        @test y.y isa CategoricalVector{Union{Missing, String}}
+        @test y.z isa Vector{Int}  # the integer column is left as a plain vector
+
+        for colsel in [:, names(x), [1,2,3], [true,true,true], r"", Not(r"a")]  # selectors for all columns
+            y = categorical(x, colsel)
+            @test y isa DataFrame
+            @test x ≅ y
+            @test x.x !== y.x
+            @test x.y !== y.y
+            @test x.z !== y.z
+            @test y.x isa CategoricalVector{String}
+            @test y.y isa CategoricalVector{Union{Missing, String}}
+            @test y.z isa CategoricalVector{Int}  # an explicit selector converts non-string columns too
+        end
+
+        for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)]  # selectors for column x only
+            y = categorical(x, colsel)
+            @test y isa DataFrame
+            @test x ≅ y
+            @test x.x !== y.x
+            @test x.y !== y.y  # unselected columns must still not be the source objects
+            @test x.z !== y.z
+            @test y.x isa CategoricalVector{String}
+            @test y.y isa Vector{Union{Missing, String}}
+            @test y.z isa Vector{Int}
+        end
+
+        for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)]  # selectors for column z only
+            y = categorical(x, colsel)
+            @test y isa DataFrame
+            @test x ≅ y
+            @test x.x !== y.x
+            @test x.y !== y.y
+            @test x.z !== y.z
+            @test y.x isa Vector{String}  # string columns stay plain when not selected
+            @test y.y isa Vector{Union{Missing, String}}
+            @test y.z isa CategoricalVector{Int}
+        end
+
+        for colsel in [Int[], Symbol[], [false, false, false], r"a", Not(:)]  # selectors matching no column
+            y = categorical(x, colsel)
+            @test y isa DataFrame
+            @test x ≅ y
+            @test x.x !== y.x
+            @test x.y !== y.y
+            @test x.z !== y.z
+            @test y.x isa Vector{String}  # nothing is converted
+            @test y.y isa Vector{Union{Missing, String}}
+            @test y.z isa Vector{Int}
+        end
+    end
+end
+
 @testset "similar" begin
     df = DataFrame(a = ["foo"],
                    b = CategoricalArray(["foo"]),