Skip to content

Commit

Permalink
add disallowmissing, allowmissing and categorical; fix a bug in categ…
Browse files Browse the repository at this point in the history
…orical!
  • Loading branch information
bkamins authored Aug 2, 2019
1 parent 94cd7ad commit 6102f89
Show file tree
Hide file tree
Showing 4 changed files with 360 additions and 3 deletions.
3 changes: 3 additions & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,16 @@ meltdf
## Basics

```@docs
allowmissing
allowmissing!
categorical
categorical!
completecases
copy
DataFrame!
deleterows!
describe
disallowmissing
disallowmissing!
dropmissing
dropmissing!
Expand Down
168 changes: 168 additions & 0 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ The following are normally implemented for AbstractDataFrames:
* [`dropmissing!`](@ref) : remove rows with missing values in-place
* [`nonunique`](@ref) : indexes of duplicate rows
* [`unique!`](@ref) : remove duplicate rows
* [`disallowmissing`](@ref) : drop support for missing values in columns
* [`allowmissing`](@ref) : add support for missing values in columns
* [`categorical`](@ref) : change column types to categorical
* `similar` : a DataFrame with similar columns as `d`
* `filter` : remove rows
* `filter!` : remove rows in-place
Expand Down Expand Up @@ -1280,3 +1283,168 @@ julia> ncol(df)
"""
(nrow, ncol)

"""
    disallowmissing(df::AbstractDataFrame,
                    cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:)

Return a copy of data frame `df` with columns `cols` converted
from element type `Union{T, Missing}` to `T` to drop support for missing values.

If `cols` is omitted all columns in the data frame are converted.

**Examples**

```jldoctest
julia> df = DataFrame(a=Union{Int,Missing}[1,2])
2×1 DataFrame
│ Row │ a      │
│     │ Int64⍰ │
├─────┼────────┤
│ 1   │ 1      │
│ 2   │ 2      │

julia> disallowmissing(df)
2×1 DataFrame
│ Row │ a     │
│     │ Int64 │
├─────┼───────┤
│ 1   │ 1     │
│ 2   │ 2     │
```
"""
function Missings.disallowmissing(df::AbstractDataFrame,
                                  cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:)
    # Positions of the columns that were requested for conversion.
    selected = Set(index(df)[cols])
    result = AbstractVector[]
    for colidx in axes(df, 2)
        col = df[!, colidx]
        # Columns outside the selection are only copied.
        if !(colidx in selected)
            push!(result, copy(col))
            continue
        end
        stripped = disallowmissing(col)
        # If the conversion returned the very same object, copy it so that the
        # new data frame never shares a column with `df`.
        push!(result, stripped === col ? copy(stripped) : stripped)
    end
    # Every vector in `result` is already fresh, so no further copying is requested.
    return DataFrame(result, _names(df), copycols=false)
end

"""
    allowmissing(df::AbstractDataFrame,
                 cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:)

Return a copy of data frame `df` with columns `cols` converted
to element type `Union{T, Missing}` from `T` to allow support for missing values.

If `cols` is omitted all columns in the data frame are converted.

**Examples**

```jldoctest
julia> df = DataFrame(a=[1,2])
2×1 DataFrame
│ Row │ a     │
│     │ Int64 │
├─────┼───────┤
│ 1   │ 1     │
│ 2   │ 2     │

julia> allowmissing(df)
2×1 DataFrame
│ Row │ a      │
│     │ Int64⍰ │
├─────┼────────┤
│ 1   │ 1      │
│ 2   │ 2      │
```
"""
function Missings.allowmissing(df::AbstractDataFrame,
                               cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon}=:)
    # Positions of the columns that were requested for conversion.
    selected = Set(index(df)[cols])

    # Build the replacement for column `j`; the returned vector is never
    # the same object as the one stored in `df`.
    function freshcolumn(j)
        src = df[!, j]
        j in selected || return copy(src)
        widened = allowmissing(src)
        # If the conversion returned `src` itself, copy it to avoid aliasing.
        return widened === src ? copy(widened) : widened
    end

    newcols = AbstractVector[freshcolumn(j) for j in axes(df, 2)]
    # The columns are already independent of `df`, so no further copying is requested.
    return DataFrame(newcols, _names(df), copycols=false)
end

"""
    categorical(df::AbstractDataFrame; compress::Bool=false)
    categorical(df::AbstractDataFrame,
                cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon};
                compress::Bool=false)

Return a copy of data frame `df` with columns `cols` converted to `CategoricalVector`.
If the function is called without passing the `cols` argument, all columns whose element type
is a subtype of `Union{AbstractString, Missing}` will be converted to categorical.

If the `compress` keyword argument is set to `true` then the created `CategoricalVector`s
will be compressed.

All created `CategoricalVector`s are unordered.

**Examples**

```jldoctest
julia> df = DataFrame(a=[1,2], b=["a","b"])
2×2 DataFrame
│ Row │ a     │ b      │
│     │ Int64 │ String │
├─────┼───────┼────────┤
│ 1   │ 1     │ a      │
│ 2   │ 2     │ b      │

julia> categorical(df)
2×2 DataFrame
│ Row │ a     │ b            │
│     │ Int64 │ Categorical… │
├─────┼───────┼──────────────┤
│ 1   │ 1     │ a            │
│ 2   │ 2     │ b            │

julia> categorical(df, :)
2×2 DataFrame
│ Row │ a            │ b            │
│     │ Categorical… │ Categorical… │
├─────┼──────────────┼──────────────┤
│ 1   │ 1            │ a            │
│ 2   │ 2            │ b            │
```
"""
function CategoricalArrays.categorical(df::AbstractDataFrame,
                                       cols::Union{ColumnIndex, AbstractVector, Regex, Not, Colon};
                                       compress::Bool=false)
    # Positions of the columns that were requested for conversion.
    chosen = Set(index(df)[cols])
    out = AbstractVector[]
    for j in axes(df, 2)
        v = df[!, j]
        # categorical always copies, so only the untouched columns need an explicit copy
        push!(out, j in chosen ? categorical(v, compress) : copy(v))
    end
    # No further copying is requested since every vector in `out` is new.
    return DataFrame(out, _names(df), copycols=false)
end

# Method used when no column selector is passed: only columns whose element type
# is a subtype of `Union{AbstractString, Missing}` are converted; all other
# columns are copied as they are.
function CategoricalArrays.categorical(df::AbstractDataFrame; compress::Bool=false)
    textlike(v) = eltype(v) <: Union{AbstractString, Missing}
    columns = AbstractVector[]
    for j in axes(df, 2)
        v = df[!, j]
        # categorical always copies, so only the untouched columns need an explicit copy
        push!(columns, textlike(v) ? categorical(v, compress) : copy(v))
    end
    # No further copying is requested since every vector in `columns` is new.
    return DataFrame(columns, _names(df), copycols=false)
end
4 changes: 2 additions & 2 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1047,10 +1047,10 @@ function categorical!(df::DataFrame, cnames::AbstractVector{<:ColumnIndex};
df
end

categorical!(df::DataFrame, cnames::Union{Regex, Not}; compress::Bool=false) =
categorical!(df::DataFrame, cnames::Union{Regex, Not, Colon}; compress::Bool=false) =
categorical!(df, index(df)[cnames], compress=compress)

function categorical!(df::DataFrame, cnames::Colon=:; compress::Bool=false)
function categorical!(df::DataFrame; compress::Bool=false)
for i in 1:size(df, 2)
if eltype(df[!, i]) <: Union{AbstractString, Missing}
df[!, i] = categorical(df[!, i], compress)
Expand Down
Loading

0 comments on commit 6102f89

Please sign in to comment.