From 289fadfd752d22b6bec3d9450d46a95e25eda724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 11 Jun 2021 19:50:59 +0200 Subject: [PATCH 01/29] add setindex! rules --- src/subdataframe/subdataframe.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 6b2b308aad..7001c27e3b 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -181,8 +181,16 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, id setindex!(sdf, val, idx[1], idx[2]) end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) - parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val - return sdf + if colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index && + && val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val) + T = eltype(val) + newcol = Tables.allocatecolumn(Union{T, Missing}, n) + fill!(newcol, missing) + view(newcol, rows(sdf)) = val + else + parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val + end +return sdf end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::typeof(!), colinds::Any) throw(ArgumentError("setting index of SubDataFrame using ! as row selector is not allowed")) From a2532363d99bc1ba00177177ba3fb43eb415aeb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 20 Jun 2021 23:44:29 +0200 Subject: [PATCH 02/29] implement setindex! 
and broadcasting assignment --- src/other/broadcasting.jl | 27 +++++++++++++++++++++++---- src/subdataframe/subdataframe.jl | 9 +++++---- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl index 60e54075ac..b8fc05c857 100644 --- a/src/other/broadcasting.jl +++ b/src/other/broadcasting.jl @@ -80,8 +80,8 @@ end ### Broadcasting assignment -struct LazyNewColDataFrame{T} - df::DataFrame +struct LazyNewColDataFrame{T,D} + df::D col::T end @@ -108,6 +108,18 @@ function Base.dotview(df::DataFrame, ::Colon, cols::ColumnIndex) return LazyNewColDataFrame(df, Symbol(cols)) end +function Base.dotview(df::SubDataFrame, ::Colon, cols::ColumnIndex) + haskey(index(df), cols) && return view(df, :, cols) + if !(cols isa SymbolOrString) + throw(ArgumentError("creating new columns using an integer index is disallowed")) + end + if !(getfield(df, :colindex) isa Index) + throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * + "columns of a parent data frame is disallowed")) + end + return LazyNewColDataFrame(df, Symbol(cols)) +end + function Base.dotview(df::DataFrame, ::typeof(!), cols) if !(cols isa ColumnIndex) return ColReplaceDataFrame(df, index(df)[cols]) @@ -144,15 +156,22 @@ if isdefined(Base, :dotgetproperty) end function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted{T}) where T + df = lazydf.df + @assert columnindex(df, lazydf.col) == 0 if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}} bc_tmp = Base.Broadcast.Broadcasted{T}(bc.f, bc.args, ()) v = Base.Broadcast.materialize(bc_tmp) - col = similar(Vector{typeof(v)}, nrow(lazydf.df)) + col = similar(Vector{typeof(v)}, nrow(df)) copyto!(col, bc) else col = Base.Broadcast.materialize(bc) end - lazydf.df[!, lazydf.col] = col + if df isa DataFrame + return df[!, lazydf.col] = col + else + @assert df isa SubDataFrame && getfield(df, :colindex) isa Index + return df[:, lazydf.col] = col + end 
end function _copyto_helper!(dfcol::AbstractVector, bc::Base.Broadcast.Broadcasted, col::Int) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 7001c27e3b..059cc50de0 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -182,15 +182,16 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, id end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) if colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index && - && val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val) + val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val) T = eltype(val) - newcol = Tables.allocatecolumn(Union{T, Missing}, n) + newcol = Tables.allocatecolumn(Union{T, Missing}, nrow(parent(sdf))) fill!(newcol, missing) - view(newcol, rows(sdf)) = val + newcol[rows(sdf)] = val + parent(sdf)[!, colinds] = newcol else parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val end -return sdf + return sdf end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::typeof(!), colinds::Any) throw(ArgumentError("setting index of SubDataFrame using ! as row selector is not allowed")) From e33c60526ea252a7ccd4afa4e470fc28e27f8633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 21 Jun 2021 11:14:33 +0200 Subject: [PATCH 03/29] implement insertcols! 
--- docs/src/lib/indexing.md | 8 +- src/abstractdataframe/abstractdataframe.jl | 206 +++++++++++++++++++++ src/dataframe/dataframe.jl | 180 ------------------ 3 files changed, 213 insertions(+), 181 deletions(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 44869d3c3d..0c3f192f5f 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -143,6 +143,9 @@ so it is unsafe to use it afterwards (the column length correctness will be pres * `sdf[CartesianIndex(row, col)] = v` -> the same as `sdf[row, col] = v`; * `sdf[row, cols] = v` -> the same as `dfr = df[row, cols]; dfr[:] = v` in-place; * `sdf[rows, col] = v` -> set rows `rows` of column `col`, in-place; `v` must be an abstract vector; + if `sdf` was created with `:` as column selector, `rows` is `:` and `col` is a `Symbol` or `AbstractString` + that is not present in `df` then a new column in `df` is created and holds `v` in rows selected in `sdf` + and `missing` in all rows present in `parent(sdf)` but not present in `sdf`. * `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` when column names must match; @@ -171,7 +174,6 @@ The following broadcasting rules apply to `AbstractDataFrame` objects: Note that if broadcasting assignment operation throws an error the target data frame may be partially changed so it is unsafe to use it afterwards (the column length correctness will be preserved). - Broadcasting `DataFrameRow` is currently not allowed (which is consistent with `NamedTuple`). It is possible to assign a value to `AbstractDataFrame` and `DataFrameRow` objects using the `.=` operator. 
@@ -190,6 +192,10 @@ Additional rules: `df` is performed in-place; if `rows` is `:` and `col` is `Symbol` or `AbstractString` and it is missing from `df` then a new column is allocated and added; the length of the column is always the value of `nrow(df)` before the assignment takes place; +* in the `sdf[:, col] .= v` if `sdf` was created with `:` as column selector + and `col` is a `Symbol` or `AbstractString` that is not present in `df` then a new column in `df` + is created and holds contents of `v` broadcasted onto rows selected in `sdf` + and `missing` in all rows present in `parent(sdf)` but not present in `sdf`. * in the `df[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; if `col` is `Symbol` or `AbstractString` and it is missing from `df` then a new column is allocated added; the length of the column is always the value of `nrow(df)` before the assignment takes place; diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index d50a346c51..0a2c1bc716 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2173,3 +2173,209 @@ Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) = Base.setindex!(::AbstractDataFrame, ::Any, ::Union{Symbol, Integer, AbstractString}) = throw(ArgumentError("syntax df[column] is not supported use df[!, column] instead")) + +# insertcols! + +""" + insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...; + makeunique::Bool=false, copycols::Bool=true) + +Insert a column into a data frame in place. Return the updated data frame. +If `col` is omitted it is set to `ncol(df)+1` +(the column is inserted as the last column). 
+ +# Arguments +- `df` : the data frame to which we want to add columns +- `col` : a position at which we want to insert a column, passed as an integer + or a column name (a string or a `Symbol`); the column selected with `col` + and columns following it are shifted to the right in `df` after the operation +- `name` : the name of the new column +- `val` : an `AbstractVector` giving the contents of the new column or a value of any + type other than `AbstractArray` which will be repeated to fill a new vector; + As a particular rule a values stored in a `Ref` or a `0`-dimensional `AbstractArray` + are unwrapped and treated in the same way. +- `makeunique` : Defines what to do if `name` already exists in `df`; + if it is `false` an error will be thrown; if it is `true` a new unique name will + be generated by adding a suffix +- `copycols` : whether vectors passed as columns should be copied + +If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. + +If `df` is a `SubDataFrame` then it must be created with `:` as column selector +(otherwise an error is thrown). In this case the `copycols` keyword argument +is ignored an added column is always copied and the parent data frame is +filled with `missing` in rows that are filtered out by `df`. + +If `df` isa `DataFrame` that has no columns and only values +other than `AbstractVector` are passed then it is used to create a one element +column. +If `df` isa `DataFrame` that has no columns and at least one `AbstractVector` is +passed then its length is used to determine the number of elements in all +created columns. +In all other cases the number of rows in all created columns must match +`nrow(df)`. +. 
+ +# Examples +```jldoctest +julia> df = DataFrame(a=1:3) +3×1 DataFrame + Row │ a + │ Int64 +─────┼─────── + 1 │ 1 + 2 │ 2 + 3 │ 3 + +julia> insertcols!(df, 1, :b => 'a':'c') +3×2 DataFrame + Row │ b a + │ Char Int64 +─────┼───────────── + 1 │ a 1 + 2 │ b 2 + 3 │ c 3 + +julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) +3×4 DataFrame + Row │ b c c_1 a + │ Char Int64 Int64 Int64 +─────┼─────────────────────────── + 1 │ a 2 3 1 + 2 │ b 3 4 2 + 3 │ c 4 5 3 +``` +""" +function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...; + makeunique::Bool=false, copycols::Bool=true) + if !(df isa DataFrame || (df isa SubDataFrame && getfield(df, :colindex) isa Index)) + throw(ArgumentError("insertcols! is only supported for DataFrame or" * + "SubDataFrame created with `:` as column selector")) + end + col_ind = Int(col isa SymbolOrString ? columnindex(df, col) : col) + if !(0 < col_ind <= ncol(df) + 1) + throw(ArgumentError("attempt to insert a column to a data frame with " * + "$(ncol(df)) columns at index $col_ind")) + end + + if !makeunique + if !allunique(first.(name_cols)) + throw(ArgumentError("Names of columns to be inserted into a data frame " * + "must be unique when `makeunique=true`")) + end + for (n, _) in name_cols + if hasproperty(df, n) + throw(ArgumentError("Column $n is already present in the data frame " * + "which is not allowed when `makeunique=true`")) + end + end + end + + if ncol(df) == 0 && df isa DataFrame + target_row_count = -1 + else + target_row_count = nrow(df) + end + + for (n, v) in name_cols + if v isa AbstractVector + if target_row_count == -1 + target_row_count = length(v) + elseif length(v) != target_row_count + if target_row_count == nrow(df) + throw(DimensionMismatch("length of new column $n which is " * + "$(length(v)) must match the number " * + "of rows in data frame ($(nrow(df)))")) + else + throw(DimensionMismatch("all vectors passed to be inserted into " * + "a data frame must have 
the same length")) + end + end + elseif v isa AbstractArray && ndims(v) > 1 + throw(ArgumentError("adding AbstractArray other than AbstractVector as " * + "a column of a data frame is not allowed")) + end + end + if target_row_count == -1 + target_row_count = 1 + end + + for (name, item) in name_cols + if !(item isa AbstractVector) + if item isa Union{AbstractArray{<:Any, 0}, Ref} + x = item[] + item_new = fill!(Tables.allocatecolumn(typeof(x), target_row_count), x) + else + @assert !(item isa AbstractArray) + item_new = fill!(Tables.allocatecolumn(typeof(item), target_row_count), item) + end + elseif item isa AbstractRange + item_new = collect(item) + elseif copycols + item_new = copy(item) + else + item_new = item + end + + if df isa DataFrame + dfp = df + else + dfp = parent(df) + T = eltype(item_new) + newcol = Tables.allocatecolumn(Union{T, Missing}, nrow(dfp)) + fill!(newcol, missing) + newcol[rows(df)] = item_new + item_new = newcol + end + + firstindex(item_new) != 1 && _onebased_check_error() + + if ncol(dfp) == 0 + dfp[!, name] = item_new + else + if hasproperty(dfp, name) + @assert makeunique + k = 1 + while true + nn = Symbol("$(name)_$k") + if !hasproperty(dfp, nn) + name = nn + break + end + k += 1 + end + end + insert!(index(dfp), col_ind, name) + insert!(_columns(dfp), col_ind, item_new) + end + col_ind += 1 + end + return df +end + +insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString, <:Any}...; + makeunique::Bool=false, copycols::Bool=true) = + insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)..., + makeunique=makeunique, copycols=copycols) + +insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol, <:Any}...; + makeunique::Bool=false, copycols::Bool=true) = + insertcols!(df, ncol(df)+1, name_cols..., makeunique=makeunique, copycols=copycols) + +insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString, <:Any}...; + makeunique::Bool=false, copycols::Bool=true) = + insertcols!(df, (Symbol(n) 
=> v for (n, v) in name_cols)..., + makeunique=makeunique, copycols=copycols) + +function insertcols!(df::AbstractDataFrame, col::Int=ncol(df)+1; makeunique::Bool=false, name_cols...) + if !(0 < col <= ncol(df) + 1) + throw(ArgumentError("attempt to insert a column to a data frame with " * + "$(ncol(df)) columns at index $col")) + end + if !isempty(name_cols) + # an explicit error is thrown as keyword argument was supported in the past + throw(ArgumentError("inserting colums using a keyword argument is not supported, " * + "pass a Pair as a positional argument instead")) + end + return df +end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 148b04eb6a..044fb6c549 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -735,186 +735,6 @@ for T1 in (:AbstractVector, :Not, :Colon, :(typeof(!))), end end -############################################################################## -## -## Mutating methods -## -############################################################################## - -""" - insertcols!(df::DataFrame[, col], (name=>val)::Pair...; - makeunique::Bool=false, copycols::Bool=true) - -Insert a column into a data frame in place. Return the updated `DataFrame`. -If `col` is omitted it is set to `ncol(df)+1` -(the column is inserted as the last column). - -# Arguments -- `df` : the DataFrame to which we want to add columns -- `col` : a position at which we want to insert a column, passed as an integer - or a column name (a string or a `Symbol`); the column selected with `col` - and columns following it are shifted to the right in `df` after the operation -- `name` : the name of the new column -- `val` : an `AbstractVector` giving the contents of the new column or a value of any - type other than `AbstractArray` which will be repeated to fill a new vector; - As a particular rule a values stored in a `Ref` or a `0`-dimensional `AbstractArray` - are unwrapped and treated in the same way. 
-- `makeunique` : Defines what to do if `name` already exists in `df`; - if it is `false` an error will be thrown; if it is `true` a new unique name will - be generated by adding a suffix -- `copycols` : whether vectors passed as columns should be copied - -If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. - -# Examples -```jldoctest -julia> df = DataFrame(a=1:3) -3×1 DataFrame - Row │ a - │ Int64 -─────┼─────── - 1 │ 1 - 2 │ 2 - 3 │ 3 - -julia> insertcols!(df, 1, :b => 'a':'c') -3×2 DataFrame - Row │ b a - │ Char Int64 -─────┼───────────── - 1 │ a 1 - 2 │ b 2 - 3 │ c 3 - -julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) -3×4 DataFrame - Row │ b c c_1 a - │ Char Int64 Int64 Int64 -─────┼─────────────────────────── - 1 │ a 2 3 1 - 2 │ b 3 4 2 - 3 │ c 4 5 3 -``` -""" -function insertcols!(df::DataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...; - makeunique::Bool=false, copycols::Bool=true) - col_ind = Int(col isa SymbolOrString ? columnindex(df, col) : col) - if !(0 < col_ind <= ncol(df) + 1) - throw(ArgumentError("attempt to insert a column to a data frame with " * - "$(ncol(df)) columns at index $col_ind")) - end - - if !makeunique - if !allunique(first.(name_cols)) - throw(ArgumentError("Names of columns to be inserted into a data frame " * - "must be unique when `makeunique=true`")) - end - for (n, _) in name_cols - if hasproperty(df, n) - throw(ArgumentError("Column $n is already present in the data frame " * - "which is not allowed when `makeunique=true`")) - end - end - end - - if ncol(df) == 0 - target_row_count = -1 - else - target_row_count = nrow(df) - end - - for (n, v) in name_cols - if v isa AbstractVector - if target_row_count == -1 - target_row_count = length(v) - elseif length(v) != target_row_count - if target_row_count == nrow(df) - throw(DimensionMismatch("length of new column $n which is " * - "$(length(v)) must match the number " * - "of rows in data frame ($(nrow(df)))")) - else - 
throw(DimensionMismatch("all vectors passed to be inserted into " * - "a data frame must have the same length")) - end - end - elseif v isa AbstractArray && ndims(v) > 1 - throw(ArgumentError("adding AbstractArray other than AbstractVector as " * - "a column of a data frame is not allowed")) - end - end - if target_row_count == -1 - target_row_count = 1 - end - - for (name, item) in name_cols - if !(item isa AbstractVector) - if item isa Union{AbstractArray{<:Any, 0}, Ref} - x = item[] - item_new = fill!(Tables.allocatecolumn(typeof(x), target_row_count), x) - else - @assert !(item isa AbstractArray) - item_new = fill!(Tables.allocatecolumn(typeof(item), target_row_count), item) - end - elseif item isa AbstractRange - item_new = collect(item) - elseif copycols - item_new = copy(item) - else - item_new = item - end - - firstindex(item_new) != 1 && _onebased_check_error() - - if ncol(df) == 0 - df[!, name] = item_new - else - if hasproperty(df, name) - @assert makeunique - k = 1 - while true - nn = Symbol("$(name)_$k") - if !hasproperty(df, nn) - name = nn - break - end - k += 1 - end - end - insert!(index(df), col_ind, name) - insert!(_columns(df), col_ind, item_new) - end - col_ind += 1 - end - return df -end - -insertcols!(df::DataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString, <:Any}...; - makeunique::Bool=false, copycols::Bool=true) = - insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)..., - makeunique=makeunique, copycols=copycols) - -insertcols!(df::DataFrame, name_cols::Pair{Symbol, <:Any}...; - makeunique::Bool=false, copycols::Bool=true) = - insertcols!(df, ncol(df)+1, name_cols..., makeunique=makeunique, copycols=copycols) - -insertcols!(df::DataFrame, name_cols::Pair{<:AbstractString, <:Any}...; - makeunique::Bool=false, copycols::Bool=true) = - insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)..., - makeunique=makeunique, copycols=copycols) - -function insertcols!(df::DataFrame, col::Int=ncol(df)+1; makeunique::Bool=false, 
name_cols...) - if !(0 < col <= ncol(df) + 1) - throw(ArgumentError("attempt to insert a column to a data frame with " * - "$(ncol(df)) columns at index $col")) - end - if !isempty(name_cols) - # an explicit error is thrown as keyword argument was supported in the past - throw(ArgumentError("inserting colums using a keyword argument is not supported, " * - "pass a Pair as a positional argument instead")) - end - return df -end - """ copy(df::DataFrame; copycols::Bool=true) From 7f1814a72ccb39f6f8d34db598b701bad652d104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 21 Jun 2021 11:17:13 +0200 Subject: [PATCH 04/29] add NEWS.md entry --- NEWS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/NEWS.md b/NEWS.md index 243e5da8c4..bece7fc60a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,12 @@ * correctly handle selectors of the form `:col => AsTable` and `:col => cols` by expanding a single column into multiple columns ([#2780](https://github.com/JuliaData/DataFrames.jl/pull/2780)) +* if `sdf` is a `SubDataFrame` created with `:` as a column selector then + `insertcols!`, `sdf[:, col] = v`, and `sdf[:, col] .= v` where `col` is + a column not present in `sdf` is allowed and it creates a new column in + `parent(sdf)` with `missing` values stored in rows that are filtered-out + in `sdf`. 
+ ([XXXX](https://github.com/JuliaData/DataFrames.jl/pull/XXXX)) ## Bug fixes From 4a22a1dd9d6ce6fc8790d5d9dc2ec531cf2c2a75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 27 Jun 2021 09:51:24 +0200 Subject: [PATCH 05/29] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/lib/indexing.md | 7 ++++--- src/abstractdataframe/abstractdataframe.jl | 8 ++++---- src/other/broadcasting.jl | 2 +- src/subdataframe/subdataframe.jl | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 0c3f192f5f..4a55c5fae2 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -143,9 +143,10 @@ so it is unsafe to use it afterwards (the column length correctness will be pres * `sdf[CartesianIndex(row, col)] = v` -> the same as `sdf[row, col] = v`; * `sdf[row, cols] = v` -> the same as `dfr = df[row, cols]; dfr[:] = v` in-place; * `sdf[rows, col] = v` -> set rows `rows` of column `col`, in-place; `v` must be an abstract vector; - if `sdf` was created with `:` as column selector, `rows` is `:` and `col` is a `Symbol` or `AbstractString` - that is not present in `df` then a new column in `df` is created and holds `v` in rows selected in `sdf` - and `missing` in all rows present in `parent(sdf)` but not present in `sdf`. + if `rows` is `:` and `col` is a `Symbol` or `AbstractString` that is not present in `df` and + `sdf` was created with `:` as column selector, then a new column is added to `df` holding + `v` in rows selected in `sdf` and `missing` in all rows present in `parent(sdf)` + but not present in `sdf`. 
* `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` when column names must match; diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 0a2c1bc716..6e74b7e372 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2201,10 +2201,10 @@ If `col` is omitted it is set to `ncol(df)+1` If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. -If `df` is a `SubDataFrame` then it must be created with `:` as column selector +If `df` is a `SubDataFrame` then it must have been created with `:` as column selector (otherwise an error is thrown). In this case the `copycols` keyword argument -is ignored an added column is always copied and the parent data frame is -filled with `missing` in rows that are filtered out by `df`. +is ignored (i.e. the added column is always copied) and the parent data frame's +column is filled with `missing` in rows that are filtered out by `df`. If `df` isa `DataFrame` that has no columns and only values other than `AbstractVector` are passed then it is used to create a one element @@ -2249,7 +2249,7 @@ julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...; makeunique::Bool=false, copycols::Bool=true) if !(df isa DataFrame || (df isa SubDataFrame && getfield(df, :colindex) isa Index)) - throw(ArgumentError("insertcols! is only supported for DataFrame or" * + throw(ArgumentError("insertcols! is only supported for DataFrame or " * "SubDataFrame created with `:` as column selector")) end col_ind = Int(col isa SymbolOrString ? 
columnindex(df, col) : col) diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl index b8fc05c857..bf8df74922 100644 --- a/src/other/broadcasting.jl +++ b/src/other/broadcasting.jl @@ -115,7 +115,7 @@ function Base.dotview(df::SubDataFrame, ::Colon, cols::ColumnIndex) end if !(getfield(df, :colindex) isa Index) throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * - "columns of a parent data frame is disallowed")) + "columns of its parent data frame is disallowed")) end return LazyNewColDataFrame(df, Symbol(cols)) end diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 059cc50de0..fba6d51ec1 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -182,7 +182,7 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, id end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) if colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index && - val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val) + val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val) T = eltype(val) newcol = Tables.allocatecolumn(Union{T, Missing}, nrow(parent(sdf))) fill!(newcol, missing) From b721a460b14944ebea96ef941fc91288737a218c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 27 Jun 2021 10:02:39 +0200 Subject: [PATCH 06/29] changes after code review part 2 --- src/abstractdataframe/abstractdataframe.jl | 206 --------------------- src/dataframe/dataframe.jl | 206 +++++++++++++++++++++ src/other/broadcasting.jl | 9 +- src/subdataframe/subdataframe.jl | 13 +- 4 files changed, 220 insertions(+), 214 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 6e74b7e372..d50a346c51 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ 
b/src/abstractdataframe/abstractdataframe.jl @@ -2173,209 +2173,3 @@ Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) = Base.setindex!(::AbstractDataFrame, ::Any, ::Union{Symbol, Integer, AbstractString}) = throw(ArgumentError("syntax df[column] is not supported use df[!, column] instead")) - -# insertcols! - -""" - insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...; - makeunique::Bool=false, copycols::Bool=true) - -Insert a column into a data frame in place. Return the updated data frame. -If `col` is omitted it is set to `ncol(df)+1` -(the column is inserted as the last column). - -# Arguments -- `df` : the data frame to which we want to add columns -- `col` : a position at which we want to insert a column, passed as an integer - or a column name (a string or a `Symbol`); the column selected with `col` - and columns following it are shifted to the right in `df` after the operation -- `name` : the name of the new column -- `val` : an `AbstractVector` giving the contents of the new column or a value of any - type other than `AbstractArray` which will be repeated to fill a new vector; - As a particular rule a values stored in a `Ref` or a `0`-dimensional `AbstractArray` - are unwrapped and treated in the same way. -- `makeunique` : Defines what to do if `name` already exists in `df`; - if it is `false` an error will be thrown; if it is `true` a new unique name will - be generated by adding a suffix -- `copycols` : whether vectors passed as columns should be copied - -If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. - -If `df` is a `SubDataFrame` then it must have been created with `:` as column selector -(otherwise an error is thrown). In this case the `copycols` keyword argument -is ignored (i.e. the added column is always copied) and the parent data frame's -column is filled with `missing` in rows that are filtered out by `df`. 
- -If `df` isa `DataFrame` that has no columns and only values -other than `AbstractVector` are passed then it is used to create a one element -column. -If `df` isa `DataFrame` that has no columns and at least one `AbstractVector` is -passed then its length is used to determine the number of elements in all -created columns. -In all other cases the number of rows in all created columns must match -`nrow(df)`. -. - -# Examples -```jldoctest -julia> df = DataFrame(a=1:3) -3×1 DataFrame - Row │ a - │ Int64 -─────┼─────── - 1 │ 1 - 2 │ 2 - 3 │ 3 - -julia> insertcols!(df, 1, :b => 'a':'c') -3×2 DataFrame - Row │ b a - │ Char Int64 -─────┼───────────── - 1 │ a 1 - 2 │ b 2 - 3 │ c 3 - -julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) -3×4 DataFrame - Row │ b c c_1 a - │ Char Int64 Int64 Int64 -─────┼─────────────────────────── - 1 │ a 2 3 1 - 2 │ b 3 4 2 - 3 │ c 4 5 3 -``` -""" -function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...; - makeunique::Bool=false, copycols::Bool=true) - if !(df isa DataFrame || (df isa SubDataFrame && getfield(df, :colindex) isa Index)) - throw(ArgumentError("insertcols! is only supported for DataFrame or " * - "SubDataFrame created with `:` as column selector")) - end - col_ind = Int(col isa SymbolOrString ? 
columnindex(df, col) : col) - if !(0 < col_ind <= ncol(df) + 1) - throw(ArgumentError("attempt to insert a column to a data frame with " * - "$(ncol(df)) columns at index $col_ind")) - end - - if !makeunique - if !allunique(first.(name_cols)) - throw(ArgumentError("Names of columns to be inserted into a data frame " * - "must be unique when `makeunique=true`")) - end - for (n, _) in name_cols - if hasproperty(df, n) - throw(ArgumentError("Column $n is already present in the data frame " * - "which is not allowed when `makeunique=true`")) - end - end - end - - if ncol(df) == 0 && df isa DataFrame - target_row_count = -1 - else - target_row_count = nrow(df) - end - - for (n, v) in name_cols - if v isa AbstractVector - if target_row_count == -1 - target_row_count = length(v) - elseif length(v) != target_row_count - if target_row_count == nrow(df) - throw(DimensionMismatch("length of new column $n which is " * - "$(length(v)) must match the number " * - "of rows in data frame ($(nrow(df)))")) - else - throw(DimensionMismatch("all vectors passed to be inserted into " * - "a data frame must have the same length")) - end - end - elseif v isa AbstractArray && ndims(v) > 1 - throw(ArgumentError("adding AbstractArray other than AbstractVector as " * - "a column of a data frame is not allowed")) - end - end - if target_row_count == -1 - target_row_count = 1 - end - - for (name, item) in name_cols - if !(item isa AbstractVector) - if item isa Union{AbstractArray{<:Any, 0}, Ref} - x = item[] - item_new = fill!(Tables.allocatecolumn(typeof(x), target_row_count), x) - else - @assert !(item isa AbstractArray) - item_new = fill!(Tables.allocatecolumn(typeof(item), target_row_count), item) - end - elseif item isa AbstractRange - item_new = collect(item) - elseif copycols - item_new = copy(item) - else - item_new = item - end - - if df isa DataFrame - dfp = df - else - dfp = parent(df) - T = eltype(item_new) - newcol = Tables.allocatecolumn(Union{T, Missing}, nrow(dfp)) - 
fill!(newcol, missing) - newcol[rows(df)] = item_new - item_new = newcol - end - - firstindex(item_new) != 1 && _onebased_check_error() - - if ncol(dfp) == 0 - dfp[!, name] = item_new - else - if hasproperty(dfp, name) - @assert makeunique - k = 1 - while true - nn = Symbol("$(name)_$k") - if !hasproperty(dfp, nn) - name = nn - break - end - k += 1 - end - end - insert!(index(dfp), col_ind, name) - insert!(_columns(dfp), col_ind, item_new) - end - col_ind += 1 - end - return df -end - -insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString, <:Any}...; - makeunique::Bool=false, copycols::Bool=true) = - insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)..., - makeunique=makeunique, copycols=copycols) - -insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol, <:Any}...; - makeunique::Bool=false, copycols::Bool=true) = - insertcols!(df, ncol(df)+1, name_cols..., makeunique=makeunique, copycols=copycols) - -insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString, <:Any}...; - makeunique::Bool=false, copycols::Bool=true) = - insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)..., - makeunique=makeunique, copycols=copycols) - -function insertcols!(df::AbstractDataFrame, col::Int=ncol(df)+1; makeunique::Bool=false, name_cols...) - if !(0 < col <= ncol(df) + 1) - throw(ArgumentError("attempt to insert a column to a data frame with " * - "$(ncol(df)) columns at index $col")) - end - if !isempty(name_cols) - # an explicit error is thrown as keyword argument was supported in the past - throw(ArgumentError("inserting colums using a keyword argument is not supported, " * - "pass a Pair as a positional argument instead")) - end - return df -end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 044fb6c549..70f41597de 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -735,6 +735,212 @@ for T1 in (:AbstractVector, :Not, :Colon, :(typeof(!))), end end +# insertcols! 
+ +""" + insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...; + makeunique::Bool=false, copycols::Bool=true) + +Insert a column into a data frame in place. Return the updated data frame. +If `col` is omitted it is set to `ncol(df)+1` +(the column is inserted as the last column). + +# Arguments +- `df` : the data frame to which we want to add columns +- `col` : a position at which we want to insert a column, passed as an integer + or a column name (a string or a `Symbol`); the column selected with `col` + and columns following it are shifted to the right in `df` after the operation +- `name` : the name of the new column +- `val` : an `AbstractVector` giving the contents of the new column or a value of any + type other than `AbstractArray` which will be repeated to fill a new vector; + As a particular rule a values stored in a `Ref` or a `0`-dimensional `AbstractArray` + are unwrapped and treated in the same way. +- `makeunique` : Defines what to do if `name` already exists in `df`; + if it is `false` an error will be thrown; if it is `true` a new unique name will + be generated by adding a suffix +- `copycols` : whether vectors passed as columns should be copied + +If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. + +If `df` is a `SubDataFrame` then it must have been created with `:` as column selector +(otherwise an error is thrown). In this case the `copycols` keyword argument +is ignored (i.e. the added column is always copied) and the parent data frame's +column is filled with `missing` in rows that are filtered out by `df`. + +If `df` isa `DataFrame` that has no columns and only values +other than `AbstractVector` are passed then it is used to create a one element +column. +If `df` isa `DataFrame` that has no columns and at least one `AbstractVector` is +passed then its length is used to determine the number of elements in all +created columns. 
+In all other cases the number of rows in all created columns must match +`nrow(df)`. + + +# Examples +```jldoctest +julia> df = DataFrame(a=1:3) +3×1 DataFrame + Row │ a + │ Int64 +─────┼─────── + 1 │ 1 + 2 │ 2 + 3 │ 3 + +julia> insertcols!(df, 1, :b => 'a':'c') +3×2 DataFrame + Row │ b a + │ Char Int64 +─────┼───────────── + 1 │ a 1 + 2 │ b 2 + 3 │ c 3 + +julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) +3×4 DataFrame + Row │ b c c_1 a + │ Char Int64 Int64 Int64 +─────┼─────────────────────────── + 1 │ a 2 3 1 + 2 │ b 3 4 2 + 3 │ c 4 5 3 +``` +""" +function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...; + makeunique::Bool=false, copycols::Bool=true) + if !(df isa DataFrame || (df isa SubDataFrame && getfield(df, :colindex) isa Index)) + throw(ArgumentError("insertcols! is only supported for DataFrame or " * + "SubDataFrame created with `:` as column selector")) + end + col_ind = Int(col isa SymbolOrString ? columnindex(df, col) : col) + if !(0 < col_ind <= ncol(df) + 1) + throw(ArgumentError("attempt to insert a column to a data frame with " * + "$(ncol(df)) columns at index $col_ind")) + end + + if !makeunique + if !allunique(first.(name_cols)) + throw(ArgumentError("Names of columns to be inserted into a data frame " * + "must be unique when `makeunique=false`")) + end + for (n, _) in name_cols + if hasproperty(df, n) + throw(ArgumentError("Column $n is already present in the data frame " * + "which is not allowed when `makeunique=false`")) + end + end + end + + if ncol(df) == 0 && df isa DataFrame + target_row_count = -1 + else + target_row_count = nrow(df) + end + + for (n, v) in name_cols + if v isa AbstractVector + if target_row_count == -1 + target_row_count = length(v) + elseif length(v) != target_row_count + if target_row_count == nrow(df) + throw(DimensionMismatch("length of new column $n which is " * + "$(length(v)) must match the number " * + "of rows in data frame ($(nrow(df)))")) + else +
throw(DimensionMismatch("all vectors passed to be inserted into " * + "a data frame must have the same length")) + end + end + elseif v isa AbstractArray && ndims(v) > 1 + throw(ArgumentError("adding AbstractArray other than AbstractVector as " * + "a column of a data frame is not allowed")) + end + end + if target_row_count == -1 + target_row_count = 1 + end + + for (name, item) in name_cols + if !(item isa AbstractVector) + if item isa Union{AbstractArray{<:Any, 0}, Ref} + x = item[] + item_new = fill!(Tables.allocatecolumn(typeof(x), target_row_count), x) + else + @assert !(item isa AbstractArray) + item_new = fill!(Tables.allocatecolumn(typeof(item), target_row_count), item) + end + elseif item isa AbstractRange + item_new = collect(item) + elseif copycols && df isa DataFrame + item_new = copy(item) + else + item_new = item + end + + if df isa DataFrame + dfp = df + else + dfp = parent(df) + T = eltype(item_new) + newcol = Tables.allocatecolumn(Union{T, Missing}, nrow(dfp)) + fill!(newcol, missing) + newcol[rows(df)] = item_new + item_new = newcol + end + + firstindex(item_new) != 1 && _onebased_check_error() + + if ncol(dfp) == 0 + dfp[!, name] = item_new + else + if hasproperty(dfp, name) + @assert makeunique + k = 1 + while true + nn = Symbol("$(name)_$k") + if !hasproperty(dfp, nn) + name = nn + break + end + k += 1 + end + end + insert!(index(dfp), col_ind, name) + insert!(_columns(dfp), col_ind, item_new) + end + col_ind += 1 + end + return df +end + +insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString, <:Any}...; + makeunique::Bool=false, copycols::Bool=true) = + insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)..., + makeunique=makeunique, copycols=copycols) + +insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol, <:Any}...; + makeunique::Bool=false, copycols::Bool=true) = + insertcols!(df, ncol(df)+1, name_cols..., makeunique=makeunique, copycols=copycols) + +insertcols!(df::AbstractDataFrame,
name_cols::Pair{<:AbstractString, <:Any}...; + makeunique::Bool=false, copycols::Bool=true) = + insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)..., + makeunique=makeunique, copycols=copycols) + +function insertcols!(df::AbstractDataFrame, col::Int=ncol(df)+1; makeunique::Bool=false, name_cols...) + if !(0 < col <= ncol(df) + 1) + throw(ArgumentError("attempt to insert a column to a data frame with " * + "$(ncol(df)) columns at index $col")) + end + if !isempty(name_cols) + # an explicit error is thrown as keyword argument was supported in the past + throw(ArgumentError("inserting colums using a keyword argument is not supported, " * + "pass a Pair as a positional argument instead")) + end + return df +end + """ copy(df::DataFrame; copycols::Bool=true) diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl index bf8df74922..9fea19024f 100644 --- a/src/other/broadcasting.jl +++ b/src/other/broadcasting.jl @@ -158,6 +158,7 @@ end function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted{T}) where T df = lazydf.df @assert columnindex(df, lazydf.col) == 0 + df isa SubDataFrame && @assert getfield(df, :colindex) isa Index if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}} bc_tmp = Base.Broadcast.Broadcasted{T}(bc.f, bc.args, ()) v = Base.Broadcast.materialize(bc_tmp) @@ -166,12 +167,8 @@ function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcaste else col = Base.Broadcast.materialize(bc) end - if df isa DataFrame - return df[!, lazydf.col] = col - else - @assert df isa SubDataFrame && getfield(df, :colindex) isa Index - return df[:, lazydf.col] = col - end + + return df[!, lazydf.col] = col end function _copyto_helper!(dfcol::AbstractVector, bc::Base.Broadcast.Broadcasted, col::Int) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index fba6d51ec1..1270b53a02 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ 
-181,8 +181,17 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, id setindex!(sdf, val, idx[1], idx[2]) end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) - if colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index && - val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val) + if columnindex(sdf, colinds) == 0 + if !(colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index) + throw(ArgumentError("Creation of new columns in the SubDataFrame " * + "is only allowed when a column name is passed " * + "and the SubDataFrame was created using : " * + "as column selector")) + end + if !(val isa AbstractVector && nrow(sdf) == length(val)) + throw(ArgumentError("Assigned value must be a vector with length " * + "equal to number of rows in the SubDataFrame")) + end T = eltype(val) newcol = Tables.allocatecolumn(Union{T, Missing}, nrow(parent(sdf))) fill!(newcol, missing) From 412b89c8f4bfd6ba856b83f65af0ff0a02d7298f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 27 Jun 2021 13:26:09 +0200 Subject: [PATCH 07/29] docs update --- NEWS.md | 11 +++++++---- docs/src/lib/indexing.md | 26 +++++++++++++++++--------- src/dataframe/dataframe.jl | 1 + 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index bece7fc60a..564ee5955a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,10 +12,13 @@ by expanding a single column into multiple columns ([#2780](https://github.com/JuliaData/DataFrames.jl/pull/2780)) * if `sdf` is a `SubDataFrame` created with `:` as a column selector then - `insertcols!`, `sdf[:, col] = v`, and `sdf[:, col] .= v` where `col` is - a column not present in `sdf` is allowed and it creates a new column in - `parent(sdf)` with `missing` values stored in rows that are filtered-out - in `sdf`. 
+ `insertcols!`, `setindex!`, broadcasted assignment, `select!` and `transform!` + (also on `GroupedDataFrame` created from such a `SubDataFrame`) + works exactly the same like for parent `DataFrame` except that for + rows that are filtered-ou in `sdf`: + - new columns are created with `missing` values stored in these rows; + - assignment to existing columns retains values already stored in them in + these rows; ([XXXX](https://github.com/JuliaData/DataFrames.jl/pull/XXXX)) ## Bug fixes diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 4a55c5fae2..633b9c5400 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -119,6 +119,15 @@ In particular a description explicitly mentions if the assignment is *in-place*. Note that if a `setindex!` operation throws an error the target data frame may be partially changed so it is unsafe to use it afterwards (the column length correctness will be preserved). +!!! note + + The rules described below for `DataFrame` also apply to `SubDataFrame` if + it was created with `:` as column selector, except that for + rows that are filtered-ou in `sdf`: + - new columns are created with `missing` values stored in these rows, + - assignment to existing columns retains values already stored in them in + these rows. 
+ `setindex!` on `DataFrame`: * `df[row, col] = v` -> set value of `col` in row `row` to `v` in-place; * `df[CartesianIndex(row, col)] = v` -> the same as `df[row, col] = v`; @@ -138,15 +147,11 @@ so it is unsafe to use it afterwards (the column length correctness will be pres `v` must be an `AbstractMatrix` or an `AbstractDataFrame` (in the latter case column names must match); -`setindex!` on `SubDataFrame`: +`setindex!` on `SubDataFrame` (not created with `:` as column selector): * `sdf[row, col] = v` -> set value of `col` in row `row` to `v` in-place; * `sdf[CartesianIndex(row, col)] = v` -> the same as `sdf[row, col] = v`; * `sdf[row, cols] = v` -> the same as `dfr = df[row, cols]; dfr[:] = v` in-place; * `sdf[rows, col] = v` -> set rows `rows` of column `col`, in-place; `v` must be an abstract vector; - if `rows` is `:` and `col` is a `Symbol` or `AbstractString` that is not present in `df` and - `sdf` was created with `:` as column selector, then a new column is added to `df` holding - `v` in rows selected in `sdf` and `missing` in all rows present in `parent(sdf)` - but not present in `sdf`. * `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` when column names must match; @@ -185,6 +190,13 @@ In such an operation `AbstractDataFrame` is considered as two-dimensional and `D The rule above means that, similar to single-dimensional objects in Base (e.g. vectors), `DataFrameRow` is considered to be column-oriented. +!!! note + + The rules described below for `DataFrame` also apply to `SubDataFrame` if + it was created with `:` as column selector following the same approach + as for `setindex!`. In the list below when `sdf` is present it is assumed + to be created with column selector other than `:`. 
+ Additional rules: * in the `df[CartesianIndex(row, col)] .= v`, `df[row, col] .= v` syntaxes `v` is broadcasted into the contents of `df[row, col]` (this is consistent with Julia Base); @@ -193,10 +205,6 @@ Additional rules: `df` is performed in-place; if `rows` is `:` and `col` is `Symbol` or `AbstractString` and it is missing from `df` then a new column is allocated and added; the length of the column is always the value of `nrow(df)` before the assignment takes place; -* in the `sdf[:, col] .= v` if `sdf` was created with `:` as column selector - and `col` is a `Symbol` or `AbstractString` that is not present in `df` then a new column in `df` - is created and holds contents of `v` broadcasted onto rows selected in `sdf` - and `missing` in all rows present in `parent(sdf)` but not present in `sdf`. * in the `df[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; if `col` is `Symbol` or `AbstractString` and it is missing from `df` then a new column is allocated added; the length of the column is always the value of `nrow(df)` before the assignment takes place; diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 70f41597de..2be327fab7 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -736,6 +736,7 @@ for T1 in (:AbstractVector, :Not, :Colon, :(typeof(!))), end # insertcols! +# TODO: move to abstractdataframe/abstractdataframe.jl """ insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...; From b95f07ac6b869955c9e220cf7ad31805692837fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 27 Jun 2021 23:11:02 +0200 Subject: [PATCH 08/29] setindex! for ! 
and setproperty --- src/subdataframe/subdataframe.jl | 66 +++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 1270b53a02..d7bd652f64 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -176,10 +176,21 @@ Base.@propagate_inbounds Base.getindex(df::SubDataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex) = select(df, col_inds, copycols=false) +Base.setproperty!(df::SubDataFrame{T, Index}, col_ind::Symbol, v::AbstractVector) where {T} = + (df[!, col_ind] = v) +Base.setproperty!(df::SubDataFrame{T, Index}, col_ind::AbstractString, v::AbstractVector) where {T} = + (df[!, col_ind] = v) +Base.setproperty!(::SubDataFrame{T, Index}, col_ind::Symbol, v::Any) where {T} = + throw(ArgumentError("It is only allowed to pass a vector as a column of a SubDataFrame. " * + "Instead use `df[!, col_ind] .= v` if you want to use broadcasting.")) +Base.setproperty!(::SubDataFrame{T, Index}, col_ind::AbstractString, v::Any) where {T} = + throw(ArgumentError("It is only allowed to pass a vector as a column of a SubDataFrame.
" * "Instead use `df[!, col_ind] .= v` if you want to use broadcasting.")) Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, idx::CartesianIndex{2}) - setindex!(sdf, val, idx[1], idx[2]) + return setindex!(sdf, val, idx[1], idx[2]) end + Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) if columnindex(sdf, colinds) == 0 if !(colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index) @@ -202,13 +213,66 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, :: end return sdf end + +function Base.setindex!(sdf::SubDataFrame{D, Index}, v::AbstractVector, + ::typeof(!), col_ind::ColumnIndex) where {D} + if columnindex(sdf, col_ind) == 0 + sdf[:, col_ind] = v + else + pdf = parent(sdf) + old_col = pdf[!, col_ind] + T = eltype(old_col) + S = eltype(v) + newcol = Tables.allocatecolumn(Union{T, S}, length(old_col)) + newcol .= old_col + newcol[rows(sdf)] = v + pdf[!, col_ind] = newcol + end + return sdf +end + +for T in MULTICOLUMNINDEX_TUPLE + @eval function Base.setindex!(sdf::SubDataFrame{D, Index}, + new_df::AbstractDataFrame, + row_inds::typeof(!), + col_inds::$T) where {D} + idxs = index(sdf)[col_inds] + if view(_names(sdf), idxs) != _names(new_df) + throw(ArgumentError("Column names in source and target data frames do not match")) + end + for (j, col) in enumerate(idxs) + # we will make a copy on assignment later + sdf[!, col] = new_df[!, j] + end + return sdf + end + + @eval function Base.setindex!(sdf::SubDataFrame{D, Index}, + mx::AbstractMatrix, + row_inds::typeof(!), + col_inds::$T) where {D} + idxs = index(sdf)[col_inds] + if size(mx, 2) != length(idxs) + throw(DimensionMismatch("number of selected columns ($(length(idxs))) " * + "and number of columns in " * + "matrix ($(size(mx, 2))) do not match")) + end + for (j, col) in enumerate(idxs) + sdf[!, col] = view(mx, :, j) + end + return sdf + end +end + Base.@propagate_inbounds function
Base.setindex!(sdf::SubDataFrame, val::Any, ::typeof(!), colinds::Any) throw(ArgumentError("setting index of SubDataFrame using ! as row selector is not allowed")) end + Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, rowinds::Any, colinds::Any) parent(sdf)[rows(sdf)[rowinds], parentcols(index(sdf), colinds)] = val return sdf end + Base.@propagate_inbounds Base.setindex!(sdf::SubDataFrame, val::Any, rowinds::Bool, colinds::Any) = throw(ArgumentError("invalid row index of type Bool")) From dc0d2415c01d159b6be4f009a25e2a1fb524b6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 6 Aug 2021 17:37:32 +0200 Subject: [PATCH 09/29] fix NEWS.md --- NEWS.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1e574ebc39..b127e9c82d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,12 +2,13 @@ ## New functionalities -* in the `groupby` function the `sort` keyword argument now allows three values +* in the `groupby` function the `sort` keyword argument now allows three values: - `nothing` (the default) leaves the order of groups undefined and allows `groupby` to pick the fastest available grouping algorithm; - `true` sorts groups by key columns; - `false` creates groups in the order of their appearance in the parent data frame; + In previous versions, the `sort` keyword argument allowed only `Bool` values and `false` (which was the default) corresponded to the new behavior when `nothing` is passed. Therefore only the user visible change @@ -25,7 +26,7 @@ rows that are filtered-ou in `sdf`: - new columns are created with `missing` values stored in these rows; - assignment to existing columns retains values already stored in them in - these rows; + these rows. 
([2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)) # DataFrames.jl v1.2.2 Patch Release Notes From 9f02571b77e8fddf8fe60a806fb9e29d36c1b27e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 6 Aug 2021 17:38:39 +0200 Subject: [PATCH 10/29] another NEWS.md fix --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index b127e9c82d..09c7e526b9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,7 @@ when grouping columns implemented the `DataAPI.refpool` API (notably `PooledArray` and `CategoricalArray`) or when they contained only integers in a small range. + ([#2812](https://github.com/JuliaData/DataFrames.jl/pull/2812)) * if `sdf` is a `SubDataFrame` created with `:` as a column selector then `insertcols!`, `setindex!`, broadcasted assignment, `select!` and `transform!` @@ -27,6 +28,7 @@ - new columns are created with `missing` values stored in these rows; - assignment to existing columns retains values already stored in them in these rows. + ([2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)) # DataFrames.jl v1.2.2 Patch Release Notes From 121bb54965ecdd3530dfa027e5f54d1d0b72b25e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 6 Aug 2021 17:40:57 +0200 Subject: [PATCH 11/29] another small NEWS.md change --- NEWS.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 09c7e526b9..3027f382a6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,10 @@ ## New functionalities -* in the `groupby` function the `sort` keyword argument now allows three values: +* Improve `sort` keyword argument in `groupby` + ([#2812](https://github.com/JuliaData/DataFrames.jl/pull/2812)). 
+ + In the `groupby` function the `sort` keyword argument now allows three values: - `nothing` (the default) leaves the order of groups undefined and allows `groupby` to pick the fastest available grouping algorithm; - `true` sorts groups by key columns; @@ -19,8 +22,10 @@ (notably `PooledArray` and `CategoricalArray`) or when they contained only integers in a small range. - ([#2812](https://github.com/JuliaData/DataFrames.jl/pull/2812)) -* if `sdf` is a `SubDataFrame` created with `:` as a column selector then +* Allow adding columns to a `SubDataFrame` created with `:` as column selector + ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). + + If `sdf` is a `SubDataFrame` created with `:` as a column selector then `insertcols!`, `setindex!`, broadcasted assignment, `select!` and `transform!` (also on `GroupedDataFrame` created from such a `SubDataFrame`) works exactly the same like for parent `DataFrame` except that for @@ -29,8 +34,6 @@ - assignment to existing columns retains values already stored in them in these rows. - ([2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)) - # DataFrames.jl v1.2.2 Patch Release Notes ## Bug fixes From 1a83b61f92723141ed4dabaf00650dc28608c30d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 7 Aug 2021 11:46:18 +0200 Subject: [PATCH 12/29] finished tests for df[!, col] assignment and broadcasted assignment --- NEWS.md | 20 +- docs/src/lib/indexing.md | 49 +-- src/dataframe/dataframe.jl | 2 +- src/other/broadcasting.jl | 43 ++- src/subdataframe/subdataframe.jl | 82 ++--- test/index.jl | 496 ++++++++++++++++++++++++++++++- 6 files changed, 597 insertions(+), 95 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3027f382a6..9e0cb382f5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,17 +22,21 @@ (notably `PooledArray` and `CategoricalArray`) or when they contained only integers in a small range. 
-* Allow adding columns to a `SubDataFrame` created with `:` as column selector +* Allow adding new columns to a `SubDataFrame` created with `:` as column selector ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). If `sdf` is a `SubDataFrame` created with `:` as a column selector then - `insertcols!`, `setindex!`, broadcasted assignment, `select!` and `transform!` - (also on `GroupedDataFrame` created from such a `SubDataFrame`) - works exactly the same like for parent `DataFrame` except that for - rows that are filtered-ou in `sdf`: - - new columns are created with `missing` values stored in these rows; - - assignment to existing columns retains values already stored in them in - these rows. + `insertcols!`, `setindex!`, and broadcasted assignment allow for creation + of new columns with `missing` values stored in filtered-out rows. + +* Allow replacing existing columns in a `SubDataFrame` with `!` as row selector in assignment and broadcasted assignment + ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). + + Assignment to existing columns allocates a new column. Values already stored in filtered-out rows are retained. + +* TODO DESIGN: Allow `SubDataFrame` to be passed as an argument of `select!` and `transform!` + (also on `GroupedDataFrame` created from a `SubDataFrame`) + ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). # DataFrames.jl v1.2.2 Patch Release Notes diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 633b9c5400..befc5be800 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -119,15 +119,6 @@ In particular a description explicitly mentions if the assignment is *in-place*. Note that if a `setindex!` operation throws an error the target data frame may be partially changed so it is unsafe to use it afterwards (the column length correctness will be preserved). -!!!
note - - The rules described below for `DataFrame` also apply to `SubDataFrame` if - it was created with `:` as column selector, except that for - rows that are filtered-ou in `sdf`: - - new columns are created with `missing` values stored in these rows, - - assignment to existing columns retains values already stored in them in - these rows. - `setindex!` on `DataFrame`: * `df[row, col] = v` -> set value of `col` in row `row` to `v` in-place; * `df[CartesianIndex(row, col)] = v` -> the same as `df[row, col] = v`; @@ -154,8 +145,19 @@ so it is unsafe to use it afterwards (the column length correctness will be pres * `sdf[rows, col] = v` -> set rows `rows` of column `col`, in-place; `v` must be an abstract vector; * `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` when column names must match; - -Note that `sdf[!, col] = v`, `sdf[!, cols] = v` and `sdf.col = v` are not allowed as `sdf` can be only modified in-place. +* `sdf[!, col] = v` -> replaces `col` with `v` with copying; if `col` is present in `sdf` + then filtered-out rows in newly created vector are filled with + values already present in that column; + if `col` is not present in `sdf` then the operation is only allowed + if `sdf` was created with `:` as column selector.
In this case + filtered-out rows are filled with `missing`; + equivalent to `sdf.col = v` if `col` is a valid identifier; + operation is allowed if `length(v) == nrow(sdf)`; +* `sdf[!, cols] = v` -> replaces existing columns `cols` in data frame `sdf` with copying; + `v` must be an `AbstractMatrix` or an `AbstractDataFrame` + (in the latter case column names must match); + filtered-out rows in newly created vectors are filled with + values already present in respective columns; `setindex!` on `DataFrameRow`: * `dfr[col] = v` -> set value of `col` in row `row` to `v` in-place; @@ -190,13 +192,6 @@ In such an operation `AbstractDataFrame` is considered as two-dimensional and `D The rule above means that, similar to single-dimensional objects in Base (e.g. vectors), `DataFrameRow` is considered to be column-oriented. -!!! note - - The rules described below for `DataFrame` also apply to `SubDataFrame` if - it was created with `:` as column selector following the same approach - as for `setindex!`. In the list below when `sdf` is present it is assumed - to be created with column selector other than `:`. - Additional rules: * in the `df[CartesianIndex(row, col)] .= v`, `df[row, col] .= v` syntaxes `v` is broadcasted into the contents of `df[row, col]` (this is consistent with Julia Base); @@ -214,8 +209,22 @@ Additional rules: Starting from Julia 1.7 if `:col` is not present in `df` then a new column will be created in `df`. * in the `sdf[CartesianIndex(row, col)] .= v`, `sdf[row, col] .= v` and `sdf[row, cols] .= v` syntaxes the assignment to `sdf` is performed in-place; * in the `sdf[rows, col] .= v` and `sdf[rows, cols] .= v` syntaxes the assignment to `sdf` is performed in-place; -* `sdf.col .= v` syntax is performs an in-place assignment to an existing vector `sdf.col` and is deprecated; - in the future this operation will not be allowed. 
+ if `rows` is `:` and `col` is `Symbol` or `AbstractString` + and it is missing from `sdf` and `sdf` was created with `:` as column selector then a new column is allocated and added; + the length of the column is always the value of `nrow(sdf)` before the assignment takes place; + the filtered-out rows are filled with `missing`; +* in the `sdf[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; + the filtered-out rows are filled with values already present in `col`; + if `col` is `Symbol` or `AbstractString` and it is missing from `sdf` + that was created with `:` as column selector then a new column is allocated and added; + the length of the column is always the value of `nrow(sdf)` before the assignment takes place; + in this case the filtered-out rows are filled with `missing`; +* the `sdf[!, cols] .= v` syntax replaces existing columns `cols` in data frame `sdf` with freshly allocated vectors; + the filtered-out rows are filled with values already present in `cols`; +* `sdf.col .= v` syntax currently performs in-place assignment to an existing vector `sdf.col`; + this behavior is deprecated and a new column will be allocated in the future. + Starting from Julia 1.7 if `:col` is not present in `sdf` then a new column will be created in `sdf` + if it was created with `:` as a column selector. * `dfr.col .= v` syntax is allowed and performs in-place assignment to a value extracted by `dfr.col`. Note that `sdf[!, col] .= v` and `sdf[!, cols] .= v` syntaxes are not allowed as `sdf` can be only modified in-place.
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 2cf356db05..926bb0386c 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -815,7 +815,7 @@ julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) """ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...; makeunique::Bool=false, copycols::Bool=true) - if !(df isa DataFrame || (df isa SubDataFrame && getfield(df, :colindex) isa Index)) + if !is_column_adding_allowed(df) throw(ArgumentError("insertcols! is only supported for DataFrame or " * "SubDataFrame created with `:` as column selector")) end diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl index 9fea19024f..c1bec04e1b 100644 --- a/src/other/broadcasting.jl +++ b/src/other/broadcasting.jl @@ -100,45 +100,42 @@ Base.maybeview(df::AbstractDataFrame, idx::CartesianIndex{2}) = df[idx] Base.maybeview(df::AbstractDataFrame, row::Integer, col::ColumnIndex) = df[row, col] Base.maybeview(df::AbstractDataFrame, rows, cols) = view(df, rows, cols) -function Base.dotview(df::DataFrame, ::Colon, cols::ColumnIndex) +function Base.dotview(df::AbstractDataFrame, ::Colon, cols::ColumnIndex) haskey(index(df), cols) && return view(df, :, cols) if !(cols isa SymbolOrString) throw(ArgumentError("creating new columns using an integer index is disallowed")) end - return LazyNewColDataFrame(df, Symbol(cols)) -end - -function Base.dotview(df::SubDataFrame, ::Colon, cols::ColumnIndex) - haskey(index(df), cols) && return view(df, :, cols) - if !(cols isa SymbolOrString) - throw(ArgumentError("creating new columns using an integer index is disallowed")) - end - if !(getfield(df, :colindex) isa Index) + if !is_column_adding_allowed(df) throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * "columns of its parent data frame is disallowed")) end return LazyNewColDataFrame(df, Symbol(cols)) end -function Base.dotview(df::DataFrame, ::typeof(!), cols) 
+function Base.dotview(df::AbstractDataFrame, ::typeof(!), cols) if !(cols isa ColumnIndex) return ColReplaceDataFrame(df, index(df)[cols]) end - if !(cols isa SymbolOrString) && cols > ncol(df) + if cols isa SymbolOrString + if columnindex(df, cols) == 0 && !is_column_adding_allowed(df) + throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * + "columns of its parent data frame is disallowed")) + end + elseif !(1 <= cols <= ncol(df)) throw(ArgumentError("creating new columns using an integer index is disallowed")) end return LazyNewColDataFrame(df, cols isa AbstractString ? Symbol(cols) : cols) end -Base.dotview(df::SubDataFrame, ::typeof(!), idxs) = - throw(ArgumentError("broadcasting with ! row selector is not allowed for SubDataFrame")) - - # TODO: remove the deprecations when Julia 1.7 functionality is commonly used # by the community if isdefined(Base, :dotgetproperty) - function Base.dotgetproperty(df::DataFrame, col::SymbolOrString) + function Base.dotgetproperty(df::AbstractDataFrame, col::SymbolOrString) if columnindex(df, col) == 0 + if !is_column_adding_allowed(df) + throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * + "columns of its parent data frame is disallowed")) + end return LazyNewColDataFrame(df, Symbol(col)) else Base.depwarn("In the future this operation will allocate a new column " * @@ -146,19 +143,13 @@ if isdefined(Base, :dotgetproperty) return getproperty(df, col) end end - - function Base.dotgetproperty(df::SubDataFrame, col::SymbolOrString) - Base.depwarn("broadcasting getproperty is deprecated for SubDataFrame and " * - "will be disallowed in the future. Use `df[:, $(repr(col))] .= ... 
instead", - :dotgetproperty) - return getproperty(df, col) - end end function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted{T}) where T df = lazydf.df - @assert columnindex(df, lazydf.col) == 0 - df isa SubDataFrame && @assert getfield(df, :colindex) isa Index + if !haskey(index(df), lazydf.col) && df isa SubDataFrame && lazydf.col isa SymbolOrString + @assert is_column_adding_allowed(df) + end if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}} bc_tmp = Base.Broadcast.Broadcasted{T}(bc.f, bc.args, ()) v = Base.Broadcast.materialize(bc_tmp) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index d7bd652f64..e42c8ee65d 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -176,28 +176,15 @@ Base.@propagate_inbounds Base.getindex(df::SubDataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex) = select(df, col_inds, copycols=false) -Base.setproperty!(df::SubDataFrame{T, Index}, col_ind::Symbol, v::AbstractVector) where {T} = - (df[!, col_ind] = v) -Base.setproperty!(df::SubDataFrame{T, Index}, col_ind::AbstractString, v::AbstractVector) where {T} = - (df[!, col_ind] = v) -Base.setproperty!(::SubDataFrame{T, Index}, col_ind::Symbol, v::Any) where {T} = - throw(ArgumentError("It is only allowed to pass a vector as a column of a SubDataFrame. " * - "Instead use `df[!, col_ind] .= v` if you want to use broadcasting.")) -Base.setproperty!(::SubDataFrame{T, Index}, col_ind::AbstractString, v::Any) where {T} = - throw(ArgumentError("It is only allowed to pass a vector as a column of a SubDataFrame. 
" * - "Instead use `df[!, col_ind] .= v` if you want to use broadcasting.")) - Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, idx::CartesianIndex{2}) return setindex!(sdf, val, idx[1], idx[2]) end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) - if columnindex(sdf, colinds) == 0 - if !(colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index) - throw(ArgumentError("Creation of new columns in the SubDataFrame " * - "is only allowed when a column name is passed " * - "and the SubDataFrame was created using : " * - "as column selector")) + if colinds isa SymbolOrString && columnindex(sdf, colinds) == 0 + if !is_column_adding_allowed(sdf) + throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * + "columns of its parent data frame is disallowed")) end if !(val isa AbstractVector && nrow(sdf) == length(val)) throw(ArgumentError("Assigned value must be a vector with length " * @@ -214,16 +201,23 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, :: return sdf end -function Base.setindex!(sdf::SubDataFrame{D, Index}, v::AbstractVector, - ::typeof(!), col_ind::ColumnIndex) where {D} - if if columnindex(sdf, col_ind) == 0 +function Base.setindex!(sdf::SubDataFrame, v::AbstractVector, + ::typeof(!), col_ind::ColumnIndex) + if col_ind isa Union{Signed, Unsigned} && !(1 <= col_ind <= ncol(sdf)) + throw(ArgumentError("Cannot assign to non-existent column: $col_ind")) + end + if col_ind isa SymbolOrString && columnindex(sdf, col_ind) == 0 + if !is_column_adding_allowed(sdf) + throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * + "columns of its parent data frame is disallowed")) + end sdf[:, col_ind] = v else pdf = parent(sdf) old_col = pdf[!, col_ind] - T = typeof(old_col) - S = typeof(v) - newcol = Tables.allocatecolumn(Union{T, S}, length(old_col)) + T = eltype(old_col) + S = eltype(v) + newcol = 
Tables.allocatecolumn(promote_type(T, S), length(old_col)) newcol .= old_col newcol[rows(sdf)] = v pdf[!, col_ind] = newcol @@ -232,10 +226,10 @@ function Base.setindex!(sdf::SubDataFrame{D, Index}, v::AbstractVector, end for T in MULTICOLUMNINDEX_TUPLE - @eval function Base.setindex!(sdf::SubDataFrame{D, Index}, + @eval function Base.setindex!(sdf::SubDataFrame, new_df::AbstractDataFrame, row_inds::typeof(!), - col_inds::$T) where {D} + col_inds::$T) idxs = index(sdf)[col_inds] if view(_names(sdf), idxs) != _names(new_df) throw(ArgumentError("Column names in source and target data frames do not match")) @@ -247,10 +241,10 @@ for T in MULTICOLUMNINDEX_TUPLE return df end - @eval function Base.setindex!(sdf::SubDataFrame{D, Index}, + @eval function Base.setindex!(sdf::SubDataFrame, mx::AbstractMatrix, row_inds::typeof(!), - col_inds::$T) where {D} + col_inds::$T) idxs = index(sdf)[col_inds] if size(mx, 2) != length(idxs) throw(DimensionMismatch("number of selected columns ($(length(idxs))) " * @@ -264,10 +258,6 @@ for T in MULTICOLUMNINDEX_TUPLE end end -Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::typeof(!), colinds::Any) - throw(ArgumentError("setting index of SubDataFrame using ! as row selector is not allowed")) -end - Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, rowinds::Any, colinds::Any) parent(sdf)[rows(sdf)[rowinds], parentcols(index(sdf), colinds)] = val return sdf @@ -276,14 +266,16 @@ end Base.@propagate_inbounds Base.setindex!(sdf::SubDataFrame, val::Any, rowinds::Bool, colinds::Any) = throw(ArgumentError("invalid row index of type Bool")) -Base.setproperty!(::SubDataFrame, ::Symbol, ::Any) = - throw(ArgumentError("Replacing or adding of columns of a SubDataFrame is not allowed. 
" * - "Instead use `df[:, col_ind] = v` or `df[:, col_ind] .= v` " * - "to perform an in-place assignment.")) -Base.setproperty!(::SubDataFrame, ::AbstractString, ::Any) = - throw(ArgumentError("Replacing or adding of columns of a SubDataFrame is not allowed. " * - "Instead use `df[:, col_ind] = v` or `df[:, col_ind] .= v` " * - "to perform an in-place assignment.")) +Base.setproperty!(df::SubDataFrame, col_ind::Symbol, v::AbstractVector) = + (df[!, col_ind] = v) +Base.setproperty!(df::SubDataFrame, col_ind::AbstractString, v::AbstractVector) = + (df[!, col_ind] = v) +Base.setproperty!(::SubDataFrame, col_ind::Symbol, v::Any) = + throw(ArgumentError("It is only allowed to pass a vector as a column of a SubDataFrame. " * + "Instead use `df[!, col_ind] .= v` if you want to use broadcasting.")) +Base.setproperty!(::SubDataFrame, col_ind::AbstractString, v::Any) = + throw(ArgumentError("It is only allowed to pass a vector as a column of a SubDataFrame. " * + "Instead use `df[!, col_ind] .= v` if you want to use broadcasting.")) ############################################################################## ## @@ -305,3 +297,15 @@ function DataFrame(sdf::SubDataFrame; copycols::Bool=true) end Base.convert(::Type{DataFrame}, sdf::SubDataFrame) = DataFrame(sdf) + +# this function tests if it is allowed to add columns to passed sdf +# currently it is only allowed when sdf is created with : as column selector +# which results in using Index as its index (as opposed to other columns selectors +# which result in SubIndex) +function is_column_adding_allowed(df::AbstractDataFrame) + df isa DataFrame && return true + if df isa SubDataFrame + return getfield(df, :colindex) isa Index + end + throw(ArgumentError("Unsupported data frame type")) +end \ No newline at end of file diff --git a/test/index.jl b/test/index.jl index 5dbf1d5b17..e9a633ff45 100644 --- a/test/index.jl +++ b/test/index.jl @@ -1,7 +1,8 @@ module TestIndex -using Test, DataFrames +using Test, DataFrames, 
CategoricalArrays using DataFrames: Index, SubIndex, fuzzymatch +const ≅ = isequal @testset "Index indexing" begin i = Index() @@ -524,4 +525,497 @@ end @test crossjoin(dfv, dfv, makeunique=true) == DataFrame(a=1, a_1=1) end +@testset "mutating SubDataFrame with assignment to [!, col]" begin + df = DataFrame() + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(a=[]) + + df = DataFrame() + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test isempty(df) + + df = DataFrame() + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(a=[]) + + df = DataFrame() + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test isempty(df) + + df = DataFrame(x=Int[]) + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(x=Int[], a=[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test df == DataFrame(x=Int[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(x=Int[], a=[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, 
1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test df == DataFrame(x=Int[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=1:5) + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + + df = DataFrame(x=1:5) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test df ≅ DataFrame(x=1:5) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5) + + df = DataFrame(x=1:5) + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = 11:15 + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=11:15) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = fill(nothing, 5) + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=nothing, a=11:15) + + df = DataFrame(x=1:5) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = 11:15 + @test df ≅ DataFrame(x=1:5) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = fill(nothing, 5) + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5)) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], :] + sdf[!, :d] = [101, 103] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[101, missing, 103, missing, missing]) + sdf[!, :a] = [-1.0, -3.0] + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], + b=11:15, c=21:25, + 
d=[101, missing, 103, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], 1:end] + @test_throws ArgumentError sdf[!, :d] = [101, 103] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, :a] = [-1.0, -3.0] + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], + b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + sdf[!, :d] = [103, 102] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing]) + sdf[!, "e"] = [1003, 1002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, 0] = [10003, 10002] + @test_throws ArgumentError sdf[!, 6] = [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, 1] = ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, :b] = [-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, :x] = 1 + @test_throws ArgumentError sdf[!, :x] = [1] + @test_throws ArgumentError sdf[!, :a] = 1 + @test_throws DimensionMismatch sdf[!, :a] = [1] + sdf[!, :f] = categorical(["3", "2"]) + @test df.f isa CategoricalArray + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + tmpc = df.c + sdf[!, 3] = [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test 
eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + sdf[!, 3] = categorical(["33", "22"]) + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "33", 24, 25], + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:3] + @test_throws ArgumentError sdf[!, :d] = [103, 102] + @test_throws ArgumentError sdf[!, "e"] = [1003, 1002] + @test_throws ArgumentError sdf[!, 0] = [10003, 10002] + @test_throws ArgumentError sdf[!, 6] = [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, 1] = ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25) + sdf[!, :b] = [-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25) + @test_throws ArgumentError sdf[!, :x] = 1 + @test_throws ArgumentError sdf[!, :x] = [1] + @test_throws ArgumentError sdf[!, :a] = 1 + @test_throws DimensionMismatch sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :f] = categorical(["3", "2"]) + tmpc = df.c + sdf[!, 3] = [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25]) + sdf[!, 3] = categorical(["33", "22"]) + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "33", 24, 25]) + @test df.c[2] isa CategoricalValue + @test 
df.c[3] isa CategoricalValue + + sdf = @view df[[3, 2], 1:2] + @test_trows ArgumentError df[!, :c] = 1:2 +end + +@testset "mutating SubDataFrame with broadcasting assignment to [!, col]" begin + df = DataFrame() + sdf = @view df[:, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[!, :b] .= 1 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + @test_throws DimensionMismatch sdf[!, :c] .= 1:2 + @test_throws DimensionMismatch sdf[!, :a] .= 1:2 + sdf[!, :a] .= [1.0] + @test df.a isa Vector{Union{Missing, Float64}} + @test isempty(df.a) + sdf[!, :b] .= 1.0 + @test df.b isa Vector{Union{Missing, Float64}} + @test isempty(df.b) + + df = DataFrame() + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 1 + @test isempty(df) + + df = DataFrame() + sdf = @view df[1:0, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[!, :b] .= 1 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + @test_throws DimensionMismatch sdf[!, :c] .= 1:2 + @test_throws DimensionMismatch sdf[!, :a] .= 1:2 + sdf[!, :a] .= [1.0] + @test df.a isa Vector{Union{Missing, Float64}} + @test isempty(df.a) + sdf[!, :b] .= 1.0 + @test df.b isa Vector{Union{Missing, Float64}} + @test isempty(df.b) + + df = DataFrame() + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 1 + @test isempty(df) + + df = DataFrame(x=Int[]) + sdf = @view df[:, :] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[:, 1:end] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, :] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, 1:end] + sdf[!, :x] .= nothing + @test df.x isa 
Vector{Union{Nothing, Int}} + + df = DataFrame(x=1:5) + sdf = @view df[1:0, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + sdf[!, :x] .= Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + + df = DataFrame(x=1:5) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test df == DataFrame(x=1:5) + sdf[!, :x] .= Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5) + + df = DataFrame(x=1:5) + sdf = @view df[:, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=1) + sdf[!, :b] .= 2 + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=1, b=2) + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5), a=1, b=2) + + df = DataFrame(x=1:5) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 2 + @test df == DataFrame(x=1:5) + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5)) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], :] + sdf[!, :d] .= 101 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + sdf[!, :a] .= -1.0 + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] + sdf[!, :a] .= [-1.0, -2.0] + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + sdf[!, :e] .= 1:2 + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing], + e=[1, missing, 2, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], 1:end] + 
@test_throws ArgumentError sdf[!, :d] .= 101 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, :a] .= -1.0 + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], + b=11:15, c=21:25) + @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] + sdf[!, :a] .= [-1.0, -2.0] + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25) + @test_throws ArgumentError sdf[!, :e] .= 1:2 + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + sdf[!, :d] .= 102 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing]) + sdf[!, "e"] .= [1003, 1002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] + @test_throws ArgumentError sdf[!, 6] .= 10002 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, 1] .= "10002" + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, :b] .= [-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws DimensionMismatch sdf[!, :x] .= 1:3 + @test_throws DimensionMismatch sdf[!, :a] .= 1:3 + sdf[!, :f] .= categorical(["3", "2"]) + @test df.f isa CategoricalArray + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + tmpc = df.c + sdf[!, 3] .= [33, 22] + @test 
tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + sdf[!, 3] .= categorical(["33", "22"])[2] + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "22", 24, 25], + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:3] + @test_throws ArgumentError sdf[!, :d] .= [103, 102] + @test_throws ArgumentError sdf[!, "e"] .= [1003, 1002] + @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] + @test_throws ArgumentError sdf[!, 6] .= [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, 1] .= ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25) + sdf[!, :b] .= -12.0 + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 14, 15], + c=21:25) + @test_throws ArgumentError sdf[!, :x] .= 1 + @test_throws ArgumentError sdf[!, :x] .= [1] + @test_throws ArgumentError sdf[!, :f] .= categorical(["3", "2"]) + tmpc = df.c + sdf[!, 3] .= [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 14, 15], + c=[21, 22, 33, 24, 25]) + sdf[!, 3] .= categorical(["33", "22"])[2] + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 14, 15], + c=[21, "22", "22", 24, 25]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + sdf = @view 
df[[3, 2], 1:2] + @test_trows ArgumentError df[!, :c] .= 1:2 +end + +@testset "mutating SubDataFrame with assignment to [!, cols]" begin +end + +@testset "mutating SubDataFrame with broadcasting assignment to [!, cols]" begin +end + +@testset "mutating SubDataFrame with assignment to [:, col]" begin +end + +@testset "mutating SubDataFrame with broadcasting assignment to [:, col]" begin +end + +@testset "mutating SubDataFrame with assignment to [:, cols]" begin +end + +@testset "mutating SubDataFrame with broadcasting assignment to [:, cols]" begin +end + +@testset "mutating SubDataFrame with assignment to sdf.col" begin +end + +@testset "mutating SubDataFrame with broadcasting assignment to sdf.col" begin +end + end # module From 7d5a65b8d77e7d85c1362ecbea72642759c09326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 7 Aug 2021 15:48:45 +0200 Subject: [PATCH 13/29] some more tests --- src/subdataframe/subdataframe.jl | 3 +- test/index.jl | 496 +--------------------------- test/indexing.jl | 549 ++++++++++++++++++++++++++++++- 3 files changed, 550 insertions(+), 498 deletions(-) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index e42c8ee65d..d20f08f55b 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -235,10 +235,9 @@ for T in MULTICOLUMNINDEX_TUPLE throw(ArgumentError("Column names in source and target data frames do not match")) end for (j, col) in enumerate(idxs) - # we will make a copy on assignment later sdf[!, col] = new_df[!, j] end - return df + return sdf end @eval function Base.setindex!(sdf::SubDataFrame, diff --git a/test/index.jl b/test/index.jl index e9a633ff45..5dbf1d5b17 100644 --- a/test/index.jl +++ b/test/index.jl @@ -1,8 +1,7 @@ module TestIndex -using Test, DataFrames, CategoricalArrays +using Test, DataFrames using DataFrames: Index, SubIndex, fuzzymatch -const ≅ = isequal @testset "Index indexing" begin i = Index() @@ -525,497 +524,4 @@ 
end @test crossjoin(dfv, dfv, makeunique=true) == DataFrame(a=1, a_1=1) end -@testset "mutating SubDataFrame with assignment to [!, col]" begin - df = DataFrame() - sdf = @view df[:, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df == DataFrame(a=[]) - - df = DataFrame() - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test isempty(df) - - df = DataFrame() - sdf = @view df[1:0, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df == DataFrame(a=[]) - - df = DataFrame() - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test isempty(df) - - df = DataFrame(x=Int[]) - sdf = @view df[:, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df == DataFrame(x=Int[], a=[]) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test df == DataFrame(x=Int[]) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[1:0, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df == DataFrame(x=Int[], a=[]) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test df == DataFrame(x=Int[]) - @test_throws 
DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=1:5) - sdf = @view df[1:0, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=missing) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=1:5, a=missing) - - df = DataFrame(x=1:5) - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test df ≅ DataFrame(x=1:5) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=1:5) - - df = DataFrame(x=1:5) - sdf = @view df[:, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = 11:15 - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=11:15) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = fill(nothing, 5) - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=nothing, a=11:15) - - df = DataFrame(x=1:5) - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = 11:15 - @test df ≅ DataFrame(x=1:5) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = fill(nothing, 5) - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=fill(nothing, 5)) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[1, 3], :] - sdf[!, :d] = [101, 103] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[101, missing, 103, missing, missing]) - sdf[!, :a] = [-1.0, -3.0] - @test eltype(df.a) === Float64 - @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], - b=11:15, c=21:25, - d=[101, missing, 103, missing, missing]) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[1, 3], 1:end] - @test_throws ArgumentError sdf[!, 
:d] = [101, 103] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) - sdf[!, :a] = [-1.0, -3.0] - @test eltype(df.a) === Float64 - @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], - b=11:15, c=21:25) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], :] - sdf[!, :d] = [103, 102] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 103, missing, missing]) - sdf[!, "e"] = [1003, 1002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - @test_throws ArgumentError sdf[!, 0] = [10003, 10002] - @test_throws ArgumentError sdf[!, 6] = [10003, 10002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - sdf[!, 1] = ["10003", "10002"] - @test eltype(df.a) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=11:15, c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - sdf[!, :b] = [-13.0, -12.0] - @test eltype(df.b) === Float64 - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - @test_throws ArgumentError sdf[!, :x] = 1 - @test_throws ArgumentError sdf[!, :x] = [1] - @test_throws ArgumentError sdf[!, :a] = 1 - @test_throws DimensionMismatch sdf[!, :a] = [1] - sdf[!, :f] = categorical(["3", "2"]) - @test df.f isa CategoricalArray - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - tmpc = df.c - sdf[!, 3] = [33, 22] - @test tmpc == 21:25 - @test tmpc != df.c - @test eltype(df.c) === Int - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, 22, 33, 24, 25], - d=[missing, 102, 103, 
missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - sdf[!, 3] = categorical(["33", "22"]) - @test eltype(df.c) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, "22", "33", 24, 25], - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - @test df.c[2] isa CategoricalValue - @test df.c[3] isa CategoricalValue - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], 1:3] - @test_throws ArgumentError sdf[!, :d] = [103, 102] - @test_throws ArgumentError sdf[!, "e"] = [1003, 1002] - @test_throws ArgumentError sdf[!, 0] = [10003, 10002] - @test_throws ArgumentError sdf[!, 6] = [10003, 10002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) - sdf[!, 1] = ["10003", "10002"] - @test eltype(df.a) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=11:15, c=21:25) - sdf[!, :b] = [-13.0, -12.0] - @test eltype(df.b) === Float64 - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25) - @test_throws ArgumentError sdf[!, :x] = 1 - @test_throws ArgumentError sdf[!, :x] = [1] - @test_throws ArgumentError sdf[!, :a] = 1 - @test_throws DimensionMismatch sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :f] = categorical(["3", "2"]) - tmpc = df.c - sdf[!, 3] = [33, 22] - @test tmpc == 21:25 - @test tmpc != df.c - @test eltype(df.c) === Int - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, 22, 33, 24, 25]) - sdf[!, 3] = categorical(["33", "22"]) - @test eltype(df.c) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, "22", "33", 24, 25]) - @test df.c[2] isa CategoricalValue - @test df.c[3] isa CategoricalValue - - sdf = @view df[[3, 2], 1:2] - @test_trows ArgumentError df[!, :c] = 1:2 -end - -@testset "mutating SubDataFrame with broadcasting 
assignment to [!, col]" begin - df = DataFrame() - sdf = @view df[:, :] - sdf[!, :a] .= [1] - @test df.a isa Vector{Union{Missing, Int}} - @test isempty(df.a) - sdf[!, :b] .= 1 - @test df.b isa Vector{Union{Missing, Int}} - @test isempty(df.b) - @test_throws DimensionMismatch sdf[!, :c] .= 1:2 - @test_throws DimensionMismatch sdf[!, :a] .= 1:2 - sdf[!, :a] .= [1.0] - @test df.a isa Vector{Union{Missing, Float64}} - @test isempty(df.a) - sdf[!, :b] .= 1.0 - @test df.b isa Vector{Union{Missing, Float64}} - @test isempty(df.b) - - df = DataFrame() - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] .= [1] - @test_throws ArgumentError sdf[!, :b] .= 1 - @test isempty(df) - - df = DataFrame() - sdf = @view df[1:0, :] - sdf[!, :a] .= [1] - @test df.a isa Vector{Union{Missing, Int}} - @test isempty(df.a) - sdf[!, :b] .= 1 - @test df.b isa Vector{Union{Missing, Int}} - @test isempty(df.b) - @test_throws DimensionMismatch sdf[!, :c] .= 1:2 - @test_throws DimensionMismatch sdf[!, :a] .= 1:2 - sdf[!, :a] .= [1.0] - @test df.a isa Vector{Union{Missing, Float64}} - @test isempty(df.a) - sdf[!, :b] .= 1.0 - @test df.b isa Vector{Union{Missing, Float64}} - @test isempty(df.b) - - df = DataFrame() - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] .= [1] - @test_throws ArgumentError sdf[!, :b] .= 1 - @test isempty(df) - - df = DataFrame(x=Int[]) - sdf = @view df[:, :] - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[:, 1:end] - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[1:0, :] - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[1:0, 1:end] - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=1:5) - sdf = @view df[1:0, :] - sdf[!, :a] .= [1] - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, 
a=missing) - sdf[!, :x] .= Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=1:5, a=missing) - - df = DataFrame(x=1:5) - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] .= [1] - @test df == DataFrame(x=1:5) - sdf[!, :x] .= Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=1:5) - - df = DataFrame(x=1:5) - sdf = @view df[:, :] - sdf[!, :a] .= [1] - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=1) - sdf[!, :b] .= 2 - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=1, b=2) - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=fill(nothing, 5), a=1, b=2) - - df = DataFrame(x=1:5) - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] .= [1] - @test_throws ArgumentError sdf[!, :b] .= 2 - @test df == DataFrame(x=1:5) - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=fill(nothing, 5)) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[1, 3], :] - sdf[!, :d] .= 101 - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[101, missing, 101, missing, missing]) - sdf[!, :a] .= -1.0 - @test eltype(df.a) === Float64 - @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], - b=11:15, c=21:25, - d=[101, missing, 101, missing, missing]) - @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] - sdf[!, :a] .= [-1.0, -2.0] - @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], - b=11:15, c=21:25, - d=[101, missing, 101, missing, missing]) - sdf[!, :e] .= 1:2 - @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], - b=11:15, c=21:25, - d=[101, missing, 101, missing, missing], - e=[1, missing, 2, missing, missing]) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[1, 3], 1:end] - @test_throws ArgumentError sdf[!, :d] .= 101 - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) - sdf[!, :a] .= -1.0 - @test eltype(df.a) === Float64 - @test df ≅ DataFrame(a=[-1.0, 
2, -1.0, 4, 5], - b=11:15, c=21:25) - @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] - sdf[!, :a] .= [-1.0, -2.0] - @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], - b=11:15, c=21:25) - @test_throws ArgumentError sdf[!, :e] .= 1:2 - @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], - b=11:15, c=21:25) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], :] - sdf[!, :d] .= 102 - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 102, missing, missing]) - sdf[!, "e"] .= [1003, 1002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] - @test_throws ArgumentError sdf[!, 6] .= 10002 - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - sdf[!, 1] .= "10002" - @test eltype(df.a) === Any - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=11:15, c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - sdf[!, :b] .= [-13.0, -12.0] - @test eltype(df.b) === Float64 - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - @test_throws DimensionMismatch sdf[!, :x] .= 1:3 - @test_throws DimensionMismatch sdf[!, :a] .= 1:3 - sdf[!, :f] .= categorical(["3", "2"]) - @test df.f isa CategoricalArray - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - tmpc = df.c - sdf[!, 3] .= [33, 22] - @test tmpc == 21:25 - @test tmpc != df.c - @test eltype(df.c) === Int - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, 22, 33, 24, 25], - 
d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - sdf[!, 3] .= categorical(["33", "22"])[2] - @test eltype(df.c) === Any - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, "22", "22", 24, 25], - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - @test df.c[2] isa CategoricalValue - @test df.c[3] isa CategoricalValue - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], 1:3] - @test_throws ArgumentError sdf[!, :d] .= [103, 102] - @test_throws ArgumentError sdf[!, "e"] .= [1003, 1002] - @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] - @test_throws ArgumentError sdf[!, 6] .= [10003, 10002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) - sdf[!, 1] .= ["10003", "10002"] - @test eltype(df.a) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=11:15, c=21:25) - sdf[!, :b] .= -12.0 - @test eltype(df.b) === Float64 - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -12.0, 14, 15], - c=21:25) - @test_throws ArgumentError sdf[!, :x] .= 1 - @test_throws ArgumentError sdf[!, :x] .= [1] - @test_throws ArgumentError sdf[!, :f] .= categorical(["3", "2"]) - tmpc = df.c - sdf[!, 3] .= [33, 22] - @test tmpc == 21:25 - @test tmpc != df.c - @test eltype(df.c) === Int - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -12.0, 14, 15], - c=[21, 22, 33, 24, 25]) - sdf[!, 3] .= categorical(["33", "22"])[2] - @test eltype(df.c) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -12.0, 14, 15], - c=[21, "22", "22", 24, 25]) - @test df.c[2] isa CategoricalValue - @test df.c[3] isa CategoricalValue - - sdf = @view df[[3, 2], 1:2] - @test_trows ArgumentError df[!, :c] .= 1:2 -end - -@testset "mutating SubDataFrame with assignment to [!, cols]" begin -end - -@testset "mutating SubDataFrame 
with broadcasting assignment to [!, cols]" begin -end - -@testset "mutating SubDataFrame with assignment to [:, col]" begin -end - -@testset "mutating SubDataFrame with broadcasting assignment to [:, col]" begin -end - -@testset "mutating SubDataFrame with assignment to [:, cols]" begin -end - -@testset "mutating SubDataFrame with broadcasting assignment to [:, cols]" begin -end - -@testset "mutating SubDataFrame with assignment to sdf.col" begin -end - -@testset "mutating SubDataFrame with broadcasting assignment to sdf.col" begin -end - end # module diff --git a/test/indexing.jl b/test/indexing.jl index 41ce1fd798..64aa4b29a1 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -1,6 +1,8 @@ module TestIndexing -using Test, DataFrames +using Test, DataFrames, CategoricalArrays + +const ≅ = isequal @testset "getindex DataFrame" begin df = DataFrame(a=1:3, b=4:6, c=7:9) @@ -2020,4 +2022,549 @@ end end end +@testset "mutating SubDataFrame with assignment to [!, col]" begin + df = DataFrame() + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(a=[]) + + df = DataFrame() + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test isempty(df) + + df = DataFrame() + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(a=[]) + + df = DataFrame() + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test isempty(df) + + df = DataFrame(x=Int[]) + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(x=Int[], a=[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa 
Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test df == DataFrame(x=Int[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(x=Int[], a=[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test df == DataFrame(x=Int[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=1:5) + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + + df = DataFrame(x=1:5) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test df ≅ DataFrame(x=1:5) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5) + + df = DataFrame(x=1:5) + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = 11:15 + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=11:15) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = fill(nothing, 5) + @test df.x isa Vector{Union{Nothing, Int}} + @test 
df ≅ DataFrame(x=nothing, a=11:15) + + df = DataFrame(x=1:5) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = 11:15 + @test df ≅ DataFrame(x=1:5) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = fill(nothing, 5) + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5)) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], :] + sdf[!, :d] = [101, 103] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[101, missing, 103, missing, missing]) + sdf[!, :a] = [-1.0, -3.0] + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 103, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], 1:end] + @test_throws ArgumentError sdf[!, :d] = [101, 103] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, :a] = [-1.0, -3.0] + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], + b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + sdf[!, :d] = [103, 102] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing]) + sdf[!, "e"] = [1003, 1002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, 0] = [10003, 10002] + @test_throws ArgumentError sdf[!, 6] = [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, 1] = ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, :b] = [-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + 
b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, :x] = 1 + @test_throws ArgumentError sdf[!, :x] = [1] + @test_throws ArgumentError sdf[!, :a] = 1 + @test_throws DimensionMismatch sdf[!, :a] = [1] + sdf[!, :f] = categorical(["3", "2"]) + @test df.f isa CategoricalArray + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + tmpc = df.c + sdf[!, 3] = [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + sdf[!, 3] = categorical(["33", "22"]) + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "33", 24, 25], + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:3] + @test_throws ArgumentError sdf[!, :d] = [103, 102] + @test_throws ArgumentError sdf[!, "e"] = [1003, 1002] + @test_throws ArgumentError sdf[!, 0] = [10003, 10002] + @test_throws ArgumentError sdf[!, 6] = [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, 1] = ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25) + sdf[!, :b] = [-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25) + 
@test_throws ArgumentError sdf[!, :x] = 1 + @test_throws ArgumentError sdf[!, :x] = [1] + @test_throws ArgumentError sdf[!, :a] = 1 + @test_throws DimensionMismatch sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :f] = categorical(["3", "2"]) + tmpc = df.c + sdf[!, 3] = [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25]) + sdf[!, 3] = categorical(["33", "22"]) + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "33", 24, 25]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + sdf = @view df[[3, 2], 1:2] + @test_trows ArgumentError df[!, :c] = 1:2 +end + +@testset "mutating SubDataFrame with broadcasting assignment to [!, col]" begin + df = DataFrame() + sdf = @view df[:, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[!, :b] .= 1 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + @test_throws DimensionMismatch sdf[!, :c] .= 1:2 + @test_throws DimensionMismatch sdf[!, :a] .= 1:2 + sdf[!, :a] .= [1.0] + @test df.a isa Vector{Union{Missing, Float64}} + @test isempty(df.a) + sdf[!, :b] .= 1.0 + @test df.b isa Vector{Union{Missing, Float64}} + @test isempty(df.b) + + df = DataFrame() + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 1 + @test isempty(df) + + df = DataFrame() + sdf = @view df[1:0, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[!, :b] .= 1 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + @test_throws DimensionMismatch sdf[!, :c] .= 1:2 + @test_throws DimensionMismatch sdf[!, :a] .= 1:2 + sdf[!, :a] .= [1.0] + @test df.a isa Vector{Union{Missing, Float64}} + @test isempty(df.a) + sdf[!, :b] .= 1.0 + @test df.b 
isa Vector{Union{Missing, Float64}} + @test isempty(df.b) + + df = DataFrame() + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 1 + @test isempty(df) + + df = DataFrame(x=Int[]) + sdf = @view df[:, :] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[:, 1:end] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, :] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, 1:end] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=1:5) + sdf = @view df[1:0, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + sdf[!, :x] .= Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + + df = DataFrame(x=1:5) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test df == DataFrame(x=1:5) + sdf[!, :x] .= Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5) + + df = DataFrame(x=1:5) + sdf = @view df[:, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=1) + sdf[!, :b] .= 2 + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=1, b=2) + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5), a=1, b=2) + + df = DataFrame(x=1:5) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 2 + @test df == DataFrame(x=1:5) + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5)) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], :] + sdf[!, :d] .= 101 + @test df ≅ DataFrame(a=1:5, 
b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + sdf[!, :a] .= -1.0 + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] + sdf[!, :a] .= [-1.0, -2.0] + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + sdf[!, :e] .= 1:2 + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing], + e=[1, missing, 2, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], 1:end] + @test_throws ArgumentError sdf[!, :d] .= 101 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, :a] .= -1.0 + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], + b=11:15, c=21:25) + @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] + sdf[!, :a] .= [-1.0, -2.0] + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25) + @test_throws ArgumentError sdf[!, :e] .= 1:2 + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + sdf[!, :d] .= 102 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing]) + sdf[!, "e"] .= [1003, 1002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] + @test_throws ArgumentError sdf[!, 6] .= 10002 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, 1] .= "10002" + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, :b] .= 
[-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws DimensionMismatch sdf[!, :x] .= 1:3 + @test_throws DimensionMismatch sdf[!, :a] .= 1:3 + sdf[!, :f] .= categorical(["3", "2"]) + @test df.f isa CategoricalArray + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + tmpc = df.c + sdf[!, 3] .= [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + sdf[!, 3] .= categorical(["33", "22"])[2] + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "22", 24, 25], + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:3] + @test_throws ArgumentError sdf[!, :d] .= [103, 102] + @test_throws ArgumentError sdf[!, "e"] .= [1003, 1002] + @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] + @test_throws ArgumentError sdf[!, 6] .= [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, 1] .= ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25) + sdf[!, :b] .= -12.0 + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 
14, 15], + c=21:25) + @test_throws ArgumentError sdf[!, :x] .= 1 + @test_throws ArgumentError sdf[!, :x] .= [1] + @test_throws ArgumentError sdf[!, :f] .= categorical(["3", "2"]) + tmpc = df.c + sdf[!, 3] .= [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 14, 15], + c=[21, 22, 33, 24, 25]) + sdf[!, 3] .= categorical(["33", "22"])[2] + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 14, 15], + c=[21, "22", "22", 24, 25]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + sdf = @view df[[3, 2], 1:2] + @test_trows ArgumentError df[!, :c] .= 1:2 +end + +@testset "mutating SubDataFrame with assignment to [!, cols]" begin +(:AbstractVector, :Regex, :Not, :Between, :All, :Cols, :Colon) + + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, [:c, :b, :a]] = DataFrame(c=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=[21, "d", "c", 24, 25]) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + @test eltype(df.c) == Any + + @test_throws ArgumentError sdf[!, [:c, :b, :a]] = DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]") + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + + @test_throws ArgumentError sdf[!, cols] = DataFrame(b=[1.0, 2.0], a=[13, 12]) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[!, 
cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + @test_throws ArgumentError sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) + end +end + +@testset "mutating SubDataFrame with broadcasting assignment to [!, cols]" begin + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + + sdf[!, [:c, :b, :a]] .= DataFrame(c=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + +end + +@testset "mutating SubDataFrame with assignment to [:, col]" begin +end + +@testset "mutating SubDataFrame with broadcasting assignment to [:, col]" begin +end + +@testset "mutating SubDataFrame with assignment to [:, cols]" begin +end + +@testset "mutating SubDataFrame with broadcasting assignment to [:, cols]" begin +end + +@testset "mutating SubDataFrame with assignment to sdf.col" begin +end + +@testset "mutating SubDataFrame with broadcasting assignment to sdf.col" begin +end + end # module From f614e5833c7b1aa28c1511cb320561da0b051893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 7 Aug 2021 19:09:09 +0200 Subject: [PATCH 14/29] done tests of ! 
assignment and broadcasting assignment --- src/other/broadcasting.jl | 8 +- test/indexing.jl | 162 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 160 insertions(+), 10 deletions(-) diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl index c1bec04e1b..6c02e70539 100644 --- a/src/other/broadcasting.jl +++ b/src/other/broadcasting.jl @@ -88,13 +88,13 @@ end Base.axes(x::LazyNewColDataFrame) = (Base.OneTo(nrow(x.df)),) Base.ndims(::Type{<:LazyNewColDataFrame}) = 1 -struct ColReplaceDataFrame - df::DataFrame +struct ColReplaceDataFrame{T<:AbstractDataFrame} + df::T cols::Vector{Int} end Base.axes(x::ColReplaceDataFrame) = (axes(x.df, 1), Base.OneTo(length(x.cols))) -Base.ndims(::Type{ColReplaceDataFrame}) = 2 +Base.ndims(::Type{<:ColReplaceDataFrame}) = 2 Base.maybeview(df::AbstractDataFrame, idx::CartesianIndex{2}) = df[idx] Base.maybeview(df::AbstractDataFrame, row::Integer, col::ColumnIndex) = df[row, col] @@ -114,7 +114,7 @@ end function Base.dotview(df::AbstractDataFrame, ::typeof(!), cols) if !(cols isa ColumnIndex) - return ColReplaceDataFrame(df, index(df)[cols]) + return ColReplaceDataFrame(df, convert(Vector{Int}, index(df)[cols])) end if cols isa SymbolOrString if columnindex(df, cols) == 0 && !is_column_adding_allowed(df) diff --git a/test/indexing.jl b/test/indexing.jl index 64aa4b29a1..d8e2adfc95 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -2492,8 +2492,6 @@ end end @testset "mutating SubDataFrame with assignment to [!, cols]" begin -(:AbstractVector, :Regex, :Not, :Between, :All, :Cols, :Colon) - for sel in (:, 1:3) df = DataFrame(a=1:5, b=11:15, c=21:25) sdf = @view df[[3, 2], sel] @@ -2508,9 +2506,10 @@ end @test eltype(df.c) == Any @test_throws ArgumentError sdf[!, [:c, :b, :a]] = DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test_throws ArgumentError sdf[!, [:c, :b, :a]] = DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) end - for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]") + for sel in (:, 
1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) df = DataFrame(a=1:5, b=11:15, c=21:25) sdf = @view df[[3, 2], sel] tmpa = df.a @@ -2538,15 +2537,166 @@ end @test eltype(df.b) == Float64 @test_throws ArgumentError sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) end + + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, [:c, :b, :a]] = ["b" "d" "f"; "a" "c" "e"] + @test df == DataFrame(a=[1, "e", "f", 4, 5], + b=[11.0, "c", "d", 14.0, 15.0], + c=[21, "a", "b", 24, 25]) + @test tmpa !== df.a + @test eltype(df.a) == Any + @test eltype(df.b) == Any + @test eltype(df.c) == Any + + @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] = ones(2, 2) + @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] = ones(1, 3) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, cols] = [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], + b=[11.0, 4.0, 3.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Float64 + @test eltype(df.b) == Float64 + + @test_throws DimensionMismatch sdf[!, cols] = ones(1, 3) + @test_throws DimensionMismatch sdf[!, cols] = ones(3, 1) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[!, cols] = [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], + b=[11.0, 4.0, 3.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Float64 + @test eltype(df.b) == Float64 + @test_throws DimensionMismatch sdf[!, cols] = ones(1, 3) + @test_throws DimensionMismatch sdf[!, cols] = ones(3, 1) + end end @testset "mutating SubDataFrame with broadcasting assignment to [!, cols]" begin + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, 
[:c, :b, :a]] .= DataFrame(c=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=[21, "d", "c", 24, 25]) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + @test eltype(df.c) == Any - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], :] + sdf[!, [:c, :b, :a]] .= [100, 200] + @test df == DataFrame(a=[1, 200, 100, 4, 5], + b=[11.0, 200.0, 100.0, 14.0, 15.0], + c=[21, 200, 100, 24, 25]) - sdf[!, [:c, :b, :a]] .= DataFrame(c=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test_throws ArgumentError sdf[!, [:c, :b, :a]] .= DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test_throws ArgumentError sdf[!, [:c, :b, :a]] .= DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) + end + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + + sdf[!, cols] .= [100 200] + @test df == DataFrame(a=[1, 100, 100, 4, 5], + b=[11.0, 200.0, 200.0, 14.0, 15.0], + c=21:25) + + @test_throws ArgumentError sdf[!, cols] .= DataFrame(b=[1.0, 2.0], a=[13, 12]) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + + sdf[!, cols] .= 100 + @test df == DataFrame(a=[1, 100, 100, 4, 5], + b=[11.0, 100.0, 100.0, 14.0, 15.0], + c=21:25) + + @test_throws DimensionMismatch sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) + end + + for sel in (:, 
1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, [:c, :b, :a]] .= ["b" "d" "f"; "a" "c" "e"] + @test df == DataFrame(a=[1, "e", "f", 4, 5], + b=[11.0, "c", "d", 14.0, 15.0], + c=[21, "a", "b", 24, 25]) + @test tmpa !== df.a + @test eltype(df.a) == Any + @test eltype(df.b) == Any + @test eltype(df.c) == Any + + @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] .= ones(2, 2) + @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] .= ones(4, 3) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, cols] .= [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], + b=[11.0, 4.0, 3.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Float64 + @test eltype(df.b) == Float64 + + @test_throws DimensionMismatch sdf[!, cols] .= ones(4, 3) + @test_throws DimensionMismatch sdf[!, cols] .= ones(3, 4) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[!, cols] .= [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], + b=[11.0, 4.0, 3.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Float64 + @test eltype(df.b) == Float64 + @test_throws DimensionMismatch sdf[!, cols] .= ones(4, 3) + @test_throws DimensionMismatch sdf[!, cols] .= ones(3, 4) + end end @testset "mutating SubDataFrame with assignment to [:, col]" begin From e886dc10841da3896146b867c5a28f9961becd33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 8 Aug 2021 01:07:38 +0200 Subject: [PATCH 15/29] finished assignment, broadcasted assignment and insertcols! 
--- test/indexing.jl | 745 +---------------- test/runtests.jl | 1 + test/subdataframe_mutation.jl | 1425 +++++++++++++++++++++++++++++++++ 3 files changed, 1457 insertions(+), 714 deletions(-) create mode 100644 test/subdataframe_mutation.jl diff --git a/test/indexing.jl b/test/indexing.jl index d8e2adfc95..4670df2f54 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -1,8 +1,6 @@ module TestIndexing -using Test, DataFrames, CategoricalArrays - -const ≅ = isequal +using Test, DataFrames @testset "getindex DataFrame" begin df = DataFrame(a=1:3, b=4:6, c=7:9) @@ -1301,10 +1299,17 @@ end sdf[:, 1] = 10:12 @test df == DataFrame(a=10:12, b=4:6, c=7:9) @test_throws MethodError sdf[:, 1] = ["a", "b", "c"] - @test_throws ArgumentError sdf[:, :z] = ["a", "b", "c"] @test_throws BoundsError sdf[:, 4] = ["a", "b", "c"] @test_throws DimensionMismatch sdf[:, 1] = [1] @test_throws MethodError sdf[:, 1] = 1 + if DataFrames.is_column_adding_allowed(sdf) + sdf[:, :z] = ["a", "b", "c"] + @test df.z == ["a", "b", "c"] + @test eltype(df.z) == Union{String, Missing} + select!(df, 1:3) + else + @test_throws ArgumentError sdf[:, :z] = ["a", "b", "c"] + end end df = DataFrame(a=1:3, b=4:6, c=7:9) @@ -1315,7 +1320,13 @@ end sdf[:, names(sdf)[1]] = 10:12 @test df == DataFrame(a=10:12, b=4:6, c=7:9) @test_throws MethodError sdf[:, names(sdf)[1]] = ["a", "b", "c"] - @test_throws ArgumentError sdf[:, "z"] = ["a", "b", "c"] + if DataFrames.is_column_adding_allowed(sdf) + sdf[:, "z"] = ["a", "b", "c"] + @test df.z == ["a", "b", "c"] + select!(df, 1:3) + else + @test_throws ArgumentError sdf[:, "z"] = ["a", "b", "c"] + end end # `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; @@ -1348,14 +1359,13 @@ end @test df == df2 end - # Note that `sdf[!, col] = v` and `sdf.col = v` are not allowed as `sdf` can be only modified in-place. 
for (row_sel, col_sel) in [(:, :), (:, 1:3), (1:3, :), (1:3, 1:3), (1:3, ["a", "b", "c"])] df = DataFrame(a=1:3, b=4:6, c=7:9) sdf = view(df, row_sel, col_sel) - @test_throws ArgumentError sdf[!, 1] = [1, 2, 3] - @test_throws ArgumentError sdf[!, "a"] = [1, 2, 3] - @test_throws ArgumentError sdf[!, 1:3] = ones(Int, 3, 3) - @test_throws ArgumentError sdf[!, 1] = [1, 2, 3] + sdf[!, 1] = [11, 12, 13] + sdf[!, "b"] = [14, 15, 16] + sdf[!, 3:3] = ones(Int, 3, 1) + @test df == DataFrame(a=11:13, b=14:16, c=1) end end @@ -1583,8 +1593,10 @@ end @test df[:, 1:3] == DataFrame(reshape(1:12, 3, :), :auto)[:, 1:3] dfv = view(df, :, :) - @test_throws ArgumentError dfv[!, :] = DataFrame(reshape(1:12, 3, :), :auto) - @test_throws ArgumentError dfv[!, :] = reshape(1:12, 3, :) + dfv[!, :] = DataFrame(reshape(1:12, 3, :), :auto) + @test df == DataFrame(reshape(1:12, 3, :), :auto) + dfv[!, :] = reshape(1:12, 3, :) + @test df == DataFrame(reshape(1:12, 3, :), :auto) for rows in [:, 1:3], cols in [:, r"", Not(r"xx"), 1:4] df = DataFrame(ones(3, 4), :auto) @@ -1664,12 +1676,8 @@ end @test df[1:2, "a"] == [20, 30] df[:, "a"] = [30, 40] @test df[:, "a"] == [30, 40] - if df isa DataFrame - df[!, "a"] = [1, 2] - @test df[!, "a"] == [1, 2] - else - @test_throws ArgumentError df[!, "a"] = [1, 2] - end + df[!, "a"] = [1, 2] + @test df[!, "a"] == [1, 2] df[1, ["a", "b"]] = (a=1000, b=2000) @test copy(df[1, ["a", "b"]]) == (a=1000, b=2000) @@ -1845,8 +1853,11 @@ end @test_throws ArgumentError df.a = 1 @test_throws ArgumentError df."a" = 1 dfv = @view df[:, :] - @test_throws ArgumentError dfv.a = [1] - @test_throws ArgumentError dfv."a" = [1] + dfv.a = [5] + @test df == DataFrame(a=5) + dfv."a" = [6] + @test df == DataFrame(a=6) + @test eltype(df.a) === Int @test_throws ArgumentError dfv.a = 1 @test_throws ArgumentError dfv."a" = 1 end @@ -2022,699 +2033,5 @@ end end end -@testset "mutating SubDataFrame with assignment to [!, col]" begin - df = DataFrame() - sdf = @view df[:, :] - @test_throws 
ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df == DataFrame(a=[]) - - df = DataFrame() - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test isempty(df) - - df = DataFrame() - sdf = @view df[1:0, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df == DataFrame(a=[]) - - df = DataFrame() - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test isempty(df) - - df = DataFrame(x=Int[]) - sdf = @view df[:, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df == DataFrame(x=Int[], a=[]) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test df == DataFrame(x=Int[]) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[1:0, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df == DataFrame(x=Int[], a=[]) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test df == DataFrame(x=Int[]) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=1:5) - sdf = @view df[1:0, :] - @test_throws ArgumentError sdf[!, :a] = [1] - 
sdf[!, :a] = Int[] - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=missing) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=1:5, a=missing) - - df = DataFrame(x=1:5) - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = Int[] - @test df ≅ DataFrame(x=1:5) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=1:5) - - df = DataFrame(x=1:5) - sdf = @view df[:, :] - @test_throws ArgumentError sdf[!, :a] = [1] - sdf[!, :a] = 11:15 - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=11:15) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = fill(nothing, 5) - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=nothing, a=11:15) - - df = DataFrame(x=1:5) - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :a] = 11:15 - @test df ≅ DataFrame(x=1:5) - @test_throws DimensionMismatch sdf[!, :x] = ["a"] - sdf[!, :x] = fill(nothing, 5) - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=fill(nothing, 5)) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[1, 3], :] - sdf[!, :d] = [101, 103] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[101, missing, 103, missing, missing]) - sdf[!, :a] = [-1.0, -3.0] - @test eltype(df.a) === Float64 - @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], - b=11:15, c=21:25, - d=[101, missing, 103, missing, missing]) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[1, 3], 1:end] - @test_throws ArgumentError sdf[!, :d] = [101, 103] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) - sdf[!, :a] = [-1.0, -3.0] - @test eltype(df.a) === Float64 - @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], - b=11:15, c=21:25) - - df = 
DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], :] - sdf[!, :d] = [103, 102] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 103, missing, missing]) - sdf[!, "e"] = [1003, 1002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - @test_throws ArgumentError sdf[!, 0] = [10003, 10002] - @test_throws ArgumentError sdf[!, 6] = [10003, 10002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - sdf[!, 1] = ["10003", "10002"] - @test eltype(df.a) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=11:15, c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - sdf[!, :b] = [-13.0, -12.0] - @test eltype(df.b) === Float64 - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - @test_throws ArgumentError sdf[!, :x] = 1 - @test_throws ArgumentError sdf[!, :x] = [1] - @test_throws ArgumentError sdf[!, :a] = 1 - @test_throws DimensionMismatch sdf[!, :a] = [1] - sdf[!, :f] = categorical(["3", "2"]) - @test df.f isa CategoricalArray - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25, - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - tmpc = df.c - sdf[!, 3] = [33, 22] - @test tmpc == 21:25 - @test tmpc != df.c - @test eltype(df.c) === Int - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, 22, 33, 24, 25], - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - sdf[!, 3] = categorical(["33", "22"]) - @test eltype(df.c) === Any - @test df ≅ DataFrame(a=[1, 
"10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, "22", "33", 24, 25], - d=[missing, 102, 103, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - @test df.c[2] isa CategoricalValue - @test df.c[3] isa CategoricalValue - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], 1:3] - @test_throws ArgumentError sdf[!, :d] = [103, 102] - @test_throws ArgumentError sdf[!, "e"] = [1003, 1002] - @test_throws ArgumentError sdf[!, 0] = [10003, 10002] - @test_throws ArgumentError sdf[!, 6] = [10003, 10002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) - sdf[!, 1] = ["10003", "10002"] - @test eltype(df.a) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=11:15, c=21:25) - sdf[!, :b] = [-13.0, -12.0] - @test eltype(df.b) === Float64 - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25) - @test_throws ArgumentError sdf[!, :x] = 1 - @test_throws ArgumentError sdf[!, :x] = [1] - @test_throws ArgumentError sdf[!, :a] = 1 - @test_throws DimensionMismatch sdf[!, :a] = [1] - @test_throws ArgumentError sdf[!, :f] = categorical(["3", "2"]) - tmpc = df.c - sdf[!, 3] = [33, 22] - @test tmpc == 21:25 - @test tmpc != df.c - @test eltype(df.c) === Int - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, 22, 33, 24, 25]) - sdf[!, 3] = categorical(["33", "22"]) - @test eltype(df.c) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, "22", "33", 24, 25]) - @test df.c[2] isa CategoricalValue - @test df.c[3] isa CategoricalValue - - sdf = @view df[[3, 2], 1:2] - @test_trows ArgumentError df[!, :c] = 1:2 -end - -@testset "mutating SubDataFrame with broadcasting assignment to [!, col]" begin - df = DataFrame() - sdf = @view df[:, :] - sdf[!, :a] .= [1] - @test df.a isa Vector{Union{Missing, Int}} - @test isempty(df.a) - sdf[!, :b] .= 1 - @test df.b isa 
Vector{Union{Missing, Int}} - @test isempty(df.b) - @test_throws DimensionMismatch sdf[!, :c] .= 1:2 - @test_throws DimensionMismatch sdf[!, :a] .= 1:2 - sdf[!, :a] .= [1.0] - @test df.a isa Vector{Union{Missing, Float64}} - @test isempty(df.a) - sdf[!, :b] .= 1.0 - @test df.b isa Vector{Union{Missing, Float64}} - @test isempty(df.b) - - df = DataFrame() - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] .= [1] - @test_throws ArgumentError sdf[!, :b] .= 1 - @test isempty(df) - - df = DataFrame() - sdf = @view df[1:0, :] - sdf[!, :a] .= [1] - @test df.a isa Vector{Union{Missing, Int}} - @test isempty(df.a) - sdf[!, :b] .= 1 - @test df.b isa Vector{Union{Missing, Int}} - @test isempty(df.b) - @test_throws DimensionMismatch sdf[!, :c] .= 1:2 - @test_throws DimensionMismatch sdf[!, :a] .= 1:2 - sdf[!, :a] .= [1.0] - @test df.a isa Vector{Union{Missing, Float64}} - @test isempty(df.a) - sdf[!, :b] .= 1.0 - @test df.b isa Vector{Union{Missing, Float64}} - @test isempty(df.b) - - df = DataFrame() - sdf = @view df[1:0, 1:end] - @test_throws ArgumentError sdf[!, :a] .= [1] - @test_throws ArgumentError sdf[!, :b] .= 1 - @test isempty(df) - - df = DataFrame(x=Int[]) - sdf = @view df[:, :] - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[:, 1:end] - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[1:0, :] - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=Int[]) - sdf = @view df[1:0, 1:end] - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - - df = DataFrame(x=1:5) - sdf = @view df[1:0, :] - sdf[!, :a] .= [1] - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=missing) - sdf[!, :x] .= Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=1:5, a=missing) - - df = DataFrame(x=1:5) - sdf = @view df[1:0, 1:end] - @test_throws 
ArgumentError sdf[!, :a] .= [1] - @test df == DataFrame(x=1:5) - sdf[!, :x] .= Nothing[] - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=1:5) - - df = DataFrame(x=1:5) - sdf = @view df[:, :] - sdf[!, :a] .= [1] - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=1) - sdf[!, :b] .= 2 - @test df.a isa Vector{Union{Missing, Int}} - @test df ≅ DataFrame(x=1:5, a=1, b=2) - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=fill(nothing, 5), a=1, b=2) - - df = DataFrame(x=1:5) - sdf = @view df[:, 1:end] - @test_throws ArgumentError sdf[!, :a] .= [1] - @test_throws ArgumentError sdf[!, :b] .= 2 - @test df == DataFrame(x=1:5) - sdf[!, :x] .= nothing - @test df.x isa Vector{Union{Nothing, Int}} - @test df ≅ DataFrame(x=fill(nothing, 5)) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[1, 3], :] - sdf[!, :d] .= 101 - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[101, missing, 101, missing, missing]) - sdf[!, :a] .= -1.0 - @test eltype(df.a) === Float64 - @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], - b=11:15, c=21:25, - d=[101, missing, 101, missing, missing]) - @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] - sdf[!, :a] .= [-1.0, -2.0] - @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], - b=11:15, c=21:25, - d=[101, missing, 101, missing, missing]) - sdf[!, :e] .= 1:2 - @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], - b=11:15, c=21:25, - d=[101, missing, 101, missing, missing], - e=[1, missing, 2, missing, missing]) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[1, 3], 1:end] - @test_throws ArgumentError sdf[!, :d] .= 101 - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) - sdf[!, :a] .= -1.0 - @test eltype(df.a) === Float64 - @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], - b=11:15, c=21:25) - @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] - sdf[!, :a] .= [-1.0, -2.0] - @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], - b=11:15, 
c=21:25) - @test_throws ArgumentError sdf[!, :e] .= 1:2 - @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], - b=11:15, c=21:25) - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], :] - sdf[!, :d] .= 102 - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 102, missing, missing]) - sdf[!, "e"] .= [1003, 1002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] - @test_throws ArgumentError sdf[!, 6] .= 10002 - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - sdf[!, 1] .= "10002" - @test eltype(df.a) === Any - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=11:15, c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - sdf[!, :b] .= [-13.0, -12.0] - @test eltype(df.b) === Float64 - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing]) - @test_throws DimensionMismatch sdf[!, :x] .= 1:3 - @test_throws DimensionMismatch sdf[!, :a] .= 1:3 - sdf[!, :f] .= categorical(["3", "2"]) - @test df.f isa CategoricalArray - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=21:25, - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - tmpc = df.c - sdf[!, 3] .= [33, 22] - @test tmpc == 21:25 - @test tmpc != df.c - @test eltype(df.c) === Int - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, 22, 33, 24, 25], - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - sdf[!, 3] .= categorical(["33", "22"])[2] - @test eltype(df.c) 
=== Any - @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], - b=[11, -12.0, -13.0, 14, 15], - c=[21, "22", "22", 24, 25], - d=[missing, 102, 102, missing, missing], - e=[missing, 1002, 1003, missing, missing], - f=[missing, "2", "3", missing, missing]) - @test df.c[2] isa CategoricalValue - @test df.c[3] isa CategoricalValue - - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], 1:3] - @test_throws ArgumentError sdf[!, :d] .= [103, 102] - @test_throws ArgumentError sdf[!, "e"] .= [1003, 1002] - @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] - @test_throws ArgumentError sdf[!, 6] .= [10003, 10002] - @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) - sdf[!, 1] .= ["10003", "10002"] - @test eltype(df.a) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=11:15, c=21:25) - sdf[!, :b] .= -12.0 - @test eltype(df.b) === Float64 - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -12.0, 14, 15], - c=21:25) - @test_throws ArgumentError sdf[!, :x] .= 1 - @test_throws ArgumentError sdf[!, :x] .= [1] - @test_throws ArgumentError sdf[!, :f] .= categorical(["3", "2"]) - tmpc = df.c - sdf[!, 3] .= [33, 22] - @test tmpc == 21:25 - @test tmpc != df.c - @test eltype(df.c) === Int - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -12.0, 14, 15], - c=[21, 22, 33, 24, 25]) - sdf[!, 3] .= categorical(["33", "22"])[2] - @test eltype(df.c) === Any - @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], - b=[11, -12.0, -12.0, 14, 15], - c=[21, "22", "22", 24, 25]) - @test df.c[2] isa CategoricalValue - @test df.c[3] isa CategoricalValue - - sdf = @view df[[3, 2], 1:2] - @test_trows ArgumentError df[!, :c] .= 1:2 -end - -@testset "mutating SubDataFrame with assignment to [!, cols]" begin - for sel in (:, 1:3) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], sel] - tmpa = df.a - sdf[!, [:c, :b, :a]] = DataFrame(c=["c", "d"], b=[1.0, 2.0], a=[13, 12]) - @test df == DataFrame(a=[1, 12, 13, 4, 5], - 
b=[11.0, 2.0, 1.0, 14.0, 15.0], - c=[21, "d", "c", 24, 25]) - @test tmpa !== df.a - @test eltype(df.a) == Int - @test eltype(df.b) == Float64 - @test eltype(df.c) == Any - - @test_throws ArgumentError sdf[!, [:c, :b, :a]] = DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) - @test_throws ArgumentError sdf[!, [:c, :b, :a]] = DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) - end - - for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], sel] - tmpa = df.a - sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) - @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11.0, 2.0, 1.0, 14.0, 15.0], - c=21:25) - @test tmpa !== df.a - @test eltype(df.a) == Int - @test eltype(df.b) == Float64 - - @test_throws ArgumentError sdf[!, cols] = DataFrame(b=[1.0, 2.0], a=[13, 12]) - end - - for cols in (All(), :, Cols(:a, :b)) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], 1:2] - tmpa = df.a - sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) - @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11.0, 2.0, 1.0, 14.0, 15.0], - c=21:25) - @test tmpa !== df.a - @test eltype(df.a) == Int - @test eltype(df.b) == Float64 - @test_throws ArgumentError sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) - end - - for sel in (:, 1:3) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], sel] - tmpa = df.a - sdf[!, [:c, :b, :a]] = ["b" "d" "f"; "a" "c" "e"] - @test df == DataFrame(a=[1, "e", "f", 4, 5], - b=[11.0, "c", "d", 14.0, 15.0], - c=[21, "a", "b", 24, 25]) - @test tmpa !== df.a - @test eltype(df.a) == Any - @test eltype(df.b) == Any - @test eltype(df.c) == Any - - @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] = ones(2, 2) - @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] = ones(1, 3) - end - - for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], sel] - 
tmpa = df.a - sdf[!, cols] = [1.0 3.0; 2.0 4.0] - @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], - b=[11.0, 4.0, 3.0, 14.0, 15.0], - c=21:25) - @test tmpa !== df.a - @test eltype(df.a) == Float64 - @test eltype(df.b) == Float64 - - @test_throws DimensionMismatch sdf[!, cols] = ones(1, 3) - @test_throws DimensionMismatch sdf[!, cols] = ones(3, 1) - end - - for cols in (All(), :, Cols(:a, :b)) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], 1:2] - tmpa = df.a - sdf[!, cols] = [1.0 3.0; 2.0 4.0] - @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], - b=[11.0, 4.0, 3.0, 14.0, 15.0], - c=21:25) - @test tmpa !== df.a - @test eltype(df.a) == Float64 - @test eltype(df.b) == Float64 - @test_throws DimensionMismatch sdf[!, cols] = ones(1, 3) - @test_throws DimensionMismatch sdf[!, cols] = ones(3, 1) - end -end - -@testset "mutating SubDataFrame with broadcasting assignment to [!, cols]" begin - for sel in (:, 1:3) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], sel] - tmpa = df.a - sdf[!, [:c, :b, :a]] .= DataFrame(c=["c", "d"], b=[1.0, 2.0], a=[13, 12]) - @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11.0, 2.0, 1.0, 14.0, 15.0], - c=[21, "d", "c", 24, 25]) - @test tmpa !== df.a - @test eltype(df.a) == Int - @test eltype(df.b) == Float64 - @test eltype(df.c) == Any - - sdf[!, [:c, :b, :a]] .= [100, 200] - @test df == DataFrame(a=[1, 200, 100, 4, 5], - b=[11.0, 200.0, 100.0, 14.0, 15.0], - c=[21, 200, 100, 24, 25]) - - @test_throws ArgumentError sdf[!, [:c, :b, :a]] .= DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) - @test_throws ArgumentError sdf[!, [:c, :b, :a]] .= DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) - end - - for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], sel] - tmpa = df.a - sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) - @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11.0, 2.0, 1.0, 14.0, 15.0], - c=21:25) - 
@test tmpa !== df.a - @test eltype(df.a) == Int - @test eltype(df.b) == Float64 - - sdf[!, cols] .= [100 200] - @test df == DataFrame(a=[1, 100, 100, 4, 5], - b=[11.0, 200.0, 200.0, 14.0, 15.0], - c=21:25) - - @test_throws ArgumentError sdf[!, cols] .= DataFrame(b=[1.0, 2.0], a=[13, 12]) - end - - for cols in (All(), :, Cols(:a, :b)) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], 1:2] - tmpa = df.a - sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) - @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11.0, 2.0, 1.0, 14.0, 15.0], - c=21:25) - @test tmpa !== df.a - @test eltype(df.a) == Int - @test eltype(df.b) == Float64 - - sdf[!, cols] .= 100 - @test df == DataFrame(a=[1, 100, 100, 4, 5], - b=[11.0, 100.0, 100.0, 14.0, 15.0], - c=21:25) - - @test_throws DimensionMismatch sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) - end - - for sel in (:, 1:3) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], sel] - tmpa = df.a - sdf[!, [:c, :b, :a]] .= ["b" "d" "f"; "a" "c" "e"] - @test df == DataFrame(a=[1, "e", "f", 4, 5], - b=[11.0, "c", "d", 14.0, 15.0], - c=[21, "a", "b", 24, 25]) - @test tmpa !== df.a - @test eltype(df.a) == Any - @test eltype(df.b) == Any - @test eltype(df.c) == Any - - @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] .= ones(2, 2) - @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] .= ones(4, 3) - end - - for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = @view df[[3, 2], sel] - tmpa = df.a - sdf[!, cols] .= [1.0 3.0; 2.0 4.0] - @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], - b=[11.0, 4.0, 3.0, 14.0, 15.0], - c=21:25) - @test tmpa !== df.a - @test eltype(df.a) == Float64 - @test eltype(df.b) == Float64 - - @test_throws DimensionMismatch sdf[!, cols] .= ones(4, 3) - @test_throws DimensionMismatch sdf[!, cols] .= ones(3, 4) - end - - for cols in (All(), :, Cols(:a, :b)) - df = DataFrame(a=1:5, b=11:15, c=21:25) - sdf = 
@view df[[3, 2], 1:2] - tmpa = df.a - sdf[!, cols] .= [1.0 3.0; 2.0 4.0] - @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], - b=[11.0, 4.0, 3.0, 14.0, 15.0], - c=21:25) - @test tmpa !== df.a - @test eltype(df.a) == Float64 - @test eltype(df.b) == Float64 - @test_throws DimensionMismatch sdf[!, cols] .= ones(4, 3) - @test_throws DimensionMismatch sdf[!, cols] .= ones(3, 4) - end -end - -@testset "mutating SubDataFrame with assignment to [:, col]" begin -end - -@testset "mutating SubDataFrame with broadcasting assignment to [:, col]" begin -end - -@testset "mutating SubDataFrame with assignment to [:, cols]" begin -end - -@testset "mutating SubDataFrame with broadcasting assignment to [:, cols]" begin -end - -@testset "mutating SubDataFrame with assignment to sdf.col" begin -end - -@testset "mutating SubDataFrame with broadcasting assignment to sdf.col" begin -end end # module diff --git a/test/runtests.jl b/test/runtests.jl index b5ab3f02b1..4ade9e1b6f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -35,6 +35,7 @@ my_tests = ["utils.jl", "duplicates.jl", "show.jl", "subdataframe.jl", + "subdataframe_mutation.jl", "tables.jl", "tabletraits.jl", "indexing.jl", diff --git a/test/subdataframe_mutation.jl b/test/subdataframe_mutation.jl new file mode 100644 index 0000000000..65234535e5 --- /dev/null +++ b/test/subdataframe_mutation.jl @@ -0,0 +1,1425 @@ +module TestIndexing + +using Test, DataFrames, CategoricalArrays + +const ≅ = isequal + +@testset "mutating SubDataFrame with assignment to [!, col]" begin + df = DataFrame() + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(a=[]) + + df = DataFrame() + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test isempty(df) + + df = DataFrame() + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test 
df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(a=[]) + + df = DataFrame() + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test isempty(df) + + df = DataFrame(x=Int[]) + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(x=Int[], a=[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test df == DataFrame(x=Int[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(x=Int[], a=[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = Int[] + @test df == DataFrame(x=Int[]) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=1:5) + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + + df = DataFrame(x=1:5) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] 
= Int[] + @test df ≅ DataFrame(x=1:5) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5) + + df = DataFrame(x=1:5) + sdf = @view df[:, :] + @test_throws ArgumentError sdf[!, :a] = [1] + sdf[!, :a] = 11:15 + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=11:15) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = fill(nothing, 5) + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=nothing, a=11:15) + + df = DataFrame(x=1:5) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :a] = 11:15 + @test df ≅ DataFrame(x=1:5) + @test_throws DimensionMismatch sdf[!, :x] = ["a"] + sdf[!, :x] = fill(nothing, 5) + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5)) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], :] + sdf[!, :d] = [101, 103] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[101, missing, 103, missing, missing]) + sdf[!, :a] = [-1.0, -3.0] + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 103, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], 1:end] + @test_throws ArgumentError sdf[!, :d] = [101, 103] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, :a] = [-1.0, -3.0] + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -3.0, 4, 5], + b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + sdf[!, :d] = [103, 102] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing]) + sdf[!, "e"] = [1003, 1002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, 0] = [10003, 10002] + 
@test_throws ArgumentError sdf[!, 6] = [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, 1] = ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, :b] = [-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, :x] = 1 + @test_throws ArgumentError sdf[!, :x] = [1] + @test_throws ArgumentError sdf[!, :a] = 1 + @test_throws DimensionMismatch sdf[!, :a] = [1] + sdf[!, :f] = categorical(["3", "2"]) + @test df.f isa CategoricalArray + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + tmpc = df.c + sdf[!, 3] = [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + sdf[!, 3] = categorical(["33", "22"]) + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "33", 24, 25], + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:3] + @test_throws ArgumentError sdf[!, 
:d] = [103, 102] + @test_throws ArgumentError sdf[!, "e"] = [1003, 1002] + @test_throws ArgumentError sdf[!, 0] = [10003, 10002] + @test_throws ArgumentError sdf[!, 6] = [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, 1] = ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25) + sdf[!, :b] = [-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25) + @test_throws ArgumentError sdf[!, :x] = 1 + @test_throws ArgumentError sdf[!, :x] = [1] + @test_throws ArgumentError sdf[!, :a] = 1 + @test_throws DimensionMismatch sdf[!, :a] = [1] + @test_throws ArgumentError sdf[!, :f] = categorical(["3", "2"]) + tmpc = df.c + sdf[!, 3] = [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25]) + sdf[!, 3] = categorical(["33", "22"]) + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "33", 24, 25]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + sdf = @view df[[3, 2], 1:2] + @test_throws ArgumentError sdf[!, :c] = 1:2 +end + +@testset "mutating SubDataFrame with broadcasting assignment to [!, col]" begin + df = DataFrame() + sdf = @view df[:, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[!, :b] .= 1 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + @test_throws DimensionMismatch sdf[!, :c] .= 1:2 + @test_throws DimensionMismatch sdf[!, :a] .= 1:2 + sdf[!, :a] .= [1.0] + @test df.a isa Vector{Union{Missing, Float64}} + @test isempty(df.a) + sdf[!, :b] .= 1.0 + @test df.b isa Vector{Union{Missing, Float64}} + @test isempty(df.b) + + df = DataFrame() + sdf = @view df[:, 1:end] + @test_throws 
ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 1 + @test isempty(df) + + df = DataFrame() + sdf = @view df[1:0, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[!, :b] .= 1 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + @test_throws DimensionMismatch sdf[!, :c] .= 1:2 + @test_throws DimensionMismatch sdf[!, :a] .= 1:2 + sdf[!, :a] .= [1.0] + @test df.a isa Vector{Union{Missing, Float64}} + @test isempty(df.a) + sdf[!, :b] .= 1.0 + @test df.b isa Vector{Union{Missing, Float64}} + @test isempty(df.b) + + df = DataFrame() + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 1 + @test isempty(df) + + df = DataFrame(x=Int[]) + sdf = @view df[:, :] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[:, 1:end] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, :] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, 1:end] + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + + df = DataFrame(x=1:5) + sdf = @view df[1:0, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + sdf[!, :x] .= Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + + df = DataFrame(x=1:5) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test df == DataFrame(x=1:5) + sdf[!, :x] .= Nothing[] + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=1:5) + + df = DataFrame(x=1:5) + sdf = @view df[:, :] + sdf[!, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=1) + sdf[!, :b] .= 2 + @test df.b isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5,
a=1, b=2) + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5), a=1, b=2) + + df = DataFrame(x=1:5) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[!, :a] .= [1] + @test_throws ArgumentError sdf[!, :b] .= 2 + @test df == DataFrame(x=1:5) + sdf[!, :x] .= nothing + @test df.x isa Vector{Union{Nothing, Int}} + @test df ≅ DataFrame(x=fill(nothing, 5)) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], :] + sdf[!, :d] .= 101 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + sdf[!, :a] .= -1.0 + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] + sdf[!, :a] .= [-1.0, -2.0] + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + sdf[!, :e] .= 1:2 + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing], + e=[1, missing, 2, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], 1:end] + @test_throws ArgumentError sdf[!, :d] .= 101 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, :a] .= -1.0 + @test eltype(df.a) === Float64 + @test df ≅ DataFrame(a=[-1.0, 2, -1.0, 4, 5], + b=11:15, c=21:25) + @test_throws DimensionMismatch sdf[!, :a] .= [-1.0, -2.0, -3.0] + sdf[!, :a] .= [-1.0, -2.0] + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25) + @test_throws ArgumentError sdf[!, :e] .= 1:2 + @test df ≅ DataFrame(a=[-1.0, 2, -2.0, 4, 5], + b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + sdf[!, :d] .= 102 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing]) + sdf[!, "e"] .= [1003, 1002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, 
missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] + @test_throws ArgumentError sdf[!, 6] .= 10002 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, 1] .= "10002" + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[!, :b] .= [-13.0, -12.0] + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws DimensionMismatch sdf[!, :x] .= 1:3 + @test_throws DimensionMismatch sdf[!, :a] .= 1:3 + sdf[!, :f] .= categorical(["3", "2"]) + @test df.f isa CategoricalArray + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + tmpc = df.c + sdf[!, 3] .= [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + sdf[!, 3] .= categorical(["33", "22"])[2] + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10002", 4, 5], + b=[11, -12.0, -13.0, 14, 15], + c=[21, "22", "22", 24, 25], + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:3] + 
@test_throws ArgumentError sdf[!, :d] .= [103, 102] + @test_throws ArgumentError sdf[!, "e"] .= [1003, 1002] + @test_throws ArgumentError sdf[!, 0] .= [10003, 10002] + @test_throws ArgumentError sdf[!, 6] .= [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[!, 1] .= ["10003", "10002"] + @test eltype(df.a) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=11:15, c=21:25) + sdf[!, :b] .= -12.0 + @test eltype(df.b) === Float64 + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 14, 15], + c=21:25) + @test_throws ArgumentError sdf[!, :x] .= 1 + @test_throws ArgumentError sdf[!, :x] .= [1] + @test_throws ArgumentError sdf[!, :f] .= categorical(["3", "2"]) + tmpc = df.c + sdf[!, 3] .= [33, 22] + @test tmpc == 21:25 + @test tmpc != df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 14, 15], + c=[21, 22, 33, 24, 25]) + sdf[!, 3] .= categorical(["33", "22"])[2] + @test eltype(df.c) === Any + @test df ≅ DataFrame(a=[1, "10002", "10003", 4, 5], + b=[11, -12.0, -12.0, 14, 15], + c=[21, "22", "22", 24, 25]) + @test df.c[2] isa CategoricalValue + @test df.c[3] isa CategoricalValue + + sdf = @view df[[3, 2], 1:2] + @test_throws ArgumentError sdf[!, :c] .= 1:2 +end + +@testset "mutating SubDataFrame with assignment to [!, cols]" begin + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, [:c, :b, :a]] = DataFrame(c=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=[21, "d", "c", 24, 25]) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + @test eltype(df.c) == Any + + @test_throws ArgumentError sdf[!, [:c, :b, :a]] = DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test_throws ArgumentError sdf[!, [:c, :b, :a]] = DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) + end + + for sel in (:, 1:3), 
cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + + @test_throws ArgumentError sdf[!, cols] = DataFrame(b=[1.0, 2.0], a=[13, 12]) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + @test_throws ArgumentError sdf[!, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) + end + + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, [:c, :b, :a]] = ["b" "d" "f"; "a" "c" "e"] + @test df == DataFrame(a=[1, "e", "f", 4, 5], + b=[11.0, "c", "d", 14.0, 15.0], + c=[21, "a", "b", 24, 25]) + @test tmpa !== df.a + @test eltype(df.a) == Any + @test eltype(df.b) == Any + @test eltype(df.c) == Any + + @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] = ones(2, 2) + @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] = ones(1, 3) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, cols] = [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], + b=[11.0, 4.0, 3.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Float64 + @test eltype(df.b) == Float64 + + @test_throws DimensionMismatch sdf[!, cols] = ones(1, 3) + @test_throws DimensionMismatch sdf[!, cols] = ones(3, 1) + end + + for cols in (All(), :, Cols(:a, :b)) + df = 
DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[!, cols] = [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], + b=[11.0, 4.0, 3.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Float64 + @test eltype(df.b) == Float64 + @test_throws DimensionMismatch sdf[!, cols] = ones(1, 3) + @test_throws DimensionMismatch sdf[!, cols] = ones(3, 1) + end +end + +@testset "mutating SubDataFrame with broadcasting assignment to [!, cols]" begin + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, [:c, :b, :a]] .= DataFrame(c=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=[21, "d", "c", 24, 25]) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + @test eltype(df.c) == Any + + sdf[!, [:c, :b, :a]] .= [100, 200] + @test df == DataFrame(a=[1, 200, 100, 4, 5], + b=[11.0, 200.0, 100.0, 14.0, 15.0], + c=[21, 200, 100, 24, 25]) + + @test_throws ArgumentError sdf[!, [:c, :b, :a]] .= DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test_throws ArgumentError sdf[!, [:c, :b, :a]] .= DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + + sdf[!, cols] .= [100 200] + @test df == DataFrame(a=[1, 100, 100, 4, 5], + b=[11.0, 200.0, 200.0, 14.0, 15.0], + c=21:25) + + @test_throws ArgumentError sdf[!, cols] .= DataFrame(b=[1.0, 2.0], a=[13, 12]) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 
1:2] + tmpa = df.a + sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Float64 + + sdf[!, cols] .= 100 + @test df == DataFrame(a=[1, 100, 100, 4, 5], + b=[11.0, 100.0, 100.0, 14.0, 15.0], + c=21:25) + + @test_throws DimensionMismatch sdf[!, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) + end + + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, [:c, :b, :a]] .= ["b" "d" "f"; "a" "c" "e"] + @test df == DataFrame(a=[1, "e", "f", 4, 5], + b=[11.0, "c", "d", 14.0, 15.0], + c=[21, "a", "b", 24, 25]) + @test tmpa !== df.a + @test eltype(df.a) == Any + @test eltype(df.b) == Any + @test eltype(df.c) == Any + + @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] .= ones(2, 2) + @test_throws DimensionMismatch sdf[!, [:c, :b, :a]] .= ones(4, 3) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[!, cols] .= [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], + b=[11.0, 4.0, 3.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Float64 + @test eltype(df.b) == Float64 + + @test_throws DimensionMismatch sdf[!, cols] .= ones(4, 3) + @test_throws DimensionMismatch sdf[!, cols] .= ones(3, 4) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[!, cols] .= [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2.0, 1.0, 4, 5], + b=[11.0, 4.0, 3.0, 14.0, 15.0], + c=21:25) + @test tmpa !== df.a + @test eltype(df.a) == Float64 + @test eltype(df.b) == Float64 + @test_throws DimensionMismatch sdf[!, cols] .= ones(4, 3) + @test_throws DimensionMismatch sdf[!, cols] .= ones(3, 4) + end +end + +@testset "mutating 
SubDataFrame with assignment to [:, col]" begin + df = DataFrame() + sdf = @view df[:, :] + @test_throws ArgumentError sdf[:, :a] = [1] + sdf[:, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(a=[]) + + df = DataFrame() + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[:, :a] = [1] + @test_throws ArgumentError sdf[:, :a] = Int[] + @test isempty(df) + + df = DataFrame() + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[:, :a] = [1] + sdf[:, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(a=[]) + + df = DataFrame() + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[:, :a] = [1] + @test_throws ArgumentError sdf[:, :a] = Int[] + @test isempty(df) + + df = DataFrame(x=Int[]) + sdf = @view df[:, :] + @test_throws ArgumentError sdf[:, :a] = [1] + sdf[:, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(x=Int[], a=[]) + @test_throws DimensionMismatch sdf[:, :x] = ["a"] + sdf[:, :x] = Nothing[] + @test df.x isa Vector{Int} + + df = DataFrame(x=Int[]) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[:, :a] = [1] + @test_throws ArgumentError sdf[:, :a] = Int[] + @test df == DataFrame(x=Int[]) + @test_throws DimensionMismatch sdf[:, :x] = ["a"] + sdf[:, :x] = Nothing[] + @test df.x isa Vector{Int} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, :] + @test_throws ArgumentError sdf[:, :a] = [1] + sdf[:, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df == DataFrame(x=Int[], a=[]) + @test_throws DimensionMismatch sdf[:, :x] = ["a"] + sdf[:, :x] = Nothing[] + @test df.x isa Vector{Int} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[:, :a] = [1] + @test_throws ArgumentError sdf[:, :a] = Int[] + @test df == DataFrame(x=Int[]) + @test_throws DimensionMismatch sdf[:, :x] = ["a"] + sdf[:, :x] = Nothing[] + @test df.x isa Vector{Int} + + df = DataFrame(x=1:5) + sdf = @view df[1:0, :] + 
@test_throws ArgumentError sdf[:, :a] = [1] + sdf[:, :a] = Int[] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + @test_throws DimensionMismatch sdf[:, :x] = ["a"] + sdf[:, :x] = Nothing[] + @test df.x isa Vector{Int} + @test df ≅ DataFrame(x=1:5, a=missing) + + df = DataFrame(x=1:5) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[:, :a] = [1] + @test_throws ArgumentError sdf[:, :a] = Int[] + @test df ≅ DataFrame(x=1:5) + @test_throws DimensionMismatch sdf[:, :x] = ["a"] + sdf[:, :x] = Nothing[] + @test df.x isa Vector{Int} + @test df ≅ DataFrame(x=1:5) + + df = DataFrame(x=1:5) + sdf = @view df[:, :] + @test_throws ArgumentError sdf[:, :a] = [1] + sdf[:, :a] = 11:15 + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=11:15) + @test_throws DimensionMismatch sdf[:, :x] = ["a"] + @test_throws MethodError sdf[:, :x] = fill(nothing, 5) + @test df ≅ DataFrame(x=1:5, a=11:15) + + df = DataFrame(x=1:5) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[:, :a] = [1] + @test_throws ArgumentError sdf[:, :a] = 11:15 + @test df ≅ DataFrame(x=1:5) + @test_throws DimensionMismatch sdf[:, :x] = ["a"] + @test_throws MethodError sdf[:, :x] = fill(nothing, 5) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], :] + sdf[:, :d] = [101, 103] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[101, missing, 103, missing, missing]) + sdf[:, :a] = [-1.0, -3.0] + @test eltype(df.a) === Int + @test df ≅ DataFrame(a=[-1, 2, -3, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 103, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], 1:end] + @test_throws ArgumentError sdf[:, :d] = [101, 103] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[:, :a] = [-1.0, -3.0] + @test eltype(df.a) === Int + @test df ≅ DataFrame(a=[-1, 2, -3, 4, 5], + b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + sdf[:, :d] = [103, 102] + 
@test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing]) + sdf[:, "e"] = [1003, 1002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws BoundsError sdf[:, 0] = [10003, 10002] + @test_throws BoundsError sdf[:, 6] = [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws MethodError sdf[:, 1] = ["10003", "10002"] + sdf[:, 1] = [10003, 10002] + @test eltype(df.a) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=11:15, c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[:, :b] = [-13.0, -12.0] + @test eltype(df.b) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -13, 14, 15], + c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[:, :x] = 1 + @test_throws ArgumentError sdf[:, :x] = [1] + @test_throws MethodError sdf[:, :a] = 1 + @test_throws DimensionMismatch sdf[:, :a] = [1] + sdf[:, :f] = categorical(["3", "2"]) + @test df.f isa CategoricalArray + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -13, 14, 15], + c=21:25, + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + tmpc = df.c + sdf[:, 3] = [33, 22] + @test tmpc === df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -13, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 103, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + @test_throws MethodError sdf[:, 3] = categorical(["33", "22"]) + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -13, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 103, 
missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:3] + @test_throws ArgumentError sdf[:, :d] = [103, 102] + @test_throws ArgumentError sdf[:, "e"] = [1003, 1002] + @test_throws BoundsError sdf[:, 0] = [10003, 10002] + @test_throws BoundsError sdf[:, 6] = [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + @test_throws MethodError sdf[:, 1] = ["10003", "10002"] + sdf[:, 1] = [10003, 10002] + @test eltype(df.a) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=11:15, c=21:25) + sdf[:, :b] = [-13.0, -12.0] + @test eltype(df.b) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -13, 14, 15], + c=21:25) + @test_throws ArgumentError sdf[:, :x] = 1 + @test_throws ArgumentError sdf[:, :x] = [1] + @test_throws MethodError sdf[:, :a] = 1 + @test_throws DimensionMismatch sdf[:, :a] = [1] + @test_throws ArgumentError sdf[:, :f] = categorical(["3", "2"]) + tmpc = df.c + sdf[:, 3] = [33, 22] + @test tmpc === df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -13, 14, 15], + c=[21, 22, 33, 24, 25]) + @test_throws MethodError sdf[:, 3] = categorical(["33", "22"]) + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -13, 14, 15], + c=[21, 22, 33, 24, 25]) + + sdf = @view df[[3, 2], 1:2] + @test_throws ArgumentError sdf[:, :c] = 1:2 +end + +@testset "mutating SubDataFrame with broadcasting assignment to [:, col]" begin + df = DataFrame() + sdf = @view df[:, :] + sdf[:, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[:, :b] .= 1 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + @test_throws DimensionMismatch sdf[:, :c] .= 1:2 + @test_throws DimensionMismatch sdf[:, :a] .= 1:2 + sdf[:, :a] .= [1.0] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[:, :b] .= 1.0 + @test df.b isa
Vector{Union{Missing, Int}} + @test isempty(df.b) + + df = DataFrame() + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[:, :a] .= [1] + @test_throws ArgumentError sdf[:, :b] .= 1 + @test isempty(df) + + df = DataFrame() + sdf = @view df[1:0, :] + sdf[:, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[:, :b] .= 1 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + @test_throws DimensionMismatch sdf[:, :c] .= 1:2 + @test_throws DimensionMismatch sdf[:, :a] .= 1:2 + sdf[:, :a] .= [1.0] + @test df.a isa Vector{Union{Missing, Int}} + @test isempty(df.a) + sdf[:, :b] .= 1.0 + @test df.b isa Vector{Union{Missing, Int}} + @test isempty(df.b) + + df = DataFrame() + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[:, :a] .= [1] + @test_throws ArgumentError sdf[:, :b] .= 1 + @test isempty(df) + + df = DataFrame(x=Int[]) + sdf = @view df[:, :] + @test_throws MethodError sdf[:, :x] .= nothing + @test df.x isa Vector{Int} + + df = DataFrame(x=Int[]) + sdf = @view df[:, 1:end] + @test_throws MethodError sdf[:, :x] .= nothing + @test df.x isa Vector{Int} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, :] + @test_throws MethodError sdf[:, :x] .= nothing + @test df.x isa Vector{Int} + + df = DataFrame(x=Int[]) + sdf = @view df[1:0, 1:end] + @test_throws MethodError sdf[:, :x] .= nothing + @test df.x isa Vector{Int} + + df = DataFrame(x=1:5) + sdf = @view df[1:0, :] + sdf[:, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=missing) + sdf[:, :x] .= Nothing[] + @test df.x isa Vector{Int} + @test df ≅ DataFrame(x=1:5, a=missing) + + df = DataFrame(x=1:5) + sdf = @view df[1:0, 1:end] + @test_throws ArgumentError sdf[:, :a] .= [1] + @test df == DataFrame(x=1:5) + sdf[:, :x] .= Nothing[] + @test df.x isa Vector{Int} + @test df ≅ DataFrame(x=1:5) + + df = DataFrame(x=1:5) + sdf = @view df[:, :] + sdf[:, :a] .= [1] + @test df.a isa Vector{Union{Missing, Int}} + @test df ≅ 
DataFrame(x=1:5, a=1) + sdf[:, :b] .= 2 + @test df.b isa Vector{Union{Missing, Int}} + @test df ≅ DataFrame(x=1:5, a=1, b=2) + @test_throws MethodError sdf[:, :x] .= nothing + @test df.x isa Vector{Int} + sdf[:, :x] .= 1 + @test df ≅ DataFrame(x=fill(1, 5), a=1, b=2) + + df = DataFrame(x=1:5) + sdf = @view df[:, 1:end] + @test_throws ArgumentError sdf[:, :a] .= [1] + @test_throws ArgumentError sdf[:, :b] .= 2 + @test df == DataFrame(x=1:5) + @test_throws MethodError sdf[:, :x] .= nothing + @test df.x isa Vector{Int} + sdf[:, :x] .= 1 + @test df ≅ DataFrame(x=fill(1, 5)) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], :] + sdf[:, :d] .= 101 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + sdf[:, :a] .= -1.0 + @test eltype(df.a) === Int + @test df ≅ DataFrame(a=[-1, 2, -1, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + @test_throws DimensionMismatch sdf[:, :a] .= [-1.0, -2.0, -3.0] + sdf[:, :a] .= [-1.0, -2.0] + @test df ≅ DataFrame(a=[-1, 2, -2, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing]) + sdf[:, :e] .= 1:2 + @test df ≅ DataFrame(a=[-1, 2, -2, 4, 5], + b=11:15, c=21:25, + d=[101, missing, 101, missing, missing], + e=[1, missing, 2, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[1, 3], 1:end] + @test_throws ArgumentError sdf[:, :d] .= 101 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + sdf[:, :a] .= -1.0 + @test eltype(df.a) === Int + @test df ≅ DataFrame(a=[-1, 2, -1, 4, 5], + b=11:15, c=21:25) + @test_throws DimensionMismatch sdf[:, :a] .= [-1.0, -2.0, -3.0] + sdf[:, :a] .= [-1.0, -2.0] + @test df ≅ DataFrame(a=[-1, 2, -2, 4, 5], + b=11:15, c=21:25) + @test_throws ArgumentError sdf[:, :e] .= 1:2 + @test df ≅ DataFrame(a=[-1, 2, -2, 4, 5], + b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + sdf[:, :d] .= 102 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102,
102, missing, missing]) + sdf[:, "e"] .= [1003, 1002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws ArgumentError sdf[:, 0] .= [10003, 10002] + @test_throws ArgumentError sdf[:, 6] .= 10002 + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws MethodError sdf[:, 1] .= "10002" + sdf[:, 1] .= 10002 + @test eltype(df.a) === Int + @test df ≅ DataFrame(a=[1, 10002, 10002, 4, 5], + b=11:15, c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + sdf[:, :b] .= [-13.0, -12.0] + @test eltype(df.b) === Int + @test df ≅ DataFrame(a=[1, 10002, 10002, 4, 5], + b=[11, -12, -13, 14, 15], + c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing]) + @test_throws DimensionMismatch sdf[:, :x] .= 1:3 + @test_throws DimensionMismatch sdf[:, :a] .= 1:3 + sdf[:, :f] .= categorical(["3", "2"]) + @test df.f isa CategoricalArray + @test df ≅ DataFrame(a=[1, 10002, 10002, 4, 5], + b=[11, -12, -13, 14, 15], + c=21:25, + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + tmpc = df.c + sdf[:, 3] .= [33, 22] + @test tmpc === df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, 10002, 10002, 4, 5], + b=[11, -12, -13, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + @test_throws MethodError sdf[:, 3] .= categorical(["33", "22"])[2] + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, 10002, 10002, 4, 5], + b=[11, -12, -13, 14, 15], + c=[21, 22, 33, 24, 25], + d=[missing, 102, 102, missing, missing], + e=[missing, 1002, 1003, missing, missing], + f=[missing, "2", "3", missing, missing]) + + df = 
DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:3] + @test_throws ArgumentError sdf[:, :d] .= [103, 102] + @test_throws ArgumentError sdf[:, "e"] .= [1003, 1002] + @test_throws ArgumentError sdf[:, 0] .= [10003, 10002] + @test_throws ArgumentError sdf[:, 6] .= [10003, 10002] + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25) + @test_throws MethodError sdf[:, 1] .= ["10003", "10002"] + sdf[:, 1] .= [10003, 10002] + @test eltype(df.a) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=11:15, c=21:25) + sdf[:, :b] .= -12.0 + @test eltype(df.b) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -12, 14, 15], + c=21:25) + @test_throws ArgumentError sdf[:, :x] .= 1 + @test_throws ArgumentError sdf[:, :x] .= [1] + @test_throws ArgumentError sdf[:, :f] .= categorical(["3", "2"]) + tmpc = df.c + sdf[:, 3] .= [33, 22] + @test tmpc === df.c + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -12, 14, 15], + c=[21, 22, 33, 24, 25]) + @test_throws MethodError sdf[:, 3] .= categorical(["33", "22"])[2] + @test eltype(df.c) === Int + @test df ≅ DataFrame(a=[1, 10002, 10003, 4, 5], + b=[11, -12, -12, 14, 15], + c=[21, 22, 33, 24, 25]) + + sdf = @view df[[3, 2], 1:2] + @test_throws ArgumentError sdf[:, :c] .= 1:2 +end + +@testset "mutating SubDataFrame with assignment to [:, cols]" begin + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[:, [:c, :b, :a]] = DataFrame(c=[5, 6], b=[1.0, 2.0], a=[13, 12]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=[21, 6, 5, 24, 25]) + @test tmpa === df.a + + @test_throws ArgumentError sdf[:, [:c, :b, :a]] = DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test_throws ArgumentError sdf[:, [:c, :b, :a]] = DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = 
DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[:, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11, 2, 1, 14, 15], + c=21:25) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + + @test_throws ArgumentError sdf[:, cols] = DataFrame(b=[1.0, 2.0], a=[13, 12]) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[:, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11, 2, 1, 14, 15], + c=21:25) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + @test_throws ArgumentError sdf[:, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) + end + + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[:, [:c, :b, :a]] = [100 101 102; 103 104 105] + @test df == DataFrame(a=[1, 105, 102, 4, 5], + b=[11.0, 104, 101, 14.0, 15.0], + c=[21, 103, 100, 24, 25]) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + @test eltype(df.c) == Int + + @test_throws DimensionMismatch sdf[:, [:c, :b, :a]] = ones(2, 2) + @test_throws DimensionMismatch sdf[:, [:c, :b, :a]] = ones(1, 3) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[:, cols] = [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2, 1, 4, 5], + b=[11, 4, 3, 14, 15], + c=21:25) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + + @test_throws DimensionMismatch sdf[:, cols] = ones(1, 3) + @test_throws DimensionMismatch sdf[:, cols] = ones(3, 1) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[:, cols] = [1.0 3.0; 2.0 4.0] + @test df == 
DataFrame(a=[1, 2, 1, 4, 5], + b=[11, 4, 3, 14, 15], + c=21:25) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + @test_throws DimensionMismatch sdf[:, cols] = ones(1, 3) + @test_throws DimensionMismatch sdf[:, cols] = ones(3, 1) + end +end + +@testset "mutating SubDataFrame with broadcasting assignment to [:, cols]" begin + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[:, [:c, :b, :a]] .= DataFrame(c=[100, 101], b=[1.0, 2.0], a=[13, 12]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11, 2, 1, 14, 15], + c=[21, 101, 100, 24, 25]) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + @test eltype(df.c) == Int + + sdf[:, [:c, :b, :a]] .= [100, 200] + @test df == DataFrame(a=[1, 200, 100, 4, 5], + b=[11, 200, 100, 14, 15], + c=[21, 200, 100, 24, 25]) + + @test_throws ArgumentError sdf[:, [:c, :b, :a]] .= DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) + @test_throws ArgumentError sdf[:, [:c, :b, :a]] .= DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[:, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11, 2, 1, 14, 15], + c=21:25) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + + sdf[:, cols] .= [100 200] + @test df == DataFrame(a=[1, 100, 100, 4, 5], + b=[11, 200, 200, 14, 15], + c=21:25) + + @test_throws ArgumentError sdf[:, cols] .= DataFrame(b=[1.0, 2.0], a=[13, 12]) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[:, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) + @test df == DataFrame(a=[1, 12, 13, 4, 5], + b=[11, 2, 1, 14, 15], + c=21:25) + @test tmpa === df.a + @test eltype(df.a) == 
Int + @test eltype(df.b) == Int + + sdf[:, cols] .= 100 + @test df == DataFrame(a=[1, 100, 100, 4, 5], + b=[11, 100, 100, 14, 15], + c=21:25) + + @test_throws DimensionMismatch sdf[:, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) + end + + for sel in (:, 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[:, [:c, :b, :a]] .= [100 101 102; 103 104 105] + @test df == DataFrame(a=[1, 105, 102, 4, 5], + b=[11.0, 104, 101, 14.0, 15.0], + c=[21, 103, 100, 24, 25]) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + @test eltype(df.c) == Int + + @test_throws DimensionMismatch sdf[:, [:c, :b, :a]] .= ones(2, 2) + @test_throws DimensionMismatch sdf[:, [:c, :b, :a]] .= ones(4, 3) + end + + for sel in (:, 1:3), cols in (Between(:a, :b), Not(:c), r"[ab]", [true, true, false]) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], sel] + tmpa = df.a + sdf[:, cols] .= [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2, 1, 4, 5], + b=[11, 4, 3, 14, 15], + c=21:25) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + + @test_throws DimensionMismatch sdf[:, cols] .= ones(4, 3) + @test_throws DimensionMismatch sdf[:, cols] .= ones(3, 4) + end + + for cols in (All(), :, Cols(:a, :b)) + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], 1:2] + tmpa = df.a + sdf[:, cols] .= [1.0 3.0; 2.0 4.0] + @test df == DataFrame(a=[1, 2, 1, 4, 5], + b=[11, 4, 3, 14, 15], + c=21:25) + @test tmpa === df.a + @test eltype(df.a) == Int + @test eltype(df.b) == Int + @test_throws DimensionMismatch sdf[:, cols] .= ones(4, 3) + @test_throws DimensionMismatch sdf[:, cols] .= ones(3, 4) + end +end + +@testset "mutating SubDataFrame with assignment to sdf.col" begin + df = DataFrame(a=1:3) + sdf = @view df[[3, 2], :] + sdf.c = [5, 6] + @test df ≅ DataFrame(a=1:3, c=[missing, 6, 5]) + sdf.a = [13.0, 12.0] + @test eltype(sdf.a) === Float64 + @test df ≅ DataFrame(a=[1.0, 12.0, 13.0], 
c=[missing, 6, 5]) + + df = DataFrame(a=1:3) + sdf = @view df[[3, 2], 1:1] + @test_throws ArgumentError sdf.c = [5, 6] + sdf.a = [13.0, 12.0] + @test eltype(sdf.a) === Float64 + @test df ≅ DataFrame(a=[1.0, 12.0, 13.0]) +end + +@testset "mutating SubDataFrame with broadcasting assignment to sdf.col" begin + df = DataFrame(a=1:3) + sdf = @view df[[3, 2], :] + sdf.a .= 12.0 + @test eltype(sdf.a) === Int + @test df ≅ DataFrame(a=[1, 12, 12]) + + if VERSION >= v"1.7" + sdf.c .= 100 + @test df ≅ DataFrame(a=[1, 12, 12], c=[missing, 100, 100]) + else + @test_throws ArgumentError sdf.c .= 100 + end + + df = DataFrame(a=1:3) + sdf = @view df[[3, 2], 1:1] + @test_throws ArgumentError sdf.c = [5, 6] + sdf.a .= 12.0 + @test eltype(sdf.a) === Int + @test df ≅ DataFrame(a=[1, 12, 12]) +end + +@testset "insertcols! for SubDataFrame" begin + df = DataFrame(a=1:5, b=11:15) + sdf = @view df[:, 1:end] + @test_throws ArgumentError insertcols!(sdf, :c => 1) + @test df == DataFrame(a=1:5, b=11:15) + + df = DataFrame(a=1:5, b=11:15) + sdf = @view df[:, :] + insertcols!(sdf, :c => 1) + @test df == DataFrame(a=1:5, b=11:15, c=1) + @test eltype(df.c) === Union{Int, Missing} + @test_throws DimensionMismatch insertcols!(sdf, :d => [1]) + insertcols!(sdf, :d => 101:105) + @test df == DataFrame(a=1:5, b=11:15, c=1, d=101:105) + @test eltype(df.d) === Union{Int, Missing} + + df = DataFrame(a=1:5, b=11:15) + sdf = @view df[[3, 2], :] + insertcols!(sdf, :c => 1) + @test df ≅ DataFrame(a=1:5, b=11:15, c=[missing, 1, 1, missing, missing]) + @test eltype(df.c) === Union{Int, Missing} + @test_throws DimensionMismatch insertcols!(sdf, :d => [1]) + insertcols!(sdf, :d => [103, 102]) + @test df ≅ DataFrame(a=1:5, b=11:15, c=[missing, 1, 1, missing, missing], + d=[missing, 102, 103, missing, missing]) + @test eltype(df.d) === Union{Int, Missing} + + df = DataFrame(a=1:5, b=11:15) + sdf = @view df[3:2, :] + insertcols!(sdf, :c => 1) + @test df ≅ DataFrame(a=1:5, b=11:15, c=[missing, missing, missing, 
missing, missing]) + @test eltype(df.c) === Union{Int, Missing} + @test_throws DimensionMismatch insertcols!(sdf, :d => [1]) + insertcols!(sdf, :d => Int[]) + @test df ≅ DataFrame(a=1:5, b=11:15, c=[missing, missing, missing, missing, missing], + d=[missing, missing, missing, missing, missing]) + @test eltype(df.d) === Union{Int, Missing} +end + +end # module \ No newline at end of file From 59afd61b38f3c03c966bb60826f0033afc8ac4cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 8 Aug 2021 08:48:22 +0200 Subject: [PATCH 16/29] fix tests --- src/subdataframe/subdataframe.jl | 5 ++- test/broadcasting.jl | 64 ++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index d20f08f55b..92681c0df2 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -214,13 +214,14 @@ function Base.setindex!(sdf::SubDataFrame, v::AbstractVector, sdf[:, col_ind] = v else pdf = parent(sdf) - old_col = pdf[!, col_ind] + p_col_ind = parentcols(index(sdf), col_ind) + old_col = pdf[!, p_col_ind] T = eltype(old_col) S = eltype(v) newcol = Tables.allocatecolumn(promote_type(T, S), length(old_col)) newcol .= old_col newcol[rows(sdf)] = v - pdf[!, col_ind] = newcol + pdf[!, p_col_ind] = newcol end return sdf end diff --git a/test/broadcasting.jl b/test/broadcasting.jl index 570b2fb892..dbc2b40d73 100644 --- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -148,7 +148,11 @@ end @test df[:, 2:end] == refdf[:, 2:end] dfv = @view df[1:2, 2:end] - @test_throws ArgumentError dfv[!, 1] .+= 1 + dfv[!, 1] .+= 100 + @test df.x2 == [104.5, 105.5, 6.5] + df.x1 -= [1, 1, 1] + df.x2 -= [100, 100, 0] + @test df == refdf df = copy(refdf) df[:, 1] .+= 1 @@ -220,7 +224,8 @@ end @test df[:, Not("x1")] == refdf[:, 2:end] dfv = @view df[1:2, 2:end] - @test_throws ArgumentError dfv[!, 1] .+= [0, 1] .+ 1 + dfv[!, 1] .+= [0, 1] .+ 1 + @test 
df.x2 == [5.5, 7.5, 6.5] dfv = @view df[1:2, 2:end] @test_throws ArgumentError dfv[!, "x1"] .+= [0, 1] .+ 1 @@ -287,13 +292,13 @@ end df = copy(refdf) dfv = @view df[1:2, 2:end] dfr = df[1, 3:end] + @test_throws DimensionMismatch dfv[!, 1] .= fill(100, 2, 1) + @test_throws DimensionMismatch dfv[!, 1] .= reshape(fill(200, 2), :, 1) @test_throws DimensionMismatch df[!, 1] .= rand(1, 2) - @test_throws ArgumentError dfv[!, 1] .= rand(2, 1) @test_throws DimensionMismatch dfr[end-1:end] .= rand(3, 1) @test_throws DimensionMismatch df[:, 1] .= rand(1, 3) @test_throws DimensionMismatch dfv[:, 1] .= rand(1, 2) @test_throws DimensionMismatch df[!, 1] .= reshape(rand(3), 1, :) - @test_throws ArgumentError dfv[!, 1] .= reshape(rand(2), :, 1) @test_throws DimensionMismatch dfr[end-1:end] .= reshape(rand(3), :, 1) @test_throws DimensionMismatch df[:, 1] .= reshape(rand(3), 1, :, 1) @test_throws DimensionMismatch dfv[:, 1] .= reshape(rand(2), 1, :, 1) @@ -304,13 +309,13 @@ end @test df[:, 2:end] == refdf[:, 2:end] dfv = @view df[1:2, 2:end] - @test_throws ArgumentError dfv[!, :x2] .+= 1 + dfv[!, :x2] .+= 1 dfr = df[1, 3:end] dfr[[:x4, :x5]] .= 10 @test Vector(dfr) == [7.5, 10.0, 10.0] - @test Matrix(df) == [2.5 4.5 7.5 10.0 10.0 - 3.5 5.5 8.5 11.5 14.5 + @test Matrix(df) == [2.5 5.5 7.5 10.0 10.0 + 3.5 6.5 8.5 11.5 14.5 4.5 6.5 9.5 12.5 15.5] df = copy(refdf) @@ -319,13 +324,13 @@ end @test df[:, 2:end] == refdf[:, 2:end] dfv = @view df[1:2, 2:end] - @test_throws ArgumentError dfv[!, "x2"] .+= 1 + dfv[!, "x2"] .+= 1 dfr = df[1, 3:end] dfr[["x4", "x5"]] .= 10 @test Vector(dfr) == [7.5, 10.0, 10.0] - @test Matrix(df) == [2.5 4.5 7.5 10.0 10.0 - 3.5 5.5 8.5 11.5 14.5 + @test Matrix(df) == [2.5 5.5 7.5 10.0 10.0 + 3.5 6.5 8.5 11.5 14.5 4.5 6.5 9.5 12.5 15.5] df = copy(refdf) @@ -360,13 +365,13 @@ end @test df[:, 2:end] == refdf[:, 2:end] dfv = @view df[1:2, 2:end] - @test_throws ArgumentError dfv[!, :x2] .+= [1, 2] + dfv[!, :x2] .+= [1, 2] dfr = df[1, 3:end] dfr[[:x4, :x5]] .= [10, 
11] @test Vector(dfr) == [7.5, 10.0, 11.0] - @test Matrix(df) == [2.5 4.5 7.5 10.0 11.0 - 4.5 5.5 8.5 11.5 14.5 + @test Matrix(df) == [2.5 5.5 7.5 10.0 11.0 + 4.5 7.5 8.5 11.5 14.5 6.5 6.5 9.5 12.5 15.5] df = copy(refdf) @@ -375,13 +380,13 @@ end @test df[:, 2:end] == refdf[:, 2:end] dfv = @view df[1:2, 2:end] - @test_throws ArgumentError dfv[!, :x2] .+= [1, 2] + dfv[!, :x2] .+= [1, 2] dfr = df[1, 3:end] dfr[["x4", "x5"]] .= [10, 11] @test Vector(dfr) == [7.5, 10.0, 11.0] - @test Matrix(df) == [2.5 4.5 7.5 10.0 11.0 - 4.5 5.5 8.5 11.5 14.5 + @test Matrix(df) == [2.5 5.5 7.5 10.0 11.0 + 4.5 7.5 8.5 11.5 14.5 6.5 6.5 9.5 12.5 15.5] df = copy(refdf) @@ -413,18 +418,19 @@ end df = copy(refdf) dfv = @view df[1:2, 2:end] dfr = df[1, 3:end] + + @test_throws DimensionMismatch dfv[!, :x2] .= fill(100, 2, 1) + @test_throws DimensionMismatch dfv[!, 1] .= reshape(fill(200, 2), :, 1) + @test_throws DimensionMismatch dfv[!, "x2"] .= fill(100, 2, 1) @test_throws DimensionMismatch df[!, :x1] .= rand(1, 3) - @test_throws ArgumentError dfv[!, :x2] .= rand(2, 1) @test_throws DimensionMismatch dfr[[:x4, :x5]] .= rand(3, 1) @test_throws DimensionMismatch df[:, :x1] .= rand(1, 3) @test_throws DimensionMismatch dfv[:, :x2] .= rand(1, 2) @test_throws DimensionMismatch df[!, 1] .= reshape(rand(3), 1, :) - @test_throws ArgumentError dfv[!, 1] .= reshape(rand(2), :, 1) @test_throws DimensionMismatch dfr[end-1:end] .= reshape(rand(3), :, 1) @test_throws DimensionMismatch df[:, 1] .= reshape(rand(3), 1, :) @test_throws DimensionMismatch dfv[:, 1] .= reshape(rand(2), 1, :) @test_throws DimensionMismatch df[!, "x1"] .= rand(1, 3) - @test_throws ArgumentError dfv[!, "x2"] .= rand(2, 1) @test_throws DimensionMismatch dfr[["x4", "x5"]] .= rand(3, 1) @test_throws DimensionMismatch df[:, "x1"] .= rand(1, 3) @test_throws DimensionMismatch dfv[:, "x2"] .= rand(1, 2) @@ -1566,16 +1572,19 @@ end @test v2 == [103.0, 104.0, 105.0] df = view(copy(refdf), :, :) - @test_throws ArgumentError df[!, 1] .= 100.0 
- @test df == refdf + df[!, 1] .= 100 + @test parent(df).x1 == [100, 100, 100] + @test eltype(parent(df).x1) == Float64 df = view(copy(refdf), :, :) - @test_throws ArgumentError df[!, :x1] .= 100.0 - @test df == refdf + df[!, :x1] .= 100.0 + @test parent(df).x1 == [100, 100, 100] + @test eltype(parent(df).x1) == Float64 df = view(copy(refdf), :, :) - @test_throws ArgumentError df[!, :newcol] .= 100.0 - @test df == refdf + df[!, :newcol] .= 100.0 + @test parent(df).newcol == [100, 100, 100] + @test eltype(parent(df).newcol) == Union{Float64, Missing} df = view(copy(refdf), :, :) @test_throws ArgumentError df[!, 10] .= 'a' @@ -1586,8 +1595,9 @@ end @test df == refdf df = view(copy(refdf), :, :) - @test_throws ArgumentError df[!, 1:2] .= 'a' - @test df == refdf + df[!, 1:2] .= 'a' + @test parent(df).x1 == parent(df).x2 == ['a', 'a', 'a'] + @test eltype(parent(df).x1) === Any df = view(copy(refdf), :, :) v1 = df[!, 1] From 700e65d963bac253c249beedce6333d46e3372c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 8 Aug 2021 09:38:57 +0200 Subject: [PATCH 17/29] fix tests on Julia 1.7 --- test/broadcasting.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/broadcasting.jl b/test/broadcasting.jl index dbc2b40d73..8bdf44f9fd 100644 --- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -292,8 +292,8 @@ end df = copy(refdf) dfv = @view df[1:2, 2:end] dfr = df[1, 3:end] - @test_throws DimensionMismatch dfv[!, 1] .= fill(100, 2, 1) - @test_throws DimensionMismatch dfv[!, 1] .= reshape(fill(200, 2), :, 1) + @test_throws DimensionMismatch dfv[!, 1] .= fill(100, 2, 2) + @test_throws DimensionMismatch dfv[!, 1] .= reshape(fill(200, 4), :, 2) @test_throws DimensionMismatch df[!, 1] .= rand(1, 2) @test_throws DimensionMismatch dfr[end-1:end] .= rand(3, 1) @test_throws DimensionMismatch df[:, 1] .= rand(1, 3) @@ -419,8 +419,8 @@ end dfv = @view df[1:2, 2:end] dfr = df[1, 3:end] - @test_throws DimensionMismatch dfv[!, 
:x2] .= fill(100, 2, 1) - @test_throws DimensionMismatch dfv[!, 1] .= reshape(fill(200, 2), :, 1) + @test_throws DimensionMismatch dfv[!, :x2] .= fill(100, 2, 2) + @test_throws DimensionMismatch dfv[!, 1] .= reshape(fill(200, 4), :, 2) @test_throws DimensionMismatch dfv[!, "x2"] .= fill(100, 2, 1) @test_throws DimensionMismatch df[!, :x1] .= rand(1, 3) @test_throws DimensionMismatch dfr[[:x4, :x5]] .= rand(3, 1) From 50d9f8bc0149ffe59333b2bff808ddd73380b6df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 8 Aug 2021 10:43:22 +0200 Subject: [PATCH 18/29] one more test fix --- test/broadcasting.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/broadcasting.jl b/test/broadcasting.jl index 8bdf44f9fd..5ffc76be2d 100644 --- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -421,7 +421,7 @@ end @test_throws DimensionMismatch dfv[!, :x2] .= fill(100, 2, 2) @test_throws DimensionMismatch dfv[!, 1] .= reshape(fill(200, 4), :, 2) - @test_throws DimensionMismatch dfv[!, "x2"] .= fill(100, 2, 1) + @test_throws DimensionMismatch dfv[!, "x2"] .= fill(100, 2, 2) @test_throws DimensionMismatch df[!, :x1] .= rand(1, 3) @test_throws DimensionMismatch dfr[[:x4, :x5]] .= rand(3, 1) @test_throws DimensionMismatch df[:, :x1] .= rand(1, 3) From 6045034a320728fb5d6de9c93146cf6f15f3a91c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 8 Aug 2021 13:12:17 +0200 Subject: [PATCH 19/29] finalize all required changes --- NEWS.md | 7 +- docs/src/man/split_apply_combine.md | 168 +++++++++++++++++++++- src/abstractdataframe/selection.jl | 33 +++-- src/dataframe/dataframe.jl | 1 + src/groupeddataframe/splitapplycombine.jl | 18 ++- src/subdataframe/subdataframe.jl | 25 +++- test/grouping.jl | 8 +- test/select.jl | 11 +- test/subdataframe_mutation.jl | 166 +++++++++++++++++++++ 9 files changed, 414 insertions(+), 23 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9e0cb382f5..efc7ebde6c 100644 --- 
a/NEWS.md +++ b/NEWS.md @@ -34,10 +34,15 @@ Assignment to existing columns retains allocates a new column. Values already stored in filtered-out rows are retained. -* TODO DESIGN: Allow `SubDataFrame` to be passed as argument of , `select!` and `transform!` +* Allow `SubDataFrame` to be passed as argument of , `select!` and `transform!` (also on `GroupedDataFrame` created a `SubDataFrame`) ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). + Assignment to existing columns retains allocates a new column. Values already stored in filtered-out rows are retained. + In case of creation of new columns `missing` values stored in filtered-out rows; + If `SubDataFrame` is not created with `:` as column selector the resulting operation + must produce the same column names as stored in the source `SubDataFrame` or an error is thrown. + # DataFrames.jl v1.2.2 Patch Release Notes ## Bug fixes diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 0126c12bf2..23f12dfaaf 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -108,9 +108,10 @@ In all of these cases, `function` can return either a single row or multiple rows. As a particular rule, values wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and then treated as a single row. -`select`/`select!` and `transform`/`transform!` always return a `DataFrame` +`select`/`select!` and `transform`/`transform!` always return a data frame with the same number and order of rows as the source (even if `GroupedDataFrame` -had its groups reordered). +had its groups reordered), except when selection results in zero columns +in the resulting data frame. For `combine`, rows in the returned object appear in the order of groups in the `GroupedDataFrame`. 
The functions can return an arbitrary number of rows for @@ -612,3 +613,166 @@ julia> gd[1] ─────┼─────── 1 │ 1 ``` + +# Simulating the SQL `where` clause + +You can conveniently work on subsets of a data frame by using `SubDataFrame`s. +Operations performed on such objects can both create a new data frame and be +performed in-place. Here are some examples: + +```jldoctest sac +julia> df = DataFrame(a=1:5) +5×1 DataFrame + Row │ a + │ Int64 +─────┼─────── + 1 │ 1 + 2 │ 2 + 3 │ 3 + 4 │ 4 + 5 │ 5 + +julia> sdf = @view df[2:3, :] +2×1 SubDataFrame + Row │ a + │ Int64 +─────┼─────── + 1 │ 2 + 2 │ 3 + +julia> transform(sdf, :a => ByRow(string)) # create a new data frame +2×2 DataFrame + Row │ a a_string + │ Int64 String +─────┼───────────────── + 1 │ 2 2 + 2 │ 3 3 + +julia> transform!(sdf, :a => ByRow(string)) # update the source df in-place +2×2 SubDataFrame + Row │ a a_string + │ Int64 String? +─────┼───────────────── + 1 │ 2 2 + 2 │ 3 3 + +julia> df # new column was created filled with missing in filtered-out rows +5×2 DataFrame + Row │ a a_string + │ Int64 String? 
+─────┼───────────────── + 1 │ 1 missing + 2 │ 2 2 + 3 │ 3 3 + 4 │ 4 missing + 5 │ 5 missing + +julia> select!(sdf, :a => -, renamecols=false) # update the source df in-place +2×1 SubDataFrame + Row │ a + │ Int64 +─────┼─────── + 1 │ -2 + 2 │ -3 + +julia> df # the column replaced an existing column; previously stored values are re-used in filtered-out rows +5×1 DataFrame + Row │ a + │ Int64 +─────┼─────── + 1 │ 1 + 2 │ -2 + 3 │ -3 + 4 │ 4 + 5 │ 5 +``` + +Similar operations can be performed on `GroupedDataFrame` as well: +```jldoctest sac +julia> df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=1:6) +6×2 DataFrame + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 1 2 + 3 │ 1 3 + 4 │ 2 4 + 5 │ 2 5 + 6 │ 3 6 + +julia> sdf = @view df[2:4, :] +3×2 SubDataFrame + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 1 2 + 2 │ 1 3 + 3 │ 2 4 + +julia> gsdf = groupby(sdf, :a) +GroupedDataFrame with 2 groups based on key: a +First Group (2 rows): a = 1 + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 1 2 + 2 │ 1 3 +⋮ +Last Group (1 row): a = 2 + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 2 4 + +julia> transform(gsdf, nrow) # create a new data frame +3×3 DataFrame + Row │ a b nrow + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 1 2 2 + 2 │ 1 3 2 + 3 │ 2 4 1 + +julia> transform!(gsdf, nrow, :b => :b_copy) +3×4 SubDataFrame + Row │ a b nrow b_copy + │ Int64 Int64 Int64? Int64? +─────┼────────────────────────────── + 1 │ 1 2 2 2 + 2 │ 1 3 2 3 + 3 │ 2 4 1 4 + +julia> df +6×4 DataFrame + Row │ a b nrow b_copy + │ Int64 Int64 Int64? Int64? +─────┼──────────────────────────────── + 1 │ 1 1 missing missing + 2 │ 1 2 2 2 + 3 │ 1 3 2 3 + 4 │ 2 4 1 4 + 5 │ 2 5 missing missing + 6 │ 3 6 missing missing + +julia> select!(gsdf, :b_copy, :b => sum, renamecols=false) +3×3 SubDataFrame + Row │ a b_copy b + │ Int64 Int64? Int64 +─────┼────────────────────── + 1 │ 1 2 5 + 2 │ 1 3 5 + 3 │ 2 4 4 + +julia> df +6×3 DataFrame + Row │ a b_copy b + │ Int64 Int64? 
Int64 +─────┼─────────────────────── + 1 │ 1 missing 1 + 2 │ 1 2 5 + 3 │ 1 3 5 + 4 │ 2 4 4 + 5 │ 2 missing 5 + 6 │ 3 missing 6 +``` diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 755e81a7d6..cd6c2400b6 100755 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -102,9 +102,10 @@ const TRANSFORMATION_COMMON_RULES = rows. As a particular rule, values wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and then treated as a single row. - `select`/`select!` and `transform`/`transform!` always return a `DataFrame` + `select`/`select!` and `transform`/`transform!` always return a data frame with the same number and order of rows as the source (even if `GroupedDataFrame` - had its groups reordered). + had its groups reordered), except when selection results in zero columns + in the resulting data frame. For `combine`, rows in the returned object appear in the order of groups in the `GroupedDataFrame`. The functions can return an arbitrary number of rows for @@ -618,9 +619,9 @@ function select_transform!((nc,)::Ref{Any}, df::AbstractDataFrame, newdf::DataFr end """ - select!(df::DataFrame, args...; renamecols::Bool=true) + select!(df::AbstractDataFrame, args...; renamecols::Bool=true) select!(args::Base.Callable, df::DataFrame; renamecols::Bool=true) - select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true, renamecols::Bool=true) + select!(gd::GroupedDataFrame, args...; ungroup::Bool=true, renamecols::Bool=true) select!(f::Base.Callable, gd::GroupedDataFrame; ungroup::Bool=true, renamecols::Bool=true) Mutate `df` or `gd` in place to retain only columns or transformations specified by `args...` and @@ -628,6 +629,16 @@ return it. The result is guaranteed to have the same number of rows as `df` or parent of `gd`, except when no columns are selected (in which case the result has zero rows). 
+If `SubDataFrame` or `GroupedDataFrame{SubDataFrame}` is passed the resulting +operation follows the same rules as indexing: +- for existing columns filtered-out rows are filled with values present in the + old columns +- for new columns (which is only allowed if `SubDataFrame` was created with `:` + as column selector) filtered-out rows are filled with `missing` +- if `SubDataFrame` was not created with `:` as column selector then `select!` + is only allowed if the transformations keep exactly the same sequence of column + names as is in the passed `df` + If `gd` is passed then it is updated to reflect the new rows of its updated parent. If there are independent `GroupedDataFrame` objects constructed using the same parent data frame they might get corrupt. @@ -645,6 +656,9 @@ See [`select`](@ref) for examples. select!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) = _replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols)) +select!(df::SubDataFrame, @nospecialize(args...); renamecols::Bool=true) = + _replace_columns!(df, select(df, args..., copycols=true, renamecols=renamecols)) + function select!(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon throw(ArgumentError("First argument must be a transformation if the second argument is a data frame")) @@ -653,14 +667,15 @@ function select!(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renam end """ - transform!(df::DataFrame, args...; renamecols::Bool=true) - transform!(args::Callable, df::DataFrame; renamecols::Bool=true) - transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true, renamecols::Bool=true) + transform!(df::AbstractDataFrame, args...; renamecols::Bool=true) + transform!(args::Callable, df::AbstractDataFrame; renamecols::Bool=true) + transform!(gd::GroupedDataFrame, args...; ungroup::Bool=true, renamecols::Bool=true) transform!(f::Base.Callable, gd::GroupedDataFrame; ungroup::Bool=true, 
renamecols::Bool=true) Mutate `df` or `gd` in place to add columns specified by `args...` and return it. The result is guaranteed to have the same number of rows as `df`. -Equivalent to `select!(df, :, args...)` or `select!(gd, :, args...)`. +Equivalent to `select!(df, :, args...)` or `select!(gd, :, args...)`, +except that column renaming performs a copy. $TRANSFORMATION_COMMON_RULES @@ -672,7 +687,7 @@ $TRANSFORMATION_COMMON_RULES See [`select`](@ref) for examples. """ -function transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) +function transform!(df::AbstractDataFrame, @nospecialize(args...); renamecols::Bool=true) idx = index(df) newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol} idx[first(sel)] => copy => last(sel) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 926bb0386c..eccb5d29aa 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1748,6 +1748,7 @@ end # This is not exactly copy! as in general we allow axes to be different function _replace_columns!(df::DataFrame, newdf::DataFrame) + @assert ncol(newdf) == 0 || nrow(df) == nrow(newdf) copy!(_columns(df), _columns(newdf)) copy!(_names(index(df)), _names(newdf)) copy!(index(df).lookup, index(newdf).lookup) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 6ca3656b9a..7895d9ce3d 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -749,12 +749,17 @@ function select!(@nospecialize(f::Base.Callable), gd::GroupedDataFrame; ungroup: return select!(gd, f, ungroup=ungroup) end -function select!(gd::GroupedDataFrame{DataFrame}, +function select!(gd::GroupedDataFrame, @nospecialize(args::Union{Pair, Base.Callable, ColumnIndex, MultiColumnIndex, AbstractVecOrMat{<:Pair}}...); ungroup::Bool=true, renamecols::Bool=true) - newdf = select(gd, args..., copycols=false, renamecols=renamecols) df = parent(gd) + if df isa DataFrame + newdf = 
select(gd, args..., copycols=false, renamecols=renamecols) + else + @assert df isa SubDataFrame + newdf = select(gd, args..., copycols=true, renamecols=renamecols) + end _replace_columns!(df, newdf) return ungroup ? df : gd end @@ -766,12 +771,17 @@ function transform!(@nospecialize(f::Base.Callable), gd::GroupedDataFrame; ungro return transform!(gd, f, ungroup=ungroup) end -function transform!(gd::GroupedDataFrame{DataFrame}, +function transform!(gd::GroupedDataFrame, @nospecialize(args::Union{Pair, Base.Callable, ColumnIndex, MultiColumnIndex, AbstractVecOrMat{<:Pair}}...); ungroup::Bool=true, renamecols::Bool=true) - newdf = select(gd, :, args..., copycols=false, renamecols=renamecols) df = parent(gd) + if df isa DataFrame + newdf = select(gd, :, args..., copycols=false, renamecols=renamecols) + else + @assert df isa SubDataFrame + newdf = select(gd, :, args..., copycols=true, renamecols=renamecols) + end select!(newdf, propertynames(df), :) _replace_columns!(df, newdf) return ungroup ? 
df : gd diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 92681c0df2..c8235457c7 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -308,4 +308,27 @@ function is_column_adding_allowed(df::AbstractDataFrame) return getfield(df, :colindex) isa Index end throw(ArgumentError("Unsupported data frame type")) -end \ No newline at end of file +end + +function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame) + if _names(sdf) == _names(newdf) + for col in _names(newdf) + sdf[!, col] = newdf[!, col] + end + return sdf + end + + if !is_column_adding_allowed(sdf) + throw(ArgumentError("changing the sequence of column names in a SubDataFrame " * + "that subsets columns of its parent data frame is disallowed")) + end + + psdf = parent(sdf) + @assert psdf isa DataFrame + for colname in _names(newdf) + sdf[!, colname] = newdf[!, colname] + end + select!(psdf, _names(newdf)) + + return sdf +end diff --git a/test/grouping.jl b/test/grouping.jl index 80cdac8f36..1d9b19fdeb 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -2561,8 +2561,12 @@ end DataFrame(g=categorical([3, 1, 1, missing]), x=1:4, y=5:8)), dosort in (true, false, nothing) - @test_throws MethodError select!(groupby_checked(view(df, :, :), :g), :x) - @test_throws MethodError transform!(groupby_checked(view(df, :, :), :g), :x) + dfc = copy(df) + select!(groupby_checked(view(dfc, :, :), :g), :x) + @test dfc ≅ df[!, [:g, :x]] + dfc = copy(df) + transform!(groupby_checked(view(dfc, :, :), :g), :x) + @test dfc ≅ df dfc = copy(df) g = dfc.g diff --git a/test/select.jl b/test/select.jl index 64e2b893e7..7d74d8c512 100644 --- a/test/select.jl +++ b/test/select.jl @@ -249,8 +249,9 @@ end @test_throws ArgumentError select!(df, :f) @test_throws BoundsError select!(df, [true, false]) - @test_throws MethodError select!(view(df, :, :), 1:2) + @test select!(view(df, :, :), 1:2) == DataFrame(a=1, b=2) + df = DataFrame(a=1, b=2, c=3, d=4, e=5) d = 
copy(df, copycols=false) @test select!(d, 1:0) == DataFrame() @test select!(d, Not(r"")) == DataFrame() @@ -403,8 +404,9 @@ end @test_throws BoundsError select(df, 6) @test_throws ArgumentError select(df, [1, 1]) @test_throws ArgumentError select(df, :f) - @test_throws MethodError select!(df, [true, false]) + @test_throws BoundsError select!(df, [true, false]) + df = view(DataFrame(a=1, b=2, c=3, d=4, e=5), :, :) @test select(df, 1:0) == DataFrame() @test select(df, Not(r"")) == DataFrame() @test select(df, 1:0, copycols=false) == DataFrame() @@ -1315,8 +1317,9 @@ end @test df == DataFrame(x=1:3, y=4:6) dfv = view(df, [2, 1], [2, 1]) - @test_throws MethodError select!(dfv, 1) - @test_throws MethodError transform!(dfv, 1) + @test_throws ArgumentError select!(dfv, 1) + @test transform!(dfv, 1) == dfv + @test df == DataFrame(x=1:3, y=4:6) end @testset "renamecols=false tests" begin diff --git a/test/subdataframe_mutation.jl b/test/subdataframe_mutation.jl index 65234535e5..ab61d510c9 100644 --- a/test/subdataframe_mutation.jl +++ b/test/subdataframe_mutation.jl @@ -1422,4 +1422,170 @@ end @test eltype(df.d) === Union{Int, Missing} end +@testset "select! 
on SubDataFrame" begin + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], [3, 2]] + @test_throws ArgumentError select!(sdf, :c => :b, :b => :c) + @test df == DataFrame(a=1:5, b=11:15, c=21:25) + select!(sdf, :b => :c, :c => :b) + @test df == DataFrame(a=1:5, + b=[11, 22, 23, 14, 15], + c=[21, 12, 13, 24, 25]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], [3, 2]] + @test_throws ArgumentError select!(sdf, :b => x -> ["b3", "b2"], :c => (x -> ["c3", "c2"]), renamecols=false) + @test df == DataFrame(a=1:5, b=11:15, c=21:25) + select!(sdf, :c => x -> ["c3", "c2"], :b => (x -> ["b3", "b2"]), renamecols=false) + @test df == DataFrame(a=1:5, + b=[11, "b2", "b3", 14, 15], + c=[21, "c2", "c3", 24, 25]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + select!(sdf, :a, :b => :c, :c => :b) + @test df == DataFrame(a=1:5, + c=[21, 12, 13, 24, 25], + b=[11, 22, 23, 14, 15]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + select!(sdf, :b => :d, :c => :b) + @test df ≅ DataFrame(d=[missing, 12, 13, missing, missing], + b=[11, 22, 23, 14, 15]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], [3, 2]] + @test_throws ArgumentError select!(sdf) + @test df == DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + select!(sdf) + @test df == DataFrame() + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[3:2, [3, 2]] + @test_throws ArgumentError select!(sdf, :c => (x -> Int[]) => :d) + @test df == DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[3:2, :] + select!(sdf, :c => (x -> Int[]) => :d) + @test df ≅ DataFrame(d=missings(Int, 5)) + @test df.d isa Vector{Union{Int, Missing}} +end + +@testset "transform! 
on SubDataFrame" begin + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], [3, 2]] + transform!(sdf, :c => :b, :b => :c) + @test df == DataFrame(a=1:5, + b=[11, 22, 23, 14, 15], + c=[21, 12, 13, 24, 25]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], [3, 2]] + transform!(sdf, :b => :c, :c => :b) + @test df == DataFrame(a=1:5, + b=[11, 22, 23, 14, 15], + c=[21, 12, 13, 24, 25]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], [3, 2]] + transform!(sdf, :b => x -> ["b3", "b2"], :c => (x -> ["c3", "c2"]), renamecols=false) + @test df == DataFrame(a=1:5, + b=[11, "b2", "b3", 14, 15], + c=[21, "c2", "c3", 24, 25]) + transform!(sdf, :c => x -> ["c3", "c2"], :b => (x -> ["b3", "b2"]), renamecols=false) + @test df == DataFrame(a=1:5, + b=[11, "b2", "b3", 14, 15], + c=[21, "c2", "c3", 24, 25]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + transform!(sdf, :b => :c, :c => :b) + @test df == DataFrame(a=1:5, + b=[11, 22, 23, 14, 15], + c=[21, 12, 13, 24, 25]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + transform!(sdf, :b => :d, :c => :b) + @test df ≅ DataFrame(a=1:5, + b=[11, 22, 23, 14, 15], + c=21:25, + d=[missing, 12, 13, missing, missing]) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], [3, 2]] + transform!(sdf) + @test df == DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[[3, 2], :] + transform!(sdf) + @test df == DataFrame(a=1:5, b=11:15, c=21:25) + + df = DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[3:2, [3, 2]] + @test_throws ArgumentError transform!(sdf, :c => (x -> Int[]) => :d) + @test df == DataFrame(a=1:5, b=11:15, c=21:25) + sdf = @view df[3:2, :] + transform!(sdf, :c => (x -> Int[]) => :d) + @test df ≅ DataFrame(a=1:5, b=11:15, c=21:25, d=missings(Int, 5)) + @test df.d isa Vector{Union{Int, Missing}} +end + +@testset "select! 
on GroupedDataFrame{SubDataFrame}" begin + df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=11:16, c=21:26, d=31:36) + sdf = @view df[2:4, [2, 1, 3]] + gsdf = groupby(sdf, :a) + @test_throws ArgumentError select!(gsdf, :b => x -> x .+ 100, renamecols=false) + @test df == DataFrame(a=[1, 1, 1, 2, 2, 3], b=11:16, c=21:26, d=31:36) + sdf = @view df[2:4, [2, 1]] + gsdf = groupby(sdf, :a) + @test_throws ArgumentError select!(gsdf, :b => x -> x .+ 100, renamecols=false) + @test df == DataFrame(a=[1, 1, 1, 2, 2, 3], b=11:16, c=21:26, d=31:36) + sdf = @view df[2:4, [1, 3, 2]] + gsdf = groupby(sdf, :a) + @test select!(gsdf, :c, :b => x -> x .+ 100, renamecols=false, ungroup=false) === gsdf + @test df == DataFrame(a=[1, 1, 1, 2, 2, 3], + b=[11, 112, 113, 114, 15, 16], c=21:26, d=31:36) + + df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=11:16, c=21:26, d=31:36) + sdf = @view df[[3, 4, 2], :] + gsdf = groupby(sdf, :a) + @test select!(gsdf, :b => x -> x .+ 100, :c => :e, renamecols=false) === sdf + @test df ≅ DataFrame(a=[1, 1, 1, 2, 2, 3], + b=[11, 112, 113, 114, 15, 16], + e=[missing, 22, 23, 24, missing, missing]) +end + +@testset "transform! 
on GroupedDataFrame{SubDataFrame}" begin + df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=11:16, c=21:26, d=31:36) + sdf = @view df[2:4, [2, 1, 3]] + gsdf = groupby(sdf, :a) + transform!(gsdf, :b => x -> x .+ 100, renamecols=false) + @test df == DataFrame(a=[1, 1, 1, 2, 2, 3], + b=[11, 112, 113, 114, 15, 16], c=21:26, d=31:36) + + df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=11:16, c=21:26, d=31:36) + sdf = @view df[2:4, [2, 1]] + gsdf = groupby(sdf, :a) + transform!(gsdf, :b => x -> x .+ 100, renamecols=false) + @test df == DataFrame(a=[1, 1, 1, 2, 2, 3], + b=[11, 112, 113, 114, 15, 16], c=21:26, d=31:36) + + df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=11:16, c=21:26, d=31:36) + sdf = @view df[2:4, [1, 3, 2]] + gsdf = groupby(sdf, :a) + @test transform!(gsdf, :c, :b => x -> x .+ 100, renamecols=false, ungroup=false) == gsdf + @test df == DataFrame(a=[1, 1, 1, 2, 2, 3], + b=[11, 112, 113, 114, 15, 16], c=21:26, d=31:36) + + df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=11:16, c=21:26, d=31:36) + sdf = @view df[[3, 4, 2], :] + gsdf = groupby(sdf, :a) + @test transform!(gsdf, :b => x -> x .+ 100, :c => :e, renamecols=false) === sdf + @test df ≅ DataFrame(a=[1, 1, 1, 2, 2, 3], + b=[11, 112, 113, 114, 15, 16], + c=21:26, d=31:36, + e=[missing, 22, 23, 24, missing, missing]) +end + end # module \ No newline at end of file From 1ae053439806b1aeae6e677c0b3a80a6659d7834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 8 Aug 2021 14:01:55 +0200 Subject: [PATCH 20/29] fix 1.7 broadcasting --- test/broadcasting.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/broadcasting.jl b/test/broadcasting.jl index 5ffc76be2d..0b8ff9f10d 100644 --- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -1609,8 +1609,13 @@ end @test v1 == [100.0, 100.0, 100.0] df = view(copy(refdf), :, :) - @test_throws ArgumentError df.newcol .= 'd' - @test df == refdf + if VERSION >= v"1.7" + df.newcol .= 'd' + df.newcol == fill('d', 3) + else + @test_throws 
ArgumentError df.newcol .= 'd' + @test df == refdf + end end @testset "DataFrameRow getproperty broadcasted assignment" begin From 971c282e442fc31e10f581813f85b4da468d4798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 25 Aug 2021 08:14:02 +0200 Subject: [PATCH 21/29] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- NEWS.md | 19 +++++++++++-------- docs/src/lib/indexing.md | 21 ++++++++++----------- docs/src/man/split_apply_combine.md | 4 ++-- src/abstractdataframe/selection.jl | 8 ++++---- src/dataframe/dataframe.jl | 4 ++-- src/subdataframe/subdataframe.jl | 9 +++++---- 6 files changed, 34 insertions(+), 31 deletions(-) diff --git a/NEWS.md b/NEWS.md index efc7ebde6c..24a11b5322 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,21 +26,24 @@ ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). If `sdf` is a `SubDataFrame` created with `:` as a column selector then - `insertcols!`, `setindex!`, broadcasted assignment allow for creation - of new columns with `missing` values stored in filtered-out rows; + `insertcols!`, `setindex!`, and broadcasted assignment allow for creation + of new columns, automatically filling filtered-out rows with `missing` values; * Allow replacing existing columns in a `SubDataFrame` with `!` as row selector in assignment and broadcasted assignment ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). - Assignment to existing columns retains allocates a new column. Values already stored in filtered-out rows are retained. + Assignment to existing columns allocates a new column. + Values already stored in filtered-out rows are copied. 
-* Allow `SubDataFrame` to be passed as argument of , `select!` and `transform!` - (also on `GroupedDataFrame` created a `SubDataFrame`) +* Allow `SubDataFrame` to be passed as an argument to `select!` and `transform!` + (also on `GroupedDataFrame` created from a `SubDataFrame`) ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). - Assignment to existing columns retains allocates a new column. Values already stored in filtered-out rows are retained. - In case of creation of new columns `missing` values stored in filtered-out rows; - If `SubDataFrame` is not created with `:` as column selector the resulting operation + Assignment to existing columns allocates a new column. + Values already stored in filtered-out rows are copied. + In case of creation of new columns, filtered-out rows are automatically + filled with `missing` values. + If `SubDataFrame` was not created with `:` as column selector the resulting operation must produce the same column names as stored in the source `SubDataFrame` or an error is thrown. 
# DataFrames.jl v1.2.2 Patch Release Notes diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index befc5be800..2351280362 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -145,14 +145,14 @@ so it is unsafe to use it afterwards (the column length correctness will be pres * `sdf[rows, col] = v` -> set rows `rows` of column `col`, in-place; `v` must be an abstract vector; * `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` when column names must match; -* `sdf[!, col] = v` -> replaces `col` with `v` with copying; if `col` is present if `sdf` +* `sdf[!, col] = v` -> replaces `col` with `v` with copying; if `col` is present in `sdf` then filtered-out rows in newly created vector are filled with values already present in that column; if `col` is not present in `sdf` then the operation is only allowed - if `sdf` was created with `:` as column selector. In this case + if `sdf` was created with `:` as column selector, in which case filtered-out rows are filled with `missing`; - equivalent to `sdf.col = v` if `col` is a valid identifier; - operation is allowed if `length(v) == nrow(sdf)`; + equivalent to `sdf.col = v` if `col` is a valid identifier; + operation is allowed if `length(v) == nrow(sdf)`; * `sdf[!, cols] = v` -> replaces existing columns `cols` in data frame `sdf` with copying; `v` must be an `AbstractMatrix` or an `AbstractDataFrame` (in the latter case column names must match); @@ -209,22 +209,21 @@ Additional rules: Starting from Julia 1.7 if `:col` is not present in `df` then a new column will be created in `df`. 
* in the `sdf[CartesianIndex(row, col)] .= v`, `sdf[row, col] .= v` and `sdf[row, cols] .= v` syntaxes the assignment to `sdf` is performed in-place; * in the `sdf[rows, col] .= v` and `sdf[rows, cols] .= v` syntaxes the assignment to `sdf` is performed in-place; - if `rows` is `:` and `col` is `Symbol` or `AbstractString` - and it is missing from `sdf` and `sdf` was created with `:` as column selector then a new column is allocated and added; - the length of the column is always the value of `nrow(sdf)` before the assignment takes place; + if `rows` is `:` and `col` is a `Symbol` or `AbstractString` + referring to a column missing from `sdf` and `sdf` was created with `:` as column selector + then a new column is allocated and added; the filtered-out rows are filled with `missing`; * in the `sdf[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; the filtered-out rows are filled with values already present in `col`; - if `col` is `Symbol` or `AbstractString` and it is missing from `sdf` - that was created with `:` as column selector then a new column is allocated added; - the length of the column is always the value of `nrow(df)` before the assignment takes place; + if `col` is a `Symbol` or `AbstractString` referring to a column missing from `sdf` + and `sdf` was created with `:` as column selector then a new column is allocated and added; in this case the filtered-out rows are filled with `missing`; * the `sdf[!, cols] .= v` syntax replaces existing columns `cols` in data frame `sdf` with freshly allocated vectors; the filtered-out rows are filled with values already present in `cols`; * `sdf.col .= v` syntax currently performs in-place assignment to an existing vector `sdf.col`; this behavior is deprecated and a new column will be allocated in the future. Starting from Julia 1.7 if `:col` is not present in `sdf` then a new column will be created in `sdf` - if it was created with `:` as a column selector.
+ if `sdf` was created with `:` as a column selector. * `dfr.col .= v` syntax is allowed and performs in-place assignment to a value extracted by `dfr.col`. Note that `sdf[!, col] .= v` and `sdf[!, cols] .= v` syntaxes are not allowed as `sdf` can be only modified in-place. diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 23f12dfaaf..12eb19155c 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -111,7 +111,7 @@ rows. As a particular rule, values wrapped in a `Ref` or a `0`-dimensional `select`/`select!` and `transform`/`transform!` always return a data frame with the same number and order of rows as the source (even if `GroupedDataFrame` had its groups reordered), except when selection results in zero columns -in the resulting data frame. +in the resulting data frame (in which case the result has zero rows). For `combine`, rows in the returned object appear in the order of groups in the `GroupedDataFrame`. The functions can return an arbitrary number of rows for @@ -617,7 +617,7 @@ julia> gd[1] # Simulating the SQL `where` clause You can conveniently work on subsets of a data frame by using `SubDataFrame`s. -Operations performed on such objects can both create a new data frame and be +Operations performed on such objects can either create a new data frame or be performed in-place. Here are some examples: ```jldoctest sac diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index cd6c2400b6..49ad276b91 100755 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -105,7 +105,7 @@ const TRANSFORMATION_COMMON_RULES = `select`/`select!` and `transform`/`transform!` always return a data frame with the same number and order of rows as the source (even if `GroupedDataFrame` had its groups reordered), except when selection results in zero columns - in the resulting data frame. 
+ in the resulting data frame (in which case the result has zero rows). For `combine`, rows in the returned object appear in the order of groups in the `GroupedDataFrame`. The functions can return an arbitrary number of rows for @@ -629,8 +629,8 @@ return it. The result is guaranteed to have the same number of rows as `df` or parent of `gd`, except when no columns are selected (in which case the result has zero rows). -If `SubDataFrame` or `GroupedDataFrame{SubDataFrame}` is passed the resulting -operation follows the same rules as indexing: +If a `SubDataFrame` or `GroupedDataFrame{SubDataFrame}` is passed, the parent data frame +is updated using columns generated by `args...`, following the same rules as indexing: - for existing columns filtered-out rows are filled with values present in the old columns - for new columns (which is only allowed if `SubDataFrame` was created with `:` @@ -639,7 +639,7 @@ operation follows the same rules as indexing: is only allowed if the transformations keep exactly the same sequence of column names as is in the passed `df` -If `gd` is passed then it is updated to reflect the new rows of its updated +If a `GroupedDataFrame` is passed then it is updated to reflect the new rows of its updated parent. If there are independent `GroupedDataFrame` objects constructed using the same parent data frame they might get corrupt. diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index eccb5d29aa..3615a98d3e 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -774,7 +774,7 @@ is ignored (i.e. the added column is always copied) and the parent data frame's column is filled with `missing` in rows that are filtered out by `df`. If `df` isa `DataFrame` that has no columns and only values -other than `AbstractVector` are passed then it is used to create a one element +other than `AbstractVector` are passed then it is used to create a one-element column. 
If `df` isa `DataFrame` that has no columns and at least one `AbstractVector` is passed then its length is used to determine the number of elements in all @@ -816,7 +816,7 @@ julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...; makeunique::Bool=false, copycols::Bool=true) if !is_column_adding_allowed(df) - throw(ArgumentError("insertcols! is only supported for DataFrame or " * + throw(ArgumentError("insertcols! is only supported for DataFrame, or for " * "SubDataFrame created with `:` as column selector")) end col_ind = Int(col isa SymbolOrString ? columnindex(df, col) : col) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index c8235457c7..b0f51ecfb4 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -298,13 +298,14 @@ end Base.convert(::Type{DataFrame}, sdf::SubDataFrame) = DataFrame(sdf) -# this function tests if it is allowed to add columns to passed sdf -# currently it is only allowed when sdf is created with : as column selector +# this function tests if it is allowed to add columns to passed SubDataFrame +# currently it is only allowed when SubDataFrame was created with : as column selector # which results in using Index as its index (as opposed to other columns selectors # which result in SubIndex) function is_column_adding_allowed(df::AbstractDataFrame) - df isa DataFrame && return true - if df isa SubDataFrame + if df isa DataFrame + return true + elseif df isa SubDataFrame return getfield(df, :colindex) isa Index end throw(ArgumentError("Unsupported data frame type")) From c4cb1aec1233774c55e708fb7a6aa929c2967e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 25 Aug 2021 09:51:38 +0200 Subject: [PATCH 22/29] apply suggestions after code review --- docs/src/lib/indexing.md | 12 ++++++-- src/dataframe/dataframe.jl | 16 +++++++---- 
src/other/broadcasting.jl | 9 +++--- src/subdataframe/subdataframe.jl | 47 ++++++++++++++++++++++---------- test/indexing.jl | 4 +-- test/subdataframe_mutation.jl | 2 ++ 6 files changed, 61 insertions(+), 29 deletions(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 2351280362..5251780cd8 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -138,13 +138,14 @@ so it is unsafe to use it afterwards (the column length correctness will be pres `v` must be an `AbstractMatrix` or an `AbstractDataFrame` (in the latter case column names must match); -`setindex!` on `SubDataFrame` (not created with `:` as column selector): +`setindex!` on `SubDataFrame`: * `sdf[row, col] = v` -> set value of `col` in row `row` to `v` in-place; * `sdf[CartesianIndex(row, col)] = v` -> the same as `sdf[row, col] = v`; * `sdf[row, cols] = v` -> the same as `dfr = df[row, cols]; dfr[:] = v` in-place; * `sdf[rows, col] = v` -> set rows `rows` of column `col`, in-place; `v` must be an abstract vector; * `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; - `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` when column names must match; + `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` + when column names must match; * `sdf[!, col] = v` -> replaces `col` with `v` with copying; if `col` is present in `sdf` then filtered-out rows in newly created vector are filled with values already present in that column; @@ -159,6 +160,13 @@ so it is unsafe to use it afterwards (the column length correctness will be pres filtered-out rows in newly created vectors are filled with values already present in respective columns; +!!! note + + The rules above mean that `sdf[:, col] = v` is an in-place operation if `col` is present in `sdf`, + therefore it will be fast in general. On the other hand using `sdf[!, col] = v` + or `sdf.col = v` will always allocate a new vector which is more expensive computationally. 
+ + `setindex!` on `DataFrameRow`: * `dfr[col] = v` -> set value of `col` in row `row` to `v` in-place; equivalent to `dfr.col = v` if `col` is a valid identifier; diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 3615a98d3e..b9c573fb51 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -815,10 +815,13 @@ julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) """ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...; makeunique::Bool=false, copycols::Bool=true) - if !is_column_adding_allowed(df) + if !is_column_insertion_allowed(df) throw(ArgumentError("insertcols! is only supported for DataFrame, or for " * "SubDataFrame created with `:` as column selector")) end + if !(copycols || df isa DataFrame) + throw(ArgumentError("copycols=false is only allowed if df isa DataFrame ")) + end col_ind = Int(col isa SymbolOrString ? columnindex(df, col) : col) if !(0 < col_ind <= ncol(df) + 1) throw(ArgumentError("attempt to insert a column to a data frame with " * @@ -887,12 +890,13 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy if df isa DataFrame dfp = df else + @assert df isa SubDataFrame dfp = parent(df) - T = eltype(item_new) - newcol = Tables.allocatecolumn(Union{T, Missing}, nrow(dfp)) - fill!(newcol, missing) - newcol[rows(df)] = item_new - item_new = newcol + item_new_df = item_new + T = eltype(item_new_df) + item_new = similar(item_new_df, Union{T, Missing}, nrow(dfp)) + fill!(item_new, missing) + item_new[rows(df)] = item_new_df end firstindex(item_new) != 1 && _onebased_check_error() diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl index 6c02e70539..c84eb11726 100644 --- a/src/other/broadcasting.jl +++ b/src/other/broadcasting.jl @@ -105,7 +105,7 @@ function Base.dotview(df::AbstractDataFrame, ::Colon, cols::ColumnIndex) if !(cols isa SymbolOrString) throw(ArgumentError("creating new columns using an integer 
index is disallowed")) end - if !is_column_adding_allowed(df) + if !is_column_insertion_allowed(df) throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * "columns of its parent data frame is disallowed")) end @@ -117,7 +117,7 @@ function Base.dotview(df::AbstractDataFrame, ::typeof(!), cols) return ColReplaceDataFrame(df, convert(Vector{Int}, index(df)[cols])) end if cols isa SymbolOrString - if columnindex(df, cols) == 0 && !is_column_adding_allowed(df) + if columnindex(df, cols) == 0 && !is_column_insertion_allowed(df) throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * "columns of its parent data frame is disallowed")) end @@ -132,10 +132,11 @@ end if isdefined(Base, :dotgetproperty) function Base.dotgetproperty(df::AbstractDataFrame, col::SymbolOrString) if columnindex(df, col) == 0 - if !is_column_adding_allowed(df) + if !is_column_insertion_allowed(df) throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * "columns of its parent data frame is disallowed")) end + # TODO: double check that this is tested return LazyNewColDataFrame(df, Symbol(col)) else Base.depwarn("In the future this operation will allocate a new column " * @@ -148,7 +149,7 @@ end function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted{T}) where T df = lazydf.df if !haskey(index(df), lazydf.col) && df isa SubDataFrame && lazydf.col isa SymbolOrString - @assert is_column_adding_allowed(df) + @assert is_column_insertion_allowed(df) end if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}} bc_tmp = Base.Broadcast.Broadcasted{T}(bc.f, bc.args, ()) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index b0f51ecfb4..5ffa25db1c 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -182,7 +182,7 @@ end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) if colinds isa 
SymbolOrString && columnindex(sdf, colinds) == 0 - if !is_column_adding_allowed(sdf) + if !is_column_insertion_allowed(sdf) throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * "columns of its parent data frame is disallowed")) end @@ -191,7 +191,7 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, :: "equal to number of rows in the SubDataFrame")) end T = eltype(val) - newcol = Tables.allocatecolumn(Union{T, Missing}, nrow(parent(sdf))) + newcol = similar(val, Union{T, Missing}, nrow(parent(sdf))) fill!(newcol, missing) newcol[rows(sdf)] = val parent(sdf)[!, colinds] = newcol @@ -207,7 +207,7 @@ function Base.setindex!(sdf::SubDataFrame, v::AbstractVector, throw(ArgumentError("Cannot assign to non-existent column: $col_ind")) end if col_ind isa SymbolOrString && columnindex(sdf, col_ind) == 0 - if !is_column_adding_allowed(sdf) + if !is_column_insertion_allowed(sdf) throw(ArgumentError("creating new columns in a SubDataFrame that subsets " * "columns of its parent data frame is disallowed")) end @@ -218,6 +218,7 @@ function Base.setindex!(sdf::SubDataFrame, v::AbstractVector, old_col = pdf[!, p_col_ind] T = eltype(old_col) S = eltype(v) + # TODO: change to similar when promote_type vs Base.promote_typejoin decision is made newcol = Tables.allocatecolumn(promote_type(T, S), length(old_col)) newcol .= old_col newcol[rows(sdf)] = v @@ -302,7 +303,7 @@ Base.convert(::Type{DataFrame}, sdf::SubDataFrame) = DataFrame(sdf) # currently it is only allowed when SubDataFrame was created with : as column selector # which results in using Index as its index (as opposed to other columns selectors # which result in SubIndex) -function is_column_adding_allowed(df::AbstractDataFrame) +function is_column_insertion_allowed(df::AbstractDataFrame) if df isa DataFrame return true elseif df isa SubDataFrame @@ -312,24 +313,40 @@ function is_column_adding_allowed(df::AbstractDataFrame) end function _replace_columns!(sdf::SubDataFrame, 
newdf::DataFrame) - if _names(sdf) == _names(newdf) - for col in _names(newdf) - sdf[!, col] = newdf[!, col] - end - return sdf - end + colsmatch = _names(sdf) == _names(newdf) - if !is_column_adding_allowed(sdf) + if !(colsmatch || is_column_insertion_allowed(sdf)) throw(ArgumentError("changing the sequence of column names in a SubDataFrame " * "that subsets columns of its parent data frame is disallowed")) end - psdf = parent(sdf) - @assert psdf isa DataFrame for colname in _names(newdf) - sdf[!, colname] = newdf[!, colname] + oldcol = sdf[!, colname] + newcol = newdf[!, colname] + # We perform an in-place operation if possible for performance. + # This has an additional effect that for CategoricalVector levels + # and ordering will be retained or not depending on which code patch is taken. + + # TODO: add tests when promote_type vs Base.promote_typejoin decision is made + if eltype(newcol) <: eltype(oldcol) + sdf[:, colname] = newcol + else + sdf[!, colname] = newcol + end + end + + # If columns did not match this means that we have either: + # 1. inserted some columns into pdf + # or + # 2. requested to reorder the existing columns + # and that operation was allowed. + # Therefore we need to update the parent of sdf in place to make sure + # it holds only the required target columns in a correct order. 
+ if !colsmatch + pdf = parent(sdf) + @assert pdf isa DataFrame + select!(pdf, _names(newdf)) end - select!(psdf, _names(newdf)) return sdf end diff --git a/test/indexing.jl b/test/indexing.jl index 4670df2f54..8da25732e2 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -1302,7 +1302,7 @@ end @test_throws BoundsError sdf[:, 4] = ["a", "b", "c"] @test_throws DimensionMismatch sdf[:, 1] = [1] @test_throws MethodError sdf[:, 1] = 1 - if DataFrames.is_column_adding_allowed(sdf) + if DataFrames.is_column_insertion_allowed(sdf) sdf[:, :z] = ["a", "b", "c"] @test df.z == ["a", "b", "c"] @test eltype(df.z) == Union{String, Missing} @@ -1320,7 +1320,7 @@ end sdf[:, names(sdf)[1]] = 10:12 @test df == DataFrame(a=10:12, b=4:6, c=7:9) @test_throws MethodError sdf[:, names(sdf)[1]] = ["a", "b", "c"] - if DataFrames.is_column_adding_allowed(sdf) + if DataFrames.is_column_insertion_allowed(sdf) sdf[:, "z"] = ["a", "b", "c"] @test df.z == ["a", "b", "c"] select!(df, 1:3) diff --git a/test/subdataframe_mutation.jl b/test/subdataframe_mutation.jl index ab61d510c9..d5db9e80e0 100644 --- a/test/subdataframe_mutation.jl +++ b/test/subdataframe_mutation.jl @@ -1391,6 +1391,8 @@ end df = DataFrame(a=1:5, b=11:15) sdf = @view df[:, :] + @test_throws ArgumentError insertcols!(sdf, :c => 1, copycols=false) + @test df == DataFrame(a=1:5, b=11:15) insertcols!(sdf, :c => 1) @test df == DataFrame(a=1:5, b=11:15, c=1) @test eltype(df.c) === Union{Int, Missing} From cadc128b268be940f9f7620e97b41bc669749d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 25 Aug 2021 11:05:13 +0200 Subject: [PATCH 23/29] fix fast path in select!/transform!
--- src/subdataframe/subdataframe.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 5ffa25db1c..c8a66da36a 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -321,17 +321,19 @@ function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame) end for colname in _names(newdf) - oldcol = sdf[!, colname] + oldcol_idx = columnindex(sdf, colname) newcol = newdf[!, colname] # We perform an in-place operation if possible for performance. # This has an additional effect that for CategoricalVector levels # and ordering will be retained or not depending on which code patch is taken. # TODO: add tests when promote_type vs Base.promote_typejoin decision is made - if eltype(newcol) <: eltype(oldcol) + if oldcol_idx == 0 sdf[:, colname] = newcol + elseif eltype(newcol) <: eltype(sdf[!, oldcol_idx]) + sdf[:, oldcol_idx] = newcol else - sdf[!, colname] = newcol + sdf[!, oldcol_idx] = newcol end end From 4ad940c420735a3e0280a1d05810fcf797d8f8c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 29 Aug 2021 11:40:16 +0200 Subject: [PATCH 24/29] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/lib/indexing.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 5251780cd8..56e759660e 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -145,7 +145,7 @@ so it is unsafe to use it afterwards (the column length correctness will be pres * `sdf[rows, col] = v` -> set rows `rows` of column `col`, in-place; `v` must be an abstract vector; * `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` - when column names must match; + in which case column names must match; * `sdf[!, col] = v` -> replaces `col` with `v` with 
copying; if `col` is present in `sdf` then filtered-out rows in newly created vector are filled with values already present in that column; @@ -164,7 +164,7 @@ so it is unsafe to use it afterwards (the column length correctness will be pres The rules above mean that `sdf[:, col] = v` is an in-place operation if `col` is present in `sdf`, therefore it will be fast in general. On the other hand using `sdf[!, col] = v` - or `sdf.col = v` will always allocate a new vector which is more expensive computationally. + or `sdf.col = v` will always allocate a new vector, which is more expensive computationally. `setindex!` on `DataFrameRow`: From 8f134ef7f912294a26187139be999c6800e04baf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 29 Aug 2021 12:30:15 +0200 Subject: [PATCH 25/29] changes after code review and promote_type decision --- docs/src/lib/indexing.md | 6 ++-- src/dataframe/dataframe.jl | 8 ++--- src/subdataframe/subdataframe.jl | 21 ++++++++----- test/subdataframe_mutation.jl | 54 ++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 13 deletions(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 56e759660e..386ab2150c 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -148,7 +148,8 @@ so it is unsafe to use it afterwards (the column length correctness will be pres in which case column names must match; * `sdf[!, col] = v` -> replaces `col` with `v` with copying; if `col` is present in `sdf` then filtered-out rows in newly created vector are filled with - values already present in that column; + values already present in that column and `promote_type` is used + to determine the `eltype` of the new column; if `col` is not present in `sdf` then the operation is only allowed if `sdf` was created with `:` as column selector, in which case filtered-out rows are filled with `missing`; @@ -158,7 +159,8 @@ so it is unsafe to use it afterwards (the column length correctness will 
be pres `v` must be an `AbstractMatrix` or an `AbstractDataFrame` (in the latter case column names must match); filtered-out rows in newly created vectors are filled with - values already present in respective columns; + values already present in respective columns + and `promote_type` is used to determine the `eltype` of the new columns; !!! note diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index b9c573fb51..8111336d93 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -892,11 +892,11 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy else @assert df isa SubDataFrame dfp = parent(df) - item_new_df = item_new - T = eltype(item_new_df) - item_new = similar(item_new_df, Union{T, Missing}, nrow(dfp)) + item_new_orig = item_new + T = eltype(item_new_orig) + item_new = similar(item_new_orig, Union{T, Missing}, nrow(dfp)) fill!(item_new, missing) - item_new[rows(df)] = item_new_df + item_new[rows(df)] = item_new_orig end firstindex(item_new) != 1 && _onebased_check_error() diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index c8a66da36a..e79d667075 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -201,6 +201,11 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, :: return sdf end +# TODO: in the future, when refactoring source code +# (presumably when we would first define all the types that the package provides +# and then define methods for them) +# consider merging SubDataFrame and DataFrame setindex! 
methods + function Base.setindex!(sdf::SubDataFrame, v::AbstractVector, ::typeof(!), col_ind::ColumnIndex) if col_ind isa Union{Signed, Unsigned} && !(1 <= col_ind <= ncol(sdf)) @@ -218,8 +223,7 @@ function Base.setindex!(sdf::SubDataFrame, v::AbstractVector, old_col = pdf[!, p_col_ind] T = eltype(old_col) S = eltype(v) - # TODO: change to similar when promote_type vs Base.promote_typejoin decision is made - newcol = Tables.allocatecolumn(promote_type(T, S), length(old_col)) + newcol = similar(old_col, promote_type(T, S), length(old_col)) newcol .= old_col newcol[rows(sdf)] = v pdf[!, p_col_ind] = newcol @@ -326,14 +330,17 @@ function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame) # We perform an in-place operation if possible for performance. # This has an additional effect that for CategoricalVector levels # and ordering will be retained or not depending on which code patch is taken. - - # TODO: add tests when promote_type vs Base.promote_typejoin decision is made if oldcol_idx == 0 sdf[:, colname] = newcol - elseif eltype(newcol) <: eltype(sdf[!, oldcol_idx]) - sdf[:, oldcol_idx] = newcol else - sdf[!, oldcol_idx] = newcol + oldcol = sdf[!, oldcol_idx] + # if oldcol is a view of Vector and the eltype of new values is supported + # by eltype of old values we perform an in-place operation as it will be faster + if parent(oldcol) isa Vector && eltype(newcol) <: eltype(oldcol) + sdf[:, oldcol_idx] = newcol + else + sdf[!, oldcol_idx] = newcol + end end end diff --git a/test/subdataframe_mutation.jl b/test/subdataframe_mutation.jl index d5db9e80e0..49010563bf 100644 --- a/test/subdataframe_mutation.jl +++ b/test/subdataframe_mutation.jl @@ -1590,4 +1590,58 @@ end e=[missing, 22, 23, 24, missing, missing]) end +@testset "promote_type tests" begin + df = DataFrame(a=1:4) + sdf = @view df[1:1, :] + sdf[!, 1] = [1.5] + @test df.a == [1.5, 2, 3, 4] + @test eltype(df.a) === Float64 + + # note that CategoricalVector is dropped as + # similar(::CategoricalVector, 
String, length) + # produces a Vector{String} + df = DataFrame(a=categorical(string.(1:4))) + sdf = @view df[1:1, :] + sdf[!, 1] = ["a"] + @test df.a == ["a", "2", "3", "4"] + @test df.a isa Vector{String} + + df = DataFrame(a=categorical(string.(1:4))) + sdf = @view df[1:1, :] + sdf[!, 1] = categorical(["a"]) + @test df.a isa CategoricalVector{String} + @test df.a == ["a", "2", "3", "4"] + # we first copy old data and then add new data so "1" is in levels + # although it is not present in df.a + @test levels(df.a) == ["1", "2", "3", "4", "a"] + + df = DataFrame(a=1:4) + a = df.a + sdf = @view df[1:1, :] + select!(sdf, :a => (x -> x) => :a) + @test df.a === a + select!(sdf, :a => (x -> [1.5]) => :a) + @test df.a == [1.5, 2, 3, 4] + @test eltype(df.a) === Float64 + @test a == 1:4 + + df = DataFrame(a=collect(Any, 1:4)) + a = df.a + sdf = @view df[1:1, :] + select!(sdf, :a => (x -> x) => :a) + @test df.a === a + select!(sdf, :a => (x -> [1.5]) => :a) + @test df.a == [1.5, 2, 3, 4] + @test df.a === a + + df = DataFrame(a=PooledArray(1:4)) + a = df.a + sdf = @view df[1:1, :] + select!(sdf, :a => (x -> x) => :a) + @test df.a == a + @test df.a !== a + # we keep PooledVector though as similar of PooledVector is PooledVector + @test df.a isa PooledVector{Int} +end + end # module \ No newline at end of file From 1f4aa7886f8357451bd46d361bc058bbeea2323c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 29 Aug 2021 13:20:15 +0200 Subject: [PATCH 26/29] fix tests --- test/subdataframe_mutation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/subdataframe_mutation.jl b/test/subdataframe_mutation.jl index 49010563bf..a2f3c55ca1 100644 --- a/test/subdataframe_mutation.jl +++ b/test/subdataframe_mutation.jl @@ -1,6 +1,6 @@ module TestIndexing -using Test, DataFrames, CategoricalArrays +using Test, DataFrames, CategoricalArrays, PooledArrays const ≅ = isequal From 55a6d757716e77d756577e63bd1aac0548df6ea8 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 29 Aug 2021 23:09:40 +0200 Subject: [PATCH 27/29] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- test/broadcasting.jl | 2 +- test/indexing.jl | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/broadcasting.jl b/test/broadcasting.jl index 0b8ff9f10d..bbbd6f5de0 100644 --- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -1611,7 +1611,7 @@ end df = view(copy(refdf), :, :) if VERSION >= v"1.7" df.newcol .= 'd' - df.newcol == fill('d', 3) + @test df.newcol == fill('d', 3) else @test_throws ArgumentError df.newcol .= 'd' @test df == refdf diff --git a/test/indexing.jl b/test/indexing.jl index 8da25732e2..2dfa35e459 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -1855,6 +1855,7 @@ end dfv = @view df[:, :] dfv.a = [5] @test df == DataFrame(a=5) + @test eltype(df.a) === Int dfv."a" = [6] @test df == DataFrame(a=6) @test eltype(df.a) === Int From 93de0646ef7059a94fa43f7cb5274be558c4d862 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 29 Aug 2021 23:44:17 +0200 Subject: [PATCH 28/29] apply changes after code review --- src/subdataframe/subdataframe.jl | 19 +----- test/broadcasting.jl | 15 +++-- test/indexing.jl | 2 + test/subdataframe_mutation.jl | 109 +++++++++++++++++++++---------- 4 files changed, 89 insertions(+), 56 deletions(-) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index e79d667075..02e2ba6541 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -325,23 +325,8 @@ function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame) end for colname in _names(newdf) - oldcol_idx = columnindex(sdf, colname) - newcol = newdf[!, colname] - # We perform an in-place operation if possible for performance. 
- # This has an additional effect that for CategoricalVector levels - # and ordering will be retained or not depending on which code patch is taken. - if oldcol_idx == 0 - sdf[:, colname] = newcol - else - oldcol = sdf[!, oldcol_idx] - # if oldcol is a view of Vector and the eltype of new values is supported - # by eltype of old values we perform an in-place operation as it will be faster - if parent(oldcol) isa Vector && eltype(newcol) <: eltype(oldcol) - sdf[:, oldcol_idx] = newcol - else - sdf[!, oldcol_idx] = newcol - end - end + # This will allocate a fresh column in parent(sdf) for each colname + sdf[!, colname] = newdf[!, colname] end # If columns did not match this means that we have either: diff --git a/test/broadcasting.jl b/test/broadcasting.jl index bbbd6f5de0..2d37f9e178 100644 --- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -150,9 +150,10 @@ end dfv = @view df[1:2, 2:end] dfv[!, 1] .+= 100 @test df.x2 == [104.5, 105.5, 6.5] + # reverse the performed operations df.x1 -= [1, 1, 1] df.x2 -= [100, 100, 0] - @test df == refdf + @test df == DataFrame(reshape(1.5:15.5, (3, 5)), :auto) df = copy(refdf) df[:, 1] .+= 1 @@ -215,20 +216,26 @@ end # test a more complex broadcasting pattern df = copy(refdf) df[!, 1] .+= [0, 1, 2] .+ 1 - @test df.x1 == [2.5, 4.5, 6.5] + @test df.x1 == df[!, 1] == [2.5, 4.5, 6.5] @test df[:, 2:end] == refdf[:, 2:end] df = copy(refdf) df[!, "x1"] .+= [0, 1, 2] .+ 1 - @test df."x1" == [2.5, 4.5, 6.5] + @test df."x1" == df[!, 1] == [2.5, 4.5, 6.5] @test df[:, Not("x1")] == refdf[:, 2:end] + df = copy(refdf) dfv = @view df[1:2, 2:end] dfv[!, 1] .+= [0, 1] .+ 1 - @test df.x2 == [5.5, 7.5, 6.5] + @test df == DataFrame([1.5 5.5 7.5 10.5 13.5 + 2.5 7.5 8.5 11.5 14.5 + 3.5 6.5 9.5 12.5 15.5], :auto) dfv = @view df[1:2, 2:end] @test_throws ArgumentError dfv[!, "x1"] .+= [0, 1] .+ 1 + @test df == DataFrame([1.5 5.5 7.5 10.5 13.5 + 2.5 7.5 8.5 11.5 14.5 + 3.5 6.5 9.5 12.5 15.5], :auto) df = copy(refdf) df.x1 .+= [0, 1, 2] .+ 1 diff 
--git a/test/indexing.jl b/test/indexing.jl index 2dfa35e459..a322019d33 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -1302,6 +1302,7 @@ end @test_throws BoundsError sdf[:, 4] = ["a", "b", "c"] @test_throws DimensionMismatch sdf[:, 1] = [1] @test_throws MethodError sdf[:, 1] = 1 + @test DataFrames.is_column_insertion_allowed(sdf) == (DataFrames.index(sdf) isa DataFrames.Index) if DataFrames.is_column_insertion_allowed(sdf) sdf[:, :z] = ["a", "b", "c"] @test df.z == ["a", "b", "c"] @@ -1320,6 +1321,7 @@ end sdf[:, names(sdf)[1]] = 10:12 @test df == DataFrame(a=10:12, b=4:6, c=7:9) @test_throws MethodError sdf[:, names(sdf)[1]] = ["a", "b", "c"] + @test DataFrames.is_column_insertion_allowed(sdf) == (DataFrames.index(sdf) isa DataFrames.Index) if DataFrames.is_column_insertion_allowed(sdf) sdf[:, "z"] = ["a", "b", "c"] @test df.z == ["a", "b", "c"] diff --git a/test/subdataframe_mutation.jl b/test/subdataframe_mutation.jl index a2f3c55ca1..81360e20d0 100644 --- a/test/subdataframe_mutation.jl +++ b/test/subdataframe_mutation.jl @@ -1146,8 +1146,8 @@ end tmpa = df.a sdf[:, [:c, :b, :a]] = DataFrame(c=[5, 6], b=[1.0, 2.0], a=[13, 12]) @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11.0, 2.0, 1.0, 14.0, 15.0], - c=[21, 6, 5, 24, 25]) + b=[11.0, 2.0, 1.0, 14.0, 15.0], + c=[21, 6, 5, 24, 25]) @test tmpa === df.a @test_throws ArgumentError sdf[:, [:c, :b, :a]] = DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) @@ -1160,8 +1160,8 @@ end tmpa = df.a sdf[:, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11, 2, 1, 14, 15], - c=21:25) + b=[11, 2, 1, 14, 15], + c=21:25) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1175,8 +1175,8 @@ end tmpa = df.a sdf[:, cols] = DataFrame(a=[13, 12], b=[1.0, 2.0]) @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11, 2, 1, 14, 15], - c=21:25) + b=[11, 2, 1, 14, 15], + c=21:25) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1189,8 
+1189,8 @@ end tmpa = df.a sdf[:, [:c, :b, :a]] = [100 101 102; 103 104 105] @test df == DataFrame(a=[1, 105, 102, 4, 5], - b=[11.0, 104, 101, 14.0, 15.0], - c=[21, 103, 100, 24, 25]) + b=[11.0, 104, 101, 14.0, 15.0], + c=[21, 103, 100, 24, 25]) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1206,8 +1206,8 @@ end tmpa = df.a sdf[:, cols] = [1.0 3.0; 2.0 4.0] @test df == DataFrame(a=[1, 2, 1, 4, 5], - b=[11, 4, 3, 14, 15], - c=21:25) + b=[11, 4, 3, 14, 15], + c=21:25) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1222,8 +1222,8 @@ end tmpa = df.a sdf[:, cols] = [1.0 3.0; 2.0 4.0] @test df == DataFrame(a=[1, 2, 1, 4, 5], - b=[11, 4, 3, 14, 15], - c=21:25) + b=[11, 4, 3, 14, 15], + c=21:25) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1239,8 +1239,8 @@ end tmpa = df.a sdf[:, [:c, :b, :a]] .= DataFrame(c=[100, 101], b=[1.0, 2.0], a=[13, 12]) @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11, 2, 1, 14, 15], - c=[21, 101, 100, 24, 25]) + b=[11, 2, 1, 14, 15], + c=[21, 101, 100, 24, 25]) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1248,8 +1248,8 @@ end sdf[:, [:c, :b, :a]] .= [100, 200] @test df == DataFrame(a=[1, 200, 100, 4, 5], - b=[11, 200, 100, 14, 15], - c=[21, 200, 100, 24, 25]) + b=[11, 200, 100, 14, 15], + c=[21, 200, 100, 24, 25]) @test_throws ArgumentError sdf[:, [:c, :b, :a]] .= DataFrame(d=["c", "d"], b=[1.0, 2.0], a=[13, 12]) @test_throws ArgumentError sdf[:, [:c, :b, :a]] .= DataFrame(a=["c", "d"], b=[1.0, 2.0], c=[13, 12]) @@ -1261,16 +1261,16 @@ end tmpa = df.a sdf[:, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11, 2, 1, 14, 15], - c=21:25) + b=[11, 2, 1, 14, 15], + c=21:25) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int sdf[:, cols] .= [100 200] @test df == DataFrame(a=[1, 100, 100, 4, 5], - b=[11, 200, 200, 14, 15], - c=21:25) + b=[11, 200, 200, 14, 15], + 
c=21:25) @test_throws ArgumentError sdf[:, cols] .= DataFrame(b=[1.0, 2.0], a=[13, 12]) end @@ -1281,16 +1281,16 @@ end tmpa = df.a sdf[:, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0]) @test df == DataFrame(a=[1, 12, 13, 4, 5], - b=[11, 2, 1, 14, 15], - c=21:25) + b=[11, 2, 1, 14, 15], + c=21:25) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int sdf[:, cols] .= 100 @test df == DataFrame(a=[1, 100, 100, 4, 5], - b=[11, 100, 100, 14, 15], - c=21:25) + b=[11, 100, 100, 14, 15], + c=21:25) @test_throws DimensionMismatch sdf[:, cols] .= DataFrame(a=[13, 12], b=[1.0, 2.0], c=1) end @@ -1301,8 +1301,8 @@ end tmpa = df.a sdf[:, [:c, :b, :a]] .= [100 101 102; 103 104 105] @test df == DataFrame(a=[1, 105, 102, 4, 5], - b=[11.0, 104, 101, 14.0, 15.0], - c=[21, 103, 100, 24, 25]) + b=[11.0, 104, 101, 14.0, 15.0], + c=[21, 103, 100, 24, 25]) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1318,8 +1318,8 @@ end tmpa = df.a sdf[:, cols] .= [1.0 3.0; 2.0 4.0] @test df == DataFrame(a=[1, 2, 1, 4, 5], - b=[11, 4, 3, 14, 15], - c=21:25) + b=[11, 4, 3, 14, 15], + c=21:25) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1334,8 +1334,8 @@ end tmpa = df.a sdf[:, cols] .= [1.0 3.0; 2.0 4.0] @test df == DataFrame(a=[1, 2, 1, 4, 5], - b=[11, 4, 3, 14, 15], - c=21:25) + b=[11, 4, 3, 14, 15], + c=21:25) @test tmpa === df.a @test eltype(df.a) == Int @test eltype(df.b) == Int @@ -1592,9 +1592,18 @@ end @testset "promote_type tests" begin df = DataFrame(a=1:4) + a = df.a sdf = @view df[1:1, :] + sdf[!, 1] = [0] + @test df.a == [0, 2, 3, 4] + @test df.a != a + @test a == 1:4 + @test eltype(df.a) === Int + a = df.a sdf[!, 1] = [1.5] @test df.a == [1.5, 2, 3, 4] + @test df.a != a + @test a == [0, 2, 3, 4] @test eltype(df.a) === Float64 # note that CategoricalVector is dropped as @@ -1611,15 +1620,21 @@ end sdf[!, 1] = categorical(["a"]) @test df.a isa CategoricalVector{String} @test df.a == ["a", "2", "3", "4"] - # we 
first copy old data and then add new data so "1" is in levels - # although it is not present in df.a @test levels(df.a) == ["1", "2", "3", "4", "a"] df = DataFrame(a=1:4) a = df.a sdf = @view df[1:1, :] select!(sdf, :a => (x -> x) => :a) - @test df.a === a + @test df.a !== a + @test df.a == a == 1:4 + @test eltype(df.a) === Int + a = df.a + select!(sdf, :a => (x -> [1]) => :a) + @test df.a !== a + @test df.a == a == 1:4 + @test eltype(df.a) === Int + a = df.a select!(sdf, :a => (x -> [1.5]) => :a) @test df.a == [1.5, 2, 3, 4] @test eltype(df.a) === Float64 @@ -1629,10 +1644,22 @@ end a = df.a sdf = @view df[1:1, :] select!(sdf, :a => (x -> x) => :a) - @test df.a === a + @test df.a == a + @test a == 1:4 + @test df.a !== a + @test eltype(df.a) === Any + a = df.a + select!(sdf, :a => (x -> [1]) => :a) + @test df.a == a + @test a == 1:4 + @test df.a !== a + @test eltype(df.a) === Any + a = df.a select!(sdf, :a => (x -> [1.5]) => :a) @test df.a == [1.5, 2, 3, 4] - @test df.a === a + @test df.a !== a + @test a == 1:4 + @test eltype(df.a) === Any df = DataFrame(a=PooledArray(1:4)) a = df.a @@ -1640,8 +1667,20 @@ end select!(sdf, :a => (x -> x) => :a) @test df.a == a @test df.a !== a + @test a == 1:4 # we keep PooledVector though as similar of PooledVector is PooledVector @test df.a isa PooledVector{Int} + select!(sdf, :a => (x -> [1]) => :a) + @test df.a == a + @test a == 1:4 + @test df.a !== a + @test df.a isa PooledVector{Int} + a = df.a + select!(sdf, :a => (x -> [1.5]) => :a) + @test df.a == [1.5, 2, 3, 4] + @test df.a !== a + @test a == 1:4 + @test df.a isa PooledVector{Float64} end end # module \ No newline at end of file From f25d333a1329cb59704ce79a3cf393f7a0871e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 1 Sep 2021 08:03:31 +0200 Subject: [PATCH 29/29] Update NEWS.md --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 25edd4d59a..318e64dc99 100644 --- a/NEWS.md +++ 
b/NEWS.md @@ -30,7 +30,8 @@ `insertcols!`, `setindex!`, and broadcasted assignment allow for creation of new columns, automatically filling filtered-out rows with `missing` values; -* Allow replacing existing columns in a `SubDataFrame` with `!` as row selector in assignment and broadcasted assignment +* Allow replacing existing columns in a `SubDataFrame` with `!` as row selector + in assignment and broadcasted assignment ([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)). Assignment to existing columns allocates a new column.