@@ -17,122 +17,210 @@ dcast <- function(data, formula, fun.aggregate = NULL, ..., margins = NULL,
17
17
subset = subset , fill = fill , value.var = value.var )
18
18
}
19
19
20
+ check_formula <- function (formula , varnames , valnames ) {
21
+ if (is.character(formula )) formula = as.formula(formula )
22
+ if (class(formula ) != " formula" || length(formula ) != 3L )
23
+ stop(" Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c." )
24
+ vars = all.vars(formula )
25
+ vars = vars [! vars %chin % c(" ." , " ..." )]
26
+ allvars = c(vars , valnames )
27
+ ans = deparse_formula(as.list(formula )[- 1L ], varnames , allvars )
28
+ }
29
+
30
+ deparse_formula <- function (expr , varnames , allvars ) {
31
+ lvars = lapply(expr , function (this ) {
32
+ if (is.call(this )) {
33
+ if (this [[1L ]] == quote(`+` ))
34
+ unlist(deparse_formula(as.list(this )[- 1L ], varnames , allvars ))
35
+ else this
36
+ } else if (is.name(this )) {
37
+ if (this == quote(`...` )) {
38
+ subvars = setdiff(varnames , allvars )
39
+ lapply(subvars , as.name )
40
+ } else this
41
+ }
42
+ })
43
+ lvars = lapply(lvars , function (x ) if (length(x ) && ! is.list(x )) list (x ) else x )
44
+ }
45
+
46
+ value_vars <- function (value.var , varnames ) {
47
+ if (is.character(value.var ))
48
+ value.var = list (value.var )
49
+ value.var = lapply(value.var , unique )
50
+ valnames = unique(unlist(value.var ))
51
+ iswrong = which(! valnames %in% varnames )
52
+ if (length(iswrong ))
53
+ stop(" value.var values [" , paste(value.var [iswrong ], collapse = " , " ), " ] are not found in 'data'." )
54
+ value.var
55
+ }
56
+
57
+ aggregate_funs <- function (funs , vals , ... ) {
58
+ if (is.call(funs ) && funs [[1L ]] == " eval" )
59
+ funs = eval(funs [[2L ]], parent.frame(2L ), parent.frame(2L ))
60
+ if (is.call(funs ) && as.character(funs [[1L ]]) %in% c(" c" , " list" ))
61
+ funs = lapply(as.list(funs )[- 1L ], function (x ) {
62
+ if (is.call(x )) as.list(x )[- 1L ] else x
63
+ })
64
+ else funs = list (funs )
65
+ if (length(funs ) != length(vals )) {
66
+ if (length(vals ) == 1L )
67
+ vals = replicate(length(funs ), vals )
68
+ else stop(" When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be either of length =1 or =length(fun.aggregate)." )
69
+ }
70
+ dots = list (... )
71
+ construct_funs <- function (fun , val ) {
72
+ if (is.name(fun )) fun = list (fun )
73
+ ans = vector(" list" , length(fun )* length(val ))
74
+ nms = vector(" character" , length(ans ))
75
+ k = 1L
76
+ for (i in fun ) {
77
+ for (j in val ) {
78
+ expr = list (i , as.name(j ))
79
+ if (length(dots ))
80
+ expr = c(expr , dots )
81
+ ans [[k ]] = as.call(expr )
82
+ nms [k ] = paste(all.names(i , max.names = 1L , functions = TRUE ), j , sep = " _" )
83
+ k = k + 1L ;
84
+ }
85
+ }
86
+ setattr(ans , ' names' , nms )
87
+ }
88
+ ans = lapply(seq_along(funs ), function (i ) construct_funs(funs [[i ]], vals [[i ]]))
89
+ as.call(c(quote(list ), unlist(ans )))
90
+ }
91
+
20
92
dcast.data.table <- function (data , formula , fun.aggregate = NULL , ... , margins = NULL ,
21
93
subset = NULL , fill = NULL , drop = TRUE , value.var = guess(data ), verbose = getOption(" datatable.verbose" )) {
22
94
if (! is.data.table(data )) stop(" 'data' must be a data.table." )
23
95
if (anyDuplicated(names(data ))) stop(' data.table to cast must have unique column names' )
24
- is.formula <- function ( x ) class( x ) == " formula "
25
- strip <- function ( x ) gsub( " [[:space:]]* " , " " , x )
26
- if (is.formula( formula )) formula <- deparse( formula , 500 )
27
- if (is.character( formula )) {
28
- ff <- strsplit(strip( formula ), " ~ " , fixed = TRUE )[[ 1 ]]
29
- if (length(ff ) > 2 )
30
- stop( " Cast formula length is > 2, must be = 2. " )
31
- ff <- strsplit( ff , " + " , fixed = TRUE )
32
- setattr( ff , ' names ' , c( " ll " , " rr " ))
33
- ff <- lapply( ff , function ( x ) x [ x != " . " ])
34
- ff_ <- unlist( ff , use.names = FALSE )
35
- replace_dots <- function ( x ) {
36
- if ( ! is.list( x )) x = as.list( x )
37
- for ( i in seq_along( x )) {
38
- if ( x [[ i ]] == " ... " )
39
- x [[ i ]] = setdiff(names( data ), c( value.var , ff_ ))
40
- }
41
- unlist( x )
42
- }
43
- ff <- lapply( ff , replace_dots )
44
- } else stop( " Invalid formula. " )
45
- ff_ <- unlist( ff , use.names = FALSE )
46
- if (length( is_wrong <- which(is.na(chmatch( ff_ , names( data ))))) > 0 ) stop( " Column ' " , ff_ [ is_wrong [ 1 ]], " ' not found. " )
47
- if (length( ff $ ll ) == 0 ) stop( " LHS of formula evaluates to 'character(0)', invalid formula. " )
48
- if (length( value.var ) != 1 || ! is.character( value.var )) stop( " 'value.var' must be a character vector of length 1. " )
49
- if (is.na(chmatch( value.var , names( data )))) stop( " 'value.var' column ' " , value.var , " ' not found. " )
50
- if (any(unlist(lapply(as.list( data )[ ff_ ], class ), use.names = FALSE ) == " list " ))
51
- stop( " Only 'value.var' column maybe of type 'list'. This may change in the future. " )
52
- drop <- as.logical( drop [ 1 ] )
53
- if (is.na( drop )) stop( " 'drop' must be TRUE/FALSE " )
54
-
55
- # subset
56
- m <- as.list(match.call()[- 1 ])
57
- subset <- m $ subset [[ 2 ]]
96
+ drop = as.logical( drop [ 1 ])
97
+ if (is.na( drop )) stop( " 'drop' must be logical TRUE/FALSE " )
98
+ lvals = value_vars( value.var , names( data ) )
99
+ valnames = unique(unlist( lvals ))
100
+ lvars = check_formula( formula , names( data ), valnames )
101
+ lvars = lapply( lvars , function ( x ) if (! length(x )) quote( `.` ) else x )
102
+ # tired of lapply and the way it handles environments!
103
+ allcols = c(unlist( lvars ), lapply( valnames , as.name ) )
104
+ dat = vector( " list " , length( allcols ))
105
+ for ( i in seq_along( allcols )) {
106
+ x = allcols [[ i ]]
107
+ dat [[ i ]] = if (identical( x , quote( `.` ))) rep( " . " , nrow( data ))
108
+ else eval( x , data , parent.frame() )
109
+ if (is.function( dat [[ i ]]))
110
+ stop( " Column [ " , deparse( x ), " ] not found or of unknown type. " )
111
+ }
112
+ setattr( lvars , ' names ' , c( " lhs " , " rhs " ))
113
+ # Have to take care of duplicate names, and provide names for expression columns properly.
114
+ varnames = make.unique(sapply(unlist( lvars ), all.vars , max.names = 1L ), sep = " _ " )
115
+ dupidx = which( valnames %in% varnames )
116
+ if (length( dupidx )) {
117
+ dups = valnames [ dupidx ]
118
+ valnames = tail(make.unique(c( varnames , valnames )), - length( varnames ) )
119
+ lvals = lapply( lvals , function ( x ) { x [ x %in% dups ] = valnames [ dupidx ]; x } )
120
+ }
121
+ lhsnames = head( varnames , length( lvars $ lhs ) )
122
+ rhsnames = tail( varnames , - length( lvars $ lhs ))
123
+ setattr( dat , ' names ' , c( varnames , valnames ) )
124
+ setDT( dat )
125
+ if (any(sapply(as.list( dat )[ varnames ], is.list ))) {
126
+ stop( " Columns specified in formula can not be of type list " )
127
+ }
128
+ m <- as.list(match.call()[- 1L ])
129
+ subset <- m [[ " subset" ]][[ 2L ]]
58
130
if (! is.null(subset )) {
59
131
if (is.name(subset )) subset = as.call(list (quote(`(` ), subset ))
60
- data = data [eval(subset , data , parent.frame()), unique(c(ff_ , value.var )), with = FALSE ]
132
+ idx = which(eval(subset , data , parent.frame())) # any advantage thro' secondary keys?
133
+ dat = .Call(CsubsetDT , dat , idx , seq_along(dat ))
61
134
}
62
- if (nrow(data ) == 0L || ncol(data ) == 0L ) stop(" Can't 'cast' on an empty data.table" )
63
-
64
- # set 'fun.aggregate = length' if max group size > 1
65
- fun.null = FALSE
66
- if (is.null(fun.aggregate )) {
67
- fun.null = TRUE
68
- oo = forderv(data , by = ff_ , retGrp = TRUE )
135
+ if (! nrow(dat ) || ! ncol(dat )) stop(" Can not cast an empty data.table" )
136
+ fun.call = m [[" fun.aggregate" ]]
137
+ fill.default = NULL
138
+ if (is.null(fun.call )) {
139
+ oo = forderv(dat , by = varnames , retGrp = TRUE )
69
140
if (attr(oo , ' maxgrpn' ) > 1L ) {
70
141
message(" Aggregate function missing, defaulting to 'length'" )
71
- fun.aggregate <- length
72
- m [[" fun.aggregate" ]] = quote(length )
142
+ fun.call = quote(length )
73
143
}
74
144
}
75
- fill.default <- NULL
76
- if (! is.null(fun.aggregate )) { # construct the 'call'
77
- fill.default = fun.aggregate(data [[value.var ]][0 ], ... )
78
- if (! length(fill.default ) && (is.null(fill ) || ! length(fill )))
79
- stop(" Aggregating function provided to argument 'fun.aggregate' should always return a length 1 vector, but returns 0-length value for fun.aggregate(" , typeof(data [[value.var ]]), " (0))." , " This value will have to be used to fill missing combinations, if any, and therefore can not be of length 0. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately." )
80
- args <- c(" data" , " formula" , " margins" , " subset" , " fill" , " value.var" , " verbose" , " drop" )
81
- m <- m [setdiff(names(m ), args )]
82
- .CASTcall = as.call(c(m [1 ], as.name(value.var ), m [- 1 ])) # issues/713
83
- .CASTcall = as.call(c(as.name(" list" ), setattr(list (.CASTcall ), ' names' , value.var )))
84
- # workaround until #5191 (issues/497) is fixed
85
- if (length(intersect(value.var , ff_ )))
86
- .CASTcall = as.call(list (as.name(" {" ), as.name(" .SD" ), .CASTcall ))
87
- }
88
- # special case
89
- if (length(ff $ rr ) == 0 ) {
90
- if (is.null(fun.aggregate ))
91
- ans = data [, c(ff $ ll , value.var ), with = FALSE ]
92
- else {
93
- # workaround until #5191 (issues/497) is fixed
94
- if (length(intersect(value.var , ff_ ))) ans = data [, eval(.CASTcall ), by = c(ff $ ll ), .SDcols = value.var ]
95
- else ans = data [, eval(.CASTcall ), by = c(ff $ ll )]
145
+ if (! is.null(fun.call )) {
146
+ fun.call = aggregate_funs(fun.call , lvals , ... )
147
+ errmsg = " Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately."
148
+ if (is.null(fill )) {
149
+ tryCatch(fill.default <- dat [0 ][, eval(fun.call )], warning = function (x ) stop(errmsg , call. = FALSE ))
150
+ if (nrow(fill.default ) != 1L ) stop(errmsg , call. = FALSE )
96
151
}
97
- if (anyDuplicated(names(ans ))) {
98
- message(" Duplicate column names found in cast data.table. Setting unique names using 'make.unique'" )
99
- setnames(ans , make.unique(names(ans )))
152
+ if (! any(valnames %chin % varnames )) {
153
+ dat = dat [, eval(fun.call ), by = c(varnames )]
154
+ } else {
155
+ dat = dat [, { .SD ; eval(fun.call ) }, by = c(varnames ), .SDcols = valnames ]
100
156
}
101
- if (! identical(key(ans ), ff $ ll )) setkeyv(ans , names(ans )[seq_along(ff $ ll )])
102
- return (ans )
103
157
}
104
- # aggregation moved to R now that 'adhoc-by' is crazy fast!
105
- if (! is.null(fun.aggregate )) {
106
- if (length(intersect(value.var , ff_ ))) {
107
- data = data [, eval(.CASTcall ), by = c(ff_ ), .SDcols = value.var ]
108
- value.var = tail(make.unique(names(data )), 1L )
109
- setnames(data , ncol(data ), value.var )
110
- }
111
- else data = data [, eval(.CASTcall ), by = c(ff_ )]
112
- setkeyv(data , ff_ )
113
- # issues/693
114
- fun_agg_chk <- function (x ) {
115
- # sorted now, 'forderv' should be as fast as uniqlist+uniqlengths
116
- oo = forderv(data , by = key(data ), retGrp = TRUE )
117
- attr(oo , ' maxgrpn' ) > 1L
158
+ order_ <- function (x ) {
159
+ o = forderv(x , retGrp = TRUE , sort = TRUE )
160
+ idx = attr(o , ' starts' )
161
+ if (! length(o )) o = seq_along(x )
162
+ o [idx ] # subsetVector retains attributes, using R's subset for now
163
+ }
164
+ cj_uniq <- function (DT ) {
165
+ do.call(" CJ" , lapply(DT , function (x )
166
+ if (is.factor(x )) {
167
+ xint = seq_along(levels(x ))
168
+ setattr(xint , ' levels' , levels(x ))
169
+ setattr(xint , ' class' , class(x ))
170
+ } else .Call(CsubsetVector , x , order_(x ))
171
+ ))}
172
+ valnames = setdiff(names(dat ), varnames )
173
+ # 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
174
+ if (! is.null(fun.call ) || ! is.null(subset ))
175
+ setkeyv(dat , varnames )
176
+ if (length(rhsnames )) {
177
+ lhs = shallow(dat , lhsnames ); rhs = shallow(dat , rhsnames ); val = shallow(dat , valnames )
178
+ # handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
179
+ if (drop ) {
180
+ map = setDT(lapply(list (lhsnames , rhsnames ), function (cols ) frankv(dat , cols = cols , ties.method = " dense" )))
181
+ maporder = lapply(map , order_ )
182
+ mapunique = lapply(seq_along(map ), function (i ) .Call(CsubsetVector , map [[i ]], maporder [[i ]]))
183
+ lhs = .Call(CsubsetDT , lhs , maporder [[1L ]], seq_along(lhs ))
184
+ rhs = .Call(CsubsetDT , rhs , maporder [[2L ]], seq_along(rhs ))
185
+ } else {
186
+ lhs_ = cj_uniq(lhs ); rhs_ = cj_uniq(rhs )
187
+ map = vector(" list" , 2L )
188
+ .Call(Csetlistelt , map , 1L , lhs_ [lhs , which = TRUE ])
189
+ .Call(Csetlistelt , map , 2L , rhs_ [rhs , which = TRUE ])
190
+ setDT(map )
191
+ mapunique = vector(" list" , 2L )
192
+ .Call(Csetlistelt , mapunique , 1L , seq_len(nrow(lhs_ )))
193
+ .Call(Csetlistelt , mapunique , 2L , seq_len(nrow(rhs_ )))
194
+ lhs = lhs_ ; rhs = rhs_
118
195
}
119
- if (! fun.null && fun_agg_chk(data ))
120
- stop(" Aggregating function provided to argument 'fun.aggregate' should always return a length 1 vector for each group, but returns length != 1 for atleast one group. Please have a look at the DETAILS section of ?dcast.data.table " )
196
+ maplen = sapply(mapunique , length )
197
+ idx = do.call(" CJ" , mapunique )[map , I : = .I ][[" I" ]] # TO DO: move this to C and avoid materialising the Cross Join.
198
+ ans = .Call(" Cfcast" , lhs , val , maplen [[1L ]], maplen [[2L ]], idx , fill , fill.default , is.null(fun.call ))
199
+ allcols = do.call(" paste" , c(rhs , sep = " _" ))
200
+ if (length(valnames ) > 1L )
201
+ allcols = do.call(" paste" , c(setcolorder(CJ(valnames , allcols , sorted = FALSE ), 2 : 1 ), sep = " _" ))
202
+ setattr(ans , ' names' , c(lhsnames , allcols ))
203
+ setDT(ans ); setattr(ans , ' sorted' , lhsnames )
121
204
} else {
122
- if (is.null(subset ))
123
- data = data [, unique(c(ff_ , value.var )), with = FALSE ] # data is untouched so far. subset only required columns
124
- if (length(oo )) .Call(Creorder , data , oo )
125
- setattr(data , ' sorted' , ff_ )
126
- }
127
- .CASTenv = new.env(parent = parent.frame())
128
- assign(" forder" , forderv , .CASTenv )
129
- assign(" CJ" , CJ , .CASTenv )
130
- ans <- .Call(" Cfcast" , data , ff $ ll , ff $ rr , value.var , fill , fill.default , is.null(fun.aggregate ), .CASTenv , drop )
131
- setDT(ans )
132
- if (anyDuplicated(names(ans ))) {
133
- message(" Duplicate column names found in cast data.table. Setting unique names using 'make.unique'" )
134
- setnames(ans , make.unique(names(ans )))
205
+ # formula is of the form x + y ~ . (rare case)
206
+ if (drop ) {
207
+ if (is.null(subset ) && is.null(fun.call )) {
208
+ dat = copy(dat ) # can't be avoided
209
+ setkeyv(dat , lhsnames )
210
+ }
211
+ ans = dat
212
+ } else {
213
+ lhs = shallow(dat , lhsnames )
214
+ val = shallow(dat , valnames )
215
+ lhs_ = cj_uniq(lhs )
216
+ idx = lhs_ [lhs , I : = .I ][[" I" ]]
217
+ lhs_ [, I : = NULL ]
218
+ ans = .Call(" Cfcast" , lhs_ , val , nrow(lhs_ ), 1L , idx , fill , fill.default , is.null(fun.call ))
219
+ setDT(ans ); setattr(ans , ' sorted' , lhsnames )
220
+ setnames(ans , c(lhsnames , valnames ))
221
+ }
222
+ if (length(valnames ) == 1L )
223
+ setnames(ans , valnames , value.var )
135
224
}
136
- setattr(ans , ' sorted' , names(ans )[seq_along(ff $ ll )])
137
- ans
225
+ return (ans )
138
226
}
0 commit comments