-
Notifications
You must be signed in to change notification settings - Fork 991
/
data.table.R
3111 lines (2986 loc) · 163 KB
/
data.table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
if (!exists("trimws", "package:base")) {
# trimws was new in R 3.2.0. Backport it for internal data.table use in R 3.1.0
trimws = function(x) {
mysub = function(re, x) sub(re, "", x, perl = TRUE)
mysub("[ \t\r\n]+$", mysub("^[ \t\r\n]+", x))
}
}
dim.data.table = function(x)
{
.Call(Cdim, x)
}
.global = new.env() # thanks to: http://stackoverflow.com/a/12605694/403310
setPackageName("data.table",.global)
.global$print = ""
# NB: if adding to/editing this list, be sure to do the following:
# (1) add to man/special-symbols.Rd
# (2) export() in NAMESPACE
# (3) add to vignettes/datatable-importing.Rmd#globals section
.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = NULL
# These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler.
# But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols.
# And NULL makes it clear (to the R's mask check on loading) that they're variables not functions.
# utils::globalVariables(c(".SD",".N")) was tried as well, but exporting seems better.
# So even though .BY doesn't appear in this file, it should still be NULL here and exported because it's
# defined in SDenv and can be used by users.
is.data.table = function(x) inherits(x, "data.table")
is.ff = function(x) inherits(x, "ff") # define this in data.table so that we don't have to require(ff), but if user is using ff we'd like it to work
#NCOL = function(x) {
# # copied from base, but additionally covers data.table via is.list()
# # because NCOL in base explicitly tests using is.data.frame()
# if (is.list(x) && !is.ff(x)) return(length(x))
# if (is.array(x) && length(dim(x)) > 1L) ncol(x) else as.integer(1L)
#}
#NROW = function(x) {
# if (is.data.frame(x) || is.data.table(x)) return(nrow(x))
# if (is.list(x) && !is.ff(x)) stop("List is not a data.frame or data.table. Convert first before using NROW") # list may have different length elements, which data.table and data.frame's resolve.
# if (is.array(x)) nrow(x) else length(x)
#}
null.data.table = function() {
ans = list()
setattr(ans,"class",c("data.table","data.frame"))
setattr(ans,"row.names",.set_row_names(0L))
setalloccol(ans)
}
data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
{
# NOTE: It may be faster in some circumstances for users to create a data.table by creating a list l
# first, and then setattr(l,"class",c("data.table","data.frame")) and forgo checking.
x = list(...) # list() doesn't copy named inputs as from R >= 3.1.0 (a very welcome change)
nd = name_dots(...)
names(x) = nd$vnames
if (length(x)==0L) return( null.data.table() )
if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #48
ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names, .named=nd$.named) # see comments inside as.data.table.list re copies
if (!is.null(key)) {
if (!is.character(key)) stop("key argument of data.table() must be character")
if (length(key)==1L) {
key = strsplit(key,split=",")[[1L]]
# eg key="A,B"; a syntax only useful in key argument to data.table(), really.
}
setkeyv(ans,key)
} else {
# retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table().
# If DT inputs with keys have been recycled then can't retain key
ckey = NULL
for (i in seq_along(x)) {
xi = x[[i]]
if (is.data.table(xi) && haskey(xi) && nrow(xi)==nrow(ans)) ckey=c(ckey, key(xi))
}
if (length(ckey) &&
!anyDuplicated(ckey) &&
identical(is.na(chmatchdup(c(ckey,ckey), names(ans))), rep(c(FALSE,TRUE),each=length(ckey)))) {
setattr(ans, "sorted", ckey)
}
}
if (isTRUE(stringsAsFactors)) {
for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
# as_factor is internal function in fread.R currently
}
setalloccol(ans) # returns a NAMED==0 object, unlike data.frame()
}
replace_dot_alias = function(e) {
# we don't just simply alias .=list because i) list is a primitive (faster to iterate) and ii) we test for use
# of "list" in several places so it saves having to remember to write "." || "list" in those places
if (is.call(e) && !is.function(e[[1L]])) {
# . alias also used within bquote, #1912
if (e[[1L]] == 'bquote') return(e)
if (e[[1L]] == ".") e[[1L]] = quote(list)
for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = replace_dot_alias(e[[i]])
}
e
}
.massagei = function(x) {
# J alias for list as well in i, just if the first symbol
# if x = substitute(base::order) then as.character(x[[1L]]) == c("::", "base", "order")
if (x %iscall% c("J","."))
x[[1L]] = quote(list)
x
}
.checkTypos = function(err, ref) {
if (grepl('object.*not found', err$message)) {
used = gsub(".*object '([^']+)'.*", "\\1", err$message)
found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE)
if (length(found)) {
stop("Object '", used, "' not found. Perhaps you intended ",
paste(head(found, 5L), collapse=", "),
if (length(found)<=5L) "" else paste(" or",length(found)-5L, "more"))
} else {
stop("Object '", used, "' not found amongst ",
paste(head(ref, 5L), collapse=', '),
if (length(ref)<=5L) "" else paste(" and", length(ref)-5L, "more"))
}
} else {
stop(err$message, call.=FALSE)
}
}
"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
{
# ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could
# test explicitly if the caller is [.data.table (even stronger test. TO DO.)
# the drop=NULL is to sink drop argument when dispatching to [.data.frame; using '...' stops test 147
if (!cedta()) {
# Fix for #500 (to do)
Nargs = nargs() - (!missing(drop))
ans = if (Nargs<3L) { `[.data.frame`(x,i) } # drop ignored anyway by DF[i]
else if (missing(drop)) `[.data.frame`(x,i,j)
else `[.data.frame`(x,i,j,drop)
# added is.data.table(ans) check to fix bug #81
if (!missing(i) & is.data.table(ans)) setkey(ans,NULL) # See test 304
return(ans)
}
if (!missing(verbose)) {
stopifnot(isTRUEorFALSE(verbose))
# set the global verbose option because that is fetched from C code without having to pass it through
oldverbose = options(datatable.verbose=verbose)
on.exit(options(oldverbose))
}
.global$print=""
missingby = missing(by) && missing(keyby) # for tests 359 & 590 where passing by=NULL results in data.table not vector
if (!missing(keyby)) {
if (!missing(by)) stop("Provide either by= or keyby= but not both")
if (missing(j)) { warning("Ignoring keyby= because j= is not supplied"); keyby=NULL; }
by=bysub=substitute(keyby)
keyby=TRUE
# Assign to 'by' so that by is no longer missing and we can proceed as if there were one by
} else {
if (!missing(by) && missing(j)) { warning("Ignoring by= because j= is not supplied"); by=NULL; }
by=bysub= if (missing(by)) NULL else substitute(by)
keyby=FALSE
}
bynull = !missingby && is.null(by) #3530
byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI"
naturaljoin = FALSE
names_x = names(x)
if (missing(i) && !missing(on)) {
tt = eval.parent(.massagei(substitute(on)))
if (!is.list(tt) || !length(names(tt))) {
warning("When on= is provided but not i=, on= must be a named list or data.table|frame, and a natural join (i.e. join on common names) is invoked. Ignoring on= which is '",class(tt)[1L],"'.")
on = NULL
} else {
i = tt
naturaljoin = TRUE
}
}
if (missing(i) && missing(j)) {
tt_isub = substitute(i)
tt_jsub = substitute(j)
if (!is.null(names(sys.call())) && # not relying on nargs() as it considers DT[,] to have 3 arguments, #3163
tryCatch(!is.symbol(tt_isub), error=function(e)TRUE) && # a symbol that inherits missingness from caller isn't missing for our purpose; test 1974
tryCatch(!is.symbol(tt_jsub), error=function(e)TRUE)) {
warning("i and j are both missing so ignoring the other arguments. This warning will be upgraded to error in future.")
}
return(x)
}
if (!mult %chin% c("first","last","all")) stop("mult argument can only be 'first', 'last' or 'all'")
missingroll = missing(roll)
if (length(roll)!=1L || is.na(roll)) stop("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
if (is.character(roll)) {
if (roll!="nearest") stop("roll is '",roll,"' (type character). Only valid character value is 'nearest'.")
} else {
roll = if (isTRUE(roll)) +Inf else as.double(roll)
}
force(rollends)
if (!is.logical(rollends)) stop("rollends must be a logical vector")
if (length(rollends)>2L) stop("rollends must be length 1 or 2")
if (length(rollends)==1L) rollends=rep.int(rollends,2L)
# TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one.
.unsafe.opt() #3585
missingnomatch = missing(nomatch)
if (is.null(nomatch)) nomatch = 0L # allow nomatch=NULL API already now, part of: https://github.com/Rdatatable/data.table/issues/857
if (!is.na(nomatch) && nomatch!=0L) stop("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)")
nomatch = as.integer(nomatch)
if (!is.logical(which) || length(which)>1L) stop("which= must be a logical vector length 1. Either FALSE, TRUE or NA.")
if ((isTRUE(which)||is.na(which)) && !missing(j)) stop("which==",which," (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.")
if (!is.na(nomatch) && is.na(which)) stop("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.")
if (!with && missing(j)) stop("j must be provided when with=FALSE")
irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency.
notjoin = FALSE
rightcols = leftcols = integer()
optimizedSubset = FALSE ## flag: tells whether a normal query was optimized into a join.
..syms = NULL
av = NULL
jsub = NULL
if (!missing(j)) {
jsub = replace_dot_alias(substitute(j))
root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
if (root == ":" ||
(root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') ||
( (!length(av<-all.vars(jsub)) || all(substring(av,1L,2L)=="..")) &&
root %chin% c("","c","paste","paste0","-","!") &&
missingby )) { # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed)
# When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find.
# If variable names do occur, but they are all prefixed with .., then that means look up in calling scope.
# Automatically set with=FALSE in this case so that DT[,1], DT[,2:3], DT[,"someCol"] and DT[,c("colB","colD")]
# work as expected. As before, a vector will never be returned, but a single column data.table
# for type consistency with >1 cases. To return a single vector use DT[["someCol"]] or DT[[3]].
# The root==":" is to allow DT[,colC:colH] even though that contains two variable names.
# root == "-" or "!" is for tests 1504.11 and 1504.13 (a : with a ! or - modifier root)
# We don't want to evaluate j at all in making this decision because i) evaluating could itself
# increment some variable and not intended to be evaluated a 2nd time later on and ii) we don't
# want decisions like this to depend on the data or vector lengths since that can introduce
# inconsistency reminiscent of drop=TRUE in [.data.frame that we seek to avoid.
with=FALSE
if (length(av)) {
for (..name in av) {
name = substring(..name, 3L)
if (name=="") stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.")
if (!exists(name, where=parent.frame())) {
stop("Variable '",name,"' is not found in calling scope. Looking in calling scope because you used the .. prefix.",
if (exists(..name, where=parent.frame()))
paste0(" Variable '..",name,"' does exist in calling scope though, so please just removed the .. prefix from that variable name in calling scope.")
# We have recommended 'manual' .. prefix in the past, so try to be helpful
else
""
)
} else if (exists(..name, where=parent.frame())) {
warning("Both '",name,"' and '..", name, "' exist in calling scope. Please remove the '..", name,"' variable in calling scope for clarity.")
}
}
..syms = av
}
} else if (is.name(jsub)) {
if (substring(jsub, 1L, 2L) == "..") stop("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov
if (!with && !exists(as.character(jsub), where=parent.frame()))
stop("Variable '",jsub,"' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.")
}
if (root=="{") {
if (length(jsub) == 2L) {
jsub = jsub[[2L]] # to allow {} wrapping of := e.g. [,{`:=`(...)},] [#376]
root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
} else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") {
#2142 -- j can be {} and have length 1
stop("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}")
}
}
if (root=="eval" && !any(all.vars(jsub[[2L]]) %chin% names_x)) {
# TODO: this && !any depends on data. Can we remove it?
# Grab the dynamic expression from calling scope now to give the optimizer a chance to optimize it
# Only when top level is eval call. Not nested like x:=eval(...) or `:=`(x=eval(...), y=eval(...))
jsub = eval(jsub[[2L]], parent.frame(), parent.frame()) # this evals the symbol to return the dynamic expression
if (is.expression(jsub)) jsub = jsub[[1L]] # if expression, convert it to call
# Note that the dynamic expression could now be := (new in v1.9.7)
root = if (is.call(jsub)) {
jsub = replace_dot_alias(jsub)
as.character(jsub[[1L]])[1L]
} else ""
}
if (root == ":=") {
allow.cartesian=TRUE # (see #800)
if (!missing(i) && keyby)
stop(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i")
if (!missingnomatch) {
warning("nomatch isn't relevant together with :=, ignoring nomatch")
nomatch=0L
}
}
}
# setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead.
dupdiff = function(x, y) x[!x %chin% y]
if (!missing(i)) {
xo = NULL
isub = substitute(i)
if (identical(isub, NA)) {
# only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA]
# replace NA in this case with NA_integer_ as that's almost surely what user intended to
# return a single row with NA in all columns. (DT[0] returns an empty table, with correct types.)
# Any expression (including length 1 vectors) that evaluates to a single NA logical will
# however be left as NA logical since that's important for consistency to return empty in that
# case; e.g. DT[Col==3] where DT is 1 row and Col contains NA.
# Replacing the NA symbol makes DT[NA] and DT[c(1,NA)] consistent and provides
# an easy way to achieve a single row of NA as users expect rather than requiring them
# to know and change to DT[NA_integer_].
isub=NA_integer_
}
isnull_inames = FALSE
# Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires
# the "eval" to be checked before `as.name("!")`. Therefore interchanged.
restore.N = remove.N = FALSE
if (exists(".N", envir=parent.frame(), inherits=FALSE)) {
old.N = get(".N", envir=parent.frame(), inherits=FALSE)
locked.N = bindingIsLocked(".N", parent.frame())
if (locked.N) eval(call("unlockBinding", ".N", parent.frame())) # eval call to pass R CMD check NOTE until we find cleaner way
assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
restore.N = TRUE
# the comment below is invalid hereafter (due to fix for #1145)
# binding locked when .SD[.N] but that's ok as that's the .N we want anyway
# TO DO: change isub at C level s/.N/nrow(x); changing a symbol to a constant should be ok
} else {
assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
remove.N = TRUE
}
if (isub %iscall% "eval") { # TO DO: or ..()
isub = eval(.massagei(isub[[2L]]), parent.frame(), parent.frame())
if (is.expression(isub)) isub=isub[[1L]]
}
if (isub %iscall% "!") {
notjoin = TRUE
if (!missingnomatch) stop("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch.");
nomatch = 0L
isub = isub[[2L]]
# #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!"
if (isub %iscall% "(" && !is.name(isub[[2L]]))
isub = isub[[2L]]
}
if (is.null(isub)) return( null.data.table() )
if (length(o <- .prepareFastSubset(isub = isub, x = x,
enclos = parent.frame(),
notjoin = notjoin, verbose = verbose))){
## redirect to the is.data.table(x) == TRUE branch.
## Additional flag to adapt things after bmerge:
optimizedSubset = TRUE
notjoin = o$notjoin
i = o$i
on = o$on
## the following two are ignored if i is not a data.table.
## Since we are converting i to data.table, it is important to set them properly.
nomatch = 0L
mult = "all"
}
else if (!is.name(isub)) {
ienv = new.env(parent=parent.frame())
if (getOption("datatable.optimize")>=1L) assign("order", forder, ienv)
i = tryCatch(eval(.massagei(isub), x, ienv), error=function(e) {
if (grepl(":=.*defined for use in j.*only", e$message))
stop("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number.")
else
.checkTypos(e, names_x)
})
} else {
# isub is a single symbol name such as B in DT[B]
i = try(eval(isub, parent.frame(), parent.frame()), silent=TRUE)
if (inherits(i,"try-error")) {
# must be "not found" since isub is a mere symbol
col = try(eval(isub, x), silent=TRUE) # is it a column name?
msg = if (inherits(col,"try-error")) " and it is not a column name either."
else paste0(" but it is a column of type ", typeof(col),". If you wish to select rows where that column contains TRUE",
", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.")
stop(as.character(isub), " is not found in calling scope", msg,
" When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.")
}
}
if (restore.N) {
assign(".N", old.N, envir=parent.frame())
if (locked.N) lockBinding(".N", parent.frame())
}
if (remove.N) rm(list=".N", envir=parent.frame())
if (is.matrix(i)) {
if (is.numeric(i) && ncol(i)==1L) { # #826 - subset DT on single integer vector stored as matrix
i = as.integer(i)
} else {
stop("i is invalid type (matrix). Perhaps in future a 2 column matrix could return a list of elements of DT (in the spirit of A[B] in FAQ 2.14). Please report to data.table issue tracker if you'd like this, or add your comments to FR #657.")
}
}
if (is.logical(i)) {
if (notjoin) {
notjoin = FALSE
i = !i
}
}
if (is.null(i)) return( null.data.table() )
if (is.character(i)) {
isnull_inames = TRUE
i = data.table(V1=i) # for user convenience; e.g. DT["foo"] without needing DT[.("foo")]
} else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) { i = as.data.table(i[[1L]]) }
else if (identical(class(i),"data.frame")) { i = as.data.table(i) } # TO DO: avoid these as.data.table() and use a flag instead
else if (identical(class(i),"list")) {
isnull_inames = is.null(names(i))
i = as.data.table(i)
}
if (is.data.table(i)) {
if (missing(on)) {
if (!haskey(x)) {
stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
}
} else if (identical(substitute(on), as.name(".NATURAL"))) {
naturaljoin = TRUE
}
if (naturaljoin) { # natural join #629
common_names = intersect(names_x, names(i))
len_common_names = length(common_names)
if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables")
if (verbose) {
which_cols_msg = if (len_common_names == length(x)) " all 'x' columns"
else paste(":", brackify(common_names))
cat("Joining but 'x' has no key, natural join using", which_cols_msg, "\n", sep = "")
}
on = common_names
}
if (!missing(on)) {
# on = .() is now possible, #1257
on_ops = .parse_on(substitute(on), isnull_inames)
on = on_ops[[1L]]
ops = on_ops[[2L]]
# TODO: collect all '==' ops first to speeden up Cnestedid
rightcols = colnamesInt(x, names(on), check_dups=FALSE)
leftcols = colnamesInt(i, unname(on), check_dups=FALSE)
} else {
## missing on
rightcols = chmatch(key(x), names_x) # NAs here (i.e. invalid data.table) checked in bmerge()
leftcols = if (haskey(i))
chmatch(head(key(i), length(rightcols)), names(i))
else
seq_len(min(length(i),length(rightcols)))
rightcols = head(rightcols,length(leftcols))
ops = rep(1L, length(leftcols))
}
# Implementation for not-join along with by=.EACHI, #604
if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571
notjoin = FALSE
if (verbose) {last.started.at=proc.time();cat("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()}
orignames = copy(names(i))
i = setdiff_(x, i, rightcols, leftcols) # part of #547
if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()}
setnames(i, orignames[leftcols])
setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
}
i = .shallow(i, retain.key = TRUE)
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
xo = ans$xo ## to make it available for further use.
# temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
# 'setorder', as there's another 'setorder' in generating 'irows' below...
if (length(ans$indices)) setorder(setDT(ans[1L:3L]), indices)
allLen1 = ans$allLen1
f__ = ans$starts
len__ = ans$lens
allGrp1 = FALSE # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time.
indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix
# length of input nomatch (single 0 or NA) is 1 in both cases.
# When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N
# If using secondary key of x, f__ will refer to xo
if (is.na(which)) {
w = if (notjoin) f__!=0L else is.na(f__)
return( if (length(xo)) fsort(xo[w], internal=TRUE) else which(w) )
}
if (mult=="all") {
# is by=.EACHI along with non-equi join?
nqbyjoin = byjoin && length(ans$indices) && !allGrp1
if (!byjoin || nqbyjoin) {
# Really, `anyDuplicated` in base is AWESOME!
# allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
irows = if (allLen1) f__ else vecseq(f__,len__,
if (allow.cartesian ||
notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
!anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
NULL # #742. If 'i' has no duplicates, ignore
} else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
# Fix for #1092 and #1074
# TODO: implement better version of "any"/"all"/"which" to avoid
# unnecessary construction of logical vectors
if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L]
} else {
if (length(xo) && missing(on))
stop("Internal error. Cannot by=.EACHI when joining to a secondary key, yet") # nocov
# since f__ refers to xo later in grouping, so xo needs to be passed through to dogroups too.
if (length(irows))
stop("Internal error. irows has length in by=.EACHI") # nocov
}
if (nqbyjoin) {
irows = if (length(xo)) xo[irows] else irows
xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
ans = .Call(CnqRecreateIndices, xo, len__, indices__, max(indices__), nomatch) # issue#4388 fix
f__ = ans[[1L]]; len__ = ans[[2L]]
allLen1 = FALSE # TODO; should this always be FALSE?
irows = NULL # important to reset
if (any_na(as_list(xo))) xo = xo[!is.na(xo)]
}
} else {
if (!byjoin) { #1287 and #1271
irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic
if (identical(nomatch,0L)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows
}
# TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"?
# TODO: how about when nomatch=0L, can we avoid allocating then as well?
}
if (length(xo) && length(irows)) {
irows = xo[irows] # TO DO: fsort here?
if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit.
if (verbose) {last.started.at=proc.time();cat("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()}
irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
}
}
if (optimizedSubset){
## special treatment for calls like DT[x == 3] that are transformed into DT[J(x=3), on = "x==x"]
if(!.Call(CisOrderedSubset, irows, nrow(x))){
## restore original order. This is a very expensive operation.
## benchmarks have shown that starting with 1e6 irows, a tweak can significantly reduce time
## (see #2366)
if (verbose) {last.started.at=proc.time()[3L];cat("Reordering", length(irows), "rows after bmerge done in ... ");flush.console()}
if(length(irows) < 1e6){
irows = fsort(irows, internal=TRUE) ## internally, fsort on integer falls back to forderv
} else {
irows = as.integer(fsort(as.numeric(irows))) ## nocov; parallelized for numeric, but overhead of type conversion
}
if (verbose) {cat(round(proc.time()[3L]-last.started.at,3L),"secs\n");flush.console()}
}
## make sure, all columns are taken from x and not from i.
## This is done by simply telling data.table to continue as if there was a simple subset
leftcols = integer(0L)
rightcols = integer(0L)
i = irows ## important to make i not a data.table because otherwise Gforce doesn't kick in
}
}
else {
if (!missing(on)) {
stop("logical error. i is not a data.table, but 'on' argument is provided.")
}
# TO DO: TODO: Incorporate which_ here on DT[!i] where i is logical. Should avoid i = !i (above) - inefficient.
# i is not a data.table
if (!is.logical(i) && !is.numeric(i)) stop("i has evaluated to type ", typeof(i), ". Expecting logical, integer or double.")
if (is.logical(i)) {
if (length(i)==1L # to avoid unname copy when length(i)==nrow (normal case we don't want to slow down)
&& isTRUE(unname(i))) { irows=i=NULL } # unname() for #2152 - length 1 named logical vector.
# NULL is efficient signal to avoid creating 1:nrow(x) but still return all rows, fixes #1249
else if (length(i)<=1L) { irows=i=integer(0L) }
# FALSE, NA and empty. All should return empty data.table. The NA here will be result of expression,
# where for consistency of edge case #1252 all NA to be removed. If NA is a single NA symbol, it
# was auto converted to NA_integer_ higher up for ease of use and convenience. We definitely
# don't want base R behaviour where DF[NA,] returns an entire copy filled with NA everywhere.
else if (length(i)==nrow(x)) { irows=i=which(i) }
# The which() here auto removes NA for convenience so user doesn't need to remember "!is.na() & ..."
# Also this which() is for consistency of DT[colA>3,which=TRUE] and which(DT[,colA>3])
# Assigning to 'i' here as well to save memory, #926.
else stop("i evaluates to a logical vector length ", length(i), " but there are ", nrow(x), " rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.")
} else {
irows = as.integer(i) # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error)
irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=") # last argument is allowOverMax (NA when selecting, error when assigning)
# simplifies logic from here on: can assume positive subscripts (no zeros)
# maintains Arun's fix for #2697 (test 1042)
# efficient in C with more detailed helpful messages when user mixes positives and negatives
# falls through quickly (no R level allocs) if all items are within range [1,max] with no zeros or negatives
# minor TO DO: can we merge this with check_idx in fcast.c/subset ?
}
}
if (notjoin) {
if (byjoin || !is.integer(irows) || is.na(nomatch)) stop("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov
irows = irows[irows!=0L]
if (verbose) {last.started.at=proc.time()[3L];cat("Inverting irows for notjoin done in ... ");flush.console()}
i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x))
if (verbose) cat(round(proc.time()[3L]-last.started.at, 3L), "sec\n")
leftcols = integer() # proceed as if row subset from now on, length(leftcols) is switched on later
rightcols = integer()
# Doing this once here, helps speed later when repeatedly subsetting each column. R's [irows] would do this for each
# column when irows contains negatives.
}
if (which) return( if (is.null(irows)) seq_len(nrow(x)) else irows )
} else { # missing(i)
i = NULL
}
names_i = names(i) # value is now stable
byval = NULL
xnrow = nrow(x)
xcols = xcolsAns = icols = icolsAns = integer()
xdotcols = FALSE
# track which columns appear in j through ansvars, and those
# that belong in .SD through sdvars. Mostly these will be the same,
# except in cases like DT[ , lapply(.SD, function(x) x/V1), .SDcols = !'V1'],
# see e.g. #484, #495, #1744, #1965.
non_sdvars = character(0L)
if (missing(j)) {
# missingby was already checked above before dealing with i
if (!length(x)) return(null.data.table())
if (!length(leftcols)) {
# basic x[i] subset, #2951
if (is.null(irows)) return(shallow(x)) # e.g. DT[TRUE] (#3214); otherwise CsubsetDT would materialize a deep copy
else return(.Call(CsubsetDT, x, irows, seq_along(x)) )
} else {
jisvars = names_i[-leftcols]
tt = jisvars %chin% names_x
if (length(tt)) jisvars[tt] = paste0("i.",jisvars[tt])
if (length(duprightcols <- rightcols[duplicated(rightcols)])) {
nx = c(names_x, names_x[duprightcols])
rightcols = chmatchdup(names_x[rightcols], nx)
nx = make.unique(nx)
} else nx = names_x
ansvars = make.unique(c(nx, jisvars))
icols = c(leftcols, seq_along(i)[-leftcols])
icolsAns = c(rightcols, seq.int(length(nx)+1L, length.out=ncol(i)-length(unique(leftcols))))
xcols = xcolsAns = seq_along(x)[-rightcols]
}
ansvals = chmatch(ansvars, nx)
}
else {
if (is.data.table(i)) {
idotprefix = paste0("i.", names_i)
xdotprefix = paste0("x.", names_x)
} else idotprefix = xdotprefix = character(0L)
# j was substituted before dealing with i so that := can set allow.cartesian=FALSE (#800) (used above in i logic)
if (is.null(jsub)) return(NULL)
if (!with && jsub %iscall% ":=") {
# TODO: make these both errors (or single long error in both cases) in next release.
# i.e. using with=FALSE together with := at all will become an error. Eventually with will be removed.
if (is.null(names(jsub)) && is.name(jsub[[2L]])) {
warning("with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples. As warned in 2014, this is now a warning.")
jsub[[2L]] = eval(jsub[[2L]], parent.frame(), parent.frame())
} else {
warning("with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.")
}
with = TRUE
}
if (!with) {
# missingby was already checked above before dealing with i
if (jsub %iscall% c("!", "-") && length(jsub)==2L) { # length 2 to only match unary, #2109
notj = TRUE
jsub = jsub[[2L]]
} else notj = FALSE
# fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4)))
while (jsub %iscall% "(") jsub = as.list(jsub)[[-1L]]
if (jsub %iscall% ":" && length(jsub)==3L) {
j = eval(jsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) # else j will be evaluated for the first time on next line
} else {
names(..syms) = ..syms
j = eval(jsub, lapply(substring(..syms,3L), get, pos=parent.frame()), parent.frame())
}
if (is.logical(j)) j <- which(j)
if (!length(j) && !notj) return( null.data.table() )
if (is.factor(j)) j = as.character(j) # fix for FR: #358
if (is.character(j)) {
if (notj) {
if (anyNA(idx <- chmatch(j, names_x))) warning("column(s) not removed because not found: ", brackify(j[is.na(idx)]))
# all duplicates of the name in names(x) must be removed; e.g. data.table(x=1, y=2, x=3)[, !"x"] should just output 'y'.
w = !names_x %chin% j
ansvars = names_x[w]
ansvals = which(w)
} else {
# if DT[, c("x","x")] and "x" is duplicated in names(DT), we still subset only the first. Because dups are unusual and
# it's more common to select the same column a few times. A syntax would be needed to distinguish these intents.
ansvars = j # x. and i. prefixes may be in here, they'll result in NA and will be dealt with further below if length(leftcols)
ansvals = chmatch(ansvars, names_x) # not chmatchdup()
}
if (!length(ansvals)) return(null.data.table())
if (!length(leftcols)) {
if (!anyNA(ansvals)) return(.Call(CsubsetDT, x, irows, ansvals))
else stop("column(s) not found: ", paste(ansvars[is.na(ansvals)],collapse=", "))
}
# else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977)
# in both cases leave to the R-level subsetting of i and x together further below
} else if (is.numeric(j)) {
j = as.integer(j)
if (any(w<-(j>ncol(x)))) stop("Item ",which.first(w)," of j is ",j[which.first(w)]," which is outside the column number range [1,ncol=", ncol(x),"]")
j = j[j!=0L]
if (any(j<0L)) {
if (any(j>0L)) stop("j mixes positives and negatives")
j = seq_along(x)[j] # all j are <0 here
}
# 3013 -- handle !FALSE in column subset in j via logical+with
if (notj) j = seq_along(x)[if (length(j)) -j else TRUE]
if (!length(j)) return(null.data.table())
return(.Call(CsubsetDT, x, irows, j))
} else {
stop("When with=FALSE, j-argument should be of type logical/character/integer indicating the columns to select.") # fix for #1440.
}
} else { # with=TRUE and byjoin could be TRUE
bynames = NULL
allbyvars = NULL
if (byjoin) {
bynames = names_x[rightcols]
} else if (!missingby) {
# deal with by before j because we need byvars when j contains .SD
# may evaluate to NULL | character() | "" | list(), likely a result of a user expression where no-grouping is one case being loop'd through
bysubl = as.list.default(bysub)
bysuborig = bysub
if (is.name(bysub) && !(bysub %chin% names_x)) { # TO DO: names(x),names(i),and i. and x. prefixes
bysub = eval(bysub, parent.frame(), parent.frame())
# fix for # 5106 - http://stackoverflow.com/questions/19983423/why-by-on-a-vector-not-from-a-data-table-column-is-very-slow
# case where by=y where y is not a column name, and not a call/symbol/expression, but an atomic vector outside of DT.
# note that if y is a list, this'll return an error (not sure if it should).
if (is.atomic(bysub)) bysubl = list(bysuborig) else bysubl = as.list.default(bysub)
}
if (length(bysubl) && identical(bysubl[[1L]],quote(eval))) { # TO DO: or by=..()
bysub = eval(bysubl[[2L]], parent.frame(), parent.frame())
bysub = replace_dot_alias(bysub) # fix for #1298
if (is.expression(bysub)) bysub=bysub[[1L]]
bysubl = as.list.default(bysub)
} else if (bysub %iscall% c("c","key","names", "intersect", "setdiff")) {
# catch common cases, so we don't have to copy x[irows] for all columns
# *** TO DO ***: try() this eval first (as long as not list() or .()) and see if it evaluates to column names
# to avoid the explicit c,key,names which already misses paste("V",1:10) for example
# tried before but since not wrapped in try() it failed on some tests
# or look for column names used in this by (since if none it wouldn't find column names anyway
# when evaled within full x[irows]). Trouble is that colA%%2L is a call and should be within frame.
tt = eval(bysub, parent.frame(), parent.frame())
if (!is.character(tt)) stop("by=c(...), key(...) or names(...) must evaluate to 'character'")
bysub=tt
} else if (is.call(bysub) && !(bysub[[1L]] %chin% c("list", "as.list", "{", ".", ":"))) {
# potential use of function, ex: by=month(date). catch it and wrap with "(", because we need to set "bysameorder" to FALSE as we don't know if the function will return ordered results just because "date" is ordered. Fixes #2670.
bysub = as.call(c(as.name('('), list(bysub)))
bysubl = as.list.default(bysub)
} else if (bysub %iscall% ".") bysub[[1L]] = quote(list)
if (mode(bysub) == "character") {
if (length(grep(",", bysub, fixed = TRUE))) {
if (length(bysub)>1L) stop("'by' is a character vector length ",length(bysub)," but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.")
bysub = strsplit(bysub,split=",")[[1L]]
}
backtick_idx = grep("^[^`]+$",bysub)
if (length(backtick_idx)) bysub[backtick_idx] = paste0("`",bysub[backtick_idx],"`")
backslash_idx = grep("\\", bysub, fixed = TRUE)
if (length(backslash_idx)) bysub[backslash_idx] = gsub('\\', '\\\\', bysub[backslash_idx], fixed = TRUE)
bysub = parse(text=paste0("list(",paste(bysub,collapse=","),")"))[[1L]]
bysubl = as.list.default(bysub)
}
allbyvars = intersect(all.vars(bysub), names_x)
orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__)
bysameorder = byindex = FALSE
if (all(vapply_1b(bysubl, is.name))) {
bysameorder = orderedirows && haskey(x) && length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars)))
# either bysameorder or byindex can be true but not both. TODO: better name for bysameorder might be bykeyx
if (!bysameorder && keyby && !length(irows) && isTRUE(getOption("datatable.use.index"))) {
# TODO: could be allowed if length(irows)>1 but then the index would need to be squashed for use by uniqlist, #3062
# find if allbyvars is leading subset of any of the indices; add a trailing "__" to fix #3498 where a longer column name starts with a shorter column name
tt = paste0(c(allbyvars,""), collapse="__")
w = which.first(substring(paste0(indices(x),"__"),1L,nchar(tt)) == tt)
if (!is.na(w)) {
byindex = indices(x)[w]
if (!length(getindex(x, byindex))) {
if (verbose) cat("by index '", byindex, "' but that index has 0 length. Ignoring.\n", sep="")
byindex=FALSE
}
}
}
}
if (is.null(irows)) {
if (bysub %iscall% ':' && length(bysub)==3L && is.name(bysub[[2L]]) && is.name(bysub[[3L]])) {
byval = eval(bysub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame())
byval = as.list(x)[byval]
} else byval = eval(bysub, x, parent.frame())
} else {
# length 0 when i returns no rows
if (!is.integer(irows)) stop("Internal error: irows isn't integer") # nocov
# Passing irows as i to x[] below has been troublesome in a rare edge case.
# irows may contain NA, 0, negatives and >nrow(x) here. That's all ok.
# But we may need i join column values to be retained (where those rows have no match), hence we tried eval(isub)
# in 1.8.3, but this failed test 876.
# TO DO: Add a test like X[i,sum(v),by=i.x2], or where by includes a join column (both where some i don't match).
# TO DO: Make xss directly, rather than recursive call.
if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset
if (length(allbyvars)) { ############### TO DO TO DO TO DO ###############
if (verbose) cat("i clause present and columns used in by detected, only these subset:",paste(allbyvars,collapse=","),"\n")
xss = x[irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends]
} else {
if (verbose) cat("i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '",deparse(by),"'\n",sep="")
xss = x[irows,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends]
}
if (bysub %iscall% ':' && length(bysub)==3L) {
byval = eval(bysub, setattr(as.list(seq_along(xss)), 'names', names(xss)), parent.frame())
byval = as.list(xss)[byval]
} else byval = eval(bysub, xss, parent.frame())
xnrow = nrow(xss)
# TO DO: pass xss (x subset) through into dogroups. Still need irows there (for :=), but more condense
# and contiguous to use xss to form .SD in dogroups than going via irows
}
if (!length(byval) && xnrow>0L) {
# see missingby up above for comments
# by could be NULL or character(0L) for example (e.g. passed in as argument in a loop of different bys)
bysameorder = FALSE # 1st and only group is the entire table, so could be TRUE, but FALSE to avoid
# a key of empty character()
byval = list()
bynames = allbyvars = NULL
# the rest now fall through
} else bynames = names(byval)
if (is.atomic(byval)) {
if (is.character(byval) && length(byval)<=ncol(x) && !(is.name(bysub) && bysub %chin% names_x) ) {
stop("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval",deparse(bysub)," should work. This is for efficiency so data.table can detect which columns are needed.")
} else {
# by may be a single unquoted column name but it must evaluate to list so this is a convenience to users. Could also be a single expression here such as DT[,sum(v),by=colA%%2]
byval = list(byval)
bysubl = c(as.name("list"),bysuborig) # for guessing the column name below
if (is.name(bysuborig))
bynames = as.character(bysuborig)
else
bynames = names(byval)
}
}
if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)")
if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)()
if (!bynull) for (jj in seq_len(length(byval))) {
if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]")
}
tt = vapply_1i(byval,length)
if (any(tt!=xnrow)) stop(gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow, domain='R-data.table'))
if (is.null(bynames)) bynames = rep.int("",length(byval))
if (length(idx <- which(!nzchar(bynames))) && !bynull) {
# TODO: improve this and unify auto-naming of jsub and bysub
if (is.name(bysubl[[1L]]) && bysubl[[1L]] == '{') bysubl = bysubl[[length(bysubl)]] # fix for #3156
for (jj in idx) {
# Best guess. Use "month" in the case of by=month(date), use "a" in the case of by=a%%2
byvars = all.vars(bysubl[[jj+1L]], functions = TRUE)
if (length(byvars) == 1L) tt = byvars
else {
# take the first variable that is (1) not eval (#3758) and (2) starts with a character that can't start a variable name
tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE)
# byvars but exclude functions or `0`+`1` becomes `+`
tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L]
}
# fix for #497
if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) {
bynames[jj] = deparse(bysubl[[jj+1L]])
if (verbose)
cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt,
"' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="")
}
else bynames[jj] = tt
# if user doesn't like this inferred name, user has to use by=list() to name the column
}
# Fix for #1334
if (any(duplicated(bynames))) {
bynames = make.unique(bynames)
}
}
setattr(byval, "names", bynames) # byval is just a list not a data.table hence setattr not setnames
}
jvnames = NULL
drop_dot = function(x) {
if (length(x)!=1L) stop("Internal error: drop_dot passed ",length(x)," items") # nocov
if (identical(substring(x<-as.character(x), 1L, 1L), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY"))
substring(x, 2L)
else
x
}
# handle auto-naming of last item of j (e.g. within {} or if/else, #2478)
# e.g. DT[, .(a=sum(v), v, .N), by=] should create columns named a, v, N
do_j_names = function(q) {
if (!is.call(q) || !is.name(q[[1L]])) return(q)
if (q[[1L]] %chin% c('list', '.')) {
q[[1L]] = quote(list)
qlen = length(q)
if (qlen>1L) {
nm = names(q[-1L]) # check list(a=sum(v),v)
if (is.null(nm)) nm = rep.int("", qlen-1L)
# attempt to auto-name unnamed columns
for (jj in which(nm=="")) {
thisq = q[[jj + 1L]]
if (missing(thisq)) stop(gettextf("Item %d of the .() or list() passed to j is missing", jj, domain="R-data.table")) #3507
if (is.name(thisq)) nm[jj] = drop_dot(thisq)
# TO DO: if call to a[1] for example, then call it 'a' too
}
if (!is.null(jvnames) && any(idx <- nm != jvnames))
warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE)
jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better
setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group; reinstated at the end on the result.
}
return(q) # else empty list is needed for test 468: adding an empty list column
}
if (q[[1L]] == '{') {
if (!is.null(q[[qlen<-length(q)]])) q[[qlen]] = do_j_names(q[[qlen]])
return(q)
}
if (q[[1L]] == 'if') {
#explicit NULL would return NULL, assigning NULL would delete that from the expression
if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]])
if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]])
return(q)
}
return(q)
}
if (is.name(jsub)) {
# j is a single unquoted column name
if (jsub!=".SD") jvnames = drop_dot(jsub)
# jsub is list()ed after it's eval'd inside dogroups.
} else jsub = do_j_names(jsub) # else maybe a call to transform or something which returns a list.
av = all.vars(jsub,TRUE) # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c)
use.I = ".I" %chin% av
if (any(c(".SD","eval","get","mget") %chin% av)) {
if (missing(.SDcols)) {
# here we need to use 'dupdiff' instead of 'setdiff'. Ex: setdiff(c("x", "x"), NULL) will give 'x'.
# slight memory efficiency boost if we only store sdvars if it differs from ansvars, but a bit tricky to examine all the uses here
ansvars = sdvars = dupdiff(names_x, union(bynames, allbyvars)) # TO DO: allbyvars here for vars used by 'by'. Document.
# just using .SD in j triggers all non-by columns in the subset even if some of
# those columns are not used. It would be tricky to detect whether the j expression
# really does use all of the .SD columns or not, hence .SDcols for grouping
# over a subset of columns
# all duplicate columns must be matched, because nothing is provided
ansvals = chmatchdup(ansvars, names_x)
} else {
# FR #355 - negative numeric and character indices for SDcols
colsub = substitute(.SDcols)
# fix for R-Forge #5190. colsub[[1L]] gave error when it's a symbol.
if (colsub %iscall% c("!", "-")) {
negate_sdcols = TRUE
colsub = colsub[[2L]]
} else negate_sdcols = FALSE
# fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4)))
while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]]
if (colsub %iscall% ':' && length(colsub)==3L) {
# .SDcols is of the format a:b
.SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame())
} else {
if (colsub %iscall% 'patterns') {
# each pattern gives a new filter condition, intersect the end result
.SDcols = Reduce(intersect, do_patterns(colsub, names_x))
} else {
.SDcols = eval(colsub, parent.frame(), parent.frame())
# allow filtering via function in .SDcols, #3950
if (is.function(.SDcols)) {
.SDcols = lapply(x, .SDcols)
if (any(idx <- vapply_1i(.SDcols, length) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA)))
stop("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: ", brackify(names(x)[idx]))
.SDcols = unlist(.SDcols, use.names = FALSE)
}
}
}
if (anyNA(.SDcols))
stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols))))
if (is.logical(.SDcols)) {
ansvals = which_(rep(.SDcols, length.out=length(x)), !negate_sdcols)
ansvars = sdvars = names_x[ansvals]
} else if (is.numeric(.SDcols)) {
.SDcols = as.integer(.SDcols)
# if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
if (length(unique(sign(.SDcols))) > 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L))
stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx)))
ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols]
ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols
} else {
if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
if (!all(idx <- .SDcols %chin% names_x))
stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx]))
ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols
# dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
ansvals = chmatch(ansvars, names_x)
}
}
# fix for long standing FR/bug, #495 and #484
allcols = c(names_x, xdotprefix, names_i, idotprefix)
non_sdvars = setdiff(intersect(av, allcols), c(bynames, ansvars))
# added 'mget' - fix for #994
if (any(c("get", "mget") %chin% av)){
if (verbose)
cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars), domain = "R-data.table"))
# get('varname') is too difficult to detect which columns are used in general
# eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above.
# Do not include z in .SD when dt[, z := {.SD; get("x")}, .SDcols = "y"] (#2326, #2338)
if (jsub %iscall% ":=" && is.symbol(jsub[[2L]])) {
jsub_lhs_symbol = as.character(jsub[[2L]])
if (jsub_lhs_symbol %chin% non_sdvars) {
sdvars = setdiff(sdvars, jsub_lhs_symbol)
}
}
if (missing(.SDcols)) {
ansvars = setdiff(allcols, bynames) # fix for bug #34
} else {
# fixes #4089 - if .SDcols was already evaluated, we do not want the order of the columns to change.
ansvars = union(ansvars, setdiff(setdiff(allcols, ansvars), bynames))
}
non_sdvars = setdiff(ansvars, sdvars)
ansvals = chmatch(ansvars, names_x)