-
Notifications
You must be signed in to change notification settings - Fork 991
/
data.table.R
2725 lines (2610 loc) · 154 KB
/
data.table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
dim.data.table <- function(x)
{
.Call(Cdim, x)
}
.global <- new.env() # thanks to: http://stackoverflow.com/a/12605694/403310
setPackageName("data.table",.global)
.global$print = ""
.SD = .N = .I = .GRP = .BY = .EACHI = NULL
# These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler.
# But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols.
# And NULL makes it clear (to the R's mask check on loading) that they're variables not functions.
# utils::globalVariables(c(".SD",".N")) was tried as well, but exporting seems better.
# So even though .BY doesn't appear in this file, it should still be NULL here and exported because it's
# defined in SDenv and can be used by users.
print.data.table <- function(x, topn=getOption("datatable.print.topn"),
nrows=getOption("datatable.print.nrows"),
class=getOption("datatable.print.class"),
row.names=getOption("datatable.print.rownames"),
quote=FALSE, ...) { # topn - print the top topn and bottom topn rows with '---' inbetween (5)
# nrows - under this the whole (small) table is printed, unless topn is provided (100)
# class - should column class be printed underneath column name? (FALSE)
if (.global$print!="" && address(x)==.global$print) { # The !="" is to save address() calls and R's global cache of address strings
# := in [.data.table sets .global$print=address(x) to suppress the next print i.e., like <- does. See FAQ 2.22 and README item in v1.9.5
# The issue is distinguishing "> DT" (after a previous := in a function) from "> DT[,foo:=1]". To print.data.table(), there
# is no difference. Now from R 3.2.0 a side effect of the very welcome and requested change to avoid silent deep copy is that
# there is now no longer a difference between > DT and > print(DT). So decided that DT[] is now needed to guarantee print; simpler.
# This applies just at the prompt. Inside functions, print(DT) will of course print.
# Other options investigated (could revisit): Cstack_info(), .Last.value gets set first before autoprint, history(), sys.status(),
# topenv(), inspecting next statement in caller, using clock() at C level to timeout suppression after some number of cycles
SYS <- sys.calls()
if (length(SYS) <= 2 || # "> DT" auto-print or "> print(DT)" explicit print (cannot distinguish from R 3.2.0 but that's ok)
( length(SYS) > 3L &&
SYS[[length(SYS)-3L]][[1L]] == "knit_print.default") ) { # knitr auto print to mimic the prompt
.global$print = ""
return(invisible())
}
}
.global$print = ""
if (!is.numeric(nrows)) nrows = 100L
if (!is.infinite(nrows)) nrows = as.integer(nrows)
if (nrows <= 0L) return(invisible()) # ability to turn off printing
if (!is.numeric(topn)) topn = 5L
topnmiss = missing(topn)
topn = max(as.integer(topn),1L)
if (nrow(x) == 0L) {
if (length(x)==0L)
cat("Null data.table (0 rows and 0 cols)\n") # See FAQ 2.5 and NEWS item in v1.8.9
else
cat("Empty data.table (0 rows) of ",length(x)," col",if(length(x)>1L)"s",": ",paste(head(names(x),6),collapse=","),if(ncol(x)>6)"...","\n",sep="")
return()
}
if (topn*2<nrow(x) && (nrow(x)>nrows || !topnmiss)) {
toprint = rbind(head(x, topn), tail(x, topn))
rn = c(seq_len(topn), seq.int(to=nrow(x), length.out=topn))
printdots = TRUE
} else {
toprint = x
rn = seq_len(nrow(x))
printdots = FALSE
}
toprint=format.data.table(toprint, ...)
# fix for #975.
if (any(sapply(x, function(col) "integer64" %in% class(col))) && !"package:bit64" %in% search()) {
warning("Some columns have been read as type 'integer64' but package bit64 isn't loaded. Those columns will display as strange looking floating point data. There is no need to reload the data. Just require(bit64) to obtain the integer64 print method and print the data again.")
}
# FR #5020 - add row.names = logical argument to print.data.table
if (isTRUE(row.names)) rownames(toprint)=paste(format(rn,right=TRUE,scientific=FALSE),":",sep="") else rownames(toprint)=rep.int("", nrow(toprint))
if (is.null(names(x)) | all(names(x) == "")) colnames(toprint)=rep("", ncol(toprint)) # fixes bug #97 (RF#4934) and #545 (RF#5253)
if (isTRUE(class)) {
#Matching table for most common types & their abbreviations
class_abb = c(list = "<list>", integer = "<int>", numeric = "<num>",
character = "<char>", Date = "<Date>", complex = "<cplx>",
factor = "<fctr>", POSIXct = "<POSc>", logical = "<lgcl>",
IDate = "<IDat>", integer64 = "<i64>", raw = "<raw>",
expression = "<expr>", ordered = "<ord>")
classes = vapply(x, function(col) class(col)[1L], "", USE.NAMES=FALSE)
abbs = unname(class_abb[classes])
if ( length(idx <- which(is.na(abbs))) )
abbs[idx] = paste("<", classes[idx], ">", sep="")
toprint = rbind(abbs, toprint)
rownames(toprint)[1L] = ""
}
if (printdots) {
toprint = rbind(head(toprint,topn),"---"="",tail(toprint,topn))
rownames(toprint) = format(rownames(toprint),justify="right")
print(toprint,right=TRUE,quote=quote)
return(invisible())
}
if (nrow(toprint)>20L)
# repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them
toprint=rbind(toprint,matrix(colnames(toprint),nrow=1)) # fixes bug #4934
print(toprint,right=TRUE,quote=quote)
invisible()
}
# FR #2591 - format.data.table issue with columns of class "formula"
is.formula <- function(x) class(x) == "formula"
format.data.table <- function (x, ..., justify="none") {
if (is.atomic(x) && !is.null(x)) {
stop("Internal structure doesn't seem to be a list. Possibly corrupt data.table.")
}
format.item <- function(x) {
if (is.atomic(x) || is.formula(x)) # FR #2591 - format.data.table issue with columns of class "formula"
paste(c(format(head(x,6), justify=justify, ...), if(length(x)>6)""),collapse=",") # fix for #5435 - format has to be added here...
else
paste("<",class(x)[1L],">",sep="")
}
# FR #1091 for pretty printing of character
# TODO: maybe instead of doing "this is...", we could do "this ... test"?
char.trunc <- function(x, trunc.char = getOption("datatable.prettyprint.char")) {
trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE)
if (!is.character(x) || trunc.char <= 0L) return(x)
idx = which(nchar(x) > trunc.char)
x[idx] = paste(substr(x[idx], 1L, as.integer(trunc.char)), "...", sep="")
x
}
do.call("cbind",lapply(x,function(col,...){
if (!is.null(dim(col))) stop("Invalid column: it has dimensions. Can't format it. If it's the result of data.table(table()), use as.data.table(table()) instead.")
if (is.list(col)) col = sapply(col, format.item)
else col = format(char.trunc(col), justify=justify, ...) # added an else here to fix #5435
col
},...))
}
is.data.table <- function(x) inherits(x, "data.table")
is.ff <- function(x) inherits(x, "ff") # define this in data.table so that we don't have to require(ff), but if user is using ff we'd like it to work
#NCOL <- function(x) {
# # copied from base, but additionally covers data.table via is.list()
# # because NCOL in base explicity tests using is.data.frame()
# if (is.list(x) && !is.ff(x)) return(length(x))
# if (is.array(x) && length(dim(x)) > 1L) ncol(x) else as.integer(1L)
#}
#NROW <- function(x) {
# if (is.data.frame(x) || is.data.table(x)) return(nrow(x))
# if (is.list(x) && !is.ff(x)) stop("List is not a data.frame or data.table. Convert first before using NROW") # list may have different length elements, which data.table and data.frame's resolve.
# if (is.array(x)) nrow(x) else length(x)
#}
null.data.table <-function() {
ans = list()
setattr(ans,"class",c("data.table","data.frame"))
setattr(ans,"row.names",.set_row_names(0L))
alloc.col(ans)
}
data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
{
# NOTE: It may be faster in some circumstances to create a data.table by creating a list l first, and then setattr(l,"class",c("data.table","data.frame")) at the expense of checking.
# TO DO: rewrite data.table(), one of the oldest functions here. Many people use data.table() to convert data.frame rather than
# as.data.table which is faster; speed could be better. Revisit how many copies are taken in for example data.table(DT1,DT2) which
# cbind directs to. And the nested loops for recycling lend themselves to being C level.
x <- list(...) # doesn't copy named inputs as from R >= 3.1.0 (a very welcome change)
if (!.R.listCopiesNamed) .Call(CcopyNamedInList,x) # to maintain the old behaviour going forwards, for now. See test 548.2.
# **TO DO** Something strange with NAMED on components of `...`. To investigate. Or just port data.table() to C. This is why
# it's switched, because extra copies would be introduced in R <= 3.1.0, iiuc.
# fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top.
if (identical(x, list(NULL)) || identical(x, list(list())) ||
identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() )
tt <- as.list(substitute(list(...)))[-1L] # Intention here is that data.table(X,Y) will automatically put X and Y as the column names. For longer expressions, name the arguments to data.table(). But in a call to [.data.table, wrap in list() e.g. DT[,list(a=mean(v),b=foobarzoo(zang))] will get the col names
vnames = names(tt)
if (is.null(vnames)) vnames = rep.int("",length(x))
vnames[is.na(vnames)] = ""
novname = vnames==""
if (any(!novname)) {
if (any(vnames[!novname] == ".SD")) stop("A column may not be called .SD. That has special meaning.")
}
for (i in which(novname)) {
# if (ncol(as.data.table(x[[i]])) <= 1) { # cbind call in test 230 fails if I write ncol(as.data.table(eval(tt[[i]], parent.frame()))) <= 1, no idea why... (keep this for later even though all tests pass with ncol(.).. because base uses as.data.frame(.))
if (is.null(ncol(x[[i]]))) {
if ((tmp <- deparse(tt[[i]])[1]) == make.names(tmp))
vnames[i] <- tmp
}
}
tt = vnames==""
if (any(tt)) vnames[tt] = paste("V", which(tt), sep = "")
# so now finally we have good column names. We also will use novname later to know which were explicitly supplied in the call.
n <- length(x)
if (n < 1L)
return( null.data.table() )
if (length(vnames) != n) stop("logical error in vnames")
vnames <- as.list.default(vnames)
nrows = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required.
numcols = integer(n) # the ncols of each of the inputs (e.g. if inputs contain matrix or data.table)
for (i in seq_len(n)) {
xi = x[[i]]
if (is.null(xi)) stop("column or argument ",i," is NULL")
if ("POSIXlt" %chin% class(xi)) {
warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.")
x[[i]] = as.POSIXct(xi)
} else if (is.matrix(xi) || is.data.frame(xi)) { # including data.table (a data.frame, too)
xi = as.data.table(xi, keep.rownames=keep.rownames) # TO DO: allow a matrix to be a column of a data.table. This could allow a key'd lookup to a matrix, not just by a single rowname vector, but by a combination of several columns. A matrix column could be stored either by row or by column contiguous in memory.
x[[i]] = xi
numcols[i] = length(xi)
} else if (is.table(xi)) {
x[[i]] = xi = as.data.table.table(xi, keep.rownames=keep.rownames)
numcols[i] = length(xi)
} else if (is.function(xi)) {
x[[i]] = xi = list(xi)
}
nrows[i] <- NROW(xi) # for a vector (including list() columns) returns the length
if (numcols[i]>0L) {
namesi <- names(xi) # works for both data.frame's, matrices and data.tables's
if (length(namesi)==0L) namesi = rep.int("",ncol(xi))
namesi[is.na(namesi)] = ""
tt = namesi==""
if (any(tt)) namesi[tt] = paste("V", which(tt), sep = "")
if (novname[i]) vnames[[i]] = namesi
else vnames[[i]] = paste(vnames[[i]], namesi, sep=".")
}
}
nr <- max(nrows)
ckey = NULL
recycledkey = FALSE
for (i in seq_len(n)) {
xi = x[[i]]
if (is.data.table(xi) && haskey(xi)) {
if (nrows[i]<nr) recycledkey = TRUE
else ckey = c(ckey, key(xi))
}
}
for (i in which(nrows < nr)) {
# TO DO ... recycle in C, but not high priority as large data already regular from database or file
xi <- x[[i]]
if (identical(xi,list())) {
x[[i]] = vector("list", nr)
next
}
if (nrows[i]==0L) stop("Item ",i," has no length. Provide at least one item (such as NA, NA_integer_ etc) to be repeated to match the ",nr," rows in the longest column. Or, all columns can be 0 length, for insert()ing rows into.")
# Implementing FR #4813 - recycle with warning when nr %% nrows[i] != 0L
if (nr%%nrows[i] != 0L) warning("Item ", i, " is of size ", nrows[i], " but maximum size is ", nr, " (recycled leaving remainder of ", nr%%nrows[i], " items)")
# if (nr%%nrows[i] == 0L) {
if (is.data.frame(xi)) { # including data.table
..i = rep(seq_len(nrow(xi)), length.out = nr)
x[[i]] = xi[..i,,drop=FALSE]
next
}
if (is.atomic(xi) || is.list(xi)) {
# TO DO: surely use set() here, or avoid the coercion
x[[i]] = rep(xi, length.out = nr)
next
}
stop("problem recycling column ",i,", try a simpler type")
# }
stop("argument ",i," (nrow ",nrows[i],") cannot be recycled without remainder to match longest nrow (",nr,")")
}
if (any(numcols>0L)) {
value = vector("list",sum(pmax(numcols,1L)))
k = 1L
for(i in seq_len(n)) {
if (is.list(x[[i]]) && !is.ff(x[[i]])) {
for(j in seq_len(length(x[[i]]))) {
value[[k]] = x[[i]][[j]]
k=k+1L
}
} else {
value[[k]] = x[[i]]
k=k+1L
}
}
} else {
value = x
}
vnames <- unlist(vnames)
if (check.names) # default FALSE
vnames <- make.names(vnames, unique = TRUE)
setattr(value,"names",vnames)
setattr(value,"row.names",.set_row_names(nr))
setattr(value,"class",c("data.table","data.frame"))
if (!is.null(key)) {
if (!is.character(key)) stop("key argument of data.table() must be character")
if (length(key)==1L) {
key = strsplit(key,split=",")[[1L]]
# eg key="A,B"; a syntax only useful in key argument to data.table(), really.
}
setkeyv(value,key)
} else {
# retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table().
# If DT inputs with keys have been recycled then can't retain key
if (length(ckey)
&& !recycledkey
&& !any(duplicated(ckey))
&& all(ckey %in% names(value))
&& !any(duplicated(names(value)[names(value) %in% ckey])))
setattr(value, "sorted", ckey)
}
# FR #643, setfactor is an internal function in fread.R
if (isTRUE(stringsAsFactors)) setfactor(value, which(vapply(value, is.character, TRUE)), FALSE)
alloc.col(value) # returns a NAMED==0 object, unlike data.frame()
}
replace_dot_alias <- function(e) {
# we don't just simply alias .=list because i) list is a primitive (faster to iterate) and ii) we test for use
# of "list" in several places so it saves having to remember to write "." || "list" in those places
if (is.call(e)) {
if (e[[1L]] == ".") e[[1L]] = quote(list)
for (i in seq_along(e)[-1]) if (!is.null(e[[i]])) e[[i]] = replace_dot_alias(e[[i]])
}
e
}
.massagei <- function(x) {
# J alias for list as well in i, just if the first symbol
if (is.call(x) && as.character(x[[1L]]) %chin% c("J","."))
x[[1L]] = quote(list)
x
}
# A (relatively) fast (uses DT grouping) wrapper for matching two vectors, BUT:
# it behaves like 'pmatch' but only the 'exact' matching part. That is, a value in
# 'x' is matched to 'table' only once. No index will be present more than once.
# This should make it even clearer:
# chmatch2(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
# chmatch2(c("a", "a"), c("a", "b")) # 1,NA - the second one doesn't 'see' the first 'a'
# chmatch2(c("a", "a"), c("a", "a.1")) # 1,NA - this is where it differs from pmatch - we don't need the partial match.
chmatch2 <- function(x, table, nomatch=NA_integer_) {
.Call(Cchmatch2, x, table, as.integer(nomatch)) # this is in 'rbindlist.c' for now.
}
"[.data.table" <- function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch"), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
{
# ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could
# test explicitly if the caller is [.data.table (even stronger test. TO DO.)
# the drop=NULL is to sink drop argument when dispatching to [.data.frame; using '...' stops test 147
if (!cedta()) {
# Fix for #5070 (to do)
Nargs = nargs() - (!missing(drop))
ans = if (Nargs<3L) `[.data.frame`(x,i) # drop ignored anyway by DF[i]
else if (missing(drop)) `[.data.frame`(x,i,j)
else `[.data.frame`(x,i,j,drop)
# added is.data.table(ans) check to fix bug #5069
if (!missing(i) & is.data.table(ans)) setkey(ans,NULL) # See test 304
return(ans)
}
if (!mult %chin% c("first","last","all")) stop("mult argument can only be 'first','last' or 'all'")
if (length(roll)!=1L || is.na(roll)) stop("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
if (is.character(roll)) {
if (roll!="nearest") stop("roll is '",roll,"' (type character). Only valid character value is 'nearest'.")
} else {
roll = if (isTRUE(roll)) +Inf else as.double(roll)
}
force(rollends)
if (!is.logical(rollends)) stop("rollends must be a logical vector")
if (length(rollends)>2) stop("rollends must be length 1 or 2")
if (length(rollends)==1) rollends=rep.int(rollends,2L)
# TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one.
missingnomatch = missing(nomatch)
if (!is.na(nomatch) && nomatch!=0L) stop("nomatch must either be NA or 0, or (ideally) NA_integer_ or 0L")
nomatch = as.integer(nomatch)
if (!is.logical(which) || length(which)>1) stop("'which' must be a logical vector length 1. Either FALSE, TRUE or NA.")
if ((isTRUE(which)||is.na(which)) && !missing(j)) stop("'which' is ",which," (meaning return row numbers) but 'j' is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.")
if (!is.na(nomatch) && is.na(which)) stop("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.")
.global$print=""
if (missing(i) && missing(j)) {
# ...[] == oops at console, forgot print(...)
# or some kind of dynamic construction that has edge case of no contents inside [...]
return(x)
}
if (!with && missing(j)) stop("j must be provided when with=FALSE")
if (!missing(j)) {
jsub = replace_dot_alias(substitute(j))
if (is.call(jsub) && jsub[[1L]]=="{") {
if (length(jsub)==2) jsub = jsub[[2L]] # to allow {} wrapping of := e.g. [,{`:=`(...)},] [#376]
else if (is.call(jsub[[2L]]) && jsub[[2L]][[1L]] == ":=")
stop("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}")
}
if (is.call(jsub)) {
if (jsub[[1L]] == "eval" && !any(all.vars(jsub[[2]]) %in% names(x))) {
# Grab the dynamic expression from calling scope now to give the optimizer a chance to optimize it
# Only when top level is eval call. Not nested like x:=eval(...) or `:=`(x=eval(...), y=eval(...))
jsub = eval(jsub[[2L]], parent.frame(), parent.frame()) # this evals the symbol to return the dynamic expression
if (is.expression(jsub)) jsub = jsub[[1L]] # if expression, convert it to call
# Note that the dynamic expression could now be := (new in v1.9.7)
}
if (is.call(jsub) && jsub[[1L]] == ":=") allow.cartesian=TRUE # (see #800)
}
}
bysub=NULL
if (!missing(by)) bysub=substitute(by)
if (!missing(keyby)) {
if (!missing(by)) stop("Provide either 'by' or 'keyby' but not both")
by=bysub=substitute(keyby)
# Assign to 'by' so that by is no longer missing and we can proceed as if there were one by
}
byjoin = FALSE
if (!missing(by)) {
if (missing(j)) stop("'by' or 'keyby' is supplied but not j")
byjoin = is.symbol(bysub) && bysub==".EACHI"
}
irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency.
notjoin = FALSE
rightcols = leftcols = integer()
# To take care of duplicate column names properly (see chmatch2 function above `[data.table`) for description
dupmatch <- function(x, y, ...) {
if (anyDuplicated(x))
pmax(chmatch(x,y, ...), chmatch2(x,y,0L))
else chmatch(x,y)
}
# setdiff removes duplicate entries, which'll create issues with duplicated names. Use '%chin% instead.
dupdiff <- function(x, y) x[!x %chin% y]
if (!missing(i)) {
xo = NULL
isub = substitute(i)
isnull_inames = FALSE
nqgrp = integer(0) # for non-equi join
nqmaxgrp = 1L # for non-equi join
# Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires
# the "eval" to be checked before `as.name("!")`. Therefore interchanged.
restore.N = remove.N = FALSE
if (exists(".N", envir=parent.frame(), inherits=FALSE)) {
old.N = get(".N", envir=parent.frame(), inherits=FALSE)
locked.N = bindingIsLocked(".N", parent.frame())
if (locked.N) eval(call("unlockBinding", ".N", parent.frame())) # eval call to pass R CMD check NOTE until we find cleaner way
assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
restore.N = TRUE
# the comment below is invalid hereafter (due to fix for #1145)
# binding locked when .SD[.N] but that's ok as that's the .N we want anyway
# TO DO: change isub at C level s/.N/nrow(x); changing a symbol to a constant should be ok
} else {
assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
remove.N = TRUE
}
if (is.call(isub) && isub[[1L]]=="eval") { # TO DO: or ..()
isub = eval(.massagei(isub[[2L]]), parent.frame(), parent.frame())
if (is.expression(isub)) isub=isub[[1L]]
}
if (is.call(isub) && isub[[1L]] == as.name("!")) {
notjoin = TRUE
if (!missingnomatch) stop("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch.");
nomatch = 0L
isub = isub[[2L]]
# #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!"
if (is.call(isub) && isub[[1L]] == "(" && !is.name(isub[[2L]]))
isub = isub[[2L]]
}
if (is.call(isub) && isub[[1L]] == as.name("order") && getOption("datatable.optimize") >= 1) { # optimize here so that we can switch it off if needed
if (verbose) cat("order optimisation is on, i changed from 'order(...)' to 'forder(DT, ...)'.\n")
isub = as.list(isub)
isub = as.call(c(list(quote(forder), quote(x)), isub[-1L]))
}
if (is.null(isub)) return( null.data.table() )
if (is.call(isub) && isub[[1L]] == quote(forder)) {
order_env = new.env(parent=parent.frame()) # until 'forder' is exported
assign("forder", forder, order_env)
assign("x", x, order_env)
i = eval(isub, order_env, parent.frame()) # for optimisation of 'order' to 'forder'
# that forder returns integer(0) is taken care of internally within forder
} else if (is.call(isub) &&
getOption("datatable.use.index") && # #1422
as.character(isub[[1L]]) %chin% c("==","%in%") &&
is.name(isub[[2L]]) &&
(isub2<-as.character(isub[[2L]])) %chin% names(x) &&
(getOption("datatable.auto.index") || (isub2 %chin% indices(x))) && # `||` used to either auto.index or already have index #1422
is.null(attr(x, '.data.table.locked'))) { # fix for #958, don't create auto index on '.SD'.
# LHS is a column name symbol
# simplest case for now (single ==). Later, top level may be &,|,< or >
# TO DO: print method could print physical and secondary keys at end.
# TO DO: move down to if (is.data.table) clause below, later ...
RHS = eval(isub[[3L]], x, parent.frame())
# fix for #961
if (is.list(RHS)) RHS = as.character(RHS)
if (isub[[1L]] == "==" && length(RHS)>1) {
if (length(RHS)!=nrow(x)) stop("RHS of == is length ",length(RHS)," which is not 1 or nrow (",nrow(x),"). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %in% instead.")
i = x[[isub2]] == RHS # DT[colA == colB] regular element-wise vector scan
} else if ( (is.integer(x[[isub2]]) && is.double(RHS) && isReallyReal(RHS)) || (mode(x[[isub2]]) != mode(RHS) && !(class(x[[isub2]]) %in% c("character", "factor") &&
class(RHS) %in% c("character", "factor"))) ||
(is.factor(x[[isub2]]) && !is.factor(RHS) && mode(RHS)=="numeric") ) { # fringe case, #1361. TODO: cleaner way of doing these checks.
# re-direct all non-matching mode cases to base R, as data.table's binary
# search based join is strict in types. #957 and #961.
i = if (isub[[1L]] == "==") x[[isub2]] == RHS else x[[isub2]] %in% RHS
} else {
# fix for #932 (notjoin) and also when RHS is NA (and notjoin is also TRUE)
if (isub[[1L]] == "==") {
# RHS is of length=1 or n
if (any_na(as_list(RHS))) {
notjoin = FALSE
RHS = RHS[0L]
} else if (notjoin) {
RHS = c(RHS, if (is.double(RHS)) c(NA, NaN) else NA)
}
}
if (haskey(x) && isub2 == key(x)[1L]) {
# join to key(x)[1L]
xo <- integer()
rightcols = chmatch(key(x)[1],names(x))
} else {
xo = get2key(x,isub2) # Can't be any index with that col as the first one because those indexes will reorder within each group
if (is.null(xo)) { # integer() would be valid and signifies o=1:.N
if (verbose) {cat("Creating new index '",isub2,"'\n",sep="");flush.console()}
if (identical(getOption("datatable.auto.index"), FALSE)) warning("Index is being created on '",isub2,"' besides the fact that option 'datatable.auto.index' is FALSE. Please report to data.table#1422.") # why not double check that, even if won't happen now may be a good check for future changes
setindexv(x,isub2)
xo = get2key(x,isub2)
} else {
if (verbose) {cat("Using existing index '",isub2,"'\n",sep="");flush.console()}
}
rightcols = chmatch(isub2, names(x))
}
# convert RHS to list to join to key (either physical or secondary)
i = as.data.table( unique(RHS) )
# To do: wrap isub[[3L]] with as.data.table() first before eval to save copy
leftcols = 1L
ans = bmerge(i, x, leftcols, rightcols, io<-FALSE, xo, roll=0.0, rollends=c(FALSE,FALSE), nomatch=0L, mult="all", 1L, nqgrp, nqmaxgrp, verbose=verbose)
# No need to shallow copy i before passing to bmerge; we just created i above ourselves
i = if (ans$allLen1 && !identical(suppressWarnings(min(ans$starts)), 0L)) ans$starts else vecseq(ans$starts, ans$lens, NULL)
if (length(xo)) i = fsort(xo[i]) else i = fsort(i) # fix for #1495
leftcols = rightcols = NULL # these are used later to know whether a join was done, affects column order of result. So reset.
}
} else if (!is.name(isub)) i = eval(.massagei(isub), x, parent.frame())
else i = eval(isub, parent.frame(), parent.frame())
if (restore.N) {
assign(".N", old.N, envir=parent.frame())
if (locked.N) lockBinding(".N", parent.frame())
}
if (remove.N) rm(list=".N", envir=parent.frame())
if (is.matrix(i)) {
if (is.numeric(i) && ncol(i)==1L) { # #826 - subset DT on single integer vector stored as matrix
i = as.integer(i)
} else {
stop("i is invalid type (matrix). Perhaps in future a 2 column matrix could return a list of elements of DT (in the spirit of A[B] in FAQ 2.14). Please let datatable-help know if you'd like this, or add your comments to FR #657.")
}
}
if (is.logical(i)) {
if (notjoin) {
notjoin = FALSE
i = !i
}
}
if (is.null(i)) return( null.data.table() )
if (is.character(i)) {
isnull_inames = TRUE
i = data.table(V1=i) # for user convenience; e.g. DT["foo"] without needing DT[.("foo")]
} else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) i = as.data.table(i[[1L]])
else if (identical(class(i),"data.frame")) i = as.data.table(i) # TO DO: avoid these as.data.table() and use a flag instead
else if (identical(class(i),"list")) {
isnull_inames = is.null(names(i))
i = as.data.table(i)
}
if (is.data.table(i)) {
if (!haskey(x) && missing(on) && is.null(xo)) {
stop("When i is a data.table (or character vector), the columns to join by must be specified either using 'on=' argument (see ?data.table) or by keying x (i.e. sorted, and, marked as sorted, see ?setkey). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
}
if (!missing(on)) {
# on = .() is now possible, #1257
parse_on <- function(onsub) {
ops = c("==", "<=", "<", ">=", ">", "!=")
pat = paste("(", ops, ")", sep = "", collapse = "|")
if (is.call(onsub) && onsub[[1L]] == "eval") {
onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L))
if (is.call(onsub) && onsub[[1L]] == "eval") onsub = onsub[[2L]]
}
if (is.call(onsub) && as.character(onsub[[1L]]) %in% c("list", ".")) {
spat = paste("[ ]+(", pat, ")[ ]+", sep="")
onsub = lapply(as.list(onsub)[-1L], function(x) gsub(spat, "\\1", deparse(x, width.cutoff=500L)))
onsub = as.call(c(quote(c), onsub))
}
on = eval(onsub, parent.frame(2L), parent.frame(2L))
if (!is.character(on))
stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
this_op = regmatches(on, gregexpr(pat, on))
idx = (vapply(this_op, length, 0L) == 0L)
this_op[idx] = "=="
this_op = unlist(this_op, use.names=FALSE)
idx_op = match(this_op, ops, nomatch=0L)
if (any(idx_op %in% c(0L, 6L)))
stop("Invalid operators ", paste(this_op[idx_op==0L], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".")
if (is.null(names(on))) {
on[idx] = if (isnull_inames) paste(on[idx], paste("V", seq_len(sum(idx)), sep=""), sep="==") else paste(on[idx], on[idx], sep="==")
} else {
on[idx] = paste(names(on)[idx], on[idx], sep="==")
}
split = tstrsplit(on, paste("[ ]*", pat, "[ ]*", sep=""))
on = setattr(split[[2L]], 'names', split[[1L]])
if (length(empty_idx <- which(names(on) == "")))
names(on)[empty_idx] = on[empty_idx]
list(on = on, ops = idx_op)
}
on_ops = parse_on(substitute(on))
on = on_ops[[1L]]
ops = on_ops[[2L]]
# TODO: collect all '==' ops first to speeden up Cnestedid
rightcols = chmatch(names(on), names(x))
if (length(nacols <- which(is.na(rightcols))))
stop("Column(s) [", paste(names(on)[nacols], collapse=","), "] not found in x")
leftcols = chmatch(unname(on), names(i))
if (length(nacols <- which(is.na(leftcols))))
stop("Column(s) [", paste(unname(on)[nacols], collapse=","), "] not found in i")
# figure out the columns on which to compute groups on
non_equi = which.first(ops != 1L) # 1 is "==" operator
if (!is.na(non_equi)) { # non-equi conditions present.. investigate groups..
nqcols = rightcols[non_equi:length(rightcols)]
nqgrp = .Call(Cnestedid, x, nqcols, forderv(x, nqcols))
if ( (nqmaxgrp <- max(nqgrp)) > 1L) { # got some non-equi join work to do
if ("_nqgrp_" %in% names(x)) stop("Column name '_nqgrp_' is reserved for non-equi joins.")
set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp)
xo = forderv(nqx, c(ncol(nqx), rightcols))
} else nqgrp = integer(0)
}
if (nqmaxgrp == 1L) { # equi join. Reuse secondary index, #1439
xo = if (isTRUE(getOption("datatable.use.index"))) {
if (verbose) cat("Looking for existing (secondary) index... ")
attr(attr(x, 'index'), paste("__", names(x)[rightcols], sep="", collapse=""))
}
if (is.null(xo)) {
if (verbose) {
if (isTRUE(getOption("datatable.use.index"))) cat("not found.\n")
tt = system.time(xo <- forderv(x, by=rightcols))
cat("forder took", tt["user.self"] + tt["sys.self"], "sec\n")
} else xo = forderv(x, by = rightcols)
} else {
if (verbose) cat("found. Reusing index.\n")
}
} else {
if (!missing(by)) stop("by-joins are not yet implemented for multi-group non-equi-joins.")
}
} else if (is.null(xo)) {
rightcols = chmatch(key(x),names(x)) # NAs here (i.e. invalid data.table) checked in bmerge()
leftcols = if (haskey(i))
chmatch(head(key(i),length(rightcols)),names(i))
else
seq_len(min(length(i),length(rightcols)))
rightcols = head(rightcols,length(leftcols))
xo = integer() ## signifies 1:.N
if (missing(by) && with && isTRUE(getOption("datatable.old.bywithoutby"))) {
# To revert to <=v1.9.2 behaviour. TO DO: remove option after Sep 2015
warning("The data.table option 'datatable.old.bywithoutby' for grouping on join without providing `by` will be deprecated in the next release, use `by=.EACHI`.", call. = FALSE)
by=bysub=as.symbol(".EACHI")
byjoin=TRUE
txtav = c(names(x)[-rightcols], names(i)[-leftcols])
if (missing(j)) j = jsub = as.call(parse(text=paste(".(",paste(txtav, collapse=","),")",sep="")))[[1]]
}
ops = rep(1L, length(leftcols))
}
# Implementation for not-join along with by=.EACHI, #604
if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571 fix
notjoin = FALSE
if (verbose) {last.started.at=proc.time()[3];cat("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff(x,i) ...");flush.console()}
orignames = copy(names(i))
i = setdiff_(x, i, rightcols, leftcols) # part of #547
if (verbose) {cat("done in",round(proc.time()[3]-last.started.at,3),"secs\n");flush.console}
setnames(i, orignames[leftcols])
setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
}
io = if (missing(on)) haskey(i) else identical(unname(on), head(key(i), length(on)))
i = .shallow(i, retain.key = io)
ans = bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp, verbose=verbose)
# temp fix for issue spotted by Jan. Ideally would like to avoid this 'setorder', as there's another
# 'setorder' in generating 'irows' below...
if (length(ans$indices)) setorder(setDT(ans[1:3]), indices)
allLen1 = ans$allLen1
allGrp1 = ans$allGrp1
f__ = ans$starts
len__ = ans$lens
indices__ = ans$indices
# length of input nomatch (single 0 or NA) is 1 in both cases.
# When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N
# If using secondary key of x, f__ will refer to xo
if (is.na(which)) {
w = if (notjoin) f__!=0L else is.na(f__)
return( if (length(xo)) fsort(xo[w]) else which(w) )
}
if (mult=="all") {
if (!byjoin) {
# Really, `anyDuplicated` in base is AWESOME!
# allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
irows = if (allLen1) f__ else vecseq(f__,len__,
if( allow.cartesian ||
notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
!anyDuplicated(f__, incomparables = c(0L, NA_integer_))) # #742. If 'i' has no duplicates, ignore
NULL
else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
# Fix for #1092 and #1074
# TODO: implement better version of "any"/"all"/"which" to avoid
# unnecessary construction of logical vectors
if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L]
} else {
if (length(xo) && missing(on)) stop("Cannot by=.EACHI when joining to a secondary key, yet")
# since f__ refers to xo later in grouping, so xo needs to be passed through to dogroups too.
if (length(irows)) stop("Internal error. irows has length in by=.EACHI")
}
} else {
# turning on mult = "first"/"last" for non-equi joins again to test..
# if (nqmaxgrp>1L) stop("Non-equi joins aren't yet functional with mult='first' and mult='last'.")
# mult="first"/"last" logic moved to bmerge.c, also handles non-equi cases, #1452
if (!byjoin) { #1287 and #1271
irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic
if (identical(nomatch,0L)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows
}
# TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"?
# TODO: how about when nomatch=0L, can we avoid allocating then as well?
}
if (length(xo) && length(irows)) {
irows = xo[irows] # TO DO: fsort here?
if (mult=="all" && !allGrp1 && length(xo)) {
irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))$irows
}
}
} else {
if (!missing(on)) {
stop("logical error. i is not a data.table, but 'on' argument is provided.")
}
# TO DO: TODO: Incorporate which_ here on DT[!i] where i is logical. Should avoid i = !i (above) - inefficient.
# i is not a data.table
if (!is.logical(i) && !is.numeric(i)) stop("i has not evaluated to logical, integer or double")
if (is.logical(i)) {
if (isTRUE(i)) irows = i = NULL # fixes #1249
else if (identical(i, NA)) irows=i=integer(0) # DT[NA] thread recycling of NA logical exists,
# but for #1252 and consistency, we need to return 0-rows
else if (length(i)==nrow(x)) irows = i = which(i) # e.g. DT[colA>3,which=TRUE]
# also replacing 'i' here - to save memory, #926.
else irows=seq_len(nrow(x))[i] # e.g. recycling DT[c(TRUE,FALSE),which=TRUE], for completeness
# it could also be DT[!TRUE, which=TRUE] (silly cases, yes).
# replaced the "else if (!isTRUE(i))" to just "else". Fixes bug report #4930
} else {
irows = as.integer(i) # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error)
irows = .Call(CconvertNegativeIdx, irows, nrow(x)) # simplifies logic from here on (can assume positive subscripts)
# maintains Arun's fix for #2697 (test 1042)
# efficient in C with more detailed messages
# falls through quickly (no R level allocs) if no negatives
# minor TO DO: can we merge this with check_idx in fcast.c/subset ?
}
}
if (notjoin) {
if (byjoin || !is.integer(irows) || is.na(nomatch)) stop("Internal error: notjoin but byjoin or !integer or nomatch==NA")
irows = irows[irows!=0L]
i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x))
leftcols = integer() # proceed as if row subset from now on, length(leftcols) is switched on later
rightcols = integer()
# Doing this once here, helps speed later when repeatedly subsetting each column. R's [irows] would do this for each
# column when irows contains negatives.
}
if (which) return( if (is.null(irows)) seq_len(nrow(x)) else irows )
} else { # missing(i)
i = NULL
}
byval = NULL
xnrow = nrow(x)
xcols = xcolsAns = icols = icolsAns = integer()
othervars = character(0)
if (missing(j)) {
# missing(by)==TRUE was already checked above before dealing with i
if (!length(x)) return(null.data.table())
if (!length(leftcols)) {
ansvars = nx = names(x)
jisvars = character()
xcols = xcolsAns = seq_along(x)
} else {
jisvars = names(i)[-leftcols]
tt = jisvars %chin% names(x)
if (length(tt)) jisvars[tt] = paste("i.",jisvars[tt],sep="")
if (length(duprightcols <- rightcols[duplicated(rightcols)])) {
nx = c(names(x), names(x)[duprightcols])
rightcols = chmatch2(names(x)[rightcols], nx)
nx = make.unique(nx)
} else nx = names(x)
ansvars = make.unique(c(nx, jisvars))
icols = c(leftcols, seq_along(i)[-leftcols])
icolsAns = c(rightcols, seq.int(length(nx)+1L, length.out=ncol(i)-length(unique(leftcols))))
xcols = xcolsAns = seq_along(x)[-rightcols]
}
ansvals = chmatch(ansvars, nx)
} else {
# j was substituted before dealing with i so that := can set allow.cartesian=FALSE (#800) (used above in i logic)
if (is.null(jsub)) return(NULL)
if (is.call(jsub) && jsub[[1L]]==":=") {
# short circuit do-nothing, don't do further checks on .SDcols for example
if (identical(irows, integer())) {
if (identical(nomatch, 0L)) {
.global$print = address(x)
return(invisible(x)) # irows=NULL means all rows at this stage
} else irows = rep(NA_integer_, nrow(x)) # fix for #759
}
if (!with) {
if (is.null(names(jsub)) && is.name(jsub[[2L]])) {
# TO DO: uncomment these warnings in next release. Later, make both errors.
## warning("with=FALSE is deprecated when using :=. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples.")
jsub[[2L]] = eval(jsub[[2L]], parent.frame(), parent.frame())
} else {
## warning("with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.")
}
with = TRUE
}
}
if (!with) {
# missing(by)==TRUE was already checked above before dealing with i
if (is.call(jsub) && deparse(jsub[[1]], 500L) %in% c("!", "-")) { # TODO is deparse avoidable here?
notj = TRUE
jsub = jsub[[2L]]
} else notj = FALSE
# fix for #1216, make sure the paranthesis are peeled from expr of the form (((1:4)))
while (is.call(jsub) && jsub[[1L]] == "(") jsub = as.list(jsub)[[-1L]]
if (is.call(jsub) && length(jsub) == 3L && jsub[[1L]] == ":") {
j = eval(jsub, setattr(as.list(seq_along(x)), 'names', names(x)), parent.frame()) # else j will be evaluated for the first time on next line
} else {
j = eval(jsub, parent.frame(), parent.frame())
}
if (is.logical(j)) j <- which(j)
if (!length(j)) return( null.data.table() )
if (is.factor(j)) j = as.character(j) # fix for FR: #4867
if (is.character(j)) {
if (notj) {
w = chmatch(j, names(x))
if (any(is.na(w))) {
warning("column(s) not removed because not found: ",paste(j[is.na(w)],collapse=","))
w = w[!is.na(w)]
}
# changed names(x)[-w] to use 'setdiff'. Here, all instances of the column must be removed.
# Ex: DT <- data.table(x=1, y=2, x=3); DT[, !"x", with=FALSE] should just output 'y'.
# But keep 'dup cols' beause it's basically DT[, !names(DT) %chin% "x", with=FALSE] which'll subset all cols not 'x'.
ansvars = if (length(w)) dupdiff(names(x), names(x)[w]) else names(x)
ansvals = dupmatch(ansvars, names(x))
} else {
# once again, use 'setdiff'. Basically, unless indices are specified in `j`, we shouldn't care about duplicated columns.
ansvars = j # x. and i. prefixes may be in here, and they'll be dealt with below
# dups = FALSE here.. even if DT[, c("x", "x"), with=FALSE], we subset only the first.. No way to tell which one the OP wants without index.
ansvals = chmatch(ansvars, names(x))
}
} else if (is.numeric(j)) {
if (all(j == 0L)) return (null.data.table())
if (any(abs(j) > ncol(x) | j==0L)) stop("j out of bounds")
if (any(j<0L) && any(j>0L)) stop("j mixes positive and negative")
if (any(j<0L)) j = seq_len(ncol(x))[j]
ansvars = names(x)[ if (notj) -j else j ] # DT[,!"columntoexclude",with=FALSE], if a copy is needed, rather than :=NULL
# DT[, c(1,3), with=FALSE] should clearly provide both 'x' columns
ansvals = if (notj) setdiff(seq_along(x), as.integer(j)) else as.integer(j)
} else stop("When with=FALSE, j-argument should be of type logical/character/integer indicating the columns to select.") # fix for #1440.
if (!length(ansvals)) return(null.data.table())
} else { # with=TRUE and byjoin could be TRUE
bynames = NULL
allbyvars = NULL
if (byjoin) {
bynames = names(x)[rightcols]
} else if (!missing(by)) {
# deal with by before j because we need byvars when j contains .SD
# may evaluate to NULL | character() | "" | list(), likely a result of a user expression where no-grouping is one case being loop'd through
bysubl = as.list.default(bysub)
bysuborig = bysub
if (is.name(bysub) && !(as.character(bysub) %chin% names(x))) { # TO DO: names(x),names(i),and i. and x. prefixes
bysub = eval(bysub, parent.frame(), parent.frame())
# fix for # 5106 - http://stackoverflow.com/questions/19983423/why-by-on-a-vector-not-from-a-data-table-column-is-very-slow
# case where by=y where y is not a column name, and not a call/symbol/expression, but an atomic vector outside of DT.
# note that if y is a list, this'll return an error (not sure if it should).
if (is.atomic(bysub)) bysubl = list(bysuborig) else bysubl = as.list.default(bysub)
}
if (length(bysubl) && identical(bysubl[[1L]],quote(eval))) { # TO DO: or by=..()
bysub = eval(bysubl[[2]], parent.frame(), parent.frame())
bysub = replace_dot_alias(bysub) # fix for #1298
if (is.expression(bysub)) bysub=bysub[[1L]]
bysubl = as.list.default(bysub)
} else if (is.call(bysub) && as.character(bysub[[1L]]) %chin% c("c","key","names", "intersect", "setdiff")) {
# catch common cases, so we don't have to copy x[irows] for all columns
# *** TO DO ***: try() this eval first (as long as not list() or .()) and see if it evaluates to column names
# to avoid the explicit c,key,names which already misses paste("V",1:10) for example
# tried before but since not wrapped in try() it failed on some tests
# or look for column names used in this by (since if none it wouldn't find column names anyway
# when evaled within full x[irows]). Trouble is that colA%%2L is a call and should be within frame.
tt = eval(bysub, parent.frame(), parent.frame())
if (!is.character(tt)) stop("by=c(...), key(...) or names(...) must evaluate to 'character'")
bysub=tt
} else if (is.call(bysub) && !as.character(bysub[[1L]]) %chin% c("list", "as.list", "{", ".", ":")) {
# potential use of function, ex: by=month(date). catch it and wrap with "(", because we need to set "bysameorder" to FALSE as we don't know if the function will return ordered results just because "date" is ordered. Fixes #2670.
bysub = as.call(c(as.name('('), list(bysub)))
bysubl = as.list.default(bysub)
} else if (is.call(bysub) && bysub[[1L]] == ".") bysub[[1L]] = quote(list)
if (mode(bysub) == "character") {
if (length(grep(",",bysub))) {
if (length(bysub)>1L) stop("'by' is a character vector length ",length(bysub)," but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.")
bysub = strsplit(bysub,split=",")[[1L]]
}
tt = grep("^[^`]+$",bysub)
if (length(tt)) bysub[tt] = paste("`",bysub[tt],"`",sep="")
bysub = parse(text=paste("list(",paste(bysub,collapse=","),")",sep=""))[[1L]]
bysubl = as.list.default(bysub)
}
allbyvars = intersect(all.vars(bysub),names(x))
orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause)
# orderedirows = is.sorted(f__)
bysameorder = orderedirows && haskey(x) && all(sapply(bysubl,is.name)) && length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars)))
if (is.null(irows))
if (is.call(bysub) && length(bysub) == 3L && bysub[[1L]] == ":" && is.name(bysub[[2L]]) && is.name(bysub[[3L]])) {
byval = eval(bysub, setattr(as.list(seq_along(x)), 'names', names(x)), parent.frame())
byval = as.list(x)[byval]
} else byval = eval(bysub, x, parent.frame())
else {
if (!is.integer(irows)) stop("Internal error: irows isn't integer") # length 0 when i returns no rows
# Passing irows as i to x[] below has been troublesome in a rare edge case.
# irows may contain NA, 0, negatives and >nrow(x) here. That's all ok.
# But we may need i join column values to be retained (where those rows have no match), hence we tried eval(isub)
# in 1.8.3, but this failed test 876.
# TO DO: Add a test like X[i,sum(v),by=i.x2], or where by includes a join column (both where some i don't match).
# TO DO: Make xss directly, rather than recursive call.
if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset
if (length(allbyvars)) { ############### TO DO TO DO TO DO ###############
if (verbose) cat("i clause present and columns used in by detected, only these subset:",paste(allbyvars,collapse=","),"\n")
xss = x[irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends]
} else {
if (verbose) cat("i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '",deparse(by),"'\n",sep="")
xss = x[irows,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends]
}
if (is.call(bysub) && length(bysub) == 3L && bysub[[1L]] == ":") {
byval = eval(bysub, setattr(as.list(seq_along(xss)), 'names', names(xss)), parent.frame())
byval = as.list(xss)[byval]
} else byval = eval(bysub, xss, parent.frame())
xnrow = nrow(xss)
# TO DO: pass xss (x subset) through into dogroups. Still need irows there (for :=), but more condense
# and contiguous to use xss to form .SD in dogroups than going via irows
}
if (!length(byval) && xnrow>0L) {
# see missing(by) up above for comments
# by could be NULL or character(0) for example (e.g. passed in as argument in a loop of different bys)
bysameorder = FALSE # 1st and only group is the entire table, so could be TRUE, but FALSE to avoid
# a key of empty character()
byval = list()
bynames = allbyvars = NULL
# the rest now fall through
} else bynames = names(byval)
if (is.atomic(byval)) {
if (is.character(byval) && length(byval)<=ncol(x) && !(is.name(bysub) && as.character(bysub)%chin%names(x)) ) {
stop("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval",deparse(bysub)," should work. This is for efficiency so data.table can detect which columns are needed.")
} else {
# by may be a single unquoted column name but it must evaluate to list so this is a convenience to users. Could also be a single expression here such as DT[,sum(v),by=colA%%2]
byval = list(byval)
bysubl = c(as.name("list"),bysuborig) # for guessing the column name below
if (is.name(bysuborig))
bynames = as.character(bysuborig)
else
bynames = names(byval)
}
}
if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to vector or list of vectors (where 'list' includes data.table and data.frame which are lists, too)")
for (jj in seq_len(length(byval))) {
if (!typeof(byval[[jj]]) %chin% c("integer","logical","character","double")) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]")
}
tt = sapply(byval,length)
if (any(tt!=xnrow)) stop("The items in the 'by' or 'keyby' list are length (",paste(tt,collapse=","),"). Each must be same length as rows in x or number of rows returned by i (",xnrow,").")
if (is.null(bynames)) bynames = rep.int("",length(byval))
if (any(bynames=="")) {
if (length(bysubl)<2) stop("When 'by' or 'keyby' is list() we expect something inside the brackets")
for (jj in seq_along(bynames)) {
if (bynames[jj]=="") {
# Best guess. Use "month" in the case of by=month(date), use "a" in the case of by=a%%2
byvars = all.vars(bysubl[[jj+1L]], functions = TRUE)
if (length(byvars) == 1) tt = byvars
else {
tt = grep("^eval|^[^[:alpha:]. ]",byvars,invert=TRUE,value=TRUE)[1L]
if (!length(tt)) tt = all.vars(bysubl[[jj+1L]])[1L]
}
# fix for #497
if (length(byvars) > 1L && tt %in% all.vars(jsub, FALSE)) {
bynames[jj] = deparse(bysubl[[jj+1L]])
if (verbose)
cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt, "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="")
}
else bynames[jj] = tt
# if user doesn't like this inferred name, user has to use by=list() to name the column
}
}
# Fix for #1334
if (any(duplicated(bynames))) {
bynames = make.unique(bynames)
}
}
setattr(byval, "names", bynames) # byval is just a list not a data.table hence setattr not setnames
}
jvnames = NULL
if (is.name(jsub)) {
# j is a single unquoted column name
if (jsub!=".SD") {
jvnames = gsub("^[.](N|I|GRP|BY)$","\\1",as.character(jsub))
# jsub is list()ed after it's eval'd inside dogroups.
}
} else if (is.call(jsub) && as.character(jsub[[1L]]) %chin% c("list",".")) {
jsub[[1L]] = quote(list)
jsubl = as.list.default(jsub) # TO DO: names(jsub) and names(jsub)="" seem to work so make use of that
if (length(jsubl)>1) {
jvnames = names(jsubl)[-1L] # check list(a=sum(v),v)
if (is.null(jvnames)) jvnames = rep.int("", length(jsubl)-1L)
for (jj in seq.int(2L,length(jsubl))) {
if (jvnames[jj-1L] == "" && mode(jsubl[[jj]])=="name")
jvnames[jj-1L] = gsub("^[.](N|I|GRP|BY)$","\\1",deparse(jsubl[[jj]]))
# TO DO: if call to a[1] for example, then call it 'a' too
}
setattr(jsubl, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back aftwards on the result.
jsub = as.call(jsubl)
} # else empty list is needed for test 468: adding an empty list column