Skip to content

Commit 00c8ae2

Browse files
committed
draft of join questions
1 parent a58b010 commit 00c8ae2

File tree

1 file changed

+88
-14
lines changed

1 file changed

+88
-14
lines changed

datatable/join-datatable.R

+88-14
Original file line numberDiff line numberDiff line change
@@ -28,30 +28,104 @@ sapply(sapply(JN, nrow), print) -> nul
2828

2929
# progress marker printed before the join questions start
cat("joining...\n", sep="")
3030

31-
#inner, singlecol, integer, big-big
32-
#inner, singlecol, integer, big-medium
33-
#inner, singlecol, integer, big-small
31+
# q1: inner join of DT with JN$big on the integer key id4.
# The query is executed twice; every run logs elapsed time, memory usage and a
# checksum of the value columns through write.log().
question = "big inner join on unique int" # q1
# run 1 -- nomatch=NULL drops rows of JN$big without a match in DT (inner join)
t = system.time(print(dim(ans<-DT[JN$big, on="id4", nomatch=NULL])))[["elapsed"]]
m = memory_usage()
# checksum: v1 comes from DT, i.v1 is the same-named column of JN$big
chkt = system.time(chk<-ans[, .(sum(v1), sum(i.v1))])[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
rm(ans)
# run 2 -- identical query repeated
t = system.time(print(dim(ans<-DT[JN$big, on="id4", nomatch=NULL])))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-ans[, .(sum(v1), sum(i.v1))])[["elapsed"]]
# in_rows reports nrow(DT) like run 1 (was nrow(X), the table name used before the rename to DT)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
# show both ends of the result for a visual sanity check
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)
3444

35-
#outer, singlecol, integer, big-medium
45+
# q2: inner join of DT with JN$medium on the integer key id4.
# The query is executed twice; every run logs elapsed time, memory usage and a
# checksum of the value columns through write.log().
question = "medium inner join on unique int" # q2
# run 1 -- nomatch=NULL drops rows of JN$medium without a match in DT (inner join)
t = system.time(print(dim(ans<-DT[JN$medium, on="id4", nomatch=NULL])))[["elapsed"]]
m = memory_usage()
# checksum: v1 comes from DT, i.v1 is the same-named column of JN$medium
chkt = system.time(chk<-ans[, .(sum(v1), sum(i.v1))])[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
rm(ans)
# run 2 -- identical query repeated
t = system.time(print(dim(ans<-DT[JN$medium, on="id4", nomatch=NULL])))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-ans[, .(sum(v1), sum(i.v1))])[["elapsed"]]
# in_rows reports nrow(DT) like run 1 (was nrow(X), the table name used before the rename to DT)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
# show both ends of the result for a visual sanity check
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)
3658

37-
#inner, singlecol, factor, big-medium
59+
# q3: inner join of DT with JN$small on the integer key id4.
# The query is executed twice; every run logs elapsed time, memory usage and a
# checksum of the value columns through write.log().
question = "small inner join on unique int" # q3
# run 1 -- nomatch=NULL drops rows of JN$small without a match in DT (inner join)
t = system.time(print(dim(ans<-DT[JN$small, on="id4", nomatch=NULL])))[["elapsed"]]
m = memory_usage()
# checksum: v1 comes from DT, i.v1 is the same-named column of JN$small
chkt = system.time(chk<-ans[, .(sum(v1), sum(i.v1))])[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
rm(ans)
# run 2 -- identical query repeated
t = system.time(print(dim(ans<-DT[JN$small, on="id4", nomatch=NULL])))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-ans[, .(sum(v1), sum(i.v1))])[["elapsed"]]
# in_rows reports nrow(DT) like run 1 (was nrow(X), the table name used before the rename to DT)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
# show both ends of the result for a visual sanity check
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)
3872

39-
#inner, multicol, integer, big-medium
73+
# q4: outer join of DT with JN$medium on the integer key id4.
# The query is executed twice; every run logs elapsed time, memory usage and a
# checksum of the value columns through write.log().
question = "medium outer join on unique int" # q4
# run 1 -- no nomatch=NULL here: with the default nomatch every row of
# JN$medium is kept and unmatched rows get NA in the DT columns (outer join).
# The draft passed nomatch=NULL, which made this an inner join identical to q2.
# NOTE(review): this keeps all rows of JN$medium; if all rows of DT should be
# kept instead, swap the operands (JN$medium[DT, on="id4"]) -- confirm intent.
t = system.time(print(dim(ans<-DT[JN$medium, on="id4"])))[["elapsed"]]
m = memory_usage()
# v1 (from DT) is NA on unmatched rows, hence na.rm=TRUE to keep the checksum usable
chkt = system.time(chk<-ans[, .(sum(v1, na.rm=TRUE), sum(i.v1))])[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
rm(ans)
# run 2 -- identical query repeated
t = system.time(print(dim(ans<-DT[JN$medium, on="id4"])))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-ans[, .(sum(v1, na.rm=TRUE), sum(i.v1))])[["elapsed"]]
# in_rows reports nrow(DT) like run 1 (was nrow(X), the table name used before the rename to DT)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
# show both ends of the result for a visual sanity check
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)
86+
87+
# q5: outer join of DT with JN$medium, intended to use a factor key.
# The query is executed twice; every run logs elapsed time, memory usage and a
# checksum of the value columns through write.log().
# NOTE(review): the label says "factor" but the join key is still the integer
# column id4; q6 pairs id4 with id1 as "int factor", so id1 is presumably the
# factor key -- confirm it is unique in JN$medium before switching on= to it.
question = "medium outer join on unique factor" # q5
# run 1 -- no nomatch=NULL here: with the default nomatch every row of
# JN$medium is kept and unmatched rows get NA in the DT columns (outer join).
# The draft passed nomatch=NULL, which made this an inner join identical to q2.
t = system.time(print(dim(ans<-DT[JN$medium, on="id4"])))[["elapsed"]]
m = memory_usage()
# v1 (from DT) is NA on unmatched rows, hence na.rm=TRUE to keep the checksum usable
chkt = system.time(chk<-ans[, .(sum(v1, na.rm=TRUE), sum(i.v1))])[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
rm(ans)
# run 2 -- identical query repeated
t = system.time(print(dim(ans<-DT[JN$medium, on="id4"])))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-ans[, .(sum(v1, na.rm=TRUE), sum(i.v1))])[["elapsed"]]
# in_rows reports nrow(DT) like run 1 (was nrow(X), the table name used before the rename to DT)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
# show both ends of the result for a visual sanity check
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)
40100

41-
#inner, singlecol, integer, big-medium, update on join
101+
# q6: inner join of DT with JN$medium on two key columns, id4 and id1.
# The query is executed twice; every run logs elapsed time, memory usage and a
# checksum of the value columns through write.log().
question = "medium inner join on unique int factor" # q6
# run 1 -- nomatch=NULL drops rows of JN$medium without a match in DT (inner join)
t = system.time(print(dim(ans<-DT[JN$medium, on=c("id4","id1"), nomatch=NULL])))[["elapsed"]]
m = memory_usage()
# checksum: v1 comes from DT, i.v1 is the same-named column of JN$medium
chkt = system.time(chk<-ans[, .(sum(v1), sum(i.v1))])[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
rm(ans)
# run 2 -- identical query repeated
t = system.time(print(dim(ans<-DT[JN$medium, on=c("id4","id1"), nomatch=NULL])))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-ans[, .(sum(v1), sum(i.v1))])[["elapsed"]]
# in_rows reports nrow(DT) like run 1 (was nrow(X), the table name used before the rename to DT)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
# show both ends of the result for a visual sanity check
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)
42114

43-
question = "big inner join on unique int" # q1
44-
t = system.time(print(dim(ans<-DT[JN$big, on="id1", nomatch=NULL])))[["elapsed"]]
115+
# q7: update on join -- add column v2 to DT, filled with v1 of the matching
# JN$medium row (key id4). The query is executed twice; every run logs elapsed
# time, memory usage and a checksum through write.log().
question = "medium update on join on unique int" # q7
# run 1 -- := modifies DT by reference, so ans refers to the same table as DT
t = system.time(print(dim(ans<-DT[JN$medium, v2:=i.v1, on="id4"])))[["elapsed"]]
m = memory_usage()
# NOTE(review): rows of DT without a match in JN$medium keep NA in v2, so
# sum(v2) may be NA -- confirm whether na.rm=TRUE is wanted here.
chkt = system.time(chk<-ans[, .(sum(v1), sum(v2))])[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
rm(ans)
# drop the added column so run 2 starts from the same DT as run 1
DT[, v2:=NULL]
# run 2 -- identical query repeated
t = system.time(print(dim(ans<-DT[JN$medium, v2:=i.v1, on="id4"])))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-ans[, .(sum(v1), sum(v2))])[["elapsed"]]
# in_rows reports nrow(DT) like run 1 (was nrow(X), the table name used before the rename to DT)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(DT), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
# show both ends of the result for a visual sanity check
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)
# restore DT to its original columns
DT[, v2:=NULL]
56130

57-
if( !interactive() ) q("no", status=0)
131+
# when run as a script (not interactively), exit without saving the workspace
if (!interactive()) {
  q(save="no", status=0)
}

0 commit comments

Comments
 (0)