# Do's and Don'ts
NB: Timings updated with v1.8.2 (which improves the 3rd case dramatically).
```r
n = 10000
DT = data.table(grp=1:n, name=as.character(as.hexmode(1:n)), x=rnorm(10*n), y=rnorm(10*n))
setkey(DT, grp)

system.time(ans1 <- DT[, lapply(.SD[, list(x,y)], sum), by=grp])
#    user  system elapsed
#  31.130   0.088  31.288    # bad

system.time(ans2 <- DT[, lapply(list(x,y), sum), by=grp])
#    user  system elapsed
#   0.284   0.004   0.291    # good

setnames(ans2, names(ans1))
identical(ans1, ans2)
# [1] TRUE

system.time(ans3 <- DT[, lapply(.SD, sum), by=grp, .SDcols=c("x","y")])
#    user  system elapsed
#   0.080   0.004   0.085    # even better (prior to v1.8.2 this was slower and not recommended; no longer)

identical(ans1, ans3)
# [1] TRUE

tables()
#      NAME    NROW MB COLS         KEY
# [1,] ans1  10,000  1 grp,x,y      grp
# [2,] ans2  10,000  1 grp,x,y      grp
# [3,] ans3  10,000  1 grp,x,y      grp
# [4,] DT   100,000  3 grp,name,x,y grp
# Total: 6MB
```
NB: The motivation for `lapply` through a subset of `.SD` is when there are non-group column(s) in `DT` (such as `name` above). Otherwise you could just do `DT[,lapply(.SD,sum),by=grp]`. Subsetting `.SD` (as in `ans1`) is so much slower because a new copy of `.SD` for those columns is created for each and every group.
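
To see why the subset is necessary at all here, a minimal sketch (not from the original timings): with the character column `name` still present in `DT`, summing every column of `.SD` simply errors.

```r
# With the non-group character column 'name' in .SD, sum() has nothing sensible to do:
DT[, lapply(.SD, sum), by=grp]
# Error: invalid 'type' (character) of argument
```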
As from v1.8.2, `:=` by group has been implemented.
```r
library(data.table)
n = 10000
DT = data.table(grp=1:n, x=rnorm(10*n), y=rnorm(10*n))
setkey(DT, grp)

system.time(ans1 <- DT[, transform(.SD, x2=x/sum(x), y2=y/sum(y)), by=grp])
#    user  system elapsed
#    5.46    0.00    5.48    # slow
head(ans1, 3)
#    grp          x           y        x2         y2
# 1:   1 -0.5848814 -0.41560829 0.6241268  0.5695575
# 2:   1 -0.6314059 -0.49076645 0.6737731  0.6725557
# 3:   1 -1.7694071  0.08860505 1.8881340 -0.1214260

system.time(tt <- DT[, list(x2=x/sum(x), y2=y/sum(y)), by=grp])
#    user  system elapsed
#    0.02    0.00    0.02    # 274 times faster!!!
head(tt, 3)
#    grp        x2         y2
# 1:   1 0.6241268  0.5695575
# 2:   1 0.6737731  0.6725557
# 3:   1 1.8881340 -0.1214260

system.time(ans2 <- cbind(DT, tt[, list(x2,y2)]))
#    user  system elapsed
#    0.05    0.00    0.05    # very fast to add afterwards in bulk
head(ans2, 3)
#    grp          x           y        x2         y2
# 1:   1 -0.5848814 -0.41560829 0.6241268  0.5695575
# 2:   1 -0.6314059 -0.49076645 0.6737731  0.6725557
# 3:   1 -1.7694071  0.08860505 1.8881340 -0.1214260
setkey(ans2, grp)
identical(ans1, ans2)
# [1] TRUE

system.time(DT[, c('x2','y2') := list(x/sum(x), y/sum(y)), by=grp])
#    user  system elapsed
#    0.07    0.00    0.07    # equivalent to the cbind-afterwards approach, but more memory efficient
# now DT has been updated
identical(ans1, DT)
# [1] TRUE

# remove the new columns to show a different approach
DT[, c('x2','y2') := NULL]
system.time(DT[, `:=`(x2 = x/sum(x), y2 = y/sum(y)), by=grp])
#    user  system elapsed
#    0.04    0.00    0.05    # slightly faster
identical(ans1, DT)
# [1] TRUE
```
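
One point worth making explicit is that `:=` assigns by reference. A minimal sketch, assuming a `data.table` version that exports `address()`, showing that `DT` itself is never copied, unlike the `cbind()` approach that built `ans2`:

```r
DT[, c('x2','y2') := NULL]   # reset to the original columns
a <- address(DT)             # address() is exported by data.table
DT[, c('x2','y2') := list(x/sum(x), y/sum(y)), by=grp]
identical(a, address(DT))
# [1] TRUE                   # same object in RAM: updated in place, no copy made
```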
When called repeatedly, it may be surprising to learn that `mean()` is inefficient compared to `sum()`. `mean()` is an S3 generic, which means it takes time to dispatch to one of `methods(mean)`, usually `mean.default()`. Also, `mean.default()` contains a few lines of R before finally calling `.Internal(mean(x))`. `sum()`, on the other hand, is a primitive function. For this reason `data.table` will optimize calls to `mean()` automatically.
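
The dispatch overhead is easy to see outside `data.table` too; a minimal sketch (timings are machine-dependent and not from the original page):

```r
x <- rnorm(100)
system.time(for (i in 1:1e6) mean(x))           # S3 dispatch plus R-level checks on every call
system.time(for (i in 1:1e6) sum(x)/length(x))  # primitives only: several times faster
```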
TO DO: Revision needed here for v1.9.2.
```r
library(data.table)
n <- 100000
DT <- data.table(grp1=sample(1:750, n, replace=TRUE),
                 grp2=sample(1:750, n, replace=TRUE),
                 x=rnorm(n),
                 y=rnorm(n))

system.time(ans1 <- DT[, list(mean(x),mean(y)), by=list(grp1,grp2), verbose=TRUE])
# Detected that j uses these columns: x,y
# Optimized j from 'list(mean(x), mean(y))' to 'list(.External(Cfastmean, x, FALSE), .External(Cfastmean, y, FALSE))'
# Starting dogroups ... done dogroups in 0.2 secs
#    user  system elapsed
#    0.25    0.00    0.25

# without optimization
options(datatable.optimize = 0)
system.time(ans2 <- DT[, list(mean(x),mean(y)), by=list(grp1,grp2), verbose=TRUE])
# Finding groups (bysameorder=FALSE) ... done in 0.04secs. bysameorder=FALSE and o__ is length 100000
# Detected that j uses these columns: x,y
# Starting dogroups ... done dogroups in 1.61 secs
#    user  system elapsed
#    1.64    0.00    1.65    # the optimized version is 6.6 times faster

identical(ans1, ans2)
# [1] TRUE
```
`median()` is another S3 generic with overhead that can be sped up a lot using a similar technique. However, `sum()`, `length()`, `var()` and many other functions don't suffer the same performance issue:
```r
system.time(DT[, list(sum(x),sum(y)), by=list(grp1,grp2)])
#    user  system elapsed
#    0.19    0.00    0.19    # not much slower than the optimized mean
```
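
For `median()` there is no fast C routine shipped alongside `Cfastmean` (an assumption based on this page's vintage), but skipping the S3 dispatch step already helps when there are many small groups. A minimal sketch; `median.default()` is the method that dispatch would select here anyway:

```r
system.time(DT[, list(median(x)), by=list(grp1,grp2)])          # S3 dispatch once per group
system.time(DT[, list(median.default(x)), by=list(grp1,grp2)])  # skips dispatch; the remaining R-level
                                                                # work in median.default still dominates
```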
Don't construct the result with `cbind()`, `data.table()` or `data.frame()` inside `j`; return a `list()`:

```r
n = 10000
DT = data.table(grp=1:n, x=rnorm(10*n), y=rnorm(10*n))
setkey(DT, grp)

system.time(ans1 <- DT[, as.list(cbind(sum(x),sum(y))), by=grp])
#    user  system elapsed
#   1.472   0.016   1.494    # bad
system.time(ans2 <- DT[, data.table(sum(x),sum(y)), by=grp])
#    user  system elapsed
#  18.494   0.308  18.914    # worse
system.time(ans3 <- DT[, data.frame(sum(x),sum(y)), by=grp])
#    user  system elapsed
#  40.502   1.136  41.807    # terrible
colnames(ans3) = colnames(ans1)
system.time(ans4 <- DT[, list(sum(x),sum(y)), by=grp])
#    user  system elapsed
#   0.176   0.004   0.181    # best

identical(ans1, ans2)
# [1] TRUE
identical(ans1, ans3)
# [1] TRUE
identical(ans1, ans4)
# [1] TRUE
```
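
Much of the per-group cost above is the construction of the result object itself, which is visible even outside `data.table`; a minimal sketch (timings machine-dependent):

```r
system.time(for (i in 1:10000) list(1, 2))        # cheap: no name fixing, no checks
system.time(for (i in 1:10000) data.frame(1, 2))  # expensive: name mangling, row names,
                                                  # and validity checks on every call
```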
Similarly, prefer `lapply(.SD, sum)` over `as.list(colSums(.SD))`:

```r
n = 10000
DT = data.table(grp=1:n, x=rnorm(10*n), y=rnorm(10*n))
setkey(DT, grp)

system.time(ans1 <- DT[, as.list(colSums(.SD)), by=grp])
#    user  system elapsed
#   1.296   0.000   1.298    # bad
system.time(ans2 <- DT[, lapply(.SD, sum), by=grp])
#    user  system elapsed
#   0.020   0.000   0.021    # best
identical(ans1, ans2)
# [1] TRUE
system.time(ans3 <- DT[, list(x=sum(x), y=sum(y)), by=grp])
#    user  system elapsed
#   0.020   0.000   0.022    # similar speed, but longer to write
identical(ans1, ans3)
# [1] TRUE
```
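
The slowdown from `colSums()` is mostly coercion: base R's `colSums()` converts a data.frame (and hence each group's `.SD`) to a matrix with `as.matrix()` before summing, so with `by=grp` that conversion happens once per group. A minimal sketch of what every group pays for:

```r
# Roughly what colSums(.SD) does internally for each group:
m <- as.matrix(DT[grp == 1L, list(x, y)])  # coerce the group's columns to a matrix
colSums(m)                                 # then sum the matrix columns in C
```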