按组求和变量的一个好方法是
# Base R: sum the values of numericToBeSummedUp within each level of groups.
rowsum(x = numericToBeSummedUp, group = groups)
它来自 base R。在这里的测试中,只有 collapse::fsum 和 Rfast::group.sum 比它更快。
关于速度和内存消耗
# collapse: grouped sum of numericToBeSummedUp, grouped by groups.
collapse::fsum(x = numericToBeSummedUp, g = groups)
在给定的例子中表现最好;如果使用已分组的数据框,还可以进一步加速。
# Create a grouped data.frame, grouped by column g.
# Alternative (shorthand): GDF <- collapse::gby(DF, g)
GDF <- collapse::fgroup_by(DF, g)

# Calculate the sum per group on the pre-grouped data.
collapse::fsum(GDF)
这样得到的耗时,已接近于预先把数据集按组拆分成子数据集后再求和的耗时。
对不同方法的基准测试表明:对单列求和时,collapse::fsum 比 Rfast::group.sum 快约 2 倍,比 rowsum 快约 7 倍。紧随其后的是 tapply、data.table、by 和 dplyr。xtabs 和 aggregate 最慢。
对两列进行聚合时,collapse::fsum 仍然最快,比 Rfast::group.sum 快约 3 倍,比 rowsum 快约 5 倍。紧随其后的是 data.table、tapply、by 和 dplyr。xtabs 和 aggregate 仍然最慢。
基准
# Reproducible benchmark data: n rows, one factor grouping column g
# (sampled from the 26 lowercase letters) and two numeric columns x and y.
set.seed(42)
n <- 1e5
DF <- data.frame(
  g = as.factor(sample(letters, size = n, replace = TRUE)),
  x = rnorm(n),
  y = rnorm(n)
)
library(magrittr)
有些方法允许预先完成一些准备工作(例如设置键、预先分组或拆分),从而加快后续的聚合速度。
# data.table copy of DF, keyed on the grouping column g.
DT <- data.table::as.data.table(DF)
data.table::setkeyv(DT, "g")

# Pre-grouped data frames for collapse: all columns, and g + x only.
DFG <- collapse::fgroup_by(DF, g)
DFG1 <- collapse::fgroup_by(DF[c("g", "x")], g)

# Dataset optimized for this aggregation task: one numeric matrix per group.
# Building it consumes time as well, and that time is not part of the benchmark.
DFS <- lapply(split(DF[c("x", "y")], DF["g"]), as.matrix)
DFS1 <- lapply(split(DF["x"], DF["g"]), as.matrix)
对一列求和。
# Benchmark: per-group sum of the single column x.
# check = FALSE turns off bench's comparison of results across expressions
# (the candidates return different kinds of objects).
bench::mark(
  aggregate = aggregate(DF$x, DF["g"], sum),
  tapply = tapply(DF$x, DF$g, sum),
  dplyr = DF %>%
    dplyr::group_by(g) %>%
    dplyr::summarise(sum = sum(x)),
  # Conversion from data.frame to data.table is part of the timed expression.
  data.table = data.table::as.data.table(DF)[, sum(x), by = g],
  # Uses the already converted and keyed DT.
  data.table2 = DT[, sum(x), by = g],
  by = by(DF$x, DF$g, sum),
  xtabs = xtabs(x ~ g, DF),
  rowsum = rowsum(DF$x, DF$g),
  Rfast = Rfast::group.sum(DF$x, DF$g),
  # Operate on the pre-split per-group matrices in DFS1.
  `base Split` = lapply(DFS1, colSums),
  `base Split Rfast` = lapply(DFS1, Rfast::colsums),
  collapse = collapse::fsum(DF$x, DF$g),
  # Uses the pre-grouped data frame DFG1.
  collapse2 = collapse::fsum(DFG1),
  check = FALSE
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
# 1 aggregate 20.43ms 21.88ms 45.7 16.07MB 59.4 10 13
# 2 tapply 1.24ms 1.39ms 687. 1.53MB 30.1 228 10
# 3 dplyr 3.28ms 4.81ms 209. 2.42MB 13.1 96 6
# 4 data.table 1.59ms 2.47ms 410. 4.69MB 87.7 145 31
# 5 data.table2 1.52ms 1.93ms 514. 2.38MB 40.5 190 15
# 6 by 2.15ms 2.31ms 396. 2.29MB 26.7 148 10
# 7 xtabs 7.78ms 8.91ms 111. 10.54MB 50.0 31 14
# 8 rowsum 951.36µs 1.07ms 830. 1.15MB 24.1 378 11
# 9 Rfast 431.06µs 434.53µs 2268. 2.74KB 0 1134 0
#10 base Split 213.42µs 219.66µs 4342. 256B 12.4 2105 6
#11 base Split Rfast 76.88µs 81.48µs 10923. 65.05KB 16.7 5232 8
#12 collapse 121.03µs 122.92µs 7965. 256B 2.01 3961 1
#13 collapse2 85.97µs 88.67µs 10749. 256B 4.03 5328 2
对两列求和。
# Benchmark: per-group sums of the two columns x and y.
# check = FALSE turns off bench's comparison of results across expressions
# (the candidates return different kinds of objects).
bench::mark(
  aggregate = aggregate(DF[c("x", "y")], DF["g"], sum),
  tapply = list2DF(lapply(DF[c("x", "y")], tapply, list(DF$g), sum)),
  dplyr = DF %>%
    dplyr::group_by(g) %>%
    dplyr::summarise(x = sum(x), y = sum(y)),
  # Conversion from data.frame to data.table is part of the timed expression.
  data.table = data.table::as.data.table(DF)[, .(sum(x), sum(y)), by = g],
  # Uses the already converted and keyed DT.
  data.table2 = DT[, .(sum(x), sum(y)), by = g],
  by = lapply(DF[c("x", "y")], by, list(DF$g), sum),
  xtabs = xtabs(cbind(x, y) ~ g, DF),
  rowsum = rowsum(DF[c("x", "y")], DF$g),
  Rfast = list2DF(lapply(DF[c("x", "y")], Rfast::group.sum, DF$g)),
  # Operate on the pre-split per-group matrices in DFS.
  `base Split` = lapply(DFS, colSums),
  `base Split Rfast` = lapply(DFS, Rfast::colsums),
  collapse = collapse::fsum(DF[c("x", "y")], DF$g),
  # Uses the pre-grouped data frame DFG.
  collapse2 = collapse::fsum(DFG),
  check = FALSE
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
# 1 aggregate 25.87ms 26.36ms 37.7 20.89MB 132. 4 14
# 2 tapply 2.65ms 3.23ms 312. 3.06MB 22.5 97 7
# 3 dplyr 4.27ms 6.02ms 164. 3.19MB 13.3 74 6
# 4 data.table 2.33ms 3.19ms 309. 4.72MB 57.0 114 21
# 5 data.table2 2.22ms 2.81ms 355. 2.41MB 19.8 161 9
# 6 by 4.45ms 5.23ms 190. 4.59MB 22.5 59 7
# 7 xtabs 10.71ms 13.14ms 76.1 19.7MB 145. 11 21
# 8 rowsum 1.02ms 1.07ms 850. 1.15MB 23.8 393 11
# 9 Rfast 841.57µs 846.88µs 1150. 5.48KB 0 575 0
#10 base Split 360.24µs 368.28µs 2652. 256B 8.16 1300 4
#11 base Split Rfast 113.95µs 119.81µs 7540. 65.05KB 10.3 3661 5
#12 collapse 201.31µs 204.83µs 4724. 512B 2.01 2350 1
#13 collapse2 156.95µs 161.79µs 5408. 512B 2.02 2683 1