我有一个有两列的数据帧。第一列包含类别,如“第一”,“第二”,“第三”,第二列有数字,表示我从“类别”中看到特定组的次数。

例如:

Category     Frequency
First        10
First        15
First        5
Second       2
Third        14
Third        20
Second       3

我想按类别对数据进行排序,并将所有频率相加:

Category     Frequency
First        30
Second       5
Third        34

在R中怎么做呢?


当前回答

按组求和变量的一个好方法是

rowsum(numericToBeSummedUp, groups)

从基地。这里只有collapse::fsum和Rfast::group。求和更快。

关于速度和内存消耗

collapse::fsum(numericToBeSummedUp, groups)

在给定的例子中是最好的,当使用分组数据帧时可以加速。

GDF <- collapse::fgroup_by(DF, g) #Create a grouped data.frame with group g
#GDF <- collapse::gby(DF, g)      #Alternative

collapse::fsum(GDF)               #Calculate sum per group

这接近于将数据集分成每组子数据集的时间。

对不同方法的基准测试表明::fsum对单列折叠求和的速度比Rfast::group快两倍。Sum,比rowsum快7倍。紧随其后的是tapply, data。表,由和dplyr。Xtabs和aggregate是最慢的。

聚合两个列折叠::fsum仍然是最快的,比Rfast::group快3倍。求和,比rowsum快5倍。紧随其后的是数据。Table, tapply, by和dplyr。xtabs和aggregate是最慢的。


基准

set.seed(42)
n <- 1e5
DF <- data.frame(g = as.factor(sample(letters, n, TRUE))
              , x = rnorm(n), y = rnorm(n) )

library(magrittr)

有些方法允许执行有助于加快聚合速度的任务。

DT <- data.table::as.data.table(DF)
data.table::setkey(DT, g)

DFG <- collapse::gby(DF, g)
DFG1 <- collapse::gby(DF[c("g", "x")], g)

# Optimized dataset for this aggregation task
# This will also consume time!
DFS <- lapply(split(DF[c("x", "y")], DF["g"]), as.matrix)
DFS1 <- lapply(split(DF["x"], DF["g"]), as.matrix)

对一列求和。

bench::mark(check = FALSE
          , "aggregate" = aggregate(DF$x, DF["g"], sum)
          , "tapply" = tapply(DF$x, DF$g, sum)
          , "dplyr" = DF %>% dplyr::group_by(g) %>% dplyr::summarise(sum = sum(x))
          , "data.table" = data.table::as.data.table(DF)[, sum(x), by = g]
          , "data.table2" = DT[, sum(x), by = g]
          , "by" = by(DF$x, DF$g, sum)
          , "xtabs" = xtabs(x ~ g, DF)
          , "rowsum" = rowsum(DF$x, DF$g)
          , "Rfast" = Rfast::group.sum(DF$x, DF$g)
          , "base Split" = lapply(DFS1, colSums)
          , "base Split Rfast" = lapply(DFS1, Rfast::colsums)
          , "collapse"  = collapse::fsum(DF$x, DF$g)
          , "collapse2"  = collapse::fsum(DFG1)
)
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
# 1 aggregate         20.43ms  21.88ms      45.7   16.07MB    59.4     10    13
# 2 tapply             1.24ms   1.39ms     687.     1.53MB    30.1    228    10
# 3 dplyr              3.28ms   4.81ms     209.     2.42MB    13.1     96     6
# 4 data.table         1.59ms   2.47ms     410.     4.69MB    87.7    145    31
# 5 data.table2        1.52ms   1.93ms     514.     2.38MB    40.5    190    15
# 6 by                 2.15ms   2.31ms     396.     2.29MB    26.7    148    10
# 7 xtabs              7.78ms   8.91ms     111.    10.54MB    50.0     31    14
# 8 rowsum           951.36µs   1.07ms     830.     1.15MB    24.1    378    11
# 9 Rfast            431.06µs 434.53µs    2268.     2.74KB     0     1134     0
#10 base Split       213.42µs 219.66µs    4342.       256B    12.4   2105     6
#11 base Split Rfast  76.88µs  81.48µs   10923.    65.05KB    16.7   5232     8
#12 collapse         121.03µs 122.92µs    7965.       256B     2.01  3961     1
#13 collapse2         85.97µs  88.67µs   10749.       256B     4.03  5328     2

两列相加

bench::mark(check = FALSE
          , "aggregate" = aggregate(DF[c("x", "y")], DF["g"], sum)
          , "tapply" = list2DF(lapply(DF[c("x", "y")], tapply, list(DF$g), sum))
          , "dplyr" = DF %>% dplyr::group_by(g) %>% dplyr::summarise(x = sum(x), y = sum(y))
          , "data.table" = data.table::as.data.table(DF)[,.(sum(x),sum(y)), by = g]
          , "data.table2" = DT[,.(sum(x),sum(y)), by = g]
          , "by" = lapply(DF[c("x", "y")], by, list(DF$g), sum)
          , "xtabs" = xtabs(cbind(x, y) ~ g, DF)
          , "rowsum" = rowsum(DF[c("x", "y")], DF$g)
          , "Rfast" = list2DF(lapply(DF[c("x", "y")], Rfast::group.sum, DF$g))
          , "base Split" = lapply(DFS, colSums)
          , "base Split Rfast" = lapply(DFS, Rfast::colsums)
          , "collapse" = collapse::fsum(DF[c("x", "y")], DF$g)
          , "collapse2" = collapse::fsum(DFG)
            )
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
# 1 aggregate         25.87ms  26.36ms      37.7   20.89MB   132.       4    14
# 2 tapply             2.65ms   3.23ms     312.     3.06MB    22.5     97     7
# 3 dplyr              4.27ms   6.02ms     164.     3.19MB    13.3     74     6
# 4 data.table         2.33ms   3.19ms     309.     4.72MB    57.0    114    21
# 5 data.table2        2.22ms   2.81ms     355.     2.41MB    19.8    161     9
# 6 by                 4.45ms   5.23ms     190.     4.59MB    22.5     59     7
# 7 xtabs             10.71ms  13.14ms      76.1    19.7MB   145.      11    21
# 8 rowsum             1.02ms   1.07ms     850.     1.15MB    23.8    393    11
# 9 Rfast            841.57µs 846.88µs    1150.     5.48KB     0      575     0
#10 base Split       360.24µs 368.28µs    2652.       256B     8.16  1300     4
#11 base Split Rfast 113.95µs 119.81µs    7540.    65.05KB    10.3   3661     5
#12 collapse         201.31µs 204.83µs    4724.       512B     2.01  2350     1
#13 collapse2        156.95µs 161.79µs    5408.       512B     2.02  2683     1

其他回答

按组求和变量的一个好方法是

rowsum(numericToBeSummedUp, groups)

从基地。这里只有collapse::fsum和Rfast::group。求和更快。

关于速度和内存消耗

collapse::fsum(numericToBeSummedUp, groups)

在给定的例子中是最好的,当使用分组数据帧时可以加速。

GDF <- collapse::fgroup_by(DF, g) #Create a grouped data.frame with group g
#GDF <- collapse::gby(DF, g)      #Alternative

collapse::fsum(GDF)               #Calculate sum per group

这接近于将数据集分成每组子数据集的时间。

对不同方法的基准测试表明::fsum对单列折叠求和的速度比Rfast::group快两倍。Sum,比rowsum快7倍。紧随其后的是tapply, data。表,由和dplyr。Xtabs和aggregate是最慢的。

聚合两个列折叠::fsum仍然是最快的,比Rfast::group快3倍。求和,比rowsum快5倍。紧随其后的是数据。Table, tapply, by和dplyr。xtabs和aggregate是最慢的。


基准

set.seed(42)
n <- 1e5
DF <- data.frame(g = as.factor(sample(letters, n, TRUE))
              , x = rnorm(n), y = rnorm(n) )

library(magrittr)

有些方法允许执行有助于加快聚合速度的任务。

DT <- data.table::as.data.table(DF)
data.table::setkey(DT, g)

DFG <- collapse::gby(DF, g)
DFG1 <- collapse::gby(DF[c("g", "x")], g)

# Optimized dataset for this aggregation task
# This will also consume time!
DFS <- lapply(split(DF[c("x", "y")], DF["g"]), as.matrix)
DFS1 <- lapply(split(DF["x"], DF["g"]), as.matrix)

对一列求和。

bench::mark(check = FALSE
          , "aggregate" = aggregate(DF$x, DF["g"], sum)
          , "tapply" = tapply(DF$x, DF$g, sum)
          , "dplyr" = DF %>% dplyr::group_by(g) %>% dplyr::summarise(sum = sum(x))
          , "data.table" = data.table::as.data.table(DF)[, sum(x), by = g]
          , "data.table2" = DT[, sum(x), by = g]
          , "by" = by(DF$x, DF$g, sum)
          , "xtabs" = xtabs(x ~ g, DF)
          , "rowsum" = rowsum(DF$x, DF$g)
          , "Rfast" = Rfast::group.sum(DF$x, DF$g)
          , "base Split" = lapply(DFS1, colSums)
          , "base Split Rfast" = lapply(DFS1, Rfast::colsums)
          , "collapse"  = collapse::fsum(DF$x, DF$g)
          , "collapse2"  = collapse::fsum(DFG1)
)
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
# 1 aggregate         20.43ms  21.88ms      45.7   16.07MB    59.4     10    13
# 2 tapply             1.24ms   1.39ms     687.     1.53MB    30.1    228    10
# 3 dplyr              3.28ms   4.81ms     209.     2.42MB    13.1     96     6
# 4 data.table         1.59ms   2.47ms     410.     4.69MB    87.7    145    31
# 5 data.table2        1.52ms   1.93ms     514.     2.38MB    40.5    190    15
# 6 by                 2.15ms   2.31ms     396.     2.29MB    26.7    148    10
# 7 xtabs              7.78ms   8.91ms     111.    10.54MB    50.0     31    14
# 8 rowsum           951.36µs   1.07ms     830.     1.15MB    24.1    378    11
# 9 Rfast            431.06µs 434.53µs    2268.     2.74KB     0     1134     0
#10 base Split       213.42µs 219.66µs    4342.       256B    12.4   2105     6
#11 base Split Rfast  76.88µs  81.48µs   10923.    65.05KB    16.7   5232     8
#12 collapse         121.03µs 122.92µs    7965.       256B     2.01  3961     1
#13 collapse2         85.97µs  88.67µs   10749.       256B     4.03  5328     2

两列相加

bench::mark(check = FALSE
          , "aggregate" = aggregate(DF[c("x", "y")], DF["g"], sum)
          , "tapply" = list2DF(lapply(DF[c("x", "y")], tapply, list(DF$g), sum))
          , "dplyr" = DF %>% dplyr::group_by(g) %>% dplyr::summarise(x = sum(x), y = sum(y))
          , "data.table" = data.table::as.data.table(DF)[,.(sum(x),sum(y)), by = g]
          , "data.table2" = DT[,.(sum(x),sum(y)), by = g]
          , "by" = lapply(DF[c("x", "y")], by, list(DF$g), sum)
          , "xtabs" = xtabs(cbind(x, y) ~ g, DF)
          , "rowsum" = rowsum(DF[c("x", "y")], DF$g)
          , "Rfast" = list2DF(lapply(DF[c("x", "y")], Rfast::group.sum, DF$g))
          , "base Split" = lapply(DFS, colSums)
          , "base Split Rfast" = lapply(DFS, Rfast::colsums)
          , "collapse" = collapse::fsum(DF[c("x", "y")], DF$g)
          , "collapse2" = collapse::fsum(DFG)
            )
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
# 1 aggregate         25.87ms  26.36ms      37.7   20.89MB   132.       4    14
# 2 tapply             2.65ms   3.23ms     312.     3.06MB    22.5     97     7
# 3 dplyr              4.27ms   6.02ms     164.     3.19MB    13.3     74     6
# 4 data.table         2.33ms   3.19ms     309.     4.72MB    57.0    114    21
# 5 data.table2        2.22ms   2.81ms     355.     2.41MB    19.8    161     9
# 6 by                 4.45ms   5.23ms     190.     4.59MB    22.5     59     7
# 7 xtabs             10.71ms  13.14ms      76.1    19.7MB   145.      11    21
# 8 rowsum             1.02ms   1.07ms     850.     1.15MB    23.8    393    11
# 9 Rfast            841.57µs 846.88µs    1150.     5.48KB     0      575     0
#10 base Split       360.24µs 368.28µs    2652.       256B     8.16  1300     4
#11 base Split Rfast 113.95µs 119.81µs    7540.    65.05KB    10.3   3661     5
#12 collapse         201.31µs 204.83µs    4724.       512B     2.01  2350     1
#13 collapse2        156.95µs 161.79µs    5408.       512B     2.02  2683     1

当你需要在不同的列上应用不同的聚合函数(并且你必须/想要坚持以R为基底)时,我发现它非常有用(并且有效):

e.g.

假设输入如下:

DF <-                
data.frame(Categ1=factor(c('A','A','B','B','A','B','A')),
           Categ2=factor(c('X','Y','X','X','X','Y','Y')),
           Samples=c(1,2,4,3,5,6,7),
           Freq=c(10,30,45,55,80,65,50))

> DF
  Categ1 Categ2 Samples Freq
1      A      X       1   10
2      A      Y       2   30
3      B      X       4   45
4      B      X       3   55
5      A      X       5   80
6      B      Y       6   65
7      A      Y       7   50

我们要按1类和2类进行分组,并计算Freq的样本和均值。 下面是使用ave的一个可能的解决方案:

# create a copy of DF (only the grouping columns)
DF2 <- DF[,c('Categ1','Categ2')]

# add sum of Samples by Categ1,Categ2 to DF2 
# (ave repeats the sum of the group for each row in the same group)
DF2$GroupTotSamples <- ave(DF$Samples,DF2,FUN=sum)

# add mean of Freq by Categ1,Categ2 to DF2 
# (ave repeats the mean of the group for each row in the same group)
DF2$GroupAvgFreq <- ave(DF$Freq,DF2,FUN=mean)

# remove the duplicates (keep only one row for each group)
DF2 <- DF2[!duplicated(DF2),]

结果:

> DF2
  Categ1 Categ2 GroupTotSamples GroupAvgFreq
1      A      X               6           45
2      A      Y               9           40
3      B      X               7           50
6      B      Y               6           65

如果x是一个包含你的数据的数据框架,那么下面的代码将完成你想要的:

require(reshape)
recast(x, Category ~ ., fun.aggregate=sum)

另一种解决方案是在矩阵或数据帧中按组返回和,并且简短快速:

rowsum(x$Frequency, x$Category)

使用聚合:

aggregate(x$Frequency, by=list(Category=x$Category), FUN=sum)
  Category  x
1    First 30
2   Second  5
3    Third 34

在上面的例子中,可以在列表中指定多个维度。相同数据类型的多个聚合指标可以通过cbind合并:

aggregate(cbind(x$Frequency, x$Metric2, x$Metric3) ...

(嵌入@thelatemail评论),聚合也有一个公式界面

aggregate(Frequency ~ Category, x, sum)

或者,如果希望聚合多个列,可以使用。符号(也适用于一列)

aggregate(. ~ Category, x, sum)

或tapply:

tapply(x$Frequency, x$Category, FUN=sum)
 First Second  Third 
    30      5     34 

使用这些数据:

x <- data.frame(Category=factor(c("First", "First", "First", "Second",
                                      "Third", "Third", "Second")), 
                    Frequency=c(10,15,5,2,14,20,3))