我有一个数据框架,我需要计算每组(即每月,如下)的平均值。

Name     Month  Rate1     Rate2
Aira       1      12        23
Aira       2      18        73
Aira       3      19        45
Ben        1      53        19
Ben        2      22        87
Ben        3      19        45
Cat        1      22        87
Cat        2      67        43
Cat        3      45        32

我想要的输出如下所示,其中Rate1和Rate2的值是组均值。请忽略这个值,我已经为这个例子编好了。

Name       Rate1       Rate2
Aira        23.21       12.2
Ben         45.23       43.9
Cat         33.22       32.2

当前回答

在base R中有多种方法可以做到这一点,包括另一种聚合方法。下面的示例返回的是每月,我认为这是您所要求的。不过,同样的方法也可以用于返回人均均值:

使用大街:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')

Rate1.mean <- with(my.data, ave(Rate1, Month, FUN = function(x) mean(x, na.rm = TRUE)))
Rate2.mean <- with(my.data, ave(Rate2, Month, FUN = function(x) mean(x, na.rm = TRUE)))

my.data <- data.frame(my.data, Rate1.mean, Rate2.mean)
my.data

使用的:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')

by.month <- as.data.frame(do.call("rbind", by(my.data, my.data$Month, FUN = function(x) colMeans(x[,3:4]))))
colnames(by.month) <- c('Rate1.mean', 'Rate2.mean')
by.month <- cbind(Month = rownames(by.month), by.month)

my.data <- merge(my.data, by.month, by = 'Month')
my.data

使用lapply和split:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')

ly.mean <- lapply(split(my.data, my.data$Month), function(x) c(Mean = colMeans(x[,3:4])))
ly.mean <- as.data.frame(do.call("rbind", ly.mean))
ly.mean <- cbind(Month = rownames(ly.mean), ly.mean)

my.data <- merge(my.data, ly.mean, by = 'Month')
my.data

使用sapply和split:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
my.data

sy.mean <- t(sapply(split(my.data, my.data$Month), function(x) colMeans(x[,3:4])))
colnames(sy.mean) <- c('Rate1.mean', 'Rate2.mean')
sy.mean <- data.frame(Month = rownames(sy.mean), sy.mean, stringsAsFactors = FALSE)
my.data <- merge(my.data, sy.mean, by = 'Month')
my.data

使用聚合:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')

my.summary <- with(my.data, aggregate(list(Rate1, Rate2), by = list(Month), 
                   FUN = function(x) { mon.mean = mean(x, na.rm = TRUE) } ))

my.summary <- do.call(data.frame, my.summary)
colnames(my.summary) <- c('Month', 'Rate1.mean', 'Rate2.mean')
my.summary

my.data <- merge(my.data, my.summary, by = 'Month')
my.data

编辑时间:2020年6月28日

在这里,我使用聚合来通过组获得整个矩阵的列均值,其中组定义在外部向量中:

my.group <- c(1,2,1,2,2,3,1,2,3,3)

my.data <- matrix(c(   1,    2,    3,    4,    5,
                      10,   20,   30,   40,   50,
                       2,    4,    6,    8,   10,
                      20,   30,   40,   50,   60,
                      20,   18,   16,   14,   12,
                    1000, 1100, 1200, 1300, 1400,
                       2,    3,    4,    3,    2,
                      50,   40,   30,   20,   10,
                    1001, 2001, 3001, 4001, 5001,
                    1000, 2000, 3000, 4000, 5000), nrow = 10, ncol = 5, byrow = TRUE)
my.data

my.summary <- aggregate(list(my.data), by = list(my.group), FUN = function(x) { my.mean = mean(x, na.rm = TRUE) } )
my.summary
#  Group.1          X1       X2          X3       X4          X5
#1       1    1.666667    3.000    4.333333    5.000    5.666667
#2       2   25.000000   27.000   29.000000   31.000   33.000000
#3       3 1000.333333 1700.333 2400.333333 3100.333 3800.333333

其他回答

你也可以使用sqldf包来完成,如下所示:

library(sqldf)

x <- read.table(text='Name     Month  Rate1     Rate2
Aira       1      12        23
                Aira       2      18        73
                Aira       3      19        45
                Ben        1      53        19
                Ben        2      22        87
                Ben        3      19        45
                Cat        1      22        87
                Cat        2      67        43
                Cat        3      45        32', header=TRUE)

sqldf("
select 
  Name
  ,avg(Rate1) as Rate1_float
  ,avg(Rate2) as Rate2_float
  ,avg(Rate1) as Rate1
  ,avg(Rate2) as Rate2
from x
group by 
  Name
")

#  Name Rate1_float Rate2_float Rate1 Rate2
#1 Aira    16.33333    47.00000    16    47
#2  Ben    31.33333    50.33333    31    50
#3  Cat    44.66667    54.00000    44    54

正如其他答案所示,我最近转向了dplyr,但sqldf很好,因为大多数数据分析师/数据科学家/开发人员至少在SQL方面有一定的流畅性。通过这种方式,我认为它比dplyr或上面介绍的其他解决方案更易于编写具有普遍可读性的代码。

更新:在回复下面的注释时,我尝试按上面所示更新代码。然而,这种行为并不像我预期的那样。似乎只有当列别名与原始列名匹配时,列定义(即int vs float)才会继续执行。指定新名称时,将不舍入地返回聚合列。

这种类型的操作正是设计聚合的目的:

d <- read.table(text=
'Name     Month  Rate1     Rate2
Aira       1      12        23
Aira       2      18        73
Aira       3      19        45
Ben        1      53        19
Ben        2      22        87
Ben        3      19        45
Cat        1      22        87
Cat        2      67        43
Cat        3      45        32', header=TRUE)

aggregate(d[, 3:4], list(d$Name), mean)

  Group.1    Rate1    Rate2
1    Aira 16.33333 47.00000
2     Ben 31.33333 50.33333
3     Cat 44.66667 54.00000

这里我们聚合data.frame d的第3列和第4列,按d$Name分组,并应用平均值函数。


或者,使用公式接口:

aggregate(. ~ Name, d[-2], mean)

或者使用dplyr包中的group_by & summarise_at:

library(dplyr)

d %>%
  group_by(Name) %>%
  summarise_at(vars(-Month), funs(mean(., na.rm=TRUE)))

# A tibble: 3 x 3
  Name  Rate1 Rate2
  <fct> <dbl> <dbl>
1 Aira   16.3  47.0
2 Ben    31.3  50.3
3 Cat    44.7  54.0

参见?summarise_at,了解指定要执行的变量的多种方法。这里,vars(-Month)表示除Month之外的所有变量。


在tidyverse/dplyr的最新版本中,使用summarise(across(…))比summarise_at更可取:

d %>% 
  group_by(Name) %>%
  summarise(across(-Month, mean, na.rm = TRUE))

你也可以使用package plyr,它在某种程度上更通用:

library(plyr)

ddply(d, .(Name), summarize,  Rate1=mean(Rate1), Rate2=mean(Rate2))

  Name    Rate1    Rate2
1 Aira 16.33333 47.00000
2  Ben 31.33333 50.33333
3  Cat 44.66667 54.00000

在base R中有多种方法可以做到这一点,包括另一种聚合方法。下面的示例返回的是每月,我认为这是您所要求的。不过,同样的方法也可以用于返回人均均值:

使用大街:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')

Rate1.mean <- with(my.data, ave(Rate1, Month, FUN = function(x) mean(x, na.rm = TRUE)))
Rate2.mean <- with(my.data, ave(Rate2, Month, FUN = function(x) mean(x, na.rm = TRUE)))

my.data <- data.frame(my.data, Rate1.mean, Rate2.mean)
my.data

使用的:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')

by.month <- as.data.frame(do.call("rbind", by(my.data, my.data$Month, FUN = function(x) colMeans(x[,3:4]))))
colnames(by.month) <- c('Rate1.mean', 'Rate2.mean')
by.month <- cbind(Month = rownames(by.month), by.month)

my.data <- merge(my.data, by.month, by = 'Month')
my.data

使用lapply和split:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')

ly.mean <- lapply(split(my.data, my.data$Month), function(x) c(Mean = colMeans(x[,3:4])))
ly.mean <- as.data.frame(do.call("rbind", ly.mean))
ly.mean <- cbind(Month = rownames(ly.mean), ly.mean)

my.data <- merge(my.data, ly.mean, by = 'Month')
my.data

使用sapply和split:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
my.data

sy.mean <- t(sapply(split(my.data, my.data$Month), function(x) colMeans(x[,3:4])))
colnames(sy.mean) <- c('Rate1.mean', 'Rate2.mean')
sy.mean <- data.frame(Month = rownames(sy.mean), sy.mean, stringsAsFactors = FALSE)
my.data <- merge(my.data, sy.mean, by = 'Month')
my.data

使用聚合:

my.data <- read.table(text = '
     Name     Month  Rate1     Rate2
     Aira       1      12        23
     Aira       2      18        73
     Aira       3      19        45
     Ben        1      53        19
     Ben        2      22        87
     Ben        3      19        45
     Cat        1      22        87
     Cat        2      67        43
     Cat        3      45        32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')

my.summary <- with(my.data, aggregate(list(Rate1, Rate2), by = list(Month), 
                   FUN = function(x) { mon.mean = mean(x, na.rm = TRUE) } ))

my.summary <- do.call(data.frame, my.summary)
colnames(my.summary) <- c('Month', 'Rate1.mean', 'Rate2.mean')
my.summary

my.data <- merge(my.data, my.summary, by = 'Month')
my.data

编辑时间:2020年6月28日

在这里,我使用聚合来通过组获得整个矩阵的列均值,其中组定义在外部向量中:

my.group <- c(1,2,1,2,2,3,1,2,3,3)

my.data <- matrix(c(   1,    2,    3,    4,    5,
                      10,   20,   30,   40,   50,
                       2,    4,    6,    8,   10,
                      20,   30,   40,   50,   60,
                      20,   18,   16,   14,   12,
                    1000, 1100, 1200, 1300, 1400,
                       2,    3,    4,    3,    2,
                      50,   40,   30,   20,   10,
                    1001, 2001, 3001, 4001, 5001,
                    1000, 2000, 3000, 4000, 5000), nrow = 10, ncol = 5, byrow = TRUE)
my.data

my.summary <- aggregate(list(my.data), by = list(my.group), FUN = function(x) { my.mean = mean(x, na.rm = TRUE) } )
my.summary
#  Group.1          X1       X2          X3       X4          X5
#1       1    1.666667    3.000    4.333333    5.000    5.666667
#2       2   25.000000   27.000   29.000000   31.000   33.000000
#3       3 1000.333333 1700.333 2400.333333 3100.333 3800.333333