面对这么多函数却不考虑性能因素,很容易让人无所适从。我知道,现有的包中有少数函数除了寻找最优聚类数之外,还做了许多其他工作。以下是这些函数的基准测试结果,供打算在自己的项目中使用它们的读者参考:
# Build a reproducible two-column synthetic data set for the clustering
# benchmarks that follow.
n <- 100 # nominal total number of observations
g <- 6 # number of simulated groups; also reused as the RNG seed
set.seed(g)

# n / g is not a whole number (100 / 6 = 16.67) and rnorm() silently truncates
# a fractional count. Spell the integer division out: every group receives 16
# points, so the data set has g * 16 = 96 rows rather than n = 100.
group_size <- n %/% g

# For each group i, draw `group_size` normal deviates (sd = 1) centred on a
# random mean runif(1) * i^2. All x groups are drawn before any y group, which
# keeps the random-number stream in the same order as a single data.frame()
# call with x listed first.
d <- data.frame(
  x = unlist(lapply(seq_len(g), function(i) rnorm(group_size, runif(1) * i^2))),
  y = unlist(lapply(seq_len(g), function(i) rnorm(group_size, runif(1) * i^2)))
)

# Second name for the same data; the `wss` benchmark refers to `mydata`.
mydata <- d
require(cluster)
require(vegan)
require(mclust)
require(apcluster)
require(NbClust)
require(fpc)
# Run every candidate approach for picking the number of clusters exactly once
# (times = 1) on the same data. The argument names below are the labels that
# appear in the `expr` column of the printed timing table.
microbenchmark::microbenchmark(
  # kmeans() within-cluster sum of squares for 2..15 centers; element 1 holds
  # the total sum of squares of the unclustered data.
  wss = {
    wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
    for (i in 2:15) {
      wss[i] <- sum(kmeans(mydata, centers = i)[["withinss"]])
    }
  },
  # pam() for k = 2..20, keeping silinfo$avg.width for each k and taking the
  # index of the largest value.
  fpc = {
    asw <- numeric(20)
    for (k in 2:20) {
      asw[[k]] <- pam(d, k)[["silinfo"]][["avg.width"]]
    }
    k.best <- which.max(asw)
  },
  # Single call to fpc::pamk() with its defaults.
  fpc_1 = fpc::pamk(d),
  # cascadeKM() on the standardized data for 1..10 groups; the plot call is
  # part of the timed expression. The best index is taken from row 2 of the
  # results table.
  vegan = {
    fit <- cascadeKM(
      scale(d, center = TRUE, scale = TRUE),
      inf.gr = 1,
      sup.gr = 10,
      iter = 1000
    )
    plot(fit, sortg = TRUE, grpmts.plot = TRUE)
    calinski.best <- as.numeric(which.max(fit[["results"]][2, ]))
  },
  # Mclust() over 1..20 components; the chosen count is read off as the number
  # of columns of the fitted `z` matrix.
  mclust = {
    d_clust <- Mclust(as.matrix(d), G = 1:20)
    m.best <- ncol(d_clust[["z"]])
  },
  # apcluster() with the negDistMat(r = 2) similarity.
  d.apclus = apcluster(negDistMat(r = 2), d),
  # clusGap() with kmeans, up to 10 clusters, B = 100.
  clusGap = clusGap(
    d,
    FUNcluster = kmeans,
    K.max = 10,
    B = 100,
    verbose = interactive()
  ),
  # NbClust() with the "alllong" index set over 2..15 clusters.
  NbClust = NbClust(
    data = d,
    diss = NULL,
    distance = "euclidean",
    method = "kmeans",
    min.nc = 2,
    max.nc = 15,
    index = "alllong",
    alphaBeale = 0.1
  ),
  times = 1
)
Unit: milliseconds
expr min lq mean median uq max neval
wss 16.83938 16.83938 16.83938 16.83938 16.83938 16.83938 1
fpc 221.99490 221.99490 221.99490 221.99490 221.99490 221.99490 1
fpc_1 43.10493 43.10493 43.10493 43.10493 43.10493 43.10493 1
vegan 1096.08568 1096.08568 1096.08568 1096.08568 1096.08568 1096.08568 1
mclust 1531.69475 1531.69475 1531.69475 1531.69475 1531.69475 1531.69475 1
d.apclus 28.56100 28.56100 28.56100 28.56100 28.56100 28.56100 1
clusGap 1096.50680 1096.50680 1096.50680 1096.50680 1096.50680 1096.50680 1
NbClust 10940.98807 10940.98807 10940.98807 10940.98807 10940.98807 10940.98807 1
我发现 fpc 包中的 pamk 函数最符合我的需求。