是否可以使用scikit-learn K-Means聚类来指定自己的距离函数?

不幸的是没有:scikit-learn目前实现的k-means只使用欧几里得距离。

将k-means扩展到其他距离并不是一件简单的事情，denis上面的回答并不是对其他度量实现k-means的正确方法。

2011-04-03 17:17:02

这是一个用20多个距离中的任意一个的小kmeans scipy.spatial。距离，或者用户函数。欢迎评论(到目前为止只有一个用户，还不够); 特别是，你的N，暗度，k度规是什么?

#!/usr/bin/env python
# kmeans.py using any of the 20-odd metrics in scipy.spatial.distance
# kmeanssample 2 pass, first sample sqrt(N)

from __future__ import division
import random
import numpy as np
from scipy.spatial.distance import cdist  # $scipy/spatial/distance.py
    # http://docs.scipy.org/doc/scipy/reference/spatial.html
from scipy.sparse import issparse  # $scipy/sparse/csr.py

__date__ = "2011-11-17 Nov denis"
    # X sparse, any cdist metric: real app ?
    # centres get dense rapidly, metrics in high dim hit distance whiteout
    # vs unsupervised / semi-supervised svm

#...............................................................................
def kmeans( X, centres, delta=.001, maxiter=10, metric="euclidean", p=2, verbose=1 ):
    """ centres, Xtocentre, distances = kmeans( X, initial centres ... )
    in:
        X N x dim  may be sparse
        centres k x dim: initial centres, e.g. random.sample( X, k )
        delta: relative error, iterate until the average distance to centres
            is within delta of the previous average distance
        maxiter
        metric: any of the 20-odd in scipy.spatial.distance
            "chebyshev" = max, "cityblock" = L1, "minkowski" with p=
            or a function( Xvec, centrevec ), e.g. Lqmetric below
        p: for minkowski metric -- local mod cdist for 0 < p < 1 too
        verbose: 0 silent, 2 prints running distances
    out:
        centres, k x dim
        Xtocentre: each X -> its nearest centre, ints N -> k
        distances, N
    see also: kmeanssample below, class Kmeans below.
    """
    if not issparse(X):
        X = np.asanyarray(X)  # ?
    centres = centres.todense() if issparse(centres) \
        else centres.copy()
    N, dim = X.shape
    k, cdim = centres.shape
    if dim != cdim:
        raise ValueError( "kmeans: X %s and centres %s must have the same number of columns" % (
            X.shape, centres.shape ))
    if verbose:
        print "kmeans: X %s  centres %s  delta=%.2g  maxiter=%d  metric=%s" % (
            X.shape, centres.shape, delta, maxiter, metric)
    allx = np.arange(N)
    prevdist = 0
    for jiter in range( 1, maxiter+1 ):
        D = cdist_sparse( X, centres, metric=metric, p=p )  # |X| x |centres|
        xtoc = D.argmin(axis=1)  # X -> nearest centre
        distances = D[allx,xtoc]
        avdist = distances.mean()  # median ?
        if verbose >= 2:
            print "kmeans: av |X - nearest centre| = %.4g" % avdist
        if (1 - delta) * prevdist <= avdist <= prevdist \
        or jiter == maxiter:
            break
        prevdist = avdist
        for jc in range(k):  # (1 pass in C)
            c = np.where( xtoc == jc )[0]
            if len(c) > 0:
                centres[jc] = X[c].mean( axis=0 )
    if verbose:
        print "kmeans: %d iterations  cluster sizes:" % jiter, np.bincount(xtoc)
    if verbose >= 2:
        r50 = np.zeros(k)
        r90 = np.zeros(k)
        for j in range(k):
            dist = distances[ xtoc == j ]
            if len(dist) > 0:
                r50[j], r90[j] = np.percentile( dist, (50, 90) )
        print "kmeans: cluster 50 % radius", r50.astype(int)
        print "kmeans: cluster 90 % radius", r90.astype(int)
            # scale L1 / dim, L2 / sqrt(dim) ?
    return centres, xtoc, distances

#...............................................................................
def kmeanssample( X, k, nsample=0, **kwargs ):
    """ 2-pass kmeans, fast for large N:
        1) kmeans a random sample of nsample ~ sqrt(N) from X
        2) full kmeans, starting from those centres
    """
        # merge w kmeans ? mttiw
        # v large N: sample N^1/2, N^1/2 of that
        # seed like sklearn ?
    N, dim = X.shape
    if nsample == 0:
        nsample = max( 2*np.sqrt(N), 10*k )
    Xsample = randomsample( X, int(nsample) )
    pass1centres = randomsample( X, int(k) )
    samplecentres = kmeans( Xsample, pass1centres, **kwargs )[0]
    return kmeans( X, samplecentres, **kwargs )

def cdist_sparse( X, Y, **kwargs ):
    """ -> |X| x |Y| cdist array, any cdist metric
        X or Y may be sparse -- best csr
    """
        # todense row at a time, v slow if both v sparse
    sxy = 2*issparse(X) + issparse(Y)
    if sxy == 0:
        return cdist( X, Y, **kwargs )
    d = np.empty( (X.shape[0], Y.shape[0]), np.float64 )
    if sxy == 2:
        for j, x in enumerate(X):
            d[j] = cdist( x.todense(), Y, **kwargs ) [0]
    elif sxy == 1:
        for k, y in enumerate(Y):
            d[:,k] = cdist( X, y.todense(), **kwargs ) [0]
    else:
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0]
    return d

def randomsample( X, n ):
    """ random.sample of the rows of X
        X may be sparse -- best csr
    """
    sampleix = random.sample( xrange( X.shape[0] ), int(n) )
    return X[sampleix]

def nearestcentres( X, centres, metric="euclidean", p=2 ):
    """ each X -> nearest centre, any metric
            euclidean2 (~ withinss) is more sensitive to outliers,
            cityblock (manhattan, L1) less sensitive
    """
    D = cdist( X, centres, metric=metric, p=p )  # |X| x |centres|
    return D.argmin(axis=1)

def Lqmetric( x, y=None, q=.5 ):
    # yes a metric, may increase weight of near matches; see ...
    return (np.abs(x - y) ** q) .mean() if y is not None \
        else (np.abs(x) ** q) .mean()

#...............................................................................
class Kmeans:
    """ km = Kmeans( X, k= or centres=, ... )
        in: either initial centres= for kmeans
            or k= [nsample=] for kmeanssample
        out: km.centres, km.Xtocentre, km.distances
        iterator:
            for jcentre, J in km:
                clustercentre = centres[jcentre]
                J indexes e.g. X[J], classes[J]
    """
    def __init__( self, X, k=0, centres=None, nsample=0, **kwargs ):
        self.X = X
        if centres is None:
            self.centres, self.Xtocentre, self.distances = kmeanssample(
                X, k=k, nsample=nsample, **kwargs )
        else:
            self.centres, self.Xtocentre, self.distances = kmeans(
                X, centres, **kwargs )

    def __iter__(self):
        for jc in range(len(self.centres)):
            yield jc, (self.Xtocentre == jc)

#...............................................................................
if __name__ == "__main__":
    import random
    import sys
    from time import time

    N = 10000
    dim = 10
    ncluster = 10
    kmsample = 100  # 0: random centres, > 0: kmeanssample
    kmdelta = .001
    kmiter = 10
    metric = "cityblock"  # "chebyshev" = max, "cityblock" L1,  Lqmetric
    seed = 1

    exec( "\n".join( sys.argv[1:] ))  # run this.py N= ...
    np.set_printoptions( 1, threshold=200, edgeitems=5, suppress=True )
    np.random.seed(seed)
    random.seed(seed)

    print "N %d  dim %d  ncluster %d  kmsample %d  metric %s" % (
        N, dim, ncluster, kmsample, metric)
    X = np.random.exponential( size=(N,dim) )
        # cf scikits-learn datasets/
    t0 = time()
    if kmsample > 0:
        centres, xtoc, dist = kmeanssample( X, ncluster, nsample=kmsample,
            delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 )
    else:
        randomcentres = randomsample( X, ncluster )
        centres, xtoc, dist = kmeans( X, randomcentres,
            delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 )
    print "%.0f msec" % ((time() - t0) * 1000)

    # also ~/py/np/kmeans/test-kmeans.py

2012年3月26日增加了一些说明:

1)对于余弦距离，首先将所有数据向量归一化为|X| = 1;然后

cosinedistance( X, Y ) = 1 - X . Y = Euclidean distance |X - Y|^2 / 2

是快。对于位向量，将范数与向量分开而不是展开到浮点数 (尽管有些程序可能会为你扩展)。对于稀疏向量，比如N, X的1%。Y需要时间O(2% N)，空间O (N);但我不知道哪个程序能做到。

2） Scikit-learn集群给出了k-means, mini-batch-k-means… 使用在scipy上工作的代码。稀疏矩阵。

3)总是在k-means之后检查聚类大小。如果你期待大致相同大小的集群，但是它们出来了 [44 37 9 5 5] %……(挠头的声音)。

2011-04-05 12:05:28

是的，你可以使用差分度量函数;然而，根据定义，k-means聚类算法依赖于每个聚类均值的欧几里得距离。

你可以使用不同的度量，所以即使你仍然在计算平均值你也可以使用像mahalnobis距离这样的东西。

2012-03-26 15:52:44

Spectral Python的k-means允许使用L1 (Manhattan)距离。

2013-03-31 00:14:34

只要在可以这样做的地方使用nltk即可，例如:

from nltk.cluster.kmeans import KMeansClusterer
NUM_CLUSTERS = <choose a value>
data = <sparse matrix that you would normally give to scikit>.toarray()

kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)

2016-09-12 01:38:45

python/ c++中有pyclustering(所以它很快!)，可以让你指定一个自定义度量函数

from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils.metric import type_metric, distance_metric

user_function = lambda point1, point2: point1[0] + point2[0] + 2
metric = distance_metric(type_metric.USER_DEFINED, func=user_function)

# create K-Means algorithm with specific distance metric
start_centers = [[4.7, 5.9], [5.7, 6.5]];
kmeans_instance = kmeans(sample, start_centers, metric=metric)

# run cluster analysis and obtain results
kmeans_instance.process()
clusters = kmeans_instance.get_clusters()

实际上，我还没有测试这段代码，但它拼凑在一起从一个票和示例代码。

2018-08-07 13:20:51

Sklearn Kmeans使用欧几里得距离。它没有度量参数。也就是说，如果你在聚类时间序列，你可以使用tslearn python包，当你可以指定一个度量(dtw, softdtw，欧几里得)。

2019-05-27 10:45:28

def distance_metrics(dist_metrics):
    kmeans_instance = kmeans(trs_data, initial_centers, metric=dist_metrics)

    label = np.zeros(210, dtype=int)
    for i in range(0, len(clusters)):
        for index, j in enumerate(clusters[i]):
            label[j] = i

2020-09-02 14:44:05

是的，在当前稳定版本的sklearn (scikit-learn 1.1.3)中，您可以轻松地使用自己的距离度量。你所要做的就是创建一个继承自sklearn.cluster.KMeans的类，并覆盖它的_transform方法。

下面的例子是IOU与Yolov2论文的距离。

import sklearn.cluster
import numpy as np

def anchor_iou(box_dims, centroid_box_dims):
    box_w, box_h = box_dims[..., 0], box_dims[..., 1]
    centroid_w, centroid_h = centroid_box_dims[..., 0], centroid_box_dims[..., 1]
    inter_w = np.minimum(box_w[..., np.newaxis], centroid_w[np.newaxis, ...])
    inter_h = np.minimum(box_h[..., np.newaxis], centroid_h[np.newaxis, ...])
    inter_area = inter_w * inter_h
    centroid_area = centroid_w * centroid_h
    box_area = box_w * box_h
    return inter_area / (
        centroid_area[np.newaxis, ...] + box_area[..., np.newaxis] - inter_area
    )

class IOUKMeans(sklearn.cluster.KMeans):
    def __init__(
        self,
        n_clusters=8,
        *,
        init="k-means++",
        n_init=10,
        max_iter=300,
        tol=1e-4,
        verbose=0,
        random_state=None,
        copy_x=True,
        algorithm="lloyd",
    ):
        super().__init__(
            n_clusters=n_clusters,
            init=init,
            n_init=n_init,
            max_iter=max_iter,
            tol=tol,
            verbose=verbose,
            random_state=random_state,
            copy_x=copy_x,
            algorithm=algorithm
        )

    def _transform(self, X):
        return anchor_iou(X, self.cluster_centers_)

rng = np.random.default_rng(12345)
num_boxes = 10
bboxes = rng.integers(low=0, high=100, size=(num_boxes, 2))

kmeans = IOUKMeans(num_clusters).fit(bboxes)

2022-10-28 08:38:19

The Affinity propagation algorithm from the sklearn library allows you to pass the similarity matrix instead of the samples. So, you can use your metric to compute the similarity matrix (not the dissimilarity matrix) and pass it to the function by setting the "affinity" term to "precomputed".https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation.fit In terms of the K-Mean, I think it is also possible but I have not tried it. However, as the other answers stated, finding the mean using a different metric will be the issue. Instead, you can use PAM (K-Medoids) algorthim as it calculates the change in Total Deviation (TD), thus it does not rely on the distance metric. https://python-kmedoids.readthedocs.io/en/latest/#fasterpam

2022-12-14 07:08:24

是否可以使用scikit-learn K-Means聚类来指定自己的距离函数?

推荐文章

最新文章

标签