NumPy提出了一种通过np.argmax获取数组最大值索引的方法。

我想要一个类似的东西,但返回N个最大值的索引。

例如,如果我有一个数组[1,3,2,4,5],那么nargmax(array, n=3)将返回对应于元素[5,4,3]的下标[4,3,1]。


当前回答

当top_k<<axis_length时,它优于argsort。

import numpy as np

def get_sorted_top_k(array, top_k=1, axis=-1, reverse=False):
    if reverse:
        axis_length = array.shape[axis]
        partition_index = np.take(np.argpartition(array, kth=-top_k, axis=axis),
                                  range(axis_length - top_k, axis_length), axis)
    else:
        partition_index = np.take(np.argpartition(array, kth=top_k, axis=axis), range(0, top_k), axis)
    top_scores = np.take_along_axis(array, partition_index, axis)
    # resort partition
    sorted_index = np.argsort(top_scores, axis=axis)
    if reverse:
        sorted_index = np.flip(sorted_index, axis=axis)
    top_sorted_scores = np.take_along_axis(top_scores, sorted_index, axis)
    top_sorted_indexes = np.take_along_axis(partition_index, sorted_index, axis)
    return top_sorted_scores, top_sorted_indexes

if __name__ == "__main__":
    import time
    from sklearn.metrics.pairwise import cosine_similarity

    x = np.random.rand(10, 128)
    y = np.random.rand(1000000, 128)
    z = cosine_similarity(x, y)
    start_time = time.time()
    sorted_index_1 = get_sorted_top_k(z, top_k=3, axis=1, reverse=True)[1]
    print(time.time() - start_time)

其他回答

当top_k<<axis_length时,它优于argsort。

import numpy as np

def get_sorted_top_k(array, top_k=1, axis=-1, reverse=False):
    if reverse:
        axis_length = array.shape[axis]
        partition_index = np.take(np.argpartition(array, kth=-top_k, axis=axis),
                                  range(axis_length - top_k, axis_length), axis)
    else:
        partition_index = np.take(np.argpartition(array, kth=top_k, axis=axis), range(0, top_k), axis)
    top_scores = np.take_along_axis(array, partition_index, axis)
    # resort partition
    sorted_index = np.argsort(top_scores, axis=axis)
    if reverse:
        sorted_index = np.flip(sorted_index, axis=axis)
    top_sorted_scores = np.take_along_axis(top_scores, sorted_index, axis)
    top_sorted_indexes = np.take_along_axis(partition_index, sorted_index, axis)
    return top_sorted_scores, top_sorted_indexes

if __name__ == "__main__":
    import time
    from sklearn.metrics.pairwise import cosine_similarity

    x = np.random.rand(10, 128)
    y = np.random.rand(1000000, 128)
    z = cosine_similarity(x, y)
    start_time = time.time()
    sorted_index_1 = get_sorted_top_k(z, top_k=3, axis=1, reverse=True)[1]
    print(time.time() - start_time)

Use:

from operator import itemgetter
from heapq import nlargest
result = nlargest(N, enumerate(your_list), itemgetter(1))

现在,结果列表将包含N个元组(index, value),其中value是最大的。

Use:

def max_indices(arr, k):
    '''
    Returns the indices of the k first largest elements of arr
    (in descending order in values)
    '''
    assert k <= arr.size, 'k should be smaller or equal to the array size'
    arr_ = arr.astype(float)  # make a copy of arr
    max_idxs = []
    for _ in range(k):
        max_element = np.max(arr_)
        if np.isinf(max_element):
            break
        else:
            idx = np.where(arr_ == max_element)
        max_idxs.append(idx)
        arr_[idx] = -np.inf
    return max_idxs

它也适用于2D数组。例如,

In [0]: A = np.array([[ 0.51845014,  0.72528114],
                     [ 0.88421561,  0.18798661],
                     [ 0.89832036,  0.19448609],
                     [ 0.89832036,  0.19448609]])
In [1]: max_indices(A, 8)
Out[1]:
    [(array([2, 3], dtype=int64), array([0, 0], dtype=int64)),
     (array([1], dtype=int64), array([0], dtype=int64)),
     (array([0], dtype=int64), array([1], dtype=int64)),
     (array([0], dtype=int64), array([0], dtype=int64)),
     (array([2, 3], dtype=int64), array([1, 1], dtype=int64)),
     (array([1], dtype=int64), array([1], dtype=int64))]

In [2]: A[max_indices(A, 8)[0]][0]
Out[2]: array([ 0.89832036])

简单的:

idx = (-arr).argsort()[:n]

其中n为最大值的个数。

比较了编码的便捷性和速度

速度对我的需求很重要,所以我测试了这个问题的三个答案。

根据我的具体情况,对这三个答案中的代码进行了修改。

然后我比较了每种方法的速度。

编码智慧:

NPE的回答是最优雅的,也足够快地满足我的需求。 Fred foo的回答需要最多的重构来满足我的需求,但却是最快的。我选择了这个答案,因为尽管它需要更多的工作,但它并不太糟糕,并且具有显著的速度优势。 Off99555的回答是最优雅的,但也是最慢的。

测试和比较的完整代码

import numpy as np
import time
import random
import sys
from operator import itemgetter
from heapq import nlargest

''' Fake Data Setup '''
a1 = list(range(1000000))
random.shuffle(a1)
a1 = np.array(a1)

''' ################################################ '''
''' NPE's Answer Modified A Bit For My Case '''
t0 = time.time()
indices = np.flip(np.argsort(a1))[:5]
results = []
for index in indices:
    results.append((index, a1[index]))
t1 = time.time()
print("NPE's Answer:")
print(results)
print(t1 - t0)
print()

''' Fred Foos Answer Modified A Bit For My Case'''
t0 = time.time()
indices = np.argpartition(a1, -6)[-5:]
results = []
for index in indices:
    results.append((a1[index], index))
results.sort(reverse=True)
results = [(b, a) for a, b in results]
t1 = time.time()
print("Fred Foo's Answer:")
print(results)
print(t1 - t0)
print()

''' off99555's Answer - No Modification Needed For My Needs '''
t0 = time.time()
result = nlargest(5, enumerate(a1), itemgetter(1))
t1 = time.time()
print("off99555's Answer:")
print(result)
print(t1 - t0)

输出速度报告

肺水肿的回答是:

[(631934, 999999), (788104, 999998), (413003, 999997), (536514, 999996), (81029, 999995)]
0.1349949836730957

Fred Foo的回答:

[(631934, 999999), (788104, 999998), (413003, 999997), (536514, 999996), (81029, 999995)]
0.011161565780639648

off99555的回答是:

[(631934, 999999), (788104, 999998), (413003, 999997), (536514, 999996), (81029, 999995)]
0.439760684967041