Python代码计算:
余弦距离
余弦相似度
角距离
角相似度
import math
from scipy import spatial
def calculate_cosine_distance(a, b):
    """Return the cosine distance between the vectors ``a`` and ``b``.

    The computation is delegated to ``scipy.spatial.distance.cosine``; the
    result is converted to a built-in ``float`` before being returned.
    """
    return float(spatial.distance.cosine(a, b))
def calculate_cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    Defined as one minus the cosine distance computed by
    ``calculate_cosine_distance``.
    """
    return 1 - calculate_cosine_distance(a, b)
def calculate_angular_distance(a, b):
    """Return the angular distance between ``a`` and ``b``.

    The value is ``arccos(cosine_similarity) / pi``, so it lies in ``[0, 1]``
    for any finite similarity.
    """
    cosine_similarity = calculate_cosine_similarity(a, b)
    # Floating-point rounding can leave the similarity of (anti)parallel
    # vectors marginally outside [-1, 1], where math.acos raises ValueError.
    # The argument order is deliberate: it keeps a NaN similarity as NaN
    # instead of silently turning it into a valid-looking distance.
    clamped = min(max(cosine_similarity, -1.0), 1.0)
    return math.acos(clamped) / math.pi
def calculate_angular_similarity(a, b):
    """Return the angular similarity of ``a`` and ``b``.

    Defined as one minus the angular distance computed by
    ``calculate_angular_distance``.
    """
    return 1 - calculate_angular_distance(a, b)
相似性搜索:
如果你想在一组嵌入向量中按余弦相似度找到最接近的嵌入,可以使用TensorFlow,如下面的代码所示。
在我的测试中,使用GPU可以在不到一秒的时间内,从100万个嵌入(1,000,000 x 512)中找到与形状为1x512的嵌入最接近的值。
import time
import numpy as np # np.__version__ == '1.23.5'
import tensorflow as tf # tf.__version__ == '2.11.0'
# Number of components in each embedding vector used by the benchmark.
EMBEDDINGS_LENGTH = 512
# Number of random embeddings generated for the benchmark (one million).
NUMBER_OF_EMBEDDINGS = 1000 * 1000
def calculate_cosine_similarities(x, embeddings):
    """Return the cosine similarities between ``x`` and ``embeddings``.

    ``tf.keras.losses.cosine_similarity`` is a loss function and returns the
    negated similarity, so the sign is flipped back here. The result is
    returned as a NumPy array.

    As called from ``main``, ``x`` has shape (512,) and ``embeddings`` has
    shape (1000000, 512).
    """
    negated_similarities = tf.keras.losses.cosine_similarity(x, embeddings)
    return (-negated_similarities).numpy()
def find_closest_embeddings(x, embeddings, top_k=1):
    """Find the ``top_k`` embeddings most similar to ``x``.

    Similarities come from ``calculate_cosine_similarities``; the largest
    ``top_k`` entries are selected with ``tf.math.top_k``.

    Returns:
        A ``(values, indices)`` pair of NumPy arrays: the selected similarity
        values and their positions in the similarity array.
    """
    similarities = calculate_cosine_similarities(x, embeddings)
    best = tf.math.top_k(similarities, k=top_k)
    values, indices = best
    return values.numpy(), indices.numpy()
def main():
    """Benchmark the nearest-embedding search on random data.

    Generates one random query vector and ``NUMBER_OF_EMBEDDINGS`` random
    embeddings, runs the top-1 search 100 times, and prints the duration of
    each run followed by the average duration.
    """
    # x shape: (512)
    # Embeddings shape: (1000000, 512)
    x = np.random.rand(EMBEDDINGS_LENGTH).astype(np.float32)
    embeddings = np.random.rand(NUMBER_OF_EMBEDDINGS, EMBEDDINGS_LENGTH).astype(np.float32)

    print('Embeddings shape: ', embeddings.shape)

    n = 100
    sum_duration = 0
    for _ in range(n):
        # perf_counter is monotonic and high-resolution, which makes it the
        # appropriate clock for measuring elapsed intervals.
        start = time.perf_counter()
        best_values, best_indices = find_closest_embeddings(x, embeddings, top_k=1)
        duration = time.perf_counter() - start

        sum_duration += duration

        print('Duration (seconds): {}, Best value: {}, Best index: {}'.format(duration, best_values[0], best_indices[0]))

    # Average duration (seconds): 1.707 for Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz
    # Average duration (seconds): 0.961 for NVIDIA 1080 ti
    print('Average duration (seconds): ', sum_duration / n)
# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    main()
对于更高级的相似度搜索,你可以使用Milvus, Weaviate或Faiss。
https://en.wikipedia.org/wiki/Cosine_similarity
https://gist.github.com/amir-saniyan/e102de09b01c4ed1632e3d1a1a1cbf64