我正在寻找一个函数,它将两个列表作为输入,并返回Pearson相关性,以及相关性的重要性。
当前回答
你可以看看这篇文章。这是一个使用pandas库(适用于Python)根据多个文件的历史外汇货币对数据计算相关性的示例,然后使用seaborn库生成热图图。
http://www.tradinggeeks.net/2015/08/calculating-correlation-in-python/
其他回答
本文给出了一种基于稀疏向量的pearson相关的实现方法。这里的向量表示为(index, value)表示的元组列表。两个稀疏向量可以是不同的长度,但总的向量大小必须是相同的。这对于文本挖掘应用程序非常有用,其中向量大小非常大,因为大多数特征都是单词包,因此通常使用稀疏向量执行计算。
def get_pearson_corelation(self, first_feature_vector=[], second_feature_vector=[], length_of_featureset=0):
indexed_feature_dict = {}
if first_feature_vector == [] or second_feature_vector == [] or length_of_featureset == 0:
raise ValueError("Empty feature vectors or zero length of featureset in get_pearson_corelation")
sum_a = sum(value for index, value in first_feature_vector)
sum_b = sum(value for index, value in second_feature_vector)
avg_a = float(sum_a) / length_of_featureset
avg_b = float(sum_b) / length_of_featureset
mean_sq_error_a = sqrt((sum((value - avg_a) ** 2 for index, value in first_feature_vector)) + ((
length_of_featureset - len(first_feature_vector)) * ((0 - avg_a) ** 2)))
mean_sq_error_b = sqrt((sum((value - avg_b) ** 2 for index, value in second_feature_vector)) + ((
length_of_featureset - len(second_feature_vector)) * ((0 - avg_b) ** 2)))
covariance_a_b = 0
#calculate covariance for the sparse vectors
for tuple in first_feature_vector:
if len(tuple) != 2:
raise ValueError("Invalid feature frequency tuple in featureVector: %s") % (tuple,)
indexed_feature_dict[tuple[0]] = tuple[1]
count_of_features = 0
for tuple in second_feature_vector:
count_of_features += 1
if len(tuple) != 2:
raise ValueError("Invalid feature frequency tuple in featureVector: %s") % (tuple,)
if tuple[0] in indexed_feature_dict:
covariance_a_b += ((indexed_feature_dict[tuple[0]] - avg_a) * (tuple[1] - avg_b))
del (indexed_feature_dict[tuple[0]])
else:
covariance_a_b += (0 - avg_a) * (tuple[1] - avg_b)
for index in indexed_feature_dict:
count_of_features += 1
covariance_a_b += (indexed_feature_dict[index] - avg_a) * (0 - avg_b)
#adjust covariance with rest of vector with 0 value
covariance_a_b += (length_of_featureset - count_of_features) * -avg_a * -avg_b
if mean_sq_error_a == 0 or mean_sq_error_b == 0:
return -1
else:
return float(covariance_a_b) / (mean_sq_error_a * mean_sq_error_b)
单元测试:
def test_get_get_pearson_corelation(self):
vector_a = [(1, 1), (2, 2), (3, 3)]
vector_b = [(1, 1), (2, 5), (3, 7)]
self.assertAlmostEquals(self.sim_calculator.get_pearson_corelation(vector_a, vector_b, 3), 0.981980506062, 3, None, None)
vector_a = [(1, 1), (2, 2), (3, 3)]
vector_b = [(1, 1), (2, 5), (3, 7), (4, 14)]
self.assertAlmostEquals(self.sim_calculator.get_pearson_corelation(vector_a, vector_b, 5), -0.0137089240555, 3, None, None)
def correlation_score(y_true, y_pred):
"""Scores the predictions according to the competition rules.
It is assumed that the predictions are not constant.
Returns the average of each sample's Pearson correlation coefficient"""
y2 = y_pred.copy()
y2 -= y2.mean(axis=0); y2 /= y2.std(axis=0)
y1 = y_true.copy();
y1 -= y1.mean(axis=0); y1 /= y1.std(axis=0)
c = (y1*y2).mean().mean()# Correlation for rescaled matrices is just matrix product and average
return c
你可以用pandas.DataFrame这样做。相关系数:
import pandas as pd
a = [[1, 2, 3],
[5, 6, 9],
[5, 6, 11],
[5, 6, 13],
[5, 3, 13]]
df = pd.DataFrame(data=a)
df.corr()
这给了
0 1 2
0 1.000000 0.745601 0.916579
1 0.745601 1.000000 0.544248
2 0.916579 0.544248 1.000000
从Python 3.10开始,Pearson的相关系数(statistics.correlation)可以直接在标准库中获得:
from statistics import correlation
# a = [15, 12, 8, 8, 7, 7, 7, 6, 5, 3]
# b = [10, 25, 17, 11, 13, 17, 20, 13, 9, 15]
correlation(a, b)
# 0.1449981545806852
这是使用numpy的Pearson Correlation函数的实现:
def corr(data1, data2):
"data1 & data2 should be numpy arrays."
mean1 = data1.mean()
mean2 = data2.mean()
std1 = data1.std()
std2 = data2.std()
# corr = ((data1-mean1)*(data2-mean2)).mean()/(std1*std2)
corr = ((data1*data2).mean()-mean1*mean2)/(std1*std2)
return corr