更新:到目前为止表现最好的算法是这个。
这个问题探讨了在实时时间序列数据中检测突然峰值的稳健算法。
考虑以下示例数据:
这个数据的例子是Matlab格式的(但这个问题不是关于语言,而是关于算法):
p = [1 1 1.1 1 0.9 1 1 1.1 1 0.9 1 1.1 1 1 0.9 1 1 1.1 1 1 1 1 1.1 0.9 1 1.1 1 1 0.9, ...
1 1.1 1 1 1.1 1 0.8 0.9 1 1.2 0.9 1 1 1.1 1.2 1 1.5 1 3 2 5 3 2 1 1 1 0.9 1 1, ...
3 2.6 4 3 3.2 2 1 1 0.8 4 4 2 2.5 1 1 1];
你可以清楚地看到有三个大峰和一些小峰。这个数据集是问题所涉及的时间序列数据集类的一个特定示例。这类数据集有两个一般特征:
有一种具有一般平均值的基本噪声
有很大的“峰值”或“更高的数据点”明显偏离噪声。
让我们假设以下情况:
峰的宽度不能事先确定
峰的高度明显偏离其他值
算法实时更新(因此每个新数据点都会更新)
对于这种情况,需要构造一个触发信号的边值。但是,边界值不能是静态的,必须通过算法实时确定。
我的问题是:什么是实时计算这些阈值的好算法?有没有针对这种情况的特定算法?最著名的算法是什么?
健壮的算法或有用的见解都受到高度赞赏。(可以用任何语言回答:这是关于算法的)
我在我的机器人项目中需要这样的东西。我想我可以归还Kotlin实现。
/**
* Smoothed zero-score alogrithm shamelessly copied from https://stackoverflow.com/a/22640362/6029703
* Uses a rolling mean and a rolling deviation (separate) to identify peaks in a vector
*
* @param y - The input vector to analyze
* @param lag - The lag of the moving window (i.e. how big the window is)
* @param threshold - The z-score at which the algorithm signals (i.e. how many standard deviations away from the moving mean a peak (or signal) is)
* @param influence - The influence (between 0 and 1) of new signals on the mean and standard deviation (how much a peak (or signal) should affect other values near it)
* @return - The calculated averages (avgFilter) and deviations (stdFilter), and the signals (signals)
*/
fun smoothedZScore(y: List<Double>, lag: Int, threshold: Double, influence: Double): Triple<List<Int>, List<Double>, List<Double>> {
val stats = SummaryStatistics()
// the results (peaks, 1 or -1) of our algorithm
val signals = MutableList<Int>(y.size, { 0 })
// filter out the signals (peaks) from our original list (using influence arg)
val filteredY = ArrayList<Double>(y)
// the current average of the rolling window
val avgFilter = MutableList<Double>(y.size, { 0.0 })
// the current standard deviation of the rolling window
val stdFilter = MutableList<Double>(y.size, { 0.0 })
// init avgFilter and stdFilter
y.take(lag).forEach { s -> stats.addValue(s) }
avgFilter[lag - 1] = stats.mean
stdFilter[lag - 1] = Math.sqrt(stats.populationVariance) // getStandardDeviation() uses sample variance (not what we want)
stats.clear()
//loop input starting at end of rolling window
(lag..y.size - 1).forEach { i ->
//if the distance between the current value and average is enough standard deviations (threshold) away
if (Math.abs(y[i] - avgFilter[i - 1]) > threshold * stdFilter[i - 1]) {
//this is a signal (i.e. peak), determine if it is a positive or negative signal
signals[i] = if (y[i] > avgFilter[i - 1]) 1 else -1
//filter this signal out using influence
filteredY[i] = (influence * y[i]) + ((1 - influence) * filteredY[i - 1])
} else {
//ensure this signal remains a zero
signals[i] = 0
//ensure this value is not filtered
filteredY[i] = y[i]
}
//update rolling average and deviation
(i - lag..i - 1).forEach { stats.addValue(filteredY[it]) }
avgFilter[i] = stats.getMean()
stdFilter[i] = Math.sqrt(stats.getPopulationVariance()) //getStandardDeviation() uses sample variance (not what we want)
stats.clear()
}
return Triple(signals, avgFilter, stdFilter)
}
带有验证图的示例项目可以在github上找到。
下面是这个答案的平滑z-score算法的c++实现
std::vector<int> smoothedZScore(std::vector<float> input)
{
//lag 5 for the smoothing functions
int lag = 5;
//3.5 standard deviations for signal
float threshold = 3.5;
//between 0 and 1, where 1 is normal influence, 0.5 is half
float influence = .5;
if (input.size() <= lag + 2)
{
std::vector<int> emptyVec;
return emptyVec;
}
//Initialise variables
std::vector<int> signals(input.size(), 0.0);
std::vector<float> filteredY(input.size(), 0.0);
std::vector<float> avgFilter(input.size(), 0.0);
std::vector<float> stdFilter(input.size(), 0.0);
std::vector<float> subVecStart(input.begin(), input.begin() + lag);
avgFilter[lag] = mean(subVecStart);
stdFilter[lag] = stdDev(subVecStart);
for (size_t i = lag + 1; i < input.size(); i++)
{
if (std::abs(input[i] - avgFilter[i - 1]) > threshold * stdFilter[i - 1])
{
if (input[i] > avgFilter[i - 1])
{
signals[i] = 1; //# Positive signal
}
else
{
signals[i] = -1; //# Negative signal
}
//Make influence lower
filteredY[i] = influence* input[i] + (1 - influence) * filteredY[i - 1];
}
else
{
signals[i] = 0; //# No signal
filteredY[i] = input[i];
}
//Adjust the filters
std::vector<float> subVec(filteredY.begin() + i - lag, filteredY.begin() + i);
avgFilter[i] = mean(subVec);
stdFilter[i] = stdDev(subVec);
}
return signals;
}
下面是在Golang中实现的Smoothed z-score算法(上图)。它假设一个[]int16 (PCM 16bit样本)的切片。你可以在这里找到要点。
/*
Settings (the ones below are examples: choose what is best for your data)
set lag to 5; # lag 5 for the smoothing functions
set threshold to 3.5; # 3.5 standard deviations for signal
set influence to 0.5; # between 0 and 1, where 1 is normal influence, 0.5 is half
*/
// ZScore on 16bit WAV samples
func ZScore(samples []int16, lag int, threshold float64, influence float64) (signals []int16) {
//lag := 20
//threshold := 3.5
//influence := 0.5
signals = make([]int16, len(samples))
filteredY := make([]int16, len(samples))
for i, sample := range samples[0:lag] {
filteredY[i] = sample
}
avgFilter := make([]int16, len(samples))
stdFilter := make([]int16, len(samples))
avgFilter[lag] = Average(samples[0:lag])
stdFilter[lag] = Std(samples[0:lag])
for i := lag + 1; i < len(samples); i++ {
f := float64(samples[i])
if float64(Abs(samples[i]-avgFilter[i-1])) > threshold*float64(stdFilter[i-1]) {
if samples[i] > avgFilter[i-1] {
signals[i] = 1
} else {
signals[i] = -1
}
filteredY[i] = int16(influence*f + (1-influence)*float64(filteredY[i-1]))
avgFilter[i] = Average(filteredY[(i - lag):i])
stdFilter[i] = Std(filteredY[(i - lag):i])
} else {
signals[i] = 0
filteredY[i] = samples[i]
avgFilter[i] = Average(filteredY[(i - lag):i])
stdFilter[i] = Std(filteredY[(i - lag):i])
}
}
return
}
// Average a chunk of values
func Average(chunk []int16) (avg int16) {
var sum int64
for _, sample := range chunk {
if sample < 0 {
sample *= -1
}
sum += int64(sample)
}
return int16(sum / int64(len(chunk)))
}
使用实时流的Python版本(不会在每个新数据点到达时重新计算所有数据点)。您可能想要调整类函数返回的内容—对于我的目的,我只需要信号。
import numpy as np
class real_time_peak_detection():
def __init__(self, array, lag, threshold, influence):
self.y = list(array)
self.length = len(self.y)
self.lag = lag
self.threshold = threshold
self.influence = influence
self.signals = [0] * len(self.y)
self.filteredY = np.array(self.y).tolist()
self.avgFilter = [0] * len(self.y)
self.stdFilter = [0] * len(self.y)
self.avgFilter[self.lag - 1] = np.mean(self.y[0:self.lag]).tolist()
self.stdFilter[self.lag - 1] = np.std(self.y[0:self.lag]).tolist()
def thresholding_algo(self, new_value):
self.y.append(new_value)
i = len(self.y) - 1
self.length = len(self.y)
if i < self.lag:
return 0
elif i == self.lag:
self.signals = [0] * len(self.y)
self.filteredY = np.array(self.y).tolist()
self.avgFilter = [0] * len(self.y)
self.stdFilter = [0] * len(self.y)
self.avgFilter[self.lag] = np.mean(self.y[0:self.lag]).tolist()
self.stdFilter[self.lag] = np.std(self.y[0:self.lag]).tolist()
return 0
self.signals += [0]
self.filteredY += [0]
self.avgFilter += [0]
self.stdFilter += [0]
if abs(self.y[i] - self.avgFilter[i - 1]) > (self.threshold * self.stdFilter[i - 1]):
if self.y[i] > self.avgFilter[i - 1]:
self.signals[i] = 1
else:
self.signals[i] = -1
self.filteredY[i] = self.influence * self.y[i] + \
(1 - self.influence) * self.filteredY[i - 1]
self.avgFilter[i] = np.mean(self.filteredY[(i - self.lag):i])
self.stdFilter[i] = np.std(self.filteredY[(i - self.lag):i])
else:
self.signals[i] = 0
self.filteredY[i] = self.y[i]
self.avgFilter[i] = np.mean(self.filteredY[(i - self.lag):i])
self.stdFilter[i] = np.std(self.filteredY[(i - self.lag):i])
return self.signals[i]