我怎样才能做得快呢?
当然我可以这样做:
static bool ByteArrayCompare(byte[] a1, byte[] a2)
{
if (a1.Length != a2.Length)
return false;
for (int i=0; i<a1.Length; i++)
if (a1[i]!=a2[i])
return false;
return true;
}
但我正在寻找一个BCL函数或一些高度优化的已证明的方法来做到这一点。
java.util.Arrays.equals((sbyte[])(Array)a1, (sbyte[])(Array)a2);
工作得很好,但这似乎不适用于x64。
注意我的快速回答。
似乎EqualBytesLongUnrolled是上述建议中最好的。
被跳过的方法(Enumerable.SequenceEqual,StructuralComparisons.StructuralEqualityComparer.Equals)不是慢速的。在265MB的数组上,我测量了这个:
Host Process Environment Information:
BenchmarkDotNet.Core=v0.9.9.0
OS=Microsoft Windows NT 6.2.9200.0
Processor=Intel(R) Core(TM) i7-3770 CPU 3.40GHz, ProcessorCount=8
Frequency=3323582 ticks, Resolution=300.8802 ns, Timer=TSC
CLR=MS.NET 4.0.30319.42000, Arch=64-bit RELEASE [RyuJIT]
GC=Concurrent Workstation
JitModules=clrjit-v4.6.1590.0
Type=CompareMemoriesBenchmarks Mode=Throughput
Method | Median | StdDev | Scaled | Scaled-SD |
----------------------- |------------ |---------- |------- |---------- |
NewMemCopy | 30.0443 ms | 1.1880 ms | 1.00 | 0.00 |
EqualBytesLongUnrolled | 29.9917 ms | 0.7480 ms | 0.99 | 0.04 |
msvcrt_memcmp | 30.0930 ms | 0.2964 ms | 1.00 | 0.03 |
UnsafeCompare | 31.0520 ms | 0.7072 ms | 1.03 | 0.04 |
ByteArrayCompare | 212.9980 ms | 2.0776 ms | 7.06 | 0.25 |
OS=Windows
Processor=?, ProcessorCount=8
Frequency=3323582 ticks, Resolution=300.8802 ns, Timer=TSC
CLR=CORE, Arch=64-bit ? [RyuJIT]
GC=Concurrent Workstation
dotnet cli version: 1.0.0-preview2-003131
Type=CompareMemoriesBenchmarks Mode=Throughput
Method | Median | StdDev | Scaled | Scaled-SD |
----------------------- |------------ |---------- |------- |---------- |
NewMemCopy | 30.1789 ms | 0.0437 ms | 1.00 | 0.00 |
EqualBytesLongUnrolled | 30.1985 ms | 0.1782 ms | 1.00 | 0.01 |
msvcrt_memcmp | 30.1084 ms | 0.0660 ms | 1.00 | 0.00 |
UnsafeCompare | 31.1845 ms | 0.4051 ms | 1.03 | 0.01 |
ByteArrayCompare | 212.0213 ms | 0.1694 ms | 7.03 | 0.01 |
这与其他方法类似,但这里的不同之处在于,不存在我可以一次检查的下一个最高字节数,例如,如果我有63个字节(在我的SIMD示例中),我可以检查前32个字节的相等性,然后是后32个字节,这比检查32个字节、16个字节、8个字节等等要快。您输入的第一个检查是比较所有字节所需要的唯一检查。
这确实在我的测试中名列前茅,但仅以微弱之差。
下面的代码正是我在airbreather/ArrayComparePerf.cs中测试它的方式。
public unsafe bool SIMDNoFallThrough() #requires System.Runtime.Intrinsics.X86
{
if (a1 == null || a2 == null)
return false;
int length0 = a1.Length;
if (length0 != a2.Length) return false;
fixed (byte* b00 = a1, b01 = a2)
{
byte* b0 = b00, b1 = b01, last0 = b0 + length0, last1 = b1 + length0, last32 = last0 - 31;
if (length0 > 31)
{
while (b0 < last32)
{
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0), Avx.LoadVector256(b1))) != -1)
return false;
b0 += 32;
b1 += 32;
}
return Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(last0 - 32), Avx.LoadVector256(last1 - 32))) == -1;
}
if (length0 > 15)
{
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0), Sse2.LoadVector128(b1))) != 65535)
return false;
return Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(last0 - 16), Sse2.LoadVector128(last1 - 16))) == 65535;
}
if (length0 > 7)
{
if (*(ulong*)b0 != *(ulong*)b1)
return false;
return *(ulong*)(last0 - 8) == *(ulong*)(last1 - 8);
}
if (length0 > 3)
{
if (*(uint*)b0 != *(uint*)b1)
return false;
return *(uint*)(last0 - 4) == *(uint*)(last1 - 4);
}
if (length0 > 1)
{
if (*(ushort*)b0 != *(ushort*)b1)
return false;
return *(ushort*)(last0 - 2) == *(ushort*)(last1 - 2);
}
return *b0 == *b1;
}
}
如果没有首选的SIMD,与现有的longpointer算法相同的方法:
public unsafe bool LongPointersNoFallThrough()
{
if (a1 == null || a2 == null || a1.Length != a2.Length)
return false;
fixed (byte* p1 = a1, p2 = a2)
{
byte* x1 = p1, x2 = p2;
int l = a1.Length;
if ((l & 8) != 0)
{
for (int i = 0; i < l / 8; i++, x1 += 8, x2 += 8)
if (*(long*)x1 != *(long*)x2) return false;
return *(long*)(x1 + (l - 8)) == *(long*)(x2 + (l - 8));
}
if ((l & 4) != 0)
{
if (*(int*)x1 != *(int*)x2) return false; x1 += 4; x2 += 4;
return *(int*)(x1 + (l - 4)) == *(int*)(x2 + (l - 4));
}
if ((l & 2) != 0)
{
if (*(short*)x1 != *(short*)x2) return false; x1 += 2; x2 += 2;
return *(short*)(x1 + (l - 2)) == *(short*)(x2 + (l - 2));
}
return *x1 == *x2;
}
}
对于那些关心顺序的人(即希望你的memcmp返回一个int而不是什么都没有),. net Core 3.0(以及。net Standard 2.1也就是。net 5.0)将包括一个Span.SequenceCompareTo(…)扩展方法(加上一个Span.SequenceEqualTo),可以用来比较两个ReadOnlySpan<T>实例(其中T: IComparable<T>)。
在最初的GitHub提案中,讨论了与跳转表计算的方法比较,将字节[]读为长[],SIMD使用,以及对CLR实现的memcmp的p/调用。
继续向前,这应该是您比较字节数组或字节范围的首选方法(对于. net Standard 2.1 api,应该使用Span<byte>而不是byte[]),并且它足够快,您应该不再关心优化它(不,尽管在名称上有相似之处,但它的性能不像可怕的Enumerable.SequenceEqual那样糟糕)。
#if NETCOREAPP3_0_OR_GREATER
// Using the platform-native Span<T>.SequenceEqual<T>(..)
public static int Compare(byte[] range1, int offset1, byte[] range2, int offset2, int count)
{
var span1 = range1.AsSpan(offset1, count);
var span2 = range2.AsSpan(offset2, count);
return span1.SequenceCompareTo(span2);
// or, if you don't care about ordering
// return span1.SequenceEqual(span2);
}
#else
// The most basic implementation, in platform-agnostic, safe C#
public static bool Compare(byte[] range1, int offset1, byte[] range2, int offset2, int count)
{
// Working backwards lets the compiler optimize away bound checking after the first loop
for (int i = count - 1; i >= 0; --i)
{
if (range1[offset1 + i] != range2[offset2 + i])
{
return false;
}
}
return true;
}
#endif
我开发了一个方法,稍微击败memcmp() (plinth的答案)和非常轻微击败EqualBytesLongUnrolled() (Arek Bulski的答案)在我的PC上。基本上,它以4而不是8展开循环。
2019年3月30日更新:
从。net核心3.0开始,我们有了SIMD支持!
这个解决方案在我的PC上是最快的:
#if NETCOREAPP3_0
using System.Runtime.Intrinsics.X86;
#endif
…
public static unsafe bool Compare(byte[] arr0, byte[] arr1)
{
if (arr0 == arr1)
{
return true;
}
if (arr0 == null || arr1 == null)
{
return false;
}
if (arr0.Length != arr1.Length)
{
return false;
}
if (arr0.Length == 0)
{
return true;
}
fixed (byte* b0 = arr0, b1 = arr1)
{
#if NETCOREAPP3_0
if (Avx2.IsSupported)
{
return Compare256(b0, b1, arr0.Length);
}
else if (Sse2.IsSupported)
{
return Compare128(b0, b1, arr0.Length);
}
else
#endif
{
return Compare64(b0, b1, arr0.Length);
}
}
}
#if NETCOREAPP3_0
public static unsafe bool Compare256(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus128 = lastAddr - 128;
const int mask = -1;
while (b0 < lastAddrMinus128) // unroll the loop so that we are comparing 128 bytes at a time.
{
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0), Avx.LoadVector256(b1))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 32), Avx.LoadVector256(b1 + 32))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 64), Avx.LoadVector256(b1 + 64))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 96), Avx.LoadVector256(b1 + 96))) != mask)
{
return false;
}
b0 += 128;
b1 += 128;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}
public static unsafe bool Compare128(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus64 = lastAddr - 64;
const int mask = 0xFFFF;
while (b0 < lastAddrMinus64) // unroll the loop so that we are comparing 64 bytes at a time.
{
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0), Sse2.LoadVector128(b1))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 16), Sse2.LoadVector128(b1 + 16))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 32), Sse2.LoadVector128(b1 + 32))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 48), Sse2.LoadVector128(b1 + 48))) != mask)
{
return false;
}
b0 += 64;
b1 += 64;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}
#endif
public static unsafe bool Compare64(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus32 = lastAddr - 32;
while (b0 < lastAddrMinus32) // unroll the loop so that we are comparing 32 bytes at a time.
{
if (*(ulong*)b0 != *(ulong*)b1) return false;
if (*(ulong*)(b0 + 8) != *(ulong*)(b1 + 8)) return false;
if (*(ulong*)(b0 + 16) != *(ulong*)(b1 + 16)) return false;
if (*(ulong*)(b0 + 24) != *(ulong*)(b1 + 24)) return false;
b0 += 32;
b1 += 32;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}