我怎样才能做得快呢?
当然我可以这样做:
static bool ByteArrayCompare(byte[] a1, byte[] a2)
{
if (a1.Length != a2.Length)
return false;
for (int i=0; i<a1.Length; i++)
if (a1[i]!=a2[i])
return false;
return true;
}
但我正在寻找一个BCL函数或一些高度优化的已证明的方法来做到这一点。
java.util.Arrays.equals((sbyte[])(Array)a1, (sbyte[])(Array)a2);
工作得很好,但这似乎不适用于x64。
注意我的快速回答。
我开发了一个方法,稍微击败memcmp() (plinth的答案)和非常轻微击败EqualBytesLongUnrolled() (Arek Bulski的答案)在我的PC上。基本上,它以4而不是8展开循环。
2019年3月30日更新:
从。net核心3.0开始,我们有了SIMD支持!
这个解决方案在我的PC上是最快的:
#if NETCOREAPP3_0
using System.Runtime.Intrinsics.X86;
#endif
…
public static unsafe bool Compare(byte[] arr0, byte[] arr1)
{
if (arr0 == arr1)
{
return true;
}
if (arr0 == null || arr1 == null)
{
return false;
}
if (arr0.Length != arr1.Length)
{
return false;
}
if (arr0.Length == 0)
{
return true;
}
fixed (byte* b0 = arr0, b1 = arr1)
{
#if NETCOREAPP3_0
if (Avx2.IsSupported)
{
return Compare256(b0, b1, arr0.Length);
}
else if (Sse2.IsSupported)
{
return Compare128(b0, b1, arr0.Length);
}
else
#endif
{
return Compare64(b0, b1, arr0.Length);
}
}
}
#if NETCOREAPP3_0
public static unsafe bool Compare256(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus128 = lastAddr - 128;
const int mask = -1;
while (b0 < lastAddrMinus128) // unroll the loop so that we are comparing 128 bytes at a time.
{
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0), Avx.LoadVector256(b1))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 32), Avx.LoadVector256(b1 + 32))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 64), Avx.LoadVector256(b1 + 64))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 96), Avx.LoadVector256(b1 + 96))) != mask)
{
return false;
}
b0 += 128;
b1 += 128;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}
public static unsafe bool Compare128(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus64 = lastAddr - 64;
const int mask = 0xFFFF;
while (b0 < lastAddrMinus64) // unroll the loop so that we are comparing 64 bytes at a time.
{
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0), Sse2.LoadVector128(b1))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 16), Sse2.LoadVector128(b1 + 16))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 32), Sse2.LoadVector128(b1 + 32))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 48), Sse2.LoadVector128(b1 + 48))) != mask)
{
return false;
}
b0 += 64;
b1 += 64;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}
#endif
public static unsafe bool Compare64(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus32 = lastAddr - 32;
while (b0 < lastAddrMinus32) // unroll the loop so that we are comparing 32 bytes at a time.
{
if (*(ulong*)b0 != *(ulong*)b1) return false;
if (*(ulong*)(b0 + 8) != *(ulong*)(b1 + 8)) return false;
if (*(ulong*)(b0 + 16) != *(ulong*)(b1 + 16)) return false;
if (*(ulong*)(b0 + 24) != *(ulong*)(b1 + 24)) return false;
b0 += 32;
b1 += 32;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}
这与其他方法类似,但这里的不同之处在于,不存在我可以一次检查的下一个最高字节数,例如,如果我有63个字节(在我的SIMD示例中),我可以检查前32个字节的相等性,然后是后32个字节,这比检查32个字节、16个字节、8个字节等等要快。您输入的第一个检查是比较所有字节所需要的唯一检查。
这确实在我的测试中名列前茅,但仅以微弱之差。
下面的代码正是我在airbreather/ArrayComparePerf.cs中测试它的方式。
public unsafe bool SIMDNoFallThrough() #requires System.Runtime.Intrinsics.X86
{
if (a1 == null || a2 == null)
return false;
int length0 = a1.Length;
if (length0 != a2.Length) return false;
fixed (byte* b00 = a1, b01 = a2)
{
byte* b0 = b00, b1 = b01, last0 = b0 + length0, last1 = b1 + length0, last32 = last0 - 31;
if (length0 > 31)
{
while (b0 < last32)
{
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0), Avx.LoadVector256(b1))) != -1)
return false;
b0 += 32;
b1 += 32;
}
return Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(last0 - 32), Avx.LoadVector256(last1 - 32))) == -1;
}
if (length0 > 15)
{
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0), Sse2.LoadVector128(b1))) != 65535)
return false;
return Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(last0 - 16), Sse2.LoadVector128(last1 - 16))) == 65535;
}
if (length0 > 7)
{
if (*(ulong*)b0 != *(ulong*)b1)
return false;
return *(ulong*)(last0 - 8) == *(ulong*)(last1 - 8);
}
if (length0 > 3)
{
if (*(uint*)b0 != *(uint*)b1)
return false;
return *(uint*)(last0 - 4) == *(uint*)(last1 - 4);
}
if (length0 > 1)
{
if (*(ushort*)b0 != *(ushort*)b1)
return false;
return *(ushort*)(last0 - 2) == *(ushort*)(last1 - 2);
}
return *b0 == *b1;
}
}
如果没有首选的SIMD,与现有的longpointer算法相同的方法:
public unsafe bool LongPointersNoFallThrough()
{
if (a1 == null || a2 == null || a1.Length != a2.Length)
return false;
fixed (byte* p1 = a1, p2 = a2)
{
byte* x1 = p1, x2 = p2;
int l = a1.Length;
if ((l & 8) != 0)
{
for (int i = 0; i < l / 8; i++, x1 += 8, x2 += 8)
if (*(long*)x1 != *(long*)x2) return false;
return *(long*)(x1 + (l - 8)) == *(long*)(x2 + (l - 8));
}
if ((l & 4) != 0)
{
if (*(int*)x1 != *(int*)x2) return false; x1 += 4; x2 += 4;
return *(int*)(x1 + (l - 4)) == *(int*)(x2 + (l - 4));
}
if ((l & 2) != 0)
{
if (*(short*)x1 != *(short*)x2) return false; x1 += 2; x2 += 2;
return *(short*)(x1 + (l - 2)) == *(short*)(x2 + (l - 2));
}
return *x1 == *x2;
}
}
编辑:现代的快速方法是使用a1.SequenceEquals(a2)
用户gil提出了不安全的代码,产生了这个解决方案:
// Copyright (c) 2008-2013 Hafthor Stefansson
// Distributed under the MIT/X11 software license
// Ref: http://www.opensource.org/licenses/mit-license.php.
static unsafe bool UnsafeCompare(byte[] a1, byte[] a2) {
unchecked {
if(a1==a2) return true;
if(a1==null || a2==null || a1.Length!=a2.Length)
return false;
fixed (byte* p1=a1, p2=a2) {
byte* x1=p1, x2=p2;
int l = a1.Length;
for (int i=0; i < l/8; i++, x1+=8, x2+=8)
if (*((long*)x1) != *((long*)x2)) return false;
if ((l & 4)!=0) { if (*((int*)x1)!=*((int*)x2)) return false; x1+=4; x2+=4; }
if ((l & 2)!=0) { if (*((short*)x1)!=*((short*)x2)) return false; x1+=2; x2+=2; }
if ((l & 1)!=0) if (*((byte*)x1) != *((byte*)x2)) return false;
return true;
}
}
}
它对尽可能多的数组进行基于64位的比较。这依赖于数组以qword对齐开始的事实。它会工作,如果不是qword对齐,只是没有那么快,如果它是。
它比简单的“for”循环快了大约7个计时器。使用j#库执行相当于原来的' for '循环。使用.SequenceEqual会慢7倍左右;我想只是因为它使用了ienumerator。movenext。我认为基于linq的解决方案至少会这么慢,甚至更糟。
我开发了一个方法,稍微击败memcmp() (plinth的答案)和非常轻微击败EqualBytesLongUnrolled() (Arek Bulski的答案)在我的PC上。基本上,它以4而不是8展开循环。
2019年3月30日更新:
从。net核心3.0开始,我们有了SIMD支持!
这个解决方案在我的PC上是最快的:
#if NETCOREAPP3_0
using System.Runtime.Intrinsics.X86;
#endif
…
public static unsafe bool Compare(byte[] arr0, byte[] arr1)
{
if (arr0 == arr1)
{
return true;
}
if (arr0 == null || arr1 == null)
{
return false;
}
if (arr0.Length != arr1.Length)
{
return false;
}
if (arr0.Length == 0)
{
return true;
}
fixed (byte* b0 = arr0, b1 = arr1)
{
#if NETCOREAPP3_0
if (Avx2.IsSupported)
{
return Compare256(b0, b1, arr0.Length);
}
else if (Sse2.IsSupported)
{
return Compare128(b0, b1, arr0.Length);
}
else
#endif
{
return Compare64(b0, b1, arr0.Length);
}
}
}
#if NETCOREAPP3_0
public static unsafe bool Compare256(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus128 = lastAddr - 128;
const int mask = -1;
while (b0 < lastAddrMinus128) // unroll the loop so that we are comparing 128 bytes at a time.
{
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0), Avx.LoadVector256(b1))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 32), Avx.LoadVector256(b1 + 32))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 64), Avx.LoadVector256(b1 + 64))) != mask)
{
return false;
}
if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 96), Avx.LoadVector256(b1 + 96))) != mask)
{
return false;
}
b0 += 128;
b1 += 128;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}
public static unsafe bool Compare128(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus64 = lastAddr - 64;
const int mask = 0xFFFF;
while (b0 < lastAddrMinus64) // unroll the loop so that we are comparing 64 bytes at a time.
{
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0), Sse2.LoadVector128(b1))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 16), Sse2.LoadVector128(b1 + 16))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 32), Sse2.LoadVector128(b1 + 32))) != mask)
{
return false;
}
if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 48), Sse2.LoadVector128(b1 + 48))) != mask)
{
return false;
}
b0 += 64;
b1 += 64;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}
#endif
public static unsafe bool Compare64(byte* b0, byte* b1, int length)
{
byte* lastAddr = b0 + length;
byte* lastAddrMinus32 = lastAddr - 32;
while (b0 < lastAddrMinus32) // unroll the loop so that we are comparing 32 bytes at a time.
{
if (*(ulong*)b0 != *(ulong*)b1) return false;
if (*(ulong*)(b0 + 8) != *(ulong*)(b1 + 8)) return false;
if (*(ulong*)(b0 + 16) != *(ulong*)(b1 + 16)) return false;
if (*(ulong*)(b0 + 24) != *(ulong*)(b1 + 24)) return false;
b0 += 32;
b1 += 32;
}
while (b0 < lastAddr)
{
if (*b0 != *b1) return false;
b0++;
b1++;
}
return true;
}