我怎样才能做得快呢?

当然我可以这样做:

static bool ByteArrayCompare(byte[] a1, byte[] a2)
{
    if (a1.Length != a2.Length)
        return false;

    for (int i=0; i<a1.Length; i++)
        if (a1[i]!=a2[i])
            return false;

    return true;
}

但我正在寻找一个BCL函数或一些高度优化的已证明的方法来做到这一点。

java.util.Arrays.equals((sbyte[])(Array)a1, (sbyte[])(Array)a2);

工作得很好,但这似乎不适用于x64。

注意我的快速回答。


当前回答

这与其他方法类似,但这里的不同之处在于,不存在我可以一次检查的下一个最高字节数,例如,如果我有63个字节(在我的SIMD示例中),我可以检查前32个字节的相等性,然后是后32个字节,这比检查32个字节、16个字节、8个字节等等要快。您输入的第一个检查是比较所有字节所需要的唯一检查。

这确实在我的测试中名列前茅,但仅以微弱之差。

下面的代码正是我在airbreather/ArrayComparePerf.cs中测试它的方式。

public unsafe bool SIMDNoFallThrough()    #requires  System.Runtime.Intrinsics.X86
{
    if (a1 == null || a2 == null)
        return false;

    int length0 = a1.Length;

    if (length0 != a2.Length) return false;

    fixed (byte* b00 = a1, b01 = a2)
    {
        byte* b0 = b00, b1 = b01, last0 = b0 + length0, last1 = b1 + length0, last32 = last0 - 31;

        if (length0 > 31)
        {
            while (b0 < last32)
            {
                if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0), Avx.LoadVector256(b1))) != -1)
                    return false;
                b0 += 32;
                b1 += 32;
            }
            return Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(last0 - 32), Avx.LoadVector256(last1 - 32))) == -1;
        }

        if (length0 > 15)
        {
            if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0), Sse2.LoadVector128(b1))) != 65535)
                return false;
            return Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(last0 - 16), Sse2.LoadVector128(last1 - 16))) == 65535;
        }

        if (length0 > 7)
        {
            if (*(ulong*)b0 != *(ulong*)b1)
                return false;
            return *(ulong*)(last0 - 8) == *(ulong*)(last1 - 8);
        }

        if (length0 > 3)
        {
            if (*(uint*)b0 != *(uint*)b1)
                return false;
            return *(uint*)(last0 - 4) == *(uint*)(last1 - 4);
        }

        if (length0 > 1)
        {
            if (*(ushort*)b0 != *(ushort*)b1)
                return false;
            return *(ushort*)(last0 - 2) == *(ushort*)(last1 - 2);
        }

        return *b0 == *b1;
    }
}

如果没有首选的SIMD,与现有的longpointer算法相同的方法:

public unsafe bool LongPointersNoFallThrough()
{
    if (a1 == null || a2 == null || a1.Length != a2.Length)
        return false;
    fixed (byte* p1 = a1, p2 = a2)
    {
        byte* x1 = p1, x2 = p2;
        int l = a1.Length;
        if ((l & 8) != 0)
        {
            for (int i = 0; i < l / 8; i++, x1 += 8, x2 += 8)
                if (*(long*)x1 != *(long*)x2) return false;
            return *(long*)(x1 + (l - 8)) == *(long*)(x2 + (l - 8));
        }
        if ((l & 4) != 0)
        {
            if (*(int*)x1 != *(int*)x2) return false; x1 += 4; x2 += 4;
            return *(int*)(x1 + (l - 4)) == *(int*)(x2 + (l - 4));
        }
        if ((l & 2) != 0)
        {
            if (*(short*)x1 != *(short*)x2) return false; x1 += 2; x2 += 2;
            return *(short*)(x1 + (l - 2)) == *(short*)(x2 + (l - 2));
        }
        return *x1 == *x2;
    }
}

其他回答

我在这里没有看到很多linq解决方案。

我不确定性能的影响,但我通常坚持linq作为经验法则,然后在必要时进行优化。

public bool CompareTwoArrays(byte[] array1, byte[] array2)
 {
   return !array1.Where((t, i) => t != array2[i]).Any();
 }

请注意,这只适用于它们是相同大小的数组。 一个扩展可能是这样的

public bool CompareTwoArrays(byte[] array1, byte[] array2)
 {
   if (array1.Length != array2.Length) return false;
   return !array1.Where((t, i) => t != array2[i]).Any();
 }

受到ArekBulski发布的EqualBytesLongUnrolled方法的启发,我确定了一个附加优化的解决方案。在我的实例中,数组中的数组差异往往在数组的尾部附近。在测试中,我发现当这种情况发生在大型数组中时,能够以相反的顺序比较数组元素使这种解决方案比基于memcmp的解决方案获得了巨大的性能提升。下面是解决方案:

public enum CompareDirection { Forward, Backward }

private static unsafe bool UnsafeEquals(byte[] a, byte[] b, CompareDirection direction = CompareDirection.Forward)
{
    // returns when a and b are same array or both null
    if (a == b) return true;

    // if either is null or different lengths, can't be equal
    if (a == null || b == null || a.Length != b.Length)
        return false;

    const int UNROLLED = 16;                // count of longs 'unrolled' in optimization
    int size = sizeof(long) * UNROLLED;     // 128 bytes (min size for 'unrolled' optimization)
    int len = a.Length;
    int n = len / size;         // count of full 128 byte segments
    int r = len % size;         // count of remaining 'unoptimized' bytes

    // pin the arrays and access them via pointers
    fixed (byte* pb_a = a, pb_b = b)
    {
        if (r > 0 && direction == CompareDirection.Backward)
        {
            byte* pa = pb_a + len - 1;
            byte* pb = pb_b + len - 1;
            byte* phead = pb_a + len - r;
            while(pa >= phead)
            {
                if (*pa != *pb) return false;
                pa--;
                pb--;
            }
        }

        if (n > 0)
        {
            int nOffset = n * size;
            if (direction == CompareDirection.Forward)
            {
                long* pa = (long*)pb_a;
                long* pb = (long*)pb_b;
                long* ptail = (long*)(pb_a + nOffset);
                while (pa < ptail)
                {
                    if (*(pa + 0) != *(pb + 0) || *(pa + 1) != *(pb + 1) ||
                        *(pa + 2) != *(pb + 2) || *(pa + 3) != *(pb + 3) ||
                        *(pa + 4) != *(pb + 4) || *(pa + 5) != *(pb + 5) ||
                        *(pa + 6) != *(pb + 6) || *(pa + 7) != *(pb + 7) ||
                        *(pa + 8) != *(pb + 8) || *(pa + 9) != *(pb + 9) ||
                        *(pa + 10) != *(pb + 10) || *(pa + 11) != *(pb + 11) ||
                        *(pa + 12) != *(pb + 12) || *(pa + 13) != *(pb + 13) ||
                        *(pa + 14) != *(pb + 14) || *(pa + 15) != *(pb + 15)
                    )
                    {
                        return false;
                    }
                    pa += UNROLLED;
                    pb += UNROLLED;
                }
            }
            else
            {
                long* pa = (long*)(pb_a + nOffset);
                long* pb = (long*)(pb_b + nOffset);
                long* phead = (long*)pb_a;
                while (phead < pa)
                {
                    if (*(pa - 1) != *(pb - 1) || *(pa - 2) != *(pb - 2) ||
                        *(pa - 3) != *(pb - 3) || *(pa - 4) != *(pb - 4) ||
                        *(pa - 5) != *(pb - 5) || *(pa - 6) != *(pb - 6) ||
                        *(pa - 7) != *(pb - 7) || *(pa - 8) != *(pb - 8) ||
                        *(pa - 9) != *(pb - 9) || *(pa - 10) != *(pb - 10) ||
                        *(pa - 11) != *(pb - 11) || *(pa - 12) != *(pb - 12) ||
                        *(pa - 13) != *(pb - 13) || *(pa - 14) != *(pb - 14) ||
                        *(pa - 15) != *(pb - 15) || *(pa - 16) != *(pb - 16)
                    )
                    {
                        return false;
                    }
                    pa -= UNROLLED;
                    pb -= UNROLLED;
                }
            }
        }

        if (r > 0 && direction == CompareDirection.Forward)
        {
            byte* pa = pb_a + len - r;
            byte* pb = pb_b + len - r;
            byte* ptail = pb_a + len;
            while(pa < ptail)
            {
                if (*pa != *pb) return false;
                pa++;
                pb++;
            }
        }
    }

    return true;
}

使用SequenceEquals进行比较。

我开发了一个方法,稍微击败memcmp() (plinth的答案)和非常轻微击败EqualBytesLongUnrolled() (Arek Bulski的答案)在我的PC上。基本上,它以4而不是8展开循环。

2019年3月30日更新:

从。net核心3.0开始,我们有了SIMD支持!

这个解决方案在我的PC上是最快的:

#if NETCOREAPP3_0
using System.Runtime.Intrinsics.X86;
#endif
…

public static unsafe bool Compare(byte[] arr0, byte[] arr1)
{
    if (arr0 == arr1)
    {
        return true;
    }
    if (arr0 == null || arr1 == null)
    {
        return false;
    }
    if (arr0.Length != arr1.Length)
    {
        return false;
    }
    if (arr0.Length == 0)
    {
        return true;
    }
    fixed (byte* b0 = arr0, b1 = arr1)
    {
#if NETCOREAPP3_0
        if (Avx2.IsSupported)
        {
            return Compare256(b0, b1, arr0.Length);
        }
        else if (Sse2.IsSupported)
        {
            return Compare128(b0, b1, arr0.Length);
        }
        else
#endif
        {
            return Compare64(b0, b1, arr0.Length);
        }
    }
}
#if NETCOREAPP3_0
public static unsafe bool Compare256(byte* b0, byte* b1, int length)
{
    byte* lastAddr = b0 + length;
    byte* lastAddrMinus128 = lastAddr - 128;
    const int mask = -1;
    while (b0 < lastAddrMinus128) // unroll the loop so that we are comparing 128 bytes at a time.
    {
        if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0), Avx.LoadVector256(b1))) != mask)
        {
            return false;
        }
        if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 32), Avx.LoadVector256(b1 + 32))) != mask)
        {
            return false;
        }
        if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 64), Avx.LoadVector256(b1 + 64))) != mask)
        {
            return false;
        }
        if (Avx2.MoveMask(Avx2.CompareEqual(Avx.LoadVector256(b0 + 96), Avx.LoadVector256(b1 + 96))) != mask)
        {
            return false;
        }
        b0 += 128;
        b1 += 128;
    }
    while (b0 < lastAddr)
    {
        if (*b0 != *b1) return false;
        b0++;
        b1++;
    }
    return true;
}
public static unsafe bool Compare128(byte* b0, byte* b1, int length)
{
    byte* lastAddr = b0 + length;
    byte* lastAddrMinus64 = lastAddr - 64;
    const int mask = 0xFFFF;
    while (b0 < lastAddrMinus64) // unroll the loop so that we are comparing 64 bytes at a time.
    {
        if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0), Sse2.LoadVector128(b1))) != mask)
        {
            return false;
        }
        if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 16), Sse2.LoadVector128(b1 + 16))) != mask)
        {
            return false;
        }
        if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 32), Sse2.LoadVector128(b1 + 32))) != mask)
        {
            return false;
        }
        if (Sse2.MoveMask(Sse2.CompareEqual(Sse2.LoadVector128(b0 + 48), Sse2.LoadVector128(b1 + 48))) != mask)
        {
            return false;
        }
        b0 += 64;
        b1 += 64;
    }
    while (b0 < lastAddr)
    {
        if (*b0 != *b1) return false;
        b0++;
        b1++;
    }
    return true;
}
#endif
public static unsafe bool Compare64(byte* b0, byte* b1, int length)
{
    byte* lastAddr = b0 + length;
    byte* lastAddrMinus32 = lastAddr - 32;
    while (b0 < lastAddrMinus32) // unroll the loop so that we are comparing 32 bytes at a time.
    {
        if (*(ulong*)b0 != *(ulong*)b1) return false;
        if (*(ulong*)(b0 + 8) != *(ulong*)(b1 + 8)) return false;
        if (*(ulong*)(b0 + 16) != *(ulong*)(b1 + 16)) return false;
        if (*(ulong*)(b0 + 24) != *(ulong*)(b1 + 24)) return false;
        b0 += 32;
        b1 += 32;
    }
    while (b0 < lastAddr)
    {
        if (*b0 != *b1) return false;
        b0++;
        b1++;
    }
    return true;
}

我使用附带的。net 4.7发布版本做了一些测量,没有附带调试器。我认为人们一直在使用错误的度量,因为如果你关心这里的速度,你所关心的是计算两个字节数组是否相等需要多长时间。即以字节为单位的吞吐量。

StructuralComparison :              4.6 MiB/s
for                  :            274.5 MiB/s
ToUInt32             :            263.6 MiB/s
ToUInt64             :            474.9 MiB/s
memcmp               :           8500.8 MiB/s

正如你所看到的,没有比memcmp更好的方法了,而且它快了几个数量级。简单的for循环是次优选择。我仍然不明白为什么微软不能简单地包含一个缓冲区。比较方法。

[Program.cs]:

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading.Tasks;

namespace memcmp
{
    class Program
    {
        static byte[] TestVector(int size)
        {
            var data = new byte[size];
            using (var rng = new System.Security.Cryptography.RNGCryptoServiceProvider())
            {
                rng.GetBytes(data);
            }
            return data;
        }

        static TimeSpan Measure(string testCase, TimeSpan offset, Action action, bool ignore = false)
        {
            var t = Stopwatch.StartNew();
            var n = 0L;
            while (t.Elapsed < TimeSpan.FromSeconds(10))
            {
                action();
                n++;
            }
            var elapsed = t.Elapsed - offset;
            if (!ignore)
            {
                Console.WriteLine($"{testCase,-16} : {n / elapsed.TotalSeconds,16:0.0} MiB/s");
            }
            return elapsed;
        }

        [DllImport("msvcrt.dll", CallingConvention = CallingConvention.Cdecl)]
        static extern int memcmp(byte[] b1, byte[] b2, long count);

        static void Main(string[] args)
        {
            // how quickly can we establish if two sequences of bytes are equal?

            // note that we are testing the speed of different comparsion methods

            var a = TestVector(1024 * 1024); // 1 MiB
            var b = (byte[])a.Clone();

            // was meant to offset the overhead of everything but copying but my attempt was a horrible mistake... should have reacted sooner due to the initially ridiculous throughput values...
            // Measure("offset", new TimeSpan(), () => { return; }, ignore: true);
            var offset = TimeZone.Zero

            Measure("StructuralComparison", offset, () =>
            {
                StructuralComparisons.StructuralEqualityComparer.Equals(a, b);
            });

            Measure("for", offset, () =>
            {
                for (int i = 0; i < a.Length; i++)
                {
                    if (a[i] != b[i]) break;
                }
            });

            Measure("ToUInt32", offset, () =>
            {
                for (int i = 0; i < a.Length; i += 4)
                {
                    if (BitConverter.ToUInt32(a, i) != BitConverter.ToUInt32(b, i)) break;
                }
            });

            Measure("ToUInt64", offset, () =>
            {
                for (int i = 0; i < a.Length; i += 8)
                {
                    if (BitConverter.ToUInt64(a, i) != BitConverter.ToUInt64(b, i)) break;
                }
            });

            Measure("memcmp", offset, () =>
            {
                memcmp(a, b, a.Length);
            });
        }
    }
}