实现以下目标最有效的算法是什么:

0010 0000 => 0000 0100

从MSB->LSB转换为LSB->MSB。所有位都必须反转;也就是说,这不是字节交换。


注意:下面所有的算法都是用C语言编写的,但是应该可以移植到你所选择的语言中(当它们没有那么快的时候不要看着我:)

选项

低内存(32位int, 32位机器)(从这里):

unsigned int
reverse(register unsigned int x)
{
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
    x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
    x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
    return((x >> 16) | (x << 16));

}

来自著名的Bit Twiddling Hacks页面:

最快(查找表):

static const unsigned char BitReverseTable256[] = 
{
  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 
  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 
  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 
  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 
  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 
  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 
  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 
  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 
  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 
  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

unsigned int v; // reverse 32-bit value, 8 bits at time
unsigned int c; // c will get v reversed

// Option 1:
c = (BitReverseTable256[v & 0xff] << 24) | 
    (BitReverseTable256[(v >> 8) & 0xff] << 16) | 
    (BitReverseTable256[(v >> 16) & 0xff] << 8) |
    (BitReverseTable256[(v >> 24) & 0xff]);

// Option 2:
unsigned char * p = (unsigned char *) &v;
unsigned char * q = (unsigned char *) &c;
q[3] = BitReverseTable256[p[0]]; 
q[2] = BitReverseTable256[p[1]]; 
q[1] = BitReverseTable256[p[2]]; 
q[0] = BitReverseTable256[p[3]];

您可以将此想法扩展到64位整数,或者为了速度而牺牲内存(假设L1数据缓存足够大),并使用一个64k条目查找表一次反向16位。


其他人

简单的

unsigned int v;     // input bits to be reversed
unsigned int r = v & 1; // r will be reversed bits of v; first get LSB of v
int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end

for (v >>= 1; v; v >>= 1)
{   
  r <<= 1;
  r |= v & 1;
  s--;
}
r <<= s; // shift when v's highest bits are zero

更快(32位处理器)

unsigned char b = x;
b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16; 

更快(64位处理器)

unsigned char b; // reverse this (8-bit) byte
b = (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;

如果您想在32位整型上执行此操作,只需反转每个字节中的位,并反转字节的顺序。那就是:

unsigned int toReverse;
unsigned int reversed;
unsigned char inByte0 = (toReverse & 0xFF);
unsigned char inByte1 = (toReverse & 0xFF00) >> 8;
unsigned char inByte2 = (toReverse & 0xFF0000) >> 16;
unsigned char inByte3 = (toReverse & 0xFF000000) >> 24;
reversed = (reverseBits(inByte0) << 24) | (reverseBits(inByte1) << 16) | (reverseBits(inByte2) << 8) | (reverseBits(inByte3);

结果

我对两种最有希望的解决方案进行了基准测试,查找表和按位and(第一个)。测试机器是一台带有4GB DDR2-800和酷睿2 Duo T7500 @ 2.4GHz, 4MB L2缓存的笔记本电脑;YMMV。我在64位Linux上使用gcc 4.3.2。OpenMP(和GCC绑定)用于高分辨率计时器。

reverse.c

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

unsigned int
reverse(register unsigned int x)
{
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
    x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
    x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
    return((x >> 16) | (x << 16));

}

int main()
{
    unsigned int *ints = malloc(100000000*sizeof(unsigned int));
    unsigned int *ints2 = malloc(100000000*sizeof(unsigned int));
    for(unsigned int i = 0; i < 100000000; i++)
      ints[i] = rand();

    unsigned int *inptr = ints;
    unsigned int *outptr = ints2;
    unsigned int *endptr = ints + 100000000;
    // Starting the time measurement
    double start = omp_get_wtime();
    // Computations to be measured
    while(inptr != endptr)
    {
      (*outptr) = reverse(*inptr);
      inptr++;
      outptr++;
    }
    // Measuring the elapsed time
    double end = omp_get_wtime();
    // Time calculation (in seconds)
    printf("Time: %f seconds\n", end-start);

    free(ints);
    free(ints2);

    return 0;
}

reverse_lookup.c

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

static const unsigned char BitReverseTable256[] = 
{
  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 
  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 
  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 
  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 
  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 
  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 
  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 
  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 
  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 
  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

int main()
{
    unsigned int *ints = malloc(100000000*sizeof(unsigned int));
    unsigned int *ints2 = malloc(100000000*sizeof(unsigned int));
    for(unsigned int i = 0; i < 100000000; i++)
      ints[i] = rand();

    unsigned int *inptr = ints;
    unsigned int *outptr = ints2;
    unsigned int *endptr = ints + 100000000;
    // Starting the time measurement
    double start = omp_get_wtime();
    // Computations to be measured
    while(inptr != endptr)
    {
    unsigned int in = *inptr;  

    // Option 1:
    //*outptr = (BitReverseTable256[in & 0xff] << 24) | 
    //    (BitReverseTable256[(in >> 8) & 0xff] << 16) | 
    //    (BitReverseTable256[(in >> 16) & 0xff] << 8) |
    //    (BitReverseTable256[(in >> 24) & 0xff]);

    // Option 2:
    unsigned char * p = (unsigned char *) &(*inptr);
    unsigned char * q = (unsigned char *) &(*outptr);
    q[3] = BitReverseTable256[p[0]]; 
    q[2] = BitReverseTable256[p[1]]; 
    q[1] = BitReverseTable256[p[2]]; 
    q[0] = BitReverseTable256[p[3]];

      inptr++;
      outptr++;
    }
    // Measuring the elapsed time
    double end = omp_get_wtime();
    // Time calculation (in seconds)
    printf("Time: %f seconds\n", end-start);

    free(ints);
    free(ints2);

    return 0;
}

我在几个不同的优化中尝试了这两种方法,在每个级别上进行了3次试验,每次试验逆转了1亿个随机无符号整数。对于查找表选项,我尝试了按位hacks页面上给出的两种方案(选项1和2)。结果如下所示。

位和

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse reverse.c
mrj10@mjlap:~/code$ ./reverse
Time: 2.000593 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 1.938893 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 1.936365 seconds
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse reverse.c
mrj10@mjlap:~/code$ ./reverse
Time: 0.942709 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 0.991104 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 0.947203 seconds
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse reverse.c
mrj10@mjlap:~/code$ ./reverse
Time: 0.922639 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 0.892372 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 0.891688 seconds

查阅表(选项1)

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.201127 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.196129 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.235972 seconds              
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.633042 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.655880 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.633390 seconds              
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.652322 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.631739 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.652431 seconds  

查找表(选项2)

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.671537 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.688173 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.664662 seconds
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.049851 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.048403 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.085086 seconds
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.082223 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.053431 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.081224 seconds

结论

如果您关心性能,请使用选项1(字节寻址速度慢,这是意料中事)的查找表。如果您需要从系统中挤出最后一个字节的内存(如果您关心位反转的性能,您可能会这样做),那么按位- and方法的优化版本也不会太糟糕。

警告

是的,我知道基准代码完全是一种hack。关于如何改进它的建议非常受欢迎。我知道的事情:

I don't have access to ICC. This may be faster (please respond in a comment if you can test this out). A 64K lookup table may do well on some modern microarchitectures with large L1D. -mtune=native didn't work for -O2/-O3 (ld blew up with some crazy symbol redefinition error), so I don't believe the generated code is tuned for my microarchitecture. There may be a way to do this slightly faster with SSE. I have no idea how, but with fast replication, packed bitwise AND, and swizzling instructions, there's got to be something there. I know only enough x86 assembly to be dangerous; here's the code GCC generated on -O3 for option 1, so somebody more knowledgable than myself can check it out:

32位

.L3:
movl    (%r12,%rsi), %ecx
movzbl  %cl, %eax
movzbl  BitReverseTable256(%rax), %edx
movl    %ecx, %eax
shrl    $24, %eax
mov     %eax, %eax
movzbl  BitReverseTable256(%rax), %eax
sall    $24, %edx
orl     %eax, %edx
movzbl  %ch, %eax
shrl    $16, %ecx
movzbl  BitReverseTable256(%rax), %eax
movzbl  %cl, %ecx
sall    $16, %eax
orl     %eax, %edx
movzbl  BitReverseTable256(%rcx), %eax
sall    $8, %eax
orl     %eax, %edx
movl    %edx, (%r13,%rsi)
addq    $4, %rsi
cmpq    $400000000, %rsi
jne     .L3

编辑:我还尝试在我的机器上使用uint64_t类型,看看是否有任何性能提升。性能比32位快10%左右,无论您是一次使用64位类型对两个32位整型反转位,还是实际上将64位值的一半反转位,性能都几乎相同。汇编代码如下所示(对于前一种情况,一次为两个32位整型反转位):

.L3:
movq    (%r12,%rsi), %rdx
movq    %rdx, %rax
shrq    $24, %rax
andl    $255, %eax
movzbl  BitReverseTable256(%rax), %ecx
movzbq  %dl,%rax
movzbl  BitReverseTable256(%rax), %eax
salq    $24, %rax
orq     %rax, %rcx
movq    %rdx, %rax
shrq    $56, %rax
movzbl  BitReverseTable256(%rax), %eax
salq    $32, %rax
orq     %rax, %rcx
movzbl  %dh, %eax
shrq    $16, %rdx
movzbl  BitReverseTable256(%rax), %eax
salq    $16, %rax
orq     %rax, %rcx
movzbq  %dl,%rax
shrq    $16, %rdx
movzbl  BitReverseTable256(%rax), %eax
salq    $8, %rax
orq     %rax, %rcx
movzbq  %dl,%rax
shrq    $8, %rdx
movzbl  BitReverseTable256(%rax), %eax
salq    $56, %rax
orq     %rax, %rcx
movzbq  %dl,%rax
shrq    $8, %rdx
movzbl  BitReverseTable256(%rax), %eax
andl    $255, %edx
salq    $48, %rax
orq     %rax, %rcx
movzbl  BitReverseTable256(%rdx), %eax
salq    $40, %rax
orq     %rax, %rcx
movq    %rcx, (%r13,%rsi)
addq    $8, %rsi
cmpq    $400000000, %rsi
jne     .L3

假设你有一个比特数组,怎么样: 1. 从MSB开始,将比特一个一个地推入堆栈。 2. 从这个堆栈弹出位到另一个数组(如果你想节省空间,也可以是同一个数组),将第一个弹出位放入MSB,然后从那里继续到较低的有效位。

Stack stack = new Stack();
Bit[] bits = new Bit[] { 0, 0, 1, 0, 0, 0, 0, 0 };

for (int i = 0; i < bits.Length; i++) 
{
    stack.push(bits[i]);
}

for (int i = 0; i < bits.Length; i++)
{
    bits[i] = stack.pop();
}

通用的

C代码。以1字节输入数据num为例。

    unsigned char num = 0xaa;   // 1010 1010 (aa) -> 0101 0101 (55)
    int s = sizeof(num) * 8;    // get number of bits
    int i, x, y, p;
    int var = 0;                // make var data type to be equal or larger than num

    for (i = 0; i < (s / 2); i++) {
        // extract bit on the left, from MSB
        p = s - i - 1;
        x = num & (1 << p);
        x = x >> p;
        printf("x: %d\n", x);

        // extract bit on the right, from LSB
        y = num & (1 << i);
        y = y >> i;
        printf("y: %d\n", y);

        var = var | (x << i);       // apply x
        var = var | (y << p);       // apply y
    }

    printf("new: 0x%x\n", new);

当然,玩弄比特的黑客的明显来源是: http://graphics.stanford.edu/~seander/bithacks.html#BitReverseObvious


实现低内存和最快。

private Byte  BitReverse(Byte bData)
    {
        Byte[] lookup = { 0, 8,  4, 12, 
                          2, 10, 6, 14 , 
                          1, 9,  5, 13,
                          3, 11, 7, 15 };
        Byte ret_val = (Byte)(((lookup[(bData & 0x0F)]) << 4) + lookup[((bData & 0xF0) >> 4)]);
        return ret_val;
    }

好吧,这基本上与第一个“reverse()”相同,但它是64位的,只需要从指令流中加载一个即时掩码。GCC创建的代码没有跳转,所以这应该是相当快的。

#include <stdio.h>

static unsigned long long swap64(unsigned long long val)
{
#define ZZZZ(x,s,m) (((x) >>(s)) & (m)) | (((x) & (m))<<(s));
/* val = (((val) >>16) & 0xFFFF0000FFFF) | (((val) & 0xFFFF0000FFFF)<<16); */

val = ZZZZ(val,32,  0x00000000FFFFFFFFull );
val = ZZZZ(val,16,  0x0000FFFF0000FFFFull );
val = ZZZZ(val,8,   0x00FF00FF00FF00FFull );
val = ZZZZ(val,4,   0x0F0F0F0F0F0F0F0Full );
val = ZZZZ(val,2,   0x3333333333333333ull );
val = ZZZZ(val,1,   0x5555555555555555ull );

return val;
#undef ZZZZ
}

int main(void)
{
unsigned long long val, aaaa[16] =
 { 0xfedcba9876543210,0xedcba9876543210f,0xdcba9876543210fe,0xcba9876543210fed
 , 0xba9876543210fedc,0xa9876543210fedcb,0x9876543210fedcba,0x876543210fedcba9
 , 0x76543210fedcba98,0x6543210fedcba987,0x543210fedcba9876,0x43210fedcba98765
 , 0x3210fedcba987654,0x210fedcba9876543,0x10fedcba98765432,0x0fedcba987654321
 };
unsigned iii;

for (iii=0; iii < 16; iii++) {
    val = swap64 (aaaa[iii]);
    printf("A[]=%016llX Sw=%016llx\n", aaaa[iii], val);
    }
return 0;
}

对于喜欢递归的人来说,这是另一个解决方案。

这个想法很简单。 将输入除以一半并交换两部分,继续直到达到单个位。

Illustrated in the example below.

Ex : If Input is 00101010   ==> Expected output is 01010100

1. Divide the input into 2 halves 
    0010 --- 1010

2. Swap the 2 Halves
    1010     0010

3. Repeat the same for each half.
    10 -- 10 ---  00 -- 10
    10    10      10    00

    1-0 -- 1-0 --- 1-0 -- 0-0
    0 1    0 1     0 1    0 0

Done! Output is 01010100

这里有一个递归函数来求解。(注意,我使用了unsigned int,所以它可以用于sizeof(unsigned int)*8位的输入。

递归函数有两个参数-需要位的值 要反转的值和值中的比特数。

int reverse_bits_recursive(unsigned int num, unsigned int numBits)
{
    unsigned int reversedNum;;
    unsigned int mask = 0;

    mask = (0x1 << (numBits/2)) - 1;

    if (numBits == 1) return num;
    reversedNum = reverse_bits_recursive(num >> numBits/2, numBits/2) |
                   reverse_bits_recursive((num & mask), numBits/2) << numBits/2;
    return reversedNum;
}

int main()
{
    unsigned int reversedNum;
    unsigned int num;

    num = 0x55;
    reversedNum = reverse_bits_recursive(num, 8);
    printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);

    num = 0xabcd;
    reversedNum = reverse_bits_recursive(num, 16);
    printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);

    num = 0x123456;
    reversedNum = reverse_bits_recursive(num, 24);
    printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);

    num = 0x11223344;
    reversedNum = reverse_bits_recursive(num,32);
    printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);
}

输出如下:

Bit Reversal Input = 0x55 Output = 0xaa
Bit Reversal Input = 0xabcd Output = 0xb3d5
Bit Reversal Input = 0x123456 Output = 0x651690
Bit Reversal Input = 0x11223344 Output = 0x22cc4488

您可能希望使用标准模板库。它可能比上面提到的代码慢。然而,在我看来,这似乎更清楚,更容易理解。

 #include<bitset>
 #include<iostream>


 template<size_t N>
 const std::bitset<N> reverse(const std::bitset<N>& ordered)
 {
      std::bitset<N> reversed;
      for(size_t i = 0, j = N - 1; i < N; ++i, --j)
           reversed[j] = ordered[i];
      return reversed;
 };


 // test the function
 int main()
 {
      unsigned long num; 
      const size_t N = sizeof(num)*8;

      std::cin >> num;
      std::cout << std::showbase << std::hex;
      std::cout << "ordered  = " << num << std::endl;
      std::cout << "reversed = " << reverse<N>(num).to_ulong()  << std::endl;
      std::cout << "double_reversed = " << reverse<N>(reverse<N>(num)).to_ulong() << std::endl;  
 }

下面这个怎么样:

    uint reverseMSBToLSB32ui(uint input)
    {
        uint output = 0x00000000;
        uint toANDVar = 0;
        int places = 0;

        for (int i = 1; i < 32; i++)
        {
            places = (32 - i);
            toANDVar = (uint)(1 << places);
            output |= (uint)(input & (toANDVar)) >> places;

        }


        return output;
    }

小而简单(不过只有32位)。


伪代码中的位反转

源-要反转的>字节b00101100 Destination ->反转,也需要为unsigned类型,这样符号位就不会向下传播

复制到临时,因此原始不受影响,还需要为unsigned类型,以便符号位不会自动移位

bytecopy = b0010110

循环8://执行8次 测试字节拷贝是否< 0(负)

    set bit8 (msb) of reversed = reversed | b10000000 

else do not set bit8

shift bytecopy left 1 place
bytecopy = bytecopy << 1 = b0101100 result

shift result right 1 place
reversed = reversed >> 1 = b00000000
8 times no then up^ LOOP8
8 times yes then done.

好吧,这肯定不会是一个像Matt J的答案,但希望它仍然有用。

size_t reverse(size_t n, unsigned int bytes)
{
    __asm__("BSWAP %0" : "=r"(n) : "0"(n));
    n >>= ((sizeof(size_t) - bytes) * 8);
    n = ((n & 0xaaaaaaaaaaaaaaaa) >> 1) | ((n & 0x5555555555555555) << 1);
    n = ((n & 0xcccccccccccccccc) >> 2) | ((n & 0x3333333333333333) << 2);
    n = ((n & 0xf0f0f0f0f0f0f0f0) >> 4) | ((n & 0x0f0f0f0f0f0f0f0f) << 4);
    return n;
}

这与Matt的最佳算法完全相同,除了有一个叫做BSWAP的小指令,它交换64位数字的字节(而不是位)。所以b7 b6 b5 b4 b3 b2 b1 b0变成了b0 b1 b2 b3 b4 b5 b6 b7。由于我们处理的是32位数字,所以需要将字节交换后的数字向下移动32位。这只留给我们交换每个字节的8位的任务,这是完成的,瞧!我们做完了。

计时:在我的机器上,Matt的算法每次试验只需0.52秒。我的每次试验大约耗时0.42秒。我认为快20%还不错。

如果你担心指令BSWAP的可用性,维基百科列出了指令BSWAP是与1989年推出的80846一起添加的。值得注意的是,维基百科还指出,这条指令只适用于32位寄存器,这显然不是我的机器上的情况,它只适用于64位寄存器。

此方法同样适用于任何整型数据类型,因此可以通过传递所需的字节数来简单地推广该方法:

    size_t reverse(size_t n, unsigned int bytes)
    {
        __asm__("BSWAP %0" : "=r"(n) : "0"(n));
        n >>= ((sizeof(size_t) - bytes) * 8);
        n = ((n & 0xaaaaaaaaaaaaaaaa) >> 1) | ((n & 0x5555555555555555) << 1);
        n = ((n & 0xcccccccccccccccc) >> 2) | ((n & 0x3333333333333333) << 2);
        n = ((n & 0xf0f0f0f0f0f0f0f0) >> 4) | ((n & 0x0f0f0f0f0f0f0f0f) << 4);
        return n;
    }

它可以被称为:

    n = reverse(n, sizeof(char));//only reverse 8 bits
    n = reverse(n, sizeof(short));//reverse 16 bits
    n = reverse(n, sizeof(int));//reverse 32 bits
    n = reverse(n, sizeof(size_t));//reverse 64 bits

编译器应该能够优化掉额外的形参(假设编译器内联了函数),对于sizeof(size_t)情况,右移将被完全删除。注意,如果传递sizeof(char), GCC至少不能删除BSWAP和右移。


unsigned char ReverseBits(unsigned char data)
{
    unsigned char k = 0, rev = 0;

    unsigned char n = data;

    while(n)

    {
        k = n & (~(n - 1));
        n &= (n - 1);
        rev |= (128 / k);
    }
    return rev;
}

这个线程引起了我的注意,因为它处理了一个简单的问题,即使对于现代CPU也需要大量的工作(CPU周期)。有一天我也站在那里,有同样的¤#%“#”问题。我得翻几百万字节。然而,我知道我所有的目标系统都是基于现代英特尔的,所以让我们开始优化到极致!!

所以我使用了Matt J的查找代码作为基础。我正在基准测试的系统是i7 haswell 4700eq。

Matt J的查找位翻转400亿字节:大约0.272秒。

然后我继续尝试,看看英特尔的ISPC编译器是否可以向量化反向的算术。c。

我不打算在这里用我的发现来烦你,因为我尝试了很多来帮助编译器找到东西,无论如何,我最终得到了大约0.15秒的性能来bitflip 400亿字节。这是一个伟大的减少,但对于我的应用程序,这仍然是方式方式太慢。

所以人们让我展示世界上最快的基于英特尔的bitflipper。定时:

时间到bitflip 400000000字节:0.050082秒!!!!!

// Bitflip using AVX2 - The fastest Intel based bitflip in the world!!
// Made by Anders Cedronius 2014 (anders.cedronius (you know what) gmail.com)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

using namespace std;

#define DISPLAY_HEIGHT  4
#define DISPLAY_WIDTH   32
#define NUM_DATA_BYTES  400000000

// Constants (first we got the mask, then the high order nibble look up table and last we got the low order nibble lookup table)
__attribute__ ((aligned(32))) static unsigned char k1[32*3]={
        0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
        0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,
        0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0
};

// The data to be bitflipped (+32 to avoid the quantization out of memory problem)
__attribute__ ((aligned(32))) static unsigned char data[NUM_DATA_BYTES+32]={};

extern "C" {
void bitflipbyte(unsigned char[],unsigned int,unsigned char[]);
}

int main()
{

    for(unsigned int i = 0; i < NUM_DATA_BYTES; i++)
    {
        data[i] = rand();
    }

    printf ("\r\nData in(start):\r\n");
    for (unsigned int j = 0; j < 4; j++)
    {
        for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
        {
            printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);
        }
        printf ("\r\n");
    }

    printf ("\r\nNumber of 32-byte chunks to convert: %d\r\n",(unsigned int)ceil(NUM_DATA_BYTES/32.0));

    double start_time = omp_get_wtime();
    bitflipbyte(data,(unsigned int)ceil(NUM_DATA_BYTES/32.0),k1);
    double end_time = omp_get_wtime();

    printf ("\r\nData out:\r\n");
    for (unsigned int j = 0; j < 4; j++)
    {
        for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
        {
            printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);
        }
        printf ("\r\n");
    }
    printf("\r\n\r\nTime to bitflip %d bytes: %f seconds\r\n\r\n",NUM_DATA_BYTES, end_time-start_time);

    // return with no errors
    return 0;
}

printf是用来调试的。

这是主要的工作:

bits 64
global bitflipbyte

bitflipbyte:    
        vmovdqa     ymm2, [rdx]
        add         rdx, 20h
        vmovdqa     ymm3, [rdx]
        add         rdx, 20h
        vmovdqa     ymm4, [rdx]
bitflipp_loop:
        vmovdqa     ymm0, [rdi] 
        vpand       ymm1, ymm2, ymm0 
        vpandn      ymm0, ymm2, ymm0 
        vpsrld      ymm0, ymm0, 4h 
        vpshufb     ymm1, ymm4, ymm1 
        vpshufb     ymm0, ymm3, ymm0         
        vpor        ymm0, ymm0, ymm1
        vmovdqa     [rdi], ymm0
        add     rdi, 20h
        dec     rsi
        jnz     bitflipp_loop
        ret

代码占用32个字节,然后屏蔽掉蚕食。高啃角右移了4。然后使用vpshufb和ymm4 / ymm3作为查找表。我可以使用一个单独的查找表,但我将不得不在ORing再次一起啃啃之前向左移动。

还有更快的翻转比特的方法。但我被绑定到单线程和CPU,所以这是我能实现的最快速度。你能做一个快一点的版本吗?

关于使用Intel C/ c++编译器内在等效命令,请不要发表任何评论…


我认为下面是我所知道的最简单的方法。MSB是输入,LSB是“反向”输出:

unsigned char rev(char MSB) {
    unsigned char LSB=0;  // for output
    _FOR(i,0,8) {
        LSB= LSB << 1;
        if(MSB&1) LSB = LSB | 1;
        MSB= MSB >> 1;
    }
    return LSB;
}

//    It works by rotating bytes in opposite directions. 
//    Just repeat for each byte.

// Purpose: to reverse bits in an unsigned short integer 
// Input: an unsigned short integer whose bits are to be reversed
// Output: an unsigned short integer with the reversed bits of the input one
unsigned short ReverseBits( unsigned short a )
{
     // declare and initialize number of bits in the unsigned short integer
     const char num_bits = sizeof(a) * CHAR_BIT;

     // declare and initialize bitset representation of integer a
     bitset<num_bits> bitset_a(a);          

     // declare and initialize bitset representation of integer b (0000000000000000)
     bitset<num_bits> bitset_b(0);                  

     // declare and initialize bitset representation of mask (0000000000000001)
     bitset<num_bits> mask(1);          

     for ( char i = 0; i < num_bits; ++i )
     {
          bitset_b = (bitset_b << 1) | bitset_a & mask;
          bitset_a >>= 1;
     }

     return (unsigned short) bitset_b.to_ulong();
}

void PrintBits( unsigned short a )
{
     // declare and initialize bitset representation of a
     bitset<sizeof(a) * CHAR_BIT> bitset(a);

     // print out bits
     cout << bitset << endl;
}


// Testing the functionality of the code

int main ()
{
     unsigned short a = 17, b;

     cout << "Original: "; 
     PrintBits(a);

     b = ReverseBits( a );

     cout << "Reversed: ";
     PrintBits(b);
}

// Output:
Original: 0000000000010001
Reversed: 1000100000000000

我很好奇原始旋转有多快。 在我的机器(i7@2600)上,1,500,150,000次迭代的平均值为27.28 ns(在131,071个64位整数的随机集上)。

优点:占用内存少,代码简单。我想说它也没有那么大。对于任何输入(128个算术SHIFT运算+ 64个逻辑and运算+ 64个逻辑OR运算),所需的时间都是可预测的常量。

我比较了@Matt J获得的最佳时间,他有公认的答案。如果我没有看错他的答案,他得到的最好结果是0.631739秒,100万次迭代,这导致平均每次旋转631 ns。

我使用的代码片段如下:

unsigned long long reverse_long(unsigned long long x)
{
    return (((x >> 0) & 1) << 63) |
           (((x >> 1) & 1) << 62) |
           (((x >> 2) & 1) << 61) |
           (((x >> 3) & 1) << 60) |
           (((x >> 4) & 1) << 59) |
           (((x >> 5) & 1) << 58) |
           (((x >> 6) & 1) << 57) |
           (((x >> 7) & 1) << 56) |
           (((x >> 8) & 1) << 55) |
           (((x >> 9) & 1) << 54) |
           (((x >> 10) & 1) << 53) |
           (((x >> 11) & 1) << 52) |
           (((x >> 12) & 1) << 51) |
           (((x >> 13) & 1) << 50) |
           (((x >> 14) & 1) << 49) |
           (((x >> 15) & 1) << 48) |
           (((x >> 16) & 1) << 47) |
           (((x >> 17) & 1) << 46) |
           (((x >> 18) & 1) << 45) |
           (((x >> 19) & 1) << 44) |
           (((x >> 20) & 1) << 43) |
           (((x >> 21) & 1) << 42) |
           (((x >> 22) & 1) << 41) |
           (((x >> 23) & 1) << 40) |
           (((x >> 24) & 1) << 39) |
           (((x >> 25) & 1) << 38) |
           (((x >> 26) & 1) << 37) |
           (((x >> 27) & 1) << 36) |
           (((x >> 28) & 1) << 35) |
           (((x >> 29) & 1) << 34) |
           (((x >> 30) & 1) << 33) |
           (((x >> 31) & 1) << 32) |
           (((x >> 32) & 1) << 31) |
           (((x >> 33) & 1) << 30) |
           (((x >> 34) & 1) << 29) |
           (((x >> 35) & 1) << 28) |
           (((x >> 36) & 1) << 27) |
           (((x >> 37) & 1) << 26) |
           (((x >> 38) & 1) << 25) |
           (((x >> 39) & 1) << 24) |
           (((x >> 40) & 1) << 23) |
           (((x >> 41) & 1) << 22) |
           (((x >> 42) & 1) << 21) |
           (((x >> 43) & 1) << 20) |
           (((x >> 44) & 1) << 19) |
           (((x >> 45) & 1) << 18) |
           (((x >> 46) & 1) << 17) |
           (((x >> 47) & 1) << 16) |
           (((x >> 48) & 1) << 15) |
           (((x >> 49) & 1) << 14) |
           (((x >> 50) & 1) << 13) |
           (((x >> 51) & 1) << 12) |
           (((x >> 52) & 1) << 11) |
           (((x >> 53) & 1) << 10) |
           (((x >> 54) & 1) << 9) |
           (((x >> 55) & 1) << 8) |
           (((x >> 56) & 1) << 7) |
           (((x >> 57) & 1) << 6) |
           (((x >> 58) & 1) << 5) |
           (((x >> 59) & 1) << 4) |
           (((x >> 60) & 1) << 3) |
           (((x >> 61) & 1) << 2) |
           (((x >> 62) & 1) << 1) |
           (((x >> 63) & 1) << 0);
}

这是32位,如果我们考虑8位,我们需要改变大小。

    void bitReverse(int num)
    {
        int num_reverse = 0;
        int size = (sizeof(int)*8) -1;
        int i=0,j=0;
        for(i=0,j=size;i<=size,j>=0;i++,j--)
        {
            if((num >> i)&1)
            {
                num_reverse = (num_reverse | (1<<j));
            }
        }
        printf("\n rev num = %d\n",num_reverse);
    }

按LSB->MSB顺序读取输入整数“num”,并按MSB->LSB顺序存储在num_reverse中。


Anders Cedronius的答案为那些拥有支持AVX2的x86 CPU的人提供了一个很好的解决方案。对于没有AVX支持的x86平台或非x86平台,以下任何一种实现都应该工作良好。

第一个代码是经典二进制分区方法的一个变体,编码的目的是最大限度地利用shift-plus-logic习惯用法,这种习惯用法在各种ARM处理器上都很有用。此外,它使用动态掩码生成,这对于需要多个指令来加载每个32位掩码值的RISC处理器是有益的。x86平台的编译器应该在编译时而不是运行时使用常量传播来计算所有掩码。

/* Classic binary partitioning algorithm */
inline uint32_t brev_classic (uint32_t a)
{
    uint32_t m;
    a = (a >> 16) | (a << 16);                            // swap halfwords
    m = 0x00ff00ff; a = ((a >> 8) & m) | ((a << 8) & ~m); // swap bytes
    m = m^(m << 4); a = ((a >> 4) & m) | ((a << 4) & ~m); // swap nibbles
    m = m^(m << 2); a = ((a >> 2) & m) | ((a << 2) & ~m);
    m = m^(m << 1); a = ((a >> 1) & m) | ((a << 1) & ~m);
    return a;
}

在“计算机编程艺术”的第4A卷中,D. Knuth展示了反转位的聪明方法,这比经典的二进制分区算法所需的操作少得令人惊讶。一个这样的32位操作数算法,我在TAOCP中找不到,在Hacker’s Delight网站上的这个文档中显示。

/* Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf. Retrieved 8/19/2015 */
inline uint32_t brev_knuth (uint32_t a)
{
    uint32_t t;
    a = (a << 15) | (a >> 17);
    t = (a ^ (a >> 10)) & 0x003f801f; 
    a = (t + (t << 10)) ^ a;
    t = (a ^ (a >>  4)) & 0x0e038421; 
    a = (t + (t <<  4)) ^ a;
    t = (a ^ (a >>  2)) & 0x22488842; 
    a = (t + (t <<  2)) ^ a;
    return a;
}

使用Intel编译器C/ c++编译器13.1.3.198,上述两个函数都能很好地自动向量化XMM寄存器。它们也可以手动向量化,而不需要很多努力。

在我的IvyBridge Xeon E3 1270v2上,使用自动向量化代码,1亿uint32_t字在0.070秒内使用brev_classic()位反转,0.068秒使用brev_knuth()位反转。我注意确保我的基准测试不受系统内存带宽的限制。


另一个基于循环的解决方案,在数量较低时快速退出(在c++中用于多种类型)

template<class T>
T reverse_bits(T in) {
    T bit = static_cast<T>(1) << (sizeof(T) * 8 - 1);
    T out;

    for (out = 0; bit && in; bit >>= 1, in >>= 1) {
        if (in & 1) {
            out |= bit;
        }
    }
    return out;
}

或者C语言中unsigned int

unsigned int reverse_bits(unsigned int in) {
    unsigned int bit = 1u << (sizeof(T) * 8 - 1);
    unsigned int out;

    for (out = 0; bit && in; bit >>= 1, in >>= 1) {
        if (in & 1)
            out |= bit;
    }
    return out;
}

这不是人类能做的工作!... 但非常适合做机器

这是2015年,距离第一次提出这个问题已经过去了6年。编译器从此成为我们的主人,而我们作为人类的工作只是帮助它们。那么,把我们的意图传达给机器的最佳方式是什么呢?

位反转是如此普遍,以至于你不得不怀疑为什么x86不断增长的ISA没有包含一次性完成它的指令。

原因是:如果你给编译器一个真正简洁的意图,位反转应该只需要大约20个CPU周期。让我向你展示如何制作reverse()并使用它:

#include <inttypes.h>
#include <stdio.h>

uint64_t reverse(const uint64_t n,
                 const uint64_t k)
{
        uint64_t r, i;
        for (r = 0, i = 0; i < k; ++i)
                r |= ((n >> i) & 1) << (k - i - 1);
        return r;
}

int main()
{
        const uint64_t size = 64;
        uint64_t sum = 0;
        uint64_t a;
        for (a = 0; a < (uint64_t)1 << 30; ++a)
                sum += reverse(a, size);
        printf("%" PRIu64 "\n", sum);
        return 0;
}

使用Clang版本>= 3.6,-O3, -march=native(用Haswell测试)编译这个示例程序,使用新的AVX2指令提供美术质量代码,运行时为11秒处理~ 10亿reverse()秒。这是~10 ns每反向(),0.5 ns CPU周期假设2 GHz,我们将达到甜蜜的20个CPU周期。

对于单个大数组,您可以在访问RAM一次所需的时间内放入10个reverse() ! 你可以在访问L2缓存LUT两次的时间里放入1个reverse()。

注意:这个示例代码应该可以作为一个不错的基准运行几年,但是一旦编译器足够聪明,可以优化main()只输出最终结果,而不是真正计算任何东西,它最终就会开始显得过时了。但目前它只用于展示reverse()。


我认为这是最简单的逆转比特的方法之一。 如果这个逻辑有什么缺陷,请让我知道。 基本上在这个逻辑中,我们检查位置的位的值。 如果值为1,则在反转位置上设置位。

void bit_reverse(ui32 *data)
{
  ui32 temp = 0;    
  ui32 i, bit_len;    
  {    
   for(i = 0, bit_len = 31; i <= bit_len; i++)   
   {    
    temp |= (*data & 1 << i)? (1 << bit_len-i) : 0;    
   }    
   *data = temp;    
  }    
  return;    
}    

原生ARM指令“rbit”可以用1个cpu周期和1个额外的cpu寄存器来完成,不可能被击败。


似乎许多其他帖子都关心速度(即最好=最快)。 简单性怎么样?考虑:

char ReverseBits(char character) {
    char reversed_character = 0;
    for (int i = 0; i < 8; i++) {
        char ith_bit = (c >> i) & 1;
        reversed_character |= (ith_bit << (sizeof(char) - 1 - i));
    }
    return reversed_character;
}

并希望聪明的编译器将为您优化。

如果你想反转一个更长的位列表(包含sizeof(char) * n位),你可以使用这个函数得到:

void ReverseNumber(char* number, int bit_count_in_number) {
    int bytes_occupied = bit_count_in_number / sizeof(char);      

    // first reverse bytes
    for (int i = 0; i <= (bytes_occupied / 2); i++) {
        swap(long_number[i], long_number[n - i]);
    }

    // then reverse bits of each individual byte
    for (int i = 0; i < bytes_occupied; i++) {
         long_number[i] = ReverseBits(long_number[i]);
    }
}

这将把[10000000,10101010]反向转换为[01010101,00000001]。


我的简单解决方案

BitReverse(IN)
    OUT = 0x00;
    R = 1;      // Right mask   ...0000.0001
    L = 0;      // Left mask    1000.0000...
    L = ~0; 
    L = ~(i >> 1);
    int size = sizeof(IN) * 4;  // bit size

    while(size--){
        if(IN & L) OUT = OUT | R; // start from MSB  1000.xxxx
        if(IN & R) OUT = OUT | L; // start from LSB  xxxx.0001
        L = L >> 1;
        R = R << 1; 
    }
    return OUT;

高效意味着吞吐量或延迟。

从头到尾,看看安德斯·塞德罗尼厄斯的回答,很好。

为了降低延迟,我推荐以下代码:

uint32_t reverseBits( uint32_t x )
{
#if defined(__arm__) || defined(__aarch64__)
    __asm__( "rbit %0, %1" : "=r" ( x ) : "r" ( x ) );
    return x;
#endif
    // Flip pairwise
    x = ( ( x & 0x55555555 ) << 1 ) | ( ( x & 0xAAAAAAAA ) >> 1 );
    // Flip pairs
    x = ( ( x & 0x33333333 ) << 2 ) | ( ( x & 0xCCCCCCCC ) >> 2 );
    // Flip nibbles
    x = ( ( x & 0x0F0F0F0F ) << 4 ) | ( ( x & 0xF0F0F0F0 ) >> 4 );

    // Flip bytes. CPUs have an instruction for that, pretty fast one.
#ifdef _MSC_VER
    return _byteswap_ulong( x );
#elif defined(__INTEL_COMPILER)
    return (uint32_t)_bswap( (int)x );
#else
    // Assuming gcc or clang
    return __builtin_bswap32( x );
#endif
}

编译器输出:https://godbolt.org/z/5ehd89


对于其他可能遇到这个问题的网络搜索者,这里有一个总结(针对C和JavaScript)。

对于JavaScript的完整解决方案,我们可以首先生成表:

const BIT_REVERSAL_TABLE = new Array(256)

for (var i = 0; i < 256; ++i) {
  var v = i, r = i, s = 7;
  for (v >>>= 1; v; v >>>= 1) {
    r <<= 1;
    r |= v & 1;
    --s;
  }
  BIT_REVERSAL_TABLE[i] = (r << s) & 0xff;
}

这给了我们BIT_REVERSAL_TABLE,这是@MattJ发布的:

const BIT_REVERSAL_TABLE = new Uint8Array([      
  0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, 
  0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, 
  0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, 
  0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, 
  0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 
  0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
  0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, 
  0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
  0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
  0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, 
  0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
  0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
  0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, 
  0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
  0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, 
  0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
])

8位、16位和32位无符号整数的算法可以在这里找到:

function reverseBits8(n) {
  return BIT_REVERSAL_TABLE[n]
}

function reverseBits16(n) {
  return (BIT_REVERSAL_TABLE[(n >> 8) & 0xff] |
    BIT_REVERSAL_TABLE[n & 0xff] << 8)
}

function reverseBits32(n) {
  return (BIT_REVERSAL_TABLE[n & 0xff] << 24) |
    (BIT_REVERSAL_TABLE[(n >>> 8) & 0xff] << 16) |
    (BIT_REVERSAL_TABLE[(n >>> 16) & 0xff] << 8) |
    BIT_REVERSAL_TABLE[(n >>> 24) & 0xff];
}

注意,32位版本不能在JavaScript中工作(必须转换为使用bigint,这很简单),但应该可以在64位语言中工作:

log8(0b11000100)
log16(0b1110001001001100)
log32(0b11110010111110111100110010101011)

// 0b11000100 => 0b00100011
// 0b1110001001001100 => 0b0011001001000111
// doesn't work in JS it seems:
// 0b11110010111110111100110010101011 => 0b0-101010110011000010000010110001

function log8(n) {
  console.log(`${bits(n, 8)} => ${bits(reverseBits8(n), 8)}`)
}

function log16(n) {
  console.log(`${bits(n, 16)} => ${bits(reverseBits16(n), 16)}`)
}

function log32(n) {
  console.log(`${bits(n, 32)} => ${bits(reverseBits32(n), 32)}`)
}

function bits(n, size) {
  return `0b${n.toString(2).padStart(size, '0')}`
}

注意:这个解决方案适用于JavaScript的32位:

function reverseBits32(n) {
  let res = 0;
  for (let i = 0; i < 32; i++) {
    res = (res << 1) + (n & 1);
    n = n >>> 1;
  }

  return res >>> 0;
}

所有3个基于表格的解决方案都可以在C中正常工作。下面是一个粗略的C版本:

#include <stdlib.h>

static uint8_t* BIT_REVERSAL_TABLE;

uint8_t* 
make_bit_reversal_table() {
  uint8_t *table = malloc(256 * sizeof(uint8_t));
  uint8_t i;
  for (i = 0; i < 256 ; ++i) {
    uint8_t v = i;
    uint8_t r = i;
    uint8_t s = 7;
    for (v = v >> 1; v; v = v >> 1) {
      r <<= 1;
      r |= v & 1;
      --s;
    }
    table[i] = (r << s) & 0xff;
  }
  return table;
}

uint8_t 
reverse_bits_8(uint8_t n) {
  return BIT_REVERSAL_TABLE[n];
}

uint16_t
reverse_bits_16(uint16_t n)
{
  return (BIT_REVERSAL_TABLE[(n >> 8) & 0xff]
    | BIT_REVERSAL_TABLE[n & 0xff] << 8);
}

uint32_t
reverse_bits_32(uint32_t n) {
  return (BIT_REVERSAL_TABLE[n & 0xff] << 24) 
    | (BIT_REVERSAL_TABLE[(n >> 8) & 0xff] << 16) 
    | (BIT_REVERSAL_TABLE[(n >> 16) & 0xff] << 8) 
    | BIT_REVERSAL_TABLE[(n >> 24) & 0xff];
}

int 
main(void) {
  BIT_REVERSAL_TABLE = make_bit_reversal_table();
  return 0;
}