
0010 0000 => 0000 0100





 /// Returns a copy of `ordered` with its bit order mirrored: bit i of the
 /// input becomes bit N-1-i of the result (for N = 8, 0010 0000 => 0000 0100).
 ///
 /// @tparam N       number of bits in the bitset
 /// @param ordered  bitset whose bits are to be mirrored
 /// @return         the mirrored bitset (for N == 0 the loop never runs and an
 ///                 empty bitset is returned)
 template<std::size_t N>
 const std::bitset<N> reverse(const std::bitset<N>& ordered)
 {
      std::bitset<N> reversed;
      // A single index is enough: the destination position is N - 1 - i.
      // This also avoids a second counter that wraps below zero on the last step.
      for (std::size_t i = 0; i < N; ++i)
           reversed[N - 1 - i] = ordered[i];
      return reversed;
 }

 // test the function
 int main()
      unsigned long num; 
      const size_t N = sizeof(num)*8;

      std::cin >> num;
      std::cout << std::showbase << std::hex;
      std::cout << "ordered  = " << num << std::endl;
      std::cout << "reversed = " << reverse<N>(num).to_ulong()  << std::endl;
      std::cout << "double_reversed = " << reverse<N>(reverse<N>(num)).to_ulong() << std::endl;  


// Reverses the bit order of one byte: bit 0 becomes bit 7, bit 1 becomes
// bit 6, and so on (the constant 128 assumes an 8-bit unsigned char).
//
// Only the set bits are visited, so the loop runs once per 1-bit in `data`.
// An input of 0 skips the loop entirely and returns 0.
unsigned char ReverseBits(unsigned char data)
{
    unsigned char k = 0, rev = 0;

    unsigned char n = data;

    while (n)
    {
        k = n & (~(n - 1));   // isolate the lowest set bit (a power of two)
        n &= (n - 1);         // clear that bit from the remaining input
        rev |= (128 / k);     // 128 / 2^p == 2^(7-p): the mirrored position
    }
    return rev;
}


我以Matt J的查表代码作为基础。我用来做基准测试的系统是一颗Haswell架构的i7-4700EQ处理器。

使用Matt J的查表法对4亿(400000000)字节做位翻转:大约0.272秒。


我不打算在这里用各种细节来烦你:我尝试了很多办法来帮助编译器做优化,最终把对4亿(400000000)字节做位翻转的耗时降到了大约0.15秒。这是很大的改进,但对于我的应用程序来说,这仍然太慢了。


对400000000字节做位翻转所用的时间:0.050082秒!!!!!

// Bitflip using AVX2 - The fastest Intel based bitflip in the world!!
// Made by Anders Cedronius 2014 (anders.cedronius (you know what) gmail.com)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

using namespace std;

#define DISPLAY_WIDTH   32          // bytes printed per row when dumping the buffer
#define NUM_DATA_BYTES  400000000   // size of the benchmark buffer in bytes

// Constants (first we got the mask, then the high order nibble look up table and last we got the low order nibble lookup table)
//
// Layout, 32 bytes per row, in the order the assembly routine loads them:
//   row 0: 0x0f mask that selects the low nibble of every byte
//   row 1: table indexed by a byte's HIGH nibble; holds that nibble bit-reversed,
//          placed in the low half of the result byte
//   row 2: table indexed by a byte's LOW nibble; holds that nibble bit-reversed,
//          placed in the high half of the result byte
// Each 16-entry table appears twice because vpshufb looks bytes up separately
// within each 128-bit half of a 256-bit register.
__attribute__ ((aligned(32))) static unsigned char k1[32*3]={
    0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
    0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
    0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,
    0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,
    0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,
    0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0
};

// The data to be bitflipped (+32 to avoid the quantization out of memory problem)
// i.e. the buffer is padded so that the last 32-byte chunk, which is processed
// whole even when NUM_DATA_BYTES is not a multiple of 32, stays inside the array.
__attribute__ ((aligned(32))) static unsigned char data[NUM_DATA_BYTES+32]={};

extern "C" {
void bitflipbyte(unsigned char[],unsigned int,unsigned char[]);

int main()

    for(unsigned int i = 0; i < NUM_DATA_BYTES; i++)
        data[i] = rand();

    printf ("\r\nData in(start):\r\n");
    for (unsigned int j = 0; j < 4; j++)
        for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
            printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);
        printf ("\r\n");

    printf ("\r\nNumber of 32-byte chunks to convert: %d\r\n",(unsigned int)ceil(NUM_DATA_BYTES/32.0));

    double start_time = omp_get_wtime();
    bitflipbyte(data,(unsigned int)ceil(NUM_DATA_BYTES/32.0),k1);
    double end_time = omp_get_wtime();

    printf ("\r\nData out:\r\n");
    for (unsigned int j = 0; j < 4; j++)
        for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
            printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);
        printf ("\r\n");
    printf("\r\n\r\nTime to bitflip %d bytes: %f seconds\r\n\r\n",NUM_DATA_BYTES, end_time-start_time);

    // return with no errors
    return 0;



; void bitflipbyte(unsigned char data[], unsigned int chunks, unsigned char tables[]);
;
; Reverses the bit order of every byte of `data` in place, 32 bytes per loop
; iteration. Register use (matches the first three integer arguments of the
; System V AMD64 calling convention):
;   rdi = data    - buffer to process; must be 32-byte aligned (vmovdqa)
;   esi = chunks  - number of 32-byte chunks; 0 means "do nothing"
;   rdx = tables  - 96 bytes, 32-byte aligned: low-nibble mask, lookup table
;                   for the high nibbles, lookup table for the low nibbles
; Clobbers: ymm0-ymm4, rdi, rsi, rdx, flags.
bits 64
global bitflipbyte

bitflipbyte:
        test        esi, esi            ; count is a 32-bit argument: only esi is defined
        jz          bitflip_done        ; nothing to do (also avoids wrapping the counter)
        vmovdqa     ymm2, [rdx]         ; ymm2 = nibble mask
        add         rdx, 20h
        vmovdqa     ymm3, [rdx]         ; ymm3 = table applied to the high nibbles
        add         rdx, 20h
        vmovdqa     ymm4, [rdx]         ; ymm4 = table applied to the low nibbles
bitflipp_loop:
        vmovdqa     ymm0, [rdi]         ; load 32 bytes
        vpand       ymm1, ymm2, ymm0    ; ymm1 = low nibble of each byte
        vpandn      ymm0, ymm2, ymm0    ; ymm0 = high nibble of each byte (low nibble cleared)
        vpsrld      ymm0, ymm0, 4h      ; move high nibbles down; the cleared low nibbles
                                        ; keep the dword-wide shift from mixing bytes
        vpshufb     ymm1, ymm4, ymm1    ; look up reversed low nibbles
        vpshufb     ymm0, ymm3, ymm0    ; look up reversed high nibbles
        vpor        ymm0, ymm0, ymm1    ; combine both halves of each byte
        vmovdqa     [rdi], ymm0         ; store the 32 reversed bytes
        add     rdi, 20h
        dec     esi                     ; 32-bit counter, matching the C prototype
        jnz     bitflipp_loop
bitflip_done:
        vzeroupper                      ; leave the upper ymm halves clean for SSE code in the caller
        ret

代码每次取32个字节,然后用掩码把每个字节的两个半字节(nibble)分离出来。高半字节右移4位。接着使用vpshufb,并把ymm4 / ymm3当作查找表。我本可以只用一张查找表,但那样就必须在把两个半字节重新OR到一起之前,再做一次左移。


关于使用Intel C/ c++编译器内在等效命令,请不要发表任何评论…

似乎许多其他帖子都关心速度(即最好=最快)。 简单性怎么样?考虑:

// Reverses the bit order of a single byte: bit 0 becomes bit 7, bit 1 becomes
// bit 6, and so on. Written for clarity rather than speed.
//
// character: the byte to reverse (treated as 8 raw bits, sign is irrelevant)
// returns:   the byte with its 8 bits mirrored
char ReverseBits(char character) {
    const int kBitsPerByte = 8;   // the loop processes exactly 8 bits
    // Work on an unsigned copy so that right shifts never drag in sign bits.
    const unsigned char input = static_cast<unsigned char>(character);
    unsigned char reversed_character = 0;
    for (int i = 0; i < kBitsPerByte; i++) {
        const unsigned char ith_bit = (input >> i) & 1;
        // Bit i lands at position 7 - i. The shift count is based on the bit
        // width of a byte, not on sizeof(char), which is always 1.
        reversed_character |= static_cast<unsigned char>(ith_bit << (kBitsPerByte - 1 - i));
    }
    return static_cast<char>(reversed_character);
}


如果你想反转一个更长的位序列(共n个字节,即CHAR_BIT * n位),可以使用下面这个函数:

// Reverses, in place, the bit order of a multi-byte number stored in `number`.
//
// number:              buffer holding the bits; modified in place
// bit_count_in_number: number of bits in the buffer; only whole bytes are
//                      processed, so a count that is not a multiple of 8 is
//                      rounded down. Zero or negative counts do nothing.
//
// Reversing all bits equals reversing the byte order and then reversing the
// bits inside every byte.
void ReverseNumber(char* number, int bit_count_in_number) {
    const int bytes_occupied = bit_count_in_number / 8;   // 8 bits per byte

    // first reverse bytes
    // Walk inwards from both ends and stop when the indices meet, so every
    // pair is swapped exactly once and a middle byte stays where it is.
    for (int i = 0, j = bytes_occupied - 1; i < j; i++, j--) {
        const char swapped = number[i];
        number[i] = number[j];
        number[j] = swapped;
    }

    // then reverse bits of each individual byte
    for (int i = 0; i < bytes_occupied; i++) {
         number[i] = ReverseBits(number[i]);
    }
}



// Purpose: to reverse bits in an unsigned short integer
// Input: an unsigned short integer whose bits are to be reversed
// Output: an unsigned short integer with the reversed bits of the input one
unsigned short ReverseBits( unsigned short a )
{
     // number of bits in the unsigned short integer
     const std::size_t num_bits = sizeof(a) * CHAR_BIT;

     // bitset representation of integer a; consumed from its low end
     std::bitset<num_bits> bitset_a(a);

     // bitset representation of the result b (starts as all zeros)
     std::bitset<num_bits> bitset_b(0);

     // mask that selects only the lowest bit (0000000000000001)
     const std::bitset<num_bits> mask(1);

     for ( std::size_t i = 0; i < num_bits; ++i )
     {
          // make room at the low end of b, then append the current low bit of a
          bitset_b = (bitset_b << 1) | (bitset_a & mask);
          // drop the bit that has just been copied
          bitset_a >>= 1;
     }

     return static_cast<unsigned short>( bitset_b.to_ulong() );
}

// Prints all bits of `a` to standard output, most significant bit first,
// followed by a newline.
void PrintBits( unsigned short a )
{
     // bitset representation of a; named `bits` so that it does not hide the
     // std::bitset type itself
     const std::bitset<sizeof(a) * CHAR_BIT> bits(a);

     // print out bits
     std::cout << bits << std::endl;
}

// Testing the functionality of the code

int main ()
     unsigned short a = 17, b;

     cout << "Original: "; 

     b = ReverseBits( a );

     cout << "Reversed: ";

// Output:
Original: 0000000000010001
Reversed: 1000100000000000