代表数字7的8位像这样:

00000111

设置了三个比特。

确定32位整数中设置位数的算法是什么?


当前回答

我发现了一个在数组中使用SIMD指令(SSSE3和AVX2)的位计数实现。它的性能比使用__popcnt64内禀函数要好2-2.5倍。

SSSE3版:

#include <smmintrin.h>
#include <stdint.h>

const __m128i Z = _mm_set1_epi8(0x0);
const __m128i F = _mm_set1_epi8(0xF);
//Vector with pre-calculated bit count:
const __m128i T = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);

uint64_t BitCount(const uint8_t * src, size_t size)
{
    __m128i _sum =  _mm128_setzero_si128();
    for (size_t i = 0; i < size; i += 16)
    {
        //load 16-byte vector
        __m128i _src = _mm_loadu_si128((__m128i*)(src + i));
        //get low 4 bit for every byte in vector
        __m128i lo = _mm_and_si128(_src, F);
        //sum precalculated value from T
        _sum = _mm_add_epi64(_sum, _mm_sad_epu8(Z, _mm_shuffle_epi8(T, lo)));
        //get high 4 bit for every byte in vector
        __m128i hi = _mm_and_si128(_mm_srli_epi16(_src, 4), F);
        //sum precalculated value from T
        _sum = _mm_add_epi64(_sum, _mm_sad_epu8(Z, _mm_shuffle_epi8(T, hi)));
    }
    uint64_t sum[2];
    _mm_storeu_si128((__m128i*)sum, _sum);
    return sum[0] + sum[1];
}

AVX2 版本:

#include <immintrin.h>
#include <stdint.h>

const __m256i Z = _mm256_set1_epi8(0x0);
const __m256i F = _mm256_set1_epi8(0xF);
//Vector with pre-calculated bit count:
const __m256i T = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 
                                   0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);

uint64_t BitCount(const uint8_t * src, size_t size)
{
    __m256i _sum =  _mm256_setzero_si256();
    for (size_t i = 0; i < size; i += 32)
    {
        //load 32-byte vector
        __m256i _src = _mm256_loadu_si256((__m256i*)(src + i));
        //get low 4 bit for every byte in vector
        __m256i lo = _mm256_and_si256(_src, F);
        //sum precalculated value from T
        _sum = _mm256_add_epi64(_sum, _mm256_sad_epu8(Z, _mm256_shuffle_epi8(T, lo)));
        //get high 4 bit for every byte in vector
        __m256i hi = _mm256_and_si256(_mm256_srli_epi16(_src, 4), F);
        //sum precalculated value from T
        _sum = _mm256_add_epi64(_sum, _mm256_sad_epu8(Z, _mm256_shuffle_epi8(T, hi)));
    }
    uint64_t sum[4];
    _mm256_storeu_si256((__m256i*)sum, _sum);
    return sum[0] + sum[1] + sum[2] + sum[3];
}

其他回答

当你写出比特模式时,“黑客的喜悦”比特旋转变得更加清晰。

unsigned int bitCount(unsigned int x)
{
  x = ((x >> 1) & 0b01010101010101010101010101010101)
     + (x       & 0b01010101010101010101010101010101);
  x = ((x >> 2) & 0b00110011001100110011001100110011)
     + (x       & 0b00110011001100110011001100110011); 
  x = ((x >> 4) & 0b00001111000011110000111100001111)
     + (x       & 0b00001111000011110000111100001111); 
  x = ((x >> 8) & 0b00000000111111110000000011111111)
     + (x       & 0b00000000111111110000000011111111); 
  x = ((x >> 16)& 0b00000000000000001111111111111111)
     + (x       & 0b00000000000000001111111111111111); 
  return x;
}

第一步将偶数位加到奇数位上,产生每两个位的和。其他步骤将高阶数据块添加到低阶数据块,将数据块的大小一直增加一倍,直到最终计数占用整个int。

为什么不迭代地除以2呢?

count = 0
while n > 0
  if (n % 2) == 1
    count += 1
  n /= 2  

我同意这不是最快的,但是“最好”这个词有点含糊不清。我认为“最好”应该有一个清晰的元素

Python的解决方案:

def hammingWeight(n: int) -> int:
    sums = 0
    while (n!=0):
        sums+=1
        n = n &(n-1)

    return sums

在二进制表示中,n中最不有效的1位总是对应n - 1中的0位。因此,对n和n - 1这两个数进行and运算总是将n中最不有效的1位翻转为0,并保持所有其他位相同。

对于那些想要在c++ 11中为任何无符号整数类型作为consexpr函数的人(tacklelib/include/tacklelib/utility/math.hpp):

#include <stdint.h>
#include <limits>
#include <type_traits>

const constexpr uint32_t uint32_max = (std::numeric_limits<uint32_t>::max)();

namespace detail
{
    template <typename T>
    inline constexpr T _count_bits_0(const T & v)
    {
        return v - ((v >> 1) & 0x55555555);
    }

    template <typename T>
    inline constexpr T _count_bits_1(const T & v)
    {
        return (v & 0x33333333) + ((v >> 2) & 0x33333333);
    }

    template <typename T>
    inline constexpr T _count_bits_2(const T & v)
    {
        return (v + (v >> 4)) & 0x0F0F0F0F;
    }

    template <typename T>
    inline constexpr T _count_bits_3(const T & v)
    {
        return v + (v >> 8);
    }

    template <typename T>
    inline constexpr T _count_bits_4(const T & v)
    {
        return v + (v >> 16);
    }

    template <typename T>
    inline constexpr T _count_bits_5(const T & v)
    {
        return v & 0x0000003F;
    }

    template <typename T, bool greater_than_uint32>
    struct _impl
    {
        static inline constexpr T _count_bits_with_shift(const T & v)
        {
            return
                detail::_count_bits_5(
                    detail::_count_bits_4(
                        detail::_count_bits_3(
                            detail::_count_bits_2(
                                detail::_count_bits_1(
                                    detail::_count_bits_0(v)))))) + count_bits(v >> 32);
        }
    };

    template <typename T>
    struct _impl<T, false>
    {
        static inline constexpr T _count_bits_with_shift(const T & v)
        {
            return 0;
        }
    };
}

template <typename T>
inline constexpr T count_bits(const T & v)
{
    static_assert(std::is_integral<T>::value, "type T must be an integer");
    static_assert(!std::is_signed<T>::value, "type T must be not signed");

    return uint32_max >= v ?
        detail::_count_bits_5(
            detail::_count_bits_4(
                detail::_count_bits_3(
                    detail::_count_bits_2(
                        detail::_count_bits_1(
                            detail::_count_bits_0(v)))))) :
        detail::_impl<T, sizeof(uint32_t) < sizeof(v)>::_count_bits_with_shift(v);
}

谷歌测试库中的附加测试:

#include <stdlib.h>
#include <time.h>

namespace {
    template <typename T>
    inline uint32_t _test_count_bits(const T & v)
    {
        uint32_t count = 0;
        T n = v;
        while (n > 0) {
            if (n % 2) {
                count += 1;
            }
            n /= 2;
        }
        return count;
    }
}

TEST(FunctionsTest, random_count_bits_uint32_100K)
{
    srand(uint_t(time(NULL)));
    for (uint32_t i = 0; i < 100000; i++) {
        const uint32_t r = uint32_t(rand()) + (uint32_t(rand()) << 16);
        ASSERT_EQ(_test_count_bits(r), count_bits(r));
    }
}

TEST(FunctionsTest, random_count_bits_uint64_100K)
{
    srand(uint_t(time(NULL)));
    for (uint32_t i = 0; i < 100000; i++) {
        const uint64_t r = uint64_t(rand()) + (uint64_t(rand()) << 16) + (uint64_t(rand()) << 32) + (uint64_t(rand()) << 48);
        ASSERT_EQ(_test_count_bits(r), count_bits(r));
    }
}

这可以在O(k)中完成,其中k是设置的比特数。

int NumberOfSetBits(int n)
{
    int count = 0;

    while (n){
        ++ count;
        n = (n - 1) & n;
    }

    return count;
}