计算32位整数中的设置位数

代表数字7的8位像这样:

00000111

设置了三个比特。

确定32位整数中设置位数的算法是什么?

当前回答

我提供了另一个未提及的算法，称为并行，从这里取。它的优点是它是通用的，这意味着代码对于8、16、32、64和128位大小是相同的。

我检查了它的值的正确性和时间的数量为2^26位大小为8,16,32和64位。请看下面的时间安排。

该算法是第一个代码片段。这里提到另外两个只是为了参考，因为我测试和比较了它们。

算法是用c++编写的，是通用的，但它可以很容易地应用到旧的C中。

#include <type_traits>
#include <cstdint>

template <typename IntT>
inline size_t PopCntParallel(IntT n) {
    // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
    using T = std::make_unsigned_t<IntT>;

    T v = T(n);
    v = v - ((v >> 1) & (T)~(T)0/3);                           // temp
    v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);      // temp
    v = (v + (v >> 4)) & (T)~(T)0/255*15;                      // temp
    return size_t((T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8); // count
}

下面是我比较的两种算法。一个是带有循环的Kernighan简单方法，从这里开始。

template <typename IntT>
inline size_t PopCntKernighan(IntT n) {
    // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
    using T = std::make_unsigned_t<IntT>;
    T v = T(n);
    size_t c;
    for (c = 0; v; ++c)
        v &= v - 1; // Clear the least significant bit set
    return c;
}

另一个是使用内置的__popcnt16()/__popcnt()/__popcnt64() MSVC的内在(doc在这里)。或CLang/GCC (doc)的__builtin_popcount。这个内在应该提供一个非常优化的版本，可能是硬件:

#ifdef _MSC_VER
    // https://learn.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64?view=msvc-160
    #include <intrin.h>
    #define popcnt16 __popcnt16
    #define popcnt32 __popcnt
    #define popcnt64 __popcnt64
#else
    // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
    #define popcnt16 __builtin_popcount
    #define popcnt32 __builtin_popcount
    #define popcnt64 __builtin_popcountll
#endif

template <typename IntT>
inline size_t PopCntBuiltin(IntT n) {
    using T = std::make_unsigned_t<IntT>;
    T v = T(n);
    if constexpr(sizeof(IntT) <= 2)
        return popcnt16(uint16_t(v));
    else if constexpr(sizeof(IntT) <= 4)
        return popcnt32(uint32_t(v));
    else if constexpr(sizeof(IntT) <= 8)
        return popcnt64(uint64_t(v));
    else
        static_assert([]{ return false; }());
}

以下是计时，以纳秒为单位。所有的计时都是对2^26个随机数进行的。比较了所有三种算法的计时以及8、16、32和64之间的所有比特大小。总的来说，所有测试在我的机器上花费了16秒。使用了高分辨率时钟。

08 bit Builtin 8.2 ns
08 bit Parallel 8.2 ns
08 bit Kernighan 26.7 ns

16 bit Builtin 7.7 ns
16 bit Parallel 7.7 ns
16 bit Kernighan 39.7 ns

32 bit Builtin 7.0 ns
32 bit Parallel 7.0 ns
32 bit Kernighan 47.9 ns

64 bit Builtin 7.5 ns
64 bit Parallel 7.5 ns
64 bit Kernighan 59.4 ns

128 bit Builtin 7.8 ns
128 bit Parallel 13.8 ns
128 bit Kernighan 127.6 ns

可以看到，所提供的并行算法(在三个算法中排名第一)与MSVC /CLang的固有算法一样好。

作为参考，下面是我用来测试所有函数的速度/时间/正确性的完整代码。

作为奖励，这段代码(不像上面的短代码片段)也测试128位大小，但只在CLang/GCC下(不是MSVC)，因为它们有unsigned __int128。

在网上试试!

#include <type_traits>
#include <cstdint>

using std::size_t;

#if defined(_MSC_VER) && !defined(__clang__)
    #define IS_MSVC 1
#else
    #define IS_MSVC 0
#endif

#if IS_MSVC
    #define HAS128 false
#else
    using int128_t = __int128;
    using uint128_t = unsigned __int128;
    #define HAS128 true
#endif

template <typename T> struct UnSignedT { using type = std::make_unsigned_t<T>; };
#if HAS128
    template <> struct UnSignedT<int128_t> { using type = uint128_t; };
    template <> struct UnSignedT<uint128_t> { using type = uint128_t; };
#endif
template <typename T> using UnSigned = typename UnSignedT<T>::type;

template <typename IntT>
inline size_t PopCntParallel(IntT n) {
    // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
    using T = UnSigned<IntT>;

    T v = T(n);
    v = v - ((v >> 1) & (T)~(T)0/3);                           // temp
    v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);      // temp
    v = (v + (v >> 4)) & (T)~(T)0/255*15;                      // temp
    return size_t((T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8); // count
}

template <typename IntT>
inline size_t PopCntKernighan(IntT n) {
    // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
    using T = UnSigned<IntT>;
    T v = T(n);
    size_t c;
    for (c = 0; v; ++c)
        v &= v - 1; // Clear the least significant bit set
    return c;
}

#if IS_MSVC
    // https://learn.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64?view=msvc-160
    #include <intrin.h>
    #define popcnt16 __popcnt16
    #define popcnt32 __popcnt
    #define popcnt64 __popcnt64
#else
    // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
    #define popcnt16 __builtin_popcount
    #define popcnt32 __builtin_popcount
    #define popcnt64 __builtin_popcountll
#endif

#define popcnt128(x) (popcnt64(uint64_t(x)) + popcnt64(uint64_t(x >> 64)))

template <typename IntT>
inline size_t PopCntBuiltin(IntT n) {
    using T = UnSigned<IntT>;
    T v = T(n);
    if constexpr(sizeof(IntT) <= 2)
        return popcnt16(uint16_t(v));
    else if constexpr(sizeof(IntT) <= 4)
        return popcnt32(uint32_t(v));
    else if constexpr(sizeof(IntT) <= 8)
        return popcnt64(uint64_t(v));
    else if constexpr(sizeof(IntT) <= 16)
        return popcnt128(uint128_t(v));
    else
        static_assert([]{ return false; }());
}

#include <random>
#include <vector>
#include <chrono>
#include <string>
#include <iostream>
#include <iomanip>
#include <map>

inline double Time() {
    static auto const gtb = std::chrono::high_resolution_clock::now();
    return std::chrono::duration_cast<std::chrono::duration<double>>(
        std::chrono::high_resolution_clock::now() - gtb).count();
}

template <typename T, typename F>
void Test(std::string const & name, F f) {
    std::mt19937_64 rng{123};
    size_t constexpr bit_size = sizeof(T) * 8, ntests = 1 << 6, nnums = 1 << 14;
    std::vector<T> nums(nnums);
    for (size_t i = 0; i < nnums; ++i)
        nums[i] = T(rng() % ~T(0));
    static std::map<size_t, size_t> times;
    double min_time = 1000;
    for (size_t i = 0; i < ntests; ++i) {
        double timer = Time();
        size_t sum = 0;
        for (size_t j = 0; j < nnums; j += 4)
            sum += f(nums[j + 0]) + f(nums[j + 1]) + f(nums[j + 2]) + f(nums[j + 3]);
        auto volatile vsum = sum;
        min_time = std::min(min_time, (Time() - timer) / nnums);
        if (times.count(bit_size) && times.at(bit_size) != sum)
            std::cout << "Wrong bit cnt checksum!" << std::endl;
        times[bit_size] = sum;
    }
    std::cout << std::setw(2) << std::setfill('0') << bit_size
        << " bit " << name << " " << std::fixed << std::setprecision(1)
        << min_time * 1000000000 << " ns" << std::endl;
}

int main() {
    #define TEST(T) \
        Test<T>("Builtin", PopCntBuiltin<T>); \
        Test<T>("Parallel", PopCntParallel<T>); \
        Test<T>("Kernighan", PopCntKernighan<T>); \
        std::cout << std::endl;
    
    TEST(uint8_t); TEST(uint16_t); TEST(uint32_t); TEST(uint64_t);
    #if HAS128
        TEST(uint128_t);
    #endif
    
    #undef TEST
}

2021-04-20 20:17:01

其他回答

int bitcount(unsigned int n)
{ 
      int count=0;
      while(n)
      {
           count += n & 0x1u;
           n >>= 1;
      }
      return  count;
 }

迭代的“计数”运行的时间与总比特数成比例。它只是循环遍历所有位，因为while条件而稍微提前终止。如果1'S或集合位是稀疏的且在最低有效位之间，则很有用。

2014-06-07 16:41:10

你可以这样做:

int countSetBits(int n)
{
    n=((n&0xAAAAAAAA)>>1) + (n&0x55555555);
    n=((n&0xCCCCCCCC)>>2) + (n&0x33333333);
    n=((n&0xF0F0F0F0)>>4) + (n&0x0F0F0F0F);
    n=((n&0xFF00FF00)>>8) + (n&0x00FF00FF);
    return n;
}

int main()
{
    int n=10;
    printf("Number of set bits: %d",countSetBits(n));
     return 0;
}

海王： http://ideone.com/JhwcX

工作原理如下:

首先，所有的偶数位都向右移动，并与奇数位相加，以计算两组位的数量。然后我们两人一组，然后四个人，以此类推。

2012-06-21 13:34:47

下面是PHP中的一些东西(所有PHP整数都是32位符号，因此是31位):

function bits_population($nInteger)
{

    $nPop=0;
    while($nInteger)
    {
        $nInteger^=(1<<(floor(1+log($nInteger)/log(2))-1));
        $nPop++;
    }
    return $nPop;
}

2012-02-24 14:16:47

这里有一个到目前为止还没有提到的解决方案，使用位字段。下面的程序使用4种不同的方法对100000000个16位整数数组中的设置位进行计数。计时结果在括号中给出(在MacOSX上，使用gcc -O3):

#include <stdio.h>
#include <stdlib.h>

#define LENGTH 100000000

typedef struct {
    unsigned char bit0 : 1;
    unsigned char bit1 : 1;
    unsigned char bit2 : 1;
    unsigned char bit3 : 1;
    unsigned char bit4 : 1;
    unsigned char bit5 : 1;
    unsigned char bit6 : 1;
    unsigned char bit7 : 1;
} bits;

unsigned char sum_bits(const unsigned char x) {
    const bits *b = (const bits*) &x;
    return b->bit0 + b->bit1 + b->bit2 + b->bit3 \
         + b->bit4 + b->bit5 + b->bit6 + b->bit7;
}

int NumberOfSetBits(int i) {
    i = i - ((i >> 1) & 0x55555555);
    i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
    return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
}

#define out(s) \
    printf("bits set: %lu\nbits counted: %lu\n", 8*LENGTH*sizeof(short)*3/4, s);

int main(int argc, char **argv) {
    unsigned long i, s;
    unsigned short *x = malloc(LENGTH*sizeof(short));
    unsigned char lut[65536], *p;
    unsigned short *ps;
    int *pi;

    /* set 3/4 of the bits */
    for (i=0; i<LENGTH; ++i)
        x[i] = 0xFFF0;

    /* sum_bits (1.772s) */
    for (i=LENGTH*sizeof(short), p=(unsigned char*) x, s=0; i--; s+=sum_bits(*p++));
    out(s);

    /* NumberOfSetBits (0.404s) */
    for (i=LENGTH*sizeof(short)/sizeof(int), pi=(int*)x, s=0; i--; s+=NumberOfSetBits(*pi++));
    out(s);

    /* populate lookup table */
    for (i=0, p=(unsigned char*) &i; i<sizeof(lut); ++i)
        lut[i] = sum_bits(p[0]) + sum_bits(p[1]);

    /* 256-bytes lookup table (0.317s) */
    for (i=LENGTH*sizeof(short), p=(unsigned char*) x, s=0; i--; s+=lut[*p++]);
    out(s);

    /* 65536-bytes lookup table (0.250s) */
    for (i=LENGTH, ps=x, s=0; i--; s+=lut[*ps++]);
    out(s);

    free(x);
    return 0;
}

虽然位域版本非常可读，但计时结果显示它比NumberOfSetBits()慢了4倍以上。基于查找表的实现仍然要快得多，特别是对于一个65 kB的表。

2013-11-26 23:26:10

"最佳算法"是什么意思?短码还是长码?您的代码看起来非常优雅，并且具有恒定的执行时间。代码也很短。

但如果速度是主要因素，而不是代码大小，那么我认为以下方法可以更快:

       static final int[] BIT_COUNT = { 0, 1, 1, ... 256 values with a bitsize of a byte ... };
        static int bitCountOfByte( int value ){
            return BIT_COUNT[ value & 0xFF ];
        }

        static int bitCountOfInt( int value ){
            return bitCountOfByte( value ) 
                 + bitCountOfByte( value >> 8 ) 
                 + bitCountOfByte( value >> 16 ) 
                 + bitCountOfByte( value >> 24 );
        }

我认为这不会更快的64位值，但32位值可以更快。

2008-09-20 19:31:08

计算32位整数中的设置位数

推荐文章

最新文章

标签