代表数字7的8位像这样:
00000111
设置了三个比特。
确定32位整数中设置位数的算法是什么?
代表数字7的8位像这样:
00000111
设置了三个比特。
确定32位整数中设置位数的算法是什么?
当前回答
我提供了另一个未提及的算法,称为并行,从这里取。它的优点是它是通用的,这意味着代码对于8、16、32、64和128位大小是相同的。
我检查了它的值的正确性和时间的数量为2^26位大小为8,16,32和64位。请看下面的时间安排。
该算法是第一个代码片段。这里提到另外两个只是为了参考,因为我测试和比较了它们。
算法是用c++编写的,是通用的,但它可以很容易地应用到旧的C中。
#include <type_traits>
#include <cstdint>
template <typename IntT>
inline size_t PopCntParallel(IntT n) {
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
using T = std::make_unsigned_t<IntT>;
T v = T(n);
v = v - ((v >> 1) & (T)~(T)0/3); // temp
v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp
v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp
return size_t((T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8); // count
}
下面是我比较的两种算法。一个是带有循环的Kernighan简单方法,从这里开始。
template <typename IntT>
inline size_t PopCntKernighan(IntT n) {
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
using T = std::make_unsigned_t<IntT>;
T v = T(n);
size_t c;
for (c = 0; v; ++c)
v &= v - 1; // Clear the least significant bit set
return c;
}
另一个是使用内置的__popcnt16()/__popcnt()/__popcnt64() MSVC的内在(doc在这里)。或CLang/GCC (doc)的__builtin_popcount。这个内在应该提供一个非常优化的版本,可能是硬件:
#ifdef _MSC_VER
// https://learn.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64?view=msvc-160
#include <intrin.h>
#define popcnt16 __popcnt16
#define popcnt32 __popcnt
#define popcnt64 __popcnt64
#else
// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
#define popcnt16 __builtin_popcount
#define popcnt32 __builtin_popcount
#define popcnt64 __builtin_popcountll
#endif
template <typename IntT>
inline size_t PopCntBuiltin(IntT n) {
using T = std::make_unsigned_t<IntT>;
T v = T(n);
if constexpr(sizeof(IntT) <= 2)
return popcnt16(uint16_t(v));
else if constexpr(sizeof(IntT) <= 4)
return popcnt32(uint32_t(v));
else if constexpr(sizeof(IntT) <= 8)
return popcnt64(uint64_t(v));
else
static_assert([]{ return false; }());
}
以下是计时,以纳秒为单位。所有的计时都是对2^26个随机数进行的。比较了所有三种算法的计时以及8、16、32和64之间的所有比特大小。总的来说,所有测试在我的机器上花费了16秒。使用了高分辨率时钟。
08 bit Builtin 8.2 ns
08 bit Parallel 8.2 ns
08 bit Kernighan 26.7 ns
16 bit Builtin 7.7 ns
16 bit Parallel 7.7 ns
16 bit Kernighan 39.7 ns
32 bit Builtin 7.0 ns
32 bit Parallel 7.0 ns
32 bit Kernighan 47.9 ns
64 bit Builtin 7.5 ns
64 bit Parallel 7.5 ns
64 bit Kernighan 59.4 ns
128 bit Builtin 7.8 ns
128 bit Parallel 13.8 ns
128 bit Kernighan 127.6 ns
可以看到,所提供的并行算法(在三个算法中排名第一)与MSVC /CLang的固有算法一样好。
作为参考,下面是我用来测试所有函数的速度/时间/正确性的完整代码。
作为奖励,这段代码(不像上面的短代码片段)也测试128位大小,但只在CLang/GCC下(不是MSVC),因为它们有unsigned __int128。
在网上试试!
#include <type_traits>
#include <cstdint>
using std::size_t;
#if defined(_MSC_VER) && !defined(__clang__)
#define IS_MSVC 1
#else
#define IS_MSVC 0
#endif
#if IS_MSVC
#define HAS128 false
#else
using int128_t = __int128;
using uint128_t = unsigned __int128;
#define HAS128 true
#endif
template <typename T> struct UnSignedT { using type = std::make_unsigned_t<T>; };
#if HAS128
template <> struct UnSignedT<int128_t> { using type = uint128_t; };
template <> struct UnSignedT<uint128_t> { using type = uint128_t; };
#endif
template <typename T> using UnSigned = typename UnSignedT<T>::type;
template <typename IntT>
inline size_t PopCntParallel(IntT n) {
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
using T = UnSigned<IntT>;
T v = T(n);
v = v - ((v >> 1) & (T)~(T)0/3); // temp
v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp
v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp
return size_t((T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8); // count
}
template <typename IntT>
inline size_t PopCntKernighan(IntT n) {
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
using T = UnSigned<IntT>;
T v = T(n);
size_t c;
for (c = 0; v; ++c)
v &= v - 1; // Clear the least significant bit set
return c;
}
#if IS_MSVC
// https://learn.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64?view=msvc-160
#include <intrin.h>
#define popcnt16 __popcnt16
#define popcnt32 __popcnt
#define popcnt64 __popcnt64
#else
// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
#define popcnt16 __builtin_popcount
#define popcnt32 __builtin_popcount
#define popcnt64 __builtin_popcountll
#endif
#define popcnt128(x) (popcnt64(uint64_t(x)) + popcnt64(uint64_t(x >> 64)))
template <typename IntT>
inline size_t PopCntBuiltin(IntT n) {
using T = UnSigned<IntT>;
T v = T(n);
if constexpr(sizeof(IntT) <= 2)
return popcnt16(uint16_t(v));
else if constexpr(sizeof(IntT) <= 4)
return popcnt32(uint32_t(v));
else if constexpr(sizeof(IntT) <= 8)
return popcnt64(uint64_t(v));
else if constexpr(sizeof(IntT) <= 16)
return popcnt128(uint128_t(v));
else
static_assert([]{ return false; }());
}
#include <random>
#include <vector>
#include <chrono>
#include <string>
#include <iostream>
#include <iomanip>
#include <map>
inline double Time() {
static auto const gtb = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::duration<double>>(
std::chrono::high_resolution_clock::now() - gtb).count();
}
template <typename T, typename F>
void Test(std::string const & name, F f) {
std::mt19937_64 rng{123};
size_t constexpr bit_size = sizeof(T) * 8, ntests = 1 << 6, nnums = 1 << 14;
std::vector<T> nums(nnums);
for (size_t i = 0; i < nnums; ++i)
nums[i] = T(rng() % ~T(0));
static std::map<size_t, size_t> times;
double min_time = 1000;
for (size_t i = 0; i < ntests; ++i) {
double timer = Time();
size_t sum = 0;
for (size_t j = 0; j < nnums; j += 4)
sum += f(nums[j + 0]) + f(nums[j + 1]) + f(nums[j + 2]) + f(nums[j + 3]);
auto volatile vsum = sum;
min_time = std::min(min_time, (Time() - timer) / nnums);
if (times.count(bit_size) && times.at(bit_size) != sum)
std::cout << "Wrong bit cnt checksum!" << std::endl;
times[bit_size] = sum;
}
std::cout << std::setw(2) << std::setfill('0') << bit_size
<< " bit " << name << " " << std::fixed << std::setprecision(1)
<< min_time * 1000000000 << " ns" << std::endl;
}
int main() {
#define TEST(T) \
Test<T>("Builtin", PopCntBuiltin<T>); \
Test<T>("Parallel", PopCntParallel<T>); \
Test<T>("Kernighan", PopCntKernighan<T>); \
std::cout << std::endl;
TEST(uint8_t); TEST(uint16_t); TEST(uint32_t); TEST(uint64_t);
#if HAS128
TEST(uint128_t);
#endif
#undef TEST
}
其他回答
我在任何地方都没见过这种方法:
int nbits(unsigned char v) {
return ((((v - ((v >> 1) & 0x55)) * 0x1010101) & 0x30c00c03) * 0x10040041) >> 0x1c;
}
它每字节工作一次,所以对于一个32位整数,它必须被调用四次。它源于横向加法,但它使用两个32位乘法将指令数量减少到只有7条。
大多数当前的C编译器将使用SIMD (SSE2)指令优化这个函数,当请求的数量是4的倍数时,它变得非常有竞争力。它是可移植的,可以定义为宏或内联函数,并且不需要数据表。
这种方法可以扩展为一次处理16位,使用64位乘法。但是,当所有16位都被设置时,它会失败,返回0,所以它只能在0xFFFF输入值不存在时使用。由于64位操作,它也比较慢,并且没有很好地优化。
在Java 8或9中只调用Integer。bitCount。
对于那些想要在c++ 11中为任何无符号整数类型作为consexpr函数的人(tacklelib/include/tacklelib/utility/math.hpp):
#include <stdint.h>
#include <limits>
#include <type_traits>
const constexpr uint32_t uint32_max = (std::numeric_limits<uint32_t>::max)();
namespace detail
{
template <typename T>
inline constexpr T _count_bits_0(const T & v)
{
return v - ((v >> 1) & 0x55555555);
}
template <typename T>
inline constexpr T _count_bits_1(const T & v)
{
return (v & 0x33333333) + ((v >> 2) & 0x33333333);
}
template <typename T>
inline constexpr T _count_bits_2(const T & v)
{
return (v + (v >> 4)) & 0x0F0F0F0F;
}
template <typename T>
inline constexpr T _count_bits_3(const T & v)
{
return v + (v >> 8);
}
template <typename T>
inline constexpr T _count_bits_4(const T & v)
{
return v + (v >> 16);
}
template <typename T>
inline constexpr T _count_bits_5(const T & v)
{
return v & 0x0000003F;
}
template <typename T, bool greater_than_uint32>
struct _impl
{
static inline constexpr T _count_bits_with_shift(const T & v)
{
return
detail::_count_bits_5(
detail::_count_bits_4(
detail::_count_bits_3(
detail::_count_bits_2(
detail::_count_bits_1(
detail::_count_bits_0(v)))))) + count_bits(v >> 32);
}
};
template <typename T>
struct _impl<T, false>
{
static inline constexpr T _count_bits_with_shift(const T & v)
{
return 0;
}
};
}
template <typename T>
inline constexpr T count_bits(const T & v)
{
static_assert(std::is_integral<T>::value, "type T must be an integer");
static_assert(!std::is_signed<T>::value, "type T must be not signed");
return uint32_max >= v ?
detail::_count_bits_5(
detail::_count_bits_4(
detail::_count_bits_3(
detail::_count_bits_2(
detail::_count_bits_1(
detail::_count_bits_0(v)))))) :
detail::_impl<T, sizeof(uint32_t) < sizeof(v)>::_count_bits_with_shift(v);
}
谷歌测试库中的附加测试:
#include <stdlib.h>
#include <time.h>
namespace {
template <typename T>
inline uint32_t _test_count_bits(const T & v)
{
uint32_t count = 0;
T n = v;
while (n > 0) {
if (n % 2) {
count += 1;
}
n /= 2;
}
return count;
}
}
TEST(FunctionsTest, random_count_bits_uint32_100K)
{
srand(uint_t(time(NULL)));
for (uint32_t i = 0; i < 100000; i++) {
const uint32_t r = uint32_t(rand()) + (uint32_t(rand()) << 16);
ASSERT_EQ(_test_count_bits(r), count_bits(r));
}
}
TEST(FunctionsTest, random_count_bits_uint64_100K)
{
srand(uint_t(time(NULL)));
for (uint32_t i = 0; i < 100000; i++) {
const uint64_t r = uint64_t(rand()) + (uint64_t(rand()) << 16) + (uint64_t(rand()) << 32) + (uint64_t(rand()) << 48);
ASSERT_EQ(_test_count_bits(r), count_bits(r));
}
}
这是一个可移植的模块(ANSI-C),它可以在任何架构上对每个算法进行基准测试。
你的CPU有9位字节?目前它实现了2个算法,K&R算法和一个字节查找表。查找表的平均速度比K&R算法快3倍。如果有人能想出办法使“黑客的喜悦”算法可移植,请随意添加它。
#ifndef _BITCOUNT_H_
#define _BITCOUNT_H_
/* Return the Hamming Wieght of val, i.e. the number of 'on' bits. */
int bitcount( unsigned int );
/* List of available bitcount algorithms.
* onTheFly: Calculate the bitcount on demand.
*
* lookupTalbe: Uses a small lookup table to determine the bitcount. This
* method is on average 3 times as fast as onTheFly, but incurs a small
* upfront cost to initialize the lookup table on the first call.
*
* strategyCount is just a placeholder.
*/
enum strategy { onTheFly, lookupTable, strategyCount };
/* String represenations of the algorithm names */
extern const char *strategyNames[];
/* Choose which bitcount algorithm to use. */
void setStrategy( enum strategy );
#endif
.
#include <limits.h>
#include "bitcount.h"
/* The number of entries needed in the table is equal to the number of unique
* values a char can represent which is always UCHAR_MAX + 1*/
static unsigned char _bitCountTable[UCHAR_MAX + 1];
static unsigned int _lookupTableInitialized = 0;
static int _defaultBitCount( unsigned int val ) {
int count;
/* Starting with:
* 1100 - 1 == 1011, 1100 & 1011 == 1000
* 1000 - 1 == 0111, 1000 & 0111 == 0000
*/
for ( count = 0; val; ++count )
val &= val - 1;
return count;
}
/* Looks up each byte of the integer in a lookup table.
*
* The first time the function is called it initializes the lookup table.
*/
static int _tableBitCount( unsigned int val ) {
int bCount = 0;
if ( !_lookupTableInitialized ) {
unsigned int i;
for ( i = 0; i != UCHAR_MAX + 1; ++i )
_bitCountTable[i] =
( unsigned char )_defaultBitCount( i );
_lookupTableInitialized = 1;
}
for ( ; val; val >>= CHAR_BIT )
bCount += _bitCountTable[val & UCHAR_MAX];
return bCount;
}
static int ( *_bitcount ) ( unsigned int ) = _defaultBitCount;
const char *strategyNames[] = { "onTheFly", "lookupTable" };
void setStrategy( enum strategy s ) {
switch ( s ) {
case onTheFly:
_bitcount = _defaultBitCount;
break;
case lookupTable:
_bitcount = _tableBitCount;
break;
case strategyCount:
break;
}
}
/* Just a forwarding function which will call whichever version of the
* algorithm has been selected by the client
*/
int bitcount( unsigned int val ) {
return _bitcount( val );
}
#ifdef _BITCOUNT_EXE_
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Use the same sequence of pseudo random numbers to benmark each Hamming
* Weight algorithm.
*/
void benchmark( int reps ) {
clock_t start, stop;
int i, j;
static const int iterations = 1000000;
for ( j = 0; j != strategyCount; ++j ) {
setStrategy( j );
srand( 257 );
start = clock( );
for ( i = 0; i != reps * iterations; ++i )
bitcount( rand( ) );
stop = clock( );
printf
( "\n\t%d psudoe-random integers using %s: %f seconds\n\n",
reps * iterations, strategyNames[j],
( double )( stop - start ) / CLOCKS_PER_SEC );
}
}
int main( void ) {
int option;
while ( 1 ) {
printf( "Menu Options\n"
"\t1.\tPrint the Hamming Weight of an Integer\n"
"\t2.\tBenchmark Hamming Weight implementations\n"
"\t3.\tExit ( or cntl-d )\n\n\t" );
if ( scanf( "%d", &option ) == EOF )
break;
switch ( option ) {
case 1:
printf( "Please enter the integer: " );
if ( scanf( "%d", &option ) != EOF )
printf
( "The Hamming Weight of %d ( 0x%X ) is %d\n\n",
option, option, bitcount( option ) );
break;
case 2:
printf
( "Please select number of reps ( in millions ): " );
if ( scanf( "%d", &option ) != EOF )
benchmark( option );
break;
case 3:
goto EXIT;
break;
default:
printf( "Invalid option\n" );
}
}
EXIT:
printf( "\n" );
return 0;
}
#endif
一个简单的算法来计算设置位的数量:
int countbits(n) {
int count = 0;
while(n != 0) {
n = n & (n-1);
count++;
}
return count;
}
以11(1011)为例,尝试手动运行该算法。它应该对你有很大帮助!