代表数字7的8位像这样:
00000111
设置了三个比特。
确定32位整数中设置位数的算法是什么?
代表数字7的8位像这样:
00000111
设置了三个比特。
确定32位整数中设置位数的算法是什么?
当前回答
我发现了一个在数组中使用SIMD指令(SSSE3和AVX2)的位计数实现。它的性能比使用__popcnt64内禀函数要好2-2.5倍。
SSSE3版:
#include <smmintrin.h>
#include <stdint.h>
const __m128i Z = _mm_set1_epi8(0x0);
const __m128i F = _mm_set1_epi8(0xF);
//Vector with pre-calculated bit count:
const __m128i T = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
uint64_t BitCount(const uint8_t * src, size_t size)
{
__m128i _sum = _mm128_setzero_si128();
for (size_t i = 0; i < size; i += 16)
{
//load 16-byte vector
__m128i _src = _mm_loadu_si128((__m128i*)(src + i));
//get low 4 bit for every byte in vector
__m128i lo = _mm_and_si128(_src, F);
//sum precalculated value from T
_sum = _mm_add_epi64(_sum, _mm_sad_epu8(Z, _mm_shuffle_epi8(T, lo)));
//get high 4 bit for every byte in vector
__m128i hi = _mm_and_si128(_mm_srli_epi16(_src, 4), F);
//sum precalculated value from T
_sum = _mm_add_epi64(_sum, _mm_sad_epu8(Z, _mm_shuffle_epi8(T, hi)));
}
uint64_t sum[2];
_mm_storeu_si128((__m128i*)sum, _sum);
return sum[0] + sum[1];
}
AVX2 版本:
#include <immintrin.h>
#include <stdint.h>
const __m256i Z = _mm256_set1_epi8(0x0);
const __m256i F = _mm256_set1_epi8(0xF);
//Vector with pre-calculated bit count:
const __m256i T = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
uint64_t BitCount(const uint8_t * src, size_t size)
{
__m256i _sum = _mm256_setzero_si256();
for (size_t i = 0; i < size; i += 32)
{
//load 32-byte vector
__m256i _src = _mm256_loadu_si256((__m256i*)(src + i));
//get low 4 bit for every byte in vector
__m256i lo = _mm256_and_si256(_src, F);
//sum precalculated value from T
_sum = _mm256_add_epi64(_sum, _mm256_sad_epu8(Z, _mm256_shuffle_epi8(T, lo)));
//get high 4 bit for every byte in vector
__m256i hi = _mm256_and_si256(_mm256_srli_epi16(_src, 4), F);
//sum precalculated value from T
_sum = _mm256_add_epi64(_sum, _mm256_sad_epu8(Z, _mm256_shuffle_epi8(T, hi)));
}
uint64_t sum[4];
_mm256_storeu_si256((__m256i*)sum, _sum);
return sum[0] + sum[1] + sum[2] + sum[3];
}
其他回答
为什么不迭代地除以2呢?
count = 0 while n > 0 if (n % 2) == 1 count += 1 n /= 2
我同意这不是最快的,但是“最好”这个词有点含糊不清。我认为“最好”应该有一个清晰的元素
我个人使用这个:
public static int myBitCount(long L){
int count = 0;
while (L != 0) {
count++;
L ^= L & -L;
}
return count;
}
我使用下面的函数。我还没有检查基准测试,但它是有效的。
int msb(int num)
{
int m = 0;
for (int i = 16; i > 0; i = i>>1)
{
// debug(i, num, m);
if(num>>i)
{
m += i;
num>>=i;
}
}
return m;
}
public class BinaryCounter {
private int N;
public BinaryCounter(int N) {
this.N = N;
}
public static void main(String[] args) {
BinaryCounter counter=new BinaryCounter(7);
System.out.println("Number of ones is "+ counter.count());
}
public int count(){
if(N<=0) return 0;
int counter=0;
int K = 0;
do{
K = biggestPowerOfTwoSmallerThan(N);
N = N-K;
counter++;
}while (N != 0);
return counter;
}
private int biggestPowerOfTwoSmallerThan(int N) {
if(N==1) return 1;
for(int i=0;i<N;i++){
if(Math.pow(2, i) > N){
int power = i-1;
return (int) Math.pow(2, power);
}
}
return 0;
}
}
这是在golang中的实现
func CountBitSet(n int) int {
count := 0
for n > 0 {
count += n & 1
n >>= 1
}
return count
}