例如,乘法和除法可以使用位运算符来实现
i*2 = i<<1
i*3 = (i<<1) + i;
i*10 = (i<<3) + (i<<1)
等等。
用say (i<<3)+(i<<1)乘10是否比直接用i*10更快?有没有什么输入是不能用这种方法乘或除的?
例如,乘法和除法可以使用位运算符来实现
i*2 = i<<1
i*3 = (i<<1) + i;
i*10 = (i<<3) + (i<<1)
等等。
用say (i<<3)+(i<<1)乘10是否比直接用i*10更快?有没有什么输入是不能用这种方法乘或除的?
当前回答
不要这样做,除非你绝对需要这样做,并且你的代码意图是移位而不是乘法/除法。
在典型的日子里,你可能会节省一些机器周期(或松弛,因为编译器更知道优化什么),但成本并不值得——你把时间花在小细节上而不是实际的工作上,维护代码变得更加困难,你的同事会诅咒你。
对于高负载计算,您可能需要这样做,其中每个节省的周期意味着几分钟的运行时。但是,您应该一次优化一个地方,并每次都进行性能测试,看看您是否真的使它更快了,还是破坏了编译器逻辑。
其他回答
这完全取决于目标设备、语言、目的等。
像素压缩显卡驱动程序?很有可能,是的!
.NET业务应用程序为您的部门?根本没必要去调查。
对于一款面向移动设备的高性能游戏来说,这可能是值得一试的,但前提是要进行更简单的优化。
In the case of signed integers and right shift vs division, it can make a difference. For negative numbers, the shift rounds rounds towards negative infinity whereas division rounds towards zero. Of course the compiler will change the division to something cheaper, but it will usually change it to something that has the same rounding behavior as division, because it is either unable to prove that the variable won't be negative or it simply doesn't care. So if you can prove that a number won't be negative or if you don't care which way it will round, you can do that optimization in a way that is more likely to make a difference.
我也想看看我能不能打败房子。这是一个更通用的任意数乘任意数的位乘法。我做的宏比普通的乘法要慢25%到两倍。正如其他人所说,如果它接近2的倍数或由几个2的倍数组成,你可能会赢。比如由(X<<4)+(X<<2)+(X<<1)+X组成的X*23要比由(X<<6)+X组成的X*65慢。
#include <stdio.h>
#include <time.h>
#define MULTIPLYINTBYMINUS(X,Y) (-((X >> 30) & 1)&(Y<<30))+(-((X >> 29) & 1)&(Y<<29))+(-((X >> 28) & 1)&(Y<<28))+(-((X >> 27) & 1)&(Y<<27))+(-((X >> 26) & 1)&(Y<<26))+(-((X >> 25) & 1)&(Y<<25))+(-((X >> 24) & 1)&(Y<<24))+(-((X >> 23) & 1)&(Y<<23))+(-((X >> 22) & 1)&(Y<<22))+(-((X >> 21) & 1)&(Y<<21))+(-((X >> 20) & 1)&(Y<<20))+(-((X >> 19) & 1)&(Y<<19))+(-((X >> 18) & 1)&(Y<<18))+(-((X >> 17) & 1)&(Y<<17))+(-((X >> 16) & 1)&(Y<<16))+(-((X >> 15) & 1)&(Y<<15))+(-((X >> 14) & 1)&(Y<<14))+(-((X >> 13) & 1)&(Y<<13))+(-((X >> 12) & 1)&(Y<<12))+(-((X >> 11) & 1)&(Y<<11))+(-((X >> 10) & 1)&(Y<<10))+(-((X >> 9) & 1)&(Y<<9))+(-((X >> 8) & 1)&(Y<<8))+(-((X >> 7) & 1)&(Y<<7))+(-((X >> 6) & 1)&(Y<<6))+(-((X >> 5) & 1)&(Y<<5))+(-((X >> 4) & 1)&(Y<<4))+(-((X >> 3) & 1)&(Y<<3))+(-((X >> 2) & 1)&(Y<<2))+(-((X >> 1) & 1)&(Y<<1))+(-((X >> 0) & 1)&(Y<<0))
#define MULTIPLYINTBYSHIFT(X,Y) (((((X >> 30) & 1)<<31)>>31)&(Y<<30))+(((((X >> 29) & 1)<<31)>>31)&(Y<<29))+(((((X >> 28) & 1)<<31)>>31)&(Y<<28))+(((((X >> 27) & 1)<<31)>>31)&(Y<<27))+(((((X >> 26) & 1)<<31)>>31)&(Y<<26))+(((((X >> 25) & 1)<<31)>>31)&(Y<<25))+(((((X >> 24) & 1)<<31)>>31)&(Y<<24))+(((((X >> 23) & 1)<<31)>>31)&(Y<<23))+(((((X >> 22) & 1)<<31)>>31)&(Y<<22))+(((((X >> 21) & 1)<<31)>>31)&(Y<<21))+(((((X >> 20) & 1)<<31)>>31)&(Y<<20))+(((((X >> 19) & 1)<<31)>>31)&(Y<<19))+(((((X >> 18) & 1)<<31)>>31)&(Y<<18))+(((((X >> 17) & 1)<<31)>>31)&(Y<<17))+(((((X >> 16) & 1)<<31)>>31)&(Y<<16))+(((((X >> 15) & 1)<<31)>>31)&(Y<<15))+(((((X >> 14) & 1)<<31)>>31)&(Y<<14))+(((((X >> 13) & 1)<<31)>>31)&(Y<<13))+(((((X >> 12) & 1)<<31)>>31)&(Y<<12))+(((((X >> 11) & 1)<<31)>>31)&(Y<<11))+(((((X >> 10) & 1)<<31)>>31)&(Y<<10))+(((((X >> 9) & 1)<<31)>>31)&(Y<<9))+(((((X >> 8) & 1)<<31)>>31)&(Y<<8))+(((((X >> 7) & 1)<<31)>>31)&(Y<<7))+(((((X >> 6) & 1)<<31)>>31)&(Y<<6))+(((((X >> 5) & 1)<<31)>>31)&(Y<<5))+(((((X >> 4) & 1)<<31)>>31)&(Y<<4))+(((((X >> 3) & 1)<<31)>>31)&(Y<<3))+(((((X >> 2) & 1)<<31)>>31)&(Y<<2))+(((((X >> 1) & 1)<<31)>>31)&(Y<<1))+(((((X >> 0) & 1)<<31)>>31)&(Y<<0))
int main()
{
int randomnumber=23;
int randomnumber2=23;
int checknum=23;
clock_t start, diff;
srand(time(0));
start = clock();
for(int i=0;i<1000000;i++)
{
randomnumber = rand() % 10000;
randomnumber2 = rand() % 10000;
checknum=MULTIPLYINTBYMINUS(randomnumber,randomnumber2);
if (checknum!=randomnumber*randomnumber2)
{
printf("s %i and %i and %i",checknum,randomnumber,randomnumber2);
}
}
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("MULTIPLYINTBYMINUS Time %d milliseconds", msec);
start = clock();
for(int i=0;i<1000000;i++)
{
randomnumber = rand() % 10000;
randomnumber2 = rand() % 10000;
checknum=MULTIPLYINTBYSHIFT(randomnumber,randomnumber2);
if (checknum!=randomnumber*randomnumber2)
{
printf("s %i and %i and %i",checknum,randomnumber,randomnumber2);
}
}
diff = clock() - start;
msec = diff * 1000 / CLOCKS_PER_SEC;
printf("MULTIPLYINTBYSHIFT Time %d milliseconds", msec);
start = clock();
for(int i=0;i<1000000;i++)
{
randomnumber = rand() % 10000;
randomnumber2 = rand() % 10000;
checknum= randomnumber*randomnumber2;
if (checknum!=randomnumber*randomnumber2)
{
printf("s %i and %i and %i",checknum,randomnumber,randomnumber2);
}
}
diff = clock() - start;
msec = diff * 1000 / CLOCKS_PER_SEC;
printf("normal * Time %d milliseconds", msec);
return 0;
}
如果在gcc编译器上比较x+x, x*2和x<<1语法的输出,那么在x86汇编中会得到相同的结果:https://godbolt.org/z/JLpp0j
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi
mov eax, DWORD PTR [rbp-4]
add eax, eax
pop rbp
ret
因此,您可以认为gcc足够聪明,可以独立于您输入的内容确定自己的最佳解决方案。
有些优化编译器无法做到,因为它们只适用于减少的输入集。
下面是c++示例代码,可以执行更快的除法,执行64位“乘倒数”。分子和分母都必须低于某个阈值。注意,它必须被编译为使用64位指令才能比普通除法更快。
#include <stdio.h>
#include <chrono>
static const unsigned s_bc = 32;
static const unsigned long long s_p = 1ULL << s_bc;
static const unsigned long long s_hp = s_p / 2;
static unsigned long long s_f;
static unsigned long long s_fr;
static void fastDivInitialize(const unsigned d)
{
s_f = s_p / d;
s_fr = s_f * (s_p - (s_f * d));
}
static unsigned fastDiv(const unsigned n)
{
return (s_f * n + ((s_fr * n + s_hp) >> s_bc)) >> s_bc;
}
static bool fastDivCheck(const unsigned n, const unsigned d)
{
// 32 to 64 cycles latency on modern cpus
const unsigned expected = n / d;
// At least 10 cycles latency on modern cpus
const unsigned result = fastDiv(n);
if (result != expected)
{
printf("Failed for: %u/%u != %u\n", n, d, expected);
return false;
}
return true;
}
int main()
{
unsigned result = 0;
// Make sure to verify it works for your expected set of inputs
const unsigned MAX_N = 65535;
const unsigned MAX_D = 40000;
const double ONE_SECOND_COUNT = 1000000000.0;
auto t0 = std::chrono::steady_clock::now();
unsigned count = 0;
printf("Verifying...\n");
for (unsigned d = 1; d <= MAX_D; ++d)
{
fastDivInitialize(d);
for (unsigned n = 0; n <= MAX_N; ++n)
{
count += !fastDivCheck(n, d);
}
}
auto t1 = std::chrono::steady_clock::now();
printf("Errors: %u / %u (%.4fs)\n", count, MAX_D * (MAX_N + 1), (t1 - t0).count() / ONE_SECOND_COUNT);
t0 = t1;
for (unsigned d = 1; d <= MAX_D; ++d)
{
fastDivInitialize(d);
for (unsigned n = 0; n <= MAX_N; ++n)
{
result += fastDiv(n);
}
}
t1 = std::chrono::steady_clock::now();
printf("Fast division time: %.4fs\n", (t1 - t0).count() / ONE_SECOND_COUNT);
t0 = t1;
count = 0;
for (unsigned d = 1; d <= MAX_D; ++d)
{
for (unsigned n = 0; n <= MAX_N; ++n)
{
result += n / d;
}
}
t1 = std::chrono::steady_clock::now();
printf("Normal division time: %.4fs\n", (t1 - t0).count() / ONE_SECOND_COUNT);
getchar();
return result;
}