例如,乘法和除法可以使用位运算符来实现

i*2 = i<<1
i*3 = (i<<1) + i;
i*10 = (i<<3) + (i<<1)

等等。

用say (i<<3)+(i<<1)乘10是否比直接用i*10更快?有没有什么输入是不能用这种方法乘或除的?


当前回答

这完全取决于目标设备、语言、目的等。

像素压缩显卡驱动程序?很有可能,是的!

.NET业务应用程序为您的部门?根本没必要去调查。

对于一款面向移动设备的高性能游戏来说,这可能是值得一试的,但前提是要进行更简单的优化。

其他回答

据我所知,在一些机器上,乘法运算可能需要16到32个机器周期。是的,根据机器类型,位移运算符比乘除运算符快。

然而,某些机器确实有它们的数学处理器,其中包含乘法/除法的特殊指令。

我也想看看我能不能打败房子。这是一个更通用的任意数乘任意数的位乘法。我做的宏比普通的乘法要慢25%到两倍。正如其他人所说,如果它接近2的倍数或由几个2的倍数组成,你可能会赢。比如由(X<<4)+(X<<2)+(X<<1)+X组成的X*23要比由(X<<6)+X组成的X*65慢。

#include <stdio.h>
#include <time.h>

#define MULTIPLYINTBYMINUS(X,Y) (-((X >> 30) & 1)&(Y<<30))+(-((X >> 29) & 1)&(Y<<29))+(-((X >> 28) & 1)&(Y<<28))+(-((X >> 27) & 1)&(Y<<27))+(-((X >> 26) & 1)&(Y<<26))+(-((X >> 25) & 1)&(Y<<25))+(-((X >> 24) & 1)&(Y<<24))+(-((X >> 23) & 1)&(Y<<23))+(-((X >> 22) & 1)&(Y<<22))+(-((X >> 21) & 1)&(Y<<21))+(-((X >> 20) & 1)&(Y<<20))+(-((X >> 19) & 1)&(Y<<19))+(-((X >> 18) & 1)&(Y<<18))+(-((X >> 17) & 1)&(Y<<17))+(-((X >> 16) & 1)&(Y<<16))+(-((X >> 15) & 1)&(Y<<15))+(-((X >> 14) & 1)&(Y<<14))+(-((X >> 13) & 1)&(Y<<13))+(-((X >> 12) & 1)&(Y<<12))+(-((X >> 11) & 1)&(Y<<11))+(-((X >> 10) & 1)&(Y<<10))+(-((X >> 9) & 1)&(Y<<9))+(-((X >> 8) & 1)&(Y<<8))+(-((X >> 7) & 1)&(Y<<7))+(-((X >> 6) & 1)&(Y<<6))+(-((X >> 5) & 1)&(Y<<5))+(-((X >> 4) & 1)&(Y<<4))+(-((X >> 3) & 1)&(Y<<3))+(-((X >> 2) & 1)&(Y<<2))+(-((X >> 1) & 1)&(Y<<1))+(-((X >> 0) & 1)&(Y<<0))
#define MULTIPLYINTBYSHIFT(X,Y) (((((X >> 30) & 1)<<31)>>31)&(Y<<30))+(((((X >> 29) & 1)<<31)>>31)&(Y<<29))+(((((X >> 28) & 1)<<31)>>31)&(Y<<28))+(((((X >> 27) & 1)<<31)>>31)&(Y<<27))+(((((X >> 26) & 1)<<31)>>31)&(Y<<26))+(((((X >> 25) & 1)<<31)>>31)&(Y<<25))+(((((X >> 24) & 1)<<31)>>31)&(Y<<24))+(((((X >> 23) & 1)<<31)>>31)&(Y<<23))+(((((X >> 22) & 1)<<31)>>31)&(Y<<22))+(((((X >> 21) & 1)<<31)>>31)&(Y<<21))+(((((X >> 20) & 1)<<31)>>31)&(Y<<20))+(((((X >> 19) & 1)<<31)>>31)&(Y<<19))+(((((X >> 18) & 1)<<31)>>31)&(Y<<18))+(((((X >> 17) & 1)<<31)>>31)&(Y<<17))+(((((X >> 16) & 1)<<31)>>31)&(Y<<16))+(((((X >> 15) & 1)<<31)>>31)&(Y<<15))+(((((X >> 14) & 1)<<31)>>31)&(Y<<14))+(((((X >> 13) & 1)<<31)>>31)&(Y<<13))+(((((X >> 12) & 1)<<31)>>31)&(Y<<12))+(((((X >> 11) & 1)<<31)>>31)&(Y<<11))+(((((X >> 10) & 1)<<31)>>31)&(Y<<10))+(((((X >> 9) & 1)<<31)>>31)&(Y<<9))+(((((X >> 8) & 1)<<31)>>31)&(Y<<8))+(((((X >> 7) & 1)<<31)>>31)&(Y<<7))+(((((X >> 6) & 1)<<31)>>31)&(Y<<6))+(((((X >> 5) & 1)<<31)>>31)&(Y<<5))+(((((X >> 4) & 1)<<31)>>31)&(Y<<4))+(((((X >> 3) & 1)<<31)>>31)&(Y<<3))+(((((X >> 2) & 1)<<31)>>31)&(Y<<2))+(((((X >> 1) & 1)<<31)>>31)&(Y<<1))+(((((X >> 0) & 1)<<31)>>31)&(Y<<0))
int main()
{
    int randomnumber=23;
    int randomnumber2=23;
    int checknum=23;
    clock_t start, diff;
    srand(time(0));
    start = clock();
    for(int i=0;i<1000000;i++)
    {
        randomnumber = rand() % 10000;
        randomnumber2 = rand() % 10000;
        checknum=MULTIPLYINTBYMINUS(randomnumber,randomnumber2);
        if (checknum!=randomnumber*randomnumber2)
        {
            printf("s %i and %i and %i",checknum,randomnumber,randomnumber2);
        }
    }
    diff = clock() - start;
    int msec = diff * 1000 / CLOCKS_PER_SEC;
    printf("MULTIPLYINTBYMINUS Time %d milliseconds", msec);
    start = clock();
    for(int i=0;i<1000000;i++)
    {
        randomnumber = rand() % 10000;
        randomnumber2 = rand() % 10000;
        checknum=MULTIPLYINTBYSHIFT(randomnumber,randomnumber2);
        if (checknum!=randomnumber*randomnumber2)
        {
            printf("s %i and %i and %i",checknum,randomnumber,randomnumber2);
        }
    }
    diff = clock() - start;
    msec = diff * 1000 / CLOCKS_PER_SEC;
    printf("MULTIPLYINTBYSHIFT Time %d milliseconds", msec);
    start = clock();
    for(int i=0;i<1000000;i++)
    {
        randomnumber = rand() % 10000;
        randomnumber2 = rand() % 10000;
        checknum= randomnumber*randomnumber2;
        if (checknum!=randomnumber*randomnumber2)
        {
            printf("s %i and %i and %i",checknum,randomnumber,randomnumber2);
        }
    }
    diff = clock() - start;
    msec = diff * 1000 / CLOCKS_PER_SEC;
    printf("normal * Time %d milliseconds", msec);
    return 0;
}

有些优化编译器无法做到,因为它们只适用于减少的输入集。

下面是c++示例代码,可以执行更快的除法,执行64位“乘倒数”。分子和分母都必须低于某个阈值。注意,它必须被编译为使用64位指令才能比普通除法更快。

#include <stdio.h>
#include <chrono>

static const unsigned s_bc = 32;
static const unsigned long long s_p = 1ULL << s_bc;
static const unsigned long long s_hp = s_p / 2;

static unsigned long long s_f;
static unsigned long long s_fr;

static void fastDivInitialize(const unsigned d)
{
    s_f = s_p / d;
    s_fr = s_f * (s_p - (s_f * d));
}

static unsigned fastDiv(const unsigned n)
{
    return (s_f * n + ((s_fr * n + s_hp) >> s_bc)) >> s_bc;
}

static bool fastDivCheck(const unsigned n, const unsigned d)
{
    // 32 to 64 cycles latency on modern cpus
    const unsigned expected = n / d;

    // At least 10 cycles latency on modern cpus
    const unsigned result = fastDiv(n);

    if (result != expected)
    {
        printf("Failed for: %u/%u != %u\n", n, d, expected);
        return false;
    }

    return true;
}

int main()
{
    unsigned result = 0;

    // Make sure to verify it works for your expected set of inputs
    const unsigned MAX_N = 65535;
    const unsigned MAX_D = 40000;

    const double ONE_SECOND_COUNT = 1000000000.0;

    auto t0 = std::chrono::steady_clock::now();
    unsigned count = 0;
    printf("Verifying...\n");
    for (unsigned d = 1; d <= MAX_D; ++d)
    {
        fastDivInitialize(d);
        for (unsigned n = 0; n <= MAX_N; ++n)
        {
            count += !fastDivCheck(n, d);
        }
    }
    auto t1 = std::chrono::steady_clock::now();
    printf("Errors: %u / %u (%.4fs)\n", count, MAX_D * (MAX_N + 1), (t1 - t0).count() / ONE_SECOND_COUNT);

    t0 = t1;
    for (unsigned d = 1; d <= MAX_D; ++d)
    {
        fastDivInitialize(d);
        for (unsigned n = 0; n <= MAX_N; ++n)
        {
            result += fastDiv(n);
        }
    }
    t1 = std::chrono::steady_clock::now();
    printf("Fast division time: %.4fs\n", (t1 - t0).count() / ONE_SECOND_COUNT);

    t0 = t1;
    count = 0;
    for (unsigned d = 1; d <= MAX_D; ++d)
    {
        for (unsigned n = 0; n <= MAX_N; ++n)
        {
            result += n / d;
        }
    }
    t1 = std::chrono::steady_clock::now();
    printf("Normal division time: %.4fs\n", (t1 - t0).count() / ONE_SECOND_COUNT);

    getchar();
    return result;
}

它是否真的更快取决于实际使用的硬件和编译器。

In the case of signed integers and right shift vs division, it can make a difference. For negative numbers, the shift rounds rounds towards negative infinity whereas division rounds towards zero. Of course the compiler will change the division to something cheaper, but it will usually change it to something that has the same rounding behavior as division, because it is either unable to prove that the variable won't be negative or it simply doesn't care. So if you can prove that a number won't be negative or if you don't care which way it will round, you can do that optimization in a way that is more likely to make a difference.