很简单,什么是尾部调用优化?

更具体地说,有哪些小代码段可以应用,哪些地方不可以,并解释原因?


当前回答

GCC C最小可运行示例与x86拆装分析

让我们看看GCC如何通过查看生成的程序集自动执行尾部调用优化。

这将作为一个非常具体的例子,说明其他答案(如https://stackoverflow.com/a/9814654/895245)中提到的优化可以将递归函数调用转换为循环。

这反过来又节省了内存并提高了性能,因为内存访问通常是现在使程序变慢的主要原因。

作为输入,我们给GCC一个基于阶乘的非优化朴素堆栈:

tail_call.c

#include <stdio.h>
#include <stdlib.h>

unsigned factorial(unsigned n) {
    if (n == 1) {
        return 1;
    }
    return n * factorial(n - 1);
}

int main(int argc, char **argv) {
    int input;
    if (argc > 1) {
        input = strtoul(argv[1], NULL, 0);
    } else {
        input = 5;
    }
    printf("%u\n", factorial(input));
    return EXIT_SUCCESS;
}

GitHub上游。

编译和分解:

gcc -O1 -foptimize-sibling-calls -ggdb3 -std=c99 -Wall -Wextra -Wpedantic \
  -o tail_call.out tail_call.c
objdump -d tail_call.out

其中-foptimize-sibling-calls是根据man gcc泛化尾部调用的名称:

   -foptimize-sibling-calls
       Optimize sibling and tail recursive calls.

       Enabled at levels -O2, -O3, -Os.

如上所述:我如何检查gcc是否执行尾递归优化?

我选择-O1是因为:

优化不是用-O0完成的。我怀疑这是因为缺少必要的中间转换。 -O3产生了非常高效的代码,虽然它也是尾部调用优化。

使用-fno-optimize-sibling-calls进行反汇编:

0000000000001145 <factorial>:
    1145:       89 f8                   mov    %edi,%eax
    1147:       83 ff 01                cmp    $0x1,%edi
    114a:       74 10                   je     115c <factorial+0x17>
    114c:       53                      push   %rbx
    114d:       89 fb                   mov    %edi,%ebx
    114f:       8d 7f ff                lea    -0x1(%rdi),%edi
    1152:       e8 ee ff ff ff          callq  1145 <factorial>
    1157:       0f af c3                imul   %ebx,%eax
    115a:       5b                      pop    %rbx
    115b:       c3                      retq
    115c:       c3                      retq

-foptimize-sibling-calls:

0000000000001145 <factorial>:
    1145:       b8 01 00 00 00          mov    $0x1,%eax
    114a:       83 ff 01                cmp    $0x1,%edi
    114d:       74 0e                   je     115d <factorial+0x18>
    114f:       8d 57 ff                lea    -0x1(%rdi),%edx
    1152:       0f af c7                imul   %edi,%eax
    1155:       89 d7                   mov    %edx,%edi
    1157:       83 fa 01                cmp    $0x1,%edx
    115a:       75 f3                   jne    114f <factorial+0xa>
    115c:       c3                      retq
    115d:       89 f8                   mov    %edi,%eax
    115f:       c3                      retq

两者之间的关键区别在于:

the -fno-optimize-sibling-calls uses callq, which is the typical non-optimized function call. This instruction pushes the return address to the stack, therefore increasing it. Furthermore, this version also does push %rbx, which pushes %rbx to the stack. GCC does this because it stores edi, which is the first function argument (n) into ebx, then calls factorial. GCC needs to do this because it is preparing for another call to factorial, which will use the new edi == n-1. It chooses ebx because this register is callee-saved: What registers are preserved through a linux x86-64 function call so the subcall to factorial won't change it and lose n. the -foptimize-sibling-calls does not use any instructions that push to the stack: it only does goto jumps within factorial with the instructions je and jne. Therefore, this version is equivalent to a while loop, without any function calls. Stack usage is constant.

在Ubuntu 18.10, GCC 8.2中测试。

其他回答

GCC C最小可运行示例与x86拆装分析

让我们看看GCC如何通过查看生成的程序集自动执行尾部调用优化。

这将作为一个非常具体的例子,说明其他答案(如https://stackoverflow.com/a/9814654/895245)中提到的优化可以将递归函数调用转换为循环。

这反过来又节省了内存并提高了性能,因为内存访问通常是现在使程序变慢的主要原因。

作为输入,我们给GCC一个基于阶乘的非优化朴素堆栈:

tail_call.c

#include <stdio.h>
#include <stdlib.h>

unsigned factorial(unsigned n) {
    if (n == 1) {
        return 1;
    }
    return n * factorial(n - 1);
}

int main(int argc, char **argv) {
    int input;
    if (argc > 1) {
        input = strtoul(argv[1], NULL, 0);
    } else {
        input = 5;
    }
    printf("%u\n", factorial(input));
    return EXIT_SUCCESS;
}

GitHub上游。

编译和分解:

gcc -O1 -foptimize-sibling-calls -ggdb3 -std=c99 -Wall -Wextra -Wpedantic \
  -o tail_call.out tail_call.c
objdump -d tail_call.out

其中-foptimize-sibling-calls是根据man gcc泛化尾部调用的名称:

   -foptimize-sibling-calls
       Optimize sibling and tail recursive calls.

       Enabled at levels -O2, -O3, -Os.

如上所述:我如何检查gcc是否执行尾递归优化?

我选择-O1是因为:

优化不是用-O0完成的。我怀疑这是因为缺少必要的中间转换。 -O3产生了非常高效的代码,虽然它也是尾部调用优化。

使用-fno-optimize-sibling-calls进行反汇编:

0000000000001145 <factorial>:
    1145:       89 f8                   mov    %edi,%eax
    1147:       83 ff 01                cmp    $0x1,%edi
    114a:       74 10                   je     115c <factorial+0x17>
    114c:       53                      push   %rbx
    114d:       89 fb                   mov    %edi,%ebx
    114f:       8d 7f ff                lea    -0x1(%rdi),%edi
    1152:       e8 ee ff ff ff          callq  1145 <factorial>
    1157:       0f af c3                imul   %ebx,%eax
    115a:       5b                      pop    %rbx
    115b:       c3                      retq
    115c:       c3                      retq

-foptimize-sibling-calls:

0000000000001145 <factorial>:
    1145:       b8 01 00 00 00          mov    $0x1,%eax
    114a:       83 ff 01                cmp    $0x1,%edi
    114d:       74 0e                   je     115d <factorial+0x18>
    114f:       8d 57 ff                lea    -0x1(%rdi),%edx
    1152:       0f af c7                imul   %edi,%eax
    1155:       89 d7                   mov    %edx,%edi
    1157:       83 fa 01                cmp    $0x1,%edx
    115a:       75 f3                   jne    114f <factorial+0xa>
    115c:       c3                      retq
    115d:       89 f8                   mov    %edi,%eax
    115f:       c3                      retq

两者之间的关键区别在于:

the -fno-optimize-sibling-calls uses callq, which is the typical non-optimized function call. This instruction pushes the return address to the stack, therefore increasing it. Furthermore, this version also does push %rbx, which pushes %rbx to the stack. GCC does this because it stores edi, which is the first function argument (n) into ebx, then calls factorial. GCC needs to do this because it is preparing for another call to factorial, which will use the new edi == n-1. It chooses ebx because this register is callee-saved: What registers are preserved through a linux x86-64 function call so the subcall to factorial won't change it and lose n. the -foptimize-sibling-calls does not use any instructions that push to the stack: it only does goto jumps within factorial with the instructions je and jne. Therefore, this version is equivalent to a while loop, without any function calls. Stack usage is constant.

在Ubuntu 18.10, GCC 8.2中测试。

看这里:

http://tratt.net/laurie/tech_articles/articles/tail_call_optimization

你可能知道,递归函数调用会对堆栈造成严重破坏;堆栈空间很容易很快用完。尾部调用优化是一种方法,通过它你可以创建一个使用常量堆栈空间的递归式算法,因此它不会不断增长,你会得到堆栈错误。

尾部调用优化可以避免为函数分配新的堆栈框架,因为调用函数将简单地返回从被调用函数获得的值。最常见的用法是尾部递归,其中为利用尾部调用优化而编写的递归函数可以使用常量堆栈空间。

Scheme是少数几种在规范中保证任何实现都必须提供这种优化的编程语言之一,因此这里有Scheme中的阶乘函数的两个示例:

(define (fact x)
  (if (= x 0) 1
      (* x (fact (- x 1)))))

(define (fact x)
  (define (fact-tail x accum)
    (if (= x 0) accum
        (fact-tail (- x 1) (* x accum))))
  (fact-tail x 1))

第一个函数不是尾部递归的,因为当进行递归调用时,函数需要跟踪调用返回后需要与结果进行的乘法运算。因此,堆栈看起来如下所示:

(fact 3)
(* 3 (fact 2))
(* 3 (* 2 (fact 1)))
(* 3 (* 2 (* 1 (fact 0))))
(* 3 (* 2 (* 1 1)))
(* 3 (* 2 1))
(* 3 2)
6

相反,尾部递归阶乘的堆栈跟踪如下所示:

(fact 3)
(fact-tail 3 1)
(fact-tail 2 3)
(fact-tail 1 6)
(fact-tail 0 6)
6

正如您所看到的,对于每次调用fact-tail,我们只需要跟踪相同数量的数据,因为我们只是返回我们直接到达顶部的值。这意味着即使我要调用(事实1000000),我只需要与(事实3)相同的空间。对于非尾部递归事实,情况并非如此,如此大的值可能会导致堆栈溢出。

我们应该确保函数本身没有goto语句。函数调用是被调用函数的最后一个内容。 大规模递归可以使用它进行优化,但在小规模中,使函数调用成为尾部调用的指令开销降低了实际目的。 TCO可能导致一个永远运行的函数: 空白永恒() { 永恒(); }

在函数式语言中,尾部调用优化就好像函数调用可以返回部分求值的表达式作为结果,然后由调用者求值。

f x = g x

f6变成了g6。因此,如果实现可以返回g6作为结果,然后调用该表达式,它将保存一个堆栈帧。

Also

f x = if c x then g x else h x.

还原到f6到g6或h6。所以如果实现计算c6,发现它是真的,那么它可以减少,

if true then g x else h x ---> g x

f x ---> h x

一个简单的非尾部调用优化解释器可能是这样的,

class simple_expresion
{
    ...
public:
    virtual ximple_value *DoEvaluate() const = 0;
};

class simple_value
{
    ...
};

class simple_function : public simple_expresion
{
    ...
private:
    simple_expresion *m_Function;
    simple_expresion *m_Parameter;

public:
    virtual simple_value *DoEvaluate() const
    {
        vector<simple_expresion *> parameterList;
        parameterList->push_back(m_Parameter);
        return m_Function->Call(parameterList);
    }
};

class simple_if : public simple_function
{
private:
    simple_expresion *m_Condition;
    simple_expresion *m_Positive;
    simple_expresion *m_Negative;

public:
    simple_value *DoEvaluate() const
    {
        if (m_Condition.DoEvaluate()->IsTrue())
        {
            return m_Positive.DoEvaluate();
        }
        else
        {
            return m_Negative.DoEvaluate();
        }
    }
}

尾部调用优化解释器可能是这样的,

class tco_expresion
{
    ...
public:
    virtual tco_expresion *DoEvaluate() const = 0;
    virtual bool IsValue()
    {
        return false;
    }
};

class tco_value
{
    ...
public:
    virtual bool IsValue()
    {
        return true;
    }
};

class tco_function : public tco_expresion
{
    ...
private:
    tco_expresion *m_Function;
    tco_expresion *m_Parameter;

public:
    virtual tco_expression *DoEvaluate() const
    {
        vector< tco_expression *> parameterList;
        tco_expression *function = const_cast<SNI_Function *>(this);
        while (!function->IsValue())
        {
            function = function->DoCall(parameterList);
        }
        return function;
    }

    tco_expresion *DoCall(vector<tco_expresion *> &p_ParameterList)
    {
        p_ParameterList.push_back(m_Parameter);
        return m_Function;
    }
};

class tco_if : public tco_function
{
private:
    tco_expresion *m_Condition;
    tco_expresion *m_Positive;
    tco_expresion *m_Negative;

    tco_expresion *DoEvaluate() const
    {
        if (m_Condition.DoEvaluate()->IsTrue())
        {
            return m_Positive;
        }
        else
        {
            return m_Negative;
        }
    }
}