我一直认为std::vector是“作为数组实现的”,等等等等。今天我去测试了一下,结果似乎不是这样:

以下是一些测试结果:

UseArray completed in 2.619 seconds
UseVector completed in 9.284 seconds
UseVectorPushBack completed in 14.669 seconds
The whole thing completed in 26.591 seconds

这大约要慢3 - 4倍!这并不能证明“向量可能会慢几纳秒”的评论是正确的。

我使用的代码是:

#include <cstdlib>
#include <vector>

#include <iostream>
#include <string>

#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>

class TestTimer
{
    public:
        TestTimer(const std::string & name) : name(name),
            start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
        {
        }

        ~TestTimer()
        {
            using namespace std;
            using namespace boost;

            posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
            posix_time::time_duration d = now - start;

            cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
                " seconds" << endl;
        }

    private:
        std::string name;
        boost::posix_time::ptime start;
};

struct Pixel
{
    Pixel()
    {
    }

    Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
    {
    }

    unsigned char r, g, b;
};

void UseVector()
{
    TestTimer t("UseVector");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
        pixels.resize(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}

void UseVectorPushBack()
{
    TestTimer t("UseVectorPushBack");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
            pixels.reserve(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels.push_back(Pixel(255, 0, 0));
    }
}

void UseArray()
{
    TestTimer t("UseArray");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);

        for(int i = 0 ; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }

        free(pixels);
    }
}

int main()
{
    TestTimer t1("The whole thing");

    UseArray();
    UseVector();
    UseVectorPushBack();

    return 0;
}

我做错了吗?还是我刚刚打破了这个性能神话?

我使用Visual Studio 2005中的发布模式。


在Visual c++中,#define _SECURE_SCL 0将UseVector减少了一半(减少到4秒)。在我看来,这真的是件大事。


当前回答

这似乎取决于编译器标志。下面是一个基准代码:

#include <chrono>
#include <cmath>
#include <ctime>
#include <iostream>
#include <vector>


int main(){

    int size = 1000000; // reduce this number in case your program crashes
    int L = 10;

    std::cout << "size=" << size << " L=" << L << std::endl;
    {
        srand( time(0) );
        double * data = new double[size];
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of C style heap array:    " << duration << "ms\n";
        delete data;
    }

    {
        srand( 1 + time(0) );
        double data[size]; // technically, non-compliant with C++ standard.
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of C99 style stack array: " << duration << "ms\n";
    }

    {
        srand( 2 + time(0) );
        std::vector<double> data( size );
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of std::vector array:     " << duration << "ms\n";
    }

    return 0;
}

不同的优化标志给出不同的答案:

$ g++ -O0 benchmark.cpp 
$ ./a.out 
size=1000000 L=10
Calculation result is 181182
Duration of C style heap array:    118441ms
Calculation result is 181240
Duration of C99 style stack array: 104920ms
Calculation result is 181210
Duration of std::vector array:     124477ms
$g++ -O3 benchmark.cpp
$ ./a.out 
size=1000000 L=10
Calculation result is 181213
Duration of C style heap array:    107803ms
Calculation result is 181198
Duration of C99 style stack array: 87247ms
Calculation result is 181204
Duration of std::vector array:     89083ms
$ g++ -Ofast benchmark.cpp 
$ ./a.out 
size=1000000 L=10
Calculation result is 181164
Duration of C style heap array:    93530ms
Calculation result is 181179
Duration of C99 style stack array: 80620ms
Calculation result is 181191
Duration of std::vector array:     78830ms

您的确切结果会有所不同,但这在我的机器上是非常典型的。

其他回答

我的笔记本电脑是联想G770 (4gb内存)。

操作系统为Windows 7 64位(笔记本电脑版本)

编译器是MinGW 4.6.1。

IDE为Code::Blocks。

我测试了第一篇文章的源代码。

结果

O2优化

UseArray完成用时2.841秒

UseVector在2.548秒内完成

UseVectorPushBack在11.95秒内完成

全程用时17.342秒

系统暂停

O3优化

UseArray完成用时1.452秒

UseVector在2.514秒内完成

UseVectorPushBack在12.967秒内完成

全程用时16.937秒

在O3优化下,向量的性能更差。

如果你把循环改为

    pixels[i].r = i;
    pixels[i].g = i;
    pixels[i].b = i;

在O2和O3下,数组和矢量的速度几乎相同。

我不得不说我不是c++方面的专家。但要补充一些实验结果:

编译: gcc-6.2.0/bin/g++ -O3 -std=c++14 vector.cpp

机:

Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz 

OS:

2.6.32-642.13.1.el6.x86_64

输出:

UseArray completed in 0.167821 seconds
UseVector completed in 0.134402 seconds
UseConstructor completed in 0.134806 seconds
UseFillConstructor completed in 1.00279 seconds
UseVectorPushBack completed in 6.6887 seconds
The whole thing completed in 8.12888 seconds

这里我唯一感到奇怪的是“UseFillConstructor”的性能与“UseConstructor”相比。

代码:

void UseConstructor()
{
    TestTimer t("UseConstructor");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels(dimension*dimension);
        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}


void UseFillConstructor()
{
    TestTimer t("UseFillConstructor");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels(dimension*dimension, Pixel(255,0,0));
    }
}

因此提供的额外“值”大大降低了性能,我认为这是由于多次调用复制构造函数造成的。但是…

编译:

gcc-6.2.0/bin/g++ -std=c++14 -O vector.cpp

输出:

UseArray completed in 1.02464 seconds
UseVector completed in 1.31056 seconds
UseConstructor completed in 1.47413 seconds
UseFillConstructor completed in 1.01555 seconds
UseVectorPushBack completed in 6.9597 seconds
The whole thing completed in 11.7851 seconds

因此,在这种情况下,gcc优化非常重要,但当一个值作为默认值提供时,它帮不了你太多。这,其实是对我的学费。希望它能帮助新程序员选择哪种矢量初始化格式。

这似乎取决于编译器标志。下面是一个基准代码:

#include <chrono>
#include <cmath>
#include <ctime>
#include <iostream>
#include <vector>


int main(){

    int size = 1000000; // reduce this number in case your program crashes
    int L = 10;

    std::cout << "size=" << size << " L=" << L << std::endl;
    {
        srand( time(0) );
        double * data = new double[size];
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of C style heap array:    " << duration << "ms\n";
        delete data;
    }

    {
        srand( 1 + time(0) );
        double data[size]; // technically, non-compliant with C++ standard.
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of C99 style stack array: " << duration << "ms\n";
    }

    {
        srand( 2 + time(0) );
        std::vector<double> data( size );
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of std::vector array:     " << duration << "ms\n";
    }

    return 0;
}

不同的优化标志给出不同的答案:

$ g++ -O0 benchmark.cpp 
$ ./a.out 
size=1000000 L=10
Calculation result is 181182
Duration of C style heap array:    118441ms
Calculation result is 181240
Duration of C99 style stack array: 104920ms
Calculation result is 181210
Duration of std::vector array:     124477ms
$g++ -O3 benchmark.cpp
$ ./a.out 
size=1000000 L=10
Calculation result is 181213
Duration of C style heap array:    107803ms
Calculation result is 181198
Duration of C99 style stack array: 87247ms
Calculation result is 181204
Duration of std::vector array:     89083ms
$ g++ -Ofast benchmark.cpp 
$ ./a.out 
size=1000000 L=10
Calculation result is 181164
Duration of C style heap array:    93530ms
Calculation result is 181179
Duration of C99 style stack array: 80620ms
Calculation result is 181191
Duration of std::vector array:     78830ms

您的确切结果会有所不同,但这在我的机器上是非常典型的。

公平地说,您不能将c++实现与C实现进行比较,即我所说的malloc版本。Malloc不创建对象——它只分配原始内存。然后不调用构造函数就把内存当作对象,这是拙劣的c++(可能是无效的——我把这个问题留给语言律师吧)。

也就是说,简单地将malloc更改为新的Pixel[维度*维度]并自由删除[]个像素,这与您所拥有的Pixel的简单实现没有太大区别。下面是我的盒子(E6600, 64位)上的结果:

UseArray completed in 0.269 seconds
UseVector completed in 1.665 seconds
UseVectorPushBack completed in 7.309 seconds
The whole thing completed in 9.244 seconds

但随着一个微小的变化,情况发生了变化:

Pixel.h

struct Pixel
{
    Pixel();
    Pixel(unsigned char r, unsigned char g, unsigned char b);

    unsigned char r, g, b;
};

Pixel.cc

#include "Pixel.h"

Pixel::Pixel() {}
Pixel::Pixel(unsigned char r, unsigned char g, unsigned char b) 
  : r(r), g(g), b(b) {}

main.cc

#include "Pixel.h"
[rest of test harness without class Pixel]
[UseArray now uses new/delete not malloc/free]

编译如下:

$ g++ -O3 -c -o Pixel.o Pixel.cc
$ g++ -O3 -c -o main.o main.cc
$ g++ -o main main.o Pixel.o

我们得到了非常不同的结果:

UseArray completed in 2.78 seconds
UseVector completed in 1.651 seconds
UseVectorPushBack completed in 7.826 seconds
The whole thing completed in 12.258 seconds

使用Pixel的非内联构造函数,std::vector现在可以击败原始数组。

通过std::vector和std:allocator进行分配的复杂性似乎太多了,无法像简单的新Pixel[n]那样有效地进行优化。然而,我们可以看到问题仅仅是分配而不是vector访问,通过调整几个测试函数来创建vector/数组,将其移到循环之外:

void UseVector()
{
    TestTimer t("UseVector");

    int dimension = 999;
    std::vector<Pixel> pixels;
    pixels.resize(dimension * dimension);

    for(int i = 0; i < 1000; ++i)
    {
        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}

and

void UseArray()
{
    TestTimer t("UseArray");

    int dimension = 999;
    Pixel * pixels = new Pixel[dimension * dimension];

    for(int i = 0; i < 1000; ++i)
    {
        for(int i = 0 ; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
    delete [] pixels;
}

我们现在得到这些结果:

UseArray completed in 0.254 seconds
UseVector completed in 0.249 seconds
UseVectorPushBack completed in 7.298 seconds
The whole thing completed in 7.802 seconds

从这里我们可以了解到std::vector可以与原始数组进行访问,但是如果你需要多次创建和删除vector/数组,在元素的构造函数没有内联的情况下,创建一个复杂的对象将比创建一个简单的数组花费更多的时间。我不认为这很令人惊讶。

一些分析器数据(像素对齐为32位):

g++ -msse3 -O3 -ftree-vectorize -g test.cpp -DNDEBUG && ./a.out
UseVector completed in 3.123 seconds
UseArray completed in 1.847 seconds
UseVectorPushBack completed in 9.186 seconds
The whole thing completed in 14.159 seconds

Blah

andrey@nv:~$ opannotate --source libcchem/src/a.out  | grep "Total samples for file" -A3
Overflow stats not available
 * Total samples for file : "/usr/include/c++/4.4/ext/new_allocator.h"
 *
 * 141008 52.5367
 */
--
 * Total samples for file : "/home/andrey/libcchem/src/test.cpp"
 *
 *  61556 22.9345
 */
--
 * Total samples for file : "/usr/include/c++/4.4/bits/stl_vector.h"
 *
 *  41956 15.6320
 */
--
 * Total samples for file : "/usr/include/c++/4.4/bits/stl_uninitialized.h"
 *
 *  20956  7.8078
 */
--
 * Total samples for file : "/usr/include/c++/4.4/bits/stl_construct.h"
 *
 *   2923  1.0891
 */

在分配器:

               :      // _GLIBCXX_RESOLVE_LIB_DEFECTS
               :      // 402. wrong new expression in [some_] allocator::construct
               :      void
               :      construct(pointer __p, const _Tp& __val)
141008 52.5367 :      { ::new((void *)__p) _Tp(__val); }

向量:

               :void UseVector()
               :{ /* UseVector() total:  60121 22.3999 */
...
               :
               :
 10790  4.0201 :        for (int i = 0; i < dimension * dimension; ++i) {
               :
   495  0.1844 :            pixels[i].r = 255;
               :
 12618  4.7012 :            pixels[i].g = 0;
               :
  2253  0.8394 :            pixels[i].b = 0;
               :
               :        }

数组

               :void UseArray()
               :{ /* UseArray() total:  35191 13.1114 */
               :
...
               :
   136  0.0507 :        for (int i = 0; i < dimension * dimension; ++i) {
               :
  9897  3.6874 :            pixels[i].r = 255;
               :
  3511  1.3081 :            pixels[i].g = 0;
               :
 21647  8.0652 :            pixels[i].b = 0;

大部分开销都在复制构造函数中。例如,

    std::vector < Pixel > pixels;//(dimension * dimension, Pixel());

    pixels.reserve(dimension * dimension);

    for (int i = 0; i < dimension * dimension; ++i) {

        pixels[i].r = 255;

        pixels[i].g = 0;

        pixels[i].b = 0;
    }

它具有与数组相同的性能。