我一直认为std::vector是“作为数组实现的”,等等等等。今天我去测试了一下,结果似乎不是这样:

以下是一些测试结果:

UseArray completed in 2.619 seconds
UseVector completed in 9.284 seconds
UseVectorPushBack completed in 14.669 seconds
The whole thing completed in 26.591 seconds

这大约要慢3 - 4倍!这并不能证明“向量可能会慢几纳秒”的评论是正确的。

我使用的代码是:

#include <cstdlib>
#include <vector>

#include <iostream>
#include <string>

#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>

class TestTimer
{
    public:
        TestTimer(const std::string & name) : name(name),
            start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
        {
        }

        ~TestTimer()
        {
            using namespace std;
            using namespace boost;

            posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
            posix_time::time_duration d = now - start;

            cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
                " seconds" << endl;
        }

    private:
        std::string name;
        boost::posix_time::ptime start;
};

struct Pixel
{
    Pixel()
    {
    }

    Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
    {
    }

    unsigned char r, g, b;
};

void UseVector()
{
    TestTimer t("UseVector");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
        pixels.resize(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}

void UseVectorPushBack()
{
    TestTimer t("UseVectorPushBack");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
            pixels.reserve(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels.push_back(Pixel(255, 0, 0));
    }
}

void UseArray()
{
    TestTimer t("UseArray");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);

        for(int i = 0 ; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }

        free(pixels);
    }
}

int main()
{
    TestTimer t1("The whole thing");

    UseArray();
    UseVector();
    UseVectorPushBack();

    return 0;
}

我做错了吗?还是我刚刚打破了这个性能神话?

我使用Visual Studio 2005中的发布模式。


在Visual c++中,#define _SECURE_SCL 0将UseVector减少了一半(减少到4秒)。在我看来,这真的是件大事。


当前回答

好吧,因为vector::resize()比普通内存分配(由malloc)做更多的处理。

尝试在复制构造函数中设置断点(定义它以便可以设置断点!),就会增加处理时间。

其他回答

这似乎取决于编译器标志。下面是一个基准代码:

#include <chrono>
#include <cmath>
#include <ctime>
#include <iostream>
#include <vector>


int main(){

    int size = 1000000; // reduce this number in case your program crashes
    int L = 10;

    std::cout << "size=" << size << " L=" << L << std::endl;
    {
        srand( time(0) );
        double * data = new double[size];
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of C style heap array:    " << duration << "ms\n";
        delete data;
    }

    {
        srand( 1 + time(0) );
        double data[size]; // technically, non-compliant with C++ standard.
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of C99 style stack array: " << duration << "ms\n";
    }

    {
        srand( 2 + time(0) );
        std::vector<double> data( size );
        double result = 0.;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        for( int l = 0; l < L; l++ ) {
            for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
            for( int i = 0; i < size; i++ ) result += data[i] * data[i];
        }
        std::chrono::steady_clock::time_point end   = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        std::cout << "Calculation result is " << sqrt(result) << "\n";
        std::cout << "Duration of std::vector array:     " << duration << "ms\n";
    }

    return 0;
}

不同的优化标志给出不同的答案:

$ g++ -O0 benchmark.cpp 
$ ./a.out 
size=1000000 L=10
Calculation result is 181182
Duration of C style heap array:    118441ms
Calculation result is 181240
Duration of C99 style stack array: 104920ms
Calculation result is 181210
Duration of std::vector array:     124477ms
$g++ -O3 benchmark.cpp
$ ./a.out 
size=1000000 L=10
Calculation result is 181213
Duration of C style heap array:    107803ms
Calculation result is 181198
Duration of C99 style stack array: 87247ms
Calculation result is 181204
Duration of std::vector array:     89083ms
$ g++ -Ofast benchmark.cpp 
$ ./a.out 
size=1000000 L=10
Calculation result is 181164
Duration of C style heap array:    93530ms
Calculation result is 181179
Duration of C99 style stack array: 80620ms
Calculation result is 181191
Duration of std::vector array:     78830ms

您的确切结果会有所不同,但这在我的机器上是非常典型的。

向量类还调用Pixel构造函数。

每一种都会导致你在计时时运行近一百万次。

编辑:然后是外层…1000个循环,所以要做十亿次ctor调用!

编辑2:看到UseArray案例的分解会很有趣。优化器可以优化整个事情,因为它除了消耗CPU外没有其他效果。

一个更好的基准测试(我认为…),编译器由于优化可以改变代码,因为分配的向量/数组的结果不会在任何地方使用。 结果:

$ g++ test.cpp -o test -O3 -march=native
$ ./test 
UseArray inner completed in 0.652 seconds
UseArray completed in 0.773 seconds
UseVector inner completed in 0.638 seconds
UseVector completed in 0.757 seconds
UseVectorPushBack inner completed in 6.732 seconds
UseVectorPush completed in 6.856 seconds
The whole thing completed in 8.387 seconds

编译器:

gcc version 6.2.0 20161019 (Debian 6.2.0-9)

CPU:

model name  : Intel(R) Core(TM) i7-3630QM CPU @ 2.40GHz

代码是:

#include <cstdlib>
#include <vector>

#include <iostream>
#include <string>

#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>

class TestTimer
{
    public:
        TestTimer(const std::string & name) : name(name),
            start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
        {
        }

        ~TestTimer()
        {
            using namespace std;
            using namespace boost;

            posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
            posix_time::time_duration d = now - start;

            cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
                " seconds" << endl;
        }

    private:
        std::string name;
        boost::posix_time::ptime start;
};

struct Pixel
{
    Pixel()
    {
    }

    Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
    {
    }

    unsigned char r, g, b;
};

void UseVector(std::vector<std::vector<Pixel> >& results)
{
    TestTimer t("UseVector inner");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel>& pixels = results.at(i);
        pixels.resize(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}

void UseVectorPushBack(std::vector<std::vector<Pixel> >& results)
{
    TestTimer t("UseVectorPushBack inner");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel>& pixels = results.at(i);
            pixels.reserve(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels.push_back(Pixel(255, 0, 0));
    }
}

void UseArray(Pixel** results)
{
    TestTimer t("UseArray inner");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);

        results[i] = pixels;

        for(int i = 0 ; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }

        // free(pixels);
    }
}

void UseArray()
{
    TestTimer t("UseArray");
    Pixel** array = (Pixel**)malloc(sizeof(Pixel*)* 1000);
    UseArray(array);
    for(int i=0;i<1000;++i)
        free(array[i]);
    free(array);
}

void UseVector()
{
    TestTimer t("UseVector");
    {
        std::vector<std::vector<Pixel> > vector(1000, std::vector<Pixel>());
        UseVector(vector);
    }
}

void UseVectorPushBack()
{
    TestTimer t("UseVectorPush");
    {
        std::vector<std::vector<Pixel> > vector(1000, std::vector<Pixel>());
        UseVectorPushBack(vector);
    }
}


int main()
{
    TestTimer t1("The whole thing");

    UseArray();
    UseVector();
    UseVectorPushBack();

    return 0;
}

好吧,因为vector::resize()比普通内存分配(由malloc)做更多的处理。

尝试在复制构造函数中设置断点(定义它以便可以设置断点!),就会增加处理时间。

公平地说,您不能将c++实现与C实现进行比较,即我所说的malloc版本。Malloc不创建对象——它只分配原始内存。然后不调用构造函数就把内存当作对象,这是拙劣的c++(可能是无效的——我把这个问题留给语言律师吧)。

也就是说,简单地将malloc更改为新的Pixel[维度*维度]并自由删除[]个像素,这与您所拥有的Pixel的简单实现没有太大区别。下面是我的盒子(E6600, 64位)上的结果:

UseArray completed in 0.269 seconds
UseVector completed in 1.665 seconds
UseVectorPushBack completed in 7.309 seconds
The whole thing completed in 9.244 seconds

但随着一个微小的变化,情况发生了变化:

Pixel.h

struct Pixel
{
    Pixel();
    Pixel(unsigned char r, unsigned char g, unsigned char b);

    unsigned char r, g, b;
};

Pixel.cc

#include "Pixel.h"

Pixel::Pixel() {}
Pixel::Pixel(unsigned char r, unsigned char g, unsigned char b) 
  : r(r), g(g), b(b) {}

main.cc

#include "Pixel.h"
[rest of test harness without class Pixel]
[UseArray now uses new/delete not malloc/free]

编译如下:

$ g++ -O3 -c -o Pixel.o Pixel.cc
$ g++ -O3 -c -o main.o main.cc
$ g++ -o main main.o Pixel.o

我们得到了非常不同的结果:

UseArray completed in 2.78 seconds
UseVector completed in 1.651 seconds
UseVectorPushBack completed in 7.826 seconds
The whole thing completed in 12.258 seconds

使用Pixel的非内联构造函数,std::vector现在可以击败原始数组。

通过std::vector和std:allocator进行分配的复杂性似乎太多了,无法像简单的新Pixel[n]那样有效地进行优化。然而,我们可以看到问题仅仅是分配而不是vector访问,通过调整几个测试函数来创建vector/数组,将其移到循环之外:

void UseVector()
{
    TestTimer t("UseVector");

    int dimension = 999;
    std::vector<Pixel> pixels;
    pixels.resize(dimension * dimension);

    for(int i = 0; i < 1000; ++i)
    {
        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}

and

void UseArray()
{
    TestTimer t("UseArray");

    int dimension = 999;
    Pixel * pixels = new Pixel[dimension * dimension];

    for(int i = 0; i < 1000; ++i)
    {
        for(int i = 0 ; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
    delete [] pixels;
}

我们现在得到这些结果:

UseArray completed in 0.254 seconds
UseVector completed in 0.249 seconds
UseVectorPushBack completed in 7.298 seconds
The whole thing completed in 7.802 seconds

从这里我们可以了解到std::vector可以与原始数组进行访问,但是如果你需要多次创建和删除vector/数组,在元素的构造函数没有内联的情况下,创建一个复杂的对象将比创建一个简单的数组花费更多的时间。我不认为这很令人惊讶。