我一直以为“std::vector 是用数组实现的”之类的说法是公认的常识。今天我实际测试了一下,结果似乎并非如此:
以下是一些测试结果:
UseArray completed in 2.619 seconds
UseVector completed in 9.284 seconds
UseVectorPushBack completed in 14.669 seconds
The whole thing completed in 26.591 seconds
也就是说 vector 大约慢了 3 - 4 倍!这可远远不是那些评论里所说的“vector 可能会慢几纳秒”的程度。
我使用的代码是:
#include <cstdlib>
#include <vector>
#include <iostream>
#include <string>
#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>
class TestTimer
{
public:
TestTimer(const std::string & name) : name(name),
start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
{
}
~TestTimer()
{
using namespace std;
using namespace boost;
posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
posix_time::time_duration d = now - start;
cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
" seconds" << endl;
}
private:
std::string name;
boost::posix_time::ptime start;
};
// One RGB pixel with three unsigned char channels.
struct Pixel
{
    unsigned char r;
    unsigned char g;
    unsigned char b;

    // Default construction deliberately leaves all channels uninitialized.
    Pixel() {}

    // Sets the red, green and blue channels to the given values.
    Pixel(unsigned char r, unsigned char g, unsigned char b)
        : r(r),
          g(g),
          b(b)
    {}
};
// Benchmark: 1000 times, size a std::vector<Pixel> to 999*999 elements with
// resize() and then assign every pixel's channels through operator[].
// The elapsed time is reported by the TestTimer when the function returns.
void UseVector()
{
    TestTimer t("UseVector");

    for (int pass = 0; pass < 1000; ++pass)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
        pixels.resize(dimension * dimension);

        int idx = 0;
        while (idx < dimension * dimension)
        {
            pixels[idx].r = 255;
            pixels[idx].g = 0;
            pixels[idx].b = 0;
            ++idx;
        }
    }
}
// Benchmark: 1000 times, reserve room for 999*999 pixels and append that many
// Pixel(255, 0, 0) values with push_back().
// The elapsed time is reported by the TestTimer when the function returns.
void UseVectorPushBack()
{
    TestTimer t("UseVectorPushBack");

    for (int pass = 0; pass < 1000; ++pass)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
        pixels.reserve(dimension * dimension);

        int appended = 0;
        while (appended < dimension * dimension)
        {
            pixels.push_back(Pixel(255, 0, 0));
            ++appended;
        }
    }
}
void UseArray()
{
TestTimer t("UseArray");
for(int i = 0; i < 1000; ++i)
{
int dimension = 999;
Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);
for(int i = 0 ; i < dimension * dimension; ++i)
{
pixels[i].r = 255;
pixels[i].g = 0;
pixels[i].b = 0;
}
free(pixels);
}
}
// Runs the three benchmarks one after another; the enclosing timer reports the
// total for the whole run when main returns.
int main()
{
    TestTimer t1("The whole thing");

    UseArray();
    UseVector();
    UseVectorPushBack();

    return 0;
}
我做错了吗?还是我刚刚打破了这个性能神话?
我使用Visual Studio 2005中的发布模式。
在 Visual C++ 中,加上 #define _SECURE_SCL 0 可以让 UseVector 的耗时减少一半(降到 4 秒)。在我看来,这个差别真的非常大。
使用以下命令编译并运行:
g++ -O3 Time.cpp -I <MyBoost>
./a.out
UseArray完成用时2.196秒
UseVector在4.412秒内完成
UseVectorPushBack在8.017秒内完成
全程用时14.626秒
数组的速度是向量的两倍。
但在更仔细地查看代码之后,这个结果是意料之中的:因为 vector 被遍历了两次,而数组只被遍历了一次。注意:当你对 vector 调用 resize() 时,不仅是在分配内存,还会遍历整个 vector 并对每个元素调用构造函数。
稍微重新排列代码,使vector只初始化每个对象一次:
// Construct the vector already filled with Pixel(255,0,0), so each element is
// initialized exactly once instead of being default-constructed and then overwritten.
// NOTE(review): `dimensions` presumably refers to the `dimension` variable used in
// UseVector() — confirm the name when pasting this line into that function.
std::vector<Pixel> pixels(dimensions * dimensions, Pixel(255,0,0));
现在再做一次同样的计时:
g++ -O3 Time.cpp -I <MyBoost>
./a.out
UseVector在2.216秒内完成
vector现在的性能只比数组差一点点。在我看来,这种差异是微不足道的,可能是由一大堆与测试无关的事情造成的。
另外还要考虑到,UseArray() 中并没有正确地初始化/销毁 Pixel 对象,因为构造函数和析构函数都没有被调用(对这个简单的类来说可能不是问题,但任何稍微复杂一点的类(例如带有指针成员的类)都会因此出问题)。
一些分析器数据(像素对齐为32位):
g++ -msse3 -O3 -ftree-vectorize -g test.cpp -DNDEBUG && ./a.out
UseVector completed in 3.123 seconds
UseArray completed in 1.847 seconds
UseVectorPushBack completed in 9.186 seconds
The whole thing completed in 14.159 seconds
Blah
andrey@nv:~$ opannotate --source libcchem/src/a.out | grep "Total samples for file" -A3
Overflow stats not available
* Total samples for file : "/usr/include/c++/4.4/ext/new_allocator.h"
*
* 141008 52.5367
*/
--
* Total samples for file : "/home/andrey/libcchem/src/test.cpp"
*
* 61556 22.9345
*/
--
* Total samples for file : "/usr/include/c++/4.4/bits/stl_vector.h"
*
* 41956 15.6320
*/
--
* Total samples for file : "/usr/include/c++/4.4/bits/stl_uninitialized.h"
*
* 20956 7.8078
*/
--
* Total samples for file : "/usr/include/c++/4.4/bits/stl_construct.h"
*
* 2923 1.0891
*/
在分配器:
: // _GLIBCXX_RESOLVE_LIB_DEFECTS
: // 402. wrong new expression in [some_] allocator::construct
: void
: construct(pointer __p, const _Tp& __val)
141008 52.5367 : { ::new((void *)__p) _Tp(__val); }
向量:
:void UseVector()
:{ /* UseVector() total: 60121 22.3999 */
...
:
:
10790 4.0201 : for (int i = 0; i < dimension * dimension; ++i) {
:
495 0.1844 : pixels[i].r = 255;
:
12618 4.7012 : pixels[i].g = 0;
:
2253 0.8394 : pixels[i].b = 0;
:
: }
数组
:void UseArray()
:{ /* UseArray() total: 35191 13.1114 */
:
...
:
136 0.0507 : for (int i = 0; i < dimension * dimension; ++i) {
:
9897 3.6874 : pixels[i].r = 255;
:
3511 1.3081 : pixels[i].g = 0;
:
21647 8.0652 : pixels[i].b = 0;
大部分开销都在复制构造函数中。例如,
// Variant used only to show how much of the time goes into element construction.
// NOTE: reserve() only allocates capacity and leaves size() at 0, so the pixels[i]
// accesses below index past the end of the vector, which is undefined behavior.
// Do not use this pattern outside of this experiment.
std::vector < Pixel > pixels;//(dimension * dimension, Pixel());
pixels.reserve(dimension * dimension);
for (int i = 0; i < dimension * dimension; ++i) {
pixels[i].r = 255;
pixels[i].g = 0;
pixels[i].b = 0;
}
它具有与数组相同的性能。
试试这个:
// Benchmark: 1000 times, build a vector of 999*999 pixels directly through the
// (count, value) constructor so each element is created as Pixel(255, 0, 0).
// The elapsed time is reported by the TestTimer when the function returns.
void UseVectorCtor()
{
    TestTimer t("UseConstructor");

    int pass = 0;
    while (pass < 1000)
    {
        int dimension = 999;
        std::vector<Pixel> pixels(dimension * dimension, Pixel(255, 0, 0));
        ++pass;
    }
}
我得到了和数组几乎完全一样的性能。
The thing about vector is that it's a much more general tool than an array, which means you have to consider how you use it. It can be used in a lot of different ways and provides functionality that an array doesn't even have. If you use it "wrong" for your purpose, you incur a lot of overhead, but if you use it correctly, it is basically a zero-overhead data structure. In this case, the problem is that you first initialized the vector separately (causing every element's default constructor to be called) and then overwrote each element individually with the correct value. That is much harder for the compiler to optimize away than the equivalent code for an array, which is why vector provides a constructor that lets you do exactly that: initialize N elements with value X.
当你使用它时,向量和数组一样快。
所以,你还没有打破性能神话。但是你已经证明了只有当你最优地使用向量时它才成立,这也是一个很好的观点。:)
好的一面是,最简单的用法恰恰就是最快的用法。如果把我的代码片段(只有一行)与 John Kugelman 的答案对比一下——后者包含了大量的调整和优化,却仍然不能完全消除性能差异——就能很清楚地看出 vector 的设计非常巧妙。你不必费尽周折才能得到与数组相当的速度;恰恰相反,你只需要使用最简单的解决方案。
顺便说一下,你在使用vector的类中看到的减速也发生在标准类型中,比如int。这是一个多线程代码:
#include <iostream>
#include <cstdio>
#include <map>
#include <string>
#include <typeinfo>
#include <vector>
#include <pthread.h>
#include <sstream>
#include <fstream>
// Pulls all of namespace std into file scope; the code below relies on this for
// unqualified names such as vector, cout and endl.
using namespace std;
// Disabled mutex declaration; no code below locks or otherwise refers to it.
//pthread_mutex_t map_mutex=PTHREAD_MUTEX_INITIALIZER;
// Element count used for both the vector<int> and the raw int array built in main.
long long num=500000000;
// Number of worker threads main starts; also passed to each worker as its stride.
int procs=1;
// Work description handed to one worker thread.
struct iterate
{
    int id;         // first index the worker visits
    int num;        // step added to the index after each visit
    void * member;  // untyped pointer to the data the worker walks

    // Stores the three values as given; no validation is performed.
    iterate(int a, int b, void *c)
    {
        id = a;
        num = b;
        member = c;
    }
};
//fill out viterate and piterate
// Thread entry point: walks the std::vector<int> referenced by a task.
//
// input - pointer to an `iterate` task. Its `member` must point to a
//         std::vector<int>; `id` is the first index visited and `num` is the
//         step between visited indices.
// Does not return normally: the thread ends through pthread_exit(NULL).
void * viterate(void * input)
{
    printf("am in viterate\n");
    iterate * info = static_cast<iterate *>(input);

    // Bind by reference. Copy-initializing a local vector here duplicated the
    // entire container inside every thread before the loop even started.
    const std::vector<int> & test = *static_cast<std::vector<int> *>(info->member);

    // Use the container's own unsigned index type so the comparison with
    // size() is not a mixed signed/unsigned comparison.
    const std::vector<int>::size_type count = test.size();
    for (std::vector<int>::size_type i = info->id; i < count; i += info->num)
    {
        //printf("am in viterate loop\n");
        test[i];  // element access only; the value is intentionally discarded
    }
    pthread_exit(NULL);
}
// Thread entry point: walks a raw int array referenced by a task.
//
// input - pointer to an `iterate` task. Its `member` is cast to int* and
//         indexed up to the global `num`, so it must point to the first of at
//         least `num` ints; `id` is the first index visited and `num` (the
//         task field) is the step between visited indices.
// Does not return normally: the thread ends through pthread_exit(NULL).
void * piterate(void * input)
{
    printf("am in piterate\n");
    iterate * info = static_cast<iterate *>(input);
    int * test = static_cast<int *>(info->member);

    // The bound is the long long global `num`; a long long index avoids signed
    // int overflow when the element count exceeds INT_MAX.
    for (long long i = info->id; i < num; i += info->num)
    {
        //printf("am in piterate loop\n");
        test[i];  // element access only; the value is intentionally discarded
    }
    pthread_exit(NULL);
}
int main()
{
cout<<"producing vector of size "<<num<<endl;
vector<int> vtest(num);
cout<<"produced a vector of size "<<vtest.size()<<endl;
pthread_t thread[procs];
iterate** it=new iterate*[procs];
int ans;
void *status;
cout<<"begining to thread through the vector\n";
for (int i=0; i<procs; i++) {
it[i]=new iterate(i, procs, (void *) &vtest);
// ans=pthread_create(&thread[i],NULL,viterate, (void *) it[i]);
}
for (int i=0; i<procs; i++) {
pthread_join(thread[i], &status);
}
cout<<"end of threading through the vector";
//reuse the iterate structures
cout<<"producing a pointer with size "<<num<<endl;
int * pint=new int[num];
cout<<"produced a pointer with size "<<num<<endl;
cout<<"begining to thread through the pointer\n";
for (int i=0; i<procs; i++) {
it[i]->member=&pint;
ans=pthread_create(&thread[i], NULL, piterate, (void*) it[i]);
}
for (int i=0; i<procs; i++) {
pthread_join(thread[i], &status);
}
cout<<"end of threading through the pointer\n";
//delete structure array for iterate
for (int i=0; i<procs; i++) {
delete it[i];
}
delete [] it;
//delete pointer
delete [] pint;
cout<<"end of the program"<<endl;
return 0;
}
这段代码的运行情况表明,vector 的实例化是其中耗时最长的部分。一旦过了这个瓶颈,其余的代码运行得非常快。无论用多少个线程运行,情况都是如此。
顺便说一下,请忽略那一大堆 #include。我一直用这段代码为某个项目做各种测试,所以包含的头文件越来越多。