我一直认为std::vector是“作为数组实现的”,等等等等。今天我去测试了一下,结果似乎不是这样:
以下是一些测试结果:
UseArray completed in 2.619 seconds
UseVector completed in 9.284 seconds
UseVectorPushBack completed in 14.669 seconds
The whole thing completed in 26.591 seconds
这大约要慢3 - 4倍!这并不能证明“向量可能会慢几纳秒”的评论是正确的。
我使用的代码是:
#include <cstdlib>
#include <vector>
#include <iostream>
#include <string>
#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>
class TestTimer
{
public:
TestTimer(const std::string & name) : name(name),
start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
{
}
~TestTimer()
{
using namespace std;
using namespace boost;
posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
posix_time::time_duration d = now - start;
cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
" seconds" << endl;
}
private:
std::string name;
boost::posix_time::ptime start;
};
struct Pixel
{
Pixel()
{
}
Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
{
}
unsigned char r, g, b;
};
void UseVector()
{
TestTimer t("UseVector");
for(int i = 0; i < 1000; ++i)
{
int dimension = 999;
std::vector<Pixel> pixels;
pixels.resize(dimension * dimension);
for(int i = 0; i < dimension * dimension; ++i)
{
pixels[i].r = 255;
pixels[i].g = 0;
pixels[i].b = 0;
}
}
}
void UseVectorPushBack()
{
TestTimer t("UseVectorPushBack");
for(int i = 0; i < 1000; ++i)
{
int dimension = 999;
std::vector<Pixel> pixels;
pixels.reserve(dimension * dimension);
for(int i = 0; i < dimension * dimension; ++i)
pixels.push_back(Pixel(255, 0, 0));
}
}
void UseArray()
{
TestTimer t("UseArray");
for(int i = 0; i < 1000; ++i)
{
int dimension = 999;
Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);
for(int i = 0 ; i < dimension * dimension; ++i)
{
pixels[i].r = 255;
pixels[i].g = 0;
pixels[i].b = 0;
}
free(pixels);
}
}
int main()
{
TestTimer t1("The whole thing");
UseArray();
UseVector();
UseVectorPushBack();
return 0;
}
我做错了吗?还是我刚刚打破了这个性能神话?
我使用Visual Studio 2005中的发布模式。
在Visual c++中,#define _SECURE_SCL 0将UseVector减少了一半(减少到4秒)。在我看来,这真的是件大事。
顺便说一下,你在使用vector的类中看到的减速也发生在标准类型中,比如int。这是一个多线程代码:
#include <iostream>
#include <cstdio>
#include <map>
#include <string>
#include <typeinfo>
#include <vector>
#include <pthread.h>
#include <sstream>
#include <fstream>
using namespace std;
//pthread_mutex_t map_mutex=PTHREAD_MUTEX_INITIALIZER;
long long num=500000000;
int procs=1;
struct iterate
{
int id;
int num;
void * member;
iterate(int a, int b, void *c) : id(a), num(b), member(c) {}
};
//fill out viterate and piterate
void * viterate(void * input)
{
printf("am in viterate\n");
iterate * info=static_cast<iterate *> (input);
// reproduce member type
vector<int> test= *static_cast<vector<int>*> (info->member);
for (int i=info->id; i<test.size(); i+=info->num)
{
//printf("am in viterate loop\n");
test[i];
}
pthread_exit(NULL);
}
void * piterate(void * input)
{
printf("am in piterate\n");
iterate * info=static_cast<iterate *> (input);;
int * test=static_cast<int *> (info->member);
for (int i=info->id; i<num; i+=info->num) {
//printf("am in piterate loop\n");
test[i];
}
pthread_exit(NULL);
}
int main()
{
cout<<"producing vector of size "<<num<<endl;
vector<int> vtest(num);
cout<<"produced a vector of size "<<vtest.size()<<endl;
pthread_t thread[procs];
iterate** it=new iterate*[procs];
int ans;
void *status;
cout<<"begining to thread through the vector\n";
for (int i=0; i<procs; i++) {
it[i]=new iterate(i, procs, (void *) &vtest);
// ans=pthread_create(&thread[i],NULL,viterate, (void *) it[i]);
}
for (int i=0; i<procs; i++) {
pthread_join(thread[i], &status);
}
cout<<"end of threading through the vector";
//reuse the iterate structures
cout<<"producing a pointer with size "<<num<<endl;
int * pint=new int[num];
cout<<"produced a pointer with size "<<num<<endl;
cout<<"begining to thread through the pointer\n";
for (int i=0; i<procs; i++) {
it[i]->member=&pint;
ans=pthread_create(&thread[i], NULL, piterate, (void*) it[i]);
}
for (int i=0; i<procs; i++) {
pthread_join(thread[i], &status);
}
cout<<"end of threading through the pointer\n";
//delete structure array for iterate
for (int i=0; i<procs; i++) {
delete it[i];
}
delete [] it;
//delete pointer
delete [] pint;
cout<<"end of the program"<<endl;
return 0;
}
代码中的行为表明vector的实例化是代码中最长的部分。一旦你通过瓶颈。其余的代码运行得非常快。无论在多少个线程上运行,这都是正确的。
顺便说一下,忽略那些疯狂的包含数。我一直在使用这段代码来测试一个项目的东西,所以包含的数量不断增长。
我只是想提一下vector(和smart_ptr)只是原始数组(和原始指针)上的一个薄层。
实际上在连续存储器中向量的访问时间比数组快。
下面的代码显示了初始化和访问向量和数组的结果。
#include <boost/date_time/posix_time/posix_time.hpp>
#include <iostream>
#include <vector>
#define SIZE 20000
int main() {
srand (time(NULL));
vector<vector<int>> vector2d;
vector2d.reserve(SIZE);
int index(0);
boost::posix_time::ptime start_total = boost::posix_time::microsec_clock::local_time();
// timer start - build + access
for (int i = 0; i < SIZE; i++) {
vector2d.push_back(vector<int>(SIZE));
}
boost::posix_time::ptime start_access = boost::posix_time::microsec_clock::local_time();
// timer start - access
for (int i = 0; i < SIZE; i++) {
index = rand()%SIZE;
for (int j = 0; j < SIZE; j++) {
vector2d[index][index]++;
}
}
boost::posix_time::ptime end = boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration msdiff = end - start_total;
cout << "Vector total time: " << msdiff.total_milliseconds() << "milliseconds.\n";
msdiff = end - start_acess;
cout << "Vector access time: " << msdiff.total_milliseconds() << "milliseconds.\n";
int index(0);
int** raw2d = nullptr;
raw2d = new int*[SIZE];
start_total = boost::posix_time::microsec_clock::local_time();
// timer start - build + access
for (int i = 0; i < SIZE; i++) {
raw2d[i] = new int[SIZE];
}
start_access = boost::posix_time::microsec_clock::local_time();
// timer start - access
for (int i = 0; i < SIZE; i++) {
index = rand()%SIZE;
for (int j = 0; j < SIZE; j++) {
raw2d[index][index]++;
}
}
end = boost::posix_time::microsec_clock::local_time();
msdiff = end - start_total;
cout << "Array total time: " << msdiff.total_milliseconds() << "milliseconds.\n";
msdiff = end - start_acess;
cout << "Array access time: " << msdiff.total_milliseconds() << "milliseconds.\n";
for (int i = 0; i < SIZE; i++) {
delete [] raw2d[i];
}
return 0;
}
输出结果为:
Vector total time: 925milliseconds.
Vector access time: 4milliseconds.
Array total time: 30milliseconds.
Array access time: 21milliseconds.
所以如果使用得当,速度几乎是一样的。
(正如其他人提到的使用reserve()或resize())。
这似乎取决于编译器标志。下面是一个基准代码:
#include <chrono>
#include <cmath>
#include <ctime>
#include <iostream>
#include <vector>
int main(){
int size = 1000000; // reduce this number in case your program crashes
int L = 10;
std::cout << "size=" << size << " L=" << L << std::endl;
{
srand( time(0) );
double * data = new double[size];
double result = 0.;
std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
for( int l = 0; l < L; l++ ) {
for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
for( int i = 0; i < size; i++ ) result += data[i] * data[i];
}
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
std::cout << "Calculation result is " << sqrt(result) << "\n";
std::cout << "Duration of C style heap array: " << duration << "ms\n";
delete data;
}
{
srand( 1 + time(0) );
double data[size]; // technically, non-compliant with C++ standard.
double result = 0.;
std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
for( int l = 0; l < L; l++ ) {
for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
for( int i = 0; i < size; i++ ) result += data[i] * data[i];
}
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
std::cout << "Calculation result is " << sqrt(result) << "\n";
std::cout << "Duration of C99 style stack array: " << duration << "ms\n";
}
{
srand( 2 + time(0) );
std::vector<double> data( size );
double result = 0.;
std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
for( int l = 0; l < L; l++ ) {
for( int i = 0; i < size; i++ ) data[i] = rand() % 100;
for( int i = 0; i < size; i++ ) result += data[i] * data[i];
}
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
std::cout << "Calculation result is " << sqrt(result) << "\n";
std::cout << "Duration of std::vector array: " << duration << "ms\n";
}
return 0;
}
不同的优化标志给出不同的答案:
$ g++ -O0 benchmark.cpp
$ ./a.out
size=1000000 L=10
Calculation result is 181182
Duration of C style heap array: 118441ms
Calculation result is 181240
Duration of C99 style stack array: 104920ms
Calculation result is 181210
Duration of std::vector array: 124477ms
$g++ -O3 benchmark.cpp
$ ./a.out
size=1000000 L=10
Calculation result is 181213
Duration of C style heap array: 107803ms
Calculation result is 181198
Duration of C99 style stack array: 87247ms
Calculation result is 181204
Duration of std::vector array: 89083ms
$ g++ -Ofast benchmark.cpp
$ ./a.out
size=1000000 L=10
Calculation result is 181164
Duration of C style heap array: 93530ms
Calculation result is 181179
Duration of C99 style stack array: 80620ms
Calculation result is 181191
Duration of std::vector array: 78830ms
您的确切结果会有所不同,但这在我的机器上是非常典型的。
一些分析器数据(像素对齐为32位):
g++ -msse3 -O3 -ftree-vectorize -g test.cpp -DNDEBUG && ./a.out
UseVector completed in 3.123 seconds
UseArray completed in 1.847 seconds
UseVectorPushBack completed in 9.186 seconds
The whole thing completed in 14.159 seconds
Blah
andrey@nv:~$ opannotate --source libcchem/src/a.out | grep "Total samples for file" -A3
Overflow stats not available
* Total samples for file : "/usr/include/c++/4.4/ext/new_allocator.h"
*
* 141008 52.5367
*/
--
* Total samples for file : "/home/andrey/libcchem/src/test.cpp"
*
* 61556 22.9345
*/
--
* Total samples for file : "/usr/include/c++/4.4/bits/stl_vector.h"
*
* 41956 15.6320
*/
--
* Total samples for file : "/usr/include/c++/4.4/bits/stl_uninitialized.h"
*
* 20956 7.8078
*/
--
* Total samples for file : "/usr/include/c++/4.4/bits/stl_construct.h"
*
* 2923 1.0891
*/
在分配器:
: // _GLIBCXX_RESOLVE_LIB_DEFECTS
: // 402. wrong new expression in [some_] allocator::construct
: void
: construct(pointer __p, const _Tp& __val)
141008 52.5367 : { ::new((void *)__p) _Tp(__val); }
向量:
:void UseVector()
:{ /* UseVector() total: 60121 22.3999 */
...
:
:
10790 4.0201 : for (int i = 0; i < dimension * dimension; ++i) {
:
495 0.1844 : pixels[i].r = 255;
:
12618 4.7012 : pixels[i].g = 0;
:
2253 0.8394 : pixels[i].b = 0;
:
: }
数组
:void UseArray()
:{ /* UseArray() total: 35191 13.1114 */
:
...
:
136 0.0507 : for (int i = 0; i < dimension * dimension; ++i) {
:
9897 3.6874 : pixels[i].r = 255;
:
3511 1.3081 : pixels[i].g = 0;
:
21647 8.0652 : pixels[i].b = 0;
大部分开销都在复制构造函数中。例如,
std::vector < Pixel > pixels;//(dimension * dimension, Pixel());
pixels.reserve(dimension * dimension);
for (int i = 0; i < dimension * dimension; ++i) {
pixels[i].r = 255;
pixels[i].g = 0;
pixels[i].b = 0;
}
它具有与数组相同的性能。