我在寻找一种复制文件(二进制或文本)的好方法。我写了几个示例,每一个都能正常工作。但我想听听经验丰富的程序员的意见。
我没有找到好的示例,想找一种适用于C++的方法。
ANSI-C-WAY
#include <iostream>
#include <cstdio> // fopen, fclose, fread, fwrite, BUFSIZ
#include <ctime>
using namespace std;
/// Copies "from.ogv" to "to.ogv" with C stdio (fread/fwrite) and prints the
/// CPU time consumed, as measured by clock().
///
/// @return 0 on success, 1 if a file cannot be opened or an I/O error occurs.
int main() {
    const clock_t start = clock();

    // BUFSIZ is implementation-defined. A buffer of 1 would copy one
    // character at a time; good values match the filesystem block size
    // (e.g. 1024 or 4096). Larger buffers reduce the number of system calls.
    char buf[BUFSIZ];

    FILE* source = fopen("from.ogv", "rb");
    if (source == NULL) {
        perror("from.ogv");
        return 1;
    }
    FILE* dest = fopen("to.ogv", "wb");
    if (dest == NULL) {
        perror("to.ogv");
        fclose(source);
        return 1;
    }

    bool ok = true;
    size_t size;
    // fread() returns 0 on both end-of-file and error; ferror() below tells
    // the two apart.
    while ((size = fread(buf, 1, sizeof buf, source)) > 0) {
        if (fwrite(buf, 1, size, dest) != size) {
            perror("fwrite");
            ok = false;
            break;
        }
    }
    if (ok && ferror(source)) {
        perror("fread");
        ok = false;
    }

    fclose(source);
    // Buffered data is flushed by fclose(), so a write error can surface here.
    if (fclose(dest) != 0 && ok) {
        perror("fclose");
        ok = false;
    }
    if (!ok) {
        return 1;
    }

    const clock_t end = clock();
    cout << "CLOCKS_PER_SEC " << CLOCKS_PER_SEC << "\n";
    cout << "CPU-TIME START " << start << "\n";
    cout << "CPU-TIME END " << end << "\n";
    cout << "CPU-TIME END - START " << end - start << "\n";
    cout << "TIME(SEC) " << static_cast<double>(end - start) / CLOCKS_PER_SEC << "\n";
    return 0;
}
POSIX-WAY (K&R在“C编程语言”中使用这个,更低级)
#include <iostream>
#include <fcntl.h> // open
#include <unistd.h> // read, write, close
#include <cstdio> // BUFSIZ
#include <ctime>
using namespace std;
int main() {
clock_t start, end;
start = clock();
// BUFSIZE defaults to 8192
// BUFSIZE of 1 means one chareter at time
// good values should fit to blocksize, like 1024 or 4096
// higher values reduce number of system calls
// size_t BUFFER_SIZE = 4096;
char buf[BUFSIZ];
size_t size;
int source = open("from.ogv", O_RDONLY, 0);
int dest = open("to.ogv", O_WRONLY | O_CREAT /*| O_TRUNC/**/, 0644);
while ((size = read(source, buf, BUFSIZ)) > 0) {
write(dest, buf, size);
}
close(source);
close(dest);
end = clock();
cout << "CLOCKS_PER_SEC " << CLOCKS_PER_SEC << "\n";
cout << "CPU-TIME START " << start << "\n";
cout << "CPU-TIME END " << end << "\n";
cout << "CPU-TIME END - START " << end - start << "\n";
cout << "TIME(SEC) " << static_cast<double>(end - start) / CLOCKS_PER_SEC << "\n";
return 0;
}
KISS-C++-Streambuffer-WAY
#include <iostream>
#include <fstream>
#include <ctime>
using namespace std;
/// Copies "from.ogv" to "to.ogv" by streaming the input's stream buffer into
/// the output stream, then prints the CPU time consumed as measured by clock().
int main() {
    const clock_t t_begin = clock();

    ifstream in("from.ogv", ios::binary);
    ofstream out("to.ogv", ios::binary);

    // operator<< on a streambuf pointer transfers the raw characters without
    // any formatting.
    out << in.rdbuf();

    in.close();
    out.close();

    const clock_t t_end = clock();
    const clock_t ticks = t_end - t_begin;

    cout << "CLOCKS_PER_SEC " << CLOCKS_PER_SEC << "\n"
         << "CPU-TIME START " << t_begin << "\n"
         << "CPU-TIME END " << t_end << "\n"
         << "CPU-TIME END - START " << ticks << "\n"
         << "TIME(SEC) " << static_cast<double>(ticks) / CLOCKS_PER_SEC << "\n";
    return 0;
}
COPY-ALGORITHM-C++-WAY
#include <iostream>
#include <fstream>
#include <ctime>
#include <algorithm>
#include <iterator>
using namespace std;
/// Copies "from.ogv" to "to.ogv" with std::copy over stream-buffer iterators,
/// then prints the CPU time consumed as measured by clock().
int main() {
    const clock_t t_begin = clock();

    ifstream in("from.ogv", ios::binary);
    ofstream out("to.ogv", ios::binary);

    // A default-constructed istreambuf_iterator is the end-of-stream sentinel.
    copy(istreambuf_iterator<char>(in),
         istreambuf_iterator<char>(),
         ostreambuf_iterator<char>(out));

    in.close();
    out.close();

    const clock_t t_end = clock();
    const clock_t ticks = t_end - t_begin;

    cout << "CLOCKS_PER_SEC " << CLOCKS_PER_SEC << "\n"
         << "CPU-TIME START " << t_begin << "\n"
         << "CPU-TIME END " << t_end << "\n"
         << "CPU-TIME END - START " << ticks << "\n"
         << "TIME(SEC) " << static_cast<double>(ticks) / CLOCKS_PER_SEC << "\n";
    return 0;
}
OWN-BUFFER-C++-WAY
#include <iostream>
#include <fstream>
#include <ctime>
using namespace std;
/// Copies "from.ogv" to "to.ogv" by reading the whole source into one
/// heap buffer and writing it out in a single call, then prints the CPU time
/// consumed as measured by clock().
///
/// The entire file is held in memory at once, so this approach only suits
/// files that fit into available memory.
///
/// @return 0 on success, 1 if a file cannot be opened, sized, read or written.
int main() {
    const clock_t start = clock();

    ifstream source("from.ogv", ios::binary);
    if (!source) {
        cerr << "cannot open from.ogv\n";
        return 1;
    }
    ofstream dest("to.ogv", ios::binary);
    if (!dest) {
        cerr << "cannot open to.ogv\n";
        return 1;
    }

    // Determine the file size by seeking to the end.
    source.seekg(0, ios::end);
    const streamoff size = source.tellg();
    source.seekg(0);
    // tellg() yields -1 on failure; never pass that to new[].
    if (!source || size < 0) {
        cerr << "cannot determine size of from.ogv\n";
        return 1;
    }
    // Guard against sizes that do not fit into size_t (e.g. 32-bit builds).
    if (static_cast<streamoff>(static_cast<size_t>(size)) != size) {
        cerr << "from.ogv is too large to buffer in memory\n";
        return 1;
    }

    bool ok = true;
    if (size > 0) {
        char* buffer = new char[static_cast<size_t>(size)];
        source.read(buffer, static_cast<streamsize>(size));
        // Write only what was actually read so a short read never copies
        // uninitialized bytes.
        const streamsize got = source.gcount();
        dest.write(buffer, got);
        delete[] buffer;
        if (got != size) {
            cerr << "short read from from.ogv\n";
            ok = false;
        }
    }

    source.close();
    dest.close();
    if (!dest) {
        cerr << "error writing to.ogv\n";
        ok = false;
    }
    if (!ok) {
        return 1;
    }

    const clock_t end = clock();
    cout << "CLOCKS_PER_SEC " << CLOCKS_PER_SEC << "\n";
    cout << "CPU-TIME START " << start << "\n";
    cout << "CPU-TIME END " << end << "\n";
    cout << "CPU-TIME END - START " << end - start << "\n";
    cout << "TIME(SEC) " << static_cast<double>(end - start) / CLOCKS_PER_SEC << "\n";
    return 0;
}
LINUX-WAY //要求内核>= 2.6.33
#include <iostream>
#include <sys/sendfile.h> // sendfile
#include <fcntl.h> // open
#include <unistd.h> // close
#include <sys/stat.h> // fstat
#include <sys/types.h> // fstat
#include <ctime>
using namespace std;
int main() {
clock_t start, end;
start = clock();
int source = open("from.ogv", O_RDONLY, 0);
int dest = open("to.ogv", O_WRONLY | O_CREAT /*| O_TRUNC/**/, 0644);
// struct required, rationale: function stat() exists also
struct stat stat_source;
fstat(source, &stat_source);
sendfile(dest, source, 0, stat_source.st_size);
close(source);
close(dest);
end = clock();
cout << "CLOCKS_PER_SEC " << CLOCKS_PER_SEC << "\n";
cout << "CPU-TIME START " << start << "\n";
cout << "CPU-TIME END " << end << "\n";
cout << "CPU-TIME END - START " << end - start << "\n";
cout << "TIME(SEC) " << static_cast<double>(end - start) / CLOCKS_PER_SEC << "\n";
return 0;
}
环境
GNU / LINUX (Archlinux)
内核3.3
GLIBC-2.15, libstdc++ 4.7 (GCC- libs), GCC 4.7, Coreutils 8.16
使用RUNLEVEL 3(多用户,网络,终端,无GUI)
INTEL SSD-Postville 80GB,已使用约50%
复制一个270 MB的OGG-VIDEO-FILE
复制步骤
1. $ rm to.ogv
2. $ reboot # kernel and filesystem buffers are in regular
3. $ (time ./program) &>> report.txt # executes program, redirects output of program and append to file
4. $ sha256sum *.ogv # checksum
5. $ rm to.ogv # remove copy, but no sync, kernel and filesystem buffers are used
6. $ (time ./program) &>> report.txt # executes program, redirects output of program and append to file
结果(CPU时间使用)
Program Description UNBUFFERED|BUFFERED
ANSI C (fread/fwrite) 490,000|260,000
POSIX (K&R, read/write) 450,000|230,000
FSTREAM (KISS, Streambuffer) 500,000|270,000
FSTREAM (Algorithm, copy) 500,000|270,000
FSTREAM (OWN-BUFFER) 500,000|340,000
SENDFILE (native LINUX, sendfile) 410,000|200,000
文件大小不会改变。
Sha256sum打印相同的结果。
视频文件仍然可以播放。
问题
你喜欢什么方法?
你知道更好的解决方案吗?
你在我的代码中发现什么错误了吗?
你知道有什么理由应该避免使用其中某种方案吗?
FSTREAM (KISS, Streambuffer)
我真的很喜欢这个,因为它真的很短很简单。据我所知,操作符<<对于rdbuf()是重载的,不转换任何东西。正确吗?
谢谢
更新1
我以这种方式更改了所有示例中的源代码,文件描述符的打开和关闭都包含在clock()的测量中。源代码中没有其他重大更改。结果没有改变!我也花时间仔细检查我的结果。
更新2
while循环的条件不再调用feof(),而是我将fread()移动到条件中。看起来,代码运行速度快了10000个时钟。
测量方法改变了:以前的结果总是带缓冲的,因为我对每个程序都重复执行了几次旧的命令行 rm to.ogv && sync && time ./program。现在我为每个程序重新启动系统。未缓冲的结果是新的,并不令人惊讶。没有缓冲的结果并没有什么变化。
如果我不删除旧的副本,程序的反应不同。使用POSIX和SENDFILE覆盖现有的缓冲文件更快,所有其他程序都较慢。也许截断或创建选项对这种行为有影响。但是用相同的副本覆盖现有文件并不是一个真实的用例。
使用cp执行拷贝需要0.44秒未缓冲,0.30秒缓冲。所以cp比POSIX样本要慢一点。对我来说还好。
也许我以后还会添加mmap()以及boost::filesystem中copy_file()的示例和结果。
更新3
我也把它放在博客页面上,并做了一些扩展。包括splice(),这是一个来自Linux内核的低级函数。也许接下来会有更多的Java示例。
http://www.ttyhoney.com/blog/?page_id=69
理论上,复制文件最有效的方法是使用内存映射,因此复制过程可以完全在内核模式下完成。
如果文件小于2GB,你可以在Unix平台上使用以下代码:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "usage: %s <source> <target>\n", argv[0]);
return EXIT_FAILURE;
}
int source_fd = open(argv[1], O_RDONLY, 0);
if (source_fd < 0) {
perror("open source");
return EXIT_FAILURE;
}
int target_fd = open(argv[2], O_RDWR | O_CREAT | O_TRUNC, 0666);
if (target_fd < 0) {
perror("open target");
return EXIT_FAILURE;
}
struct stat stat;
int r = fstat(source_fd, &stat);
if (r < 0) {
perror("fstat");
return EXIT_FAILURE;
}
char *buf = mmap(NULL, stat.st_size, PROT_READ, MAP_PRIVATE, source_fd, 0);
if (buf == MAP_FAILED) {
perror("mmap");
return EXIT_FAILURE;
}
r = write(target_fd, buf, stat.st_size);
if (r < 0) {
perror("write");
return EXIT_FAILURE;
} else if (r != stat.st_size) {
fprintf(stderr, "write: copied file truncated to %d bytes\n", r);
return EXIT_FAILURE;
} else {
printf("write: %d bytes copied\n", r);
}
munmap(buf, stat.st_size);
close(source_fd);
close(target_fd);
return EXIT_SUCCESS;
}
复制一个2GB的文件,时间使用如下:
real 0m1.842s
user 0m0.000s
sys 0m1.505s
但单次write()调用能传输的数据量有限(在Linux上最多为0x7ffff000字节,约2GB),因此如果文件大小大于2GB,就不能只调用一次write()。一种办法是同时映射目标文件并使用memcpy复制该文件。由于使用了memcpy,我们可以看到在用户模式下花费了一些时间。
以下是一个通用版本:
import sys
import mmap
# Copy the file named by argv[1] to argv[2]: the source is memory-mapped
# read-only and the mapping is handed to a single write() on the destination.
if len(sys.argv) != 3:
    print(f'Usage: {sys.argv[0]} <source> <destination>')
    sys.exit(1)

with open(sys.argv[1], 'rb') as src, open(sys.argv[2], 'wb') as dst:
    # Seeking to the end (whence=2) returns the file size.
    size = src.seek(0, 2)
    if size == 0:
        # mmap raises ValueError for an empty file; the destination has
        # already been created empty by open(..., 'wb').
        print("0 bytes written")
    else:
        # The context manager closes the mapping even if write() raises.
        with mmap.mmap(src.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_src:
            print(f"{dst.write(mmapped_src)} bytes written")
复制一个3.2GB的文件,时间占用为:
real 0m4.426s
user 0m0.030s
sys 0m2.793s
下面是一个Unix版本:
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
/*
 * Copies the file named by argv[1] to argv[2] by mapping both files into
 * memory and copying the bytes with memcpy().
 *
 * The destination is created (or truncated) and resized to the source's size
 * before it is mapped. An empty source is handled without mapping, because
 * mmap() rejects a length of zero.
 *
 * Returns 0 on success and 1 on usage or I/O errors.
 */
int main(int argc, char *argv[]) {
    int src_fd, dst_fd;
    void *src_map, *dst_map;
    struct stat src_stat;
    size_t length;
    int status = 1;

    if (argc != 3) {
        printf("Usage: %s <source> <destination>\n", argv[0]);
        return 1;
    }
    src_fd = open(argv[1], O_RDONLY);
    if (src_fd == -1) {
        perror("open source");
        return 1;
    }
    if (fstat(src_fd, &src_stat) == -1) {
        perror("fstat");
        goto close_src;
    }
    length = (size_t)src_stat.st_size;

    /* Pass only the permission bits of the source's mode to open(). */
    dst_fd = open(argv[2], O_RDWR | O_CREAT | O_TRUNC, src_stat.st_mode & 07777);
    if (dst_fd == -1) {
        perror("open destination");
        goto close_src;
    }

    if (length > 0) {
        /* The destination must have its final size before it is mapped. */
        if (ftruncate(dst_fd, src_stat.st_size) == -1) {
            perror("ftruncate");
            goto close_dst;
        }
        src_map = mmap(NULL, length, PROT_READ, MAP_PRIVATE, src_fd, 0);
        if (src_map == MAP_FAILED) {
            perror("mmap source");
            goto close_dst;
        }
        dst_map = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, dst_fd, 0);
        if (dst_map == MAP_FAILED) {
            perror("mmap destination");
            munmap(src_map, length);
            goto close_dst;
        }
        memcpy(dst_map, src_map, length);
        munmap(src_map, length);
        munmap(dst_map, length);
    }

    /* off_t has no printf conversion of its own; print it as long long. */
    printf("Copied %lld bytes from %s to %s\n", (long long)src_stat.st_size, argv[1], argv[2]);
    status = 0;

close_dst:
    if (close(dst_fd) == -1) {
        perror("close destination");
        status = 1;
    }
close_src:
    close(src_fd);
    return status;
}
复制一个3.2GB的文件,时间占用为:
real 0m3.365s
user 0m0.788s
sys 0m2.471s
下面是Windows版本:
#include <stdio.h>
#include <windows.h>
void PrintLastError(const char *name) {
char *msg;
FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR) &msg, 0, NULL);
fprintf(stderr, "%s: %s", name, msg);
LocalFree(msg);
exit(1);
}
int main(int argc, char* argv[]) {
HANDLE hSrc, hDst;
HANDLE hSrcMap, hDstMap;
LPVOID lpSrcMap, lpDstMap;
DWORD dwSrcSize, dwDstSize;
if (argc != 3) {
printf("Usage: %s <source> <destination>\n", argv[0]);
return 1;
}
hSrc = CreateFile(argv[1], GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (hSrc == INVALID_HANDLE_VALUE) {
PrintLastError("CreateFile");
return 1;
}
dwSrcSize = GetFileSize(hSrc, NULL);
if (dwSrcSize == INVALID_FILE_SIZE) {
PrintLastError("GetFileSize");
goto SRC_MAP_FAIL;
}
hSrcMap = CreateFileMapping(hSrc, NULL, PAGE_READONLY, 0, 0, NULL);
if (hSrcMap == NULL) {
PrintLastError("CreateFileMapping");
goto SRC_MAP_FAIL;
}
lpSrcMap = MapViewOfFile(hSrcMap, FILE_MAP_READ, 0, 0, 0);
if (lpSrcMap == NULL) {
PrintLastError("MapViewOfFile");
goto SRC_VIEW_FAIL;
}
hDst = CreateFile(argv[2], GENERIC_READ | GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
if (hDst == INVALID_HANDLE_VALUE) {
PrintLastError("CreateFile");
goto DEST_OPEN_FAIL;
}
dwDstSize = dwSrcSize;
hDstMap = CreateFileMapping(hDst, NULL, PAGE_READWRITE, 0, dwDstSize, NULL);
if (hDstMap == NULL) {
PrintLastError("CreateFileMapping");
goto DEST_MAP_FAIL;
}
lpDstMap = MapViewOfFile(hDstMap, FILE_MAP_WRITE, 0, 0, 0);
if (lpDstMap == NULL) {
PrintLastError("MapViewOfFile");
goto DEST_VIEW_FAIL;
}
memcpy(lpDstMap, lpSrcMap, dwSrcSize);
printf("Copied %lu bytes from %s to %s", dwSrcSize, argv[1], argv[2]);
UnmapViewOfFile(lpDstMap);
DEST_VIEW_FAIL:
CloseHandle(hDstMap);
DEST_MAP_FAIL:
CloseHandle(hDst);
DEST_OPEN_FAIL:
UnmapViewOfFile(lpSrcMap);
SRC_VIEW_FAIL:
CloseHandle(hSrcMap);
SRC_MAP_FAIL:
CloseHandle(hSrc);
return 0;
}
理论上,复制文件最有效的方法是使用内存映射,因此复制过程可以完全在内核模式下完成。
如果文件小于2GB,你可以在Unix平台上使用以下代码:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "usage: %s <source> <target>\n", argv[0]);
return EXIT_FAILURE;
}
int source_fd = open(argv[1], O_RDONLY, 0);
if (source_fd < 0) {
perror("open source");
return EXIT_FAILURE;
}
int target_fd = open(argv[2], O_RDWR | O_CREAT | O_TRUNC, 0666);
if (target_fd < 0) {
perror("open target");
return EXIT_FAILURE;
}
struct stat stat;
int r = fstat(source_fd, &stat);
if (r < 0) {
perror("fstat");
return EXIT_FAILURE;
}
char *buf = mmap(NULL, stat.st_size, PROT_READ, MAP_PRIVATE, source_fd, 0);
if (buf == MAP_FAILED) {
perror("mmap");
return EXIT_FAILURE;
}
r = write(target_fd, buf, stat.st_size);
if (r < 0) {
perror("write");
return EXIT_FAILURE;
} else if (r != stat.st_size) {
fprintf(stderr, "write: copied file truncated to %d bytes\n", r);
return EXIT_FAILURE;
} else {
printf("write: %d bytes copied\n", r);
}
munmap(buf, stat.st_size);
close(source_fd);
close(target_fd);
return EXIT_SUCCESS;
}
复制一个2GB的文件,时间使用如下:
real 0m1.842s
user 0m0.000s
sys 0m1.505s
但单次write()调用能传输的数据量有限(在Linux上最多为0x7ffff000字节,约2GB),因此如果文件大小大于2GB,就不能只调用一次write()。一种办法是同时映射目标文件并使用memcpy复制该文件。由于使用了memcpy,我们可以看到在用户模式下花费了一些时间。
以下是一个通用版本:
import sys
import mmap
# Copy the file named by argv[1] to argv[2]: the source is memory-mapped
# read-only and the mapping is handed to a single write() on the destination.
if len(sys.argv) != 3:
    print(f'Usage: {sys.argv[0]} <source> <destination>')
    sys.exit(1)

with open(sys.argv[1], 'rb') as src, open(sys.argv[2], 'wb') as dst:
    # Seeking to the end (whence=2) returns the file size.
    size = src.seek(0, 2)
    if size == 0:
        # mmap raises ValueError for an empty file; the destination has
        # already been created empty by open(..., 'wb').
        print("0 bytes written")
    else:
        # The context manager closes the mapping even if write() raises.
        with mmap.mmap(src.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_src:
            print(f"{dst.write(mmapped_src)} bytes written")
复制一个3.2GB的文件,时间占用为:
real 0m4.426s
user 0m0.030s
sys 0m2.793s
下面是一个Unix版本:
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
/*
 * Copies the file named by argv[1] to argv[2] by mapping both files into
 * memory and copying the bytes with memcpy().
 *
 * The destination is created (or truncated) and resized to the source's size
 * before it is mapped. An empty source is handled without mapping, because
 * mmap() rejects a length of zero.
 *
 * Returns 0 on success and 1 on usage or I/O errors.
 */
int main(int argc, char *argv[]) {
    int src_fd, dst_fd;
    void *src_map, *dst_map;
    struct stat src_stat;
    size_t length;
    int status = 1;

    if (argc != 3) {
        printf("Usage: %s <source> <destination>\n", argv[0]);
        return 1;
    }
    src_fd = open(argv[1], O_RDONLY);
    if (src_fd == -1) {
        perror("open source");
        return 1;
    }
    if (fstat(src_fd, &src_stat) == -1) {
        perror("fstat");
        goto close_src;
    }
    length = (size_t)src_stat.st_size;

    /* Pass only the permission bits of the source's mode to open(). */
    dst_fd = open(argv[2], O_RDWR | O_CREAT | O_TRUNC, src_stat.st_mode & 07777);
    if (dst_fd == -1) {
        perror("open destination");
        goto close_src;
    }

    if (length > 0) {
        /* The destination must have its final size before it is mapped. */
        if (ftruncate(dst_fd, src_stat.st_size) == -1) {
            perror("ftruncate");
            goto close_dst;
        }
        src_map = mmap(NULL, length, PROT_READ, MAP_PRIVATE, src_fd, 0);
        if (src_map == MAP_FAILED) {
            perror("mmap source");
            goto close_dst;
        }
        dst_map = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, dst_fd, 0);
        if (dst_map == MAP_FAILED) {
            perror("mmap destination");
            munmap(src_map, length);
            goto close_dst;
        }
        memcpy(dst_map, src_map, length);
        munmap(src_map, length);
        munmap(dst_map, length);
    }

    /* off_t has no printf conversion of its own; print it as long long. */
    printf("Copied %lld bytes from %s to %s\n", (long long)src_stat.st_size, argv[1], argv[2]);
    status = 0;

close_dst:
    if (close(dst_fd) == -1) {
        perror("close destination");
        status = 1;
    }
close_src:
    close(src_fd);
    return status;
}
复制一个3.2GB的文件,时间占用为:
real 0m3.365s
user 0m0.788s
sys 0m2.471s
下面是Windows版本:
#include <stdio.h>
#include <windows.h>
void PrintLastError(const char *name) {
char *msg;
FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR) &msg, 0, NULL);
fprintf(stderr, "%s: %s", name, msg);
LocalFree(msg);
exit(1);
}
int main(int argc, char* argv[]) {
HANDLE hSrc, hDst;
HANDLE hSrcMap, hDstMap;
LPVOID lpSrcMap, lpDstMap;
DWORD dwSrcSize, dwDstSize;
if (argc != 3) {
printf("Usage: %s <source> <destination>\n", argv[0]);
return 1;
}
hSrc = CreateFile(argv[1], GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (hSrc == INVALID_HANDLE_VALUE) {
PrintLastError("CreateFile");
return 1;
}
dwSrcSize = GetFileSize(hSrc, NULL);
if (dwSrcSize == INVALID_FILE_SIZE) {
PrintLastError("GetFileSize");
goto SRC_MAP_FAIL;
}
hSrcMap = CreateFileMapping(hSrc, NULL, PAGE_READONLY, 0, 0, NULL);
if (hSrcMap == NULL) {
PrintLastError("CreateFileMapping");
goto SRC_MAP_FAIL;
}
lpSrcMap = MapViewOfFile(hSrcMap, FILE_MAP_READ, 0, 0, 0);
if (lpSrcMap == NULL) {
PrintLastError("MapViewOfFile");
goto SRC_VIEW_FAIL;
}
hDst = CreateFile(argv[2], GENERIC_READ | GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
if (hDst == INVALID_HANDLE_VALUE) {
PrintLastError("CreateFile");
goto DEST_OPEN_FAIL;
}
dwDstSize = dwSrcSize;
hDstMap = CreateFileMapping(hDst, NULL, PAGE_READWRITE, 0, dwDstSize, NULL);
if (hDstMap == NULL) {
PrintLastError("CreateFileMapping");
goto DEST_MAP_FAIL;
}
lpDstMap = MapViewOfFile(hDstMap, FILE_MAP_WRITE, 0, 0, 0);
if (lpDstMap == NULL) {
PrintLastError("MapViewOfFile");
goto DEST_VIEW_FAIL;
}
memcpy(lpDstMap, lpSrcMap, dwSrcSize);
printf("Copied %lu bytes from %s to %s", dwSrcSize, argv[1], argv[2]);
UnmapViewOfFile(lpDstMap);
DEST_VIEW_FAIL:
CloseHandle(hDstMap);
DEST_MAP_FAIL:
CloseHandle(hDst);
DEST_OPEN_FAIL:
UnmapViewOfFile(lpSrcMap);
SRC_VIEW_FAIL:
CloseHandle(hSrcMap);
SRC_MAP_FAIL:
CloseHandle(hSrc);
return 0;
}