如何以最有效的内存和时间方式获取大文件的行数?
def file_len(filename):
with open(filename) as f:
for i, _ in enumerate(f):
pass
return i + 1
如何以最有效的内存和时间方式获取大文件的行数?
def file_len(filename):
with open(filename) as f:
for i, _ in enumerate(f):
pass
return i + 1
当前回答
在perfplot分析之后,必须推荐缓冲读取解决方案
def buf_count_newlines_gen(fname):
def _make_gen(reader):
while True:
b = reader(2 ** 16)
if not b: break
yield b
with open(fname, "rb") as f:
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
return count
它速度快,内存效率高。大多数其他解决方案大约要慢20倍。
代码重现情节:
import mmap
import subprocess
from functools import partial
import perfplot
def setup(n):
fname = "t.txt"
with open(fname, "w") as f:
for i in range(n):
f.write(str(i) + "\n")
return fname
def for_enumerate(fname):
i = 0
with open(fname) as f:
for i, _ in enumerate(f):
pass
return i + 1
def sum1(fname):
return sum(1 for _ in open(fname))
def mmap_count(fname):
with open(fname, "r+") as f:
buf = mmap.mmap(f.fileno(), 0)
lines = 0
while buf.readline():
lines += 1
return lines
def for_open(fname):
lines = 0
for _ in open(fname):
lines += 1
return lines
def buf_count_newlines(fname):
lines = 0
buf_size = 2 ** 16
with open(fname) as f:
buf = f.read(buf_size)
while buf:
lines += buf.count("\n")
buf = f.read(buf_size)
return lines
def buf_count_newlines_gen(fname):
def _make_gen(reader):
b = reader(2 ** 16)
while b:
yield b
b = reader(2 ** 16)
with open(fname, "rb") as f:
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
return count
def wc_l(fname):
return int(subprocess.check_output(["wc", "-l", fname]).split()[0])
def sum_partial(fname):
with open(fname) as f:
count = sum(x.count("\n") for x in iter(partial(f.read, 2 ** 16), ""))
return count
def read_count(fname):
return open(fname).read().count("\n")
b = perfplot.bench(
setup=setup,
kernels=[
for_enumerate,
sum1,
mmap_count,
for_open,
wc_l,
buf_count_newlines,
buf_count_newlines_gen,
sum_partial,
read_count,
],
n_range=[2 ** k for k in range(27)],
xlabel="num lines",
)
b.save("out.png")
b.show()
其他回答
我不得不在类似的问题上发表这篇文章,直到我的声誉分数上升了一点(感谢那些撞了我的人!)。
所有这些解决方案都忽略了一种使其运行得更快的方法,即使用无缓冲(原始)接口,使用字节数组,并进行自己的缓冲。(这只适用于Python 3。在Python 2中,原始接口在默认情况下可以使用,也可以不使用,但在Python 3中,您将默认使用Unicode。)
使用一个修改版本的计时工具,我相信下面的代码比任何提供的解决方案都更快(并且稍微更python化):
def rawcount(filename):
f = open(filename, 'rb')
lines = 0
buf_size = 1024 * 1024
read_f = f.raw.read
buf = read_f(buf_size)
while buf:
lines += buf.count(b'\n')
buf = read_f(buf_size)
return lines
使用单独的生成器函数,运行速度会快一点:
def _make_gen(reader):
b = reader(1024 * 1024)
while b:
yield b
b = reader(1024*1024)
def rawgencount(filename):
f = open(filename, 'rb')
f_gen = _make_gen(f.raw.read)
return sum( buf.count(b'\n') for buf in f_gen )
这完全可以用itertools内嵌的生成器表达式来完成,但它看起来非常奇怪:
from itertools import (takewhile,repeat)
def rawincount(filename):
f = open(filename, 'rb')
bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
return sum( buf.count(b'\n') for buf in bufgen )
以下是我的时间安排:
function average, s min, s ratio
rawincount 0.0043 0.0041 1.00
rawgencount 0.0044 0.0042 1.01
rawcount 0.0048 0.0045 1.09
bufcount 0.008 0.0068 1.64
wccount 0.01 0.0097 2.35
itercount 0.014 0.014 3.41
opcount 0.02 0.02 4.83
kylecount 0.021 0.021 5.05
simplecount 0.022 0.022 5.25
mapcount 0.037 0.031 7.46
一句话解决方案:
import os
os.system("wc -l filename")
我的代码片段:
>>> os.system('wc -l *.txt')
0 bar.txt
1000 command.txt
3 test_file.txt
1003 total
我相信内存映射文件将是最快的解决方案。我尝试了四个函数:由OP发布的函数(opcount);对文件中的行进行简单迭代(simplecount);带有内存映射字段(mmap)的Readline (mapcount);以及Mykola Kharechko (buffcount)提供的缓冲区读取解决方案。
我将每个函数运行五次,并计算出120万在线文本文件的平均运行时间。
Windows XP, Python 2.5, 2GB RAM, 2ghz AMD处理器
以下是我的结果:
mapcount : 0.465599966049
simplecount : 0.756399965286
bufcount : 0.546800041199
opcount : 0.718600034714
编辑:Python 2.6的数字:
mapcount : 0.471799945831
simplecount : 0.634400033951
bufcount : 0.468800067902
opcount : 0.602999973297
因此,对于Windows/Python 2.6,缓冲区读取策略似乎是最快的
代码如下:
from __future__ import with_statement
import time
import mmap
import random
from collections import defaultdict
def mapcount(filename):
f = open(filename, "r+")
buf = mmap.mmap(f.fileno(), 0)
lines = 0
readline = buf.readline
while readline():
lines += 1
return lines
def simplecount(filename):
lines = 0
for line in open(filename):
lines += 1
return lines
def bufcount(filename):
f = open(filename)
lines = 0
buf_size = 1024 * 1024
read_f = f.read # loop optimization
buf = read_f(buf_size)
while buf:
lines += buf.count('\n')
buf = read_f(buf_size)
return lines
def opcount(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
counts = defaultdict(list)
for i in range(5):
for func in [mapcount, simplecount, bufcount, opcount]:
start_time = time.time()
assert func("big_file.txt") == 1209138
counts[func].append(time.time() - start_time)
for key, vals in counts.items():
print key.__name__, ":", sum(vals) / float(len(vals))
如果文件能放进内存,那么
with open(fname) as f:
count = len(f.read().split(b'\n')) - 1
对我来说,这个变体是最快的:
#!/usr/bin/env python
def main():
f = open('filename')
lines = 0
buf_size = 1024 * 1024
read_f = f.read # loop optimization
buf = read_f(buf_size)
while buf:
lines += buf.count('\n')
buf = read_f(buf_size)
print lines
if __name__ == '__main__':
main()
原因:缓冲比逐行和逐字符串读取快。计数也非常快