如何以最有效的内存和时间方式获取大文件的行数?

def file_len(filename):
    with open(filename) as f:
        for i, _ in enumerate(f):
            pass
    return i + 1

当前回答

在perfplot分析之后,必须推荐缓冲读取解决方案

def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        while True:
            b = reader(2 ** 16)
            if not b: break
            yield b

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count

它速度快,内存效率高。大多数其他解决方案大约要慢20倍。


代码重现情节:

import mmap
import subprocess
from functools import partial

import perfplot


def setup(n):
    fname = "t.txt"
    with open(fname, "w") as f:
        for i in range(n):
            f.write(str(i) + "\n")
    return fname


def for_enumerate(fname):
    i = 0
    with open(fname) as f:
        for i, _ in enumerate(f):
            pass
    return i + 1


def sum1(fname):
    return sum(1 for _ in open(fname))


def mmap_count(fname):
    with open(fname, "r+") as f:
        buf = mmap.mmap(f.fileno(), 0)

    lines = 0
    while buf.readline():
        lines += 1
    return lines


def for_open(fname):
    lines = 0
    for _ in open(fname):
        lines += 1
    return lines


def buf_count_newlines(fname):
    lines = 0
    buf_size = 2 ** 16
    with open(fname) as f:
        buf = f.read(buf_size)
        while buf:
            lines += buf.count("\n")
            buf = f.read(buf_size)
    return lines


def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count


def wc_l(fname):
    return int(subprocess.check_output(["wc", "-l", fname]).split()[0])


def sum_partial(fname):
    with open(fname) as f:
        count = sum(x.count("\n") for x in iter(partial(f.read, 2 ** 16), ""))
    return count


def read_count(fname):
    return open(fname).read().count("\n")


b = perfplot.bench(
    setup=setup,
    kernels=[
        for_enumerate,
        sum1,
        mmap_count,
        for_open,
        wc_l,
        buf_count_newlines,
        buf_count_newlines_gen,
        sum_partial,
        read_count,
    ],
    n_range=[2 ** k for k in range(27)],
    xlabel="num lines",
)
b.save("out.png")
b.show()

其他回答

这个呢?

import sys
sys.stdin=open('fname','r')
data=sys.stdin.readlines()
print "counted",len(data),"lines"

在perfplot分析之后,必须推荐缓冲读取解决方案

def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        while True:
            b = reader(2 ** 16)
            if not b: break
            yield b

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count

它速度快,内存效率高。大多数其他解决方案大约要慢20倍。


代码重现情节:

import mmap
import subprocess
from functools import partial

import perfplot


def setup(n):
    fname = "t.txt"
    with open(fname, "w") as f:
        for i in range(n):
            f.write(str(i) + "\n")
    return fname


def for_enumerate(fname):
    i = 0
    with open(fname) as f:
        for i, _ in enumerate(f):
            pass
    return i + 1


def sum1(fname):
    return sum(1 for _ in open(fname))


def mmap_count(fname):
    with open(fname, "r+") as f:
        buf = mmap.mmap(f.fileno(), 0)

    lines = 0
    while buf.readline():
        lines += 1
    return lines


def for_open(fname):
    lines = 0
    for _ in open(fname):
        lines += 1
    return lines


def buf_count_newlines(fname):
    lines = 0
    buf_size = 2 ** 16
    with open(fname) as f:
        buf = f.read(buf_size)
        while buf:
            lines += buf.count("\n")
            buf = f.read(buf_size)
    return lines


def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count


def wc_l(fname):
    return int(subprocess.check_output(["wc", "-l", fname]).split()[0])


def sum_partial(fname):
    with open(fname) as f:
        count = sum(x.count("\n") for x in iter(partial(f.read, 2 ** 16), ""))
    return count


def read_count(fname):
    return open(fname).read().count("\n")


b = perfplot.bench(
    setup=setup,
    kernels=[
        for_enumerate,
        sum1,
        mmap_count,
        for_open,
        wc_l,
        buf_count_newlines,
        buf_count_newlines_gen,
        sum_partial,
        read_count,
    ],
    n_range=[2 ** k for k in range(27)],
    xlabel="num lines",
)
b.save("out.png")
b.show()

创建一个可执行脚本文件count.py:

#!/usr/bin/python

import sys
count = 0
for line in sys.stdin:
    count+=1

然后将文件的内容导入python脚本:cat huge.txt | ./count.py。管道也适用于Powershell,因此您将最终计算行数。

对我来说,在Linux上它比简单的解决方案快30%:

count=1
with open('huge.txt') as f:
    count+=1

为了完成上述方法,我尝试了fileinput模块的一个变体:

import fileinput as fi   
def filecount(fname):
        for line in fi.input(fname):
            pass
        return fi.lineno()

并将一个60mil行文件传递给上述所有方法:

mapcount : 6.1331050396
simplecount : 4.588793993
opcount : 4.42918205261
filecount : 43.2780818939
bufcount : 0.170812129974

这让我有点惊讶,fileinput是如此糟糕,比所有其他方法都要糟糕得多…

这是我用纯python发现的最快的东西。 你可以通过设置buffer来使用任意大小的内存,不过在我的电脑上2**16似乎是一个最佳位置。

from functools import partial

buffer=2**16
with open(myfile) as f:
        print sum(x.count('\n') for x in iter(partial(f.read,buffer), ''))

我在这里找到了答案为什么在c++中从stdin读取行要比Python慢得多?稍微调整了一下。这是一个非常好的阅读来理解如何快速计数行,尽管wc -l仍然比其他任何方法快75%。