我试图读取一个大文件一行在一次。我在Quora上找到了一个关于这个主题的问题,但我错过了一些联系,把整个事情联系在一起。

 var Lazy=require("lazy");
 new Lazy(process.stdin)
     .lines
     .forEach(
          function(line) { 
              console.log(line.toString()); 
          }
 );
 process.stdin.resume();

我想要弄清楚的是如何一次从文件中读取一行,而不是像本例中那样读取STDIN。

我试着:

 fs.open('./VeryBigFile.csv', 'r', '0666', Process);

 function Process(err, fd) {
    if (err) throw err;
    // DO lazy read 
 }

但这并不奏效。我知道在必要时我可以使用PHP之类的东西,但我想弄清楚这个问题。

我不认为其他答案会起作用,因为文件比我运行它的服务器的内存大得多。


当前回答

我把日常行处理的整个逻辑包装成一个npm模块:line-kit https://www.npmjs.com/package/line-kit

/ /实例 Var计数= 0 需要(“line-kit”)(需要(fs) .createReadStream (/ etc /问题), (line) => {count++;}, () => {console.log(' seen ${count} lines ')})

其他回答

2019年更新

Nodejs官方文档中已经发布了一个很棒的例子。在这里

这需要在您的机器上安装最新的Nodejs。> 11.4

const fs = require('fs');
const readline = require('readline');

async function processLineByLine() {
  const fileStream = fs.createReadStream('input.txt');

  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity
  });
  // Note: we use the crlfDelay option to recognize all instances of CR LF
  // ('\r\n') in input.txt as a single line break.

  for await (const line of rl) {
    // Each line in input.txt will be successively available here as `line`.
    console.log(`Line from file: ${line}`);
  }
}

processLineByLine();

带载波模块:

var carrier = require('carrier');

process.stdin.resume();
carrier.carry(process.stdin, function(line) {
    console.log('got one line: ' + line);
});

我想解决同样的问题,基本上在Perl中是这样的:

while (<>) {
    process_line($_);
}

我的用例只是一个独立的脚本,而不是服务器,所以同步就可以了。以下是我的标准:

可以在许多项目中重用的最小同步代码。 不限制文件大小或行数。 不限制线的长度。 能够处理UTF-8的完整Unicode,包括BMP以外的字符。 能够处理*nix和Windows行结束符(老式Mac对我来说不需要)。 行中要包含的行结束字符。 能够处理带有或不带有行尾字符的最后一行。 不要使用node.js发行版中不包含的任何外部库。

这是一个让我在node.js中感受低级脚本类型代码的项目,并决定它作为其他脚本语言(如Perl)的替代品的可行性。

经过惊人的努力和一些错误的开始,这是我提出的代码。它非常快,但没有我预期的那么琐碎:(在GitHub上分叉)

var fs            = require('fs'),
    StringDecoder = require('string_decoder').StringDecoder,
    util          = require('util');

function lineByLine(fd) {
  var blob = '';
  var blobStart = 0;
  var blobEnd = 0;

  var decoder = new StringDecoder('utf8');

  var CHUNK_SIZE = 16384;
  var chunk = new Buffer(CHUNK_SIZE);

  var eolPos = -1;
  var lastChunk = false;

  var moreLines = true;
  var readMore = true;

  // each line
  while (moreLines) {

    readMore = true;
    // append more chunks from the file onto the end of our blob of text until we have an EOL or EOF
    while (readMore) {

      // do we have a whole line? (with LF)
      eolPos = blob.indexOf('\n', blobStart);

      if (eolPos !== -1) {
        blobEnd = eolPos;
        readMore = false;

      // do we have the last line? (no LF)
      } else if (lastChunk) {
        blobEnd = blob.length;
        readMore = false;

      // otherwise read more
      } else {
        var bytesRead = fs.readSync(fd, chunk, 0, CHUNK_SIZE, null);

        lastChunk = bytesRead !== CHUNK_SIZE;

        blob += decoder.write(chunk.slice(0, bytesRead));
      }
    }

    if (blobStart < blob.length) {
      processLine(blob.substring(blobStart, blobEnd + 1));

      blobStart = blobEnd + 1;

      if (blobStart >= CHUNK_SIZE) {
        // blobStart is in characters, CHUNK_SIZE is in octets
        var freeable = blobStart / CHUNK_SIZE;

        // keep blob from growing indefinitely, not as deterministic as I'd like
        blob = blob.substring(CHUNK_SIZE);
        blobStart -= CHUNK_SIZE;
        blobEnd -= CHUNK_SIZE;
      }
    } else {
      moreLines = false;
    }
  }
}

它可能会被进一步清理,这是试验和错误的结果。

如果你想逐行读取一个文件,并将其写入另一个文件:

var fs = require('fs');
var readline = require('readline');
var Stream = require('stream');

function readFileLineByLine(inputFile, outputFile) {

   var instream = fs.createReadStream(inputFile);
   var outstream = new Stream();
   outstream.readable = true;
   outstream.writable = true;

   var rl = readline.createInterface({
      input: instream,
      output: outstream,
      terminal: false
   });

   rl.on('line', function (line) {
        fs.appendFileSync(outputFile, line + '\n');
   });
};

我对缺乏全面的解决方案感到沮丧,所以我把自己的尝试(git / npm)放在一起。复制粘贴功能列表:

Interactive line processing (callback-based, no loading the entire file into RAM) Optionally, return all lines in an array (detailed or raw mode) Interactively interrupt streaming, or perform map/filter like processing Detect any newline convention (PC/Mac/Linux) Correct eof / last line treatment Correct handling of multi-byte UTF-8 characters Retrieve byte offset and byte length information on per-line basis Random access, using line-based or byte-based offsets Automatically map line-offset information, to speed up random access Zero dependencies Tests

国家卫生研究院?你决定:-)