我希望用 Node 处理一个文本文件,通过命令行这样调用:

node app.js < input.txt

文件的每一行都需要单独处理,但是一旦处理了输入行就可以忘记。

使用 stdin 的 'data' 事件监听器时,输入流是按字节大小分块传来的(而不是按行),所以我写了下面这段代码。

// Read stdin as UTF-8 text and hand every complete line to processLine().
process.stdin.resume();
process.stdin.setEncoding('utf8');

// Tail of the previous chunk that has not been terminated by "\n" yet.
var lingeringLine = "";

process.stdin.on('data', function(chunk) {
    // Declared locally so no implicit global is created (which would throw
    // in strict mode / ES modules).
    const lines = chunk.split("\n");

    // The first piece completes the line left over from the previous chunk;
    // the last piece stays incomplete until the next chunk (or 'end') arrives.
    lines[0] = lingeringLine + lines[0];
    lingeringLine = lines.pop();

    // Pass only the line itself, exactly as the 'end' handler does.
    lines.forEach(function(line) {
        processLine(line);
    });
});

process.stdin.on('end', function() {
    // Flush the final line only when the input did not end with a newline;
    // otherwise the leftover is empty and would be reported as an extra line.
    if (lingeringLine.length > 0) {
        processLine(lingeringLine);
    }
});

但这看起来太草率了:必须对行数组的第一项和最后一项做特殊处理。难道没有更优雅的方式吗?


当前回答

逐行读取流,应该适合大文件管道到stdin,我的版本:

// Number of lines handed to on_line() so far; used as the printed index.
var n = 0;

/**
 * Handles a single input line, then signals completion.
 *
 * Prints the running index and the line, and synchronously invokes `cb`.
 *
 * @param {string} line - the line to handle
 * @param {Function} cb - invoked with no arguments once the line is handled
 * @returns {*} whatever `cb` returns
 */
function on_line(line, cb)
{
    const index = n;
    n += 1;
    // Per-line work goes here.
    console.log(index, "line ", line);
    return cb();
}

var fs = require('fs');
// Reads the input line by line and feeds each line to on_line(), keeping the
// stream paused while the lines of a chunk are being processed so a slow
// on_line() is not flooded with further data.
var readStream = fs.createReadStream('all_titles.txt');
//var readStream = process.stdin;
readStream.pause();
// Decode as UTF-8 so every chunk arrives as a string (split() below needs one).
readStream.setEncoding('utf8');

// Pieces of the line that is still incomplete, collected across chunks.
var buffer=[];
readStream.on('data', (chunk) => {
    // Any run of CR/LF characters counts as one separator, so blank lines
    // inside a chunk are skipped.
    const newlines=/[\r\n]+/;
    const lines=chunk.split(newlines);
    if(lines.length===1)
    {
        // No line break in this chunk: keep accumulating and wait for more.
        buffer.push(lines[0]);
        return;
    }

    // The first piece completes the line carried over from earlier chunks.
    // NOTE(review): it is an empty string when the chunk starts with a line
    // break and nothing was carried over; on_line() then receives ''.
    buffer.push(lines[0]);
    const str=buffer.join('');
    buffer.length=0;
    // Hold back further 'data' events until this chunk's lines are handled.
    readStream.pause();

    on_line(str,()=>{
        const last=lines.length-1;
        let i=0;
        // Processes lines[1..last-1] one after another; each step is driven by
        // the callback of the previous on_line() call. If on_line() calls back
        // synchronously, every line adds stack frames.
        function while_next()
        {
            i++;
            if(i<last)
            {
                return on_line(lines[i],while_next);
            }
            // The last piece may be incomplete: carry it into the next chunk.
            buffer.push(lines[last]);
            lines.length=0;
            return readStream.resume();
        }
        while_next();
    });
  }).on('end', ()=>{
      // Runs once after the last line (if any) has been handled.
      const done=()=>{
          ////after end
          console.error('done')
          ////end after end
      };
      const str=buffer.join('');
      buffer.length=0;
      // Nothing is left over for empty input or input ending with a line
      // break; only a non-empty remainder is a real final line.
      if(str.length===0)
      {
          return done();
      }
      return on_line(str,done);
  });
readStream.resume();

解释:

Setting the encoding to utf8 ensures the stream is cut on whole UTF-8 characters and never in the middle of a multi-byte sequence, so every chunk contains only complete characters. When a chunk containing line breaks arrives, the input is paused; this blocks further input until all of its lines have been consumed and prevents the buffer from overflowing when the line-processing function is slower than the input. If a chunk contains no newline, it is simply accumulated in the buffer and the handler returns. Once a chunk does contain line breaks, its first piece is appended to the accumulated buffer and the joined result is processed as one line, followed by the remaining complete lines. After all complete lines have been consumed, the last (possibly incomplete) piece is pushed to the buffer and the paused stream is resumed.

es6代码

// Number of lines handed to on_line() so far; used as the printed index.
var n = 0;

/**
 * Handles a single input line.
 *
 * Prints the running index and the line. Being `async`, it returns a promise
 * that callers can await before moving on to the next line.
 *
 * @param {string} line - the line to handle
 * @returns {Promise<void>}
 */
async function on_line(line)
{
    const index = n;
    n += 1;
    // Per-line work goes here.
    console.log(index, "line ", line);
}

var fs = require('fs');
// Reads the input line by line and awaits on_line() for each line, keeping
// the stream paused while the lines of a chunk are being processed.
var readStream = fs.createReadStream('all_titles.txt');
//var readStream = process.stdin;
readStream.pause();
// Decode as UTF-8 so every chunk arrives as a string (split() below needs one).
readStream.setEncoding('utf8');

// Pieces of the line that is still incomplete, collected across chunks.
var buffer=[];
readStream.on('data', async (chunk) => {
    // Any run of CR/LF characters counts as one separator, so blank lines
    // inside a chunk are skipped.
    const newlines=/[\r\n]+/;
    const lines=chunk.split(newlines);
    if(lines.length===1)
    {
        // No line break in this chunk: keep accumulating and wait for more.
        buffer.push(lines[0]);
        return;
    }
    // Hold back further 'data' events until this chunk's lines are handled.
    readStream.pause();

    // The first piece completes the line carried over from earlier chunks.
    buffer.push(lines[0]);
    const str=buffer.join('');
    buffer.length=0; // consumed
    await on_line(str);

    // Complete lines in the middle of the chunk, strictly one at a time.
    for(let i=1;i<lines.length-1;i++)
    {
        await on_line(lines[i]);
    }
    // The last piece may be incomplete: carry it into the next chunk.
    buffer.push(lines[lines.length-1]);
    lines.length=0; // optional, lets the pieces be collected early
    return readStream.resume();
  }).on('end', async ()=>{
      const str=buffer.join('');
      buffer.length=0;
      // Nothing is left over for empty input or input ending with a line
      // break; only a non-empty remainder is a real final line.
      if(str.length>0)
      {
          await on_line(str);
      }
  });
readStream.resume();

我没有测试es6代码

其他回答

在我的例子中,程序(elinks)返回的行看起来是空的,但实际上含有特殊的终端字符、颜色控制代码和退格符,所以其他答案中提供的 grep 选项对我不起作用。于是我用 Node.js 写了这个小脚本。我把这个文件命名为 tight,但那只是随便起的名字。

#!/usr/bin/env node

/**
 * Strips terminal control noise from a line and returns what remains visible.
 *
 * - A backspace ('\b') removes the previously kept character.
 * - An escape character ('\u001b') starts a sequence that is skipped up to and
 *   including the next 'm'; an unterminated sequence ends the scan.
 * - Every other character is kept as is.
 *
 * @param {string} a - raw line, possibly containing control characters
 * @returns {string} the characters that remain after the rules above
 */
function visible(a) {
    let shown = '';
    for (let i = 0; i < a.length; i++) {
        const ch = a[i];
        if (ch === '\b') {
            // Drop the last kept character; a no-op when nothing is kept yet.
            shown = shown.slice(0, -1);
            continue;
        }
        if (ch === '\u001b') {
            // Advance to the 'm' that closes the sequence; the loop increment
            // then moves past it.
            while (i < a.length && a[i] !== 'm') i++;
            if (i >= a.length) break; // sequence never terminated
            continue;
        }
        shown += ch;
    }
    return shown;
}

/**
 * Tells whether a line shows nothing but spaces once it has been passed
 * through visible().
 *
 * @param {string} a - raw line
 * @returns {boolean} true when the visible text is empty or all spaces
 */
function empty(a) {
    const text = visible(a);
    let index = 0;
    while (index < text.length) {
        if (text[index] !== ' ') {
            return false;
        }
        index += 1;
    }
    return true;
}

// Filter stdin line by line: only lines with visible content are printed.
var readline = require('readline')
var rl = readline.createInterface({ input: process.stdin, output: process.stdout, terminal: false })

rl.on('line', function(line) {
    if (!empty(line)) console.log(line)
})
// stdin is deliberately NOT piped to stdout as well: that would copy every
// input line to the output, including the ones filtered out above.

逐行读取流,应该适合大文件管道到stdin,我的版本:

// Number of lines handed to on_line() so far; used as the printed index.
var n = 0;

/**
 * Handles a single input line, then signals completion.
 *
 * Prints the running index and the line, and synchronously invokes `cb`.
 *
 * @param {string} line - the line to handle
 * @param {Function} cb - invoked with no arguments once the line is handled
 * @returns {*} whatever `cb` returns
 */
function on_line(line, cb)
{
    const index = n;
    n += 1;
    // Per-line work goes here.
    console.log(index, "line ", line);
    return cb();
}

var fs = require('fs');
// Reads the input line by line and feeds each line to on_line(), keeping the
// stream paused while the lines of a chunk are being processed so a slow
// on_line() is not flooded with further data.
var readStream = fs.createReadStream('all_titles.txt');
//var readStream = process.stdin;
readStream.pause();
// Decode as UTF-8 so every chunk arrives as a string (split() below needs one).
readStream.setEncoding('utf8');

// Pieces of the line that is still incomplete, collected across chunks.
var buffer=[];
readStream.on('data', (chunk) => {
    // Any run of CR/LF characters counts as one separator, so blank lines
    // inside a chunk are skipped.
    const newlines=/[\r\n]+/;
    const lines=chunk.split(newlines);
    if(lines.length===1)
    {
        // No line break in this chunk: keep accumulating and wait for more.
        buffer.push(lines[0]);
        return;
    }

    // The first piece completes the line carried over from earlier chunks.
    // NOTE(review): it is an empty string when the chunk starts with a line
    // break and nothing was carried over; on_line() then receives ''.
    buffer.push(lines[0]);
    const str=buffer.join('');
    buffer.length=0;
    // Hold back further 'data' events until this chunk's lines are handled.
    readStream.pause();

    on_line(str,()=>{
        const last=lines.length-1;
        let i=0;
        // Processes lines[1..last-1] one after another; each step is driven by
        // the callback of the previous on_line() call. If on_line() calls back
        // synchronously, every line adds stack frames.
        function while_next()
        {
            i++;
            if(i<last)
            {
                return on_line(lines[i],while_next);
            }
            // The last piece may be incomplete: carry it into the next chunk.
            buffer.push(lines[last]);
            lines.length=0;
            return readStream.resume();
        }
        while_next();
    });
  }).on('end', ()=>{
      // Runs once after the last line (if any) has been handled.
      const done=()=>{
          ////after end
          console.error('done')
          ////end after end
      };
      const str=buffer.join('');
      buffer.length=0;
      // Nothing is left over for empty input or input ending with a line
      // break; only a non-empty remainder is a real final line.
      if(str.length===0)
      {
          return done();
      }
      return on_line(str,done);
  });
readStream.resume();

解释:

Setting the encoding to utf8 ensures the stream is cut on whole UTF-8 characters and never in the middle of a multi-byte sequence, so every chunk contains only complete characters. When a chunk containing line breaks arrives, the input is paused; this blocks further input until all of its lines have been consumed and prevents the buffer from overflowing when the line-processing function is slower than the input. If a chunk contains no newline, it is simply accumulated in the buffer and the handler returns. Once a chunk does contain line breaks, its first piece is appended to the accumulated buffer and the joined result is processed as one line, followed by the remaining complete lines. After all complete lines have been consumed, the last (possibly incomplete) piece is pushed to the buffer and the paused stream is resumed.

es6代码

// Number of lines handed to on_line() so far; used as the printed index.
var n = 0;

/**
 * Handles a single input line.
 *
 * Prints the running index and the line. Being `async`, it returns a promise
 * that callers can await before moving on to the next line.
 *
 * @param {string} line - the line to handle
 * @returns {Promise<void>}
 */
async function on_line(line)
{
    const index = n;
    n += 1;
    // Per-line work goes here.
    console.log(index, "line ", line);
}

var fs = require('fs');
// Reads the input line by line and awaits on_line() for each line, keeping
// the stream paused while the lines of a chunk are being processed.
var readStream = fs.createReadStream('all_titles.txt');
//var readStream = process.stdin;
readStream.pause();
// Decode as UTF-8 so every chunk arrives as a string (split() below needs one).
readStream.setEncoding('utf8');

// Pieces of the line that is still incomplete, collected across chunks.
var buffer=[];
readStream.on('data', async (chunk) => {
    // Any run of CR/LF characters counts as one separator, so blank lines
    // inside a chunk are skipped.
    const newlines=/[\r\n]+/;
    const lines=chunk.split(newlines);
    if(lines.length===1)
    {
        // No line break in this chunk: keep accumulating and wait for more.
        buffer.push(lines[0]);
        return;
    }
    // Hold back further 'data' events until this chunk's lines are handled.
    readStream.pause();

    // The first piece completes the line carried over from earlier chunks.
    buffer.push(lines[0]);
    const str=buffer.join('');
    buffer.length=0; // consumed
    await on_line(str);

    // Complete lines in the middle of the chunk, strictly one at a time.
    for(let i=1;i<lines.length-1;i++)
    {
        await on_line(lines[i]);
    }
    // The last piece may be incomplete: carry it into the next chunk.
    buffer.push(lines[lines.length-1]);
    lines.length=0; // optional, lets the pieces be collected early
    return readStream.resume();
  }).on('end', async ()=>{
      const str=buffer.join('');
      buffer.length=0;
      // Nothing is left over for empty input or input ending with a line
      // break; only a non-empty remainder is a real final line.
      if(str.length>0)
      {
          await on_line(str);
      }
  });
readStream.resume();

我没有测试es6代码

readline 是专门为终端(即 process.stdin.isTTY === true)而设计的。有很多模块可以为一般的流提供按行拆分的功能,比如 split。它让事情变得超级简单:

// Route stdin through the stream created by the `split` module and pass each
// 'data' event it emits to processLine().
process.stdin
  .pipe(require('split')())
  .on('data', processLine);

/**
 * Prints the given line with an exclamation mark appended.
 *
 * @param {string} line - one line of input
 */
function processLine (line) {
  const decorated = line + '!';
  console.log(decorated);
}
#!/usr/bin/env node

const EventEmitter = require('events');

/**
 * Wraps process.stdin in an emitter that fires one 'line' event per input line.
 *
 * Incoming chunks are appended to a carry-over string and split on "\r\n" or
 * "\n". Every complete line is emitted immediately; the unterminated tail is
 * kept until more data arrives and is emitted on 'end' if it is not empty.
 *
 * @returns {EventEmitter} emitter whose 'line' listeners receive each line
 *   (without its line terminator) as a string
 */
function stdinLineByLine() {
  const stdin = new EventEmitter();
  let buff = '';

  // Decode as UTF-8 text so a multi-byte character that straddles two chunks
  // is not corrupted when chunks are appended to the carry-over string.
  process.stdin.setEncoding('utf8');

  process.stdin
    .on('data', data => {
      buff += data;
      // Declared locally so no implicit global is created (which throws in
      // strict mode / ES modules).
      const lines = buff.split(/\r\n|\n/);
      // The last piece has no terminator yet: keep it for the next chunk.
      buff = lines.pop();
      lines.forEach(line => stdin.emit('line', line));
    })
    .on('end', () => {
      if (buff.length > 0) stdin.emit('line', buff);
    });

  return stdin;
}

// Print every line delivered by the line-by-line stdin wrapper.
const stdin = stdinLineByLine();
stdin.on('line', (...args) => console.log(...args));