我试图读取一个大文件一行在一次。我在Quora上找到了一个关于这个主题的问题,但我错过了一些联系,把整个事情联系在一起。
var Lazy=require("lazy");
new Lazy(process.stdin)
.lines
.forEach(
function(line) {
console.log(line.toString());
}
);
process.stdin.resume();
我想要弄清楚的是如何一次从文件中读取一行,而不是像本例中那样读取STDIN。
我试着:
fs.open('./VeryBigFile.csv', 'r', '0666', Process);
function Process(err, fd) {
if (err) throw err;
// DO lazy read
}
但这并不奏效。我知道在必要时我可以使用PHP之类的东西,但我想弄清楚这个问题。
我不认为其他答案会起作用,因为文件比我运行它的服务器的内存大得多。
我用这个:
function emitLines(stream, re){
re = re && /\n/;
var buffer = '';
stream.on('data', stream_data);
stream.on('end', stream_end);
function stream_data(data){
buffer += data;
flush();
}//stream_data
function stream_end(){
if(buffer) stream.emmit('line', buffer);
}//stream_end
function flush(){
var re = /\n/;
var match;
while(match = re.exec(buffer)){
var index = match.index + match[0].length;
stream.emit('line', buffer.substring(0, index));
buffer = buffer.substring(index);
re.lastIndex = 0;
}
}//flush
}//emitLines
在流上使用此函数并侦听将发出的行事件。
gr-
有一个很好的模块可以逐行读取文件,它叫做行读取器
用它你只需要写:
var lineReader = require('line-reader');
lineReader.eachLine('file.txt', function(line, last) {
console.log(line);
// do whatever you want with line...
if(last){
// or check if it's the last one
}
});
如果你需要更多的控制,你甚至可以用“java风格”界面迭代文件:
lineReader.open('file.txt', function(reader) {
if (reader.hasNextLine()) {
reader.nextLine(function(line) {
console.log(line);
});
}
});
这是我最喜欢的浏览文件的方式,是使用现代async/await进行渐进式(不是“slurp”或全内存方式)文件读取的简单本机解决方案。在处理大型文本文件时,我发现这是一种“自然”的解决方案,而不必求助于readline包或任何非核心依赖项。
let buf = '';
for await ( const chunk of fs.createReadStream('myfile') ) {
const lines = buf.concat(chunk).split(/\r?\n/);
buf = lines.pop();
for( const line of lines ) {
console.log(line);
}
}
if(buf.length) console.log(buf); // last line, if file does not end with newline
您可以在fs中调整编码。creatererestream或使用chunk.toString(<arg>)。这也让你更好地微调线分裂到你的口味,即。使用.split(/\n+/)跳过空行,用{highWaterMark: <chunkSize>}控制块大小。
Don't forget to create a function like processLine(line) to avoid repeating the line processing code twice due to the ending buf leftover. Unfortunately, the ReadStream instance does not update its end-of-file flags in this setup, so there's no way, afaik, to detect within the loop that we're in the last iteration without some more verbose tricks like comparing the file size from a fs.Stats() with .bytesRead. Hence the final buf processing solution, unless you're absolutely sure your file ends with a newline \n, in which case the for await loop should suffice.
★如果你更喜欢事件异步版本,这将是它:
let buf = '';
fs.createReadStream('myfile')
.on('data', chunk => {
const lines = buf.concat(chunk).split(/\r?\n/);
buf = lines.pop();
for( const line of lines ) {
console.log(line);
}
})
.on('end', () => buf.length && console.log(buf) );
★现在如果你不介意导入流核心包,那么这是等效的管道流版本,它允许链接转换,如gzip解压:
const { Writable } = require('stream');
let buf = '';
fs.createReadStream('myfile').pipe(
new Writable({
write: (chunk, enc, next) => {
const lines = buf.concat(chunk).split(/\r?\n/);
buf = lines.pop();
for (const line of lines) {
console.log(line);
}
next();
}
})
).on('finish', () => buf.length && console.log(buf) );
虽然您可能应该像上面的答案所建议的那样使用readline模块,但readline似乎面向命令行接口,而不是行读取。关于缓冲,它也有点不透明。(任何需要流式行阅读器的人可能都想调整缓冲区大小)。readline模块大约有1000行,而这个,加上统计和测试,是34行。
const EventEmitter = require('events').EventEmitter;
class LineReader extends EventEmitter{
constructor(f, delim='\n'){
super();
this.totalChars = 0;
this.totalLines = 0;
this.leftover = '';
f.on('data', (chunk)=>{
this.totalChars += chunk.length;
let lines = chunk.split(delim);
if (lines.length === 1){
this.leftover += chunk;
return;
}
lines[0] = this.leftover + lines[0];
this.leftover = lines[lines.length-1];
if (this.leftover) lines.pop();
this.totalLines += lines.length;
for (let l of lines) this.onLine(l);
});
// f.on('error', ()=>{});
f.on('end', ()=>{console.log('chars', this.totalChars, 'lines', this.totalLines)});
}
onLine(l){
this.emit('line', l);
}
}
//Command line test
const f = require('fs').createReadStream(process.argv[2], 'utf8');
const delim = process.argv[3];
const lineReader = new LineReader(f, delim);
lineReader.on('line', (line)=> console.log(line));
下面是一个更短的版本,没有统计数据,只有19行:
class LineReader extends require('events').EventEmitter{
constructor(f, delim='\n'){
super();
this.leftover = '';
f.on('data', (chunk)=>{
let lines = chunk.split(delim);
if (lines.length === 1){
this.leftover += chunk;
return;
}
lines[0] = this.leftover + lines[0];
this.leftover = lines[lines.length-1];
if (this.leftover)
lines.pop();
for (let l of lines)
this.emit('line', l);
});
}
}
我使用下面的代码读取行后,验证它不是一个目录,它不包括在文件列表不需要检查。
(function () {
var fs = require('fs');
var glob = require('glob-fs')();
var path = require('path');
var result = 0;
var exclude = ['LICENSE',
path.join('e2e', 'util', 'db-ca', 'someother-file'),
path.join('src', 'favicon.ico')];
var files = [];
files = glob.readdirSync('**');
var allFiles = [];
var patternString = [
'trade',
'order',
'market',
'securities'
];
files.map((file) => {
try {
if (!fs.lstatSync(file).isDirectory() && exclude.indexOf(file) === -1) {
fs.readFileSync(file).toString().split(/\r?\n/).forEach(function(line){
patternString.map((pattern) => {
if (line.indexOf(pattern) !== -1) {
console.log(file + ' contain `' + pattern + '` in in line "' + line +'";');
result = 1;
}
});
});
}
} catch (e) {
console.log('Error:', e.stack);
}
});
process.exit(result);
})();