我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost的Spirit,但我愿意尝试一下。不过前提是,没有被我忽略的更直接的解决方案。
我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost的Spirit,但我愿意尝试一下。不过前提是,没有被我忽略的更直接的解决方案。
当前回答
不管怎样,下面是我的实现。它处理wstring输入,但是可以很容易地调整为string。它不处理字段中的换行符(因为我的应用程序也不这样做,但添加它的支持并不太难),它不符合RFC中的“\r\n”行尾(假设您使用std::getline),但它确实正确地处理空格修剪和双引号(希望如此)。
using namespace std;
// Normalises one raw CSV field.
//
// Leading and trailing characters contained in `whitespace` are stripped. If
// what remains is enclosed in a pair of `quotChar` characters, the enclosing
// quotes are removed and every escaped quote inside (a doubled quotChar) is
// collapsed to a single quotChar.
//
// str        - raw field text, as cut out of the line
// whitespace - set of characters to strip from both ends
// quotChar   - quoting character (normally L'"')
//
// Returns the cleaned field; an empty string if `str` holds only whitespace.
wstring trimquote(const wstring& str, const wstring& whitespace, const wchar_t quotChar)
{
    const wstring::size_type first = str.find_first_not_of(whitespace);
    if (first == wstring::npos)
        return L"";
    const wstring::size_type last = str.find_last_not_of(whitespace);

    // A quoted field needs two distinct quote characters. Requiring
    // last > first keeps a lone quote from being treated as both the opening
    // and the closing quote, which would make the inner length underflow.
    const bool quoted = (last > first) && (str[first] == quotChar) && (str[last] == quotChar);
    if (!quoted)
        return str.substr(first, last - first + 1);

    wstring inner = str.substr(first + 1, last - first - 1);
    // Drop the first quote of each doubled pair; resuming the search one past
    // the erase position steps over the quote that is kept.
    for (wstring::size_type pos = inner.find(quotChar);
         pos != wstring::npos;
         pos = inner.find(quotChar, pos + 1))
    {
        inner.erase(pos, 1);
    }
    return inner;
}
// Locates the next quoted section of `line` at or after offset `ofs`.
//
// Returns {open, close}: the index of the opening quotChar and the index of
// the matching closing quotChar. A doubled quotChar inside the section is an
// escaped quote and is skipped. A position that does not exist is reported as
// static_cast<unsigned>(wstring::npos): both members when there is no quote
// at all, only `second` when the section is never closed.
//
// All searching is done in wstring::size_type. Narrowing a find() result to
// `unsigned` before comparing it with npos makes the comparison always true
// where size_type is wider than unsigned, which sent the original loop
// spinning forever on an unterminated quote.
pair<unsigned, unsigned> nextCSVQuotePair(const wstring& line, const wchar_t quotChar, unsigned ofs = 0)
{
    const unsigned notFound = static_cast<unsigned>(wstring::npos);
    pair<unsigned, unsigned> r(notFound, notFound);

    const wstring::size_type open = line.find(quotChar, ofs);
    if (open == wstring::npos)
        return r;
    r.first = static_cast<unsigned>(open);

    wstring::size_type close = line.find(quotChar, open + 1);
    // A quote immediately followed by another quote is an escaped quote, not
    // the end of the section: step over both and keep looking.
    while (close != wstring::npos
           && close + 1 < line.size()
           && line[close + 1] == quotChar)
    {
        close = line.find(quotChar, close + 2);
    }
    if (close != wstring::npos)
        r.second = static_cast<unsigned>(close);
    return r;
}
// Splits one CSV line into fields.
//
// fields - output; cleared first, then filled with one entry per field. Each
//          entry has been passed through trimquote (whitespace trimmed,
//          enclosing quotes removed, doubled quotes collapsed).
// line   - a single line of text without its line terminator.
//
// Returns the number of fields stored in `fields`.
//
// Commas inside a quoted section do not split the field. If a quote is never
// closed, the rest of the line is kept as a single field.
unsigned parseLine(vector<wstring>& fields, const wstring& line)
{
    const wchar_t delim = L',';
    const wstring whitespace = L" \t\xa0\x3000\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x202f\x205f";
    const wchar_t quotChar = L'\"';
    // nextCSVQuotePair reports "no such position" as npos narrowed to unsigned.
    const unsigned notFound = static_cast<unsigned>(wstring::npos);

    fields.clear();
    wstring::size_type fieldStart = 0;  // first character of the field being assembled
    wstring::size_type searchFrom = 0;  // where the next delimiter search starts
    pair<unsigned, unsigned> quot = nextCSVQuotePair(line, quotChar);

    // Positions stay in size_type: narrowing find()'s result to unsigned makes
    // the comparison with npos always true on platforms with a wider size_type.
    wstring::size_type np;
    while ((np = line.find(delim, searchFrom)) != wstring::npos)
    {
        // Discard quoted sections that end before this delimiter, so a later
        // quoted field is still recognised after an earlier one has passed.
        while ((quot.first != notFound) && (quot.second != notFound) && (quot.second < np))
            quot = nextCSVQuotePair(line, quotChar, quot.second + 1);

        if ((quot.first != notFound) && (quot.first < np))
        {
            // The delimiter lies inside a quoted section.
            if (quot.second == notFound)
                break;  // unterminated quote: the remainder is one field
            searchFrom = static_cast<wstring::size_type>(quot.second) + 1;
            continue;
        }

        fields.push_back( trimquote(line.substr(fieldStart, np - fieldStart), whitespace, quotChar) );
        searchFrom = fieldStart = np + 1;
    }
    fields.push_back( trimquote(line.substr(fieldStart), whitespace, quotChar) );
    return static_cast<unsigned>(fields.size());
}
其他回答
我的版本只使用标准c++ 11库。它很好地处理Excel CSV引用:
spam eggs,"foo,bar","""fizz buzz"""
1.23,4.567,-8.00E+09
代码以有限状态机的形式编写,每次只处理一个字符。我认为这样更容易理解和推理。
#include <istream>
#include <string>
#include <vector>
/// Parser states of the character-at-a-time machine in readCSVRow.
enum class CSVState {
    UnquotedField,  ///< outside any quoted section
    QuotedField,    ///< inside a quoted section
    QuotedQuote     ///< a quote was just seen inside a quoted section: it either
                    ///< closes the section or is the first half of an escaped ""
};

/// Splits one CSV row (Excel dialect) into its fields.
///
/// @param row  a single line of text without its line terminator
/// @return     the fields in order; always at least one element (an empty
///             row yields a single empty field)
///
/// Commas inside quotes do not split the field, and "" inside quotes produces
/// one literal quote. Text that follows a closing quote is appended to the
/// same field, so `"abc"def` yields `abcdef`.
std::vector<std::string> readCSVRow(const std::string &row) {
    CSVState state = CSVState::UnquotedField;
    std::vector<std::string> fields {""};  // the field being built is always fields.back()

    for (const char c : row) {
        switch (state) {
            case CSVState::UnquotedField:
                if (c == ',') {
                    fields.emplace_back();  // end of field
                } else if (c == '"') {
                    state = CSVState::QuotedField;
                } else {
                    fields.back().push_back(c);
                }
                break;

            case CSVState::QuotedField:
                if (c == '"') {
                    state = CSVState::QuotedQuote;
                } else {
                    fields.back().push_back(c);
                }
                break;

            case CSVState::QuotedQuote:
                if (c == ',') {
                    // the previous quote closed the section; the comma ends the field
                    fields.emplace_back();
                    state = CSVState::UnquotedField;
                } else if (c == '"') {
                    // "" -> "
                    fields.back().push_back('"');
                    state = CSVState::QuotedField;
                } else {
                    // The previous quote closed the section. Keep this character:
                    // discarding it would silently lose data.
                    fields.back().push_back(c);
                    state = CSVState::UnquotedField;
                }
                break;
        }
    }
    return fields;
}
/// Reads an entire CSV stream (Excel dialect) into a table of rows.
///
/// Each line obtained with std::getline is split by readCSVRow, so quoted
/// fields such as "quoted fields ""with quotes""" are accepted. Reading stops
/// at end of input or as soon as a read fails.
///
/// @param in  stream to read from
/// @return    one vector of fields per line, in input order
std::vector<std::vector<std::string>> readCSV(std::istream &in) {
    std::vector<std::vector<std::string>> rows;
    // getline's result converts to false exactly when failbit or badbit is set.
    for (std::string line; !in.eof() && std::getline(in, line); ) {
        rows.push_back(readCSVRow(line));
    }
    return rows;
}
使用Spirit来解析csv并不过分。Spirit非常适合微解析任务。例如,使用Spirit 2.1,它就像:
// Parse the character range [first, last) as a comma-separated list of
// doubles and store the parsed values in v. `r` receives phrase_parse's
// boolean result.
// NOTE(review): first, last and v are declared outside this fragment;
// presumably iterators over the input text and a std::vector<double> — confirm.
bool r = phrase_parse(first, last,
// Begin grammar
(
// Spirit's list operator: `a % b` matches one or more `a` separated by `b`.
double_ % ','
)
,
// End grammar
// `space` is the skip parser, i.e. what is skipped between tokens.
space, v);
向量v被值填满了。在刚刚与Boost 1.41一起发布的新的Spirit 2.1文档中,有一系列教程涉及到这一点。
这些教程由浅入深。CSV解析器出现在中间部分,并涉及使用Spirit的各种技术。生成的代码与手写代码一样紧凑。不妨看看生成的汇编代码!
如果您所需要的只是加载一个双精度数据文件(没有整数,没有文本),那么这里有一个随时可用的函数。
#include <sstream>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
/**
 * Parse a CSV data file and fill the 2d STL vector "data".
 *
 * Limits: only plain numeric fields, not enclosed in quotes and without
 * embedded newlines. Empty lines are skipped. A run of consecutive delimiters
 * counts as one, so empty fields are not preserved.
 * Both dot and comma decimal separators are supported, and the other of the
 * two characters is removed as a thousands separator.
 *
 * Each line is split into tokens first and the separators are normalised per
 * token. Normalising the whole line first would delete the field delimiter
 * whenever it is the same character as the thousands separator (for example
 * delimiter "," with decimal separator ".").
 *
 * @param data          output table; cleared on entry, one inner vector per row
 * @param filename      path of the file to read
 * @param delimiter     set of characters, each of which separates fields
 * @param decseparator  decimal separator; only its first character is used
 *
 * @return returnCodes[0]: 0 -> ok; 1 -> file could not be opened;
 *                         2 -> decimal separator equal to field delimiter
 *         returnCodes[1]: number of records
 *         returnCodes[2]: number of fields; -1 if rows have different field counts
 */
vector<int>
readCsvData (vector <vector <double>>& data, const string& filename, const string& delimiter, const string& decseparator){
    vector<int> returnCodes(3, 0);
    data.clear();

    ifstream iFile(filename, ios_base::in);
    if (!iFile.is_open()){
        returnCodes[0] = 1;
        return returnCodes;
    }
    // Argument check; it does not depend on the file contents, so it is done
    // once here rather than for every row.
    if (delimiter == decseparator){
        returnCodes[0] = 2;
        return returnCodes;
    }

    const bool dotIsDecimal = (decseparator == ".");
    string rowstring;
    int rowcount = 0;
    vector<double>::size_type fieldcount = 0;

    while (getline(iFile, rowstring)) {
        if (rowstring.empty()) continue; // empty line
        ++rowcount; // records are counted from 1

        vector<double> tokens;
        // Skip delimiters at the beginning.
        string::size_type lastPos = rowstring.find_first_not_of(delimiter, 0);
        // Find the end of the first token.
        string::size_type pos = rowstring.find_first_of(delimiter, lastPos);
        while (string::npos != pos || string::npos != lastPos){
            string stringtoken = rowstring.substr(lastPos, pos - lastPos);

            if (dotIsDecimal){
                // remove commas (used as thousand separators)
                stringtoken.erase(remove(stringtoken.begin(), stringtoken.end(), ','), stringtoken.end());
            } else {
                // remove dots (used as thousand separators)
                stringtoken.erase(remove(stringtoken.begin(), stringtoken.end(), '.'), stringtoken.end());
                // replace the decimal separator with a dot so it can be parsed
                replace(stringtoken.begin(), stringtoken.end(), decseparator.c_str()[0], '.');
            }

            // A token that is empty after normalisation counts as 0.0.
            double doubletoken = 0.0;
            if (!stringtoken.empty()) {
                istringstream totalSString(stringtoken);
                totalSString >> doubletoken;
            }
            tokens.push_back(doubletoken);

            // Skip delimiters. Note the "not_of".
            lastPos = rowstring.find_first_not_of(delimiter, pos);
            // Find the end of the next token.
            pos = rowstring.find_first_of(delimiter, lastPos);
        }

        if (rowcount == 1){
            fieldcount = tokens.size();
            returnCodes[2] = static_cast<int>(tokens.size());
        } else if (tokens.size() != fieldcount){
            returnCodes[2] = -1;
        }
        data.push_back(tokens);
    }

    returnCodes[1] = rowcount;
    return returnCodes;
}
可以使用std::regex。
根据文件大小和可用内存,可以逐行读取,也可以完全在std::string中读取。
读取文件可以使用:
// Open the input file and read its whole contents into the string `sin`.
std::ifstream t("file.txt");
// The extra parentheses around the first argument keep this line from being
// parsed as a function declaration. A default-constructed
// istreambuf_iterator is the end-of-stream marker.
std::string sin((std::istreambuf_iterator<char>(t)),
std::istreambuf_iterator<char>());
然后可以用下面的正则表达式对其进行匹配,该表达式可以根据你的需要自行定制。
std::regex word_regex(",\\s]+");
auto what =
std::sregex_iterator(sin.begin(), sin.end(), word_regex);
auto wend = std::sregex_iterator();
std::vector<std::string> v;
for (;what!=wend ; wend) {
std::smatch match = *what;
v.push_back(match.str());
}
如果你确实需要正确地解析CSV,下面的代码可以做到……不过相对较慢,因为它一次只处理一个字符。
// Parses a whole CSV document held in memory, one character at a time.
//
// csvSource - complete CSV text; rows may end in "\n", "\r" or "\r\n"
// lines     - output; cleared first, then filled with one vector of fields
//             per row
//
// Rules:
//  * a comma outside quotes ends the current field;
//  * a line break outside quotes ends the current row; blank lines, and the
//    second half of "\r\n", produce no row;
//  * inside quotes, commas and line breaks are ordinary data, and a doubled
//    quote ("") stands for one literal quote character;
//  * a final row without a trailing line break is still emitted, including a
//    trailing empty field (for example "a," yields {"a", ""}).
void ParseCSV(const string& csvSource, vector<vector<string> >& lines)
{
    bool inQuote = false;
    bool rowPending = false;  // true once the current row has consumed any character
    string field;
    vector<string> line;
    lines.clear();

    const string::size_type length = csvSource.size();
    for (string::size_type i = 0; i < length; ++i)
    {
        const char c = csvSource[i];

        if (inQuote)
        {
            if (c != '"')
            {
                field += c;
            }
            else if (i + 1 < length && csvSource[i + 1] == '"')
            {
                // escaped quote: keep one, skip its partner
                field += '"';
                ++i;
            }
            else
            {
                inQuote = false;  // closing quote
            }
            continue;
        }

        switch (c)
        {
        case '"':
            inQuote = true;
            rowPending = true;
            break;
        case ',':
            line.push_back(field);
            field.clear();
            rowPending = true;
            break;
        case '\n':
        case '\r':
            // Only a row that has content is closed; this also swallows the
            // '\n' of a "\r\n" pair and blank lines.
            if (rowPending)
            {
                line.push_back(field);
                lines.push_back(line);
                field.clear();
                line.clear();
                rowPending = false;
            }
            break;
        default:
            field += c;
            rowPending = true;
            break;
        }
    }

    // Flush a last row that was not terminated by a line break.
    if (rowPending)
    {
        line.push_back(field);
        lines.push_back(line);
    }
}