我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
当前回答
当你使用boost::spirit这样漂亮的东西时,你应该感到自豪
下面是我尝试编写的一个解析器,它(几乎)符合此链接中的CSV规范(我不需要支持字段内的换行,逗号周围的空格也会被忽略)。
在你克服了编译这段代码需要等待10秒的令人震惊的经历之后:),你就可以坐下来享受了。
// csvparser.cpp
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <iostream>
#include <string>
namespace qi = boost::spirit::qi;
namespace bascii = boost::spirit::ascii;
// Boost.Spirit Qi grammar that parses one comma-separated line into a
// std::vector<std::string>. The grammar is declared with bascii::space_type
// as its skipper, so whitespace between tokens is skipped; field bodies are
// wrapped in lexeme[] so no skipping happens while a field is being read.
template <typename Iterator>
struct csv_parser : qi::grammar<Iterator, std::vector<std::string>(),
bascii::space_type>
{
// Single-character helper rules (declared without a skipper).
qi::rule<Iterator, char() > COMMA;
qi::rule<Iterator, char() > DDQUOTE;
// String-producing rules for the two kinds of field and their alternative.
qi::rule<Iterator, std::string(), bascii::space_type > non_escaped;
qi::rule<Iterator, std::string(), bascii::space_type > escaped;
qi::rule<Iterator, std::string(), bascii::space_type > field;
// Entry rule: the whole line as a list of fields.
qi::rule<Iterator, std::vector<std::string>(), bascii::space_type > start;
// Wires up the rules; `start` is registered as the grammar's entry point.
csv_parser() : csv_parser::base_type(start)
{
using namespace qi;
using qi::lit;
using qi::lexeme;
using bascii::char_;
// A line is a list of fields separated by ','.
start = field % ',';
// A quoted field is tried first; otherwise fall back to a bare field.
field = escaped | non_escaped;
// Quoted field: between the surrounding quotes accept any character other
// than '"' and ',', or a comma (via COMMA), or a doubled quote (via DDQUOTE).
escaped = lexeme['"' >> *( char_ -(char_('"') | ',') | COMMA | DDQUOTE) >> '"'];
// Bare field: zero or more characters that are neither '"' nor ','.
non_escaped = lexeme[ *( char_ -(char_('"') | ',') ) ];
// Two consecutive double quotes yield a single '"' character.
DDQUOTE = lit("\"\"") [_val = '"'];
// A literal comma yields ','.
COMMA = lit(",") [_val = ','];
}
};
int main()
{
std::cout << "Enter CSV lines [empty] to quit\n";
using bascii::space;
typedef std::string::const_iterator iterator_type;
typedef csv_parser<iterator_type> csv_parser;
csv_parser grammar;
std::string str;
int fid;
while (getline(std::cin, str))
{
fid = 0;
if (str.empty())
break;
std::vector<std::string> csv;
std::string::const_iterator it_beg = str.begin();
std::string::const_iterator it_end = str.end();
bool r = phrase_parse(it_beg, it_end, grammar, space, csv);
if (r && it_beg == it_end)
{
std::cout << "Parsing succeeded\n";
for (auto& field: csv)
{
std::cout << "field " << ++fid << ": " << field << std::endl;
}
}
else
{
std::cout << "Parsing failed\n";
}
}
return 0;
}
编译:
make csvparser
测试(例子摘自维基百科):
./csvparser
Enter CSV lines [empty] to quit
1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00
Parsing succeeded
field 1: 1999
field 2: Chevy
field 3: Venture "Extended Edition, Very Large"
field 4:
field 5: 5000.00
1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00"
Parsing failed
其他回答
您需要做的第一件事是确保文件存在。要做到这一点,只需尝试用给定的路径打开文件流;打开之后,使用stream.fail()检查打开操作是否成功。
// Reports whether `fileName` can be opened for reading.
// Returns true when the open succeeded, false when the stream reports failure.
bool fileExists(string fileName)
{
    ifstream probe(fileName.c_str());
    const bool opened = !probe.fail();
    probe.close();
    return opened;
}
您还必须验证所提供的文件是正确的文件类型。要做到这一点,您需要在提供的文件路径中查找文件扩展名。找到文件扩展名之后,请确认它是一个.csv文件。
// Returns true when the text starting at the last '.' of `filename` is exactly
// ".csv" (case-sensitive). When there is no '.', the whole name is compared.
bool verifyExtension(string filename)
{
    const string::size_type lastDot = filename.rfind('.');
    const string::size_type from = (lastDot == string::npos) ? 0 : lastDot;
    return filename.substr(from) == ".csv";
}
此函数将返回稍后在错误消息中使用的文件扩展名。
// Returns the text of `filename` from its last '.' to the end (dot included).
// Returns the literal "NO FILE" when there is no '.' or when the last '.' is
// the very first character.
string getExtension(string filename)
{
    const string::size_type lastDot = filename.rfind('.');
    if (lastDot == string::npos || lastDot == 0)
        return "NO FILE";
    return filename.substr(lastDot);
}
这个函数实际上会调用上面创建的错误检查,然后解析文件。
void parseFile(string fileName)
{
if (fileExists(fileName) && verifyExtension(fileName))
{
ifstream fs;
fs.open(fileName.c_str());
string fileCommand;
while (fs.good())
{
string temp;
getline(fs, fileCommand, '\n');
for (unsigned int i = 0; i < fileCommand.length(); i++)
{
if (fileCommand[i] != ',')
temp += fileCommand[i];
else
temp += " ";
}
if (temp != "\0")
{
// Place your code here to run the file.
}
}
fs.close();
}
else if (!fileExists(fileName))
{
cout << "Error: The provided file does not exist: " << fileName << endl;
if (!verifyExtension(fileName))
{
if (getExtension(fileName) != "NO FILE")
cout << "\tCheck the file extension." << endl;
else
cout << "\tThere is no file in the provided path." << endl;
}
}
else if (!verifyExtension(fileName))
{
if (getExtension(fileName) != "NO FILE")
cout << "Incorrect file extension provided: " << getExtension(fileName) << endl;
else
cout << "There is no file in the following path: " << fileName << endl;
}
}
可以使用std::regex。
根据文件大小和可用内存,可以逐行读取,也可以完全在std::string中读取。
读取文件可以使用:
// Read the entire contents of "file.txt" into the string `sin` by constructing
// it from a pair of stream-buffer iterators. The extra parentheses around the
// first argument keep the statement from being parsed as a function declaration.
std::ifstream t("file.txt");
std::string sin((std::istreambuf_iterator<char>(t)),
std::istreambuf_iterator<char>());
然后可以用下面的正则表达式对其进行匹配,该表达式可以根据你的需要进行定制。
std::regex word_regex(",\\s]+");
auto what =
std::sregex_iterator(sin.begin(), sin.end(), word_regex);
auto wend = std::sregex_iterator();
std::vector<std::string> v;
for (;what!=wend ; wend) {
std::smatch match = *what;
v.push_back(match.str());
}
这是对@sastanin的解决方案稍作修改的版本,使其可以处理引号内的换行。
// Parses a whole CSV stream into a table (rows of string fields).
//
// Rules implemented:
//   - ',' separates fields and '\n' ends a row, but only outside quotes;
//   - inside a quoted field, ',' and '\n' are ordinary characters and ""
//     stands for a single '"';
//   - any other character directly after a closing quote is dropped;
//   - a final row without a trailing '\n' is still returned, while a trailing
//     '\n' does not produce an extra empty row.
// '\r' receives no special treatment.
//
// @param in  stream to read from; it is consumed until extraction fails.
// @return    one vector of fields per row, in input order.
std::vector<std::vector<std::string>> readCSV(std::istream &in) {
    // Kept local so the function does not rely on a declaration elsewhere.
    enum class State { UnquotedField, QuotedField, QuotedQuote };

    std::vector<std::vector<std::string>> table;
    std::vector<std::string> fields {""};
    State state = State::UnquotedField;
    bool rowStarted = false; // true once the current row has consumed a character

    // Stores the finished row and resets the per-row state.
    const auto endRow = [&]() {
        table.push_back(fields);
        fields.assign(1, std::string());
        state = State::UnquotedField;
        rowStarted = false;
    };

    char c;
    while (in.get(c)) {
        rowStarted = true;
        switch (state) {
            case State::UnquotedField:
                switch (c) {
                    case ',': // end of field
                        fields.push_back("");
                        break;
                    case '"':
                        state = State::QuotedField;
                        break;
                    case '\n': // end of row
                        endRow();
                        break;
                    default:
                        fields.back().push_back(c);
                        break;
                }
                break;
            case State::QuotedField:
                if (c == '"')
                    state = State::QuotedQuote;
                else
                    fields.back().push_back(c); // includes ',' and '\n'
                break;
            case State::QuotedQuote:
                switch (c) {
                    case ',': // , after closing quote
                        fields.push_back("");
                        state = State::UnquotedField;
                        break;
                    case '"': // "" -> "
                        fields.back().push_back('"');
                        state = State::QuotedField;
                        break;
                    case '\n': // newline after closing quote ends the row
                        endRow();
                        break;
                    default: // end of quote
                        state = State::UnquotedField;
                        break;
                }
                break;
        }
    }

    // Emit a last row that was not terminated by '\n'.
    if (rowStarted)
        table.push_back(fields);
    return table;
}
不管怎样,下面是我的实现。它处理wstring输入,但是可以很容易地调整为string。它不处理字段中的换行符(因为我的应用程序也不这样做,但添加它的支持并不太难),它不符合RFC中的“\r\n”行尾(假设您使用std::getline),但它确实正确地处理空格修剪和双引号(希望如此)。
using namespace std;
// Trims `whitespace` characters from both ends of `str`. When what remains is
// enclosed in a pair of `quotChar`, the enclosing quotes are removed and every
// doubled quote inside is collapsed to a single one.
// Returns an empty string when `str` holds only whitespace.
wstring trimquote(const wstring& str, const wstring& whitespace, const wchar_t quotChar)
{
    const wstring::size_type strBegin = str.find_first_not_of(whitespace);
    if (strBegin == wstring::npos)
        return L"";
    const wstring::size_type strEnd = str.find_last_not_of(whitespace);
    const wstring::size_type strRange = strEnd - strBegin + 1;

    // A quoted field needs two distinct quote characters; a lone quote must not
    // be taken as both opener and closer (strRange - 2 would underflow).
    const bool quoted = (strEnd > strBegin)
                     && (str[strBegin] == quotChar)
                     && (str[strEnd] == quotChar);
    if (!quoted)
        return str.substr(strBegin, strRange);

    wstring ws = str.substr(strBegin + 1, strRange - 2);
    wstring::size_type pos = 0;
    while ((pos = ws.find(quotChar, pos)) != wstring::npos)
    {
        ws.erase(pos, 1);
        ++pos; // the second quote of the pair moved into `pos`; keep it
    }
    return ws;
}
// Finds the next quoted section of `line` at or after offset `ofs`.
// Returns {index of opening quote, index of closing quote}; doubled quotes
// inside the section are skipped. A position that does not exist is reported
// as static_cast<unsigned>(wstring::npos): both members when there is no
// quote at all, only `second` when the quote is never closed.
pair<unsigned, unsigned> nextCSVQuotePair(const wstring& line, const wchar_t quotChar, unsigned ofs = 0)
{
    const unsigned notFound = static_cast<unsigned>(wstring::npos);
    pair<unsigned, unsigned> r(notFound, notFound);

    // Search with size_type: storing find()'s result in `unsigned` first would
    // truncate npos and make the comparison below always true on 64-bit targets.
    const wstring::size_type open = line.find(quotChar, ofs);
    if (open == wstring::npos)
        return r;
    r.first = static_cast<unsigned>(open);

    wstring::size_type close = open;
    while (((close = line.find(quotChar, close + 1)) != wstring::npos)
        && (close + 1 < line.size())
        && (line[close + 1] == quotChar))
        ++close; // step onto the second quote of an escaped pair
    if (close != wstring::npos)
        r.second = static_cast<unsigned>(close);
    return r;
}
// Splits `line` at commas that lie outside quoted sections, cleans each piece
// with trimquote(), and stores the results in `fields` (cleared first).
// A quote that is never closed makes the rest of the line part of that field.
// Returns the number of fields, which is always at least 1.
unsigned parseLine(vector<wstring>& fields, const wstring& line)
{
    const wchar_t delim = L',';
    const wstring whitespace = L" \t\xa0\x3000\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x202f\x205f";
    const wchar_t quotChar = L'\"';
    const unsigned notFound = static_cast<unsigned>(wstring::npos);

    fields.clear();
    wstring::size_type searchFrom = 0; // where the next delimiter search starts
    wstring::size_type fieldStart = 0; // first character of the current field
    pair<unsigned, unsigned> quot = nextCSVQuotePair(line, quotChar);

    wstring::size_type np;
    while ((np = line.find(delim, searchFrom)) != wstring::npos)
    {
        // Move on to the next quoted section once the delimiter lies past the current one.
        while ((quot.first != notFound) && (quot.second != notFound) && (quot.second < np))
            quot = nextCSVQuotePair(line, quotChar, quot.second + 1);

        const bool afterOpen = (quot.first != notFound) && (np > quot.first);
        if (afterOpen)
        {
            if (quot.second == notFound)
                break; // unterminated quote: no further delimiter can split the line
            // skip delimiter inside quoted field
            searchFrom = static_cast<wstring::size_type>(quot.second) + 1;
            quot = nextCSVQuotePair(line, quotChar, static_cast<unsigned>(searchFrom));
            continue;
        }
        fields.push_back( trimquote(line.substr(fieldStart, np - fieldStart), whitespace, quotChar) );
        searchFrom = fieldStart = np + 1;
    }
    fields.push_back( trimquote(line.substr(fieldStart), whitespace, quotChar) );
    return static_cast<unsigned>(fields.size());
}
你可以在escaped_list_separator中使用Boost Tokenizer。
escaped_list_separator可以解析CSV的一个超集。参见Boost::tokenizer。
这只使用Boost标记器头文件,不需要链接到Boost库。
下面是一个例子,(详情请参阅c++中使用Boost Tokenizer解析CSV文件或Boost:: Tokenizer):
#include <iostream> // cout, endl
#include <fstream> // fstream
#include <vector>
#include <string>
#include <algorithm> // copy
#include <iterator> // ostream_operator
#include <boost/tokenizer.hpp>
int main()
{
using namespace std;
using namespace boost;
string data("data.csv");
ifstream in(data.c_str());
if (!in.is_open()) return 1;
typedef tokenizer< escaped_list_separator<char> > Tokenizer;
vector< string > vec;
string line;
while (getline(in,line))
{
Tokenizer tok(line);
vec.assign(tok.begin(),tok.end());
// vector now contains strings from one row, output to cout here
copy(vec.begin(), vec.end(), ostream_iterator<string>(cout, "|"));
cout << "\n----------------------" << endl;
}
}