我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
当前回答
另一种快速简单的方法是使用Boost。I / O:融合
#include <iostream>
#include <sstream>
#include <boost/fusion/adapted/boost_tuple.hpp>
#include <boost/fusion/sequence/io.hpp>
namespace fusion = boost::fusion;
struct CsvString
{
std::string value;
// Stop reading a string once a CSV delimeter is encountered.
friend std::istream& operator>>(std::istream& s, CsvString& v) {
v.value.clear();
for(;;) {
auto c = s.peek();
if(std::istream::traits_type::eof() == c || ',' == c || '\n' == c)
break;
v.value.push_back(c);
s.get();
}
return s;
}
friend std::ostream& operator<<(std::ostream& s, CsvString const& v) {
return s << v.value;
}
};
int main() {
std::stringstream input("abc,123,true,3.14\n"
"def,456,false,2.718\n");
typedef boost::tuple<CsvString, int, bool, double> CsvRow;
using fusion::operator<<;
std::cout << std::boolalpha;
using fusion::operator>>;
input >> std::boolalpha;
input >> fusion::tuple_open("") >> fusion::tuple_close("\n") >> fusion::tuple_delimiter(',');
for(CsvRow row; input >> row;)
std::cout << row << '\n';
}
输出:
(abc 123 true 3.14)
(def 456 false 2.718)
其他回答
你可以在escaped_list_separator中使用Boost Tokenizer。
Escaped_list_separator解析csv的超集。Boost::记号赋予器
这只使用Boost标记器头文件,不需要链接到Boost库。
下面是一个例子,(详情请参阅c++中使用Boost Tokenizer解析CSV文件或Boost:: Tokenizer):
#include <iostream> // cout, endl
#include <fstream> // fstream
#include <vector>
#include <string>
#include <algorithm> // copy
#include <iterator> // ostream_operator
#include <boost/tokenizer.hpp>
int main()
{
using namespace std;
using namespace boost;
string data("data.csv");
ifstream in(data.c_str());
if (!in.is_open()) return 1;
typedef tokenizer< escaped_list_separator<char> > Tokenizer;
vector< string > vec;
string line;
while (getline(in,line))
{
Tokenizer tok(line);
vec.assign(tok.begin(),tok.end());
// vector now contains strings from one row, output to cout here
copy(vec.begin(), vec.end(), ostream_iterator<string>(cout, "|"));
cout << "\n----------------------" << endl;
}
}
我写了一个很好的解析CSV文件的方法,我认为我应该把它作为一个答案:
#include <algorithm>
#include <fstream>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
struct CSVDict
{
std::vector< std::string > inputImages;
std::vector< double > inputLabels;
};
/**
\brief Splits the string
\param str String to split
\param delim Delimiter on the basis of which splitting is to be done
\return results Output in the form of vector of strings
*/
std::vector<std::string> stringSplit( const std::string &str, const std::string &delim )
{
std::vector<std::string> results;
for (size_t i = 0; i < str.length(); i++)
{
std::string tempString = "";
while ((str[i] != *delim.c_str()) && (i < str.length()))
{
tempString += str[i];
i++;
}
results.push_back(tempString);
}
return results;
}
/**
\brief Parse the supplied CSV File and obtain Row and Column information.
Assumptions:
1. Header information is in first row
2. Delimiters are only used to differentiate cell members
\param csvFileName The full path of the file to parse
\param inputColumns The string of input columns which contain the data to be used for further processing
\param inputLabels The string of input labels based on which further processing is to be done
\param delim The delimiters used in inputColumns and inputLabels
\return Vector of Vector of strings: Collection of rows and columns
*/
std::vector< CSVDict > parseCSVFile( const std::string &csvFileName, const std::string &inputColumns, const std::string &inputLabels, const std::string &delim )
{
std::vector< CSVDict > return_CSVDict;
std::vector< std::string > inputColumnsVec = stringSplit(inputColumns, delim), inputLabelsVec = stringSplit(inputLabels, delim);
std::vector< std::vector< std::string > > returnVector;
std::ifstream inFile(csvFileName.c_str());
int row = 0;
std::vector< size_t > inputColumnIndeces, inputLabelIndeces;
for (std::string line; std::getline(inFile, line, '\n');)
{
CSVDict tempDict;
std::vector< std::string > rowVec;
line.erase(std::remove(line.begin(), line.end(), '"'), line.end());
rowVec = stringSplit(line, delim);
// for the first row, record the indeces of the inputColumns and inputLabels
if (row == 0)
{
for (size_t i = 0; i < rowVec.size(); i++)
{
for (size_t j = 0; j < inputColumnsVec.size(); j++)
{
if (rowVec[i] == inputColumnsVec[j])
{
inputColumnIndeces.push_back(i);
}
}
for (size_t j = 0; j < inputLabelsVec.size(); j++)
{
if (rowVec[i] == inputLabelsVec[j])
{
inputLabelIndeces.push_back(i);
}
}
}
}
else
{
for (size_t i = 0; i < inputColumnIndeces.size(); i++)
{
tempDict.inputImages.push_back(rowVec[inputColumnIndeces[i]]);
}
for (size_t i = 0; i < inputLabelIndeces.size(); i++)
{
double test = std::atof(rowVec[inputLabelIndeces[i]].c_str());
tempDict.inputLabels.push_back(std::atof(rowVec[inputLabelIndeces[i]].c_str()));
}
return_CSVDict.push_back(tempDict);
}
row++;
}
return return_CSVDict;
}
可以使用std::regex。
根据文件大小和可用内存,可以逐行读取,也可以完全在std::string中读取。
读取文件可以使用:
std::ifstream t("file.txt");
std::string sin((std::istreambuf_iterator<char>(t)),
std::istreambuf_iterator<char>());
然后你可以和这个相匹配,它实际上是根据你的需要定制的。
std::regex word_regex(",\\s]+");
auto what =
std::sregex_iterator(sin.begin(), sin.end(), word_regex);
auto wend = std::sregex_iterator();
std::vector<std::string> v;
for (;what!=wend ; wend) {
std::smatch match = *what;
v.push_back(match.str());
}
由于所有CSV问题似乎都被重定向到这里,我想我应该在这里发布我的答案。这个回答并没有直接回答提问者的问题。我希望能够读取已知的CSV格式的流,而且每个字段的类型都已经知道。当然,可以使用下面的方法将每个字段处理为字符串类型。
作为我希望能够使用CSV输入流的一个例子,考虑以下输入(取自维基百科的CSV页面):
const char input[] =
"Year,Make,Model,Description,Price\n"
"1997,Ford,E350,\"ac, abs, moon\",3000.00\n"
"1999,Chevy,\"Venture \"\"Extended Edition\"\"\",\"\",4900.00\n"
"1999,Chevy,\"Venture \"\"Extended Edition, Very Large\"\"\",\"\",5000.00\n"
"1996,Jeep,Grand Cherokee,\"MUST SELL!\n\
air, moon roof, loaded\",4799.00\n"
;
然后,我希望能够像这样读取数据:
std::istringstream ss(input);
std::string title[5];
int year;
std::string make, model, desc;
float price;
csv_istream(ss)
>> title[0] >> title[1] >> title[2] >> title[3] >> title[4];
while (csv_istream(ss)
>> year >> make >> model >> desc >> price) {
//...do something with the record...
}
这就是我最后得到的解。
struct csv_istream {
std::istream &is_;
csv_istream (std::istream &is) : is_(is) {}
void scan_ws () const {
while (is_.good()) {
int c = is_.peek();
if (c != ' ' && c != '\t') break;
is_.get();
}
}
void scan (std::string *s = 0) const {
std::string ws;
int c = is_.get();
if (is_.good()) {
do {
if (c == ',' || c == '\n') break;
if (s) {
ws += c;
if (c != ' ' && c != '\t') {
*s += ws;
ws.clear();
}
}
c = is_.get();
} while (is_.good());
if (is_.eof()) is_.clear();
}
}
template <typename T, bool> struct set_value {
void operator () (std::string in, T &v) const {
std::istringstream(in) >> v;
}
};
template <typename T> struct set_value<T, true> {
template <bool SIGNED> void convert (std::string in, T &v) const {
if (SIGNED) v = ::strtoll(in.c_str(), 0, 0);
else v = ::strtoull(in.c_str(), 0, 0);
}
void operator () (std::string in, T &v) const {
convert<is_signed_int<T>::val>(in, v);
}
};
template <typename T> const csv_istream & operator >> (T &v) const {
std::string tmp;
scan(&tmp);
set_value<T, is_int<T>::val>()(tmp, v);
return *this;
}
const csv_istream & operator >> (std::string &v) const {
v.clear();
scan_ws();
if (is_.peek() != '"') scan(&v);
else {
std::string tmp;
is_.get();
std::getline(is_, tmp, '"');
while (is_.peek() == '"') {
v += tmp;
v += is_.get();
std::getline(is_, tmp, '"');
}
v += tmp;
scan();
}
return *this;
}
template <typename T>
const csv_istream & operator >> (T &(*manip)(T &)) const {
is_ >> manip;
return *this;
}
operator bool () const { return !is_.fail(); }
};
使用以下helper,可以通过c++ 11中的新积分特征模板进行简化:
template <typename T> struct is_signed_int { enum { val = false }; };
template <> struct is_signed_int<short> { enum { val = true}; };
template <> struct is_signed_int<int> { enum { val = true}; };
template <> struct is_signed_int<long> { enum { val = true}; };
template <> struct is_signed_int<long long> { enum { val = true}; };
template <typename T> struct is_unsigned_int { enum { val = false }; };
template <> struct is_unsigned_int<unsigned short> { enum { val = true}; };
template <> struct is_unsigned_int<unsigned int> { enum { val = true}; };
template <> struct is_unsigned_int<unsigned long> { enum { val = true}; };
template <> struct is_unsigned_int<unsigned long long> { enum { val = true}; };
template <typename T> struct is_int {
enum { val = (is_signed_int<T>::val || is_unsigned_int<T>::val) };
};
在网上试试!
我的版本只使用标准c++ 11库。它很好地处理Excel CSV引用:
spam eggs,"foo,bar","""fizz buzz"""
1.23,4.567,-8.00E+09
代码是作为有限状态机编写的,每次只消耗一个字符。我认为这更容易解释。
#include <istream>
#include <string>
#include <vector>
enum class CSVState {
UnquotedField,
QuotedField,
QuotedQuote
};
std::vector<std::string> readCSVRow(const std::string &row) {
CSVState state = CSVState::UnquotedField;
std::vector<std::string> fields {""};
size_t i = 0; // index of the current field
for (char c : row) {
switch (state) {
case CSVState::UnquotedField:
switch (c) {
case ',': // end of field
fields.push_back(""); i++;
break;
case '"': state = CSVState::QuotedField;
break;
default: fields[i].push_back(c);
break; }
break;
case CSVState::QuotedField:
switch (c) {
case '"': state = CSVState::QuotedQuote;
break;
default: fields[i].push_back(c);
break; }
break;
case CSVState::QuotedQuote:
switch (c) {
case ',': // , after closing quote
fields.push_back(""); i++;
state = CSVState::UnquotedField;
break;
case '"': // "" -> "
fields[i].push_back('"');
state = CSVState::QuotedField;
break;
default: // end of quote
state = CSVState::UnquotedField;
break; }
break;
}
}
return fields;
}
/// Read CSV file, Excel dialect. Accept "quoted fields ""with quotes"""
std::vector<std::vector<std::string>> readCSV(std::istream &in) {
std::vector<std::vector<std::string>> table;
std::string row;
while (!in.eof()) {
std::getline(in, row);
if (in.bad() || in.fail()) {
break;
}
auto fields = readCSVRow(row);
table.push_back(fields);
}
return table;
}