我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。

我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp

我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。


当前回答

如果可以的话,这是我简单快速的贡献。 没有提高。

接受分隔符和分隔符中的分隔符,只要成对或远离分隔符即可。

#include <iostream>
#include <vector>
#include <fstream>

std::vector<std::string> SplitCSV(const std::string &data, char separator, char delimiter)
{
  std::vector<std::string> Values;
  std::string Val = "";
  bool VDel = false; // Is within delimiter?
  size_t CDel = 0; // Delimiters counter within delimiters.
  const char *C = data.c_str();
  size_t P = 0;
  do
  {
    if ((Val.length() == 0) && (C[P] == delimiter))
    {
      VDel = !VDel;
      CDel = 0;
      P++;
      continue;
    }
    if (VDel)
    {
      if (C[P] == delimiter)
      {
        if (((CDel % 2) == 0) && ( (C[P+1] == separator) || (C[P+1] == 0) || (C[P+1] == '\n') || (C[P+1] == '\r') ))
        {
          VDel = false;
          CDel = 0;
          P++;
          continue;
        }
        else
          CDel++;
      }
    }
    else
    {
      if (C[P] == separator)
      {
        Values.push_back(Val);
        Val = "";
        P++;
        continue;
      }
      if ((C[P] == 0) || (C[P] == '\n') || (C[P] == '\r'))
        break;
    }
    Val += C[P];
    P++;
  } while(P < data.length());
  Values.push_back(Val);
  return Values;
}

bool ReadCsv(const std::string &fname, std::vector<std::vector<std::string>> &data,
  char separator = ',', char delimiter = '\"')
{
  bool Ret = false;
  std::ifstream FCsv(fname);
  if (FCsv)
  {
    FCsv.seekg(0, FCsv.end);
    size_t Siz = FCsv.tellg();
    if (Siz > 0)
    {
      FCsv.seekg(0);
      data.clear();
      std::string Line;
      while (getline(FCsv, Line, '\n'))
        data.push_back(SplitCSV(Line, separator, delimiter));
      Ret = true;
    }
    FCsv.close();
  }
  return Ret;
}

int main(int argc, char *argv[])
{
  std::vector<std::vector<std::string>> Data;
  ReadCsv("fsample.csv", Data);
  return 0;
}

其他回答

我的版本只使用标准c++ 11库。它很好地处理Excel CSV引用:

spam eggs,"foo,bar","""fizz buzz"""
1.23,4.567,-8.00E+09

代码是作为有限状态机编写的,每次只消耗一个字符。我认为这更容易解释。

#include <istream>
#include <string>
#include <vector>

enum class CSVState {
    UnquotedField,
    QuotedField,
    QuotedQuote
};

std::vector<std::string> readCSVRow(const std::string &row) {
    CSVState state = CSVState::UnquotedField;
    std::vector<std::string> fields {""};
    size_t i = 0; // index of the current field
    for (char c : row) {
        switch (state) {
            case CSVState::UnquotedField:
                switch (c) {
                    case ',': // end of field
                              fields.push_back(""); i++;
                              break;
                    case '"': state = CSVState::QuotedField;
                              break;
                    default:  fields[i].push_back(c);
                              break; }
                break;
            case CSVState::QuotedField:
                switch (c) {
                    case '"': state = CSVState::QuotedQuote;
                              break;
                    default:  fields[i].push_back(c);
                              break; }
                break;
            case CSVState::QuotedQuote:
                switch (c) {
                    case ',': // , after closing quote
                              fields.push_back(""); i++;
                              state = CSVState::UnquotedField;
                              break;
                    case '"': // "" -> "
                              fields[i].push_back('"');
                              state = CSVState::QuotedField;
                              break;
                    default:  // end of quote
                              state = CSVState::UnquotedField;
                              break; }
                break;
        }
    }
    return fields;
}

/// Read CSV file, Excel dialect. Accept "quoted fields ""with quotes"""
std::vector<std::vector<std::string>> readCSV(std::istream &in) {
    std::vector<std::vector<std::string>> table;
    std::string row;
    while (!in.eof()) {
        std::getline(in, row);
        if (in.bad() || in.fail()) {
            break;
        }
        auto fields = readCSVRow(row);
        table.push_back(fields);
    }
    return table;
}

另一个类似于Loki Astari的答案的解决方案,在c++ 11中。这里的行是给定类型的std::元组。代码扫描一行,然后扫描到每个分隔符,然后将值直接转换并转储到元组中(使用一些模板代码)。

for (auto row : csv<std::string, int, float>(file, ',')) {
    std::cout << "first col: " << std::get<0>(row) << std::endl;
}

优势:

非常干净,使用简单,只有c++ 11。 自动类型转换为std::tuple<t1,…>通过算子>>。

缺少什么:

转义和引用 没有错误处理的情况下畸形的CSV。

主要代码:

#include <iterator>
#include <sstream>
#include <string>

namespace csvtools {
    /// Read the last element of the tuple without calling recursively
    template <std::size_t idx, class... fields>
    typename std::enable_if<idx >= std::tuple_size<std::tuple<fields...>>::value - 1>::type
    read_tuple(std::istream &in, std::tuple<fields...> &out, const char delimiter) {
        std::string cell;
        std::getline(in, cell, delimiter);
        std::stringstream cell_stream(cell);
        cell_stream >> std::get<idx>(out);
    }

    /// Read the @p idx-th element of the tuple and then calls itself with @p idx + 1 to
    /// read the next element of the tuple. Automatically falls in the previous case when
    /// reaches the last element of the tuple thanks to enable_if
    template <std::size_t idx, class... fields>
    typename std::enable_if<idx < std::tuple_size<std::tuple<fields...>>::value - 1>::type
    read_tuple(std::istream &in, std::tuple<fields...> &out, const char delimiter) {
        std::string cell;
        std::getline(in, cell, delimiter);
        std::stringstream cell_stream(cell);
        cell_stream >> std::get<idx>(out);
        read_tuple<idx + 1, fields...>(in, out, delimiter);
    }
}

/// Iterable csv wrapper around a stream. @p fields the list of types that form up a row.
template <class... fields>
class csv {
    std::istream &_in;
    const char _delim;
public:
    typedef std::tuple<fields...> value_type;
    class iterator;

    /// Construct from a stream.
    inline csv(std::istream &in, const char delim) : _in(in), _delim(delim) {}

    /// Status of the underlying stream
    /// @{
    inline bool good() const {
        return _in.good();
    }
    inline const std::istream &underlying_stream() const {
        return _in;
    }
    /// @}

    inline iterator begin();
    inline iterator end();
private:

    /// Reads a line into a stringstream, and then reads the line into a tuple, that is returned
    inline value_type read_row() {
        std::string line;
        std::getline(_in, line);
        std::stringstream line_stream(line);
        std::tuple<fields...> retval;
        csvtools::read_tuple<0, fields...>(line_stream, retval, _delim);
        return retval;
    }
};

/// Iterator; just calls recursively @ref csv::read_row and stores the result.
template <class... fields>
class csv<fields...>::iterator {
    csv::value_type _row;
    csv *_parent;
public:
    typedef std::input_iterator_tag iterator_category;
    typedef csv::value_type         value_type;
    typedef std::size_t             difference_type;
    typedef csv::value_type *       pointer;
    typedef csv::value_type &       reference;

    /// Construct an empty/end iterator
    inline iterator() : _parent(nullptr) {}
    /// Construct an iterator at the beginning of the @p parent csv object.
    inline iterator(csv &parent) : _parent(parent.good() ? &parent : nullptr) {
        ++(*this);
    }

    /// Read one row, if possible. Set to end if parent is not good anymore.
    inline iterator &operator++() {
        if (_parent != nullptr) {
            _row = _parent->read_row();
            if (!_parent->good()) {
                _parent = nullptr;
            }
        }
        return *this;
    }

    inline iterator operator++(int) {
        iterator copy = *this;
        ++(*this);
        return copy;
    }

    inline csv::value_type const &operator*() const {
        return _row;
    }

    inline csv::value_type const *operator->() const {
        return &_row;
    }

    bool operator==(iterator const &other) {
        return (this == &other) or (_parent == nullptr and other._parent == nullptr);
    }
    bool operator!=(iterator const &other) {
        return not (*this == other);
    }
};

template <class... fields>
typename csv<fields...>::iterator csv<fields...>::begin() {
    return iterator(*this);
}

template <class... fields>
typename csv<fields...>::iterator csv<fields...>::end() {
    return iterator();
}

我在GitHub上放了一个小的工作示例;我一直用它来解析一些数值数据,它达到了它的目的。

当你使用boost::spirit这样漂亮的东西时,你应该感到自豪

这里我的一个解析器的尝试(几乎)符合这个链接的CSV规范(我不需要在字段中换行)。逗号周围的空格也被省略了)。

在你克服了编译这段代码需要等待10秒的令人震惊的经历之后:),你就可以坐下来享受了。

// csvparser.cpp
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <string>

namespace qi = boost::spirit::qi;
namespace bascii = boost::spirit::ascii;

template <typename Iterator>
struct csv_parser : qi::grammar<Iterator, std::vector<std::string>(), 
    bascii::space_type>
{
    qi::rule<Iterator, char()                                           > COMMA;
    qi::rule<Iterator, char()                                           > DDQUOTE;
    qi::rule<Iterator, std::string(),               bascii::space_type  > non_escaped;
    qi::rule<Iterator, std::string(),               bascii::space_type  > escaped;
    qi::rule<Iterator, std::string(),               bascii::space_type  > field;
    qi::rule<Iterator, std::vector<std::string>(),  bascii::space_type  > start;

    csv_parser() : csv_parser::base_type(start)
    {
        using namespace qi;
        using qi::lit;
        using qi::lexeme;
        using bascii::char_;

        start       = field % ',';
        field       = escaped | non_escaped;
        escaped     = lexeme['"' >> *( char_ -(char_('"') | ',') | COMMA | DDQUOTE)  >> '"'];
        non_escaped = lexeme[       *( char_ -(char_('"') | ',')                  )        ];
        DDQUOTE     = lit("\"\"")       [_val = '"'];
        COMMA       = lit(",")          [_val = ','];
    }

};

int main()
{
    std::cout << "Enter CSV lines [empty] to quit\n";

    using bascii::space;
    typedef std::string::const_iterator iterator_type;
    typedef csv_parser<iterator_type> csv_parser;

    csv_parser grammar;
    std::string str;
    int fid;
    while (getline(std::cin, str))
    {
        fid = 0;

        if (str.empty())
            break;

        std::vector<std::string> csv;
        std::string::const_iterator it_beg = str.begin();
        std::string::const_iterator it_end = str.end();
        bool r = phrase_parse(it_beg, it_end, grammar, space, csv);

        if (r && it_beg == it_end)
        {
            std::cout << "Parsing succeeded\n";
            for (auto& field: csv)
            {
                std::cout << "field " << ++fid << ": " << field << std::endl;
            }
        }
        else
        {
            std::cout << "Parsing failed\n";
        }
    }

    return 0;
}

编译:

make csvparser

测试(例子摘自维基百科):

./csvparser
Enter CSV lines [empty] to quit

1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00
Parsing succeeded
field 1: 1999
field 2: Chevy
field 3: Venture "Extended Edition, Very Large"
field 4: 
field 5: 5000.00

1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00"
Parsing failed

您可以使用仅头文件的Csv::Parser库。

它完全支持RFC 4180,包括字段值中的引号、转义引号和换行。 它只需要标准的c++ (c++ 17)。 它支持在编译时从std::string_view读取CSV数据。 它使用Catch2进行了广泛的测试。

如果您所需要的只是加载一个双精度数据文件(没有整数,没有文本),那么这里有一个随时可用的函数。

#include <sstream>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <algorithm>

using namespace std;

/**
 * Parse a CSV data file and fill the 2d STL vector "data".
 * Limits: only "pure datas" of doubles, not encapsulated by " and without \n inside.
 * Further no formatting in the data (e.g. scientific notation)
 * It however handles both dots and commas as decimal separators and removes thousand separator.
 * 
 * returnCodes[0]: file access 0-> ok 1-> not able to read; 2-> decimal separator equal to comma separator
 * returnCodes[1]: number of records
 * returnCodes[2]: number of fields. -1 If rows have different field size
 * 
 */
vector<int>
readCsvData (vector <vector <double>>& data, const string& filename, const string& delimiter, const string& decseparator){

 int vv[3] = { 0,0,0 };
 vector<int> returnCodes(&vv[0], &vv[0]+3);

 string rowstring, stringtoken;
 double doubletoken;
 int rowcount=0;
 int fieldcount=0;
 data.clear();

 ifstream iFile(filename, ios_base::in);
 if (!iFile.is_open()){
   returnCodes[0] = 1;
   return returnCodes;
 }
 while (getline(iFile, rowstring)) {
    if (rowstring=="") continue; // empty line
    rowcount ++; //let's start with 1
    if(delimiter == decseparator){
      returnCodes[0] = 2;
      return returnCodes;
    }
    if(decseparator != "."){
     // remove dots (used as thousand separators)
     string::iterator end_pos = remove(rowstring.begin(), rowstring.end(), '.');
     rowstring.erase(end_pos, rowstring.end());
     // replace decimal separator with dots.
     replace(rowstring.begin(), rowstring.end(),decseparator.c_str()[0], '.'); 
    } else {
     // remove commas (used as thousand separators)
     string::iterator end_pos = remove(rowstring.begin(), rowstring.end(), ',');
     rowstring.erase(end_pos, rowstring.end());
    }
    // tokenize..
    vector<double> tokens;
    // Skip delimiters at beginning.
    string::size_type lastPos = rowstring.find_first_not_of(delimiter, 0);
    // Find first "non-delimiter".
    string::size_type pos     = rowstring.find_first_of(delimiter, lastPos);
    while (string::npos != pos || string::npos != lastPos){
        // Found a token, convert it to double add it to the vector.
        stringtoken = rowstring.substr(lastPos, pos - lastPos);
        if (stringtoken == "") {
      tokens.push_back(0.0);
    } else {
          istringstream totalSString(stringtoken);
      totalSString >> doubletoken;
      tokens.push_back(doubletoken);
    }     
        // Skip delimiters.  Note the "not_of"
        lastPos = rowstring.find_first_not_of(delimiter, pos);
        // Find next "non-delimiter"
        pos = rowstring.find_first_of(delimiter, lastPos);
    }
    if(rowcount == 1){
      fieldcount = tokens.size();
      returnCodes[2] = tokens.size();
    } else {
      if ( tokens.size() != fieldcount){
    returnCodes[2] = -1;
      }
    }
    data.push_back(tokens);
 }
 iFile.close();
 returnCodes[1] = rowcount;
 return returnCodes;
}