我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。

我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp

我从未使用过Boost的Spirit,但我愿意尝试一下。不过前提是没有被我忽略的、更直接的解决方案。


当前回答

另一个类似于Loki Astari的答案的解决方案,使用C++11。这里的每一行是给定类型的std::tuple。代码先读取一行,然后逐个扫描到每个分隔符,再将值直接转换并存入元组中(使用了一些模板代码)。

// Bind each row by const reference: the iterator's operator* already returns a
// const reference to its stored tuple, so `auto row` would copy the whole tuple
// (including its std::string) on every iteration.
for (const auto &row : csv<std::string, int, float>(file, ',')) {
    std::cout << "first col: " << std::get<0>(row) << std::endl;
}

优势:

非常干净,使用简单,只需要C++11。 通过operator>>自动进行类型转换,得到std::tuple<t1, ...>。

缺少什么:

转义和引号处理。 对格式错误的CSV没有错误处理。

主要代码:

#include <cstddef>
#include <istream>
#include <iterator>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>

namespace csvtools {
    /// Terminal overload: selected by enable_if once @p idx is the last tuple index.
    /// Extracts one delimiter-terminated cell from @p in and converts it into that
    /// tuple slot with operator>>. No further recursion happens from here.
    template <std::size_t idx, class... fields>
    typename std::enable_if<(idx >= sizeof...(fields) - 1)>::type
    read_tuple(std::istream &in, std::tuple<fields...> &out, const char delimiter) {
        std::string token;
        std::getline(in, token, delimiter);
        // The cell text goes through its own stream so that operator>> does the
        // conversion to the slot's type.
        std::stringstream converter(token);
        converter >> std::get<idx>(out);
    }

    /// Recursive overload: selected by enable_if while @p idx is before the last
    /// tuple index. Converts the next cell into slot @p idx, then continues with
    /// @p idx + 1 until the terminal overload above takes over.
    template <std::size_t idx, class... fields>
    typename std::enable_if<(idx < sizeof...(fields) - 1)>::type
    read_tuple(std::istream &in, std::tuple<fields...> &out, const char delimiter) {
        std::string token;
        std::getline(in, token, delimiter);
        std::stringstream converter(token);
        converter >> std::get<idx>(out);
        read_tuple<idx + 1, fields...>(in, out, delimiter);
    }
}

/// Iterable view over a delimited text stream. Each row is exposed as a
/// std::tuple whose element types are given by @p fields.
/// Only a reference to the stream is kept, so the stream must outlive this object.
template <class... fields>
class csv {
    std::istream &_in;
    const char _delim;
public:
    using value_type = std::tuple<fields...>;
    class iterator;

    /// Wrap the stream @p in, splitting cells at @p delim.
    csv(std::istream &in, const char delim) : _in(in), _delim(delim) {}

    /// Status of the underlying stream
    /// @{
    bool good() const { return _in.good(); }
    const std::istream &underlying_stream() const { return _in; }
    /// @}

    inline iterator begin();
    inline iterator end();
private:

    /// Pull one line out of the stream and convert its cells into a tuple.
    value_type read_row() {
        std::string raw_line;
        std::getline(_in, raw_line);
        // Cells are split from a per-line stream so a row never reads past its
        // own line.
        std::stringstream row_source(raw_line);
        value_type parsed;
        csvtools::read_tuple<0, fields...>(row_source, parsed, _delim);
        return parsed;
    }
};

/// Input iterator over the rows of a csv object. Each increment asks the parent
/// for one more row via csv::read_row and stores the result; an iterator whose
/// parent pointer is null is the end iterator.
template <class... fields>
class csv<fields...>::iterator {
    csv::value_type _row;
    csv *_parent;
public:
    typedef std::input_iterator_tag iterator_category;
    typedef csv::value_type         value_type;
    typedef std::size_t             difference_type;
    typedef csv::value_type *       pointer;
    typedef csv::value_type &       reference;

    /// Construct an empty/end iterator
    inline iterator() : _parent(nullptr) {}
    /// Construct an iterator at the beginning of the @p parent csv object.
    /// If the parent's stream is already not good, this is an end iterator.
    inline iterator(csv &parent) : _parent(parent.good() ? &parent : nullptr) {
        ++(*this);
    }

    /// Read one row, if possible. Becomes the end iterator once a read fails.
    inline iterator &operator++() {
        if (_parent != nullptr) {
            _row = _parent->read_row();
            // Test fail() rather than !good(): when the final line has no
            // trailing newline, getline sets only eofbit while still delivering
            // that line. Treating eofbit alone as "finished" would silently drop
            // the last row. The following read then fails and ends the iteration.
            if (_parent->underlying_stream().fail()) {
                _parent = nullptr;
            }
        }
        return *this;
    }

    /// Post-increment: returns a copy holding the row from before the advance.
    inline iterator operator++(int) {
        iterator copy = *this;
        ++(*this);
        return copy;
    }

    /// The most recently read row.
    inline csv::value_type const &operator*() const {
        return _row;
    }

    inline csv::value_type const *operator->() const {
        return &_row;
    }

    /// Two iterators are equal if they are the same object or both are end
    /// iterators. Const-qualified so const iterators can be compared too.
    bool operator==(iterator const &other) const {
        return (this == &other) or (_parent == nullptr and other._parent == nullptr);
    }
    bool operator!=(iterator const &other) const {
        return not (*this == other);
    }
};

/// Iterator positioned on the first row; constructing it already reads that row.
template <class... fields>
auto csv<fields...>::begin() -> iterator {
    return iterator{*this};
}

/// End sentinel: an iterator with no parent.
template <class... fields>
auto csv<fields...>::end() -> iterator {
    return iterator{};
}

我在GitHub上放了一个小的工作示例;我一直用它来解析一些数值数据,它达到了它的目的。

其他回答

你可能想看看我的自由/开源软件项目CSVfix(更新链接),这是一个用c++编写的CSV流编辑器。CSV解析器不是什么好东西,但它完成了工作,整个包可以在不编写任何代码的情况下满足您的需要。

CSV解析器请参见alib/src/a_csv.cpp,使用示例请参见csvlib/src/csved_ioman.cpp (IOManager::ReadCSV)。

如果可以的话,这是我简单快速的贡献。 不使用Boost。

接受引号(delimiter)内部出现的分隔符(separator)和引号,只要内部的引号成对出现,或者后面不紧跟分隔符即可。

#include <iostream>
#include <vector>
#include <fstream>

// Splits one CSV record into its fields.
//   data      - text of the record; parsing stops at the first unquoted
//               '\0', '\n' or '\r'.
//   separator - character between fields (for example ',').
//   delimiter - quote character that may enclose a field (for example '"').
// Returns the fields in order; there is always at least one (possibly empty).
// A delimiter met while the current field is still empty toggles quoted mode.
// Inside a quoted field a delimiter closes it only when an even number of
// inner delimiters has been seen and the next character is the separator or an
// end-of-record character. Any other inner delimiter is kept in the field as is.
std::vector<std::string> SplitCSV(const std::string &data, char separator, char delimiter)
{
  const auto endsRecord = [](char ch) { return ch == 0 || ch == '\n' || ch == '\r'; };

  std::vector<std::string> fields;
  std::string current;
  bool quoted = false;      // currently inside a delimited field?
  size_t innerQuotes = 0;   // delimiters seen inside the current delimited field
  const char *text = data.c_str();
  const size_t length = data.length();
  size_t pos = 0;

  // do/while on purpose: the terminating '\0' of an empty string is examined once.
  do
  {
    const char ch = text[pos];
    bool keep = true;       // does ch belong to the field's text?

    if (current.empty() && ch == delimiter)
    {
      quoted = !quoted;
      innerQuotes = 0;
      keep = false;
    }
    else if (quoted)
    {
      if (ch == delimiter)
      {
        if ((innerQuotes % 2 == 0) && (text[pos + 1] == separator || endsRecord(text[pos + 1])))
        {
          quoted = false;
          innerQuotes = 0;
          keep = false;
        }
        else
        {
          ++innerQuotes;
        }
      }
    }
    else if (ch == separator)
    {
      fields.push_back(current);
      current = "";
      keep = false;
    }
    else if (endsRecord(ch))
    {
      break;
    }

    if (keep)
      current += ch;
    ++pos;
  } while (pos < length);

  fields.push_back(current);
  return fields;
}

// Loads a CSV file into `data`, one vector of fields per line.
//   fname     - path of the file to open.
//   data      - output table; cleared and refilled only when the file is read.
//   separator - field separator passed on to SplitCSV.
//   delimiter - quote character passed on to SplitCSV.
// Returns true when the file was opened and its reported size is non-zero,
// false otherwise (in which case `data` is left untouched).
bool ReadCsv(const std::string &fname, std::vector<std::vector<std::string>> &data,
  char separator = ',', char delimiter = '\"')
{
  std::ifstream input(fname);
  if (!input)
    return false;

  // Measure the file by seeking to its end; a zero size counts as failure.
  input.seekg(0, input.end);
  const size_t byteCount = input.tellg();
  if (byteCount == 0)
    return false;

  input.seekg(0);
  data.clear();
  for (std::string line; getline(input, line, '\n'); )
    data.push_back(SplitCSV(line, separator, delimiter));
  return true;
}

// Example driver: loads "fsample.csv" into an in-memory table and exits.
// The result of ReadCsv is not inspected.
int main(int argc, char *argv[])
{
  std::vector<std::vector<std::string>> table;
  ReadCsv("fsample.csv", table);
  return 0;
}

如果您正在使用Visual Studio / MFC,下面的解决方案可能会使您的工作更轻松。它支持Unicode和MBCS,有注释,除了CString之外没有其他依赖项,对我来说工作得很好。它不支持在带引号的字符串中嵌入换行符,但我不在乎,只要它在这种情况下不崩溃,它不会崩溃。

总体策略是,将带引号的字符串和空字符串作为特殊情况处理,其余使用Tokenize。对于带引号的字符串,策略是找到真正的结束引号,跟踪是否遇到了连续的引号对。如果是,则使用Replace将成对转换为单个。毫无疑问,有更有效的方法,但在我的案例中,性能还不够重要,不足以证明进一步优化的合理性。

// Tokenizer for a single line of CSV text. Construct it with the line, then
// call GetString repeatedly; each call yields the next field until it returns
// false.
class CParseCSV {
public:
// Construction
    // Stores a copy of sLine and starts parsing at its first character.
    CParseCSV(const CString& sLine);

// Attributes
    // Writes the next field into sDest and returns true, or returns false when
    // the current position is out of range (no more fields).
    bool    GetString(CString& sDest);

protected:
    CString m_sLine;    // line to extract tokens from
    int     m_nLen;     // line length in characters
    int     m_iPos;     // index of current position; GetString sets it to -1 after the final empty token
};

// Copies the line, caches its length and positions the parser at index 0.
// The initializers follow the member declaration order, so m_sLine is already
// set when its length is taken.
CParseCSV::CParseCSV(const CString& sLine)
    : m_sLine(sLine), m_nLen(m_sLine.GetLength()), m_iPos(0)
{
}

// Extracts the next field of the line into sDest.
// Returns false once the position is out of range (all fields consumed),
// true otherwise. A position exactly at the end of the line yields one final
// empty field. Quoted fields have their surrounding quotes removed and doubled
// quotes collapsed to single ones.
bool CParseCSV::GetString(CString& sDest)
{
    if (m_iPos < 0 || m_iPos > m_nLen)
        return false;   // nothing left to parse

    if (m_iPos == m_nLen) {
        // Exactly at the end: report one empty token, then mark as finished.
        sDest.Empty();
        m_iPos = -1;
        return true;
    }

    if (m_sLine[m_iPos] == ',') {
        // Separator right away: empty field.
        sDest.Empty();
        m_iPos++;
        return true;
    }

    if (m_sLine[m_iPos] != '\"') {
        // Plain field: Tokenize does the work and advances m_iPos.
        sDest = m_sLine.Tokenize(_T(","), m_iPos);
        return true;
    }

    // Quoted field: step over the opening quote and look for the real closing one.
    m_iPos++;
    const int iFirstChar = m_iPos;
    bool bSawQuotePair = false;
    for (; m_iPos < m_nLen; m_iPos++) {
        if (m_sLine[m_iPos] != '\"')
            continue;   // ordinary character
        const bool bDoubled = m_iPos < m_nLen - 1 && m_sLine[m_iPos + 1] == '\"';
        if (!bDoubled)
            break;      // closing quote found; m_iPos stays on it
        bSawQuotePair = true;
        m_iPos++;       // step over the first quote of the pair; the loop steps over the second
    }
    sDest = m_sLine.Mid(iFirstChar, m_iPos - iFirstChar);
    if (bSawQuotePair)
        sDest.Replace(_T("\"\""), _T("\""));    // convert pairs to singles
    m_iPos += 2;    // skip closing quote and trailing delimiter if any
    return true;
}

// calling code should look something like this:

    // Read the file line by line and tokenize every non-blank line.
    CStdioFile  inputFile(pszPath, CFile::modeRead);
    CString sLine, sToken;
    while (inputFile.ReadString(sLine)) {
        if (sLine.IsEmpty())
            continue;   // ignore blank lines
        CParseCSV   parser(sLine);
        while (parser.GetString(sToken)) {
            // do something with sToken here
        }
    }

如果你不关心逗号和换行符的转义, 并且不需要在引号中嵌入逗号和换行符(如果不能转义的话……), 那么只需要大约三行代码(好吧,是14行 -> 但读取整个文件也只要15行)。

// Reads one line from `str` and splits it at every comma.
// No quoting or escaping is interpreted. A line ending in a comma gets a
// final empty token, and an empty line yields a single empty token.
std::vector<std::string> getNextLineAndSplitIntoTokens(std::istream& str)
{
    std::string rawLine;
    std::getline(str, rawLine);

    std::vector<std::string> tokens;
    std::stringstream        source(rawLine);
    std::string              token;

    for (;;)
    {
        if (!std::getline(source, token, ','))
        {
            break;
        }
        tokens.push_back(token);
    }

    // After a trailing comma the last getline starts extracting, clears
    // `token`, and then fails; otherwise `token` still holds the previous
    // value. An empty token here therefore means one empty field is owed.
    if (source.fail() && token.empty())
    {
        tokens.push_back("");
    }
    return tokens;
}

我会创建一个表示一行的类, 然后从流中把数据读入该对象:

#include <iterator>
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>

// One line of comma-separated text. The line is stored once; each field is
// handed out as a std::string_view into that storage, so a view is valid only
// until the next readNextRow call on the same object.
class CSVRow
{
    public:
        // View of field `index`. No bounds check: index must be below size().
        std::string_view operator[](std::size_t index) const
        {
            const int first = m_data[index] + 1;   // just past the preceding comma
            const int past  = m_data[index + 1];   // position of the following comma (or line end)
            return std::string_view(&m_line[first], past - first);
        }
        // Number of fields in the current line.
        std::size_t size() const
        {
            return m_data.size() - 1;
        }
        // Reads the next line from `str` and records where its commas are.
        void readNextRow(std::istream& str)
        {
            std::getline(str, m_line);

            // m_data holds boundary positions: a virtual comma at -1 before the
            // line, every real comma, and a virtual comma at the line's end.
            m_data.clear();
            m_data.emplace_back(-1);
            for (std::string::size_type comma = m_line.find(',');
                 comma != std::string::npos;
                 comma = m_line.find(',', comma + 1))
            {
                m_data.emplace_back(comma);
            }
            // The closing boundary also makes a trailing comma produce an empty
            // last field.
            m_data.emplace_back(m_line.size());
        }
    private:
        std::string         m_line;
        std::vector<int>    m_data;
};

// Stream extraction for CSVRow: loads the next line of `str` into `data`
// and returns the stream so its state can be tested by the caller.
auto operator>>(std::istream& str, CSVRow& data) -> std::istream&
{
    data.readNextRow(str);
    return str;
}
// Example: print the fourth field of every row of "plop.csv".
// Every row is assumed to have at least four fields; operator[] does no
// bounds checking.
int main()
{
    std::ifstream file("plop.csv");

    for (CSVRow row; file >> row; )
    {
        std::cout << "4th Element(" << row[3] << ")\n";
    }
}

但只要做一点工作,我们就可以在技术上创建一个迭代器:

// Input iterator that yields one CSVRow per line of a stream.
// A default-constructed iterator, or one whose stream pointer has been reset
// to null after a failed read, is the end iterator.
class CSVIterator
{
    public:
        typedef std::input_iterator_tag     iterator_category;
        typedef CSVRow                      value_type;
        typedef std::size_t                 difference_type;
        typedef CSVRow*                     pointer;
        typedef CSVRow&                     reference;

        // Attaches to `str` (if it is good) and immediately reads the first row.
        CSVIterator(std::istream& str)  :m_str(str.good()?&str:nullptr) { ++(*this); }
        // End iterator.
        CSVIterator()                   :m_str(nullptr) {}

        // Pre Increment: read the next row; become the end iterator if the read fails.
        CSVIterator& operator++()
        {
            if (m_str && !((*m_str) >> m_row))
            {
                m_str = nullptr;
            }
            return *this;
        }
        // Post increment: returns a copy holding the row from before the advance.
        CSVIterator operator++(int)             {CSVIterator    tmp(*this);++(*this);return tmp;}
        CSVRow const& operator*()   const       {return m_row;}
        CSVRow const* operator->()  const       {return &m_row;}

        // Equal when both are the same object or both are end iterators.
        // Const-qualified so that const iterators can be compared as well.
        bool operator==(CSVIterator const& rhs) const {return ((this == &rhs) || ((this->m_str == nullptr) && (rhs.m_str == nullptr)));}
        bool operator!=(CSVIterator const& rhs) const {return !((*this) == rhs);}
    private:
        std::istream*       m_str;
        CSVRow              m_row;
};


int main()
{
    std::ifstream       file("plop.csv");

    for(CSVIterator loop(file); loop != CSVIterator(); ++loop)
    {
        std::cout << "4th Element(" << (*loop)[3] << ")\n";
    }
}

现在我们已经到了2020年,让我们添加一个CSVRange对象:

// Adapter that lets a stream be used in a range-based for loop over its rows.
// Only a reference to the stream is held, so the stream must outlive the range.
class CSVRange
{
    std::istream&   stream;
    public:
        CSVRange(std::istream& str) : stream(str) {}

        // begin() reads the first row right away; end() is the sentinel iterator.
        CSVIterator begin() const { return CSVIterator(stream); }
        CSVIterator end()   const { return CSVIterator(); }
};

// Example: range-based loop over "plop.csv", printing the fourth field of
// every row. Rows are assumed to have at least four fields.
int main()
{
    std::ifstream file("plop.csv");

    for (const auto& row : CSVRange(file))
    {
        std::cout << "4th Element(" << row[3] << ")\n";
    }
}

@sastanin的解决方案的一个小改动版本,使其可以处理引号内的换行。

/// Reads a whole CSV stream into a table of rows, each row a vector of fields.
///
/// Fields are separated by ','. A field may be wrapped in double quotes, in
/// which case commas and newlines inside it are kept as field text and a
/// doubled quote ("") stands for one literal quote. An unquoted '\n' ends the
/// row. A final row without a trailing newline is still returned. An empty
/// stream yields an empty table.
///
/// @param in  stream to consume; it is read character by character until
///            extraction fails.
/// @return    the parsed rows in input order.
std::vector<std::vector<std::string>> readCSV(std::istream &in) {
    // Parser states, kept local so the function does not depend on any
    // declaration outside of it.
    enum class State { UnquotedField, QuotedField, QuotedQuote };

    std::vector<std::vector<std::string>> table;
    std::vector<std::string> fields {""};   // fields of the row being built
    State state = State::UnquotedField;
    bool rowPending = false;                // any character consumed since the last finished row?

    // Moves the finished row into the table and resets the per-row state.
    const auto finishRow = [&]() {
        table.emplace_back();
        table.back().swap(fields);          // hand the row over without copying; fields is now empty
        fields.emplace_back();              // next row starts with one empty field
        state = State::UnquotedField;
        rowPending = false;
    };

    // Consume the stream one character at a time. The loop stops when
    // extraction fails, so the stream state is tested after every read rather
    // than before it.
    char c;
    while (in.get(c)) {
        rowPending = true;
        switch (state) {
            case State::UnquotedField:
                switch (c) {
                    case ',':  // end of field
                        fields.emplace_back();
                        break;
                    case '"':
                        state = State::QuotedField;
                        break;
                    case '\n': // end of row
                        finishRow();
                        break;
                    default:
                        fields.back().push_back(c);
                        break;
                }
                break;
            case State::QuotedField:
                if (c == '"') {
                    // Either the closing quote or the first half of "".
                    state = State::QuotedQuote;
                } else {
                    // Everything else, newlines included, is field text.
                    fields.back().push_back(c);
                }
                break;
            case State::QuotedQuote:
                switch (c) {
                    case ',':  // , after closing quote
                        fields.emplace_back();
                        state = State::UnquotedField;
                        break;
                    case '"':  // "" -> "
                        fields.back().push_back('"');
                        state = State::QuotedField;
                        break;
                    case '\n': // newline after closing quote ends the row
                        finishRow();
                        break;
                    default:   // end of quote; this character itself is discarded
                        state = State::UnquotedField;
                        break;
                }
                break;
        }
    }

    // Input that does not end with a newline still owes its last row.
    if (rowPending) {
        table.push_back(fields);
    }
    return table;
}