我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。

我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp

我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。


当前回答

就像每个人都把他的解决方案,这里是我的使用模板,lambda和tuple。

它可以将任何具有所需列的CSV转换为tuple的c++向量。

它通过在元组中定义每个CSV行元素类型来工作。

您还需要为每个元素定义std::string到类型转换Formatter lambda(例如使用std::atod)。

然后你就得到了这个结构的一个向量,对应于你的CSV数据。

您可以很容易地重用它来匹配任何CSV结构。

StringsHelpers.hpp

#include <string>
#include <fstream>
#include <vector>
#include <functional>

namespace StringHelpers
{
    template<typename Tuple>
    using Formatter = std::function<Tuple(const std::vector<std::string> &)>;

    std::vector<std::string> split(const std::string &string, const std::string &delimiter);

    template<typename Tuple>
    std::vector<Tuple> readCsv(const std::string &path, const std::string &delimiter, Formatter<Tuple> formatter);
};

StringsHelpers.cpp

#include "StringHelpers.hpp"

namespace StringHelpers
{
    /**
     * Split a string with the given delimiter into several strings
     *
     * @param string - The string to extract the substrings from
     * @param delimiter - The substrings delimiter
     *
     * @return The substrings
     */
    std::vector<std::string> split(const std::string &string, const std::string &delimiter)
    {
        std::vector<std::string> result;
        size_t                   last = 0,
                                 next = 0;

        while ((next = string.find(delimiter, last)) != std::string::npos) {
            result.emplace_back(string.substr(last, next - last));
            last = next + 1;
        }

        result.emplace_back(string.substr(last));

        return result;
    }

    /**
     * Read a CSV file and store its values into the given structure (Tuple with Formatter constructor)
     *
     * @tparam Tuple - The CSV line structure format
     *
     * @param path - The CSV file path
     * @param delimiter - The CSV values delimiter
     * @param formatter - The CSV values formatter that take a vector of strings in input and return a Tuple
     *
     * @return The CSV as vector of Tuple
     */
    template<typename Tuple>
    std::vector<Tuple> readCsv(const std::string &path, const std::string &delimiter, Formatter<Tuple> formatter)
    {
        std::ifstream      file(path, std::ifstream::in);
        std::string        line;
        std::vector<Tuple> result;

        if (file.fail()) {
            throw std::runtime_error("The file " + path + " could not be opened");
        }

        while (std::getline(file, line)) {
            result.emplace_back(formatter(split(line, delimiter)));
        }

        file.close();

        return result;
    }

    // Forward template declarations

    template std::vector<std::tuple<double, double, double>> readCsv<std::tuple<double, double, double>>(const std::string &, const std::string &, Formatter<std::tuple<double, double, double>>);
} // End of StringHelpers namespace

Main.cpp(一些用法)

#include "StringHelpers.hpp"

/**
 * Example of use with a CSV file which have (number,Red,Green,Blue) as line values. We do not want to use the 1st value
 * of the line.
 */
int main(int argc, char **argv)
{
    // Declare CSV line type, formatter and template type
    typedef std::tuple<double, double, double>                          CSV_format;
    typedef std::function<CSV_format(const std::vector<std::string> &)> formatterT;

    enum RGB { Red = 1, Green, Blue };

    const std::string COLOR_MAP_PATH = "/some/absolute/path";

    // Load the color map
    auto colorMap = StringHelpers::readCsv<CSV_format>(COLOR_MAP_PATH, ",", [](const std::vector<std::string> &values) {
        return CSV_format {
                // Here is the formatter lambda that convert each value from string to what you want
                std::strtod(values[Red].c_str(), nullptr),
                std::strtod(values[Green].c_str(), nullptr),
                std::strtod(values[Blue].c_str(), nullptr)
        };
    });

    // Use your colorMap as you  wish...
}

其他回答

我写了一个很好的解析CSV文件的方法,我认为我应该把它作为一个答案:

#include <algorithm>
#include <fstream>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>

struct CSVDict
{
  std::vector< std::string > inputImages;
  std::vector< double > inputLabels;
};

/**
\brief Splits the string

\param str String to split
\param delim Delimiter on the basis of which splitting is to be done
\return results Output in the form of vector of strings
*/
std::vector<std::string> stringSplit( const std::string &str, const std::string &delim )
{
  std::vector<std::string> results;

  for (size_t i = 0; i < str.length(); i++)
  {
    std::string tempString = "";
    while ((str[i] != *delim.c_str()) && (i < str.length()))
    {
      tempString += str[i];
      i++;
    }
    results.push_back(tempString);
  }

  return results;
}

/**
\brief Parse the supplied CSV File and obtain Row and Column information. 

Assumptions:
1. Header information is in first row
2. Delimiters are only used to differentiate cell members

\param csvFileName The full path of the file to parse
\param inputColumns The string of input columns which contain the data to be used for further processing
\param inputLabels The string of input labels based on which further processing is to be done
\param delim The delimiters used in inputColumns and inputLabels
\return Vector of Vector of strings: Collection of rows and columns
*/
std::vector< CSVDict > parseCSVFile( const std::string &csvFileName, const std::string &inputColumns, const std::string &inputLabels, const std::string &delim )
{
  std::vector< CSVDict > return_CSVDict;
  std::vector< std::string > inputColumnsVec = stringSplit(inputColumns, delim), inputLabelsVec = stringSplit(inputLabels, delim);
  std::vector< std::vector< std::string > > returnVector;
  std::ifstream inFile(csvFileName.c_str());
  int row = 0;
  std::vector< size_t > inputColumnIndeces, inputLabelIndeces;
  for (std::string line; std::getline(inFile, line, '\n');)
  {
    CSVDict tempDict;
    std::vector< std::string > rowVec;
    line.erase(std::remove(line.begin(), line.end(), '"'), line.end());
    rowVec = stringSplit(line, delim);

    // for the first row, record the indeces of the inputColumns and inputLabels
    if (row == 0)
    {
      for (size_t i = 0; i < rowVec.size(); i++)
      {
        for (size_t j = 0; j < inputColumnsVec.size(); j++)
        {
          if (rowVec[i] == inputColumnsVec[j])
          {
            inputColumnIndeces.push_back(i);
          }
        }
        for (size_t j = 0; j < inputLabelsVec.size(); j++)
        {
          if (rowVec[i] == inputLabelsVec[j])
          {
            inputLabelIndeces.push_back(i);
          }
        }
      }
    }
    else
    {
      for (size_t i = 0; i < inputColumnIndeces.size(); i++)
      {
        tempDict.inputImages.push_back(rowVec[inputColumnIndeces[i]]);
      }
      for (size_t i = 0; i < inputLabelIndeces.size(); i++)
      {
        double test = std::atof(rowVec[inputLabelIndeces[i]].c_str());
        tempDict.inputLabels.push_back(std::atof(rowVec[inputLabelIndeces[i]].c_str()));
      }
      return_CSVDict.push_back(tempDict);
    }
    row++;
  }

  return return_CSVDict;
}

您可以使用fopen,fscanf函数打开和读取.csv文件,但重要的是解析数据。使用分隔符解析数据的最简单方法。对于.csv,分隔符为','。

假设你的data1.csv文件如下所示:

A,45,76,01
B,77,67,02
C,63,76,03
D,65,44,04

您可以标记数据并存储在字符数组中,然后使用atoi()等函数进行适当的转换

FILE *fp;
char str1[10], str2[10], str3[10], str4[10];

fp = fopen("G:\\data1.csv", "r");
if(NULL == fp)
{
    printf("\nError in opening file.");
    return 0;
}
while(EOF != fscanf(fp, " %[^,], %[^,], %[^,], %s, %s, %s, %s ", str1, str2, str3, str4))
{
    printf("\n%s %s %s %s", str1, str2, str3, str4);
}
fclose(fp);

[^,], ^ -它颠倒了逻辑,意思是匹配任何不包含逗号的字符串,然后最后,表示匹配终止前一个字符串的逗号。

可以使用std::regex。

根据文件大小和可用内存,可以逐行读取,也可以完全在std::string中读取。

读取文件可以使用:

std::ifstream t("file.txt");
std::string sin((std::istreambuf_iterator<char>(t)),
                 std::istreambuf_iterator<char>());

然后你可以和这个相匹配,它实际上是根据你的需要定制的。

std::regex word_regex(",\\s]+");
auto what = 
    std::sregex_iterator(sin.begin(), sin.end(), word_regex);
auto wend = std::sregex_iterator();

std::vector<std::string> v;
for (;what!=wend ; wend) {
    std::smatch match = *what;
    v.push_back(match.str());
}

由于所有CSV问题似乎都被重定向到这里,我想我应该在这里发布我的答案。这个回答并没有直接回答提问者的问题。我希望能够读取已知的CSV格式的流,而且每个字段的类型都已经知道。当然,可以使用下面的方法将每个字段处理为字符串类型。

作为我希望能够使用CSV输入流的一个例子,考虑以下输入(取自维基百科的CSV页面):

const char input[] =
"Year,Make,Model,Description,Price\n"
"1997,Ford,E350,\"ac, abs, moon\",3000.00\n"
"1999,Chevy,\"Venture \"\"Extended Edition\"\"\",\"\",4900.00\n"
"1999,Chevy,\"Venture \"\"Extended Edition, Very Large\"\"\",\"\",5000.00\n"
"1996,Jeep,Grand Cherokee,\"MUST SELL!\n\
air, moon roof, loaded\",4799.00\n"
;

然后,我希望能够像这样读取数据:

std::istringstream ss(input);
std::string title[5];
int year;
std::string make, model, desc;
float price;
csv_istream(ss)
    >> title[0] >> title[1] >> title[2] >> title[3] >> title[4];
while (csv_istream(ss)
       >> year >> make >> model >> desc >> price) {
    //...do something with the record...
}

这就是我最后得到的解。

struct csv_istream {
    std::istream &is_;
    csv_istream (std::istream &is) : is_(is) {}
    void scan_ws () const {
        while (is_.good()) {
            int c = is_.peek();
            if (c != ' ' && c != '\t') break;
            is_.get();
        }
    }
    void scan (std::string *s = 0) const {
        std::string ws;
        int c = is_.get();
        if (is_.good()) {
            do {
                if (c == ',' || c == '\n') break;
                if (s) {
                    ws += c;
                    if (c != ' ' && c != '\t') {
                        *s += ws;
                        ws.clear();
                    }
                }
                c = is_.get();
            } while (is_.good());
            if (is_.eof()) is_.clear();
        }
    }
    template <typename T, bool> struct set_value {
        void operator () (std::string in, T &v) const {
            std::istringstream(in) >> v;
        }
    };
    template <typename T> struct set_value<T, true> {
        template <bool SIGNED> void convert (std::string in, T &v) const {
            if (SIGNED) v = ::strtoll(in.c_str(), 0, 0);
            else v = ::strtoull(in.c_str(), 0, 0);
        }
        void operator () (std::string in, T &v) const {
            convert<is_signed_int<T>::val>(in, v);
        }
    };
    template <typename T> const csv_istream & operator >> (T &v) const {
        std::string tmp;
        scan(&tmp);
        set_value<T, is_int<T>::val>()(tmp, v);
        return *this;
    }
    const csv_istream & operator >> (std::string &v) const {
        v.clear();
        scan_ws();
        if (is_.peek() != '"') scan(&v);
        else {
            std::string tmp;
            is_.get();
            std::getline(is_, tmp, '"');
            while (is_.peek() == '"') {
                v += tmp;
                v += is_.get();
                std::getline(is_, tmp, '"');
            }
            v += tmp;
            scan();
        }
        return *this;
    }
    template <typename T>
    const csv_istream & operator >> (T &(*manip)(T &)) const {
        is_ >> manip;
        return *this;
    }
    operator bool () const { return !is_.fail(); }
};

使用以下helper,可以通过c++ 11中的新积分特征模板进行简化:

template <typename T> struct is_signed_int { enum { val = false }; };
template <> struct is_signed_int<short> { enum { val = true}; };
template <> struct is_signed_int<int> { enum { val = true}; };
template <> struct is_signed_int<long> { enum { val = true}; };
template <> struct is_signed_int<long long> { enum { val = true}; };

template <typename T> struct is_unsigned_int { enum { val = false }; };
template <> struct is_unsigned_int<unsigned short> { enum { val = true}; };
template <> struct is_unsigned_int<unsigned int> { enum { val = true}; };
template <> struct is_unsigned_int<unsigned long> { enum { val = true}; };
template <> struct is_unsigned_int<unsigned long long> { enum { val = true}; };

template <typename T> struct is_int {
    enum { val = (is_signed_int<T>::val || is_unsigned_int<T>::val) };
};

在网上试试!

不管怎样,下面是我的实现。它处理wstring输入,但是可以很容易地调整为string。它不处理字段中的换行符(因为我的应用程序也不这样做,但添加它的支持并不太难),它不符合RFC中的“\r\n”行尾(假设您使用std::getline),但它确实正确地处理空格修剪和双引号(希望如此)。

using namespace std;

// trim whitespaces around field or double-quotes, remove double-quotes and replace escaped double-quotes (double double-quotes)
wstring trimquote(const wstring& str, const wstring& whitespace, const wchar_t quotChar)
{
    wstring ws;
    wstring::size_type strBegin = str.find_first_not_of(whitespace);
    if (strBegin == wstring::npos)
        return L"";

    wstring::size_type strEnd = str.find_last_not_of(whitespace);
    wstring::size_type strRange = strEnd - strBegin + 1;

    if((str[strBegin] == quotChar) && (str[strEnd] == quotChar))
    {
        ws = str.substr(strBegin+1, strRange-2);
        strBegin = 0;
        while((strEnd = ws.find(quotChar, strBegin)) != wstring::npos)
        {
            ws.erase(strEnd, 1);
            strBegin = strEnd+1;
        }

    }
    else
        ws = str.substr(strBegin, strRange);
    return ws;
}

pair<unsigned, unsigned> nextCSVQuotePair(const wstring& line, const wchar_t quotChar, unsigned ofs = 0)
{
    pair<unsigned, unsigned> r;
    r.first = line.find(quotChar, ofs);
    r.second = wstring::npos;
    if(r.first != wstring::npos)
    {
        r.second = r.first;
        while(((r.second = line.find(quotChar, r.second+1)) != wstring::npos)
            && (line[r.second+1] == quotChar)) // WARNING: assumes null-terminated string such that line[r.second+1] always exist
            r.second++;

    }
    return r;
}

unsigned parseLine(vector<wstring>& fields, const wstring& line)
{
    unsigned ofs, ofs0, np;
    const wchar_t delim = L',';
    const wstring whitespace = L" \t\xa0\x3000\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x202f\x205f";
    const wchar_t quotChar = L'\"';
    pair<unsigned, unsigned> quot;

    fields.clear();

    ofs = ofs0 = 0;
    quot = nextCSVQuotePair(line, quotChar);
    while((np = line.find(delim, ofs)) != wstring::npos)
    {
        if((np > quot.first) && (np < quot.second))
        { // skip delimiter inside quoted field
            ofs = quot.second+1;
            quot = nextCSVQuotePair(line, quotChar, ofs);
            continue;
        }
        fields.push_back( trimquote(line.substr(ofs0, np-ofs0), whitespace, quotChar) );
        ofs = ofs0 = np+1;
    }
    fields.push_back( trimquote(line.substr(ofs0), whitespace, quotChar) );

    return fields.size();
}