我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。

我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp

我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。


当前回答

我写了一个很好的解析CSV文件的方法,我认为我应该把它作为一个答案:

#include <algorithm>
#include <fstream>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>

struct CSVDict
{
  std::vector< std::string > inputImages;
  std::vector< double > inputLabels;
};

/**
\brief Splits the string

\param str String to split
\param delim Delimiter on the basis of which splitting is to be done
\return results Output in the form of vector of strings
*/
std::vector<std::string> stringSplit( const std::string &str, const std::string &delim )
{
  std::vector<std::string> results;

  for (size_t i = 0; i < str.length(); i++)
  {
    std::string tempString = "";
    while ((str[i] != *delim.c_str()) && (i < str.length()))
    {
      tempString += str[i];
      i++;
    }
    results.push_back(tempString);
  }

  return results;
}

/**
\brief Parse the supplied CSV File and obtain Row and Column information. 

Assumptions:
1. Header information is in first row
2. Delimiters are only used to differentiate cell members

\param csvFileName The full path of the file to parse
\param inputColumns The string of input columns which contain the data to be used for further processing
\param inputLabels The string of input labels based on which further processing is to be done
\param delim The delimiters used in inputColumns and inputLabels
\return Vector of Vector of strings: Collection of rows and columns
*/
std::vector< CSVDict > parseCSVFile( const std::string &csvFileName, const std::string &inputColumns, const std::string &inputLabels, const std::string &delim )
{
  std::vector< CSVDict > return_CSVDict;
  std::vector< std::string > inputColumnsVec = stringSplit(inputColumns, delim), inputLabelsVec = stringSplit(inputLabels, delim);
  std::vector< std::vector< std::string > > returnVector;
  std::ifstream inFile(csvFileName.c_str());
  int row = 0;
  std::vector< size_t > inputColumnIndeces, inputLabelIndeces;
  for (std::string line; std::getline(inFile, line, '\n');)
  {
    CSVDict tempDict;
    std::vector< std::string > rowVec;
    line.erase(std::remove(line.begin(), line.end(), '"'), line.end());
    rowVec = stringSplit(line, delim);

    // for the first row, record the indeces of the inputColumns and inputLabels
    if (row == 0)
    {
      for (size_t i = 0; i < rowVec.size(); i++)
      {
        for (size_t j = 0; j < inputColumnsVec.size(); j++)
        {
          if (rowVec[i] == inputColumnsVec[j])
          {
            inputColumnIndeces.push_back(i);
          }
        }
        for (size_t j = 0; j < inputLabelsVec.size(); j++)
        {
          if (rowVec[i] == inputLabelsVec[j])
          {
            inputLabelIndeces.push_back(i);
          }
        }
      }
    }
    else
    {
      for (size_t i = 0; i < inputColumnIndeces.size(); i++)
      {
        tempDict.inputImages.push_back(rowVec[inputColumnIndeces[i]]);
      }
      for (size_t i = 0; i < inputLabelIndeces.size(); i++)
      {
        double test = std::atof(rowVec[inputLabelIndeces[i]].c_str());
        tempDict.inputLabels.push_back(std::atof(rowVec[inputLabelIndeces[i]].c_str()));
      }
      return_CSVDict.push_back(tempDict);
    }
    row++;
  }

  return return_CSVDict;
}

其他回答

使用Spirit来解析csv并不过分。Spirit非常适合微解析任务。例如,使用Spirit 2.1,它就像:

bool r = phrase_parse(first, last,

    //  Begin grammar
    (
        double_ % ','
    )
    ,
    //  End grammar

    space, v);

向量v被值填满了。在刚刚与Boost 1.41一起发布的新的Spirit 2.1文档中,有一系列教程涉及到这一点。

本教程从简单到复杂。CSV解析器呈现在中间的某个位置,并涉及使用Spirit的各种技术。生成的代码与手写代码一样紧凑。检查生成的汇编程序!

不好意思,但是为了隐藏几行代码,这似乎是非常复杂的语法。

为什么不这样呢:

/**

  Read line from a CSV file

  @param[in] fp file pointer to open file
  @param[in] vls reference to vector of strings to hold next line

  */
void readCSV( FILE *fp, std::vector<std::string>& vls )
{
    vls.clear();
    if( ! fp )
        return;
    char buf[10000];
    if( ! fgets( buf,999,fp) )
        return;
    std::string s = buf;
    int p,q;
    q = -1;
    // loop over columns
    while( 1 ) {
        p = q;
        q = s.find_first_of(",\n",p+1);
        if( q == -1 ) 
            break;
        vls.push_back( s.substr(p+1,q-p-1) );
    }
}

int _tmain(int argc, _TCHAR* argv[])
{
    std::vector<std::string> vls;
    FILE * fp = fopen( argv[1], "r" );
    if( ! fp )
        return 1;
    readCSV( fp, vls );
    readCSV( fp, vls );
    readCSV( fp, vls );
    std::cout << "row 3, col 4 is " << vls[3].c_str() << "\n";

    return 0;
}

使用Boost Tokenizer的解决方案:

std::vector<std::string> vec;
using namespace boost;
tokenizer<escaped_list_separator<char> > tk(
   line, escaped_list_separator<char>('\\', ',', '\"'));
for (tokenizer<escaped_list_separator<char> >::iterator i(tk.begin());
   i!=tk.end();++i) 
{
   vec.push_back(*i);
}

如果可以的话,这是我简单快速的贡献。 没有提高。

接受分隔符和分隔符中的分隔符,只要成对或远离分隔符即可。

#include <iostream>
#include <vector>
#include <fstream>

std::vector<std::string> SplitCSV(const std::string &data, char separator, char delimiter)
{
  std::vector<std::string> Values;
  std::string Val = "";
  bool VDel = false; // Is within delimiter?
  size_t CDel = 0; // Delimiters counter within delimiters.
  const char *C = data.c_str();
  size_t P = 0;
  do
  {
    if ((Val.length() == 0) && (C[P] == delimiter))
    {
      VDel = !VDel;
      CDel = 0;
      P++;
      continue;
    }
    if (VDel)
    {
      if (C[P] == delimiter)
      {
        if (((CDel % 2) == 0) && ( (C[P+1] == separator) || (C[P+1] == 0) || (C[P+1] == '\n') || (C[P+1] == '\r') ))
        {
          VDel = false;
          CDel = 0;
          P++;
          continue;
        }
        else
          CDel++;
      }
    }
    else
    {
      if (C[P] == separator)
      {
        Values.push_back(Val);
        Val = "";
        P++;
        continue;
      }
      if ((C[P] == 0) || (C[P] == '\n') || (C[P] == '\r'))
        break;
    }
    Val += C[P];
    P++;
  } while(P < data.length());
  Values.push_back(Val);
  return Values;
}

bool ReadCsv(const std::string &fname, std::vector<std::vector<std::string>> &data,
  char separator = ',', char delimiter = '\"')
{
  bool Ret = false;
  std::ifstream FCsv(fname);
  if (FCsv)
  {
    FCsv.seekg(0, FCsv.end);
    size_t Siz = FCsv.tellg();
    if (Siz > 0)
    {
      FCsv.seekg(0);
      data.clear();
      std::string Line;
      while (getline(FCsv, Line, '\n'))
        data.push_back(SplitCSV(Line, separator, delimiter));
      Ret = true;
    }
    FCsv.close();
  }
  return Ret;
}

int main(int argc, char *argv[])
{
  std::vector<std::vector<std::string>> Data;
  ReadCsv("fsample.csv", Data);
  return 0;
}

下面是Unicode CSV解析器的另一个实现(使用wchar_t)。我写了一部分,乔纳森·莱弗勒写了剩下的部分。

注意:此解析器旨在尽可能地复制Excel的行为,特别是在导入损坏或格式错误的CSV文件时。

这是最初的问题-用多行字段和转义双引号解析CSV文件

这是作为SSCCE(简短,自包含,正确示例)的代码。

#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>

extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);

// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
    // Parse quoted sequences
    if ('"' == p[0]) {
        p++;
        while (1) {
            // Find next double-quote
            p = wcschr(p, L'"');
            // If we don't find it or it's the last symbol
            // then this is the last field
            if (!p || !p[1])
                return 0;
            // Check for "", it is an escaped double-quote
            if (p[1] != '"')
                break;
            // Skip the escaped double-quote
            p += 2;
        }
    }

    // Find next newline or comma.
    wchar_t newline_or_sep[4] = L"\n\r ";
    newline_or_sep[2] = sep;
    p = wcspbrk(p, newline_or_sep);

    // If no newline or separator, this is the last field.
    if (!p)
        return 0;

    // Check if we had newline.
    *newline = (p[0] == '\r' || p[0] == '\n');

    // Handle "\r\n", otherwise just increment
    if (p[0] == '\r' && p[1] == '\n')
        p += 2;
    else
        p++;

    return p;
}

static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
    wchar_t *dst = buffer;
    wchar_t *end = buffer + buflen - 1;
    const wchar_t *src = fld_s;

    if (*src == L'"')
    {
        const wchar_t *p = src + 1;
        while (p < fld_e && dst < end)
        {
            if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
            {
                *dst++ = p[0];
                p += 2;
            }
            else if (p[0] == L'"')
            {
                p++;
                break;
            }
            else
                *dst++ = *p++;
        }
        src = p;
    }
    while (src < fld_e && dst < end)
        *dst++ = *src++;
    if (dst >= end)
        return 0;
    *dst = L'\0';
    return(buffer);
}

static void dissect(const wchar_t *line)
{
    const wchar_t *start = line;
    const wchar_t *next;
    bool     eol;
    wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
    while ((next = nextCsvField(start, L',', &eol)) != 0)
    {
        wchar_t buffer[1024];
        wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
        if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
            wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
        start = next;
    }
}

static const wchar_t multiline[] =
   L"First field of first row,\"This field is multiline\n"
    "\n"
    "but that's OK because it's enclosed in double quotes, and this\n"
    "is an escaped \"\" double quote\" but this one \"\" is not\n"
    "   \"This is second field of second row, but it is not multiline\n"
    "   because it doesn't start \n"
    "   with an immediate double quote\"\n"
    ;

int main(void)
{
    wchar_t line[1024];

    while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
        dissect(line);
    dissect(multiline);

    return 0;
}