我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。

我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp

我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。


当前回答

我有一个更快的解决方案,最初是针对这个问题的:

如何拉动不同弦的特定部分?

但它显然是关闭的。但我不会把它扔掉:

#include <iostream>
#include <string>
#include <regex>

std::string text = "\"4,\"\"3\"\",\"\"Mon May 11 03:17:40 UTC 2009\"\",\"\"kindle2\"\",\"\"tpryan\"\",\"\"TEXT HERE\"\"\";;;;";

int main()
{
    std::regex r("(\".*\")(\".*\")(\".*\")(\".*\")(\".*\")(\".*\")(\".*\")(\".*\")(\".*\")(\".*\")");
    std::smatch m;
    std::regex_search(text, m, r);
    std::cout<<"FOUND: "<<m[9]<<std::endl;

    return 0;
}

只需根据索引从smatch集合中挑选出您想要的匹配项。 正则表达式是福气。

其他回答

下面是Unicode CSV解析器的另一个实现(使用wchar_t)。我写了一部分,乔纳森·莱弗勒写了剩下的部分。

注意:此解析器旨在尽可能地复制Excel的行为,特别是在导入损坏或格式错误的CSV文件时。

这是最初的问题-用多行字段和转义双引号解析CSV文件

这是作为SSCCE(简短,自包含,正确示例)的代码。

#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>

extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);

// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
    // Parse quoted sequences
    if ('"' == p[0]) {
        p++;
        while (1) {
            // Find next double-quote
            p = wcschr(p, L'"');
            // If we don't find it or it's the last symbol
            // then this is the last field
            if (!p || !p[1])
                return 0;
            // Check for "", it is an escaped double-quote
            if (p[1] != '"')
                break;
            // Skip the escaped double-quote
            p += 2;
        }
    }

    // Find next newline or comma.
    wchar_t newline_or_sep[4] = L"\n\r ";
    newline_or_sep[2] = sep;
    p = wcspbrk(p, newline_or_sep);

    // If no newline or separator, this is the last field.
    if (!p)
        return 0;

    // Check if we had newline.
    *newline = (p[0] == '\r' || p[0] == '\n');

    // Handle "\r\n", otherwise just increment
    if (p[0] == '\r' && p[1] == '\n')
        p += 2;
    else
        p++;

    return p;
}

static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
    wchar_t *dst = buffer;
    wchar_t *end = buffer + buflen - 1;
    const wchar_t *src = fld_s;

    if (*src == L'"')
    {
        const wchar_t *p = src + 1;
        while (p < fld_e && dst < end)
        {
            if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
            {
                *dst++ = p[0];
                p += 2;
            }
            else if (p[0] == L'"')
            {
                p++;
                break;
            }
            else
                *dst++ = *p++;
        }
        src = p;
    }
    while (src < fld_e && dst < end)
        *dst++ = *src++;
    if (dst >= end)
        return 0;
    *dst = L'\0';
    return(buffer);
}

static void dissect(const wchar_t *line)
{
    const wchar_t *start = line;
    const wchar_t *next;
    bool     eol;
    wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
    while ((next = nextCsvField(start, L',', &eol)) != 0)
    {
        wchar_t buffer[1024];
        wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
        if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
            wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
        start = next;
    }
}

static const wchar_t multiline[] =
   L"First field of first row,\"This field is multiline\n"
    "\n"
    "but that's OK because it's enclosed in double quotes, and this\n"
    "is an escaped \"\" double quote\" but this one \"\" is not\n"
    "   \"This is second field of second row, but it is not multiline\n"
    "   because it doesn't start \n"
    "   with an immediate double quote\"\n"
    ;

int main(void)
{
    wchar_t line[1024];

    while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
        dissect(line);
    dissect(multiline);

    return 0;
}

如果您所需要的只是加载一个双精度数据文件(没有整数,没有文本),那么这里有一个随时可用的函数。

#include <sstream>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <algorithm>

using namespace std;

/**
 * Parse a CSV data file and fill the 2d STL vector "data".
 * Limits: only "pure datas" of doubles, not encapsulated by " and without \n inside.
 * Further no formatting in the data (e.g. scientific notation)
 * It however handles both dots and commas as decimal separators and removes thousand separator.
 * 
 * returnCodes[0]: file access 0-> ok 1-> not able to read; 2-> decimal separator equal to comma separator
 * returnCodes[1]: number of records
 * returnCodes[2]: number of fields. -1 If rows have different field size
 * 
 */
vector<int>
readCsvData (vector <vector <double>>& data, const string& filename, const string& delimiter, const string& decseparator){

 int vv[3] = { 0,0,0 };
 vector<int> returnCodes(&vv[0], &vv[0]+3);

 string rowstring, stringtoken;
 double doubletoken;
 int rowcount=0;
 int fieldcount=0;
 data.clear();

 ifstream iFile(filename, ios_base::in);
 if (!iFile.is_open()){
   returnCodes[0] = 1;
   return returnCodes;
 }
 while (getline(iFile, rowstring)) {
    if (rowstring=="") continue; // empty line
    rowcount ++; //let's start with 1
    if(delimiter == decseparator){
      returnCodes[0] = 2;
      return returnCodes;
    }
    if(decseparator != "."){
     // remove dots (used as thousand separators)
     string::iterator end_pos = remove(rowstring.begin(), rowstring.end(), '.');
     rowstring.erase(end_pos, rowstring.end());
     // replace decimal separator with dots.
     replace(rowstring.begin(), rowstring.end(),decseparator.c_str()[0], '.'); 
    } else {
     // remove commas (used as thousand separators)
     string::iterator end_pos = remove(rowstring.begin(), rowstring.end(), ',');
     rowstring.erase(end_pos, rowstring.end());
    }
    // tokenize..
    vector<double> tokens;
    // Skip delimiters at beginning.
    string::size_type lastPos = rowstring.find_first_not_of(delimiter, 0);
    // Find first "non-delimiter".
    string::size_type pos     = rowstring.find_first_of(delimiter, lastPos);
    while (string::npos != pos || string::npos != lastPos){
        // Found a token, convert it to double add it to the vector.
        stringtoken = rowstring.substr(lastPos, pos - lastPos);
        if (stringtoken == "") {
      tokens.push_back(0.0);
    } else {
          istringstream totalSString(stringtoken);
      totalSString >> doubletoken;
      tokens.push_back(doubletoken);
    }     
        // Skip delimiters.  Note the "not_of"
        lastPos = rowstring.find_first_not_of(delimiter, pos);
        // Find next "non-delimiter"
        pos = rowstring.find_first_of(delimiter, lastPos);
    }
    if(rowcount == 1){
      fieldcount = tokens.size();
      returnCodes[2] = tokens.size();
    } else {
      if ( tokens.size() != fieldcount){
    returnCodes[2] = -1;
      }
    }
    data.push_back(tokens);
 }
 iFile.close();
 returnCodes[1] = rowcount;
 return returnCodes;
}

使用Spirit来解析csv并不过分。Spirit非常适合微解析任务。例如,使用Spirit 2.1,它就像:

bool r = phrase_parse(first, last,

    //  Begin grammar
    (
        double_ % ','
    )
    ,
    //  End grammar

    space, v);

向量v被值填满了。在刚刚与Boost 1.41一起发布的新的Spirit 2.1文档中,有一系列教程涉及到这一点。

本教程从简单到复杂。CSV解析器呈现在中间的某个位置,并涉及使用Spirit的各种技术。生成的代码与手写代码一样紧凑。检查生成的汇编程序!

我写了一个只有头文件的c++ 11 CSV解析器。它经过了良好的测试,快速,支持整个CSV规范(带引号的字段,引号中的分隔符/结束符,引号转义等),并且可以配置为不符合规范的CSV。

配置是通过一个流畅的接口完成的:

// constructor accepts any input stream
CsvParser parser = CsvParser(std::cin)
  .delimiter(';')    // delimited by ; instead of ,
  .quote('\'')       // quoted fields use ' instead of "
  .terminator('\0'); // terminated by \0 instead of by \r\n, \n, or \r

解析只是一个基于范围的for循环:

#include <iostream>
#include "../parser.hpp"

using namespace aria::csv;

int main() {
  std::ifstream f("some_file.csv");
  CsvParser parser(f);

  for (auto& row : parser) {
    for (auto& field : row) {
      std::cout << field << " | ";
    }
    std::cout << std::endl;
  }
}

@sastanin的解决方案的一个小版本,以便它可以处理引号中的换行。

std::vector<std::vector<std::string>> readCSV(std::istream &in) {
    std::vector<std::vector<std::string>> table;

    while (!in.eof()) {
        CSVState state = CSVState::UnquotedField;
        std::vector<std::string> fields {""};
        size_t i = 0; // index of the current field
        for (char c : row) {
            switch (state) {
                case CSVState::UnquotedField:
                    switch (c) {
                        case ',': // end of field
                                  fields.push_back(""); i++;
                                  break;
                        case '"': state = CSVState::QuotedField;
                                  break;
                        default:  fields[i].push_back(c);
                                  break; }
                    break;
                case CSVState::QuotedField:
                    switch (c) {
                        case '"': state = CSVState::QuotedQuote;
                                  break;
                        default:  fields[i].push_back(c);
                                  break; }
                    break;
                case CSVState::QuotedQuote:
                    switch (c) {
                        case ',': // , after closing quote
                                  fields.push_back(""); i++;
                                  state = CSVState::UnquotedField;
                                  break;
                        case '"': // "" -> "
                                  fields[i].push_back('"');
                                  state = CSVState::QuotedField;
                                  break;
                        case '\n': // newline
                                  table.push_back(fields);
                                  state = CSVState::UnquotedField;
                                  fields = vector<string>{""};
                                  i = 0;
                        default:  // end of quote
                                  state = CSVState::UnquotedField;
                                  break; }
                    break;
            }
        }
    }
    return table;
}