我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过 Boost 的 Spirit,但我愿意尝试一下。不过前提是:没有被我忽略的、更直接的解决方案。
我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过 Boost 的 Spirit,但我愿意尝试一下。不过前提是:没有被我忽略的、更直接的解决方案。
当前回答
我的版本只使用 C++11 标准库。它能很好地处理 Excel 风格 CSV 的引号规则:
spam eggs,"foo,bar","""fizz buzz"""
1.23,4.567,-8.00E+09
代码是作为有限状态机编写的,每次只消耗一个字符。我认为这样更容易推理其行为。
#include <istream>
#include <string>
#include <vector>
/// States of the character-at-a-time CSV row parser.
enum class CSVState {
    UnquotedField,  ///< outside any quoted section
    QuotedField,    ///< inside a quoted section
    QuotedQuote     ///< just read a '"' inside a quoted section: either the
                    ///< closing quote or the first half of an escaped ""
};

/// Split one CSV line (Excel dialect) into its fields.
///
/// Fields are separated by ','. A '"' opens a quoted section in which commas
/// are literal and "" stands for a single '"'. The result always holds at
/// least one field, so an empty line yields one empty string.
///
/// @param row  One line of CSV text, without its line terminator.
/// @return     The fields of the line, with quoting removed.
std::vector<std::string> readCSVRow(const std::string &row) {
    CSVState state = CSVState::UnquotedField;
    std::vector<std::string> fields(1);  // the field being built is fields.back()
    for (const char c : row) {
        switch (state) {
            case CSVState::UnquotedField:
                switch (c) {
                    case ',':  // end of field
                        fields.emplace_back();
                        break;
                    case '"':
                        state = CSVState::QuotedField;
                        break;
                    default:
                        fields.back().push_back(c);
                        break;
                }
                break;
            case CSVState::QuotedField:
                switch (c) {
                    case '"':
                        state = CSVState::QuotedQuote;
                        break;
                    default:
                        fields.back().push_back(c);
                        break;
                }
                break;
            case CSVState::QuotedQuote:
                switch (c) {
                    case ',':  // the quote was a closing quote and the field ends here
                        fields.emplace_back();
                        state = CSVState::UnquotedField;
                        break;
                    case '"':  // "" -> "
                        fields.back().push_back('"');
                        state = CSVState::QuotedField;
                        break;
                    default:
                        // The quote closed the quoted section; the current
                        // character is ordinary field text and must be kept
                        // rather than dropped.
                        fields.back().push_back(c);
                        state = CSVState::UnquotedField;
                        break;
                }
                break;
        }
    }
    return fields;
}
/// Parse a whole CSV stream (Excel dialect, e.g. "quoted fields ""with quotes""").
///
/// The stream is consumed line by line with std::getline and each line is
/// handed to readCSVRow, so a quoted field cannot span several lines.
///
/// @param in  Stream to read from; reading stops at end of input or on the
///            first failed read.
/// @return    One vector of fields per line read.
std::vector<std::vector<std::string>> readCSV(std::istream &in) {
    std::vector<std::vector<std::string>> rows;
    for (std::string line; !in.eof(); ) {
        std::getline(in, line);
        if (!in) {  // operator! reports fail(), which covers failbit and badbit
            break;
        }
        rows.push_back(readCSVRow(line));
    }
    return rows;
}
其他回答
你可以使用这个库: https://github.com/vadamsky/csvworker
代码示例:
#include <iostream>
#include "csvworker.h"
using namespace std;
// Demo of the CsvWorker API: load a file, overwrite some cells, print every
// field, save it, then build a second sheet from scratch and save that too.
int main()
{
    // Cells of the loaded sheet that are overwritten with "0", as {row, column}.
    const int zeroCells[][2] = {
        {0, 2}, {1, 1}, {1, 3}, {2, 0}, {2, 4}, {3, 1}, {3, 3}, {4, 2}
    };

    CsvWorker csv;
    csv.loadFromFile("example.csv");
    cout << csv.getRowsNumber() << " " << csv.getColumnsNumber() << endl;
    for (const auto &cell : zeroCells)
    {
        csv.getFieldRef(cell[0], cell[1]) = "0";
    }

    // Print the sheet, one row per output line, each field followed by '.'.
    for (unsigned int row = 0; row < csv.getRowsNumber(); ++row)
    {
        for (unsigned int col = 0; col < csv.getColumnsNumber(); ++col)
        {
            cout << csv.getField(row, col) << ".";
        }
        cout << endl;
    }
    csv.saveToFile("test.csv");

    // Values written into the second sheet; nullptr marks a cell that is
    // not assigned. Row 3 of the 4x4 sheet is not assigned at all.
    const char *const letters[3][4] = {
        {"a", "b", "r", "a"},
        {"c", "a", "d", nullptr},
        {"a", "b", "r", "a"}
    };

    CsvWorker csv2(4, 4);
    for (int row = 0; row < 3; ++row)
    {
        for (int col = 0; col < 4; ++col)
        {
            if (letters[row][col] != nullptr)
            {
                csv2.getFieldRef(row, col) = letters[row][col];
            }
        }
    }
    csv2.saveToFile("test2.csv");
    return 0;
}
既然大家都贴出了自己的解决方案,这里是我使用模板、lambda 和 tuple 的版本。
它可以将任何具有所需列的CSV转换为tuple的c++向量。
它通过在元组中定义每个CSV行元素类型来工作。
您还需要定义一个 Formatter lambda,把每个元素从 std::string 转换为对应的类型(例如使用 std::strtod)。
然后你就得到了这个结构的一个向量,对应于你的CSV数据。
您可以很容易地重用它来匹配任何CSV结构。
StringHelpers.hpp
#include <fstream>
#include <functional>
#include <stdexcept>
#include <string>
#include <tuple>
#include <vector>
namespace StringHelpers
{
    /// Callable that converts the string fields of one CSV line into a Tuple.
    template<typename Tuple>
    using Formatter = std::function<Tuple(const std::vector<std::string> &)>;

    /// Split `string` into the substrings separated by `delimiter`.
    std::vector<std::string> split(const std::string &string, const std::string &delimiter);

    /// Read the CSV file at `path`, split every line on `delimiter` and convert
    /// it with `formatter`; returns one Tuple per line.
    /// Only the declaration is given here, so the Tuple types that can be used
    /// depend on where the definition is instantiated.
    template<typename Tuple>
    std::vector<Tuple> readCsv(const std::string &path, const std::string &delimiter, Formatter<Tuple> formatter);
} // namespace StringHelpers
StringHelpers.cpp
#include "StringHelpers.hpp"
namespace StringHelpers
{
/**
 * Split a string into the substrings separated by the given delimiter.
 *
 * Empty substrings are kept, so "a,,b" split on "," gives {"a", "", "b"} and
 * the result always contains at least one element. The delimiter may be longer
 * than one character. An empty delimiter never matches: the whole input is
 * returned as a single element.
 *
 * @param string - The string to extract the substrings from
 * @param delimiter - The substrings delimiter
 *
 * @return The substrings, in order of appearance
 */
std::vector<std::string> split(const std::string &string, const std::string &delimiter)
{
    std::vector<std::string> result;
    // find("") matches at every position, which would walk past the end of
    // the string; treat "no delimiter" as "nothing to split".
    if (delimiter.empty()) {
        result.emplace_back(string);
        return result;
    }
    std::string::size_type last = 0;
    std::string::size_type next = 0;
    while ((next = string.find(delimiter, last)) != std::string::npos) {
        result.emplace_back(string.substr(last, next - last));
        // Skip the whole delimiter, not just its first character.
        last = next + delimiter.size();
    }
    result.emplace_back(string.substr(last));
    return result;
}
/**
 * Load a CSV file and convert each of its lines into a Tuple.
 *
 * Every line is split on the delimiter and the resulting strings are handed to
 * the formatter, whose return value is appended to the result.
 *
 * @tparam Tuple - The type produced for each CSV line
 *
 * @param path - The CSV file path
 * @param delimiter - The CSV values delimiter
 * @param formatter - Converts the vector of strings of one line into a Tuple
 *
 * @return One Tuple per line of the file, in file order
 *
 * @throws std::runtime_error if the file could not be opened
 */
template<typename Tuple>
std::vector<Tuple> readCsv(const std::string &path, const std::string &delimiter, Formatter<Tuple> formatter)
{
    std::ifstream input(path, std::ifstream::in);
    if (!input) {
        throw std::runtime_error("The file " + path + " could not be opened");
    }
    std::vector<Tuple> rows;
    for (std::string line; std::getline(input, line); ) {
        rows.emplace_back(formatter(split(line, delimiter)));
    }
    // The stream closes the file when it goes out of scope.
    return rows;
}
// Explicit instantiation of readCsv for std::tuple<double, double, double>
template std::vector<std::tuple<double, double, double>> readCsv<std::tuple<double, double, double>>(const std::string &, const std::string &, Formatter<std::tuple<double, double, double>>);
} // End of StringHelpers namespace
Main.cpp(一些用法)
#include "StringHelpers.hpp"
/**
 * Example of use with a CSV file whose lines hold (number,Red,Green,Blue).
 * The first value of each line is not used.
 *
 * A line with fewer than four values makes the formatter throw
 * std::out_of_range instead of reading past the end of the vector.
 */
int main(int argc, char **argv)
{
    (void)argc;  // command-line arguments are not used by this example
    (void)argv;

    // Type produced for each CSV line
    using CSV_format = std::tuple<double, double, double>;
    // Column index of each colour component; column 0 is skipped
    enum RGB { Red = 1, Green, Blue };
    const std::string COLOR_MAP_PATH = "/some/absolute/path";

    // Load the color map
    auto colorMap = StringHelpers::readCsv<CSV_format>(COLOR_MAP_PATH, ",", [](const std::vector<std::string> &values) {
        return CSV_format {
            // Here is the formatter lambda that converts each value from string to what you want.
            // at() is bounds-checked, so a short line cannot cause an out-of-range read.
            std::strtod(values.at(Red).c_str(), nullptr),
            std::strtod(values.at(Green).c_str(), nullptr),
            std::strtod(values.at(Blue).c_str(), nullptr)
        };
    });

    // Use your colorMap as you wish...
    (void)colorMap;
    return 0;
}
当对CSV文件使用Boost Tokenizer escaped_list_separator时,应该注意以下几点:
它需要一个转义字符(默认的反斜杠- \) 它需要一个分割符/分隔符-字符(默认逗号-,) 它需要一个引号字符(默认的引号- ")
wiki指定的CSV格式规定数据字段可以包含引号分隔符(支持):
1997,Ford,E350,"Super, luxurious truck"
由wiki指定的CSV格式规定单引号应该用双引号处理(escaped_list_separator将剥离所有引号字符):
1997,Ford,E350,"Super ""luxurious"" truck"
CSV格式没有指定应该删除任何反斜杠字符(escaped_list_separator将删除所有转义字符)。
修复boost escaped_list_separator的默认行为的一个可能的变通方法:
首先将所有反斜杠字符(\)替换为两个反斜杠字符(\\),这样它们就不会被剥离。 其次,将所有双引号("")替换为一个反斜杠字符和一个引号(\")
这种变通方法有一个副作用,即由双引号表示的空数据字段将被转换为单引号标记。在遍历令牌时,必须检查令牌是否是单引号,并将其视为空字符串。
不漂亮,但它工作,只要在引号中没有换行。
由于我现在不习惯boost,我将建议一个更简单的解决方案。假设您的.csv文件有100行,每行有10个数字,用“,”分隔。你可以用下面的代码以数组的形式加载这个数据:
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
using namespace std;
// Loads up to 100 lines of up to 10 comma-separated integers each from
// "name_of_file.csv" into a fixed-size array.
// Cells for which no number could be read stay 0. Returns 1 if the file
// cannot be opened, 0 otherwise.
int main()
{
    const int kRows = 100;
    const int kCols = 10;
    int A[kRows][kCols] = {};  // zero-initialised so unread cells have a defined value

    std::ifstream ifs("name_of_file.csv");
    if (!ifs)
    {
        std::cerr << "could not open name_of_file.csv\n";
        return 1;
    }

    std::string line;
    // Stop at the row limit or as soon as the file runs out of lines.
    for (int k = 0; k < kRows && std::getline(ifs, line); ++k)
    {
        std::stringstream stream(line);
        char separator;
        // The column bound keeps a line with extra values from writing past the row.
        for (int j = 0; j < kCols && (stream >> A[k][j]); ++j)
        {
            stream >> separator;  // consume the ',' that follows the number, if any
        }
    }

    (void)A;  // the example only demonstrates loading
    return 0;
}
如果您正在使用Visual Studio / MFC,下面的解决方案可能会使您的工作更轻松。它支持Unicode和MBCS,有注释,除了CString之外没有其他依赖项,对我来说工作得很好。它不支持在带引号的字符串中嵌入换行符,但我不在乎,只要它在这种情况下不崩溃,它不会崩溃。
总体策略是,将带引号的字符串和空字符串作为特殊情况处理,其余使用Tokenize。对于带引号的字符串,策略是找到真正的结束引号,跟踪是否遇到了连续的引号对。如果是,则使用Replace将成对转换为单个。毫无疑问,有更有效的方法,但在我的案例中,性能还不够重要,不足以证明进一步优化的合理性。
// Splits one line of CSV text into its comma-separated fields.
// Construct it with the line, then call GetString() repeatedly; each call
// yields the next field until it returns false.
class CParseCSV {
public:
// Construction
// Keeps a copy of sLine and starts at its first character.
CParseCSV(const CString& sLine);
// Operations
// Stores the next field in sDest and returns true, or returns false once
// the current position has left the line.
bool GetString(CString& sDest);
protected:
CString m_sLine; // line to extract tokens from
int m_nLen; // line length in characters
int m_iPos; // index of current position; set to -1 once the final empty token has been returned
};
// Copies the line to parse, records its length and places the cursor on the
// first character.
CParseCSV::CParseCSV(const CString& sLine)
    : m_sLine(sLine),
      m_nLen(sLine.GetLength()),
      m_iPos(0)
{
}
// Extracts the next field of the line into sDest.
// Returns true when a field (possibly empty) was produced, false when the
// cursor is outside the line and nothing is left to return.
bool CParseCSV::GetString(CString& sDest)
{
    // A cursor outside [0, length] means there is nothing more to hand out.
    if (m_iPos < 0 || m_iPos > m_nLen)
        return false;

    // Cursor exactly at the end: produce one last empty field, then mark the
    // parser as finished.
    if (m_iPos == m_nLen) {
        sDest.Empty();
        m_iPos = -1;
        return true;
    }

    // A comma right at the cursor is an empty field.
    if (m_sLine[m_iPos] == ',') {
        sDest.Empty();
        ++m_iPos;
        return true;
    }

    // Anything that does not open with a double quote is a plain field;
    // Tokenize receives m_iPos and is relied on to move it forward.
    if (m_sLine[m_iPos] != '\"') {
        sDest = m_sLine.Tokenize(_T(","), m_iPos);
        return true;
    }

    // Quoted field: step over the opening quote and look for the closing one.
    const int iFirst = ++m_iPos;
    bool bSawQuotePair = false;
    for (; m_iPos < m_nLen; ++m_iPos) {
        if (m_sLine[m_iPos] != '\"')
            continue;
        const bool bPair = m_iPos + 1 < m_nLen && m_sLine[m_iPos + 1] == '\"';
        if (!bPair)
            break;              // lone quote: this is the closing quote
        bSawQuotePair = true;   // remember to collapse "" into " afterwards
        ++m_iPos;               // step onto the second quote; the loop then moves past it
    }

    sDest = m_sLine.Mid(iFirst, m_iPos - iFirst);
    if (bSawQuotePair)
        sDest.Replace(_T("\"\""), _T("\""));
    m_iPos += 2;    // skip closing quote and trailing delimiter if any
    return true;
}
// calling code should look something like this:
CStdioFile fIn(pszPath, CFile::modeRead);
CString sLine;
CString sToken;
while (fIn.ReadString(sLine)) {     // one iteration per line of the input file
    if (sLine.IsEmpty())
        continue;                   // blank lines are skipped
    CParseCSV csv(sLine);
    while (csv.GetString(sToken)) {
        // do something with sToken here
    }
}