我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
当前回答
下面是Unicode CSV解析器的另一个实现(使用wchar_t)。我写了一部分,乔纳森·莱弗勒写了剩下的部分。
注意:此解析器旨在尽可能地复制Excel的行为,特别是在导入损坏或格式错误的CSV文件时。
这是最初的问题-用多行字段和转义双引号解析CSV文件
这是作为SSCCE(简短,自包含,正确示例)的代码。
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
wchar_t *dst = buffer;
wchar_t *end = buffer + buflen - 1;
const wchar_t *src = fld_s;
if (*src == L'"')
{
const wchar_t *p = src + 1;
while (p < fld_e && dst < end)
{
if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
{
*dst++ = p[0];
p += 2;
}
else if (p[0] == L'"')
{
p++;
break;
}
else
*dst++ = *p++;
}
src = p;
}
while (src < fld_e && dst < end)
*dst++ = *src++;
if (dst >= end)
return 0;
*dst = L'\0';
return(buffer);
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wchar_t buffer[1024];
wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
其他回答
如果您正在使用Visual Studio / MFC,下面的解决方案可能会使您的工作更轻松。它支持Unicode和MBCS,有注释,除了CString之外没有其他依赖项,对我来说工作得很好。它不支持在带引号的字符串中嵌入换行符,但我不在乎,只要它在这种情况下不崩溃,它不会崩溃。
总体策略是,将带引号的字符串和空字符串作为特殊情况处理,其余使用Tokenize。对于带引号的字符串,策略是找到真正的结束引号,跟踪是否遇到了连续的引号对。如果是,则使用Replace将成对转换为单个。毫无疑问,有更有效的方法,但在我的案例中,性能还不够重要,不足以证明进一步优化的合理性。
class CParseCSV {
public:
// Construction
CParseCSV(const CString& sLine);
// Attributes
bool GetString(CString& sDest);
protected:
CString m_sLine; // line to extract tokens from
int m_nLen; // line length in characters
int m_iPos; // index of current position
};
CParseCSV::CParseCSV(const CString& sLine) : m_sLine(sLine)
{
m_nLen = m_sLine.GetLength();
m_iPos = 0;
}
bool CParseCSV::GetString(CString& sDest)
{
if (m_iPos < 0 || m_iPos > m_nLen) // if position out of range
return false;
if (m_iPos == m_nLen) { // if at end of string
sDest.Empty(); // return empty token
m_iPos = -1; // really done now
return true;
}
if (m_sLine[m_iPos] == '\"') { // if current char is double quote
m_iPos++; // advance to next char
int iTokenStart = m_iPos;
bool bHasEmbeddedQuotes = false;
while (m_iPos < m_nLen) { // while more chars to parse
if (m_sLine[m_iPos] == '\"') { // if current char is double quote
// if next char exists and is also double quote
if (m_iPos < m_nLen - 1 && m_sLine[m_iPos + 1] == '\"') {
// found pair of consecutive double quotes
bHasEmbeddedQuotes = true; // request conversion
m_iPos++; // skip first quote in pair
} else // next char doesn't exist or is normal
break; // found closing quote; exit loop
}
m_iPos++; // advance to next char
}
sDest = m_sLine.Mid(iTokenStart, m_iPos - iTokenStart);
if (bHasEmbeddedQuotes) // if string contains embedded quote pairs
sDest.Replace(_T("\"\""), _T("\"")); // convert pairs to singles
m_iPos += 2; // skip closing quote and trailing delimiter if any
} else if (m_sLine[m_iPos] == ',') { // else if char is comma
sDest.Empty(); // return empty token
m_iPos++; // advance to next char
} else { // else get next comma-delimited token
sDest = m_sLine.Tokenize(_T(","), m_iPos);
}
return true;
}
// calling code should look something like this:
CStdioFile fIn(pszPath, CFile::modeRead);
CString sLine, sToken;
while (fIn.ReadString(sLine)) { // for each line of input file
if (!sLine.IsEmpty()) { // ignore blank lines
CParseCSV csv(sLine);
while (csv.GetString(sToken)) {
// do something with sToken here
}
}
}
如果你不关心转义逗号和换行符, 并且你不能在引号中嵌入逗号和换行符(如果你不能转义那么…) 那么它只有大约三行代码(好的14 ->,但它只有15读取整个文件)。
std::vector<std::string> getNextLineAndSplitIntoTokens(std::istream& str)
{
std::vector<std::string> result;
std::string line;
std::getline(str,line);
std::stringstream lineStream(line);
std::string cell;
while(std::getline(lineStream,cell, ','))
{
result.push_back(cell);
}
// This checks for a trailing comma with no data after it.
if (!lineStream && cell.empty())
{
// If there was a trailing comma then add an empty element.
result.push_back("");
}
return result;
}
我只需要创建一个表示一行的类。 然后流到该对象:
#include <iterator>
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>
class CSVRow
{
public:
std::string_view operator[](std::size_t index) const
{
return std::string_view(&m_line[m_data[index] + 1], m_data[index + 1] - (m_data[index] + 1));
}
std::size_t size() const
{
return m_data.size() - 1;
}
void readNextRow(std::istream& str)
{
std::getline(str, m_line);
m_data.clear();
m_data.emplace_back(-1);
std::string::size_type pos = 0;
while((pos = m_line.find(',', pos)) != std::string::npos)
{
m_data.emplace_back(pos);
++pos;
}
// This checks for a trailing comma with no data after it.
pos = m_line.size();
m_data.emplace_back(pos);
}
private:
std::string m_line;
std::vector<int> m_data;
};
std::istream& operator>>(std::istream& str, CSVRow& data)
{
data.readNextRow(str);
return str;
}
int main()
{
std::ifstream file("plop.csv");
CSVRow row;
while(file >> row)
{
std::cout << "4th Element(" << row[3] << ")\n";
}
}
但只要做一点工作,我们就可以在技术上创建一个迭代器:
class CSVIterator
{
public:
typedef std::input_iterator_tag iterator_category;
typedef CSVRow value_type;
typedef std::size_t difference_type;
typedef CSVRow* pointer;
typedef CSVRow& reference;
CSVIterator(std::istream& str) :m_str(str.good()?&str:nullptr) { ++(*this); }
CSVIterator() :m_str(nullptr) {}
// Pre Increment
CSVIterator& operator++() {if (m_str) { if (!((*m_str) >> m_row)){m_str = nullptr;}}return *this;}
// Post increment
CSVIterator operator++(int) {CSVIterator tmp(*this);++(*this);return tmp;}
CSVRow const& operator*() const {return m_row;}
CSVRow const* operator->() const {return &m_row;}
bool operator==(CSVIterator const& rhs) {return ((this == &rhs) || ((this->m_str == nullptr) && (rhs.m_str == nullptr)));}
bool operator!=(CSVIterator const& rhs) {return !((*this) == rhs);}
private:
std::istream* m_str;
CSVRow m_row;
};
int main()
{
std::ifstream file("plop.csv");
for(CSVIterator loop(file); loop != CSVIterator(); ++loop)
{
std::cout << "4th Element(" << (*loop)[3] << ")\n";
}
}
现在我们已经到了2020年,让我们添加一个CSVRange对象:
class CSVRange
{
std::istream& stream;
public:
CSVRange(std::istream& str)
: stream(str)
{}
CSVIterator begin() const {return CSVIterator{stream};}
CSVIterator end() const {return CSVIterator{};}
};
int main()
{
std::ifstream file("plop.csv");
for(auto& row: CSVRange(file))
{
std::cout << "4th Element(" << row[3] << ")\n";
}
}
由于我现在不习惯boost,我将建议一个更简单的解决方案。假设您的.csv文件有100行,每行有10个数字,用“,”分隔。你可以用下面的代码以数组的形式加载这个数据:
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
using namespace std;
int main()
{
int A[100][10];
ifstream ifs;
ifs.open("name_of_file.csv");
string s1;
char c;
for(int k=0; k<100; k++)
{
getline(ifs,s1);
stringstream stream(s1);
int j=0;
while(1)
{
stream >>A[k][j];
stream >> c;
j++;
if(!stream) {break;}
}
}
}
CSV文件是由行组成的文本文件,每一行都由逗号分隔的令牌组成。虽然在解析时你应该知道一些事情:
(0)文件用“CP_ACP”编码页编码。您应该使用相同的编码页来解码文件内容。
(1) CSV丢失了“复合单元格”信息(比如rowspan > 1),所以当它被读回excel时,复合单元格信息丢失。
(2)单元格文本可以在头部和尾部用""" "进行引用,文字引用char将变成双引号。因此,结束匹配的引号字符必须是一个引号字符,而不是后面跟着另一个引号字符。例如,如果一个单元格有逗号,它必须在csv中被引用,因为逗号在csv中有意义。
(3)当单元格内容有多行时,它将在CSV中被引用,在这种情况下,解析器必须继续读取CSV文件中的下几行,直到获得与第一个引用字符匹配的结束引号字符,确保当前逻辑行读取完成后再解析该行的令牌。
例如:在CSV文件中,以下3个物理行是由3个令牌组成的逻辑行:
--+----------
1 |a,"b-first part
2 |b-second part
3 |b-third part",c
--+----------
如果您所需要的只是加载一个双精度数据文件(没有整数,没有文本),那么这里有一个随时可用的函数。
#include <sstream>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
/**
* Parse a CSV data file and fill the 2d STL vector "data".
* Limits: only "pure datas" of doubles, not encapsulated by " and without \n inside.
* Further no formatting in the data (e.g. scientific notation)
* It however handles both dots and commas as decimal separators and removes thousand separator.
*
* returnCodes[0]: file access 0-> ok 1-> not able to read; 2-> decimal separator equal to comma separator
* returnCodes[1]: number of records
* returnCodes[2]: number of fields. -1 If rows have different field size
*
*/
vector<int>
readCsvData (vector <vector <double>>& data, const string& filename, const string& delimiter, const string& decseparator){
int vv[3] = { 0,0,0 };
vector<int> returnCodes(&vv[0], &vv[0]+3);
string rowstring, stringtoken;
double doubletoken;
int rowcount=0;
int fieldcount=0;
data.clear();
ifstream iFile(filename, ios_base::in);
if (!iFile.is_open()){
returnCodes[0] = 1;
return returnCodes;
}
while (getline(iFile, rowstring)) {
if (rowstring=="") continue; // empty line
rowcount ++; //let's start with 1
if(delimiter == decseparator){
returnCodes[0] = 2;
return returnCodes;
}
if(decseparator != "."){
// remove dots (used as thousand separators)
string::iterator end_pos = remove(rowstring.begin(), rowstring.end(), '.');
rowstring.erase(end_pos, rowstring.end());
// replace decimal separator with dots.
replace(rowstring.begin(), rowstring.end(),decseparator.c_str()[0], '.');
} else {
// remove commas (used as thousand separators)
string::iterator end_pos = remove(rowstring.begin(), rowstring.end(), ',');
rowstring.erase(end_pos, rowstring.end());
}
// tokenize..
vector<double> tokens;
// Skip delimiters at beginning.
string::size_type lastPos = rowstring.find_first_not_of(delimiter, 0);
// Find first "non-delimiter".
string::size_type pos = rowstring.find_first_of(delimiter, lastPos);
while (string::npos != pos || string::npos != lastPos){
// Found a token, convert it to double add it to the vector.
stringtoken = rowstring.substr(lastPos, pos - lastPos);
if (stringtoken == "") {
tokens.push_back(0.0);
} else {
istringstream totalSString(stringtoken);
totalSString >> doubletoken;
tokens.push_back(doubletoken);
}
// Skip delimiters. Note the "not_of"
lastPos = rowstring.find_first_not_of(delimiter, pos);
// Find next "non-delimiter"
pos = rowstring.find_first_of(delimiter, lastPos);
}
if(rowcount == 1){
fieldcount = tokens.size();
returnCodes[2] = tokens.size();
} else {
if ( tokens.size() != fieldcount){
returnCodes[2] = -1;
}
}
data.push_back(tokens);
}
iFile.close();
returnCodes[1] = rowcount;
return returnCodes;
}