-
Notifications
You must be signed in to change notification settings - Fork 437
allow '\0' '\n' in quoted columns #91
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,11 +49,34 @@ | |
#include <cerrno> | ||
#include <istream> | ||
|
||
|
||
|
||
namespace io{ | ||
//////////////////////////////////////////////////////////////////////////// | ||
// LineReader // | ||
//////////////////////////////////////////////////////////////////////////// | ||
|
||
template<char sep, bool null_terminated = true> | ||
struct no_quote_escape { | ||
static bool hasQuote() { | ||
return false; | ||
} | ||
static char getQuoteChar() { | ||
return '\0'; | ||
} | ||
static const char*find_next_column_end(const char*col_begin) { | ||
while (*col_begin != sep && (*col_begin != '\0')) | ||
++col_begin; | ||
return col_begin; | ||
} | ||
|
||
static void unescape(char*&, char*&) { | ||
|
||
} | ||
|
||
}; | ||
|
||
|
||
namespace error{ | ||
struct base : std::exception{ | ||
virtual void format_error_message()const = 0; | ||
|
@@ -153,7 +176,7 @@ namespace io{ | |
} | ||
|
||
int read(char*buffer, int size){ | ||
return std::fread(buffer, 1, size, file); | ||
return (int)std::fread(buffer, 1, size, file); | ||
} | ||
|
||
~OwningStdIOByteSourceBase(){ | ||
|
@@ -170,7 +193,7 @@ namespace io{ | |
|
||
int read(char*buffer, int size){ | ||
in.read(buffer, size); | ||
return in.gcount(); | ||
return (int)in.gcount(); | ||
} | ||
|
||
~NonOwningIStreamByteSource(){} | ||
|
@@ -186,7 +209,7 @@ namespace io{ | |
int read(char*buffer, int desired_byte_count){ | ||
int to_copy_byte_count = desired_byte_count; | ||
if(remaining_byte_count < to_copy_byte_count) | ||
to_copy_byte_count = remaining_byte_count; | ||
to_copy_byte_count = (int)remaining_byte_count; | ||
std::memcpy(buffer, str, to_copy_byte_count); | ||
remaining_byte_count -= to_copy_byte_count; | ||
str += to_copy_byte_count; | ||
|
@@ -314,7 +337,7 @@ namespace io{ | |
int desired_byte_count; | ||
}; | ||
} | ||
|
||
template<class quote_policy = no_quote_escape<','>> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. made LineReader templated so it could call the quote policy functions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This breaks code using LineReader directly. Previously, people could write io::LineReader my_reader. With this change, they have to write io::LineReader<> my_reader. This is a breaking change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about the following then: and add This way the change will not break anything and allow LineReader to find the next line with regards to quoting policy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is the only option, if LineReader must be changed. A downside of this is that error message get more complex as users are now confronted with a type name they did not use themselves. This is the same situation as std::string vs std::basic_string<char, ...>. A different option would be to have two LineReaders and make them an argument to CSVReader |
||
class LineReader{ | ||
private: | ||
static const int block_len = 1<<20; | ||
|
@@ -327,6 +350,9 @@ namespace io{ | |
int data_begin; | ||
int data_end; | ||
|
||
bool hasQuote; | ||
char quoteChar; | ||
|
||
char file_name[error::max_file_name_length+1]; | ||
unsigned file_line; | ||
|
||
|
@@ -347,6 +373,9 @@ namespace io{ | |
void init(std::unique_ptr<ByteSourceBase>byte_source){ | ||
file_line = 0; | ||
|
||
hasQuote = quote_policy::hasQuote(); | ||
quoteChar = quote_policy::getQuoteChar(); | ||
|
||
buffer = std::unique_ptr<char[]>(new char[3*block_len]); | ||
data_begin = 0; | ||
data_end = byte_source->read(buffer.get(), 2*block_len); | ||
|
@@ -462,9 +491,14 @@ namespace io{ | |
} | ||
} | ||
|
||
bool quoted_data = false; | ||
int line_end = data_begin; | ||
while(buffer[line_end] != '\n' && line_end != data_end){ | ||
while( (quoted_data || buffer[line_end] != '\n') && line_end != data_end){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when using double quote policy, new_line() should not stop prematurely on the first \n it sees but rather on the first un-quoted \n. |
||
if (hasQuote && buffer[line_end] == quoteChar) { | ||
quoted_data = !quoted_data; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How does this work, if somebody has a backslash-quote-policy? As far as I see, you are counting the number of quotes. This happens to work for double quote escape but fails for backslash quote escape. |
||
} | ||
++line_end; | ||
|
||
} | ||
|
||
if(line_end - data_begin + 1 > block_len){ | ||
|
@@ -753,30 +787,26 @@ namespace io{ | |
} | ||
}; | ||
|
||
template<char sep> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where did no_quote_escape go? Removing this is a no-go. 90% of CSV files are just numbers without quotes. Parsing these must be fast and therefore the default policy setting should not involve any quote handling. Without this policy, this is no longer possible. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It was moved up since some compilers are giving error for the declaration location. |
||
struct no_quote_escape{ | ||
static const char*find_next_column_end(const char*col_begin){ | ||
while(*col_begin != sep && *col_begin != '\0') | ||
++col_begin; | ||
return col_begin; | ||
} | ||
|
||
static void unescape(char*&, char*&){ | ||
|
||
|
||
} | ||
}; | ||
|
||
template<char sep, char quote> | ||
template<char sep, char quote, bool null_terminated = true> | ||
struct double_quote_escape{ | ||
static bool hasQuote() { | ||
return true; | ||
} | ||
static char getQuoteChar() { | ||
return quote; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This reduces the flexibility given by the policy system. For example, before it was possible to write a policy that parses a,b,c,d as three columns with values "a", "b,c" and "d". I doubt anybody used this freedom, but it is an introduced restriction. |
||
} | ||
|
||
static const char*find_next_column_end(const char*col_begin){ | ||
while(*col_begin != sep && *col_begin != '\0') | ||
while(*col_begin != sep && ( *col_begin != '\0')) | ||
if(*col_begin != quote) | ||
++col_begin; | ||
else{ | ||
do{ | ||
++col_begin; | ||
while(*col_begin != quote){ | ||
if(*col_begin == '\0') | ||
if(( null_terminated && *col_begin == '\0')) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if null_terminated template flag is false, encountering a '\0' inside a quoted column is not considered an escaped_string_not_closed error. |
||
throw error::escaped_string_not_closed(); | ||
++col_begin; | ||
} | ||
|
@@ -1110,7 +1140,7 @@ namespace io{ | |
> | ||
class CSVReader{ | ||
private: | ||
LineReader in; | ||
LineReader<quote_policy> in; | ||
|
||
char*row[column_count]; | ||
std::string column_names[column_count]; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
null_terminated by default is true which makes no difference for existing user code.