Skip to content

Commit

Permalink
fix(csv-parse): rtrim encoding support (fix #349)
Browse files Browse the repository at this point in the history
  • Loading branch information
wdavidw committed Jun 29, 2022
1 parent 737ac66 commit 8bf52f0
Show file tree
Hide file tree
Showing 11 changed files with 414 additions and 153 deletions.
60 changes: 43 additions & 17 deletions packages/csv-parse/dist/cjs/index.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,16 @@ class ResizeableBuffer{
}
}

// white space characters
// https://en.wikipedia.org/wiki/Whitespace_character
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
// Each constant below holds the numeric code of one of these characters.
const np = 12; // `\f`, form feed (new page), 0x0C in hexadecimal, 12 in decimal
const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
const space = 32; // ` `, space, 0x20 in hexadecimal, 32 in decimal
const tab = 9; // `\t`, horizontal tab, 0x09 in hexadecimal, 9 in decimal

const init_state = function(options){
return {
bomSkipped: false,
Expand Down Expand Up @@ -148,7 +158,14 @@ const init_state = function(options){
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
wasQuoting: false,
wasRowDelimiter: false
wasRowDelimiter: false,
timchars: [
Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
]
};
};

Expand Down Expand Up @@ -571,15 +588,9 @@ const isRecordEmpty = function(record){
return record.every((field) => field == null || field.toString && field.toString().trim() === '');
};

// white space characters
// https://en.wikipedia.org/wiki/Whitespace_character
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
const tab = 9;
const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
const np = 12;
const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
const space = 32;
const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal

const boms = {
// Note, the following are equals:
// Buffer.from("\ufeff")
Expand Down Expand Up @@ -724,7 +735,7 @@ const transform = function(original_options = {}) {
if(this.state.commenting === false && this.__isQuote(buf, pos)){
if(this.state.quoting === true){
const nextChr = buf[pos+quote.length];
const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
Expand Down Expand Up @@ -834,30 +845,34 @@ const transform = function(original_options = {}) {
}
if(this.state.commenting === false){
if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
const err = this.__error(
return this.__error(
new CsvError('CSV_MAX_RECORD_SIZE', [
'Max Record Size:',
'record exceed the maximum number of tolerated bytes',
`of ${max_record_size}`,
`at line ${this.info.lines}`,
], this.options, this.__infoField())
);
if(err !== undefined) return err;
}
}
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
// rtrim for non-quoted fields is handled in __onField
const rappend = rtrim === false || this.state.wasQuoting === false;
if(lappend === true && rappend === true){
this.state.field.append(chr);
}else if(rtrim === true && !this.__isCharTrimable(chr)){
}else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
return this.__error(
new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
'Invalid Closing Quote:',
'found non trimable byte after quote',
`at line ${this.info.lines}`,
], this.options, this.__infoField())
);
}else {
if(lappend === false){
pos += this.__isCharTrimable(buf, pos) - 1;
}
continue;
}
}
if(end === true){
Expand Down Expand Up @@ -1114,8 +1129,19 @@ const transform = function(original_options = {}) {
return [undefined, field];
},
// Helper to test if a character is a space or a line delimiter
__isCharTrimable: function(chr){
return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
__isCharTrimable: function(buf, pos){
const isTrim = (buf, pos) => {
const {timchars} = this.state;
loop1: for(let i = 0; i < timchars.length; i++){
const timchar = timchars[i];
for(let j = 0; j < timchar.length; j++){
if(timchar[j] !== buf[pos+j]) continue loop1;
}
return timchar.length;
}
return 0;
};
return isTrim(buf, pos);
},
// Keep it in case we implement the `cast_int` option
// __isInt(value){
Expand Down
60 changes: 43 additions & 17 deletions packages/csv-parse/dist/cjs/sync.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,16 @@ class ResizeableBuffer{
}
}

// white space characters
// https://en.wikipedia.org/wiki/Whitespace_character
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
// Each constant below holds the numeric code of one of these characters.
const np = 12; // `\f`, form feed (new page), 0x0C in hexadecimal, 12 in decimal
const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
const space = 32; // ` `, space, 0x20 in hexadecimal, 32 in decimal
const tab = 9; // `\t`, horizontal tab, 0x09 in hexadecimal, 9 in decimal

const init_state = function(options){
return {
bomSkipped: false,
Expand Down Expand Up @@ -146,7 +156,14 @@ const init_state = function(options){
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
wasQuoting: false,
wasRowDelimiter: false
wasRowDelimiter: false,
timchars: [
Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
]
};
};

Expand Down Expand Up @@ -569,15 +586,9 @@ const isRecordEmpty = function(record){
return record.every((field) => field == null || field.toString && field.toString().trim() === '');
};

// white space characters
// https://en.wikipedia.org/wiki/Whitespace_character
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
const tab = 9;
const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
const np = 12;
const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
const space = 32;
const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal

const boms = {
// Note, the following are equals:
// Buffer.from("\ufeff")
Expand Down Expand Up @@ -722,7 +733,7 @@ const transform = function(original_options = {}) {
if(this.state.commenting === false && this.__isQuote(buf, pos)){
if(this.state.quoting === true){
const nextChr = buf[pos+quote.length];
const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
Expand Down Expand Up @@ -832,30 +843,34 @@ const transform = function(original_options = {}) {
}
if(this.state.commenting === false){
if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
const err = this.__error(
return this.__error(
new CsvError('CSV_MAX_RECORD_SIZE', [
'Max Record Size:',
'record exceed the maximum number of tolerated bytes',
`of ${max_record_size}`,
`at line ${this.info.lines}`,
], this.options, this.__infoField())
);
if(err !== undefined) return err;
}
}
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
// rtrim for non-quoted fields is handled in __onField
const rappend = rtrim === false || this.state.wasQuoting === false;
if(lappend === true && rappend === true){
this.state.field.append(chr);
}else if(rtrim === true && !this.__isCharTrimable(chr)){
}else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
return this.__error(
new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
'Invalid Closing Quote:',
'found non trimable byte after quote',
`at line ${this.info.lines}`,
], this.options, this.__infoField())
);
}else {
if(lappend === false){
pos += this.__isCharTrimable(buf, pos) - 1;
}
continue;
}
}
if(end === true){
Expand Down Expand Up @@ -1112,8 +1127,19 @@ const transform = function(original_options = {}) {
return [undefined, field];
},
// Helper to test if a character is a space or a line delimiter
__isCharTrimable: function(chr){
return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
__isCharTrimable: function(buf, pos){
const isTrim = (buf, pos) => {
const {timchars} = this.state;
loop1: for(let i = 0; i < timchars.length; i++){
const timchar = timchars[i];
for(let j = 0; j < timchar.length; j++){
if(timchar[j] !== buf[pos+j]) continue loop1;
}
return timchar.length;
}
return 0;
};
return isTrim(buf, pos);
},
// Keep it in case we implement the `cast_int` option
// __isInt(value){
Expand Down
60 changes: 43 additions & 17 deletions packages/csv-parse/dist/esm/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5171,6 +5171,16 @@ class ResizeableBuffer{
}
}

// white space characters
// https://en.wikipedia.org/wiki/Whitespace_character
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
// Each constant below holds the numeric code of one of these characters.
const np = 12; // `\f`, form feed (new page), 0x0C in hexadecimal, 12 in decimal
const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
const space = 32; // ` `, space, 0x20 in hexadecimal, 32 in decimal
const tab = 9; // `\t`, horizontal tab, 0x09 in hexadecimal, 9 in decimal

const init_state = function(options){
return {
bomSkipped: false,
Expand Down Expand Up @@ -5204,7 +5214,14 @@ const init_state = function(options){
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
wasQuoting: false,
wasRowDelimiter: false
wasRowDelimiter: false,
timchars: [
Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
]
};
};

Expand Down Expand Up @@ -5627,15 +5644,9 @@ const isRecordEmpty = function(record){
return record.every((field) => field == null || field.toString && field.toString().trim() === '');
};

// white space characters
// https://en.wikipedia.org/wiki/Whitespace_character
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
const tab = 9;
const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
const np = 12;
const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
const space = 32;
const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal

const boms = {
// Note, the following are equals:
// Buffer.from("\ufeff")
Expand Down Expand Up @@ -5780,7 +5791,7 @@ const transform = function(original_options = {}) {
if(this.state.commenting === false && this.__isQuote(buf, pos)){
if(this.state.quoting === true){
const nextChr = buf[pos+quote.length];
const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
Expand Down Expand Up @@ -5890,30 +5901,34 @@ const transform = function(original_options = {}) {
}
if(this.state.commenting === false){
if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
const err = this.__error(
return this.__error(
new CsvError('CSV_MAX_RECORD_SIZE', [
'Max Record Size:',
'record exceed the maximum number of tolerated bytes',
`of ${max_record_size}`,
`at line ${this.info.lines}`,
], this.options, this.__infoField())
);
if(err !== undefined) return err;
}
}
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
// rtrim for non-quoted fields is handled in __onField
const rappend = rtrim === false || this.state.wasQuoting === false;
if(lappend === true && rappend === true){
this.state.field.append(chr);
}else if(rtrim === true && !this.__isCharTrimable(chr)){
}else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
return this.__error(
new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
'Invalid Closing Quote:',
'found non trimable byte after quote',
`at line ${this.info.lines}`,
], this.options, this.__infoField())
);
}else {
if(lappend === false){
pos += this.__isCharTrimable(buf, pos) - 1;
}
continue;
}
}
if(end === true){
Expand Down Expand Up @@ -6170,8 +6185,19 @@ const transform = function(original_options = {}) {
return [undefined, field];
},
// Helper to test if a character is a space or a line delimiter
__isCharTrimable: function(chr){
return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
__isCharTrimable: function(buf, pos){
const isTrim = (buf, pos) => {
const {timchars} = this.state;
loop1: for(let i = 0; i < timchars.length; i++){
const timchar = timchars[i];
for(let j = 0; j < timchar.length; j++){
if(timchar[j] !== buf[pos+j]) continue loop1;
}
return timchar.length;
}
return 0;
};
return isTrim(buf, pos);
},
// Keep it in case we implement the `cast_int` option
// __isInt(value){
Expand Down
Loading

0 comments on commit 8bf52f0

Please sign in to comment.