fix(csv-parse): rtrim encoding support (fix #349)

adaltas · Jun 29, 2022 · 8bf52f0 · 8bf52f0
1 parent 737ac66
commit 8bf52f0
Show file tree

Hide file tree

Showing 11 changed files with 414 additions and 153 deletions.
diff --git a/packages/csv-parse/dist/cjs/index.cjs b/packages/csv-parse/dist/cjs/index.cjs
@@ -115,6 +115,16 @@ class ResizeableBuffer{
   }
 }
 
+// white space characters
+// https://en.wikipedia.org/wiki/Whitespace_character
+// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
+// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
+const np = 12;
+const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
+const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
+const space = 32;
+const tab = 9;
+
 const init_state = function(options){
   return {
     bomSkipped: false,
@@ -148,7 +158,14 @@ const init_state = function(options){
     recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
     trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
     wasQuoting: false,
-    wasRowDelimiter: false
+    wasRowDelimiter: false,
+    timchars: [
+      Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
+    ]
   };
 };
 
@@ -571,15 +588,9 @@ const isRecordEmpty = function(record){
   return record.every((field) => field == null || field.toString && field.toString().trim() === '');
 };
 
-// white space characters
-// https://en.wikipedia.org/wiki/Whitespace_character
-// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
-// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
-const tab = 9;
-const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
-const np = 12;
-const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
-const space = 32;
+const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
+const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
+
 const boms = {
   // Note, the following are equals:
   // Buffer.from("\ufeff")
@@ -724,7 +735,7 @@ const transform = function(original_options = {}) {
           if(this.state.commenting === false && this.__isQuote(buf, pos)){
             if(this.state.quoting === true){
               const nextChr = buf[pos+quote.length];
-              const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
+              const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
               const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
               const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
               const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
@@ -834,30 +845,34 @@ const transform = function(original_options = {}) {
         }
         if(this.state.commenting === false){
           if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
-            const err = this.__error(
+            return this.__error(
               new CsvError('CSV_MAX_RECORD_SIZE', [
                 'Max Record Size:',
                 'record exceed the maximum number of tolerated bytes',
                 `of ${max_record_size}`,
                 `at line ${this.info.lines}`,
               ], this.options, this.__infoField())
             );
-            if(err !== undefined) return err;
           }
         }
-        const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
+        const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
         // rtrim in non quoting is handle in __onField
         const rappend = rtrim === false || this.state.wasQuoting === false;
         if(lappend === true && rappend === true){
           this.state.field.append(chr);
-        }else if(rtrim === true && !this.__isCharTrimable(chr)){
+        }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
           return this.__error(
             new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
               'Invalid Closing Quote:',
               'found non trimable byte after quote',
               `at line ${this.info.lines}`,
             ], this.options, this.__infoField())
           );
+        }else {
+          if(lappend === false){
+            pos += this.__isCharTrimable(buf, pos) - 1;
+          }
+          continue;
         }
       }
       if(end === true){
@@ -1114,8 +1129,19 @@ const transform = function(original_options = {}) {
       return [undefined, field];
     },
     // Helper to test if a character is a space or a line delimiter
-    __isCharTrimable: function(chr){
-      return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
+    __isCharTrimable: function(buf, pos){
+      const isTrim = (buf, pos) => {
+        const {timchars} = this.state;
+        loop1: for(let i = 0; i < timchars.length; i++){
+          const timchar = timchars[i];
+          for(let j = 0; j < timchar.length; j++){
+            if(timchar[j] !== buf[pos+j]) continue loop1;
+          }
+          return timchar.length;
+        }
+        return 0;
+      };
+      return isTrim(buf, pos);
     },
     // Keep it in case we implement the `cast_int` option
     // __isInt(value){

diff --git a/packages/csv-parse/dist/cjs/sync.cjs b/packages/csv-parse/dist/cjs/sync.cjs
@@ -113,6 +113,16 @@ class ResizeableBuffer{
   }
 }
 
+// white space characters
+// https://en.wikipedia.org/wiki/Whitespace_character
+// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
+// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
+const np = 12;
+const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
+const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
+const space = 32;
+const tab = 9;
+
 const init_state = function(options){
   return {
     bomSkipped: false,
@@ -146,7 +156,14 @@ const init_state = function(options){
     recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
     trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
     wasQuoting: false,
-    wasRowDelimiter: false
+    wasRowDelimiter: false,
+    timchars: [
+      Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
+    ]
   };
 };
 
@@ -569,15 +586,9 @@ const isRecordEmpty = function(record){
   return record.every((field) => field == null || field.toString && field.toString().trim() === '');
 };
 
-// white space characters
-// https://en.wikipedia.org/wiki/Whitespace_character
-// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
-// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
-const tab = 9;
-const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
-const np = 12;
-const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
-const space = 32;
+const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
+const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
+
 const boms = {
   // Note, the following are equals:
   // Buffer.from("\ufeff")
@@ -722,7 +733,7 @@ const transform = function(original_options = {}) {
           if(this.state.commenting === false && this.__isQuote(buf, pos)){
             if(this.state.quoting === true){
               const nextChr = buf[pos+quote.length];
-              const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
+              const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
               const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
               const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
               const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
@@ -832,30 +843,34 @@ const transform = function(original_options = {}) {
         }
         if(this.state.commenting === false){
           if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
-            const err = this.__error(
+            return this.__error(
               new CsvError('CSV_MAX_RECORD_SIZE', [
                 'Max Record Size:',
                 'record exceed the maximum number of tolerated bytes',
                 `of ${max_record_size}`,
                 `at line ${this.info.lines}`,
               ], this.options, this.__infoField())
             );
-            if(err !== undefined) return err;
           }
         }
-        const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
+        const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
         // rtrim in non quoting is handle in __onField
         const rappend = rtrim === false || this.state.wasQuoting === false;
         if(lappend === true && rappend === true){
           this.state.field.append(chr);
-        }else if(rtrim === true && !this.__isCharTrimable(chr)){
+        }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
           return this.__error(
             new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
               'Invalid Closing Quote:',
               'found non trimable byte after quote',
               `at line ${this.info.lines}`,
             ], this.options, this.__infoField())
           );
+        }else {
+          if(lappend === false){
+            pos += this.__isCharTrimable(buf, pos) - 1;
+          }
+          continue;
         }
       }
       if(end === true){
@@ -1112,8 +1127,19 @@ const transform = function(original_options = {}) {
       return [undefined, field];
     },
     // Helper to test if a character is a space or a line delimiter
-    __isCharTrimable: function(chr){
-      return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
+    __isCharTrimable: function(buf, pos){
+      const isTrim = (buf, pos) => {
+        const {timchars} = this.state;
+        loop1: for(let i = 0; i < timchars.length; i++){
+          const timchar = timchars[i];
+          for(let j = 0; j < timchar.length; j++){
+            if(timchar[j] !== buf[pos+j]) continue loop1;
+          }
+          return timchar.length;
+        }
+        return 0;
+      };
+      return isTrim(buf, pos);
     },
     // Keep it in case we implement the `cast_int` option
     // __isInt(value){

diff --git a/packages/csv-parse/dist/esm/index.js b/packages/csv-parse/dist/esm/index.js
@@ -5171,6 +5171,16 @@ class ResizeableBuffer{
   }
 }
 
+// white space characters
+// https://en.wikipedia.org/wiki/Whitespace_character
+// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
+// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
+const np = 12;
+const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
+const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
+const space = 32;
+const tab = 9;
+
 const init_state = function(options){
   return {
     bomSkipped: false,
@@ -5204,7 +5214,14 @@ const init_state = function(options){
     recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
     trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
     wasQuoting: false,
-    wasRowDelimiter: false
+    wasRowDelimiter: false,
+    timchars: [
+      Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
+      Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
+    ]
   };
 };
 
@@ -5627,15 +5644,9 @@ const isRecordEmpty = function(record){
   return record.every((field) => field == null || field.toString && field.toString().trim() === '');
 };
 
-// white space characters
-// https://en.wikipedia.org/wiki/Whitespace_character
-// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
-// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
-const tab = 9;
-const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
-const np = 12;
-const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
-const space = 32;
+const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
+const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
+
 const boms = {
   // Note, the following are equals:
   // Buffer.from("\ufeff")
@@ -5780,7 +5791,7 @@ const transform = function(original_options = {}) {
           if(this.state.commenting === false && this.__isQuote(buf, pos)){
             if(this.state.quoting === true){
               const nextChr = buf[pos+quote.length];
-              const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
+              const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
               const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
               const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
               const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
@@ -5890,30 +5901,34 @@ const transform = function(original_options = {}) {
         }
         if(this.state.commenting === false){
           if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
-            const err = this.__error(
+            return this.__error(
               new CsvError('CSV_MAX_RECORD_SIZE', [
                 'Max Record Size:',
                 'record exceed the maximum number of tolerated bytes',
                 `of ${max_record_size}`,
                 `at line ${this.info.lines}`,
               ], this.options, this.__infoField())
             );
-            if(err !== undefined) return err;
           }
         }
-        const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
+        const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
         // rtrim in non quoting is handle in __onField
         const rappend = rtrim === false || this.state.wasQuoting === false;
         if(lappend === true && rappend === true){
           this.state.field.append(chr);
-        }else if(rtrim === true && !this.__isCharTrimable(chr)){
+        }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
           return this.__error(
             new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
               'Invalid Closing Quote:',
               'found non trimable byte after quote',
               `at line ${this.info.lines}`,
             ], this.options, this.__infoField())
           );
+        }else {
+          if(lappend === false){
+            pos += this.__isCharTrimable(buf, pos) - 1;
+          }
+          continue;
         }
       }
       if(end === true){
@@ -6170,8 +6185,19 @@ const transform = function(original_options = {}) {
       return [undefined, field];
     },
     // Helper to test if a character is a space or a line delimiter
-    __isCharTrimable: function(chr){
-      return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
+    __isCharTrimable: function(buf, pos){
+      const isTrim = (buf, pos) => {
+        const {timchars} = this.state;
+        loop1: for(let i = 0; i < timchars.length; i++){
+          const timchar = timchars[i];
+          for(let j = 0; j < timchar.length; j++){
+            if(timchar[j] !== buf[pos+j]) continue loop1;
+          }
+          return timchar.length;
+        }
+        return 0;
+      };
+      return isTrim(buf, pos);
     },
     // Keep it in case we implement the `cast_int` option
     // __isInt(value){