From e67bc34326e1594aecdfaefb966be54e2d1fd2ad Mon Sep 17 00:00:00 2001
From: Yagiz Nizipli <yagiz@nizipli.com>
Date: Thu, 4 Apr 2024 18:03:17 -0400
Subject: [PATCH] buffer: use simdutf for `atob` implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Daniel Lemire <daniel@lemire.me>
PR-URL: https://github.com/nodejs/node/pull/52381
Refs: https://github.com/nodejs/node/pull/51670
Reviewed-By: Daniel Lemire <daniel@lemire.me>
Reviewed-By: Vinícius Lourenço Claro Cardoso <contact@viniciusl.com.br>
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: Robert Nagy <ronagy@icloud.com>
Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com>
Reviewed-By: Filip Skokan <panva.ip@gmail.com>
---
 benchmark/buffers/buffer-atob.js | 20 ++++++++
 lib/buffer.js                    | 86 +++++---------------------------
 src/node_buffer.cc               | 60 ++++++++++++++++++++++
 3 files changed, 93 insertions(+), 73 deletions(-)
 create mode 100644 benchmark/buffers/buffer-atob.js

diff --git a/benchmark/buffers/buffer-atob.js b/benchmark/buffers/buffer-atob.js
new file mode 100644
index 00000000000000..2cc20759e3f0f6
--- /dev/null
+++ b/benchmark/buffers/buffer-atob.js
@@ -0,0 +1,20 @@
+'use strict';
+const common = require('../common.js');
+const assert = require('node:assert');
+
+// Benchmarks atob() on base64 input produced from `size` 'A' characters.
+const bench = common.createBenchmark(main, {
+  size: [16, 32, 64, 128],
+  n: [1e6],
+});
+
+function main({ n, size }) {
+  const encoded = btoa('A'.repeat(size));
+  let total = 0;
+  bench.start();
+  for (let iter = 0; iter < n; iter++) {
+    total += atob(encoded).length;
+  }
+  bench.end(n);
+  assert(total > 0);
+}
diff --git a/lib/buffer.js b/lib/buffer.js
index a8d07342e15eaa..ea94ebf24192f9 100644
--- a/lib/buffer.js
+++ b/lib/buffer.js
@@ -23,10 +23,8 @@
 
 const {
   Array,
-  ArrayFrom,
   ArrayIsArray,
   ArrayPrototypeForEach,
-  ArrayPrototypeIndexOf,
   MathFloor,
   MathMin,
   MathTrunc,
@@ -70,6 +68,7 @@ const {
   swap64: _swap64,
   kMaxLength,
   kStringMaxLength,
+  atob: _atob,
 } = internalBinding('buffer');
 const {
   constants: {
@@ -1259,85 +1258,26 @@ function btoa(input) {
   return buf.toString('base64');
 }
 
-// Refs: https://infra.spec.whatwg.org/#forgiving-base64-decode
-const kForgivingBase64AllowedChars = [
-  // ASCII whitespace
-  // Refs: https://infra.spec.whatwg.org/#ascii-whitespace
-  0x09, 0x0A, 0x0C, 0x0D, 0x20,
-
-  // Uppercase letters
-  ...ArrayFrom({ length: 26 }, (_, i) => StringPrototypeCharCodeAt('A') + i),
-
-  // Lowercase letters
-  ...ArrayFrom({ length: 26 }, (_, i) => StringPrototypeCharCodeAt('a') + i),
-
-  // Decimal digits
-  ...ArrayFrom({ length: 10 }, (_, i) => StringPrototypeCharCodeAt('0') + i),
-
-  0x2B, // +
-  0x2F, // /
-  0x3D, // =
-];
-const kEqualSignIndex = ArrayPrototypeIndexOf(kForgivingBase64AllowedChars,
-                                              0x3D);
-
 function atob(input) {
-  // The implementation here has not been performance optimized in any way and
-  // should not be.
-  // Refs: https://github.com/nodejs/node/pull/38433#issuecomment-828426932
   if (arguments.length === 0) {
     throw new ERR_MISSING_ARGS('input');
   }
 
-  input = `${input}`;
-  let nonAsciiWhitespaceCharCount = 0;
-  let equalCharCount = 0;
+  const result = _atob(`${input}`);
 
-  for (let n = 0; n < input.length; n++) {
-    const index = ArrayPrototypeIndexOf(
-      kForgivingBase64AllowedChars,
-      StringPrototypeCharCodeAt(input, n));
-
-    if (index > 4) {
-      // The first 5 elements of `kForgivingBase64AllowedChars` are
-      // ASCII whitespace char codes.
-      nonAsciiWhitespaceCharCount++;
-
-      if (index === kEqualSignIndex) {
-        equalCharCount++;
-      } else if (equalCharCount) {
-        // The `=` char is only allowed at the end.
-        throw lazyDOMException('Invalid character', 'InvalidCharacterError');
-      }
-
-      if (equalCharCount > 2) {
-        // Only one more `=` is permitted after the first equal sign.
-        throw lazyDOMException('Invalid character', 'InvalidCharacterError');
-      }
-    } else if (index === -1) {
+  switch (result) {
+    case -2: // The native decoder found an invalid character in the input
       throw lazyDOMException('Invalid character', 'InvalidCharacterError');
-    }
-  }
-
-  let reminder = nonAsciiWhitespaceCharCount % 4;
-
-  // See #2, #3, #4 - https://infra.spec.whatwg.org/#forgiving-base64
-  if (!reminder) {
-    // Remove all trailing `=` characters and get the new reminder.
-    reminder = (nonAsciiWhitespaceCharCount - equalCharCount) % 4;
-  } else if (equalCharCount) {
-    // `=` should not in the input if there's a reminder.
-    throw lazyDOMException('Invalid character', 'InvalidCharacterError');
-  }
-
-  // See #3 - https://infra.spec.whatwg.org/#forgiving-base64
-  if (reminder === 1) {
-    throw lazyDOMException(
-      'The string to be decoded is not correctly encoded.',
-      'InvalidCharacterError');
+    case -1: // A single character remained after decoding
+      throw lazyDOMException(
+        'The string to be decoded is not correctly encoded.',
+        'InvalidCharacterError');
+    case -3: // Any other native failure, reported as a possible overflow
+      // TODO(@anonrig): Throw the correct error here.
+      throw lazyDOMException('The input causes overflow.', 'InvalidCharacterError');
+    default:
+      return result;
   }
-
-  return Buffer.from(input, 'base64').toString('latin1');
 }
 
 function isUtf8(input) {
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index 82e98193ba0fdf..b31beada451bc8 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -67,6 +67,7 @@ using v8::Just;
 using v8::Local;
 using v8::Maybe;
 using v8::MaybeLocal;
+using v8::NewStringType;
 using v8::Nothing;
 using v8::Number;
 using v8::Object;
@@ -1210,6 +1211,61 @@ void DetachArrayBuffer(const FunctionCallbackInfo<Value>& args) {
   }
 }
 
+// Decodes the base64 string in args[0] using simdutf; bound to JS as `atob`.
+// Returns the decoded data as a one-byte string, or a negative error code:
+// * -1: a single character remained after decoding,
+// * -2: the input contained an invalid character,
+// * -3: any other failure, e.g. possible overflow (more than 2 GB output).
+static void Atob(const FunctionCallbackInfo<Value>& args) {
+  CHECK_EQ(args.Length(), 1);
+  Environment* env = Environment::GetCurrent(args);
+  THROW_AND_RETURN_IF_NOT_STRING(env, args[0], "argument");
+
+  Local<String> input = args[0].As<String>();
+  MaybeStackBuffer<char> decoded;
+  simdutf::result outcome;
+
+  if (!input->IsExternalOneByte()) {  // 16-bit case
+    String::Value utf16(env->isolate(), input);
+    auto chars = reinterpret_cast<const char16_t*>(*utf16);
+    size_t capacity =
+        simdutf::maximal_binary_length_from_base64(chars, utf16.length());
+    decoded.AllocateSufficientStorage(capacity + 1);
+    decoded.SetLengthAndZeroTerminate(capacity);
+    outcome = simdutf::base64_to_binary(
+        chars, utf16.length(), decoded.out(), simdutf::base64_default);
+  } else {  // 8-bit case
+    auto resource = input->GetExternalOneByteStringResource();
+    size_t capacity = simdutf::maximal_binary_length_from_base64(
+        resource->data(), resource->length());
+    decoded.AllocateSufficientStorage(capacity + 1);
+    decoded.SetLengthAndZeroTerminate(capacity);
+    outcome = simdutf::base64_to_binary(resource->data(),
+                                        resource->length(),
+                                        decoded.out(),
+                                        simdutf::base64_default);
+  }
+
+  switch (outcome.error) {
+    case simdutf::error_code::SUCCESS: {
+      // Build the result from the first `outcome.count` decoded bytes.
+      auto bytes = reinterpret_cast<const uint8_t*>(decoded.out());
+      Local<String> text =
+          String::NewFromOneByte(
+              env->isolate(), bytes, NewStringType::kNormal, outcome.count)
+              .ToLocalChecked();
+      return args.GetReturnValue().Set(text);
+    }
+    case simdutf::error_code::BASE64_INPUT_REMAINDER:
+      return args.GetReturnValue().Set(static_cast<int32_t>(-1));
+    case simdutf::error_code::INVALID_BASE64_CHARACTER:
+      return args.GetReturnValue().Set(static_cast<int32_t>(-2));
+    default:
+      // Every other error is reported as a "possible overflow".
+      return args.GetReturnValue().Set(static_cast<int32_t>(-3));
+  }
+}
+
 namespace {
 
 std::pair<void*, size_t> DecomposeBufferToParts(Local<Value> buffer) {
@@ -1272,6 +1328,8 @@ void Initialize(Local<Object> target,
   Environment* env = Environment::GetCurrent(context);
   Isolate* isolate = env->isolate();
 
+  SetMethodNoSideEffect(context, target, "atob", Atob);
+
   SetMethod(context, target, "setBufferPrototype", SetBufferPrototype);
   SetMethodNoSideEffect(context, target, "createFromString", CreateFromString);
 
@@ -1373,6 +1431,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
 
   registry->Register(DetachArrayBuffer);
   registry->Register(CopyArrayBuffer);
+
+  registry->Register(Atob);
 }
 
 }  // namespace Buffer