From 190596c1890ee78a9161b749122201647dce9efd Mon Sep 17 00:00:00 2001 From: Keyhan Vakil Date: Wed, 10 May 2023 04:17:59 -0700 Subject: [PATCH] src: register external references for source code Currently we use external strings for internalized builtin source code. However when a snapshot is taken, any external string whose resource is not registered is flattened into a SeqString (see ref). The result is that module source code stored in the snapshot does not use external strings after deserialization. This patch registers an external string resource for each internalized builtin's source. The savings are substantial: ~1.9 MB of heap memory per isolate, or ~44% of an otherwise empty isolate's heap usage: ```console $ node --expose-gc -p 'gc(),process.memoryUsage().heapUsed' 4190968 $ ./node --expose-gc -p 'gc(),process.memoryUsage().heapUsed' 2327536 ``` The savings can be even higher for user snapshots which may include more internal modules. The existing UnionBytes implementation was ill-suited, because it only created an external string resource when ToStringChecked was called, but we need to register the external string resources before the isolate even exists. We change UnionBytes to no longer own the data, and shift ownership of the data to a new external resource class called StaticExternalByteResource. StaticExternalByteResource are either statically allocated (for internalized builtin code) or owned by the static `externalized_builtin_sources` map, so they will only be destructed when static resources are destructed. We change JS2C to emit statements to register a string resource for each internalized builtin. Refs: https://github.com/v8/v8/blob/d2c8fbe9ccd1a6ce5591bb7dd319c3c00d6bf489/src/snapshot/serializer.cc#L633 PR-URL: https://github.com/nodejs/node/pull/47055 Reviewed-By: Joyee Cheung Reviewed-By: Chengzhong Wu --- src/node_builtins.cc | 46 ++++++++++++--------- src/node_builtins.h | 6 ++- src/node_external_reference.h | 3 +- src/node_union_bytes.h | 76 +++++++++++++++++++++++------------ src/util.cc | 60 ++------------------------- tools/js2c.py | 49 +++++++++++++++------- 6 files changed, 123 insertions(+), 117 deletions(-) diff --git a/src/node_builtins.cc b/src/node_builtins.cc index 1e088ffa34ab6a..9e9cf6948b2c17 100644 --- a/src/node_builtins.cc +++ b/src/node_builtins.cc @@ -213,18 +213,18 @@ MaybeLocal BuiltinLoader::LoadBuiltinSource(Isolate* isolate, namespace { static Mutex externalized_builtins_mutex; -std::unordered_map externalized_builtin_sources; +std::unordered_map> + externalized_builtin_sources; } // namespace void BuiltinLoader::AddExternalizedBuiltin(const char* id, const char* filename) { - std::string source; + StaticExternalTwoByteResource* resource; { Mutex::ScopedLock lock(externalized_builtins_mutex); auto it = externalized_builtin_sources.find(id); - if (it != externalized_builtin_sources.end()) { - source = it->second; - } else { + if (it == externalized_builtin_sources.end()) { + std::string source; int r = ReadFileSync(&source, filename); if (r != 0) { fprintf(stderr, @@ -233,23 +233,29 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id, filename); ABORT(); } - externalized_builtin_sources[id] = source; + size_t expected_u16_length = + simdutf::utf16_length_from_utf8(source.data(), source.length()); + auto out = std::make_shared>(expected_u16_length); + size_t u16_length = simdutf::convert_utf8_to_utf16( + source.data(), + source.length(), + reinterpret_cast(out->data())); + out->resize(u16_length); + + auto result = externalized_builtin_sources.emplace( + id, + std::make_unique( + out->data(), out->size(), out)); + CHECK(result.second); + it = result.first; } + // OK to get the raw pointer, since externalized_builtin_sources owns + // the resource, resources are never removed from the map, and + // externalized_builtin_sources has static lifetime. + resource = it->second.get(); } - Add(id, source); -} - -bool BuiltinLoader::Add(const char* id, std::string_view utf8source) { - size_t expected_u16_length = - simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length()); - auto out = std::make_shared>(expected_u16_length); - size_t u16_length = - simdutf::convert_utf8_to_utf16(utf8source.data(), - utf8source.length(), - reinterpret_cast(out->data())); - out->resize(u16_length); - return Add(id, UnionBytes(out)); + Add(id, UnionBytes(resource)); } MaybeLocal BuiltinLoader::LookupAndCompileInternal( @@ -719,6 +725,8 @@ void BuiltinLoader::RegisterExternalReferences( registry->Register(CompileFunction); registry->Register(HasCachedBuiltins); registry->Register(SetInternalLoaders); + + RegisterExternalReferencesForInternalizedBuiltinCode(registry); } } // namespace builtins diff --git a/src/node_builtins.h b/src/node_builtins.h index 81d6d1cca279c2..2dd3ee8b8c9c4e 100644 --- a/src/node_builtins.h +++ b/src/node_builtins.h @@ -10,6 +10,7 @@ #include #include #include +#include "node_external_reference.h" #include "node_mutex.h" #include "node_threadsafe_cow.h" #include "node_union_bytes.h" @@ -30,6 +31,10 @@ using BuiltinCodeCacheMap = std::unordered_map>; +// Generated by tools/js2c.py as node_javascript.cc +void RegisterExternalReferencesForInternalizedBuiltinCode( + ExternalReferenceRegistry* registry); + struct CodeCacheInfo { std::string id; std::vector data; @@ -72,7 +77,6 @@ class NODE_EXTERN_PRIVATE BuiltinLoader { v8::Local GetConfigString(v8::Isolate* isolate); bool Exists(const char* id); bool Add(const char* id, const UnionBytes& source); - bool Add(const char* id, std::string_view utf8source); bool CompileAllBuiltins(v8::Local context); void RefreshCodeCache(const std::vector& in); diff --git a/src/node_external_reference.h b/src/node_external_reference.h index 85acbc114143da..d7bc8ed343fe19 100644 --- a/src/node_external_reference.h +++ b/src/node_external_reference.h @@ -50,7 +50,8 @@ class ExternalReferenceRegistry { V(v8::IndexedPropertyDefinerCallback) \ V(v8::IndexedPropertyDeleterCallback) \ V(v8::IndexedPropertyQueryCallback) \ - V(v8::IndexedPropertyDescriptorCallback) + V(v8::IndexedPropertyDescriptorCallback) \ + V(const v8::String::ExternalStringResourceBase*) #define V(ExternalReferenceType) \ void Register(ExternalReferenceType addr) { RegisterT(addr); } diff --git a/src/node_union_bytes.h b/src/node_union_bytes.h index 6e1a3afb614323..4a3f67980fdc92 100644 --- a/src/node_union_bytes.h +++ b/src/node_union_bytes.h @@ -4,50 +4,74 @@ #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS -// A union of const uint8_t* or const uint16_t* data that can be -// turned into external v8::String when given an isolate. - #include "v8.h" namespace node { +// An external resource intended to be used with static lifetime. +template +class StaticExternalByteResource : public Base { + static_assert(sizeof(IChar) == sizeof(Char), + "incompatible interface and internal pointers"); + + public: + explicit StaticExternalByteResource(const Char* data, + size_t length, + std::shared_ptr owning_ptr) + : data_(data), length_(length), owning_ptr_(owning_ptr) {} + + const IChar* data() const override { + return reinterpret_cast(data_); + } + size_t length() const override { return length_; } + + void Dispose() override { + // We ignore Dispose calls from V8, even if we "own" a resource via + // owning_ptr_. All instantiations of this class are static or owned by a + // static map, and will be destructed when static variables are destructed. + } + + StaticExternalByteResource(const StaticExternalByteResource&) = delete; + StaticExternalByteResource& operator=(const StaticExternalByteResource&) = + delete; + + private: + const Char* data_; + const size_t length_; + std::shared_ptr owning_ptr_; +}; + +using StaticExternalOneByteResource = + StaticExternalByteResource; +using StaticExternalTwoByteResource = + StaticExternalByteResource; + // Similar to a v8::String, but it's independent from Isolates // and can be materialized in Isolates as external Strings // via ToStringChecked. class UnionBytes { public: - UnionBytes(const uint16_t* data, size_t length) - : one_bytes_(nullptr), two_bytes_(data), length_(length) {} - UnionBytes(const uint8_t* data, size_t length) - : one_bytes_(data), two_bytes_(nullptr), length_(length) {} - template // T = uint8_t or uint16_t - explicit UnionBytes(std::shared_ptr> data) - : UnionBytes(data->data(), data->size()) { - owning_ptr_ = data; - } + explicit UnionBytes(StaticExternalOneByteResource* one_byte_resource) + : one_byte_resource_(one_byte_resource), two_byte_resource_(nullptr) {} + explicit UnionBytes(StaticExternalTwoByteResource* two_byte_resource) + : one_byte_resource_(nullptr), two_byte_resource_(two_byte_resource) {} UnionBytes(const UnionBytes&) = default; UnionBytes& operator=(const UnionBytes&) = default; UnionBytes(UnionBytes&&) = default; UnionBytes& operator=(UnionBytes&&) = default; - bool is_one_byte() const { return one_bytes_ != nullptr; } - const uint16_t* two_bytes_data() const { - CHECK_NOT_NULL(two_bytes_); - return two_bytes_; - } - const uint8_t* one_bytes_data() const { - CHECK_NOT_NULL(one_bytes_); - return one_bytes_; - } + bool is_one_byte() const { return one_byte_resource_ != nullptr; } + v8::Local ToStringChecked(v8::Isolate* isolate) const; - size_t length() const { return length_; } private: - const uint8_t* one_bytes_; - const uint16_t* two_bytes_; - size_t length_; - std::shared_ptr owning_ptr_; + StaticExternalOneByteResource* one_byte_resource_; + StaticExternalTwoByteResource* two_byte_resource_; }; } // namespace node diff --git a/src/util.cc b/src/util.cc index 5a0b22348526c6..7aaa5e2be5b880 100644 --- a/src/util.cc +++ b/src/util.cc @@ -606,65 +606,13 @@ void SetConstructorFunction(Isolate* isolate, that->Set(name, tmpl); } -namespace { - -class NonOwningExternalOneByteResource - : public v8::String::ExternalOneByteStringResource { - public: - explicit NonOwningExternalOneByteResource(const UnionBytes& source) - : source_(source) {} - ~NonOwningExternalOneByteResource() override = default; - - const char* data() const override { - return reinterpret_cast(source_.one_bytes_data()); - } - size_t length() const override { return source_.length(); } - - NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) = - delete; - NonOwningExternalOneByteResource& operator=( - const NonOwningExternalOneByteResource&) = delete; - - private: - const UnionBytes source_; -}; - -class NonOwningExternalTwoByteResource - : public v8::String::ExternalStringResource { - public: - explicit NonOwningExternalTwoByteResource(const UnionBytes& source) - : source_(source) {} - ~NonOwningExternalTwoByteResource() override = default; - - const uint16_t* data() const override { return source_.two_bytes_data(); } - size_t length() const override { return source_.length(); } - - NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) = - delete; - NonOwningExternalTwoByteResource& operator=( - const NonOwningExternalTwoByteResource&) = delete; - - private: - const UnionBytes source_; -}; - -} // anonymous namespace - Local UnionBytes::ToStringChecked(Isolate* isolate) const { - if (UNLIKELY(length() == 0)) { - // V8 requires non-null data pointers for empty external strings, - // but we don't guarantee that. Solve this by not creating an - // external string at all in that case. - return String::Empty(isolate); - } if (is_one_byte()) { - NonOwningExternalOneByteResource* source = - new NonOwningExternalOneByteResource(*this); - return String::NewExternalOneByte(isolate, source).ToLocalChecked(); + return String::NewExternalOneByte(isolate, one_byte_resource_) + .ToLocalChecked(); } else { - NonOwningExternalTwoByteResource* source = - new NonOwningExternalTwoByteResource(*this); - return String::NewExternalTwoByte(isolate, source).ToLocalChecked(); + return String::NewExternalTwoByte(isolate, two_byte_resource_) + .ToLocalChecked(); } } diff --git a/tools/js2c.py b/tools/js2c.py index 50f34c070ac099..ff31f182746053 100755 --- a/tools/js2c.py +++ b/tools/js2c.py @@ -49,6 +49,7 @@ def ReadFile(filename): TEMPLATE = """ #include "env-inl.h" #include "node_builtins.h" +#include "node_external_reference.h" #include "node_internals.h" namespace node {{ @@ -67,8 +68,13 @@ def ReadFile(filename): source_ = global_source_map; }} +void RegisterExternalReferencesForInternalizedBuiltinCode( + ExternalReferenceRegistry* registry) {{ + {2} +}} + UnionBytes BuiltinLoader::GetConfig() {{ - return UnionBytes(config_raw, {2}); // config.gypi + return UnionBytes(&{3}); }} }} // namespace builtins @@ -80,23 +86,31 @@ def ReadFile(filename): static const uint8_t {0}[] = {{ {1} }}; + +static StaticExternalOneByteResource {2}({0}, {3}, nullptr); """ TWO_BYTE_STRING = """ static const uint16_t {0}[] = {{ {1} }}; + +static StaticExternalTwoByteResource {2}({0}, {3}, nullptr); """ -INITIALIZER = '{{"{0}", UnionBytes{{{1}, {2}}} }},' +INITIALIZER = '{{"{0}", UnionBytes(&{1}) }},' + +REGISTRATION = 'registry->Register(&{0});' CONFIG_GYPI_ID = 'config_raw' +CONFIG_GYPI_RESOURCE_ID = 'config_resource' + SLUGGER_RE = re.compile(r'[.\-/]') is_verbose = False -def GetDefinition(var, source, step=30): +def GetDefinition(var, source, resource_var, step=30): template = ONE_BYTE_STRING code_points = [ord(c) for c in source] if any(c > 127 for c in code_points): @@ -114,20 +128,24 @@ def GetDefinition(var, source, step=30): slices = [elements_s[i:i + step] for i in range(0, len(elements_s), step)] lines = [','.join(s) for s in slices] array_content = ',\n'.join(lines) - definition = template.format(var, array_content) + length = len(code_points) + definition = template.format(var, array_content, resource_var, length) - return definition, len(code_points) + return definition -def AddModule(filename, definitions, initializers): +def AddModule(filename, definitions, initializers, registrations): code = ReadFile(filename) name = NormalizeFileName(filename) slug = SLUGGER_RE.sub('_', name) var = slug + '_raw' - definition, size = GetDefinition(var, code) - initializer = INITIALIZER.format(name, var, size) + resource_var = slug + '_resource' + definition = GetDefinition(var, code, resource_var) + initializer = INITIALIZER.format(name, resource_var) + registration = REGISTRATION.format(resource_var) definitions.append(definition) initializers.append(initializer) + registrations.append(registration) def NormalizeFileName(filename): split = filename.split('/') @@ -144,19 +162,22 @@ def JS2C(source_files, target): # Build source code lines definitions = [] initializers = [] + registrations = [] for filename in source_files['.js']: - AddModule(filename, definitions, initializers) + AddModule(filename, definitions, initializers, registrations) for filename in source_files['.mjs']: - AddModule(filename, definitions, initializers) + AddModule(filename, definitions, initializers, registrations) - config_def, config_size = handle_config_gypi(source_files['config.gypi']) + config_def = handle_config_gypi(source_files['config.gypi']) definitions.append(config_def) # Emit result definitions = ''.join(definitions) initializers = '\n '.join(initializers) - out = TEMPLATE.format(definitions, initializers, config_size) + registrations = '\n '.join(registrations) + out = TEMPLATE.format(definitions, initializers, + registrations, CONFIG_GYPI_RESOURCE_ID) write_if_chaged(out, target) @@ -165,8 +186,8 @@ def handle_config_gypi(config_filename): # later on anyway, so get it out of the way now config = ReadFile(config_filename) config = jsonify(config) - config_def, config_size = GetDefinition(CONFIG_GYPI_ID, config) - return config_def, config_size + config_def = GetDefinition(CONFIG_GYPI_ID, config, CONFIG_GYPI_RESOURCE_ID) + return config_def def jsonify(config):