
Separate subtree size information from parse nodes. (carbon-language#4174)

Move subtree sizes over to `TreeAndSubtrees`, using the separate structure to
represent the additional parse work that occurs, as well as to make it clear
which functions require the extra information. My intent is to make it hard to
use this information by accident.
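
To sketch the idea (toy code only -- `Node`, `child_count`, and
`ComputeSubtreeSizes` are illustrative stand-ins, not the Carbon API): the
postorder node buffer no longer stores a size per node, and a side structure
can rebuild every subtree size in a single pass when child navigation is
actually needed.

```cpp
// Toy sketch: recompute subtree sizes from a postorder node buffer on demand,
// instead of storing a size in every node.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct Node {
  char kind;            // Stand-in for the real NodeKind.
  int32_t child_count;  // Toy stand-in for deriving children from node kinds.
};

// One postorder pass: a node's subtree size is 1 plus the subtree sizes of its
// immediate children, which have already been computed and are on the stack.
static auto ComputeSubtreeSizes(const std::vector<Node>& postorder)
    -> std::vector<int32_t> {
  std::vector<int32_t> sizes(postorder.size());
  std::vector<int32_t> pending;
  for (std::size_t i = 0; i < postorder.size(); ++i) {
    int32_t size = 1;
    for (int32_t c = 0; c < postorder[i].child_count; ++c) {
      size += pending.back();
      pending.pop_back();
    }
    sizes[i] = size;
    pending.push_back(size);
  }
  return sizes;
}

auto main() -> int {
  // Postorder for the tree `a(b, c(d))` is: b, d, c, a.
  std::vector<Node> postorder = {{'b', 0}, {'d', 0}, {'c', 1}, {'a', 2}};
  for (int32_t size : ComputeSubtreeSizes(postorder)) {
    std::cout << size << " ";  // Prints: 1 1 2 4
  }
  std::cout << "\n";
  return 0;
}
```

In this change, that side structure is `Parse::TreeAndSubtrees`, built from the
tokenized buffer and tree only when something like printing, `Extract`, or
diagnostics needs it.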

The subtree size is still tracked during `Parse::Tree` construction. I think a
lot of that can be cleaned up, although it's used during placeholder
assignment, so that may take some work. I wanted to see what people thought
about this before taking action on such a change.

I'm using a 1M-line source file generated by carbon-language#4124 for testing.
The command is `time bazel-bin/toolchain/install/prefix_root/bin/carbon compile
--phase=check --dump-mem-usage ~/tmp/data.carbon`.

At head, what I'm seeing is:

```
...
parse_tree_.node_impls_:
  used_bytes:      61516116
  reserved_bytes:  61516116
...
Total:
  used_bytes:      447814230
  reserved_bytes:  551663894
...
1.43s user 0.14s system 99% cpu 1.565 total
```

With `Tree::Verify` disabled completely, it looks like:
```
parse_tree_.node_impls_:
  used_bytes:      41010744
  reserved_bytes:  41010744
...
Total:
  used_bytes:      427308858
  reserved_bytes:  531158522
...
1.20s user 0.13s system 99% cpu 1.332 total
```
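
Summarizing the two runs above:

| Configuration | `node_impls_` used_bytes | Total used_bytes | Wall time |
| --- | --- | --- | --- |
| At head | 61,516,116 | 447,814,230 | 1.565s |
| `Tree::Verify` disabled | 41,010,744 | 427,308,858 | 1.332s |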

Re-enabling just the basic verification (what is now `Tree::Verify`), I'm
seeing maybe 0.05s of slowdown, but that's within noise on my system. I do see
variability in my timing results, and overall I think this is a 0.2s +/- 0.1s
improvement versus the earlier implementation (which always exercised the
`Extract` code). That's for opt builds; debug builds will be unaffected,
because the same checking occurs as before.

Note that the subtree size is a third of the node representation, which is why
I'm showing the decrease in memory usage here.
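As a sanity check on that: 61,516,116 × 2/3 = 41,010,744 bytes, which matches
the measured `parse_tree_.node_impls_` usage with the subtree sizes removed.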
jonmeow authored and brymer-meneses committed Aug 15, 2024
1 parent 08b1fd8 commit b54e1a2
Showing 40 changed files with 767 additions and 692 deletions.
12 changes: 8 additions & 4 deletions language_server/language_server.cpp
@@ -10,6 +10,7 @@
#include "toolchain/lex/lex.h"
#include "toolchain/parse/node_kind.h"
#include "toolchain/parse/parse.h"
#include "toolchain/parse/tree_and_subtrees.h"
#include "toolchain/source/source_buffer.h"

namespace Carbon::LS {
@@ -79,11 +80,12 @@ auto LanguageServer::onReply(llvm::json::Value /*id*/,
// Returns the text of first child of kind Parse::NodeKind::IdentifierName.
static auto GetIdentifierName(const SharedValueStores& value_stores,
const Lex::TokenizedBuffer& tokens,
const Parse::Tree& p, Parse::NodeId node)
const Parse::TreeAndSubtrees& p,
Parse::NodeId node)
-> std::optional<llvm::StringRef> {
for (auto ch : p.children(node)) {
if (p.node_kind(ch) == Parse::NodeKind::IdentifierName) {
auto token = p.node_token(ch);
if (p.tree().node_kind(ch) == Parse::NodeKind::IdentifierName) {
auto token = p.tree().node_token(ch);
if (tokens.GetKind(token) == Lex::TokenKind::Identifier) {
return value_stores.identifiers().Get(tokens.GetIdentifier(token));
}
@@ -104,6 +106,7 @@ void LanguageServer::OnDocumentSymbol(
auto buf = SourceBuffer::MakeFromFile(vfs, file, NullDiagnosticConsumer());
auto lexed = Lex::Lex(value_stores, *buf, NullDiagnosticConsumer());
auto parsed = Parse::Parse(lexed, NullDiagnosticConsumer(), nullptr);
Parse::TreeAndSubtrees tree_and_subtrees(lexed, parsed);
std::vector<clang::clangd::DocumentSymbol> result;
for (const auto& node : parsed.postorder()) {
clang::clangd::SymbolKind symbol_kind;
@@ -126,7 +129,8 @@
continue;
}

if (auto name = GetIdentifierName(value_stores, lexed, parsed, node)) {
if (auto name =
GetIdentifierName(value_stores, lexed, tree_and_subtrees, node)) {
auto tok = parsed.node_token(node);
clang::clangd::Position pos{lexed.GetLineNumber(tok) - 1,
lexed.GetColumnNumber(tok) - 1};
5 changes: 3 additions & 2 deletions toolchain/check/check.cpp
@@ -65,7 +65,7 @@ struct UnitInfo {
: check_ir_id(check_ir_id),
unit(&unit),
converter(unit.tokens, unit.tokens->source().filename(),
unit.parse_tree),
unit.get_parse_tree_and_subtrees),
err_tracker(*unit.consumer),
emitter(converter, err_tracker) {}

@@ -891,7 +891,8 @@ static auto CheckParseTree(
SemIRDiagnosticConverter converter(node_converters, &sem_ir);
Context::DiagnosticEmitter emitter(converter, unit_info.err_tracker);
Context context(*unit_info.unit->tokens, emitter, *unit_info.unit->parse_tree,
sem_ir, vlog_stream);
unit_info.unit->get_parse_tree_and_subtrees, sem_ir,
vlog_stream);
PrettyStackTraceFunction context_dumper(
[&](llvm::raw_ostream& output) { context.PrintForStackDump(output); });

3 changes: 3 additions & 0 deletions toolchain/check/check.h
@@ -10,6 +10,7 @@
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/tokenized_buffer.h"
#include "toolchain/parse/tree.h"
#include "toolchain/parse/tree_and_subtrees.h"
#include "toolchain/sem_ir/file.h"

namespace Carbon::Check {
@@ -20,6 +21,8 @@ struct Unit {
const Lex::TokenizedBuffer* tokens;
const Parse::Tree* parse_tree;
DiagnosticConsumer* consumer;
// Returns a lazily constructed TreeAndSubtrees.
std::function<const Parse::TreeAndSubtrees&()> get_parse_tree_and_subtrees;
// The generated IR. Unset on input, set on output.
std::optional<SemIR::File>* sem_ir;
};
7 changes: 5 additions & 2 deletions toolchain/check/context.cpp
@@ -36,11 +36,14 @@
namespace Carbon::Check {

Context::Context(const Lex::TokenizedBuffer& tokens, DiagnosticEmitter& emitter,
const Parse::Tree& parse_tree, SemIR::File& sem_ir,
llvm::raw_ostream* vlog_stream)
const Parse::Tree& parse_tree,
llvm::function_ref<const Parse::TreeAndSubtrees&()>
get_parse_tree_and_subtrees,
SemIR::File& sem_ir, llvm::raw_ostream* vlog_stream)
: tokens_(&tokens),
emitter_(&emitter),
parse_tree_(&parse_tree),
get_parse_tree_and_subtrees_(get_parse_tree_and_subtrees),
sem_ir_(&sem_ir),
vlog_stream_(vlog_stream),
node_stack_(parse_tree, vlog_stream),
11 changes: 11 additions & 0 deletions toolchain/check/context.h
@@ -19,6 +19,7 @@
#include "toolchain/check/scope_stack.h"
#include "toolchain/parse/node_ids.h"
#include "toolchain/parse/tree.h"
#include "toolchain/parse/tree_and_subtrees.h"
#include "toolchain/sem_ir/file.h"
#include "toolchain/sem_ir/ids.h"
#include "toolchain/sem_ir/import_ir.h"
@@ -53,6 +54,8 @@ class Context {
// Stores references for work.
explicit Context(const Lex::TokenizedBuffer& tokens,
DiagnosticEmitter& emitter, const Parse::Tree& parse_tree,
llvm::function_ref<const Parse::TreeAndSubtrees&()>
get_parse_tree_and_subtrees,
SemIR::File& sem_ir, llvm::raw_ostream* vlog_stream);

// Marks an implementation TODO. Always returns false.
@@ -360,6 +363,10 @@

auto parse_tree() -> const Parse::Tree& { return *parse_tree_; }

auto parse_tree_and_subtrees() -> const Parse::TreeAndSubtrees& {
return get_parse_tree_and_subtrees_();
}

auto sem_ir() -> SemIR::File& { return *sem_ir_; }

auto node_stack() -> NodeStack& { return node_stack_; }
@@ -486,6 +493,10 @@
// The file's parse tree.
const Parse::Tree* parse_tree_;

// Returns a lazily constructed TreeAndSubtrees.
llvm::function_ref<const Parse::TreeAndSubtrees&()>
get_parse_tree_and_subtrees_;

// The SemIR::File being added to.
SemIR::File* sem_ir_;

3 changes: 2 additions & 1 deletion toolchain/check/handle_impl.cpp
@@ -141,7 +141,8 @@ static auto ExtendImpl(Context& context, Parse::NodeId extend_node,
// The explicit self type is the same as the default self type, so suggest
// removing it and recover as if it were not present.
if (auto self_as =
context.parse_tree().ExtractAs<Parse::TypeImplAs>(self_type_node)) {
context.parse_tree_and_subtrees().ExtractAs<Parse::TypeImplAs>(
self_type_node)) {
CARBON_DIAGNOSTIC(ExtendImplSelfAsDefault, Note,
"Remove the explicit `Self` type here.");
diag.Note(self_as->type_expr, ExtendImplSelfAsDefault);
1 change: 1 addition & 0 deletions toolchain/driver/BUILD
@@ -74,6 +74,7 @@ cc_library(
"//toolchain/lex",
"//toolchain/lower",
"//toolchain/parse",
"//toolchain/parse:tree",
"//toolchain/sem_ir:file",
"//toolchain/sem_ir:formatter",
"//toolchain/sem_ir:inst_namer",
36 changes: 30 additions & 6 deletions toolchain/driver/driver.cpp
@@ -27,6 +27,7 @@
#include "toolchain/lex/lex.h"
#include "toolchain/lower/lower.h"
#include "toolchain/parse/parse.h"
#include "toolchain/parse/tree_and_subtrees.h"
#include "toolchain/sem_ir/formatter.h"
#include "toolchain/sem_ir/inst_namer.h"
#include "toolchain/source/source_buffer.h"
@@ -599,7 +600,12 @@ class Driver::CompilationUnit {
});
if (options_.dump_parse_tree && IncludeInDumps()) {
consumer_->Flush();
parse_tree_->Print(driver_->output_stream_, options_.preorder_parse_tree);
const auto& tree_and_subtrees = GetParseTreeAndSubtrees();
if (options_.preorder_parse_tree) {
tree_and_subtrees.PrintPreorder(driver_->output_stream_);
} else {
tree_and_subtrees.Print(driver_->output_stream_);
}
}
if (mem_usage_) {
mem_usage_->Collect("parse_tree_", *parse_tree_);
@@ -613,11 +619,15 @@
// Returns information needed to check this unit.
auto GetCheckUnit() -> Check::Unit {
CARBON_CHECK(parse_tree_);
return {.value_stores = &value_stores_,
.tokens = &*tokens_,
.parse_tree = &*parse_tree_,
.consumer = consumer_,
.sem_ir = &sem_ir_};
return {
.value_stores = &value_stores_,
.tokens = &*tokens_,
.parse_tree = &*parse_tree_,
.consumer = consumer_,
.get_parse_tree_and_subtrees = [&]() -> const Parse::TreeAndSubtrees& {
return GetParseTreeAndSubtrees();
},
.sem_ir = &sem_ir_};
}

// Runs post-check logic. Returns true if checking succeeded for the IR.
@@ -778,6 +788,19 @@ class Driver::CompilationUnit {
return true;
}

// The TreeAndSubtrees is mainly used for debugging and diagnostics, and has
// significant overhead. Avoid constructing it when unused.
auto GetParseTreeAndSubtrees() -> const Parse::TreeAndSubtrees& {
if (!parse_tree_and_subtrees_) {
parse_tree_and_subtrees_ = Parse::TreeAndSubtrees(*tokens_, *parse_tree_);
if (mem_usage_) {
mem_usage_->Collect("parse_tree_and_subtrees_",
*parse_tree_and_subtrees_);
}
}
return *parse_tree_and_subtrees_;
}

// Wraps a call with log statements to indicate start and end.
auto LogCall(llvm::StringLiteral label, llvm::function_ref<void()> fn)
-> void {
@@ -814,6 +837,7 @@ class Driver::CompilationUnit {
std::optional<SourceBuffer> source_;
std::optional<Lex::TokenizedBuffer> tokens_;
std::optional<Parse::Tree> parse_tree_;
std::optional<Parse::TreeAndSubtrees> parse_tree_and_subtrees_;
std::optional<SemIR::File> sem_ir_;
std::unique_ptr<llvm::LLVMContext> llvm_context_;
std::unique_ptr<llvm::Module> module_;
6 changes: 5 additions & 1 deletion toolchain/parse/BUILD
@@ -106,8 +106,12 @@ cc_library(
srcs = [
"extract.cpp",
"tree.cpp",
"tree_and_subtrees.cpp",
],
hdrs = [
"tree.h",
"tree_and_subtrees.h",
],
hdrs = ["tree.h"],
deps = [
":node_kind",
"//common:check",
38 changes: 15 additions & 23 deletions toolchain/parse/context.cpp
@@ -68,18 +68,15 @@ Context::Context(Tree& tree, Lex::TokenizedBuffer& tokens,

auto Context::AddLeafNode(NodeKind kind, Lex::TokenIndex token, bool has_error)
-> void {
tree_->node_impls_.push_back(
Tree::NodeImpl(kind, has_error, token, /*subtree_size=*/1));
tree_->node_impls_.push_back(Tree::NodeImpl(kind, has_error, token));
if (has_error) {
tree_->has_errors_ = true;
}
}

auto Context::AddNode(NodeKind kind, Lex::TokenIndex token, int subtree_start,
bool has_error) -> void {
int subtree_size = tree_->size() - subtree_start + 1;
tree_->node_impls_.push_back(
Tree::NodeImpl(kind, has_error, token, subtree_size));
auto Context::AddNode(NodeKind kind, Lex::TokenIndex token, bool has_error)
-> void {
tree_->node_impls_.push_back(Tree::NodeImpl(kind, has_error, token));
if (has_error) {
tree_->has_errors_ = true;
}
@@ -91,7 +88,6 @@ auto Context::ReplacePlaceholderNode(int32_t position, NodeKind kind,
CARBON_CHECK(position >= 0 && position < tree_->size())
<< "position: " << position << " size: " << tree_->size();
auto* node_impl = &tree_->node_impls_[position];
CARBON_CHECK(node_impl->subtree_size == 1);
CARBON_CHECK(node_impl->kind == NodeKind::Placeholder);
node_impl->kind = kind;
node_impl->has_error = has_error;
@@ -123,9 +119,9 @@ auto Context::ConsumeAndAddCloseSymbol(Lex::TokenIndex expected_open,
Lex::TokenKind open_token_kind = tokens().GetKind(expected_open);

if (!open_token_kind.is_opening_symbol()) {
AddNode(close_kind, state.token, state.subtree_start, /*has_error=*/true);
AddNode(close_kind, state.token, /*has_error=*/true);
} else if (auto close_token = ConsumeIf(open_token_kind.closing_symbol())) {
AddNode(close_kind, *close_token, state.subtree_start, state.has_error);
AddNode(close_kind, *close_token, state.has_error);
} else {
// TODO: Include the location of the matching opening delimiter in the
// diagnostic.
@@ -135,7 +131,7 @@
open_token_kind.closing_symbol().fixed_spelling());

SkipTo(tokens().GetMatchedClosingToken(expected_open));
AddNode(close_kind, Consume(), state.subtree_start, /*has_error=*/true);
AddNode(close_kind, Consume(), /*has_error=*/true);
}
}

@@ -415,7 +411,7 @@ auto Context::AddNodeExpectingDeclSemi(StateStackEntry state,
}

if (auto semi = ConsumeIf(Lex::TokenKind::Semi)) {
AddNode(node_kind, *semi, state.subtree_start, /*has_error=*/false);
AddNode(node_kind, *semi, /*has_error=*/false);
} else {
if (is_def_allowed) {
DiagnoseExpectedDeclSemiOrDefinition(decl_kind);
@@ -433,8 +429,7 @@ auto Context::RecoverFromDeclError(StateStackEntry state, NodeKind node_kind,
if (skip_past_likely_end) {
token = SkipPastLikelyEnd(token);
}
AddNode(node_kind, token, state.subtree_start,
/*has_error=*/true);
AddNode(node_kind, token, /*has_error=*/true);
}

auto Context::ParseLibraryName(bool accept_default)
@@ -464,13 +459,11 @@
auto Context::ParseLibrarySpecifier(bool accept_default)
-> std::optional<StringLiteralValueId> {
auto library_token = ConsumeChecked(Lex::TokenKind::Library);
auto library_subtree_start = tree().size();
auto library_id = ParseLibraryName(accept_default);
if (!library_id) {
AddLeafNode(NodeKind::LibraryName, *position_, /*has_error=*/true);
}
AddNode(NodeKind::LibrarySpecifier, library_token, library_subtree_start,
/*has_error=*/false);
AddNode(NodeKind::LibrarySpecifier, library_token, /*has_error=*/false);
return library_id;
}

@@ -503,20 +496,19 @@ static auto ParsingInDeferredDefinitionScope(Context& context) -> bool {
state == State::DeclDefinitionFinishAsNamedConstraint;
}

auto Context::AddFunctionDefinitionStart(Lex::TokenIndex token,
int subtree_start, bool has_error)
auto Context::AddFunctionDefinitionStart(Lex::TokenIndex token, bool has_error)
-> void {
if (ParsingInDeferredDefinitionScope(*this)) {
deferred_definition_stack_.push_back(tree_->deferred_definitions_.Add(
{.start_id =
FunctionDefinitionStartId(NodeId(tree_->node_impls_.size()))}));
}

AddNode(NodeKind::FunctionDefinitionStart, token, subtree_start, has_error);
AddNode(NodeKind::FunctionDefinitionStart, token, has_error);
}

auto Context::AddFunctionDefinition(Lex::TokenIndex token, int subtree_start,
bool has_error) -> void {
auto Context::AddFunctionDefinition(Lex::TokenIndex token, bool has_error)
-> void {
if (ParsingInDeferredDefinitionScope(*this)) {
auto definition_index = deferred_definition_stack_.pop_back_val();
auto& definition = tree_->deferred_definitions_.Get(definition_index);
Expand All @@ -526,7 +518,7 @@ auto Context::AddFunctionDefinition(Lex::TokenIndex token, int subtree_start,
DeferredDefinitionIndex(tree_->deferred_definitions().size());
}

AddNode(NodeKind::FunctionDefinition, token, subtree_start, has_error);
AddNode(NodeKind::FunctionDefinition, token, has_error);
}

auto Context::PrintForStackDump(llvm::raw_ostream& output) const -> void {
10 changes: 4 additions & 6 deletions toolchain/parse/context.h
@@ -100,8 +100,7 @@ class Context {
-> void;

// Adds a node to the parse tree that has children.
auto AddNode(NodeKind kind, Lex::TokenIndex token, int subtree_start,
bool has_error) -> void;
auto AddNode(NodeKind kind, Lex::TokenIndex token, bool has_error) -> void;

// Replaces the placeholder node at the indicated position with a leaf node.
//
@@ -328,12 +327,11 @@

// Adds a function definition start node, and begins tracking a deferred
// definition if necessary.
auto AddFunctionDefinitionStart(Lex::TokenIndex token, int subtree_start,
bool has_error) -> void;
auto AddFunctionDefinitionStart(Lex::TokenIndex token, bool has_error)
-> void;
// Adds a function definition node, and ends tracking a deferred definition if
// necessary.
auto AddFunctionDefinition(Lex::TokenIndex token, int subtree_start,
bool has_error) -> void;
auto AddFunctionDefinition(Lex::TokenIndex token, bool has_error) -> void;

// Prints information for a stack dump.
auto PrintForStackDump(llvm::raw_ostream& output) const -> void;
