carbon-language · jonmeow · Jul 16, 2024 · Jul 15, 2024 · Jul 16, 2024 · Jul 16, 2024
diff --git a/toolchain/base/BUILD b/toolchain/base/BUILD
@@ -31,11 +31,23 @@ cc_library(
     ],
 )
 
+# Header-only library for recording labeled memory usage (see mem_usage.h).
+cc_library(
+    name = "mem_usage",
+    hdrs = ["mem_usage.h"],
+    deps = [
+        ":yaml",
+        "//common:map",
+        "//common:set",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "value_store",
     hdrs = ["value_store.h"],
     deps = [
         ":index_base",
+        ":mem_usage",
         ":yaml",
         "//common:check",
         "//common:hashing",

diff --git a/toolchain/base/mem_usage.h b/toolchain/base/mem_usage.h
@@ -0,0 +1,145 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_TOOLCHAIN_BASE_MEM_USAGE_H_
+#define CARBON_TOOLCHAIN_BASE_MEM_USAGE_H_
+
+#include <cstdint>
+
+#include "common/map.h"
+#include "common/set.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "toolchain/base/yaml.h"
+
+namespace Carbon {
+
+// Helps track memory usage for a compile.
+//
+// Uses will mix `Add` and `Collect` calls, using `ConcatLabel` to label
+// allocation sources. Typically we'll collect stats for growable, potentially
+// large data types (such as `SmallVector`), ignoring small fixed-size members
+// (such as pointers or `int32_t`).
+//
+// For example:
+//
+//   auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
+//       -> void {
+//     // Explicit tracking.
+//     mem_usage.Add(MemUsage::ConcatLabel(label, "data_"), data_.used_bytes(),
+//                   data_.reserved_bytes());
+//     // Common library types like `Map` and `llvm::SmallVector` have
+//     // type-specific support.
+//     mem_usage.Add(MemUsage::ConcatLabel(label, "array_"), array_);
+//     // Implementing `CollectMemUsage` allows use with the same interface.
+//     mem_usage.Collect(MemUsage::ConcatLabel(label, "obj_"), obj_);
+//   }
+class MemUsage {
+ public:
+  // Adds tracking for used and reserved bytes, paired with the given label.
+  // Entries are kept in the order they are added; labels are not merged.
+  auto Add(std::string label, int64_t used_bytes, int64_t reserved_bytes)
+      -> void {
+    mem_usage_.push_back({.label = std::move(label),
+                          .used_bytes = used_bytes,
+                          .reserved_bytes = reserved_bytes});
+  }
+
+  // Adds usage tracking for an allocator, reporting its allocated bytes as
+  // used and its total memory as reserved.
+  auto Add(std::string label, const llvm::BumpPtrAllocator& allocator) -> void {
+    Add(std::move(label), allocator.getBytesAllocated(),
+        allocator.getTotalMemory());
+  }
+
+  // Adds usage tracking for a map. `key_context` is passed through to
+  // `ComputeMetrics`.
+  //
+  // The map is taken by reference so that measuring it never copies it.
+  template <typename KeyT, typename ValueT, ssize_t SmallSize,
+            typename KeyContextT>
+  auto Add(std::string label,
+           const Map<KeyT, ValueT, SmallSize, KeyContextT>& map,
+           KeyContextT key_context = KeyContextT()) -> void {
+    // These don't track used bytes, so we set the same value for used and
+    // reserved bytes.
+    auto bytes = map.ComputeMetrics(key_context).storage_bytes;
+    Add(std::move(label), bytes, bytes);
+  }
+
+  // Adds usage tracking for a set. `key_context` is passed through to
+  // `ComputeMetrics`.
+  //
+  // The set is taken by reference so that measuring it never copies it.
+  template <typename KeyT, ssize_t SmallSize, typename KeyContextT>
+  auto Add(std::string label, const Set<KeyT, SmallSize, KeyContextT>& set,
+           KeyContextT key_context = KeyContextT()) -> void {
+    // These don't track used bytes, so we set the same value for used and
+    // reserved bytes.
+    auto bytes = set.ComputeMetrics(key_context).storage_bytes;
+    Add(std::move(label), bytes, bytes);
+  }
+
+  // Adds memory usage of an array's data. This ignores the possible overhead of
+  // a SmallVector's in-place storage; if it's used, it's going to be tiny
+  // relative to scaling memory costs.
+  //
+  // This uses SmallVector in order to get proper inference for T, which
+  // ArrayRef misses.
+  template <typename T, unsigned N>
+  auto Add(std::string label, const llvm::SmallVector<T, N>& array) -> void {
+    Add(std::move(label), array.size_in_bytes(), array.capacity_in_bytes());
+  }
+
+  // Adds memory usage for an object that provides `CollectMemUsage`.
+  //
+  // The expected signature of `CollectMemUsage` is above, in MemUsage class
+  // comments.
+  template <typename T>
+  auto Collect(llvm::StringRef label, const T& arg) -> void {
+    arg.CollectMemUsage(*this, label);
+  }
+
+  // Constructs a label for memory usage, handling the `.` concatenation.
+  // We don't expect much depth in labels per-call.
+  static auto ConcatLabel(llvm::StringRef label, llvm::StringRef child_label)
+      -> std::string {
+    return llvm::formatv("{0}.{1}", label, child_label);
+  }
+  static auto ConcatLabel(llvm::StringRef label, llvm::StringRef child_label1,
+                          llvm::StringRef child_label2) -> std::string {
+    return llvm::formatv("{0}.{1}.{2}", label, child_label1, child_label2);
+  }
+
+  // Returns a YAML mapping with a `filename` entry, then one
+  // `used_bytes`/`reserved_bytes` mapping per added label (in the order they
+  // were added), then a `Total` mapping holding the sums over all entries.
+  //
+  // The callback captures `this` and a non-owning view of `filename`, so both
+  // this object and the filename's storage should outlive any use of the
+  // returned mapping.
+  auto OutputYaml(llvm::StringRef filename) const -> Yaml::OutputMapping {
+    // Capture the `StringRef` by value rather than a reference to the
+    // parameter; the underlying characters are not copied.
+    return Yaml::OutputMapping([&, filename](Yaml::OutputMapping::Map map) {
+      map.Add("filename", filename);
+      int64_t total_used = 0;
+      int64_t total_reserved = 0;
+      for (const auto& entry : mem_usage_) {
+        total_used += entry.used_bytes;
+        total_reserved += entry.reserved_bytes;
+        map.Add(entry.label,
+                Yaml::OutputMapping([&](Yaml::OutputMapping::Map byte_map) {
+                  byte_map.Add("used_bytes", entry.used_bytes);
+                  byte_map.Add("reserved_bytes", entry.reserved_bytes);
+                }));
+      }
+      map.Add("Total",
+              Yaml::OutputMapping([&](Yaml::OutputMapping::Map byte_map) {
+                byte_map.Add("used_bytes", total_used);
+                byte_map.Add("reserved_bytes", total_reserved);
+              }));
+    });
+  }
+
+ private:
+  // Memory usage for a specific label.
+  struct Entry {
+    std::string label;
+    int64_t used_bytes;
+    int64_t reserved_bytes;
+  };
+
+  // The accumulated data on memory usage.
+  llvm::SmallVector<Entry> mem_usage_;
+};
+
+}  // namespace Carbon
+
+#endif  // CARBON_TOOLCHAIN_BASE_MEM_USAGE_H_
diff --git a/toolchain/base/value_store.h b/toolchain/base/value_store.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/YAMLParser.h"
 #include "toolchain/base/index_base.h"
+#include "toolchain/base/mem_usage.h"
 #include "toolchain/base/yaml.h"
 
 namespace Carbon {
@@ -187,6 +188,12 @@ class ValueStore
     });
   }
 
+  // Reports the memory used by the stored values to `mem_usage` under `label`.
+  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
+      -> void {
+    std::string owned_label = label.str();
+    mem_usage.Add(std::move(owned_label), values_);
+  }
+
   auto array_ref() const -> llvm::ArrayRef<ValueType> { return values_; }
   auto size() const -> size_t { return values_.size(); }
 
@@ -237,6 +244,15 @@ class CanonicalValueStore {
   }
   auto size() const -> size_t { return values_.size(); }
 
+  // Reports memory usage for both members: `values_` through its own
+  // `CollectMemUsage`, and `set_` through its computed storage metrics.
+  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
+      -> void {
+    mem_usage.Collect(MemUsage::ConcatLabel(label, "values_"), values_);
+
+    // Only a single storage figure is read from the set's metrics, so it is
+    // reported as both the used and the reserved byte count.
+    const auto set_storage_bytes =
+        set_.ComputeMetrics(KeyContext(values_.array_ref())).storage_bytes;
+    mem_usage.Add(MemUsage::ConcatLabel(label, "set_"), set_storage_bytes,
+                  set_storage_bytes);
+  }
+
  private:
   class KeyContext;
 
@@ -322,6 +338,18 @@ class SharedValueStores : public Yaml::Printable<SharedValueStores> {
     });
   }
 
+  // Collects memory usage for each shared store, passing every store the
+  // label `<label>.<member name>`.
+  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
+      -> void {
+    auto collect_store = [&](llvm::StringRef member, const auto& store) {
+      mem_usage.Collect(MemUsage::ConcatLabel(label, member), store);
+    };
+    collect_store("ints_", ints_);
+    collect_store("reals_", reals_);
+    collect_store("floats_", floats_);
+    collect_store("identifiers_", identifiers_);
+    collect_store("string_literals_", string_literals_);
+  }
+
  private:
   CanonicalValueStore<IntId> ints_;
   ValueStore<RealId> reals_;

diff --git a/toolchain/driver/driver.cpp b/toolchain/driver/driver.cpp
@@ -308,6 +308,14 @@ Dump the generated assembly to stdout after codegen.
 )""",
         },
         [&](auto& arg_b) { arg_b.Set(&dump_asm); });
+    b.AddFlag(
+        {
+            .name = "dump-mem-usage",
+            .help = R"""(
+Dumps the amount of memory used.
+)""",
+        },
+        [&](auto& arg_b) { arg_b.Set(&dump_mem_usage); });
     b.AddFlag(
         {
             .name = "prelude-import",
@@ -344,6 +352,7 @@ Excludes files with the given prefix from dumps.
   bool dump_sem_ir = false;
   bool dump_llvm_ir = false;
   bool dump_asm = false;
+  bool dump_mem_usage = false;
   bool stream_errors = false;
   bool preorder_parse_tree = false;
   bool builtin_sem_ir = false;
@@ -540,6 +549,9 @@ class Driver::CompilationUnit {
       sorting_consumer_ = SortingDiagnosticConsumer(*consumer);
       consumer_ = &*sorting_consumer_;
     }
+    if (options_.dump_mem_usage && IncludeInDumps()) {
+      mem_usage_ = MemUsage();
+    }
   }
 
   // Loads source and lexes it. Returns true on success.
@@ -552,6 +564,10 @@ class Driver::CompilationUnit {
                                              *consumer_);
       }
     });
+    // `source_` is still empty when loading failed (handled just below), so
+    // only record its size when there is a buffer to measure.
+    if (mem_usage_ && source_) {
+      mem_usage_->Add("source_", source_->text().size(),
+                      source_->text().size());
+    }
     if (!source_) {
       success_ = false;
       return;
@@ -565,6 +581,9 @@ class Driver::CompilationUnit {
       consumer_->Flush();
       driver_->output_stream_ << tokens_;
     }
+    if (mem_usage_) {
+      mem_usage_->Collect("tokens_", *tokens_);
+    }
     CARBON_VLOG() << "*** Lex::TokenizedBuffer ***\n" << tokens_;
     if (tokens_->has_errors()) {
       success_ = false;
@@ -582,6 +601,9 @@ class Driver::CompilationUnit {
       consumer_->Flush();
       parse_tree_->Print(driver_->output_stream_, options_.preorder_parse_tree);
     }
+    if (mem_usage_) {
+      mem_usage_->Collect("parse_tree_", *parse_tree_);
+    }
     CARBON_VLOG() << "*** Parse::Tree ***\n" << parse_tree_;
     if (parse_tree_->has_errors()) {
       success_ = false;
@@ -607,8 +629,12 @@ class Driver::CompilationUnit {
     // to wait for code generation.
     consumer_->Flush();
 
-    CARBON_VLOG() << "*** Raw SemIR::File ***\n" << *sem_ir_ << "\n";
+    if (mem_usage_) {
+      mem_usage_->Collect("sem_ir_", *sem_ir_);
+    }
+
     if (options_.dump_raw_sem_ir && IncludeInDumps()) {
+      CARBON_VLOG() << "*** Raw SemIR::File ***\n" << *sem_ir_ << "\n";
       sem_ir_->Print(driver_->output_stream_, options_.builtin_sem_ir);
       if (options_.dump_sem_ir) {
         driver_->output_stream_ << "\n";
@@ -659,11 +685,16 @@ class Driver::CompilationUnit {
 
   // Runs post-compile logic. This is always called, and called after all other
   // actions on the CompilationUnit.
-  auto PostCompile() const -> void {
+  auto PostCompile() -> void {
     if (options_.dump_shared_values && IncludeInDumps()) {
       Yaml::Print(driver_->output_stream_,
                   value_stores_.OutputYaml(input_filename_));
     }
+    if (mem_usage_) {
+      mem_usage_->Collect("value_stores_", value_stores_);
+      Yaml::Print(driver_->output_stream_,
+                  mem_usage_->OutputYaml(input_filename_));
+    }
 
     // The diagnostics consumer must be flushed before compilation artifacts are
     // destructed, because diagnostics can refer to their state.
@@ -773,6 +804,9 @@ class Driver::CompilationUnit {
 
   bool success_ = true;
 
+  // Tracks memory usage of the compile.
+  std::optional<MemUsage> mem_usage_;
+
   // These are initialized as steps are run.
   std::optional<SourceBuffer> source_;
   std::optional<Lex::TokenizedBuffer> tokens_;

diff --git a/toolchain/driver/testdata/dump_mem_usage.carbon b/toolchain/driver/testdata/dump_mem_usage.carbon
@@ -0,0 +1,19 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ARGS: compile --phase=check --dump-mem-usage %s
+//
+// NOAUTOUPDATE
+//
+// Avoid testing specific values:
+// SET-CHECK-SUBSET
+
+var x: i32 = 1;
+
+// CHECK:STDOUT: ---
+// CHECK:STDOUT: filename:        dump_mem_usage.carbon
+// CHECK:STDOUT: source_:
+// CHECK:STDOUT:   used_bytes:      0
+// CHECK:STDOUT:   reserved_bytes:  0
+// CHECK:STDOUT: ...
diff --git a/toolchain/lex/BUILD b/toolchain/lex/BUILD
@@ -221,6 +221,7 @@ cc_library(
         "//common:ostream",
         "//common:string_helpers",
         "//toolchain/base:index_base",
+        "//toolchain/base:mem_usage",
         "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/source:source_buffer",

diff --git a/toolchain/lex/tokenized_buffer.cpp b/toolchain/lex/tokenized_buffer.cpp
@@ -340,6 +340,13 @@ auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
   return TokenIndex(static_cast<int>(token_infos_.size()) - 1);
 }
 
+// Adds entries for `allocator_`, `token_infos_`, and `line_infos_`, in that
+// order, each labeled as a child of `label`.
+auto TokenizedBuffer::CollectMemUsage(MemUsage& mem_usage,
+                                      llvm::StringRef label) const -> void {
+  auto child_label = [label](llvm::StringRef member) -> std::string {
+    return MemUsage::ConcatLabel(label, member);
+  };
+  mem_usage.Add(child_label("allocator_"), allocator_);
+  mem_usage.Add(child_label("token_infos_"), token_infos_);
+  mem_usage.Add(child_label("line_infos_"), line_infos_);
+}
+
 auto TokenIterator::Print(llvm::raw_ostream& output) const -> void {
   output << token_.index;
 }

diff --git a/toolchain/lex/tokenized_buffer.h b/toolchain/lex/tokenized_buffer.h
@@ -18,6 +18,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/raw_ostream.h"
 #include "toolchain/base/index_base.h"
+#include "toolchain/base/mem_usage.h"
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/lex/token_index.h"
@@ -204,6 +205,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token) const
       -> void;
 
+  // Collects memory usage of members into `mem_usage`, labeled under `label`.
+  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
+      -> void;
+
   // Returns true if the buffer has errors that were detected at lexing time.
   auto has_errors() const -> bool { return has_errors_; }