release: switch to Cargo's default (#9031)

This sets `lto = "thin"` instead of using "fat" LTO, and sets `codegen-units = 16`. These are the defaults for Cargo's `release` profile, and I think it may give us faster iteration times, especially when benchmarking. The point of this PR is to see what kind of impact this has on benchmarks. It is expected that benchmarks may regress to some extent.

I did some quick ad hoc experiments to quantify this change in compile times. Namely, I ran:

```shell
cargo build --profile release -p ruff_cli
```

Then I ran

```shell
touch crates/ruff_python_formatter/src/expression/string/docstring.rs
```

(because that's where I've been working lately) and re-ran

```shell
cargo build --profile release -p ruff_cli
```

This last command is what I timed, since it reflects how much time one has to wait between making a change and getting a compiled artifact.

Here are my results:

* With the status quo `release` profile, build takes 77s
* With `release` but `lto = "thin"`, build takes 41s
* With `release` but `lto = false`, build takes 19s
* With `release` but `lto = false` **and** `codegen-units = 16`, build takes 7s
* With `release` but `lto = "thin"` **and** `codegen-units = 16`, build takes 16s (I believe this is the default `release` configuration)

This PR represents the last option. It's not the fastest to compile, but it's nearly a whole minute faster! The idea is that with `codegen-units = 16`, we still make use of parallelism, but keep _some_ level of LTO on to try and regain what we lose by increasing the number of codegen units.
astral-sh · Dec 15, 2023 · 3ce145c · 3ce145c
1 parent db38078
commit 3ce145c
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 9 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -556,19 +556,19 @@ examples.
 
 #### Linux
 
-Install `perf` and build `ruff_benchmark` with the `release-debug` profile and then run it with perf
+Install `perf` and build `ruff_benchmark` with the `profiling` profile and then run it with perf
 
 ```shell
-cargo bench -p ruff_benchmark --no-run --profile=release-debug && perf record --call-graph dwarf -F 9999 cargo bench -p ruff_benchmark --profile=release-debug -- --profile-time=1
+cargo bench -p ruff_benchmark --no-run --profile=profiling && perf record --call-graph dwarf -F 9999 cargo bench -p ruff_benchmark --profile=profiling -- --profile-time=1
 ```
 
 You can also use the `ruff_dev` launcher to run `ruff check` multiple times on a repository to
 gather enough samples for a good flamegraph (change the 999, the sample rate, and the 30, the number
 of checks, to your liking)
 
 ```shell
-cargo build --bin ruff_dev --profile=release-debug
-perf record -g -F 999 target/release-debug/ruff_dev repeat --repeat 30 --exit-zero --no-cache path/to/cpython > /dev/null
+cargo build --bin ruff_dev --profile=profiling
+perf record -g -F 999 target/profiling/ruff_dev repeat --repeat 30 --exit-zero --no-cache path/to/cpython > /dev/null
 ```
 
 Then convert the recorded profile
@@ -598,7 +598,7 @@ cargo install cargo-instruments
 Then run the profiler with
 
 ```shell
-cargo instruments -t time --bench linter --profile release-debug -p ruff_benchmark -- --profile-time=1
+cargo instruments -t time --bench linter --profile profiling -p ruff_benchmark -- --profile-time=1
 ```
 
 - `-t`: Specifies what to profile. Useful options are `time` to profile the wall time and `alloc`

diff --git a/Cargo.toml b/Cargo.toml
@@ -88,7 +88,20 @@ rc_mutex = "warn"
 rest_pat_in_fully_bound_structs = "warn"
 
 [profile.release]
-lto = "fat"
+# Note that we set these explicitly, and these values
+# were chosen based on a trade-off between compile times
+# and runtime performance[1].
+#
+# [1]: https://github.com/astral-sh/ruff/pull/9031
+lto = "thin"
+codegen-units = 16
+
+# Some crates don't change as much but benefit more from
+# more expensive optimization passes, so we selectively
+# decrease codegen-units in some cases.
+[profile.release.package.ruff_python_parser]
+codegen-units = 1
+[profile.release.package.ruff_python_ast]
 codegen-units = 1
 
 [profile.dev.package.insta]
@@ -102,8 +115,8 @@ opt-level = 3
 [profile.dev.package.ruff_python_parser]
 opt-level = 1
 
-# Use the `--profile release-debug` flag to show symbols in release mode.
-# e.g. `cargo build --profile release-debug`
-[profile.release-debug]
+# Use the `--profile profiling` flag to show symbols in release mode.
+# e.g. `cargo build --profile profiling`
+[profile.profiling]
 inherits = "release"
 debug = 1
diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs
@@ -120,6 +120,7 @@ impl<'a> Cursor<'a> {
  }
 
  /// Eats symbols while predicate returns true or until the end of file is reached.
+ #[inline]
  pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
  // It was tried making optimized version of this for eg. line comments, but
  // LLVM can inline all of this and compile it down to fast iteration over bytes.