diff --git a/.gitignore b/.gitignore index d9c4a7972f076d..ef7642b09bc5d2 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,10 @@ *.gc?? *.profclang? *.profraw +# Copies of binaries before BOLT optimizations. +*.prebolt +# BOLT profile data. +*.fdata *.dyn .gdb_history .purify @@ -124,6 +128,7 @@ Tools/unicode/data/ /platform /profile-clean-stamp /profile-run-stamp +/profile-bolt-stamp /Python/deepfreeze/*.c /pybuilddir.txt /pyconfig.h diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index ce858ab2c8d79e..fbe280d6413170 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -314,6 +314,13 @@ also be used to improve performance. is dependent on a combination of the build environment + the other optimization configure args + the CPU architecture, and not all combinations are supported. + BOLT versions before LLVM 16 are known to crash BOLT under some scenarios. + Use of LLVM 16 or newer for BOLT optimization is strongly encouraged. + + The :envvar:`!BOLT_INSTRUMENT_FLAGS` and :envvar:`!BOLT_APPLY_FLAGS` + :program:`configure` variables can be defined to override the default set of + arguments for :program:`llvm-bolt` to instrument and apply BOLT data to + binaries, respectively. .. versionadded:: 3.12 diff --git a/Makefile.pre.in b/Makefile.pre.in index da3a8f6c13f90b..eb79c9c4ca1801 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -672,21 +672,55 @@ profile-opt: profile-run-stamp -rm -f profile-clean-stamp $(MAKE) @DEF_MAKE_RULE@ CFLAGS_NODIST="$(CFLAGS_NODIST) $(PGO_PROF_USE_FLAG)" LDFLAGS_NODIST="$(LDFLAGS_NODIST)" -.PHONY: bolt-opt -bolt-opt: @PREBOLT_RULE@ +# List of binaries that BOLT runs on. +BOLT_BINARIES := @BOLT_BINARIES@ + +BOLT_INSTRUMENT_FLAGS := @BOLT_INSTRUMENT_FLAGS@ +BOLT_APPLY_FLAGS := @BOLT_APPLY_FLAGS@ + +.PHONY: clean-bolt +clean-bolt: + # Profile data. rm -f *.fdata - @if $(READELF) -p .note.bolt_info $(BUILDPYTHON) | grep BOLT > /dev/null; then\ - echo "skip: $(BUILDPYTHON) is already BOLTed."; \ - else \ - @LLVM_BOLT@ ./$(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst; \ - ./$(BUILDPYTHON).bolt_inst $(PROFILE_TASK) || true; \ - @MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata; \ - @LLVM_BOLT@ ./$(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=none -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot; \ - rm -f *.fdata; \ - rm -f $(BUILDPYTHON).bolt_inst; \ - mv $(BUILDPYTHON).bolt $(BUILDPYTHON); \ - fi + # Pristine binaries before BOLT optimization. + rm -f *.prebolt + # BOLT instrumented binaries. + rm -f *.bolt_inst + +profile-bolt-stamp: $(BUILDPYTHON) + # Ensure a pristine, pre-BOLT copy of the binary and no profile data from last run. + for bin in $(BOLT_BINARIES); do \ + prebolt="$${bin}.prebolt"; \ + if [ -e "$${prebolt}" ]; then \ + echo "Restoring pre-BOLT binary $${prebolt}"; \ + mv "$${bin}.prebolt" "$${bin}"; \ + fi; \ + cp "$${bin}" "$${prebolt}"; \ + rm -f $${bin}.bolt.*.fdata $${bin}.fdata; \ + done + # Instrument each binary. + for bin in $(BOLT_BINARIES); do \ + @LLVM_BOLT@ "$${bin}" -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $${bin}.bolt) -o $${bin}.bolt_inst $(BOLT_INSTRUMENT_FLAGS); \ + mv "$${bin}.bolt_inst" "$${bin}"; \ + done + # Run instrumented binaries to collect data. + $(RUNSHARED) ./$(BUILDPYTHON) $(PROFILE_TASK) || true + # Merge all the data files together. + for bin in $(BOLT_BINARIES); do \ + @MERGE_FDATA@ $${bin}.*.fdata > "$${bin}.fdata"; \ + rm -f $${bin}.*.fdata; \ + done + # Run bolt against the merged data to produce an optimized binary. + for bin in $(BOLT_BINARIES); do \ + @LLVM_BOLT@ "$${bin}.prebolt" -o "$${bin}.bolt" -data="$${bin}.fdata" $(BOLT_APPLY_FLAGS); \ + mv "$${bin}.bolt" "$${bin}"; \ + done + touch $@ +.PHONY: bolt-opt +bolt-opt: + $(MAKE) @PREBOLT_RULE@ + $(MAKE) profile-bolt-stamp # Compile and run with gcov .PHONY: coverage @@ -2623,10 +2657,11 @@ profile-removal: rm -f $(COVERAGE_INFO) rm -rf $(COVERAGE_REPORT) rm -f profile-run-stamp + rm -f profile-bolt-stamp .PHONY: clean -clean: clean-retain-profile - @if test @DEF_MAKE_ALL_RULE@ = profile-opt; then \ +clean: clean-retain-profile clean-bolt + @if test @DEF_MAKE_ALL_RULE@ = profile-opt -o @DEF_MAKE_ALL_RULE@ = bolt-opt; then \ rm -f profile-gen-stamp profile-clean-stamp; \ $(MAKE) profile-removal; \ fi diff --git a/Misc/NEWS.d/next/Build/2023-05-20-16-09-59.gh-issue-101282.FvRARb.rst b/Misc/NEWS.d/next/Build/2023-05-20-16-09-59.gh-issue-101282.FvRARb.rst new file mode 100644 index 00000000000000..cc70d47c6c16dd --- /dev/null +++ b/Misc/NEWS.d/next/Build/2023-05-20-16-09-59.gh-issue-101282.FvRARb.rst @@ -0,0 +1,4 @@ +BOLT optimization is now applied to the libpython shared library if building +a shared library. BOLT instrumentation and application settings can now be +influenced via the ``BOLT_INSTRUMENT_FLAGS`` and ``BOLT_APPLY_FLAGS`` +configure variables. diff --git a/configure b/configure index 7aad4fe89e3cbf..2b863be108be26 100755 --- a/configure +++ b/configure @@ -883,10 +883,11 @@ CFLAGS_NODIST BASECFLAGS CFLAGS_ALIASING OPT +BOLT_APPLY_FLAGS +BOLT_INSTRUMENT_FLAGS +BOLT_BINARIES MERGE_FDATA LLVM_BOLT -ac_ct_READELF -READELF PREBOLT_RULE LLVM_PROF_FOUND LLVM_PROFDATA @@ -1105,6 +1106,8 @@ CPPFLAGS CPP HOSTRUNNER PROFILE_TASK +BOLT_INSTRUMENT_FLAGS +BOLT_APPLY_FLAGS LIBUUID_CFLAGS LIBUUID_LIBS LIBFFI_CFLAGS @@ -1916,6 +1919,10 @@ Some influential environment variables: HOSTRUNNER Program to run CPython for the host platform PROFILE_TASK Python args for PGO generation task + BOLT_INSTRUMENT_FLAGS + Arguments to llvm-bolt when instrumenting binaries + BOLT_APPLY_FLAGS + Arguments to llvm-bolt when creating a BOLT optimized binary LIBUUID_CFLAGS C compiler flags for LIBUUID, overriding pkg-config LIBUUID_LIBS @@ -8106,112 +8113,6 @@ if test "$Py_BOLT" = 'true' ; then DEF_MAKE_ALL_RULE="bolt-opt" DEF_MAKE_RULE="build_all" - - if test -n "$ac_tool_prefix"; then - for ac_prog in readelf - do - # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_READELF+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$READELF"; then - ac_cv_prog_READELF="$READELF" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_READELF="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -READELF=$ac_cv_prog_READELF -if test -n "$READELF"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $READELF" >&5 -$as_echo "$READELF" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$READELF" && break - done -fi -if test -z "$READELF"; then - ac_ct_READELF=$READELF - for ac_prog in readelf -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_READELF+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_READELF"; then - ac_cv_prog_ac_ct_READELF="$ac_ct_READELF" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ac_ct_READELF="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -ac_ct_READELF=$ac_cv_prog_ac_ct_READELF -if test -n "$ac_ct_READELF"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_READELF" >&5 -$as_echo "$ac_ct_READELF" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$ac_ct_READELF" && break -done - - if test "x$ac_ct_READELF" = x; then - READELF=""notfound"" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - READELF=$ac_ct_READELF - fi -fi - - if test "$READELF" == "notfound" - then - as_fn_error $? "readelf is required for a --enable-bolt build but could not be found." "$LINENO" 5 - fi - # -fno-reorder-blocks-and-partition is required for bolt to work. # Possibly GCC only. { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -fno-reorder-blocks-and-partition" >&5 @@ -8474,6 +8375,36 @@ $as_echo "\"Found merge-fdata\"" >&6; } fi fi + +BOLT_BINARIES='$(BUILDPYTHON)' +if test "x$enable_shared" = xyes; then : + + BOLT_BINARIES="${BOLT_BINARIES} \$(INSTSONAME)" + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking BOLT_INSTRUMENT_FLAGS" >&5 +$as_echo_n "checking BOLT_INSTRUMENT_FLAGS... " >&6; } +if test -z "${BOLT_INSTRUMENT_FLAGS}" +then + BOLT_INSTRUMENT_FLAGS= +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $BOLT_INSTRUMENT_FLAGS" >&5 +$as_echo "$BOLT_INSTRUMENT_FLAGS" >&6; } + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking BOLT_APPLY_FLAGS" >&5 +$as_echo_n "checking BOLT_APPLY_FLAGS... " >&6; } +if test -z "${BOLT_APPLY_FLAGS}" +then + BOLT_APPLY_FLAGS=-update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=none -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot + + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $BOLT_APPLY_FLAGS" >&5 +$as_echo "$BOLT_APPLY_FLAGS" >&6; } + # XXX Shouldn't the code above that fiddles with BASECFLAGS and OPT be # merged with this chunk of code? diff --git a/configure.ac b/configure.ac index 115998e0753b26..786d3414eb0e06 100644 --- a/configure.ac +++ b/configure.ac @@ -2028,13 +2028,6 @@ if test "$Py_BOLT" = 'true' ; then DEF_MAKE_ALL_RULE="bolt-opt" DEF_MAKE_RULE="build_all" - AC_SUBST(READELF) - AC_CHECK_TOOLS(READELF, [readelf], "notfound") - if test "$READELF" == "notfound" - then - AC_MSG_ERROR([readelf is required for a --enable-bolt build but could not be found.]) - fi - # -fno-reorder-blocks-and-partition is required for bolt to work. # Possibly GCC only. AX_CHECK_COMPILE_FLAG([-fno-reorder-blocks-and-partition],[ @@ -2067,6 +2060,54 @@ if test "$Py_BOLT" = 'true' ; then fi fi +dnl Enable BOLT of libpython if built. +AC_SUBST(BOLT_BINARIES) +BOLT_BINARIES='$(BUILDPYTHON)' +AS_VAR_IF([enable_shared], [yes], [ + BOLT_BINARIES="${BOLT_BINARIES} \$(INSTSONAME)" +]) + +AC_ARG_VAR( + [BOLT_INSTRUMENT_FLAGS], + [Arguments to llvm-bolt when instrumenting binaries] +) +AC_MSG_CHECKING([BOLT_INSTRUMENT_FLAGS]) +if test -z "${BOLT_INSTRUMENT_FLAGS}" +then + BOLT_INSTRUMENT_FLAGS= +fi +AC_MSG_RESULT([$BOLT_INSTRUMENT_FLAGS]) + +AC_ARG_VAR( + [BOLT_APPLY_FLAGS], + [Arguments to llvm-bolt when creating a BOLT optimized binary] +) +AC_MSG_CHECKING([BOLT_APPLY_FLAGS]) +if test -z "${BOLT_APPLY_FLAGS}" +then + AS_VAR_SET( + [BOLT_APPLY_FLAGS], + [m4_join([ ], + [-update-debug-sections], + [-reorder-blocks=ext-tsp], + [-reorder-functions=hfsort+], + [-split-functions], + [-icf=1], + [-inline-all], + [-split-eh], + [-reorder-functions-use-hot-size], + [-peepholes=none], + [-jump-tables=aggressive], + [-inline-ap], + [-indirect-call-promotion=all], + [-dyno-stats], + [-use-gnu-stack], + [-frame-opt=hot] + )] + ) +fi +AC_MSG_RESULT([$BOLT_APPLY_FLAGS]) + # XXX Shouldn't the code above that fiddles with BASECFLAGS and OPT be # merged with this chunk of code?