Reboot EVAL_CTORS with the new wasm-ctor-eval (#16011)

This updates us to use Binaryen's new version of wasm-ctor-eval, which can now do a lot more things, like eval just part of a function, eval to globals, etc. That plus other changes on the emscripten side that move more things like sbrk into pure wasm means that we can eval a lot more code.

Previously -Oz would enable EVAL_CTORS. That was pretty dangerous as often it does not help code size. You really just need to run with the option and then measure the code size change vs the startup speed improvement. So this PR makes us no longer do anything automatically - you must manually build with -s EVAL_CTORS.

A new mode EVAL_CTORS=2 is also added. This enables wasm-ctor-eval's new --ignore-external-input flag, which ignores the environment, params to main, etc. This is unsafe, and probably we should have separate options for these things, but for now this seems useful for experimentation.

Tested by running all of wasm2 with EVAL_CTORS=2 enabled, and then ignoring the failures that are expected (things reading from argv, for example). Also I ran around 200,000 fuzzer iterations on binaryen.

Example results on ./emcc tests/hello_libcxx.cpp -O3:

    mode          | wasm size (bytes)
    --------------+------------------
    normal        | 136625
    EVAL_CTORS-1  | 136616
    EVAL_CTORS-2  | 133059

The output on the last one is:

    trying to eval __wasm_call_ctors
      ...success on __wasm_call_ctors.
    trying to eval main
      ...partial evalling successful, but stopping since could not eval: call import: wasi_snapshot_preview1.fd_write
      ...stopping

It completely evals the ctors, and in main it evals some stuff, until it reaches a call to print to stdout.

Fixes #15402
emscripten-core · Jan 14, 2022 · 7240c3d · 7240c3d
1 parent c618ee3
commit 7240c3d
Show file tree

Hide file tree

Showing 27 changed files with 327 additions and 351 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -20,6 +20,11 @@ See docs/process.md for more on how version tagging works.
 
 3.1.2
 -----
+- `EVAL_CTORS` has been rewritten and improved. The main differences from before
+  are that it is much more capable (it can now eval parts of functions and not
+  just all or nothing, and it can eval more wasm constructs like globals). It is
+  no longer run by default, so to use it you should build with `-s EVAL_CTORS`.
+  See `settings.js` for more details. (#16011)
 - `wasmX` test suites that are defined in `test_core.py` have been renamed to
   `coreX` to better reflect where they are defined.  The old suite names such
   as `wasm2` will continue to work for now as aliases.

diff --git a/emcc.py b/emcc.py
@@ -1440,8 +1440,6 @@ def default_setting(name, new_default):
 
   if settings.OPT_LEVEL >= 1:
     default_setting('ASSERTIONS', 0)
-  if settings.SHRINK_LEVEL >= 2:
-    default_setting('EVAL_CTORS', 1)
 
   if options.emrun:
     options.pre_js.append(utils.path_from_root('src/emrun_prejs.js'))
@@ -2165,6 +2163,19 @@ def check_memory_setting(setting):
   if settings.SINGLE_FILE:
     settings.GENERATE_SOURCE_MAP = 0
 
+  if settings.EVAL_CTORS:
+    if settings.WASM2JS:
+      # code size/memory and correctness issues TODO
+      exit_with_error('EVAL_CTORS is not compatible with wasm2js yet')
+    elif settings.USE_PTHREADS:
+      exit_with_error('EVAL_CTORS is not compatible with pthreads yet (passive segments)')
+    elif settings.RELOCATABLE:
+      exit_with_error('EVAL_CTORS is not compatible with relocatable yet (movable segments)')
+    elif settings.ASYNCIFY:
+      # In Asyncify exports can be called more than once, and this seems to not
+      # work properly yet (see test_emscripten_scan_registers).
+      exit_with_error('EVAL_CTORS is not compatible with asyncify yet')
+
   if options.use_closure_compiler == 2 and not settings.WASM2JS:
     exit_with_error('closure compiler mode 2 assumes the code is asm.js, so not meaningful for wasm')
 

diff --git a/emscripten.py b/emscripten.py
@@ -35,14 +35,12 @@
 
 logger = logging.getLogger('emscripten')
 
-WASM_INIT_FUNC = '__wasm_call_ctors'
-
 
 def compute_minimal_runtime_initializer_and_exports(post, exports, receiving):
   # Declare all exports out to global JS scope so that JS library functions can access them in a
   # way that minifies well with Closure
   # e.g. var a,b,c,d,e,f;
-  exports_that_are_not_initializers = [x for x in exports if x not in WASM_INIT_FUNC]
+  exports_that_are_not_initializers = [x for x in exports if x not in building.WASM_CALL_CTORS]
   # In Wasm backend the exports are still unmangled at this point, so mangle the names here
   exports_that_are_not_initializers = [asmjs_mangle(x) for x in exports_that_are_not_initializers]
 
@@ -780,7 +778,7 @@ def create_receiving(exports):
   if not settings.DECLARE_ASM_MODULE_EXPORTS:
     return ''
 
-  exports_that_are_not_initializers = [x for x in exports if x != WASM_INIT_FUNC]
+  exports_that_are_not_initializers = [x for x in exports if x != building.WASM_CALL_CTORS]
 
   receiving = []
 

diff --git a/site/source/docs/optimizing/Optimizing-Code.rst b/site/source/docs/optimizing/Optimizing-Code.rst
@@ -119,6 +119,59 @@ linker can handle a mix wasm object files and LTO object files.  Passing
 Thus, to allow maximal LTO opportunities with the LLVM wasm backend, build all
 source files with ``-flto`` and also link with ``-flto``.
 
+EVAL_CTORS
+==========
+
+Building with ``-sEVAL_CTORS`` will evaluate as much code as possible at
+compile time. That includes both the "global ctor" functions (functions LLVM
+emits that run before ``main()``) as well as ``main()`` itself. As much as can
+be evaluated will be, and the resulting state is then "snapshotted" into the
+wasm. Then when the program is run it will begin from that state, and not need
+to execute that code, which can save time.
+
+This optimization can either reduce or increase code size. If a small amount
+of code generates many changes in memory, for example, then overall size may
+increase. It is best to build with this flag and then measure code and startup
+speed and see if the tradeoff is worthwhile in your program.
+
+You can make an effort to write EVAL_CTORS-friendly code, by deferring things
+that cannot be evalled as much as possible. For example, calls to imports stop
+this optimization, and so if you have a game engine that creates a GL context
+and then does some pure computation to set up unrelated data structures in
+memory, then you could reverse that order. Then the pure computation could run
+first, and be evalled away, and the GL context creation call to an import would
+not prevent that. Other things you can do are to avoid using ``argc/argv``, to
+avoid using ``getenv()``, and so forth.
+
+Logging is shown when using this option so that you can see whether things can
+be improved. Here is an example of output from ``emcc -sEVAL_CTORS``:
+
+::
+
+  trying to eval __wasm_call_ctors
+    ...partial evalling successful, but stopping since could not eval: call import: wasi_snapshot_preview1.environ_sizes_get
+         recommendation: consider --ignore-external-input
+    ...stopping
+
+The first line indicates an attempt to eval LLVM's function that runs global
+ctors. It evalled some of the function but then it stopped on the WASI import
+``environ_sizes_get``, which means it is trying to read from the environment.
+As the output says, you can tell ``EVAL_CTORS`` to ignore external input, which
+will ignore such things. You can enable that with mode ``2``, that is, build
+with ``emcc -sEVAL_CTORS=2``:
+
+::
+
+  trying to eval __wasm_call_ctors
+    ...success on __wasm_call_ctors.
+  trying to eval main
+    ...stopping (in block) since could not eval: call import: wasi_snapshot_preview1.fd_write
+    ...stopping
+
+Now it has succeeded in evalling ``__wasm_call_ctors`` completely. It then moved on
+to ``main``, where it stopped because of a call to WASI's ``fd_write``, that is,
+a call to print something.
+
 Very large codebases
 ====================
 

diff --git a/src/settings.js b/src/settings.js
@@ -1535,40 +1535,40 @@ var ALLOW_BLOCKING_ON_MAIN_THREAD = 1;
 // [link]
 var PTHREADS_DEBUG = 0;
 
-// This tries to evaluate global ctors at compile-time, applying their effects
-// into the mem init file. This saves running code during startup, and also
-// allows removing the global ctor functions and other code that only they used,
-// so this is also good for reducing code size. However, this does make the
-// compile step much slower.
-//
-// This basically runs the ctors during compile time, seeing if they execute
-// safely in a sandbox. Any ffi access out of wasm causes failure, as it could
-// do something nondeterministic and/or alter some other state we don't see. If
-// all the global ctor does is pure computation inside wasm, it should be ok.
-// Run with EMCC_DEBUG=1 in the env to see logging, and errors when it fails to
-// eval (you'll see a message, or a stack trace; in the latter case, the
-// functions on the stack should give you an idea of what ffi was called and
-// why, and perhaps you can refactor your code to avoid it, e.g., remove
-// mallocs, printfs in global ctors).
-//
-// This optimization can increase the size of the mem init file, because ctors
-// can write to memory that would otherwise be in a zeroinit area. This may not
-// be a significant increase after gzip, if there are mostly zeros in there, and
-// in any case the mem init increase would be offset by a code size decrease.
-// (Unless you have a small ctor that writes 'random' data to memory, which
-// would reduce little code but add potentially lots of uncompressible data.)
+// This tries to evaluate code at compile time. The main use case is to eval
+// global ctor functions, which are those that run before main(), but main()
+// itself or parts of it can also be evalled. Evaluating code this way can avoid
+// work at runtime, as it applies the results of the execution to memory and
+// globals and so forth, "snapshotting" the wasm and then just running it from
+// there when it is loaded.
+//
+// This will stop when it sees something it cannot eval at compile time, like a
+// call to an import. When running with this option you will see logging that
+// indicates what is evalled and where it stops.
+//
+// This optimization can either reduce or increase code size. If a small amount
+// of code generates many changes in memory, for example, then overall size may
+// increase.
 //
 // LLVM's GlobalOpt *almost* does this operation. It does in simple cases, where
 // LLVM IR is not too complex for its logic to evaluate, but it isn't powerful
 // enough for e.g. libc++ iostream ctors. It is just hard to do at the LLVM IR
-// level - LLVM IR is complex and getting more complex, this would require
+// level - LLVM IR is complex and getting more complex, so this would require
 // GlobalOpt to have a full interpreter, plus a way to write back into LLVM IR
 // global objects.  At the wasm level, however, everything has been lowered
 // into a simple low level, and we also just need to write bytes into an array,
-// so this is easy for us to do, but not for LLVM. A further issue for LLVM is
-// that it doesn't know that we will not link in further code, so it only tries
-// to optimize ctors with lowest priority. We do know that, and can optimize all
-// the ctors.
+// so this is easy for us to do. A further issue for LLVM is that it doesn't
+// know that we will not link in further code, so it only tries to optimize
+// ctors with lowest priority (while we do know explicitly if dynamic linking is
+// enabled or not).
+//
+// If set to a value of 2, this also makes some "unsafe" assumptions,
+// specifically that there is no input received while evalling ctors. That means
+// we ignore args to main() as well as assume no environment vars are readable.
+// This allows more programs to be optimized, but you need to make sure your
+// program does not depend on those features - even just checking the value of
+// argc can lead to problems.
+//
 // [link]
 var EVAL_CTORS = 0;
 

diff --git a/tests/common.py b/tests/common.py
@@ -642,6 +642,11 @@ def is_exported_in_wasm(self, name, wasm):
     wat = self.get_wasm_text(wasm)
     return ('(export "%s"' % name) in wat
 
+  def measure_wasm_code_lines(self, wasm):
+    wat_lines = self.get_wasm_text(wasm).splitlines()
+    non_data_lines = [line for line in wat_lines if '(data ' not in line]
+    return len(non_data_lines)
+
   def run_js(self, filename, engine=None, args=[],
              output_nicerizer=None,
              assert_returncode=0,

diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.exports b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.exports
@@ -0,0 +1,13 @@
+__errno_location
+__indirect_function_table
+__wasm_call_ctors
+dynCall_iiiiiijj
+dynCall_iiiiij
+dynCall_iiiiijj
+dynCall_jiji
+dynCall_viijii
+main
+memory
+stackAlloc
+stackRestore
+stackSave
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.imports b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.imports
@@ -0,0 +1,11 @@
+env.abort
+env.emscripten_memcpy_big
+env.emscripten_resize_heap
+env.setTempRet0
+env.strftime_l
+wasi_snapshot_preview1.environ_get
+wasi_snapshot_preview1.environ_sizes_get
+wasi_snapshot_preview1.fd_close
+wasi_snapshot_preview1.fd_read
+wasi_snapshot_preview1.fd_seek
+wasi_snapshot_preview1.fd_write
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.jssize b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.jssize
@@ -0,0 +1 @@
+98089
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.sent b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.sent
@@ -0,0 +1,11 @@
+abort
+emscripten_memcpy_big
+emscripten_resize_heap
+environ_get
+environ_sizes_get
+fd_close
+fd_read
+fd_seek
+fd_write
+setTempRet0
+strftime_l
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.size b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS.size
@@ -0,0 +1 @@
+124645
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.exports b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.exports
@@ -0,0 +1,12 @@
+__errno_location
+__indirect_function_table
+dynCall_iiiiiijj
+dynCall_iiiiij
+dynCall_iiiiijj
+dynCall_jiji
+dynCall_viijii
+main
+memory
+stackAlloc
+stackRestore
+stackSave
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.imports b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.imports
@@ -0,0 +1,9 @@
+env.abort
+env.emscripten_memcpy_big
+env.emscripten_resize_heap
+env.setTempRet0
+env.strftime_l
+wasi_snapshot_preview1.fd_close
+wasi_snapshot_preview1.fd_read
+wasi_snapshot_preview1.fd_seek
+wasi_snapshot_preview1.fd_write
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.jssize b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.jssize
@@ -0,0 +1 @@
+97987
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.sent b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.sent
@@ -0,0 +1,11 @@
+abort
+emscripten_memcpy_big
+emscripten_resize_heap
+environ_get
+environ_sizes_get
+fd_close
+fd_read
+fd_seek
+fd_write
+setTempRet0
+strftime_l
diff --git a/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.size b/tests/other/metadce/hello_libcxx_O2_EVAL_CTORS_2.size
@@ -0,0 +1 @@
+122060
diff --git a/tests/other/metadce/minimal_Oz_EVAL_CTORS.exports b/tests/other/metadce/minimal_Oz_EVAL_CTORS.exports
@@ -0,0 +1,3 @@
+a
+b
+c
diff --git a/tests/other/metadce/minimal_Oz_EVAL_CTORS.funcs b/tests/other/metadce/minimal_Oz_EVAL_CTORS.funcs
@@ -0,0 +1 @@
+$add
diff --git a/tests/other/metadce/minimal_Oz_EVAL_CTORS.imports b/tests/other/metadce/minimal_Oz_EVAL_CTORS.imports
@@ -0,0 +1 @@
+
diff --git a/tests/other/metadce/minimal_Oz_EVAL_CTORS.jssize b/tests/other/metadce/minimal_Oz_EVAL_CTORS.jssize
@@ -0,0 +1 @@
+11845
diff --git a/tests/other/metadce/minimal_Oz_EVAL_CTORS.sent b/tests/other/metadce/minimal_Oz_EVAL_CTORS.sent
@@ -0,0 +1 @@
+
diff --git a/tests/other/metadce/minimal_Oz_EVAL_CTORS.size b/tests/other/metadce/minimal_Oz_EVAL_CTORS.size
@@ -0,0 +1 @@
+62
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
@@ -379,6 +379,7 @@ def cleanup(self):
     benchmarkers += [
       EmscriptenBenchmarker(default_v8_name, aot_v8),
       EmscriptenBenchmarker(default_v8_name + '-lto', aot_v8, ['-flto']),
+      EmscriptenBenchmarker(default_v8_name + '-ctors', aot_v8, ['-sEVAL_CTORS']),
       # EmscriptenWasm2CBenchmarker('wasm2c')
     ]
   if os.path.exists(CHEERP_BIN):
-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    a
+    b
+    c