From 5cc17647316cf1764448f1858166c69854aba4ee Mon Sep 17 00:00:00 2001 From: Etan Kissling Date: Tue, 16 Jan 2024 00:30:10 +0100 Subject: [PATCH 1/3] workaround random `SIGSEGV` on macOS aarch64 CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Separate a `let` block into multiple `let` statements to reduce probability of hitting random `SIGSEGV` during flaky CI tests. whatever... 🤯 --- beacon_chain/validators/beacon_validators.nim | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/beacon_chain/validators/beacon_validators.nim b/beacon_chain/validators/beacon_validators.nim index 5f60ddfe73..c168af1bd3 100644 --- a/beacon_chain/validators/beacon_validators.nim +++ b/beacon_chain/validators/beacon_validators.nim @@ -1178,6 +1178,85 @@ proc proposeBlockAux( blobsBundle.proofs, blobsBundle.blobs)) else: Opt.none(seq[BlobSidecar]) + + # BIG BUG SOURCE: The `let` below cannot be combined with the others above! + # If combined, there are sometimes `SIGSEGV` during `test_keymanager_api`. + # This has only been observed on macOS (aarch64) in Jenkins, not on GitHub. + # + # - macOS 14.2.1 (23C71) + # - Xcode 15.1 (15C65) + # - Nim v1.6.18 (a749a8b742bd0a4272c26a65517275db4720e58a) + # + # Issue has started occuring around 12 Jan 2024, in a CI run for PR #5731. + # The PR did not change anything related to this, suggesting an environment + # or hardware change. The issue is flaky; could have been introduced earlier + # before surfacing in the aforementioned mentioned PR. About 30% to hit bug. + # + # [2024-01-12T11:54:21.011Z] Wrote test_keymanager_api/bootstrap_node.enr + # [2024-01-12T11:54:29.294Z] Serialization/deserialization [Beacon Node] [Preset: mainnet] . (0.00s) + # [2024-01-12T11:54:29.294Z] ListKeys requests [Beacon Node] [Preset: mainnet] .... (0.01s) + # [2024-01-12T11:54:34.870Z] ImportKeystores requests [Beacon Node] [Preset: mainnet] Traceback (most recent call last, using override) + # [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(1016) main + # [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(1006) NimMain + # [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(997) PreMain + # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1502) atmtest_keymanager_apidotnim_Init000 + # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1475) main + # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue + # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1481) main + # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(307) startBeaconNode + # [2024-01-12T11:54:34.870Z] beacon_chain/nimbus_beacon_node.nim(1900) start + # [2024-01-12T11:54:34.870Z] beacon_chain/nimbus_beacon_node.nim(1847) run + # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll + # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue + # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1465) delayedTests + # [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(392) runTests + # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue + # [2024-01-12T11:54:34.870Z] vendor/nim-unittest2/unittest2.nim(1147) runTests + # [2024-01-12T11:54:34.870Z] vendor/nim-unittest2/unittest2.nim(1086) runDirect + # [2024-01-12T11:54:34.870Z] vendor/nim-testutils/testutils/unittests.nim(16) runTestX60gensym2933 + # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(656) waitFor + # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(631) pollFor + # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll + # [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue + # [2024-01-12T11:54:34.870Z] beacon_chain/validators/beacon_validators.nim(82) proposeBlockAux + # [2024-01-12T11:54:34.870Z] vendor/nimbus-build-system/vendor/Nim/lib/system/excpt.nim(631) signalHandler + # [2024-01-12T11:54:34.870Z] SIGSEGV: Illegal storage access. (Attempt to read from nil?) + # + # The generated `nimcache` differs slightly if the `let` are separated from + # a single block; separation introduces an additional state in closure iter. + # This change, maybe combined with some macOS specific compiler specifics, + # could this trigger the `SIGSEGV`? Maybe the extra state adds just enough + # complexity to the function to disable certain problematic optimizations? + # Furthermore, the combination of `(await xyz).valueOr: return` is not very + # commonly used together with other `await` in the same `let` block, which + # could explain why this is not a more common trigger across Nimbus. + # + # Note that when compiling for Wasm, there are similar bugs with `results` + # when inlining unwraps, e.g., in `eth2_rest_serialization.nim`. + # These have not been investigated thoroughly so far as that project uses + # Nim 2.0 with --mm:orc and is just a prototype for Wasm, no production use. + # But maybe there is something weird going on with `results` related to the + # random `SIGSEGV` that we are now observing here, related to doing too much + # inline logic without defining intermediate isolated `let` statements. + # + # if mediaType == ApplicationJsonMediaType: + # try: + # - ok RestJson.decode(value, T, + # - requireAllFields = true, + # - allowUnknownFields = true) + # + let r = RestJson.decode(value, T, + # + requireAllFields = true, + # + allowUnknownFields = true) + # + ok r + # except SerializationError as exc: + # debug "Failed to deserialize REST JSON data", + # err = exc.formatMsg(""), + # + # At this time we can only speculate about the trigger of these issues. + # Until a shared pattern can be identified, it is better to apply + # workarounds that at least avoid the known to be reachable triggers. + # The solution is hacky and far from desirable; it is what it is. + let newBlockRef = ( await node.router.routeSignedBeaconBlock(signedBlock, blobsOpt) ).valueOr: From 0fd88e9d812992393b4a01ab1454f1d78a13f80b Mon Sep 17 00:00:00 2001 From: Etan Kissling Date: Tue, 16 Jan 2024 11:36:54 +0100 Subject: [PATCH 2/3] Update beacon_chain/validators/beacon_validators.nim --- beacon_chain/validators/beacon_validators.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_chain/validators/beacon_validators.nim b/beacon_chain/validators/beacon_validators.nim index c168af1bd3..1301c8cb80 100644 --- a/beacon_chain/validators/beacon_validators.nim +++ b/beacon_chain/validators/beacon_validators.nim @@ -1190,7 +1190,7 @@ proc proposeBlockAux( # Issue has started occuring around 12 Jan 2024, in a CI run for PR #5731. # The PR did not change anything related to this, suggesting an environment # or hardware change. The issue is flaky; could have been introduced earlier - # before surfacing in the aforementioned mentioned PR. About 30% to hit bug. + # before surfacing in the aforementioned PR. About 30% to hit bug. # # [2024-01-12T11:54:21.011Z] Wrote test_keymanager_api/bootstrap_node.enr # [2024-01-12T11:54:29.294Z] Serialization/deserialization [Beacon Node] [Preset: mainnet] . (0.00s) From 970e965cff87830219b6cbe1c3341cb7aa93baf9 Mon Sep 17 00:00:00 2001 From: Etan Kissling Date: Tue, 16 Jan 2024 11:46:48 +0100 Subject: [PATCH 3/3] Update beacon_chain/validators/beacon_validators.nim --- beacon_chain/validators/beacon_validators.nim | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/beacon_chain/validators/beacon_validators.nim b/beacon_chain/validators/beacon_validators.nim index 1301c8cb80..f418ac27f8 100644 --- a/beacon_chain/validators/beacon_validators.nim +++ b/beacon_chain/validators/beacon_validators.nim @@ -1227,9 +1227,12 @@ proc proposeBlockAux( # This change, maybe combined with some macOS specific compiler specifics, # could this trigger the `SIGSEGV`? Maybe the extra state adds just enough # complexity to the function to disable certain problematic optimizations? - # Furthermore, the combination of `(await xyz).valueOr: return` is not very - # commonly used together with other `await` in the same `let` block, which - # could explain why this is not a more common trigger across Nimbus. + # The change in size of the environment changes a number of things such as + # alignment and which parts of an environment contain pointers and so on, + # which in turn may have surprising behavioural effects, ie most likely this + # extra state masks some underlying issue. Furthermore, the combination of + # `(await xyz).valueOr: return` is not very commonly used with other `await` + # in the same `let` block, which could explain this not being more common. # # Note that when compiling for Wasm, there are similar bugs with `results` # when inlining unwraps, e.g., in `eth2_rest_serialization.nim`.