From 920c205861b7dd3d98bdad62b88bdf6ba4e85ed9 Mon Sep 17 00:00:00 2001 From: NagyZoltanPeter <113987313+NagyZoltanPeter@users.noreply.github.com> Date: Tue, 11 Jun 2024 05:51:05 +0200 Subject: [PATCH 1/5] Fix for multiple nat module initialization causes dead lock in nat refresh thread --- waku/common/utils/nat.nim | 58 +++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/waku/common/utils/nat.nim b/waku/common/utils/nat.nim index e05a345e62..2304d60851 100644 --- a/waku/common/utils/nat.nim +++ b/waku/common/utils/nat.nim @@ -9,6 +9,15 @@ import chronicles, eth/net/nat, stew/results, nativesockets logScope: topics = "nat" +## Do to the design of nim-eth/nat module we must ensure it is only initialized once. +## see: https://github.com/waku-org/nwaku/issues/2628 +## Details: nim-eth/nat module starts a meaintenance thread for refreshing the NAT mappings, but everything in the module is global, +## there is no room to store multiple configurations. +## Exact mean: redirectPorts cannot be called twice in a program lifetime. +## During waku tests we happen to start several node instances parallel thus result in multiple NAT configurations and multiple threads. +## Those threads will dead lock each other in tear down. +var singletonNat: bool = false + proc setupNat*( natConf, clientId: string, tcpPort, udpPort: Port ): Result[ @@ -26,26 +35,35 @@ proc setupNat*( tuple[ip: Option[IpAddress], tcpPort: Option[Port], udpPort: Option[Port]] if strategy != NatNone: - let extIp = getExternalIP(strategy) - if extIP.isSome(): - endpoint.ip = some(extIp.get()) - # RedirectPorts in considered a gcsafety violation - # because it obtains the address of a non-gcsafe proc? - var extPorts: Option[(Port, Port)] - try: - extPorts = ( - {.gcsafe.}: - redirectPorts(tcpPort = tcpPort, udpPort = udpPort, description = clientId) - ) - except CatchableError: - # TODO: nat.nim Error: can raise an unlisted exception: Exception. Isolate here for now. - error "unable to determine external ports" - extPorts = none((Port, Port)) - - if extPorts.isSome(): - let (extTcpPort, extUdpPort) = extPorts.get() - endpoint.tcpPort = some(extTcpPort) - endpoint.udpPort = some(extUdpPort) + ## Only initialize the NAT module once + ## redirectPorts cannot be called twice in a program lifetime. + ## We can do it as same happens if getExternalIP fails and returns None + if singletonNat: + warn "NAT already initialized, skipping as cannot be done multiple times" + else: + singletonNat = true + let extIp = getExternalIP(strategy) + if extIP.isSome(): + endpoint.ip = some(extIp.get()) + # RedirectPorts in considered a gcsafety violation + # because it obtains the address of a non-gcsafe proc? + var extPorts: Option[(Port, Port)] + try: + extPorts = ( + {.gcsafe.}: + redirectPorts( + tcpPort = tcpPort, udpPort = udpPort, description = clientId + ) + ) + except CatchableError: + # TODO: nat.nim Error: can raise an unlisted exception: Exception. Isolate here for now. + error "unable to determine external ports" + extPorts = none((Port, Port)) + + if extPorts.isSome(): + let (extTcpPort, extUdpPort) = extPorts.get() + endpoint.tcpPort = some(extTcpPort) + endpoint.udpPort = some(extUdpPort) else: # NatNone if not natConf.startsWith("extip:"): return err("not a valid NAT mechanism: " & $natConf) From 09a74f73b862a9f33c2b3254bede7ab798679788 Mon Sep 17 00:00:00 2001 From: NagyZoltanPeter <113987313+NagyZoltanPeter@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:09:54 +0200 Subject: [PATCH 2/5] Update waku/common/utils/nat.nim Co-authored-by: gabrielmer <101006718+gabrielmer@users.noreply.github.com> --- waku/common/utils/nat.nim | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/waku/common/utils/nat.nim b/waku/common/utils/nat.nim index 2304d60851..5835a8e7f0 100644 --- a/waku/common/utils/nat.nim +++ b/waku/common/utils/nat.nim @@ -9,12 +9,12 @@ import chronicles, eth/net/nat, stew/results, nativesockets logScope: topics = "nat" -## Do to the design of nim-eth/nat module we must ensure it is only initialized once. +## Due to the design of nim-eth/nat module we must ensure it is only initialized once. ## see: https://github.com/waku-org/nwaku/issues/2628 ## Details: nim-eth/nat module starts a meaintenance thread for refreshing the NAT mappings, but everything in the module is global, ## there is no room to store multiple configurations. -## Exact mean: redirectPorts cannot be called twice in a program lifetime. -## During waku tests we happen to start several node instances parallel thus result in multiple NAT configurations and multiple threads. +## Exact meaning: redirectPorts cannot be called twice in a program lifetime. +## During waku tests we happen to start several node instances in parallel thus resulting in multiple NAT configurations and multiple threads. ## Those threads will dead lock each other in tear down. var singletonNat: bool = false From 64db37ae41bd4e7ebc2f944dee820c8082ca441a Mon Sep 17 00:00:00 2001 From: NagyZoltanPeter <113987313+NagyZoltanPeter@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:16:53 +0200 Subject: [PATCH 3/5] Try ci with non parallel run --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 770d4824fe..534a0e00ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,9 +11,9 @@ concurrency: cancel-in-progress: true env: - NPROC: 2 - MAKEFLAGS: "-j${NPROC}" - NIMFLAGS: "--parallelBuild:${NPROC} --colors:off -d:chronicles_colors:none" + NPROC: 1 + MAKEFLAGS: "" + NIMFLAGS: "--colors:off -d:chronicles_colors:none" jobs: changes: # changes detection From 787d292da0c93d72053a7633fc2e99d5bda67c75 Mon Sep 17 00:00:00 2001 From: NagyZoltanPeter <113987313+NagyZoltanPeter@users.noreply.github.com> Date: Tue, 11 Jun 2024 15:27:13 +0200 Subject: [PATCH 4/5] Update ci.yml . NPROC to 1 to avoid parallel test runs can lead to timing and port allocation issues --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 534a0e00ae..8ce2de27ac 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,8 +12,8 @@ concurrency: env: NPROC: 1 - MAKEFLAGS: "" - NIMFLAGS: "--colors:off -d:chronicles_colors:none" + MAKEFLAGS: "-j${NPROC}" + NIMFLAGS: "--parallelBuild:${NPROC} --colors:off -d:chronicles_colors:none" jobs: changes: # changes detection From 61be1041da3f269fa8b0dd84af0f75a62dcf5824 Mon Sep 17 00:00:00 2001 From: NagyZoltanPeter <113987313+NagyZoltanPeter@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:19:06 +0200 Subject: [PATCH 5/5] Update ci.yml - revert to parallel builds except for tests. --- .github/workflows/ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8ce2de27ac..025ef880d7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ concurrency: cancel-in-progress: true env: - NPROC: 1 + NPROC: 2 MAKEFLAGS: "-j${NPROC}" NIMFLAGS: "--parallelBuild:${NPROC} --colors:off -d:chronicles_colors:none" @@ -111,15 +111,15 @@ jobs: - name: Run tests run: | - if [ ${{ runner.os }} == "Linux" ]; then - sudo docker run --rm -d -e POSTGRES_PASSWORD=test123 -p 5432:5432 postgres:15.4-alpine3.18 - fi - postgres_enabled=0 if [ ${{ runner.os }} == "Linux" ]; then + sudo docker run --rm -d -e POSTGRES_PASSWORD=test123 -p 5432:5432 postgres:15.4-alpine3.18 postgres_enabled=1 fi + export MAKEFLAGS="-j1" + export NIMFLAGS="--colors:off -d:chronicles_colors:none" + make RLN_V${{matrix.rln_version}}=true V=1 LOG_LEVEL=DEBUG QUICK_AND_DIRTY_COMPILER=1 POSTGRES=$postgres_enabled test testwakunode2 build-docker-image: