From 51fea953d670d333849a78ad2485d410017036bc Mon Sep 17 00:00:00 2001 From: Javier Sagredo Date: Tue, 30 Nov 2021 14:44:27 +0100 Subject: [PATCH] Fix: don't modify clean state on startup --- .../src/Ouroboros/Consensus/Node.hs | 124 ++++++++---------- .../src/Ouroboros/Consensus/Node/Recovery.hs | 88 ++++++++++--- 2 files changed, 124 insertions(+), 88 deletions(-) diff --git a/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs b/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs index f76dc36891e..7c655a3bb54 100644 --- a/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs +++ b/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs @@ -52,7 +52,6 @@ module Ouroboros.Consensus.Node ( ) where import Codec.Serialise (DeserialiseFailure) -import Control.Monad (when) import Control.Tracer (Tracer, contramap) import Data.ByteString.Lazy (ByteString) import Data.Hashable (Hashable) @@ -183,8 +182,14 @@ data LowLevelRunNodeArgs m addrNTN addrNTC versionDataNTN versionDataNTC blk (p2p :: Diffusion.P2P) = LowLevelRunNodeArgs { - -- | How to manage the clean-shutdown marker on disk - llrnWithCheckedDB :: forall a. (LastShutDownWasClean -> m a) -> m a + -- | An action that will receive a marker indicating whether the previous + -- shutdown was considered clean and a wrapper for installing a handler to + -- create a clean file on exit if needed. See + -- 'Ouroboros.Consensus.Node.Recovery.runWithCheckedDB'. + llrnWithCheckedDB :: forall a. ( LastShutDownWasClean + -> (ChainDB m blk -> m a -> m a) + -> m a) + -> m a -- | The " static " ChainDB arguments , llrnChainDbArgsDefaults :: ChainDbArgs Defaults m blk @@ -277,9 +282,8 @@ runWith :: forall m addrNTN addrNTC versionDataNTN versionDataNTC blk p2p. -> m () runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} = - llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) -> + llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) continueWithCleanChainDB -> withRegistry $ \registry -> do - let systemStart :: SystemStart systemStart = getSystemStart (configBlock cfg) @@ -311,48 +315,49 @@ runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} = chainDB <- openChainDB registry inFuture cfg initLedger llrnChainDbArgsDefaults customiseChainDbArgs' - btime <- - hardForkBlockchainTime $ - llrnCustomiseHardForkBlockchainTimeArgs $ - HardForkBlockchainTimeArgs - { hfbtBackoffDelay = pure $ BackoffDelay 60 - , hfbtGetLedgerState = - ledgerState <$> ChainDB.getCurrentLedger chainDB - , hfbtLedgerConfig = configLedger cfg - , hfbtRegistry = registry - , hfbtSystemTime = systemTime - , hfbtTracer = - contramap (fmap (fromRelativeTime systemStart)) - (blockchainTimeTracer rnTraceConsensus) - , hfbtMaxClockRewind = secondsToNominalDiffTime 20 - } - - nodeKernelArgs <- - fmap (nodeKernelArgsEnforceInvariants . 
llrnCustomiseNodeKernelArgs) $ - mkNodeKernelArgs - registry - llrnBfcSalt - llrnKeepAliveRng - cfg - blockForging - rnTraceConsensus - btime - chainDB - nodeKernel <- initNodeKernel nodeKernelArgs - rnNodeKernelHook registry nodeKernel - - peerMetrics <- newPeerMetric - let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics - ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel - (apps, appsExtra) = mkDiffusionApplications - rnEnableP2P - (miniProtocolParameters nodeKernelArgs) - ntnApps - ntcApps - nodeKernel - peerMetrics - - llrnRunDataDiffusion registry apps appsExtra + continueWithCleanChainDB chainDB $ do + btime <- + hardForkBlockchainTime $ + llrnCustomiseHardForkBlockchainTimeArgs $ + HardForkBlockchainTimeArgs + { hfbtBackoffDelay = pure $ BackoffDelay 60 + , hfbtGetLedgerState = + ledgerState <$> ChainDB.getCurrentLedger chainDB + , hfbtLedgerConfig = configLedger cfg + , hfbtRegistry = registry + , hfbtSystemTime = systemTime + , hfbtTracer = + contramap (fmap (fromRelativeTime systemStart)) + (blockchainTimeTracer rnTraceConsensus) + , hfbtMaxClockRewind = secondsToNominalDiffTime 20 + } + + nodeKernelArgs <- + fmap (nodeKernelArgsEnforceInvariants . llrnCustomiseNodeKernelArgs) $ + mkNodeKernelArgs + registry + llrnBfcSalt + llrnKeepAliveRng + cfg + blockForging + rnTraceConsensus + btime + chainDB + nodeKernel <- initNodeKernel nodeKernelArgs + rnNodeKernelHook registry nodeKernel + + peerMetrics <- newPeerMetric + let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics + ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel + (apps, appsExtra) = mkDiffusionApplications + rnEnableP2P + (miniProtocolParameters nodeKernelArgs) + ntnApps + ntcApps + nodeKernel + peerMetrics + + llrnRunDataDiffusion registry apps appsExtra where ProtocolInfo { pInfoConfig = cfg @@ -488,20 +493,16 @@ runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} = localRethrowPolicy :: RethrowPolicy localRethrowPolicy = mempty --- | Did the ChainDB already have existing clean-shutdown marker on disk? -newtype LastShutDownWasClean = LastShutDownWasClean Bool - deriving (Eq, Show) - -- | Check the DB marker, lock the DB and look for the clean shutdown marker. -- --- Run the body action with the DB locked, and if the last shutdown was clean. +-- Run the body action with the DB locked. -- stdWithCheckedDB :: forall blk a. (StandardHash blk, Typeable blk) => Proxy blk -> FilePath -> NetworkMagic - -> (LastShutDownWasClean -> IO a) -- ^ Body action with last shutdown was clean. + -> (LastShutDownWasClean -> (ChainDB IO blk -> IO a -> IO a) -> IO a) -- ^ Body action with last shutdown was clean. -> IO a stdWithCheckedDB pb databasePath networkMagic body = do @@ -513,22 +514,7 @@ stdWithCheckedDB pb databasePath networkMagic body = do networkMagic -- Then create the lock file. - withLockDB mountPoint $ do - - -- When we shut down cleanly, we create a marker file so that the next - -- time we start, we know we don't have to validate the contents of the - -- whole ChainDB. When we shut down with an exception indicating - -- corruption or something going wrong with the file system, we don't - -- create this marker file so that the next time we start, we do a full - -- validation. - lastShutDownWasClean <- hasCleanShutdownMarker hasFS - when lastShutDownWasClean $ removeCleanShutdownMarker hasFS - - -- On a clean shutdown, create a marker in the database folder so that - -- next time we start up, we know we don't have to validate the whole - -- database. 
-      createMarkerOnCleanShutdown pb hasFS $
-        body (LastShutDownWasClean lastShutDownWasClean)
+    withLockDB mountPoint $ runWithCheckedDB pb hasFS body
   where
     mountPoint = MountPoint databasePath
     hasFS      = ioHasFS mountPoint
diff --git a/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs b/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs
index f421ef8acf4..4a3fbe5aae1 100644
--- a/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs
+++ b/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs
@@ -2,10 +2,11 @@
 {-# LANGUAGE ScopedTypeVariables #-}
 module Ouroboros.Consensus.Node.Recovery (
-    createCleanShutdownMarker
-  , createMarkerOnCleanShutdown
+    LastShutDownWasClean (..)
+  , createCleanShutdownMarker
   , hasCleanShutdownMarker
   , removeCleanShutdownMarker
+  , runWithCheckedDB
   ) where
 import Control.Monad (unless, when)
@@ -16,6 +17,7 @@ import Ouroboros.Consensus.Block (StandardHash)
 import Ouroboros.Consensus.Node.Exit (ExitReason (..), toExitReason)
 import Ouroboros.Consensus.Util.IOLike
+import Ouroboros.Consensus.Storage.ChainDB
 import Ouroboros.Consensus.Storage.FS.API (HasFS, doesFileExist, removeFile, withFile)
 import Ouroboros.Consensus.Storage.FS.API.Types (AllowExisting (..),
@@ -25,23 +27,9 @@ import Ouroboros.Consensus.Storage.FS.API.Types (AllowExisting (..),
 cleanShutdownMarkerFile :: FsPath
 cleanShutdownMarkerFile = mkFsPath ["clean"]
--- | When the given action terminates with a /clean/ exception, create the
--- /clean shutdown marker file/.
---
--- NOTE: we assume the action (i.e., the node itself) never terminates without
--- an exception.
---
--- A /clean/ exception is an exception for 'exceptionRequiresRecovery' returns
--- 'False'.
-createMarkerOnCleanShutdown ::
-     (IOLike m, StandardHash blk, Typeable blk)
-  => Proxy blk
-  -> HasFS m h
-  -> m a  -- ^ Action to run
-  -> m a
-createMarkerOnCleanShutdown pb mp = onExceptionIf
-    (not . exceptionRequiresRecovery pb)
-    (createCleanShutdownMarker mp)
+-- | Did the ChainDB already have an existing clean-shutdown marker on disk?
+newtype LastShutDownWasClean = LastShutDownWasClean Bool
+  deriving (Eq, Show)
 -- | Return 'True' when 'cleanShutdownMarkerFile' exists.
 hasCleanShutdownMarker
@@ -83,6 +71,68 @@ exceptionRequiresRecovery pb e = case toExitReason pb e of
     DatabaseCorruption -> True
     _                  -> False
+-- | A bracket function that manages the clean-shutdown marker on disk.
+--
+-- - If the marker is missing on startup, then ChainDB initialization will
+--   revalidate the database contents.
+--
+-- - If the OS kills the node, then we don't have the opportunity to write out
+--   the marker file, which is fine, since we want the next startup to do
+--   revalidation.
+--
+-- - If initialization was cleanly interrupted (e.g. SIGINT), then we leave the
+--   marker in the same state as it was at the beginning of said
+--   initialization.
+--
+-- - At the end of a successful initialization, we remove the marker and install
+--   a shutdown handler that writes the marker, except for certain exceptions
+--   (see 'exceptionRequiresRecovery') that indicate corruption, for which we
+--   want the next startup to do revalidation.
+runWithCheckedDB
+  :: forall a m h blk.
+     (IOLike m, StandardHash blk, Typeable blk)
+  => Proxy blk
+  -> HasFS m h
+  -> (LastShutDownWasClean -> (ChainDB m blk -> m a -> m a) -> m a)
+  -> m a
+runWithCheckedDB pb hasFS body = do
+    -- When we shut down cleanly, we create a marker file so that the next
+    -- time we start, we know we don't have to validate the contents of the
+    -- whole ChainDB. When we shut down with an exception indicating
+    -- corruption or something going wrong with the file system, we don't
+    -- create this marker file so that the next time we start, we do a full
+    -- validation.
+    wasClean <- hasCleanShutdownMarker hasFS
+
+    removeMarkerOnUncleanShutdown wasClean
+      $ body
+          (LastShutDownWasClean wasClean)
+          (\_cdb runWithInitializedChainDB -> createMarkerOnCleanShutdown $ do
+            -- ChainDB initialization has finished by the time we reach this
+            -- point. We remove the marker so that a SIGKILL will cause an unclean
+            -- shutdown.
+            when wasClean $ removeCleanShutdownMarker hasFS
+            runWithInitializedChainDB
+          )
+  where
+    -- | If there is an unclean exception during ChainDB initialization, we want
+    -- to remove the marker file, so we install this handler.
+    --
+    -- It is OK to also wrap this handler around code that runs after ChainDB
+    -- initialization, because the condition on this handler is the opposite of
+    -- the condition in the @createMarkerOnCleanShutdown@ handler.
+    removeMarkerOnUncleanShutdown wasClean = if not wasClean then id else onExceptionIf
+      (exceptionRequiresRecovery pb)
+      (removeCleanShutdownMarker hasFS)
+
+    -- | If a clean exception terminates the running node after ChainDB
+    -- initialization, we want to create the marker file.
+    --
+    -- NOTE: we assume the action (i.e., the node itself) never terminates without
+    -- an exception.
+    createMarkerOnCleanShutdown = onExceptionIf
+      (not . exceptionRequiresRecovery pb)
+      (createCleanShutdownMarker hasFS)
 
 {-------------------------------------------------------------------------------
   Auxiliary
 -------------------------------------------------------------------------------}
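
The sketch below is illustrative only and not part of the diff: it shows how a caller is expected to use the new two-argument callback taken by stdWithCheckedDB (and, through it, runWithCheckedDB), assuming LastShutDownWasClean and stdWithCheckedDB remain exported from Ouroboros.Consensus.Node as before. The helpers openDB and nodeBody are hypothetical stand-ins for the real ChainDB initialization and the long-running node body.

-- Illustrative sketch, not part of this patch. 'openDB' and 'nodeBody' are
-- hypothetical stand-ins for the real initialization and node logic.
import           Data.Proxy (Proxy)
import           Data.Typeable (Typeable)

import           Ouroboros.Consensus.Block (StandardHash)
import           Ouroboros.Consensus.Node (LastShutDownWasClean (..),
                     stdWithCheckedDB)
import           Ouroboros.Consensus.Storage.ChainDB (ChainDB)
import           Ouroboros.Network.Magic (NetworkMagic)

startNode ::
     (StandardHash blk, Typeable blk)
  => Proxy blk
  -> FilePath
  -> NetworkMagic
  -> (Bool -> IO (ChainDB IO blk))  -- hypothetical: open/initialize the ChainDB
  -> (ChainDB IO blk -> IO ())      -- hypothetical: the long-running node body
  -> IO ()
startNode pb dbPath magic openDB nodeBody =
    stdWithCheckedDB pb dbPath magic $
      \(LastShutDownWasClean wasClean) continueWithCleanChainDB -> do
        -- ChainDB initialization runs outside the wrapper, so an interrupted
        -- startup leaves the clean-shutdown marker exactly as it was found.
        chainDB <- openDB wasClean
        -- Only the long-running part is wrapped: the marker is removed once
        -- initialization is done and recreated again on a clean shutdown.
        continueWithCleanChainDB chainDB (nodeBody chainDB)

With this shape, all marker handling lives behind continueWithCleanChainDB, so a startup that dies during ChainDB initialization no longer erases the marker it found on disk.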