From 51fea953d670d333849a78ad2485d410017036bc Mon Sep 17 00:00:00 2001 From: Javier Sagredo Date: Tue, 30 Nov 2021 14:44:27 +0100 Subject: [PATCH] Fix: don't modify clean state on startup --- .../src/Ouroboros/Consensus/Node.hs | 124 ++++++++---------- .../src/Ouroboros/Consensus/Node/Recovery.hs | 88 ++++++++++--- 2 files changed, 124 insertions(+), 88 deletions(-) diff --git a/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs b/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs index f76dc36891e..7c655a3bb54 100644 --- a/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs +++ b/ouroboros-consensus/src/Ouroboros/Consensus/Node.hs @@ -52,7 +52,6 @@ module Ouroboros.Consensus.Node ( ) where import Codec.Serialise (DeserialiseFailure) -import Control.Monad (when) import Control.Tracer (Tracer, contramap) import Data.ByteString.Lazy (ByteString) import Data.Hashable (Hashable) @@ -183,8 +182,14 @@ data LowLevelRunNodeArgs m addrNTN addrNTC versionDataNTN versionDataNTC blk (p2p :: Diffusion.P2P) = LowLevelRunNodeArgs { - -- | How to manage the clean-shutdown marker on disk - llrnWithCheckedDB :: forall a. (LastShutDownWasClean -> m a) -> m a + -- | An action that will receive a marker indicating whether the previous + -- shutdown was considered clean and a wrapper for installing a handler to + -- create a clean file on exit if needed. See + -- 'Ouroboros.Consensus.Node.Recovery.runWithCheckedDB'. + llrnWithCheckedDB :: forall a. ( LastShutDownWasClean + -> (ChainDB m blk -> m a -> m a) + -> m a) + -> m a -- | The " static " ChainDB arguments , llrnChainDbArgsDefaults :: ChainDbArgs Defaults m blk @@ -277,9 +282,8 @@ runWith :: forall m addrNTN addrNTC versionDataNTN versionDataNTC blk p2p. -> m () runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} = - llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) -> + llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) continueWithCleanChainDB -> withRegistry $ \registry -> do - let systemStart :: SystemStart systemStart = getSystemStart (configBlock cfg) @@ -311,48 +315,49 @@ runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} = chainDB <- openChainDB registry inFuture cfg initLedger llrnChainDbArgsDefaults customiseChainDbArgs' - btime <- - hardForkBlockchainTime $ - llrnCustomiseHardForkBlockchainTimeArgs $ - HardForkBlockchainTimeArgs - { hfbtBackoffDelay = pure $ BackoffDelay 60 - , hfbtGetLedgerState = - ledgerState <$> ChainDB.getCurrentLedger chainDB - , hfbtLedgerConfig = configLedger cfg - , hfbtRegistry = registry - , hfbtSystemTime = systemTime - , hfbtTracer = - contramap (fmap (fromRelativeTime systemStart)) - (blockchainTimeTracer rnTraceConsensus) - , hfbtMaxClockRewind = secondsToNominalDiffTime 20 - } - - nodeKernelArgs <- - fmap (nodeKernelArgsEnforceInvariants . 
llrnCustomiseNodeKernelArgs) $ - mkNodeKernelArgs - registry - llrnBfcSalt - llrnKeepAliveRng - cfg - blockForging - rnTraceConsensus - btime - chainDB - nodeKernel <- initNodeKernel nodeKernelArgs - rnNodeKernelHook registry nodeKernel - - peerMetrics <- newPeerMetric - let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics - ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel - (apps, appsExtra) = mkDiffusionApplications - rnEnableP2P - (miniProtocolParameters nodeKernelArgs) - ntnApps - ntcApps - nodeKernel - peerMetrics - - llrnRunDataDiffusion registry apps appsExtra + continueWithCleanChainDB chainDB $ do + btime <- + hardForkBlockchainTime $ + llrnCustomiseHardForkBlockchainTimeArgs $ + HardForkBlockchainTimeArgs + { hfbtBackoffDelay = pure $ BackoffDelay 60 + , hfbtGetLedgerState = + ledgerState <$> ChainDB.getCurrentLedger chainDB + , hfbtLedgerConfig = configLedger cfg + , hfbtRegistry = registry + , hfbtSystemTime = systemTime + , hfbtTracer = + contramap (fmap (fromRelativeTime systemStart)) + (blockchainTimeTracer rnTraceConsensus) + , hfbtMaxClockRewind = secondsToNominalDiffTime 20 + } + + nodeKernelArgs <- + fmap (nodeKernelArgsEnforceInvariants . llrnCustomiseNodeKernelArgs) $ + mkNodeKernelArgs + registry + llrnBfcSalt + llrnKeepAliveRng + cfg + blockForging + rnTraceConsensus + btime + chainDB + nodeKernel <- initNodeKernel nodeKernelArgs + rnNodeKernelHook registry nodeKernel + + peerMetrics <- newPeerMetric + let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics + ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel + (apps, appsExtra) = mkDiffusionApplications + rnEnableP2P + (miniProtocolParameters nodeKernelArgs) + ntnApps + ntcApps + nodeKernel + peerMetrics + + llrnRunDataDiffusion registry apps appsExtra where ProtocolInfo { pInfoConfig = cfg @@ -488,20 +493,16 @@ runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} = localRethrowPolicy :: RethrowPolicy localRethrowPolicy = mempty --- | Did the ChainDB already have existing clean-shutdown marker on disk? -newtype LastShutDownWasClean = LastShutDownWasClean Bool - deriving (Eq, Show) - -- | Check the DB marker, lock the DB and look for the clean shutdown marker. -- --- Run the body action with the DB locked, and if the last shutdown was clean. +-- Run the body action with the DB locked. -- stdWithCheckedDB :: forall blk a. (StandardHash blk, Typeable blk) => Proxy blk -> FilePath -> NetworkMagic - -> (LastShutDownWasClean -> IO a) -- ^ Body action with last shutdown was clean. + -> (LastShutDownWasClean -> (ChainDB IO blk -> IO a -> IO a) -> IO a) -- ^ Body action with last shutdown was clean. -> IO a stdWithCheckedDB pb databasePath networkMagic body = do @@ -513,22 +514,7 @@ stdWithCheckedDB pb databasePath networkMagic body = do networkMagic -- Then create the lock file. - withLockDB mountPoint $ do - - -- When we shut down cleanly, we create a marker file so that the next - -- time we start, we know we don't have to validate the contents of the - -- whole ChainDB. When we shut down with an exception indicating - -- corruption or something going wrong with the file system, we don't - -- create this marker file so that the next time we start, we do a full - -- validation. - lastShutDownWasClean <- hasCleanShutdownMarker hasFS - when lastShutDownWasClean $ removeCleanShutdownMarker hasFS - - -- On a clean shutdown, create a marker in the database folder so that - -- next time we start up, we know we don't have to validate the whole - -- database. 
-      createMarkerOnCleanShutdown pb hasFS $
-        body (LastShutDownWasClean lastShutDownWasClean)
+    withLockDB mountPoint $ runWithCheckedDB pb hasFS body
   where
     mountPoint = MountPoint databasePath
     hasFS      = ioHasFS mountPoint
diff --git a/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs b/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs
index f421ef8acf4..4a3fbe5aae1 100644
--- a/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs
+++ b/ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs
@@ -2,10 +2,11 @@
 {-# LANGUAGE ScopedTypeVariables #-}
 module Ouroboros.Consensus.Node.Recovery (
-    createCleanShutdownMarker
-  , createMarkerOnCleanShutdown
+    LastShutDownWasClean (..)
+  , createCleanShutdownMarker
   , hasCleanShutdownMarker
   , removeCleanShutdownMarker
+  , runWithCheckedDB
   ) where
 import Control.Monad (unless, when)
@@ -16,6 +17,7 @@ import Ouroboros.Consensus.Block (StandardHash)
 import Ouroboros.Consensus.Node.Exit (ExitReason (..), toExitReason)
 import Ouroboros.Consensus.Util.IOLike
+import Ouroboros.Consensus.Storage.ChainDB
 import Ouroboros.Consensus.Storage.FS.API (HasFS, doesFileExist, removeFile, withFile)
 import Ouroboros.Consensus.Storage.FS.API.Types (AllowExisting (..),
@@ -25,23 +27,9 @@ import Ouroboros.Consensus.Storage.FS.API.Types (AllowExisting (..),
 cleanShutdownMarkerFile :: FsPath
 cleanShutdownMarkerFile = mkFsPath ["clean"]
--- | When the given action terminates with a /clean/ exception, create the
--- /clean shutdown marker file/.
---
--- NOTE: we assume the action (i.e., the node itself) never terminates without
--- an exception.
---
--- A /clean/ exception is an exception for 'exceptionRequiresRecovery' returns
--- 'False'.
-createMarkerOnCleanShutdown ::
-     (IOLike m, StandardHash blk, Typeable blk)
-  => Proxy blk
-  -> HasFS m h
-  -> m a  -- ^ Action to run
-  -> m a
-createMarkerOnCleanShutdown pb mp = onExceptionIf
-    (not . exceptionRequiresRecovery pb)
-    (createCleanShutdownMarker mp)
+-- | Did the ChainDB already have an existing clean-shutdown marker on disk?
+newtype LastShutDownWasClean = LastShutDownWasClean Bool
+  deriving (Eq, Show)
 -- | Return 'True' when 'cleanShutdownMarkerFile' exists.
 hasCleanShutdownMarker
@@ -83,6 +71,68 @@ exceptionRequiresRecovery pb e = case toExitReason pb e of
     DatabaseCorruption -> True
     _                  -> False
+-- | A bracket function that manages the clean-shutdown marker on disk.
+--
+-- - If the marker is missing on startup, then ChainDB initialization will
+--   revalidate the database contents.
+--
+-- - If the OS kills the node, then we don't have the opportunity to write out
+--   the marker file, which is fine, since we want the next startup to do
+--   revalidation.
+--
+-- - If initialization was cleanly interrupted (e.g. SIGINT), then we leave the
+--   marker in the same state as it was at the beginning of said
+--   initialization.
+--
+-- - At the end of a successful initialization, we remove the marker and install
+--   a shutdown handler that writes the marker, except for certain exceptions
+--   (see 'exceptionRequiresRecovery') that indicate corruption, for which we
+--   want the next startup to do revalidation.
+runWithCheckedDB
+  :: forall a m h blk.
+     (IOLike m, StandardHash blk, Typeable blk)
+  => Proxy blk
+  -> HasFS m h
+  -> (LastShutDownWasClean -> (ChainDB m blk -> m a -> m a) -> m a)
+  -> m a
+runWithCheckedDB pb hasFS body = do
+    -- When we shut down cleanly, we create a marker file so that the next
+    -- time we start, we know we don't have to validate the contents of the
+    -- whole ChainDB. When we shut down with an exception indicating
+    -- corruption or something going wrong with the file system, we don't
+    -- create this marker file so that the next time we start, we do a full
+    -- validation.
+    wasClean <- hasCleanShutdownMarker hasFS
+
+    removeMarkerOnUncleanShutdown wasClean
+      $ body
+          (LastShutDownWasClean wasClean)
+          (\_cdb runWithInitializedChainDB -> createMarkerOnCleanShutdown $ do
+            -- ChainDB initialization has finished by the time we reach this
+            -- point. We remove the marker so that a SIGKILL will cause an unclean
+            -- shutdown.
+            when wasClean $ removeCleanShutdownMarker hasFS
+            runWithInitializedChainDB
+          )
+  where
+    -- | If there is an unclean exception during ChainDB initialization, we want
+    -- to remove the marker file, so we install this handler.
+    --
+    -- It is OK to also wrap this handler around code that runs after ChainDB
+    -- initialization, because the condition on this handler is the opposite of
+    -- the condition in the @createMarkerOnCleanShutdown@ handler.
+    removeMarkerOnUncleanShutdown wasClean = if not wasClean then id else onExceptionIf
+      (exceptionRequiresRecovery pb)
+      (removeCleanShutdownMarker hasFS)
+
+    -- | If a clean exception terminates the running node after ChainDB
+    -- initialization, we want to create the marker file.
+    --
+    -- NOTE: we assume the action (i.e., the node itself) never terminates without
+    -- an exception.
+    createMarkerOnCleanShutdown = onExceptionIf
+      (not . exceptionRequiresRecovery pb)
+      (createCleanShutdownMarker hasFS)
 
 {-------------------------------------------------------------------------------
   Auxiliary
 -------------------------------------------------------------------------------}
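
The sketch below is illustrative only and not part of the diff: it shows how a caller is expected to use the new two-argument callback taken by stdWithCheckedDB (and, through it, runWithCheckedDB), assuming LastShutDownWasClean and stdWithCheckedDB remain exported from Ouroboros.Consensus.Node as before. The helpers openDB and nodeBody are hypothetical stand-ins for the real ChainDB initialization and the long-running node body.

-- Illustrative sketch, not part of this patch. 'openDB' and 'nodeBody' are
-- hypothetical stand-ins for the real initialization and node logic.
import           Data.Proxy (Proxy)
import           Data.Typeable (Typeable)

import           Ouroboros.Consensus.Block (StandardHash)
import           Ouroboros.Consensus.Node (LastShutDownWasClean (..),
                     stdWithCheckedDB)
import           Ouroboros.Consensus.Storage.ChainDB (ChainDB)
import           Ouroboros.Network.Magic (NetworkMagic)

startNode ::
     (StandardHash blk, Typeable blk)
  => Proxy blk
  -> FilePath
  -> NetworkMagic
  -> (Bool -> IO (ChainDB IO blk))  -- hypothetical: open/initialize the ChainDB
  -> (ChainDB IO blk -> IO ())      -- hypothetical: the long-running node body
  -> IO ()
startNode pb dbPath magic openDB nodeBody =
    stdWithCheckedDB pb dbPath magic $
      \(LastShutDownWasClean wasClean) continueWithCleanChainDB -> do
        -- ChainDB initialization runs outside the wrapper, so an interrupted
        -- startup leaves the clean-shutdown marker exactly as it was found.
        chainDB <- openDB wasClean
        -- Only the long-running part is wrapped: the marker is removed once
        -- initialization is done and recreated again on a clean shutdown.
        continueWithCleanChainDB chainDB (nodeBody chainDB)

With this shape, all marker handling lives behind continueWithCleanChainDB, so a startup that dies during ChainDB initialization no longer erases the marker it found on disk.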