Fix clean state initialization #3514

Merged: 1 commit merged on Dec 2, 2021
124 changes: 55 additions & 69 deletions ouroboros-consensus/src/Ouroboros/Consensus/Node.hs
@@ -52,7 +52,6 @@ module Ouroboros.Consensus.Node (
) where

import Codec.Serialise (DeserialiseFailure)
import Control.Monad (when)
import Control.Tracer (Tracer, contramap)
import Data.ByteString.Lazy (ByteString)
import Data.Hashable (Hashable)
@@ -183,8 +182,14 @@ data LowLevelRunNodeArgs m addrNTN addrNTC versionDataNTN versionDataNTC blk
(p2p :: Diffusion.P2P) =
LowLevelRunNodeArgs {

-- | How to manage the clean-shutdown marker on disk
llrnWithCheckedDB :: forall a. (LastShutDownWasClean -> m a) -> m a
-- | An action that will receive a marker indicating whether the previous
-- shutdown was considered clean and a wrapper for installing a handler to
-- create a clean file on exit if needed. See
-- 'Ouroboros.Consensus.Node.Recovery.runWithCheckedDB'.
llrnWithCheckedDB :: forall a. ( LastShutDownWasClean
-> (ChainDB m blk -> m a -> m a)
-> m a)
-> m a

-- | The " static " ChainDB arguments
, llrnChainDbArgsDefaults :: ChainDbArgs Defaults m blk
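
For context, this field is normally filled in by partially applying 'stdWithCheckedDB', whose updated signature appears further down in this diff. A minimal sketch of that wiring, assuming a surrounding IO-level argument builder and parameter names ('srnDatabasePath', 'networkMagic') that are not part of this change:

    -- Illustrative only: everything except 'llrnWithCheckedDB' and
    -- 'stdWithCheckedDB' is an assumed name, not taken from this PR.
    llrnWithCheckedDB = stdWithCheckedDB (Proxy @blk) srnDatabasePath networkMagic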
@@ -277,9 +282,8 @@ runWith :: forall m addrNTN addrNTC versionDataNTN versionDataNTC blk p2p.
-> m ()
runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} =

llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) ->
llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) continueWithCleanChainDB ->
withRegistry $ \registry -> do

let systemStart :: SystemStart
systemStart = getSystemStart (configBlock cfg)

@@ -311,48 +315,49 @@ runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} =
chainDB <- openChainDB registry inFuture cfg initLedger
llrnChainDbArgsDefaults customiseChainDbArgs'

btime <-
hardForkBlockchainTime $
llrnCustomiseHardForkBlockchainTimeArgs $
HardForkBlockchainTimeArgs
{ hfbtBackoffDelay = pure $ BackoffDelay 60
, hfbtGetLedgerState =
ledgerState <$> ChainDB.getCurrentLedger chainDB
, hfbtLedgerConfig = configLedger cfg
, hfbtRegistry = registry
, hfbtSystemTime = systemTime
, hfbtTracer =
contramap (fmap (fromRelativeTime systemStart))
(blockchainTimeTracer rnTraceConsensus)
, hfbtMaxClockRewind = secondsToNominalDiffTime 20
}

nodeKernelArgs <-
fmap (nodeKernelArgsEnforceInvariants . llrnCustomiseNodeKernelArgs) $
mkNodeKernelArgs
registry
llrnBfcSalt
llrnKeepAliveRng
cfg
blockForging
rnTraceConsensus
btime
chainDB
nodeKernel <- initNodeKernel nodeKernelArgs
rnNodeKernelHook registry nodeKernel

peerMetrics <- newPeerMetric
let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics
ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel
(apps, appsExtra) = mkDiffusionApplications
rnEnableP2P
(miniProtocolParameters nodeKernelArgs)
ntnApps
ntcApps
nodeKernel
peerMetrics

llrnRunDataDiffusion registry apps appsExtra
continueWithCleanChainDB chainDB $ do
btime <-
hardForkBlockchainTime $
llrnCustomiseHardForkBlockchainTimeArgs $
HardForkBlockchainTimeArgs
{ hfbtBackoffDelay = pure $ BackoffDelay 60
, hfbtGetLedgerState =
ledgerState <$> ChainDB.getCurrentLedger chainDB
, hfbtLedgerConfig = configLedger cfg
, hfbtRegistry = registry
, hfbtSystemTime = systemTime
, hfbtTracer =
contramap (fmap (fromRelativeTime systemStart))
(blockchainTimeTracer rnTraceConsensus)
, hfbtMaxClockRewind = secondsToNominalDiffTime 20
}

nodeKernelArgs <-
fmap (nodeKernelArgsEnforceInvariants . llrnCustomiseNodeKernelArgs) $
mkNodeKernelArgs
registry
llrnBfcSalt
llrnKeepAliveRng
cfg
blockForging
rnTraceConsensus
btime
chainDB
nodeKernel <- initNodeKernel nodeKernelArgs
rnNodeKernelHook registry nodeKernel

peerMetrics <- newPeerMetric
let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics
ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel
(apps, appsExtra) = mkDiffusionApplications
rnEnableP2P
(miniProtocolParameters nodeKernelArgs)
ntnApps
ntcApps
nodeKernel
peerMetrics

llrnRunDataDiffusion registry apps appsExtra
where
ProtocolInfo
{ pInfoConfig = cfg
@@ -488,20 +493,16 @@ runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} =
localRethrowPolicy :: RethrowPolicy
localRethrowPolicy = mempty

-- | Did the ChainDB already have an existing clean-shutdown marker on disk?
newtype LastShutDownWasClean = LastShutDownWasClean Bool
deriving (Eq, Show)

-- | Check the DB marker, lock the DB and look for the clean shutdown marker.
--
-- Run the body action with the DB locked, and if the last shutdown was clean.
-- Run the body action with the DB locked.
--
stdWithCheckedDB ::
forall blk a. (StandardHash blk, Typeable blk)
=> Proxy blk
-> FilePath
-> NetworkMagic
-> (LastShutDownWasClean -> IO a) -- ^ Body action with last shutdown was clean.
-> (LastShutDownWasClean -> (ChainDB IO blk -> IO a -> IO a) -> IO a) -- ^ Body action with last shutdown was clean.
-> IO a
stdWithCheckedDB pb databasePath networkMagic body = do

@@ -513,22 +514,7 @@ stdWithCheckedDB pb databasePath networkMagic body = do
networkMagic

-- Then create the lock file.
withLockDB mountPoint $ do

-- When we shut down cleanly, we create a marker file so that the next
-- time we start, we know we don't have to validate the contents of the
-- whole ChainDB. When we shut down with an exception indicating
-- corruption or something going wrong with the file system, we don't
-- create this marker file so that the next time we start, we do a full
-- validation.
lastShutDownWasClean <- hasCleanShutdownMarker hasFS
when lastShutDownWasClean $ removeCleanShutdownMarker hasFS

-- On a clean shutdown, create a marker in the database folder so that
-- next time we start up, we know we don't have to validate the whole
-- database.
createMarkerOnCleanShutdown pb hasFS $
body (LastShutDownWasClean lastShutDownWasClean)
withLockDB mountPoint $ runWithCheckedDB pb hasFS body
where
mountPoint = MountPoint databasePath
hasFS = ioHasFS mountPoint
88 changes: 69 additions & 19 deletions ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs
@@ -2,10 +2,11 @@
{-# LANGUAGE ScopedTypeVariables #-}

module Ouroboros.Consensus.Node.Recovery (
createCleanShutdownMarker
, createMarkerOnCleanShutdown
LastShutDownWasClean (..)
, createCleanShutdownMarker
, hasCleanShutdownMarker
, removeCleanShutdownMarker
, runWithCheckedDB
) where

import Control.Monad (unless, when)
@@ -16,6 +17,7 @@ import Ouroboros.Consensus.Block (StandardHash)
import Ouroboros.Consensus.Node.Exit (ExitReason (..), toExitReason)
import Ouroboros.Consensus.Util.IOLike

import Ouroboros.Consensus.Storage.ChainDB
import Ouroboros.Consensus.Storage.FS.API (HasFS, doesFileExist,
removeFile, withFile)
import Ouroboros.Consensus.Storage.FS.API.Types (AllowExisting (..),
@@ -25,23 +27,9 @@ import Ouroboros.Consensus.Storage.FS.API.Types (AllowExisting (..),
cleanShutdownMarkerFile :: FsPath
cleanShutdownMarkerFile = mkFsPath ["clean"]

-- | When the given action terminates with a /clean/ exception, create the
-- /clean shutdown marker file/.
--
-- NOTE: we assume the action (i.e., the node itself) never terminates without
-- an exception.
--
-- A /clean/ exception is an exception for which 'exceptionRequiresRecovery'
-- returns 'False'.
createMarkerOnCleanShutdown ::
(IOLike m, StandardHash blk, Typeable blk)
=> Proxy blk
-> HasFS m h
-> m a -- ^ Action to run
-> m a
createMarkerOnCleanShutdown pb mp = onExceptionIf
(not . exceptionRequiresRecovery pb)
(createCleanShutdownMarker mp)
-- | Did the ChainDB already have an existing clean-shutdown marker on disk?
newtype LastShutDownWasClean = LastShutDownWasClean Bool
deriving (Eq, Show)

-- | Return 'True' when 'cleanShutdownMarkerFile' exists.
hasCleanShutdownMarker
@@ -83,6 +71,68 @@ exceptionRequiresRecovery pb e = case toExitReason pb e of
DatabaseCorruption -> True
_ -> False

-- | A bracket function that manages the clean-shutdown marker on disk.
--
-- - If the marker is missing on startup, then ChainDB initialization will
-- revalidate the database contents.
--
-- - If the OS kills the node, then we don't have the opportunity to write out
-- the marker file, which is fine, since we want the next startup to do
-- revalidation.
--
-- - If initialization was cleanly interrupted (e.g. SIGINT), then we leave the
-- marker in the same state as it was at the beginning of said
-- initialization.
--
-- - At the end of a successful initialization, we remove the marker and install
-- a shutdown handler that writes the marker except for certain exceptions
-- (see 'exceptionRequiresRecovery') that indicate corruption, for which we
-- want the next startup to do revalidation.
runWithCheckedDB
:: forall a m h blk. (IOLike m, StandardHash blk, Typeable blk)
=> Proxy blk
-> HasFS m h
-> (LastShutDownWasClean -> (ChainDB m blk -> m a -> m a) -> m a)
-> m a
runWithCheckedDB pb hasFS body = do
-- When we shut down cleanly, we create a marker file so that the next
-- time we start, we know we don't have to validate the contents of the
-- whole ChainDB. When we shut down with an exception indicating
-- corruption or something going wrong with the file system, we don't
-- create this marker file so that the next time we start, we do a full
-- validation.
wasClean <- hasCleanShutdownMarker hasFS

removeMarkerOnUncleanShutdown wasClean
$ body
(LastShutDownWasClean wasClean)
(\_cdb runWithInitializedChainDB -> createMarkerOnCleanShutdown $ do
-- ChainDB initialization has finished by the time we reach this
-- point. We remove the marker so that a SIGKILL will cause an unclean
-- shutdown.
when wasClean $ removeCleanShutdownMarker hasFS
runWithInitializedChainDB
)
where
-- | If there is an unclean exception during ChainDB initialization, we want
-- to remove the marker file, so we install this handler.
--
-- It is OK to also wrap this handler around code that runs after ChainDB
-- initialization, because the condition on this handler is the opposite of
-- the condition in the @createMarkerOnCleanShutdown@ handler.
removeMarkerOnUncleanShutdown wasClean = if not wasClean then id else onExceptionIf
(exceptionRequiresRecovery pb)
(removeCleanShutdownMarker hasFS)

-- | If a clean exception terminates the running node after ChainDB
-- initialization, we want to create the marker file.
--
-- NOTE: we assume the action (i.e., the node itself) never terminates without
-- an exception.
createMarkerOnCleanShutdown = onExceptionIf
(not . exceptionRequiresRecovery pb)
(createCleanShutdownMarker hasFS)
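
Taken together, the bullet points above amount to the call pattern sketched below; it mirrors 'runWith' in Node.hs earlier in this diff. 'MyBlock', 'openMyChainDB' and 'runMyNode' are placeholders, not names from this PR:

    exampleNodeMain :: HasFS IO h -> IO ()
    exampleNodeMain hasFS =
        runWithCheckedDB (Proxy :: Proxy MyBlock) hasFS $
          \(LastShutDownWasClean _wasClean) continueWithCleanChainDB -> do
            -- ChainDB initialisation; revalidates the database if the marker was missing
            chainDB <- openMyChainDB
            -- The wrapper removes the marker now and re-creates it only on a clean exit
            continueWithCleanChainDB chainDB $
              runMyNode chainDB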

{-------------------------------------------------------------------------------
Auxiliary
-------------------------------------------------------------------------------}
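
The Auxiliary section is collapsed in this diff. For reference, 'onExceptionIf', the combinator used by both handlers above, plausibly has the following shape (a sketch reconstructed from its call sites, not necessarily the repository's exact definition):

    onExceptionIf
      :: (IOLike m, Exception e)
      => (e -> Bool)  -- ^ Run the handler only for exceptions matching this predicate
      -> m ()         -- ^ Handler
      -> m a          -- ^ Action to guard
      -> m a
    onExceptionIf p h m = m `catch` \e -> do
        when (p e) h
        throwIO e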