Merge #3514
3514: Fix clean state initialization r=nfrisby a=Jasagredo

Due to the unmasking of exceptions during ChainDB initialization, the exit reason currently differs from `DatabaseCorruption` even when the database is corrupted, so the `withCheckedDb` function considers that it has to create the clean file. Because of this, a Ctrl+C during initialization would cause validation to be skipped on the next startup.

This PR ensures that the clean file is removed only after the ChainDB has been initialized (and therefore potentially modified); from that point on it can be created again. The table below shows the marker state found at the next startup; cells of the form `x → y` mark where this PR changes the behaviour from `x` to `y`:

| initial status | killed when           | on `SIGKILL`     | on `DatabaseCorruption` | on other exception (e.g. `^C`) |
|----------------|-----------------------|------------------|------------------------|---------------------------|
| clean          | during initialization | unclean → clean | clean → unclean    | clean                     |
| clean          | after initialization  | unclean          | unclean                | clean                     |
| unclean        | during initialization | unclean          | unclean                | clean → unclean          |
| unclean        | after initialization  | unclean          | unclean                | clean                     |
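
For intuition, the new control flow can be condensed into the following self-contained sketch. This is an illustration, not code from this commit: `withCheckedDB`, `initDB`, `runNode` and the `db/clean` path are placeholders, and plain `IO` with `System.Directory` stands in for the `HasFS` abstraction used by the real code.

```haskell
import Control.Exception (SomeException, catch, throwIO)
import Control.Monad (when)
import System.Directory (doesFileExist, removeFile)

-- Run an action; when an escaping exception satisfies the predicate, run
-- the handler before rethrowing.
onExceptionIf :: (SomeException -> Bool) -> IO () -> IO a -> IO a
onExceptionIf p handler action = action `catch` \e -> do
    when (p e) handler
    throwIO (e :: SomeException)

-- The marker lifecycle after this PR, in miniature.
withCheckedDB
  :: (SomeException -> Bool) -- ^ does this exception indicate corruption?
  -> IO ()                   -- ^ ChainDB initialization
  -> IO ()                   -- ^ the node proper
  -> IO ()
withCheckedDB isCorruption initDB runNode = do
    wasClean <- doesFileExist markerPath
    removeMarkerOnUncleanShutdown wasClean $ do
        initDB
        -- An exception before this point leaves the marker untouched unless
        -- it indicates corruption, so Ctrl+C no longer skips validation.
        createMarkerOnCleanShutdown $ do
            -- Initialization done: remove the marker, so that a SIGKILL from
            -- here on makes the next startup revalidate.
            when wasClean removeMarkerIfPresent
            runNode
  where
    markerPath = "db/clean"

    removeMarkerIfPresent = do
        exists <- doesFileExist markerPath
        when exists (removeFile markerPath)

    removeMarkerOnUncleanShutdown wasClean
      | wasClean  = onExceptionIf isCorruption removeMarkerIfPresent
      | otherwise = id

    createMarkerOnCleanShutdown =
        onExceptionIf (not . isCorruption) (writeFile markerPath "")
```

Walking through row 3 of the table: with no marker on disk, a Ctrl+C during initialization previously resulted in the marker being created (clean), because the exit reason was not `DatabaseCorruption`; in the sketch above neither handler fires, so the next startup still revalidates (unclean).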



Co-authored-by: Javier Sagredo <javier.sagredo@iohk.io>
iohk-bors[bot] and jasagredo authored Dec 2, 2021
2 parents eeaec66 + 51fea95 commit 1102632
Showing 2 changed files with 124 additions and 88 deletions.
124 changes: 55 additions & 69 deletions ouroboros-consensus/src/Ouroboros/Consensus/Node.hs
@@ -52,7 +52,6 @@ module Ouroboros.Consensus.Node (
) where

import Codec.Serialise (DeserialiseFailure)
import Control.Monad (when)
import Control.Tracer (Tracer, contramap)
import Data.ByteString.Lazy (ByteString)
import Data.Hashable (Hashable)
@@ -183,8 +182,14 @@ data LowLevelRunNodeArgs m addrNTN addrNTC versionDataNTN versionDataNTC blk
(p2p :: Diffusion.P2P) =
LowLevelRunNodeArgs {

-- | How to manage the clean-shutdown marker on disk
llrnWithCheckedDB :: forall a. (LastShutDownWasClean -> m a) -> m a
-- | An action that receives a flag indicating whether the previous shutdown
-- was considered clean, together with a wrapper for installing a handler
-- that creates the clean file on exit when needed. See
-- 'Ouroboros.Consensus.Node.Recovery.runWithCheckedDB'.
llrnWithCheckedDB :: forall a. ( LastShutDownWasClean
-> (ChainDB m blk -> m a -> m a)
-> m a)
-> m a

-- | The " static " ChainDB arguments
, llrnChainDbArgsDefaults :: ChainDbArgs Defaults m blk
@@ -277,9 +282,8 @@ runWith :: forall m addrNTN addrNTC versionDataNTN versionDataNTC blk p2p.
-> m ()
runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} =

llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) ->
llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) continueWithCleanChainDB ->
withRegistry $ \registry -> do

let systemStart :: SystemStart
systemStart = getSystemStart (configBlock cfg)

@@ -311,48 +315,49 @@ runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} =
chainDB <- openChainDB registry inFuture cfg initLedger
llrnChainDbArgsDefaults customiseChainDbArgs'

btime <-
hardForkBlockchainTime $
llrnCustomiseHardForkBlockchainTimeArgs $
HardForkBlockchainTimeArgs
{ hfbtBackoffDelay = pure $ BackoffDelay 60
, hfbtGetLedgerState =
ledgerState <$> ChainDB.getCurrentLedger chainDB
, hfbtLedgerConfig = configLedger cfg
, hfbtRegistry = registry
, hfbtSystemTime = systemTime
, hfbtTracer =
contramap (fmap (fromRelativeTime systemStart))
(blockchainTimeTracer rnTraceConsensus)
, hfbtMaxClockRewind = secondsToNominalDiffTime 20
}

nodeKernelArgs <-
fmap (nodeKernelArgsEnforceInvariants . llrnCustomiseNodeKernelArgs) $
mkNodeKernelArgs
registry
llrnBfcSalt
llrnKeepAliveRng
cfg
blockForging
rnTraceConsensus
btime
chainDB
nodeKernel <- initNodeKernel nodeKernelArgs
rnNodeKernelHook registry nodeKernel

peerMetrics <- newPeerMetric
let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics
ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel
(apps, appsExtra) = mkDiffusionApplications
rnEnableP2P
(miniProtocolParameters nodeKernelArgs)
ntnApps
ntcApps
nodeKernel
peerMetrics

llrnRunDataDiffusion registry apps appsExtra
continueWithCleanChainDB chainDB $ do
btime <-
hardForkBlockchainTime $
llrnCustomiseHardForkBlockchainTimeArgs $
HardForkBlockchainTimeArgs
{ hfbtBackoffDelay = pure $ BackoffDelay 60
, hfbtGetLedgerState =
ledgerState <$> ChainDB.getCurrentLedger chainDB
, hfbtLedgerConfig = configLedger cfg
, hfbtRegistry = registry
, hfbtSystemTime = systemTime
, hfbtTracer =
contramap (fmap (fromRelativeTime systemStart))
(blockchainTimeTracer rnTraceConsensus)
, hfbtMaxClockRewind = secondsToNominalDiffTime 20
}

nodeKernelArgs <-
fmap (nodeKernelArgsEnforceInvariants . llrnCustomiseNodeKernelArgs) $
mkNodeKernelArgs
registry
llrnBfcSalt
llrnKeepAliveRng
cfg
blockForging
rnTraceConsensus
btime
chainDB
nodeKernel <- initNodeKernel nodeKernelArgs
rnNodeKernelHook registry nodeKernel

peerMetrics <- newPeerMetric
let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics
ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel
(apps, appsExtra) = mkDiffusionApplications
rnEnableP2P
(miniProtocolParameters nodeKernelArgs)
ntnApps
ntcApps
nodeKernel
peerMetrics

llrnRunDataDiffusion registry apps appsExtra
where
ProtocolInfo
{ pInfoConfig = cfg
@@ -488,20 +493,16 @@ runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} =
localRethrowPolicy :: RethrowPolicy
localRethrowPolicy = mempty

-- | Did the ChainDB already have an existing clean-shutdown marker on disk?
newtype LastShutDownWasClean = LastShutDownWasClean Bool
deriving (Eq, Show)

-- | Check the DB marker, lock the DB and look for the clean shutdown marker.
--
-- Run the body action with the DB locked, and if the last shutdown was clean.
-- Run the body action with the DB locked.
--
stdWithCheckedDB ::
forall blk a. (StandardHash blk, Typeable blk)
=> Proxy blk
-> FilePath
-> NetworkMagic
-> (LastShutDownWasClean -> IO a) -- ^ Body action with last shutdown was clean.
-> (LastShutDownWasClean -> (ChainDB IO blk -> IO a -> IO a) -> IO a) -- ^ Body action, given the clean-shutdown flag and a post-initialization wrapper.
-> IO a
stdWithCheckedDB pb databasePath networkMagic body = do

@@ -513,22 +514,7 @@ stdWithCheckedDB pb databasePath networkMagic body = do
networkMagic

-- Then create the lock file.
withLockDB mountPoint $ do

-- When we shut down cleanly, we create a marker file so that the next
-- time we start, we know we don't have to validate the contents of the
-- whole ChainDB. When we shut down with an exception indicating
-- corruption or something going wrong with the file system, we don't
-- create this marker file so that the next time we start, we do a full
-- validation.
lastShutDownWasClean <- hasCleanShutdownMarker hasFS
when lastShutDownWasClean $ removeCleanShutdownMarker hasFS

-- On a clean shutdown, create a marker in the database folder so that
-- next time we start up, we know we don't have to validate the whole
-- database.
createMarkerOnCleanShutdown pb hasFS $
body (LastShutDownWasClean lastShutDownWasClean)
withLockDB mountPoint $ runWithCheckedDB pb hasFS body
where
mountPoint = MountPoint databasePath
hasFS = ioHasFS mountPoint
88 changes: 69 additions & 19 deletions ouroboros-consensus/src/Ouroboros/Consensus/Node/Recovery.hs
@@ -2,10 +2,11 @@
{-# LANGUAGE ScopedTypeVariables #-}

module Ouroboros.Consensus.Node.Recovery (
createCleanShutdownMarker
, createMarkerOnCleanShutdown
LastShutDownWasClean (..)
, createCleanShutdownMarker
, hasCleanShutdownMarker
, removeCleanShutdownMarker
, runWithCheckedDB
) where

import Control.Monad (unless, when)
@@ -16,6 +17,7 @@ import Ouroboros.Consensus.Block (StandardHash)
import Ouroboros.Consensus.Node.Exit (ExitReason (..), toExitReason)
import Ouroboros.Consensus.Util.IOLike

import Ouroboros.Consensus.Storage.ChainDB
import Ouroboros.Consensus.Storage.FS.API (HasFS, doesFileExist,
removeFile, withFile)
import Ouroboros.Consensus.Storage.FS.API.Types (AllowExisting (..),
Expand All @@ -25,23 +27,9 @@ import Ouroboros.Consensus.Storage.FS.API.Types (AllowExisting (..),
cleanShutdownMarkerFile :: FsPath
cleanShutdownMarkerFile = mkFsPath ["clean"]

-- | When the given action terminates with a /clean/ exception, create the
-- /clean shutdown marker file/.
--
-- NOTE: we assume the action (i.e., the node itself) never terminates without
-- an exception.
--
-- A /clean/ exception is an exception for which 'exceptionRequiresRecovery'
-- returns 'False'.
createMarkerOnCleanShutdown ::
(IOLike m, StandardHash blk, Typeable blk)
=> Proxy blk
-> HasFS m h
-> m a -- ^ Action to run
-> m a
createMarkerOnCleanShutdown pb mp = onExceptionIf
(not . exceptionRequiresRecovery pb)
(createCleanShutdownMarker mp)
-- | Did the ChainDB already have an existing clean-shutdown marker on disk?
newtype LastShutDownWasClean = LastShutDownWasClean Bool
deriving (Eq, Show)

-- | Return 'True' when 'cleanShutdownMarkerFile' exists.
hasCleanShutdownMarker
@@ -83,6 +71,68 @@ exceptionRequiresRecovery pb e = case toExitReason pb e of
DatabaseCorruption -> True
_ -> False

-- | A bracket function that manages the clean-shutdown marker on disk.
--
-- - If the marker is missing on startup, then ChainDB initialization will
-- revalidate the database contents.
--
-- - If the OS kills the node, then we don't have the opportunity to write out
-- the marker file, which is fine, since we want the next startup to do
-- revalidation.
--
-- - If initialization was cleanly interrupted (e.g. SIGINT), then we leave
-- the marker in the same state as it was at the beginning of said
-- initialization.
--
-- - At the end of a successful initialization, we remove the marker and install
-- a shutdown handler that writes the marker, except for certain exceptions
-- (see 'exceptionRequiresRecovery') that indicate corruption, for which we
-- want the next startup to do revalidation.
runWithCheckedDB
:: forall a m h blk. (IOLike m, StandardHash blk, Typeable blk)
=> Proxy blk
-> HasFS m h
-> (LastShutDownWasClean -> (ChainDB m blk -> m a -> m a) -> m a)
-> m a
runWithCheckedDB pb hasFS body = do
-- When we shut down cleanly, we create a marker file so that the next
-- time we start, we know we don't have to validate the contents of the
-- whole ChainDB. When we shut down with an exception indicating
-- corruption or something going wrong with the file system, we don't
-- create this marker file so that the next time we start, we do a full
-- validation.
wasClean <- hasCleanShutdownMarker hasFS

removeMarkerOnUncleanShutdown wasClean
$ body
(LastShutDownWasClean wasClean)
(\_cdb runWithInitializedChainDB -> createMarkerOnCleanShutdown $ do
-- ChainDB initialization has finished by the time we reach this
-- point. We remove the marker so that a SIGKILL will cause an unclean
-- shutdown.
when wasClean $ removeCleanShutdownMarker hasFS
runWithInitializedChainDB
)
where
-- | If there is an unclean exception during ChainDB initialization, we want
-- to remove the marker file, so we install this handler.
--
-- It is OK to also wrap this handler around code that runs after ChainDB
-- initialization, because the condition on this handler is the opposite of
-- the condition in the @createMarkerOnCleanShutdown@ handler.
removeMarkerOnUncleanShutdown wasClean = if not wasClean then id else onExceptionIf
(exceptionRequiresRecovery pb)
(removeCleanShutdownMarker hasFS)

-- | If a clean exception terminates the running node after ChainDB
-- initialization, we want to create the marker file.
--
-- NOTE: we assume the action (i.e., the node itself) never terminates without
-- an exception.
createMarkerOnCleanShutdown = onExceptionIf
(not . exceptionRequiresRecovery pb)
(createCleanShutdownMarker hasFS)

{-------------------------------------------------------------------------------
Auxiliary
-------------------------------------------------------------------------------}
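
The collapsed Auxiliary section defines the `onExceptionIf` combinator used twice above. Presumably it has roughly the following shape (a sketch generalising the plain-`IO` version from the PR description over `IOLike` and the exception type; not the verbatim definition):

```haskell
-- Presumed sketch of the elided helper: run an action and, if an escaping
-- exception satisfies the predicate, run the handler before rethrowing.
onExceptionIf :: (IOLike m, Exception e)
              => (e -> Bool) -- ^ predicate on the exception
              -> m ()        -- ^ handler to run
              -> m a
              -> m a
onExceptionIf p handler action = action `catch` \e -> do
    when (p e) handler
    throwIO e
```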
