diff --git a/.daily_canary b/.daily_canary index 6f5c304deeaf..c409776c5684 100644 --- a/.daily_canary +++ b/.daily_canary @@ -1 +1 @@ -Piou +Tweet. diff --git a/CHANGELOG.md b/CHANGELOG.md index 79a06c04face..0104c2844f63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## Unreleased + +### Fixed + +- Fixed an issue where new node started without a snapshot would be able to join from a node that started with a snapshot (#3573). + ## [2.0.0-rc8] ### Fixed diff --git a/doc/operations/ledger_snapshot.rst b/doc/operations/ledger_snapshot.rst index a1b13233da6b..f3d961c225d9 100644 --- a/doc/operations/ledger_snapshot.rst +++ b/doc/operations/ledger_snapshot.rst @@ -70,7 +70,9 @@ Join/Recover From Snapshot Once a snapshot has been generated by the primary, operators can copy or mount the snapshot directory to the new node directory before it is started. On start-up, the new node will automatically resume from the latest committed snapshot file in the ``snapshots.directory`` directory. If no snapshot file is found, all historical transactions will be replicated to that node. -From 2.x releases (specifically, from `-dev5`), committed snapshot files embed the receipt of the evidence transaction. As such, nodes can join or recover a service from a standalone snapshot file. For 1.x releases, it is expected that operators also copy the ledger suffix containing the proof of commit of the evidence transaction to the node's ledger directory. +From 2.x releases, committed snapshot files embed the receipt of the evidence transaction. As such, nodes can join or recover a service from a standalone snapshot file. For 1.x releases, it is expected that operators also copy the ledger suffix containing the proof of commit of the evidence transaction to the node's ledger directory. + +It is important to note that new nodes cannot resume from a snapshot and join a service via a node that started from a more recent snapshot. For example, if a new node resumes from a snapshot generated at ``seqno 100`` and joins from a (primary) node that originally resumed from a snapshot at ``seqno 50``, the new node will throw a ``StartupSeqnoIsOld`` error shortly after starting up. It is expected that operators copy the *latest* committed snapshot file to new nodes before start up. .. note:: Snapshots emitted by 1.x nodes can be used by 2.x nodes to join or a recover a service. diff --git a/doc/operations/start_network.rst b/doc/operations/start_network.rst index ee05d3fbebf8..03f229c66d51 100644 --- a/doc/operations/start_network.rst +++ b/doc/operations/start_network.rst @@ -21,7 +21,7 @@ To create a new CCF network, the first node of the network should be started wit Uninitialized-- config -->Initialized; Initialized-- start -->PartOfNetwork; -The unique identifier of a CCF node is the hex-encoded string of the SHA-256 digest the public key contained in its identity certificate (e.g. ``50211327a77fc16dd2fba8fae5fffac3df909fceeb307cf804a4125ae2679007``). This unique identifier should be used by operators and members to refer to this node with CCF (for example, when :ref:`governance/common_member_operations:Trusting a New Node`). +The unique identifier of a CCF node is the hex-encoded string of the SHA-256 digest of the public key contained in its identity certificate (e.g. ``50211327a77fc16dd2fba8fae5fffac3df909fceeb307cf804a4125ae2679007``). This unique identifier should be used by operators and members to refer to this node with CCF (for example, when :ref:`governance/common_member_operations:Trusting a New Node`). CCF nodes can be started by using IP Addresses (both IPv4 and IPv6 are supported) or by specifying a fully qualified domain name. If an FQDN is used then a ``dNSName`` subject alternative name should be specified as part of the ``node_certificate.subject_alt_names`` configuration entry. Once a DNS has been setup it will be possible to connect to the node over TLS by using the node's domain name. diff --git a/include/ccf/odata_error.h b/include/ccf/odata_error.h index 7cbb5b6d392c..9ac2cc6db57d 100644 --- a/include/ccf/odata_error.h +++ b/include/ccf/odata_error.h @@ -80,7 +80,7 @@ namespace ccf ERROR(InvalidQuote) ERROR(InvalidNodeState) ERROR(NodeAlreadyExists) - ERROR(StartupSnapshotIsOld) + ERROR(StartupSeqnoIsOld) ERROR(CSRPublicKeyInvalid) ERROR(ResharingAlreadyCompleted) diff --git a/python/ccf/ledger.py b/python/ccf/ledger.py index 2ac3cf85ffbd..8dbee28d9c1b 100644 --- a/python/ccf/ledger.py +++ b/python/ccf/ledger.py @@ -939,7 +939,7 @@ def try_add_chunk(path): raise ValueError( f"Ledger cannot parse committed chunk {file_b} following uncommitted chunk {file_a}" ) - if range_a[1] is not None and range_a[1] + 1 != range_b[0]: + if validator and range_a[1] is not None and range_a[1] + 1 != range_b[0]: raise ValueError( f"Ledger cannot parse non-contiguous chunks {file_a} and {file_b}" ) @@ -974,9 +974,9 @@ def get_transaction(self, seqno: int) -> Transaction: transaction = None for chunk in self: _, chunk_end = chunk.get_seqnos() - if chunk_end and chunk_end < seqno: - continue for tx in chunk: + if chunk_end and chunk_end < seqno: + continue public_transaction = tx.get_public_domain() if public_transaction.get_seqno() == seqno: return tx diff --git a/src/node/node_state.h b/src/node/node_state.h index e9d6952904b6..d1a78adefa3a 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -194,7 +194,7 @@ namespace ccf std::unique_ptr startup_snapshot_info = nullptr; // Set to the snapshot seqno when a node starts from one and remembered for // the lifetime of the node - std::optional startup_seqno = std::nullopt; + kv::Version startup_seqno = 0; std::shared_ptr make_encryptor() { @@ -242,7 +242,7 @@ namespace ccf config.recover.previous_service_identity); startup_seqno = startup_snapshot_info->seqno; - last_recovered_idx = startup_seqno.value(); + last_recovered_idx = startup_seqno; last_recovered_signed_idx = last_recovered_idx; return !startup_snapshot_info->requires_ledger_verification(); @@ -1664,7 +1664,7 @@ namespace ccf return self; } - std::optional get_startup_snapshot_seqno() override + kv::Version get_startup_snapshot_seqno() override { std::lock_guard guard(lock); return startup_seqno; diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 41e26745c99d..6b64962058b3 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -390,23 +390,23 @@ namespace ccf this->network.consensus_type)); } - // If the joiner and this node both started from a snapshot, make sure - // that the joiner's snapshot is more recent than this node's snapshot + // Make sure that the joiner's snapshot is more recent than this node's + // snapshot. Otherwise, the joiner may not be given all the ledger + // secrets required to replay historical transactions. auto this_startup_seqno = this->node_operation.get_startup_snapshot_seqno(); if ( - this_startup_seqno.has_value() && in.startup_seqno.has_value() && - this_startup_seqno.value() > in.startup_seqno.value()) + in.startup_seqno.has_value() && + this_startup_seqno > in.startup_seqno.value()) { return make_error( HTTP_STATUS_BAD_REQUEST, - ccf::errors::StartupSnapshotIsOld, + ccf::errors::StartupSeqnoIsOld, fmt::format( - "Node requested to join from snapshot at seqno {} which is " - "older " - "than this node startup seqno {}", + "Node requested to join from seqno {} which is " + "older than this node startup seqno {}", in.startup_seqno.value(), - this_startup_seqno.value())); + this_startup_seqno)); } auto nodes = args.tx.rw(this->network.nodes); @@ -644,7 +644,7 @@ namespace ccf result.recovery_target_seqno = rts; result.last_recovered_seqno = lrs; result.startup_seqno = - this->node_operation.get_startup_snapshot_seqno().value_or(0); + this->node_operation.get_startup_snapshot_seqno(); auto signatures = args.tx.template ro(Tables::SIGNATURES); auto sig = signatures->get(); diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index cf6c7f779517..90a0eeeb2d44 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -41,7 +41,7 @@ namespace ccf const QuoteInfo& quote_info, const std::vector& expected_node_public_key_der, CodeDigest& code_digest) = 0; - virtual std::optional get_startup_snapshot_seqno() = 0; + virtual kv::Version get_startup_snapshot_seqno() = 0; virtual SessionMetrics get_session_metrics() = 0; virtual size_t get_jwt_attempts() = 0; virtual crypto::Pem get_self_signed_certificate() = 0; diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index 8ec7e9458c20..82b0a49efd47 100644 --- a/src/node/rpc/node_operation.h +++ b/src/node/rpc/node_operation.h @@ -55,7 +55,7 @@ namespace ccf return impl.get_last_recovered_signed_idx(); } - std::optional get_startup_snapshot_seqno() override + kv::Version get_startup_snapshot_seqno() override { return impl.get_startup_snapshot_seqno(); } diff --git a/src/node/rpc/node_operation_interface.h b/src/node/rpc/node_operation_interface.h index 888d424ac623..cf6c52c8ed97 100644 --- a/src/node/rpc/node_operation_interface.h +++ b/src/node/rpc/node_operation_interface.h @@ -39,7 +39,7 @@ namespace ccf virtual bool can_replicate() = 0; virtual kv::Version get_last_recovered_signed_idx() = 0; - virtual std::optional get_startup_snapshot_seqno() = 0; + virtual kv::Version get_startup_snapshot_seqno() = 0; virtual SessionMetrics get_session_metrics() = 0; virtual size_t get_jwt_attempts() = 0; diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index 46f90c69ef28..92aaa6c78bc7 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -57,11 +57,6 @@ namespace ccf return kv::NoVersion; } - std::optional get_startup_snapshot_seqno() override - { - return std::nullopt; - } - SessionMetrics get_session_metrics() override { return {}; @@ -81,6 +76,11 @@ namespace ccf return QuoteVerificationResult::Verified; } + kv::Version get_startup_snapshot_seqno() override + { + return 0; + } + void initiate_private_recovery(kv::Tx& tx) override { throw std::logic_error("Unimplemented"); diff --git a/tests/infra/network.py b/tests/infra/network.py index f0713400179c..ce79871fed39 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -71,7 +71,7 @@ class CodeIdNotFound(Exception): pass -class StartupSnapshotIsOld(Exception): +class StartupSeqnoIsOld(Exception): pass @@ -231,15 +231,15 @@ def _add_node( ledger_dir=None, copy_ledger_read_only=False, read_only_ledger_dirs=None, - from_snapshot=False, + from_snapshot=True, snapshots_dir=None, **kwargs, ): # Contact primary if no target node is set - if target_node is None: - target_node, _ = self.find_primary( - timeout=args.ledger_recovery_timeout if recovery else 3 - ) + primary, _ = self.find_primary( + timeout=args.ledger_recovery_timeout if recovery else 3 + ) + target_node = target_node or primary LOG.info(f"Joining from target node {target_node.local_node_id}") committed_ledger_dirs = read_only_ledger_dirs or [] @@ -248,9 +248,8 @@ def _add_node( # Note: Copy snapshot before ledger as retrieving the latest snapshot may require # to produce more ledger entries if from_snapshot: - # Only retrieve snapshot from target node if the snapshot directory is not - # specified - snapshots_dir = snapshots_dir or self.get_committed_snapshots(target_node) + # Only retrieve snapshot from primary if the snapshot directory is not specified + snapshots_dir = snapshots_dir or self.get_committed_snapshots(primary) if os.listdir(snapshots_dir): LOG.info(f"Joining from snapshot directory: {snapshots_dir}") else: @@ -723,8 +722,8 @@ def join_node( for error in errors: if "Quote does not contain known enclave measurement" in error: raise CodeIdNotFound from e - if "StartupSnapshotIsOld" in error: - raise StartupSnapshotIsOld from e + if "StartupSeqnoIsOld" in error: + raise StartupSeqnoIsOld from e raise def trust_node( @@ -1244,11 +1243,11 @@ def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=20): return node.get_committed_snapshots(wait_for_snapshots_to_be_committed) - def _get_ledger_public_view_at(self, node, call, seqno, timeout): + def _get_ledger_public_view_at(self, node, call, seqno, timeout, insecure=False): end_time = time.time() + timeout while time.time() < end_time: try: - return call(seqno) + return call(seqno, insecure=insecure) except Exception as ex: LOG.info(f"Exception: {ex}") self.consortium.create_and_withdraw_large_proposal(node) @@ -1257,20 +1256,20 @@ def _get_ledger_public_view_at(self, node, call, seqno, timeout): f"Could not read transaction at seqno {seqno} from ledger {node.remote.ledger_paths()} after {timeout}s" ) - def get_ledger_public_state_at(self, seqno, timeout=5): + def get_ledger_public_state_at(self, seqno, timeout=5, insecure=False): primary, _ = self.find_primary() return self._get_ledger_public_view_at( - primary, primary.get_ledger_public_tables_at, seqno, timeout + primary, primary.get_ledger_public_tables_at, seqno, timeout, insecure ) - def get_latest_ledger_public_state(self, timeout=5): + def get_latest_ledger_public_state(self, insecure=False, timeout=5): primary, _ = self.find_primary() with primary.client() as nc: resp = nc.get("/node/commit") body = resp.body.json() tx_id = TxID.from_str(body["transaction_id"]) return self._get_ledger_public_view_at( - primary, primary.get_ledger_public_state_at, tx_id.seqno, timeout + primary, primary.get_ledger_public_state_at, tx_id.seqno, timeout, insecure ) @functools.cached_property diff --git a/tests/infra/node.py b/tests/infra/node.py index fe402dff6215..37f9093de13b 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -451,14 +451,16 @@ def wait_for_node_to_join(self, timeout=3): f"Node {self.local_node_id} failed to join the network" ) from e - def get_ledger_public_tables_at(self, seqno): - ledger = ccf.ledger.Ledger(self.remote.ledger_paths()) + def get_ledger_public_tables_at(self, seqno, insecure=False): + validator = ccf.ledger.LedgerValidator() if not insecure else None + ledger = ccf.ledger.Ledger(self.remote.ledger_paths(), validator=validator) assert ledger.last_committed_chunk_range[1] >= seqno tx = ledger.get_transaction(seqno) return tx.get_public_domain().get_tables() - def get_ledger_public_state_at(self, seqno): - ledger = ccf.ledger.Ledger(self.remote.ledger_paths()) + def get_ledger_public_state_at(self, seqno, insecure=False): + validator = ccf.ledger.LedgerValidator() if not insecure else None + ledger = ccf.ledger.Ledger(self.remote.ledger_paths(), validator=validator) assert ledger.last_committed_chunk_range[1] >= seqno return ledger.get_latest_public_state() diff --git a/tests/infra/remote.py b/tests/infra/remote.py index 78ee11252042..db675570ba8c 100644 --- a/tests/infra/remote.py +++ b/tests/infra/remote.py @@ -935,11 +935,25 @@ def get_snapshots(self): return os.path.join(self.common_dir, self.snapshot_dir_name) def get_committed_snapshots(self, pre_condition_func=lambda src_dir, _: True): - self.remote.get( - self.snapshot_dir_name, - self.common_dir, - pre_condition_func=pre_condition_func, - ) + # It is possible that snapshots are committed while the copy is happening + # so retry a reasonable number of times. + max_retry_count = 5 + retry_count = 0 + while retry_count < max_retry_count: + try: + self.remote.get( + self.snapshot_dir_name, + self.common_dir, + pre_condition_func=pre_condition_func, + ) + break + except Exception as e: + LOG.warning( + f"Error copying committed snapshots from {self.snapshot_dir_name}: {e}. Retrying..." + ) + retry_count += 1 + time.sleep(0.1) + return os.path.join(self.common_dir, self.snapshot_dir_name) def log_path(self): diff --git a/tests/lts_compatibility.py b/tests/lts_compatibility.py index 7985ff916d6e..8b209b218057 100644 --- a/tests/lts_compatibility.py +++ b/tests/lts_compatibility.py @@ -210,6 +210,7 @@ def run_code_upgrade_from( old_nodes = network.get_joined_nodes() primary, _ = network.find_primary() + from_major_version = primary.major_version LOG.info("Apply transactions to old service") issue_activity_on_live_service(network, args) @@ -324,13 +325,13 @@ def run_code_upgrade_from( node.stop() LOG.info("Service is now made of new nodes only") + primary, _ = network.find_nodes() # Rollover JWKS so that new primary must read historical CA bundle table # and retrieve new keys via auto refresh if not os.getenv("CONTAINER_NODES"): jwt_issuer.refresh_keys() # Note: /gov/jwt_keys/all endpoint was added in 2.x - primary, _ = network.find_nodes() if not primary.major_version or primary.major_version > 1: jwt_issuer.wait_for_refresh(network) else: @@ -354,7 +355,17 @@ def run_code_upgrade_from( ) # Check that the ledger can be parsed - network.get_latest_ledger_public_state() + # Note: When upgrading from 1.x to 2.x, it is possible that ledger chunk are not + # in sync between nodes, which may cause some chunks to differ when starting + # from a snapshot. See https://github.com/microsoft/ccf/issues/3613. In such case, + # we only verify that the ledger can be parsed, even if some chunks are duplicated. + # This can go once 2.0 is released. + insecure_ledger_verification = ( + from_major_version == 1 and primary.version_after("ccf-2.0.0-rc7") + ) + network.get_latest_ledger_public_state( + insecure=insecure_ledger_verification + ) @reqs.description("Run live compatibility with latest LTS") diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 76854ef11685..8283c9a87d99 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -68,8 +68,8 @@ def wait_for_reconfiguration_to_complete(network, timeout=10): raise Exception("Reconfiguration did not complete in time") -@reqs.description("Adding a valid node without snapshot") -def test_add_node(network, args): +@reqs.description("Adding a valid node") +def test_add_node(network, args, from_snapshot=True): # Note: host is supplied explicitly to avoid having differently # assigned IPs for the interfaces, something which the test infra doesn't # support widely yet. @@ -90,7 +90,7 @@ def test_add_node(network, args): } ) ) - network.join_node(new_node, args.package, args, from_snapshot=False) + network.join_node(new_node, args.package, args, from_snapshot=from_snapshot) # Verify self-signed node certificate validity period new_node.verify_certificate_validity_period(interface_name=operator_rpc_interface) @@ -100,12 +100,14 @@ def test_add_node(network, args): args, validity_period_days=args.maximum_node_certificate_validity_days // 2, ) - with new_node.client() as c: - s = c.get("/node/state") - assert s.body.json()["node_id"] == new_node.node_id - assert ( - s.body.json()["startup_seqno"] == 0 - ), "Node started without snapshot but reports startup seqno != 0" + + if not from_snapshot: + with new_node.client() as c: + s = c.get("/node/state") + assert s.body.json()["node_id"] == new_node.node_id + assert ( + s.body.json()["startup_seqno"] == 0 + ), "Node started without snapshot but reports startup seqno != 0" # Now that the node is trusted, verify endorsed certificate validity period new_node.verify_certificate_validity_period() @@ -164,7 +166,10 @@ def test_change_curve(network, args): def test_add_node_from_backup(network, args): new_node = network.create_node("local://localhost") network.join_node( - new_node, args.package, args, target_node=network.find_any_backup() + new_node, + args.package, + args, + target_node=network.find_any_backup(), ) network.trust_node(new_node, args) return network @@ -231,7 +236,7 @@ def test_add_as_many_pending_nodes(network, args): new_nodes = [] for _ in range(number_new_nodes): new_node = network.create_node("local://localhost") - network.join_node(new_node, args.package, args, from_snapshot=False) + network.join_node(new_node, args.package, args) new_nodes.append(new_node) for new_node in new_nodes: @@ -359,7 +364,7 @@ def test_node_replacement(network, args): f"local://{node_to_replace.get_public_rpc_host()}:{node_to_replace.get_public_rpc_port()}", node_port=node_to_replace.n2n_interface.port, ) - network.join_node(replacement_node, args.package, args, from_snapshot=False) + network.join_node(replacement_node, args.package, args) network.trust_node(replacement_node, args) assert replacement_node.node_id != node_to_replace.node_id @@ -487,7 +492,7 @@ def test_learner_catches_up(network, args): num_nodes_before = len(c0) new_node = network.create_node("local://localhost") - network.join_node(new_node, args.package, args, from_snapshot=False) + network.join_node(new_node, args.package, args) network.trust_node(new_node, args) with new_node.client() as c: @@ -560,16 +565,16 @@ def run(args): test_version(network, args) if args.consensus != "BFT": + test_add_node(network, args, from_snapshot=False) + test_add_node_with_read_only_ledger(network, args) test_join_straddling_primary_replacement(network, args) test_node_replacement(network, args) test_add_node_from_backup(network, args) - test_add_node(network, args) test_add_node_on_other_curve(network, args) test_retire_backup(network, args) test_add_as_many_pending_nodes(network, args) test_add_node(network, args) test_retire_primary(network, args) - test_add_node_with_read_only_ledger(network, args) test_add_node_from_snapshot(network, args) test_add_node_from_snapshot(network, args, from_backup=True) @@ -642,8 +647,33 @@ def run_join_old_snapshot(args): snapshots_dir=tmp_dir, timeout=3, ) - except infra.network.StartupSnapshotIsOld: - pass + except infra.network.StartupSeqnoIsOld: + LOG.info( + f"Node {new_node.local_node_id} started from old snapshot could not join the service, as expected" + ) + else: + raise RuntimeError( + f"Node {new_node.local_node_id} started from old snapshot unexpectedly joined the service" + ) + + # Start new node from no snapshot + try: + new_node = network.create_node("local://localhost") + network.join_node( + new_node, + args.package, + args, + from_snapshot=False, + timeout=3, + ) + except infra.network.StartupSeqnoIsOld: + LOG.info( + f"Node {new_node.local_node_id} started without snapshot could not join the service, as expected" + ) + else: + raise RuntimeError( + f"Node {new_node.local_node_id} started without snapshot unexpectedly joined the service successfully" + ) def get_current_nodes_table(network): @@ -747,7 +777,7 @@ def run_migration_tests(args): def run_all(args): run(args) if cr.args.consensus != "BFT": - run_join_old_snapshot(all) + run_join_old_snapshot(args) if __name__ == "__main__": @@ -764,7 +794,7 @@ def add(parser): cr.add( "1tx_reconfig", - run, + run_all, package="samples/apps/logging/liblogging", nodes=infra.e2e_args.min_nodes(cr.args, f=1), reconfiguration_type="OneTransaction", diff --git a/tests/recovery.py b/tests/recovery.py index 7d49f883320b..3a4058df3bcb 100644 --- a/tests/recovery.py +++ b/tests/recovery.py @@ -41,7 +41,7 @@ def get_and_verify_historical_receipt(network, ref_msg): @reqs.description("Recover a service") @reqs.recover(number_txs=2) -def test_recover_service(network, args, from_snapshot=False): +def test_recover_service(network, args, from_snapshot=True): network.save_service_identity(args) old_primary, _ = network.find_primary()