From 908294c59dab47ad84085f149362b776b555fbd9 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Fri, 18 Feb 2022 12:38:07 +0000 Subject: [PATCH 01/24] Fir and re-enable test --- src/node/node_state.h | 6 +++--- src/node/rpc/node_frontend.h | 9 ++++----- src/node/rpc/node_interface.h | 2 +- src/node/rpc/test/node_stub.h | 2 +- tests/reconfiguration.py | 33 +++++++++++++++++++++++++++++---- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index d9165fbbfb5b..1ebf2bcb094c 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -130,7 +130,7 @@ namespace ccf std::unique_ptr startup_snapshot_info = nullptr; // Set to the snapshot seqno when a node starts from one and remembered for // the lifetime of the node - std::optional startup_seqno = std::nullopt; + kv::Version startup_seqno = 0; std::shared_ptr make_encryptor() { @@ -177,7 +177,7 @@ namespace ccf config.startup_snapshot_evidence_seqno_for_1_x); startup_seqno = startup_snapshot_info->seqno; - ledger_idx = startup_seqno.value(); + ledger_idx = startup_seqno; last_recovered_signed_idx = ledger_idx; return !startup_snapshot_info->requires_ledger_verification(); @@ -1463,7 +1463,7 @@ namespace ccf return self; } - std::optional get_startup_snapshot_seqno() override + kv::Version get_startup_snapshot_seqno() override { std::lock_guard guard(lock); return startup_seqno; diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 70919e663caf..98d3701de204 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -361,8 +361,8 @@ namespace ccf auto this_startup_seqno = this->context.get_node_state().get_startup_snapshot_seqno(); if ( - this_startup_seqno.has_value() && in.startup_seqno.has_value() && - this_startup_seqno.value() > in.startup_seqno.value()) + in.startup_seqno.has_value() && + this_startup_seqno > in.startup_seqno.value()) { return make_error( HTTP_STATUS_BAD_REQUEST, @@ -372,7 +372,7 @@ namespace ccf "older " "than this node startup seqno {}", in.startup_seqno.value(), - this_startup_seqno.value())); + this_startup_seqno)); } auto nodes = args.tx.rw(this->network.nodes); @@ -604,8 +604,7 @@ namespace ccf result.recovery_target_seqno = rts; result.last_recovered_seqno = lrs; result.startup_seqno = - this->context.get_node_state().get_startup_snapshot_seqno().value_or( - 0); + this->context.get_node_state().get_startup_snapshot_seqno(); auto signatures = args.tx.template ro(Tables::SIGNATURES); auto sig = signatures->get(); diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index d6fb7c886f9e..386a217681d6 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -51,7 +51,7 @@ namespace ccf const QuoteInfo& quote_info, const std::vector& expected_node_public_key_der, CodeDigest& code_digest) = 0; - virtual std::optional get_startup_snapshot_seqno() = 0; + virtual kv::Version get_startup_snapshot_seqno() = 0; virtual SessionMetrics get_session_metrics() = 0; virtual size_t get_jwt_attempts() = 0; }; diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index 4f91fadf9598..264eb1c5c59e 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -117,7 +117,7 @@ namespace ccf return QuoteVerificationResult::Verified; } - std::optional get_startup_snapshot_seqno() override + kv::Version get_startup_snapshot_seqno() override { return std::nullopt; } diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index a3ce5813fdae..715903d26860 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -643,7 +643,32 @@ def run_join_old_snapshot(args): timeout=3, ) except infra.network.StartupSnapshotIsOld: - pass + LOG.info( + f"Node {new_node.local_node_id} started from old snapshot could not join the service, as expected" + ) + else: + raise RuntimeError( + f"Node {new_node.local_node_id} started from old snapshot unexpectedly joined the service" + ) + + # Start new node from no snapshot + try: + new_node = network.create_node("local://localhost") + network.join_node( + new_node, + args.package, + args, + from_snapshot=False, + timeout=3, + ) + except infra.network.StartupSnapshotIsOld: + LOG.info( + f"Node {new_node.local_node_id} started without snapshot could not join the service, as expected" + ) + else: + raise RuntimeError( + f"Node {new_node.local_node_id} started without snapshot unexpectedly joined the service successfully" + ) def get_current_nodes_table(network): @@ -745,7 +770,7 @@ def run_migration_tests(args): def run_all(args): run(args) if cr.args.consensus != "BFT": - run_join_old_snapshot(all) + run_join_old_snapshot(args) if __name__ == "__main__": @@ -762,7 +787,7 @@ def add(parser): cr.add( "1tx_reconfig", - run, + run_all, package="samples/apps/logging/liblogging", nodes=infra.e2e_args.min_nodes(cr.args, f=1), reconfiguration_type="OneTransaction", @@ -771,7 +796,7 @@ def add(parser): if cr.args.include_2tx_reconfig: cr.add( "2tx_reconfig", - run, + run_all, package="samples/apps/logging/liblogging", nodes=infra.e2e_args.min_nodes(cr.args, f=1), reconfiguration_type="TwoTransaction", From 23f49a4c4cef6ebac2d6e1863986dc00bbd4ab3b Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Fri, 18 Feb 2022 12:42:52 +0000 Subject: [PATCH 02/24] Changelog --- .daily_canary | 2 +- CHANGELOG.md | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.daily_canary b/.daily_canary index 866d416cb6ad..a58a6b8b2b4b 100644 --- a/.daily_canary +++ b/.daily_canary @@ -1 +1 @@ -No no he's not dead, he's, he's restin'! +No no he's not dead, he's, he's restin'. diff --git a/CHANGELOG.md b/CHANGELOG.md index a13febb2aa9d..a10845364669 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## Unreleased + +### Fixed + +- Fixed an issue where new node started without a snapshot would be able to join from a node that started with a snapshot (#3573). + ## [2.0.0-rc1] ### Added From 179f489c6c24321fc355778eda445a7a57c31427 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Fri, 18 Feb 2022 12:56:29 +0000 Subject: [PATCH 03/24] Fix build --- src/node/rpc/test/node_stub.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index 264eb1c5c59e..8dc20569af8a 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -119,7 +119,7 @@ namespace ccf kv::Version get_startup_snapshot_seqno() override { - return std::nullopt; + return 0; } SessionMetrics get_session_metrics() override From 40c18395a03f28ee775943483d1cf34b50fa3c78 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Fri, 18 Feb 2022 14:30:44 +0000 Subject: [PATCH 04/24] LTS compat, further nodes should start from snapshots too --- src/node/rpc/node_frontend.h | 5 ++--- tests/lts_compatibility.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 98d3701de204..526e954fd96f 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -368,9 +368,8 @@ namespace ccf HTTP_STATUS_BAD_REQUEST, ccf::errors::StartupSnapshotIsOld, fmt::format( - "Node requested to join from snapshot at seqno {} which is " - "older " - "than this node startup seqno {}", + "Node requested to join from seqno {} which is " + "older than this node startup seqno {}", in.startup_seqno.value(), this_startup_seqno)); } diff --git a/tests/lts_compatibility.py b/tests/lts_compatibility.py index ed360acfee15..012e1bd609ed 100644 --- a/tests/lts_compatibility.py +++ b/tests/lts_compatibility.py @@ -100,7 +100,7 @@ def test_new_service( library_dir=library_dir, version=version, ) - network.join_node(new_node, args.package, args) + network.join_node(new_node, args.package, args, from_snapshot=True) network.trust_node(new_node, args) new_node.verify_certificate_validity_period( expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS From 3dd9c75aea74ca297d1816352c8eb6f0330d7e76 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Fri, 18 Feb 2022 15:33:40 +0000 Subject: [PATCH 05/24] Migration --- tests/reconfiguration.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 715903d26860..f357d268fbf7 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -707,7 +707,9 @@ def check_2tx_ledger(ledger_paths, learner_id): @reqs.description("Migrate from 1tx to 2tx reconfiguration scheme") -def test_migration_2tx_reconfiguration(network, args, initial_is_1tx=True, **kwargs): +def test_migration_2tx_reconfiguration( + network, args, initial_is_1tx=True, from_snapshot=False, **kwargs +): primary, _ = network.find_primary() # Check that the service config agrees that this is a 1tx network @@ -733,7 +735,7 @@ def test_migration_2tx_reconfiguration(network, args, initial_is_1tx=True, **kwa assert len(rj["details"]["learners"]) == 0 new_node = network.create_node("local://localhost", **kwargs) - network.join_node(new_node, args.package, args) + network.join_node(new_node, args.package, args, from_snapshot=from_snapshot) network.trust_node(new_node, args) # Check that the new node has the right consensus parameter From f5659774daed82f685a505fedb27112f14827335 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Fri, 18 Feb 2022 16:05:39 +0000 Subject: [PATCH 06/24] Fix --- tests/lts_compatibility.py | 1 + tests/reconfiguration.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/lts_compatibility.py b/tests/lts_compatibility.py index 012e1bd609ed..80a27c8c253f 100644 --- a/tests/lts_compatibility.py +++ b/tests/lts_compatibility.py @@ -129,6 +129,7 @@ def test_new_service( network, args, initial_is_1tx=False, # Reconfiguration type added in 2.x + from_snapshot=True, binary_dir=binary_dir, library_dir=library_dir, version=version, diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index f357d268fbf7..9109895a2be5 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -770,7 +770,7 @@ def run_migration_tests(args): def run_all(args): - run(args) + # run(args) if cr.args.consensus != "BFT": run_join_old_snapshot(args) @@ -798,7 +798,7 @@ def add(parser): if cr.args.include_2tx_reconfig: cr.add( "2tx_reconfig", - run_all, + run, package="samples/apps/logging/liblogging", nodes=infra.e2e_args.min_nodes(cr.args, f=1), reconfiguration_type="TwoTransaction", From afb694f9029878c2ed667e94187a8e1963b4b5b9 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Fri, 18 Feb 2022 17:19:16 +0000 Subject: [PATCH 07/24] from snapshot by default --- tests/infra/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/infra/network.py b/tests/infra/network.py index dc458ab84e99..e051593f4006 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -217,7 +217,7 @@ def _add_node( ledger_dir=None, copy_ledger_read_only=False, read_only_ledger_dirs=None, - from_snapshot=False, + from_snapshot=True, snapshots_dir=None, **kwargs, ): From 247fa04bc79444d5ac21863b515a56aab2483aa3 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Mon, 21 Feb 2022 09:16:39 +0000 Subject: [PATCH 08/24] Fix --- tests/lts_compatibility.py | 3 +-- tests/reconfiguration.py | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/lts_compatibility.py b/tests/lts_compatibility.py index 82e7c64bba15..3e8579ead5c1 100644 --- a/tests/lts_compatibility.py +++ b/tests/lts_compatibility.py @@ -100,7 +100,7 @@ def test_new_service( library_dir=library_dir, version=version, ) - network.join_node(new_node, args.package, args, from_snapshot=True) + network.join_node(new_node, args.package, args) network.trust_node(new_node, args) new_node.verify_certificate_validity_period( expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS @@ -129,7 +129,6 @@ def test_new_service( network, args, initial_is_1tx=False, # Reconfiguration type added in 2.x - from_snapshot=True, binary_dir=binary_dir, library_dir=library_dir, version=version, diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 8e4852b2b980..cee854bf4368 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -69,7 +69,7 @@ def wait_for_reconfiguration_to_complete(network, timeout=10): @reqs.description("Adding a valid node without snapshot") -def test_add_node(network, args): +def test_add_node(network, args, from_snapshot=True): # Note: host is supplied explicitly to avoid having differently # assigned IPs for the interfaces, something which the test infra doesn't # support widely yet. @@ -88,7 +88,7 @@ def test_add_node(network, args): } ) ) - network.join_node(new_node, args.package, args, from_snapshot=False) + network.join_node(new_node, args.package, args, from_snapshot=from_snapshot) # Verify self-signed node certificate validity period new_node.verify_certificate_validity_period(interface_name=operator_rpc_interface) @@ -563,11 +563,11 @@ def run(args): test_join_straddling_primary_replacement(network, args) test_node_replacement(network, args) test_add_node_from_backup(network, args) - test_add_node(network, args) + test_add_node(network, args, from_snapshot=False) test_add_node_on_other_curve(network, args) test_retire_backup(network, args) test_add_as_many_pending_nodes(network, args) - test_add_node(network, args) + test_add_node(network, args, from_snapshot=False) test_retire_primary(network, args) test_add_node_with_read_only_ledger(network, args) @@ -708,7 +708,7 @@ def check_2tx_ledger(ledger_paths, learner_id): @reqs.description("Migrate from 1tx to 2tx reconfiguration scheme") def test_migration_2tx_reconfiguration( - network, args, initial_is_1tx=True, from_snapshot=False, **kwargs + network, args, initial_is_1tx=True, **kwargs ): primary, _ = network.find_primary() @@ -735,7 +735,7 @@ def test_migration_2tx_reconfiguration( assert len(rj["details"]["learners"]) == 0 new_node = network.create_node("local://localhost", **kwargs) - network.join_node(new_node, args.package, args, from_snapshot=from_snapshot) + network.join_node(new_node, args.package, args) network.trust_node(new_node, args) # Check that the new node has the right consensus parameter @@ -770,7 +770,7 @@ def run_migration_tests(args): def run_all(args): - # run(args) + run(args) if cr.args.consensus != "BFT": run_join_old_snapshot(args) From 3a16e09402bb4ef97f3e027ca2d926315851ebd2 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Mon, 21 Feb 2022 09:31:41 +0000 Subject: [PATCH 09/24] fmt --- tests/reconfiguration.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index cee854bf4368..9d6e91c0fb0f 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -707,9 +707,7 @@ def check_2tx_ledger(ledger_paths, learner_id): @reqs.description("Migrate from 1tx to 2tx reconfiguration scheme") -def test_migration_2tx_reconfiguration( - network, args, initial_is_1tx=True, **kwargs -): +def test_migration_2tx_reconfiguration(network, args, initial_is_1tx=True, **kwargs): primary, _ = network.find_primary() # Check that the service config agrees that this is a 1tx network From 1f82a31245e6e479ea92374cbbe4723e0289ff8f Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Mon, 21 Feb 2022 10:09:39 +0000 Subject: [PATCH 10/24] Oops --- tests/reconfiguration.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 9d6e91c0fb0f..3800d8043e48 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -98,12 +98,14 @@ def test_add_node(network, args, from_snapshot=True): args, validity_period_days=args.maximum_node_certificate_validity_days // 2, ) - with new_node.client() as c: - s = c.get("/node/state") - assert s.body.json()["node_id"] == new_node.node_id - assert ( - s.body.json()["startup_seqno"] == 0 - ), "Node started without snapshot but reports startup seqno != 0" + + if not from_snapshot: + with new_node.client() as c: + s = c.get("/node/state") + assert s.body.json()["node_id"] == new_node.node_id + assert ( + s.body.json()["startup_seqno"] == 0 + ), "Node started without snapshot but reports startup seqno != 0" # Now that the node is trusted, verify endorsed certificate validity period new_node.verify_certificate_validity_period() From bb90bfbe7fbd5f6476fa858a5392391a204073f3 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 22 Feb 2022 10:05:37 +0000 Subject: [PATCH 11/24] . --- tests/reconfiguration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 3800d8043e48..f39294a54459 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -68,7 +68,7 @@ def wait_for_reconfiguration_to_complete(network, timeout=10): raise Exception("Reconfiguration did not complete in time") -@reqs.description("Adding a valid node without snapshot") +@reqs.description("Adding a valid node") def test_add_node(network, args, from_snapshot=True): # Note: host is supplied explicitly to avoid having differently # assigned IPs for the interfaces, something which the test infra doesn't From dd4da07559d4042b9d7e4754501ea1abcb4c4b3c Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 22 Feb 2022 13:29:37 +0000 Subject: [PATCH 12/24] Fix --- tests/reconfiguration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index f39294a54459..9f7b47c3a4b1 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -562,6 +562,7 @@ def run(args): test_version(network, args) if args.consensus != "BFT": + test_add_node_with_read_only_ledger(network, args) test_join_straddling_primary_replacement(network, args) test_node_replacement(network, args) test_add_node_from_backup(network, args) @@ -571,7 +572,6 @@ def run(args): test_add_as_many_pending_nodes(network, args) test_add_node(network, args, from_snapshot=False) test_retire_primary(network, args) - test_add_node_with_read_only_ledger(network, args) test_add_node_from_snapshot(network, args) test_add_node_from_snapshot(network, args, from_backup=True) From 1288bd72b661388b151e74a7160a8ca5aae605a1 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Thu, 24 Feb 2022 16:09:48 +0000 Subject: [PATCH 13/24] now? --- tests/reconfiguration.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 9f7b47c3a4b1..7d858070ce95 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -231,7 +231,7 @@ def test_add_as_many_pending_nodes(network, args): new_nodes = [] for _ in range(number_new_nodes): new_node = network.create_node("local://localhost") - network.join_node(new_node, args.package, args, from_snapshot=False) + network.join_node(new_node, args.package, args) new_nodes.append(new_node) for new_node in new_nodes: @@ -359,7 +359,7 @@ def test_node_replacement(network, args): f"local://{node_to_replace.get_public_rpc_host()}:{node_to_replace.get_public_rpc_port()}", node_port=node_to_replace.n2n_interface.port, ) - network.join_node(replacement_node, args.package, args, from_snapshot=False) + network.join_node(replacement_node, args.package, args) network.trust_node(replacement_node, args) assert replacement_node.node_id != node_to_replace.node_id @@ -489,7 +489,7 @@ def test_learner_catches_up(network, args): num_nodes_before = len(c0) new_node = network.create_node("local://localhost") - network.join_node(new_node, args.package, args, from_snapshot=False) + network.join_node(new_node, args.package, args) network.trust_node(new_node, args) with new_node.client() as c: From 7d99eca22ef1851355b64aaa3a8711f8e7b96996 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Mon, 14 Mar 2022 16:41:35 +0000 Subject: [PATCH 14/24] Fix for `reconfiguration_test_cft` --- include/ccf/odata_error.h | 2 +- src/node/rpc/node_frontend.h | 7 +++--- tests/infra/network.py | 19 ++++++++--------- tests/reconfiguration.py | 41 +++++++++++++++++++----------------- 4 files changed, 36 insertions(+), 33 deletions(-) diff --git a/include/ccf/odata_error.h b/include/ccf/odata_error.h index 7cbb5b6d392c..9ac2cc6db57d 100644 --- a/include/ccf/odata_error.h +++ b/include/ccf/odata_error.h @@ -80,7 +80,7 @@ namespace ccf ERROR(InvalidQuote) ERROR(InvalidNodeState) ERROR(NodeAlreadyExists) - ERROR(StartupSnapshotIsOld) + ERROR(StartupSeqnoIsOld) ERROR(CSRPublicKeyInvalid) ERROR(ResharingAlreadyCompleted) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 05ceb77d3f13..58de22e36ff4 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -380,8 +380,9 @@ namespace ccf this->network.consensus_type)); } - // If the joiner and this node both started from a snapshot, make sure - // that the joiner's snapshot is more recent than this node's snapshot + // Make sure that the joiner's snapshot is more recent than this node's + // snapshot. Otherwise, the joiner may not be given all the ledger + // secrets required to replay historical transactions. auto this_startup_seqno = this->node_operation.get_startup_snapshot_seqno(); if ( @@ -390,7 +391,7 @@ namespace ccf { return make_error( HTTP_STATUS_BAD_REQUEST, - ccf::errors::StartupSnapshotIsOld, + ccf::errors::StartupSeqnoIsOld, fmt::format( "Node requested to join from seqno {} which is " "older than this node startup seqno {}", diff --git a/tests/infra/network.py b/tests/infra/network.py index 4e89a438ea63..0f3b6e12ceb2 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -68,7 +68,7 @@ class CodeIdNotFound(Exception): pass -class StartupSnapshotIsOld(Exception): +class StartupSeqnoIsOld(Exception): pass @@ -228,10 +228,10 @@ def _add_node( **kwargs, ): # Contact primary if no target node is set - if target_node is None: - target_node, _ = self.find_primary( - timeout=args.ledger_recovery_timeout if recovery else 3 - ) + primary, _ = self.find_primary( + timeout=args.ledger_recovery_timeout if recovery else 3 + ) + target_node = target_node or primary LOG.info(f"Joining from target node {target_node.local_node_id}") committed_ledger_dirs = read_only_ledger_dirs or [] @@ -243,9 +243,8 @@ def _add_node( current_ledger_dir, committed_ledger_dirs = target_node.get_ledger() if from_snapshot: - # Only retrieve snapshot from target node if the snapshot directory is not - # specified - snapshots_dir = snapshots_dir or self.get_committed_snapshots(target_node) + # Only retrieve snapshot from primary if the snapshot directory is not specified + snapshots_dir = snapshots_dir or self.get_committed_snapshots(primary) if os.listdir(snapshots_dir): LOG.info(f"Joining from snapshot directory: {snapshots_dir}") else: @@ -698,8 +697,8 @@ def join_node( for error in errors: if "Quote does not contain known enclave measurement" in error: raise CodeIdNotFound from e - if "StartupSnapshotIsOld" in error: - raise StartupSnapshotIsOld from e + if "StartupSeqnoIsOld" in error: + raise StartupSeqnoIsOld from e raise def trust_node( diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 7d858070ce95..34e61593d75c 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -164,7 +164,10 @@ def test_change_curve(network, args): def test_add_node_from_backup(network, args): new_node = network.create_node("local://localhost") network.join_node( - new_node, args.package, args, target_node=network.find_any_backup() + new_node, + args.package, + args, + target_node=network.find_any_backup(), ) network.trust_node(new_node, args) return network @@ -562,15 +565,15 @@ def run(args): test_version(network, args) if args.consensus != "BFT": + test_add_node(network, args, from_snapshot=False) test_add_node_with_read_only_ledger(network, args) test_join_straddling_primary_replacement(network, args) test_node_replacement(network, args) test_add_node_from_backup(network, args) - test_add_node(network, args, from_snapshot=False) test_add_node_on_other_curve(network, args) test_retire_backup(network, args) test_add_as_many_pending_nodes(network, args) - test_add_node(network, args, from_snapshot=False) + test_add_node(network, args) test_retire_primary(network, args) test_add_node_from_snapshot(network, args) @@ -644,7 +647,7 @@ def run_join_old_snapshot(args): snapshots_dir=tmp_dir, timeout=3, ) - except infra.network.StartupSnapshotIsOld: + except infra.network.StartupSeqnoIsOld: LOG.info( f"Node {new_node.local_node_id} started from old snapshot could not join the service, as expected" ) @@ -663,7 +666,7 @@ def run_join_old_snapshot(args): from_snapshot=False, timeout=3, ) - except infra.network.StartupSnapshotIsOld: + except infra.network.StartupSeqnoIsOld: LOG.info( f"Node {new_node.local_node_id} started without snapshot could not join the service, as expected" ) @@ -787,13 +790,13 @@ def add(parser): cr = ConcurrentRunner(add) - cr.add( - "1tx_reconfig", - run_all, - package="samples/apps/logging/liblogging", - nodes=infra.e2e_args.min_nodes(cr.args, f=1), - reconfiguration_type="OneTransaction", - ) + # cr.add( + # "1tx_reconfig", + # run_all, + # package="samples/apps/logging/liblogging", + # nodes=infra.e2e_args.min_nodes(cr.args, f=1), + # reconfiguration_type="OneTransaction", + # ) if cr.args.include_2tx_reconfig: cr.add( @@ -804,12 +807,12 @@ def add(parser): reconfiguration_type="TwoTransaction", ) - cr.add( - "migration", - run_migration_tests, - package="samples/apps/logging/liblogging", - nodes=infra.e2e_args.min_nodes(cr.args, f=1), - reconfiguration_type="OneTransaction", - ) + # cr.add( + # "migration", + # run_migration_tests, + # package="samples/apps/logging/liblogging", + # nodes=infra.e2e_args.min_nodes(cr.args, f=1), + # reconfiguration_type="OneTransaction", + # ) cr.run() From 9e990994d6ed1cc98ef826b2a4908db837075590 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Mon, 14 Mar 2022 16:43:21 +0000 Subject: [PATCH 15/24] . --- tests/reconfiguration.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 34e61593d75c..da3f97e82197 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -790,13 +790,13 @@ def add(parser): cr = ConcurrentRunner(add) - # cr.add( - # "1tx_reconfig", - # run_all, - # package="samples/apps/logging/liblogging", - # nodes=infra.e2e_args.min_nodes(cr.args, f=1), - # reconfiguration_type="OneTransaction", - # ) + cr.add( + "1tx_reconfig", + run_all, + package="samples/apps/logging/liblogging", + nodes=infra.e2e_args.min_nodes(cr.args, f=1), + reconfiguration_type="OneTransaction", + ) if cr.args.include_2tx_reconfig: cr.add( @@ -807,12 +807,12 @@ def add(parser): reconfiguration_type="TwoTransaction", ) - # cr.add( - # "migration", - # run_migration_tests, - # package="samples/apps/logging/liblogging", - # nodes=infra.e2e_args.min_nodes(cr.args, f=1), - # reconfiguration_type="OneTransaction", - # ) + cr.add( + "migration", + run_migration_tests, + package="samples/apps/logging/liblogging", + nodes=infra.e2e_args.min_nodes(cr.args, f=1), + reconfiguration_type="OneTransaction", + ) cr.run() From cd0c8ac26325840e311927521ce96f8b51c84c58 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 26 Apr 2022 16:39:16 +0000 Subject: [PATCH 16/24] Fix LTS compatibility test issue --- python/ccf/ledger.py | 2 +- tests/infra/network.py | 8 ++++---- tests/infra/node.py | 8 +++++--- tests/lts_compatibility.py | 13 +++++++++++-- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/python/ccf/ledger.py b/python/ccf/ledger.py index be06baf39f59..a46d77ac8db3 100644 --- a/python/ccf/ledger.py +++ b/python/ccf/ledger.py @@ -924,7 +924,7 @@ def try_add_chunk(path): raise ValueError( f"Ledger cannot parse committed chunk {file_b} following uncommitted chunk {file_a}" ) - if range_a[1] is not None and range_a[1] + 1 != range_b[0]: + if validator and range_a[1] is not None and range_a[1] + 1 != range_b[0]: raise ValueError( f"Ledger cannot parse non-contiguous chunks {file_a} and {file_b}" ) diff --git a/tests/infra/network.py b/tests/infra/network.py index 43f64eedba31..d7f79c9bb8a4 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -1243,11 +1243,11 @@ def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=20): return node.get_committed_snapshots(wait_for_snapshots_to_be_committed) - def _get_ledger_public_view_at(self, node, call, seqno, timeout): + def _get_ledger_public_view_at(self, node, call, seqno, timeout, insecure=False): end_time = time.time() + timeout while time.time() < end_time: try: - return call(seqno) + return call(seqno, insecure=insecure) except Exception as ex: LOG.info(f"Exception: {ex}") self.consortium.create_and_withdraw_large_proposal(node) @@ -1262,14 +1262,14 @@ def get_ledger_public_state_at(self, seqno, timeout=5): primary, primary.get_ledger_public_tables_at, seqno, timeout ) - def get_latest_ledger_public_state(self, timeout=5): + def get_latest_ledger_public_state(self, insecure=False, timeout=5): primary, _ = self.find_primary() with primary.client() as nc: resp = nc.get("/node/commit") body = resp.body.json() tx_id = TxID.from_str(body["transaction_id"]) return self._get_ledger_public_view_at( - primary, primary.get_ledger_public_state_at, tx_id.seqno, timeout + primary, primary.get_ledger_public_state_at, tx_id.seqno, timeout, insecure ) @functools.cached_property diff --git a/tests/infra/node.py b/tests/infra/node.py index fe402dff6215..19a2d39df0c3 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -451,13 +451,15 @@ def wait_for_node_to_join(self, timeout=3): f"Node {self.local_node_id} failed to join the network" ) from e - def get_ledger_public_tables_at(self, seqno): - ledger = ccf.ledger.Ledger(self.remote.ledger_paths()) + def get_ledger_public_tables_at(self, seqno, insecure=False): + validator = ccf.ledger.LedgerValidator() if not insecure else None + ledger = ccf.ledger.Ledger(self.remote.ledger_paths(), validator=validator) assert ledger.last_committed_chunk_range[1] >= seqno tx = ledger.get_transaction(seqno) return tx.get_public_domain().get_tables() - def get_ledger_public_state_at(self, seqno): + def get_ledger_public_state_at(self, seqno, insecure=False): + validator = ccf.ledger.LedgerValidator() if not insecure else None ledger = ccf.ledger.Ledger(self.remote.ledger_paths()) assert ledger.last_committed_chunk_range[1] >= seqno return ledger.get_latest_public_state() diff --git a/tests/lts_compatibility.py b/tests/lts_compatibility.py index ecf4959d4515..478e70fd7781 100644 --- a/tests/lts_compatibility.py +++ b/tests/lts_compatibility.py @@ -210,6 +210,7 @@ def run_code_upgrade_from( old_nodes = network.get_joined_nodes() primary, _ = network.find_primary() + from_major_version = primary.major_version LOG.info("Apply transactions to old service") issue_activity_on_live_service(network, args) @@ -324,13 +325,13 @@ def run_code_upgrade_from( node.stop() LOG.info("Service is now made of new nodes only") + primary, _ = network.find_nodes() # Rollover JWKS so that new primary must read historical CA bundle table # and retrieve new keys via auto refresh if not os.getenv("CONTAINER_NODES"): jwt_issuer.refresh_keys() # Note: /gov/jwt_keys/all endpoint was added in 2.x - primary, _ = network.find_nodes() if not primary.major_version or primary.major_version > 1: jwt_issuer.wait_for_refresh(network) else: @@ -354,7 +355,15 @@ def run_code_upgrade_from( ) # Check that the ledger can be parsed - network.get_latest_ledger_public_state() + # Note: When upgrading from 1.x to 2.x, it is possible that ledger chunk are not + # in sync between nodes, which may cause some chunks to differ when starting + # from a snapshot. See https://github.com/microsoft/ccf/issues/3613. In such case, + # we only verify that the ledger can be parsed, even if some chunks are duplicated. + # This can go once 2.0 is released. + insecure_ledger_verification = from_major_version == 1 and primary.version_after("ccf-2.0.0-rc7") + network.get_latest_ledger_public_state( + insecure=insecure_ledger_verification + ) @reqs.description("Run live compatibility with latest LTS") From e567c320f4c04d1c6c2ff2ba35e099763b626d51 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Wed, 27 Apr 2022 09:36:59 +0000 Subject: [PATCH 17/24] . --- python/ccf/ledger.py | 4 ++-- tests/infra/network.py | 4 ++-- tests/infra/node.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ccf/ledger.py b/python/ccf/ledger.py index a46d77ac8db3..5b0b699b10ea 100644 --- a/python/ccf/ledger.py +++ b/python/ccf/ledger.py @@ -959,9 +959,9 @@ def get_transaction(self, seqno: int) -> Transaction: transaction = None for chunk in self: _, chunk_end = chunk.get_seqnos() - if chunk_end and chunk_end < seqno: - continue for tx in chunk: + if chunk_end and chunk_end < seqno: + continue public_transaction = tx.get_public_domain() if public_transaction.get_seqno() == seqno: return tx diff --git a/tests/infra/network.py b/tests/infra/network.py index d7f79c9bb8a4..23e2943747cd 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -1256,10 +1256,10 @@ def _get_ledger_public_view_at(self, node, call, seqno, timeout, insecure=False) f"Could not read transaction at seqno {seqno} from ledger {node.remote.ledger_paths()} after {timeout}s" ) - def get_ledger_public_state_at(self, seqno, timeout=5): + def get_ledger_public_state_at(self, seqno, timeout=5, insecure=False): primary, _ = self.find_primary() return self._get_ledger_public_view_at( - primary, primary.get_ledger_public_tables_at, seqno, timeout + primary, primary.get_ledger_public_tables_at, seqno, timeout, insecure ) def get_latest_ledger_public_state(self, insecure=False, timeout=5): diff --git a/tests/infra/node.py b/tests/infra/node.py index 19a2d39df0c3..37f9093de13b 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -460,7 +460,7 @@ def get_ledger_public_tables_at(self, seqno, insecure=False): def get_ledger_public_state_at(self, seqno, insecure=False): validator = ccf.ledger.LedgerValidator() if not insecure else None - ledger = ccf.ledger.Ledger(self.remote.ledger_paths()) + ledger = ccf.ledger.Ledger(self.remote.ledger_paths(), validator=validator) assert ledger.last_committed_chunk_range[1] >= seqno return ledger.get_latest_public_state() From f0c090867a8a7d0180c71ac72b05c45fd5e57344 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Wed, 27 Apr 2022 15:54:38 +0000 Subject: [PATCH 18/24] Fix snapshot ledger rekey issue --- src/kv/encryptor.h | 9 ++- src/kv/generic_serialise_wrapper.h | 14 +++- src/kv/kv_types.h | 3 +- src/kv/snapshot.h | 14 +++- src/kv/test/null_encryptor.h | 3 +- src/node/ledger_secrets.h | 10 ++- src/node/node_state.h | 2 + src/node/test/snapshotter.cpp | 118 +++++++++++++++++++++++++++++ 8 files changed, 163 insertions(+), 10 deletions(-) diff --git a/src/kv/encryptor.h b/src/kv/encryptor.h index 70744a7a5cfa..dd38fe0ac915 100644 --- a/src/kv/encryptor.h +++ b/src/kv/encryptor.h @@ -63,6 +63,9 @@ namespace kv * corresponding with the plaintext * @param[in] entry_type Indicates the type of the entry to * avoid IV re-use + * @param[in] historical_hint If true, considers all ledger secrets for + * encryption. Otherwise, try to use the latest used secret (defaults to + * false) * * @return Boolean status indicating success of encryption. */ @@ -72,13 +75,15 @@ namespace kv std::vector& serialised_header, std::vector& cipher, const TxID& tx_id, - EntryType entry_type = EntryType::WriteSet) override + EntryType entry_type = EntryType::WriteSet, + bool historical_hint = false) override { S hdr; set_iv(hdr, tx_id, entry_type); - auto key = ledger_secrets->get_encryption_key_for(tx_id.version); + auto key = + ledger_secrets->get_encryption_key_for(tx_id.version, historical_hint); if (key == nullptr) { return false; diff --git a/src/kv/generic_serialise_wrapper.h b/src/kv/generic_serialise_wrapper.h index 0c2209867729..0c8901766d97 100644 --- a/src/kv/generic_serialise_wrapper.h +++ b/src/kv/generic_serialise_wrapper.h @@ -31,6 +31,9 @@ namespace kv // must only be set by set_current_domain, since it affects current_writer SecurityDomain current_domain; + // If true, consider historical ledger secrets when encrypting entries + bool historical_hint; + template void serialise_internal(const T& t) { @@ -63,11 +66,13 @@ namespace kv // The evidence and claims digest must be systematically present // in regular transactions, but absent in snapshots. const crypto::Sha256Hash& commit_evidence_digest_ = {}, - const ccf::ClaimsDigest& claims_digest_ = ccf::no_claims()) : + const ccf::ClaimsDigest& claims_digest_ = ccf::no_claims(), + bool historical_hint_ = false) : tx_id(tx_id_), entry_type(entry_type_), header_flags(header_flags_), - crypto_util(e) + crypto_util(e), + historical_hint(historical_hint_) { set_current_domain(SecurityDomain::PUBLIC); serialise_internal(entry_type); @@ -201,7 +206,8 @@ namespace kv serialised_hdr, encrypted_private_domain, tx_id, - entry_type)) + entry_type, + historical_hint)) { throw KvSerialiserException(fmt::format( "Could not serialise transaction at seqno {}", tx_id.version)); @@ -342,6 +348,8 @@ namespace kv public_reader.init(data_public, public_domain_length); read_public_header(); + LOG_FAIL_FMT("version: {}, entry_type: {}", version, entry_type); + // If the domain is public only, skip the decryption and only return the // public data (integrity will be verified at the next signature entry) if ( diff --git a/src/kv/kv_types.h b/src/kv/kv_types.h index 17cd7ed57cb1..acc6fde02e8f 100644 --- a/src/kv/kv_types.h +++ b/src/kv/kv_types.h @@ -573,7 +573,8 @@ namespace kv std::vector& serialised_header, std::vector& cipher, const TxID& tx_id, - EntryType entry_type = EntryType::WriteSet) = 0; + EntryType entry_type = EntryType::WriteSet, + bool historical_hint = false) = 0; virtual bool decrypt( const std::vector& cipher, const std::vector& additional_data, diff --git a/src/kv/snapshot.h b/src/kv/snapshot.h index cffb4fac3921..b170a6947624 100644 --- a/src/kv/snapshot.h +++ b/src/kv/snapshot.h @@ -44,10 +44,20 @@ namespace kv // Set the execution dependency for the snapshot to be the version // previous to said snapshot to ensure that the correct snapshot is // serialized. - // Note: Snapshots are always taken at compacted state so version only is + // Notes: + // - Snapshots are always taken at compacted state so version only is // unique enough to prevent IV reuse + // - Because snapshot generation and ledger rekey can be interleaved, + // consider historical ledger secrets when encrypting snapshot (see + // https://github.com/microsoft/CCF/issues/3796). KvStoreSerialiser serialiser( - encryptor, {0, version}, kv::EntryType::Snapshot, 0); + encryptor, + {0, version}, + kv::EntryType::Snapshot, + 0, + {}, + ccf::no_claims(), + true /* historical_hint */); if (hash_at_snapshot.has_value()) { diff --git a/src/kv/test/null_encryptor.h b/src/kv/test/null_encryptor.h index 5e8281e826c7..afffa0a31928 100644 --- a/src/kv/test/null_encryptor.h +++ b/src/kv/test/null_encryptor.h @@ -16,7 +16,8 @@ namespace kv std::vector& serialised_header, std::vector& cipher, const TxID& tx_id, - EntryType entry_type = EntryType::WriteSet) override + EntryType entry_type = EntryType::WriteSet, + bool historical_hint = false) override { cipher = plain; return true; diff --git a/src/node/ledger_secrets.h b/src/node/ledger_secrets.h index 4812a83e6782..af8ba3b924f0 100644 --- a/src/node/ledger_secrets.h +++ b/src/node/ledger_secrets.h @@ -37,6 +37,9 @@ namespace ccf LedgerSecretPtr get_secret_for_version( kv::Version version, bool historical_hint = false) { + LOG_FAIL_FMT( + "Get secret for version: {}, hint: {}", version, historical_hint); + if (ledger_secrets.empty()) { LOG_FAIL_FMT("Ledger secrets map is empty"); @@ -45,8 +48,9 @@ namespace ccf if (!historical_hint && last_used_secret_it.has_value()) { + LOG_FAIL_FMT("Here"); // Fast path for non-historical queries as both primary and backup nodes - // encryt/decrypt transactions in order, it is sufficient to keep an + // encrypt/decrypt transactions in order, it is sufficient to keep an // iterator on the last used secret to access ledger secrets in constant // time. auto& last_used_secret_it_ = last_used_secret_it.value(); @@ -55,9 +59,11 @@ namespace ccf version >= std::next(last_used_secret_it_)->first) { // Across a rekey, start using the next key + LOG_FAIL_FMT("There!"); ++last_used_secret_it_; } + LOG_FAIL_FMT("Returning secret at {}", last_used_secret_it_->first); return last_used_secret_it_->second; } @@ -87,6 +93,8 @@ namespace ccf last_used_secret_it = std::prev(search); } + LOG_FAIL_FMT("[SLOW] Returning secret at {}", std::prev(search)->first); + return std::prev(search)->second; } diff --git a/src/node/node_state.h b/src/node/node_state.h index 32e644e9477f..97254687b28c 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -514,6 +514,8 @@ namespace ccf auto j = serdes::unpack(data, serdes::Pack::Text); + LOG_FAIL_FMT("Join response: {}", j.dump()); + JoinNetworkNodeToNode::Out resp; try { diff --git a/src/node/test/snapshotter.cpp b/src/node/test/snapshotter.cpp index a506cf45a7a2..178b8b5cbea9 100644 --- a/src/node/test/snapshotter.cpp +++ b/src/node/test/snapshotter.cpp @@ -7,6 +7,7 @@ #include "ds/ring_buffer.h" #include "kv/test/null_encryptor.h" #include "kv/test/stub_consensus.h" +#include "node/encryptor.h" #include "node/history.h" #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN @@ -47,6 +48,35 @@ auto read_ringbuffer_out(ringbuffer::Circuit& circuit) return idx; } +auto read_snapshot_out(ringbuffer::Circuit& circuit) +{ + std::optional> snapshot = std::nullopt; + circuit.read_from_inside().read( + -1, [&snapshot](ringbuffer::Message m, const uint8_t* data, size_t size) { + switch (m) + { + case consensus::snapshot: + { + serialized::read(data, size); + serialized::read(data, size); + snapshot = std::vector(data, data + size); + break; + } + case consensus::snapshot_commit: + { + REQUIRE(false); + break; + } + default: + { + REQUIRE(false); + } + } + }); + + return snapshot; +} + void issue_transactions(ccf::NetworkState& network, size_t tx_count) { for (size_t i = 0; i < tx_count; i++) @@ -300,4 +330,92 @@ TEST_CASE("Rollback before snapshot is committed") threading::ThreadMessaging::thread_messaging.run_one(); } +} + +// https://github.com/microsoft/CCF/issues/3796 +TEST_CASE("Rekey ledger while snapshot is in progress") +{ + logger::config::default_init(); + + ccf::NetworkState network; + + auto consensus = std::make_shared(); + auto history = std::make_shared( + *network.tables.get(), kv::test::PrimaryNodeId, *kp); + network.tables->set_history(history); + network.tables->set_consensus(consensus); + auto ledger_secrets = std::make_shared(); + ledger_secrets->init(); + auto encryptor = std::make_shared(ledger_secrets); + network.tables->set_encryptor(encryptor); + + auto in_buffer = std::make_unique(buffer_size); + auto out_buffer = std::make_unique(buffer_size); + ringbuffer::Circuit eio(in_buffer->bd, out_buffer->bd); + std::unique_ptr writer_factory = + std::make_unique(eio); + + size_t snapshot_tx_interval = 10; + + issue_transactions(network, snapshot_tx_interval); + + auto snapshotter = std::make_shared( + *writer_factory, network.tables, snapshot_tx_interval); + + size_t snapshot_idx = snapshot_tx_interval + 1; + + INFO("Schedule snapshot"); + { + auto tx = network.tables->create_tx(); + auto sigs = tx.rw(ccf::Tables::SIGNATURES); + auto trees = + tx.rw(ccf::Tables::SERIALISED_MERKLE_TREE); + sigs->put({kv::test::PrimaryNodeId, 0, 0, 0, 0, {}, {}, {}, {}}); + auto tree = history->serialise_tree(1, snapshot_idx - 1); + trees->put(tree); + tx.commit(); + + REQUIRE(record_signature(history, snapshotter, snapshot_idx)); + LOG_FAIL_FMT("***************************"); + snapshotter->commit(snapshot_idx, true); + + // Do not schedule task just yet so that we can interleave ledger rekey + } + + INFO("Rekey ledger and commit new transactions"); + { + ledger_secrets->set_secret(snapshot_idx + 1, ccf::make_ledger_secret()); + + // Issue new transactions that make use of new ledger secret + issue_transactions(network, snapshot_tx_interval); + } + + INFO("Finally, schedule snapshot creation"); + { + threading::ThreadMessaging::thread_messaging.run_one(); + REQUIRE(read_latest_snapshot_evidence(network.tables) == snapshot_idx); + auto snapshot = read_snapshot_out(eio); + REQUIRE(snapshot.has_value()); + REQUIRE(!snapshot->empty()); + + // Snapshot can be deserialised to backup store + ccf::NetworkState backup_network; + auto backup_history = std::make_shared( + *backup_network.tables.get(), kv::test::FirstBackupNodeId, *kp); + backup_network.tables->set_history(backup_history); + auto tx = network.tables->create_read_only_tx(); + + auto backup_ledger_secrets = std::make_shared(); + backup_ledger_secrets->init_from_map(ledger_secrets->get(tx)); + auto backup_encryptor = + std::make_shared(backup_ledger_secrets); + backup_network.tables->set_encryptor(backup_encryptor); + + kv::ConsensusHookPtrs hooks; + std::vector view_history; + REQUIRE( + backup_network.tables->deserialise_snapshot( + snapshot->data(), snapshot->size(), hooks, &view_history) == + kv::ApplyResult::PASS); + } } \ No newline at end of file From 7a3d0e2c66f18147b17b41e3597d826ef9c6c4c4 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Wed, 27 Apr 2022 15:54:53 +0000 Subject: [PATCH 19/24] Recovery from snapshot by default --- tests/recovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/recovery.py b/tests/recovery.py index c39545c9ac0b..747f65f70d75 100644 --- a/tests/recovery.py +++ b/tests/recovery.py @@ -39,7 +39,7 @@ def get_and_verify_historical_receipt(network, ref_msg): @reqs.description("Recover a service") @reqs.recover(number_txs=2) -def test_recover_service(network, args, from_snapshot=False): +def test_recover_service(network, args, from_snapshot=True): network.save_service_identity(args) old_primary, _ = network.find_primary() From cfe785f0ba976bcac24f183d8e1c4317c1cf44ca Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Wed, 27 Apr 2022 15:54:58 +0000 Subject: [PATCH 20/24] Docs --- doc/operations/ledger_snapshot.rst | 4 +++- doc/operations/start_network.rst | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/operations/ledger_snapshot.rst b/doc/operations/ledger_snapshot.rst index a1b13233da6b..f3d961c225d9 100644 --- a/doc/operations/ledger_snapshot.rst +++ b/doc/operations/ledger_snapshot.rst @@ -70,7 +70,9 @@ Join/Recover From Snapshot Once a snapshot has been generated by the primary, operators can copy or mount the snapshot directory to the new node directory before it is started. On start-up, the new node will automatically resume from the latest committed snapshot file in the ``snapshots.directory`` directory. If no snapshot file is found, all historical transactions will be replicated to that node. -From 2.x releases (specifically, from `-dev5`), committed snapshot files embed the receipt of the evidence transaction. As such, nodes can join or recover a service from a standalone snapshot file. For 1.x releases, it is expected that operators also copy the ledger suffix containing the proof of commit of the evidence transaction to the node's ledger directory. +From 2.x releases, committed snapshot files embed the receipt of the evidence transaction. As such, nodes can join or recover a service from a standalone snapshot file. For 1.x releases, it is expected that operators also copy the ledger suffix containing the proof of commit of the evidence transaction to the node's ledger directory. + +It is important to note that new nodes cannot resume from a snapshot and join a service via a node that started from a more recent snapshot. For example, if a new node resumes from a snapshot generated at ``seqno 100`` and joins from a (primary) node that originally resumed from a snapshot at ``seqno 50``, the new node will throw a ``StartupSeqnoIsOld`` error shortly after starting up. It is expected that operators copy the *latest* committed snapshot file to new nodes before start up. .. note:: Snapshots emitted by 1.x nodes can be used by 2.x nodes to join or a recover a service. diff --git a/doc/operations/start_network.rst b/doc/operations/start_network.rst index ee05d3fbebf8..03f229c66d51 100644 --- a/doc/operations/start_network.rst +++ b/doc/operations/start_network.rst @@ -21,7 +21,7 @@ To create a new CCF network, the first node of the network should be started wit Uninitialized-- config -->Initialized; Initialized-- start -->PartOfNetwork; -The unique identifier of a CCF node is the hex-encoded string of the SHA-256 digest the public key contained in its identity certificate (e.g. ``50211327a77fc16dd2fba8fae5fffac3df909fceeb307cf804a4125ae2679007``). This unique identifier should be used by operators and members to refer to this node with CCF (for example, when :ref:`governance/common_member_operations:Trusting a New Node`). +The unique identifier of a CCF node is the hex-encoded string of the SHA-256 digest of the public key contained in its identity certificate (e.g. ``50211327a77fc16dd2fba8fae5fffac3df909fceeb307cf804a4125ae2679007``). This unique identifier should be used by operators and members to refer to this node with CCF (for example, when :ref:`governance/common_member_operations:Trusting a New Node`). CCF nodes can be started by using IP Addresses (both IPv4 and IPv6 are supported) or by specifying a fully qualified domain name. If an FQDN is used then a ``dNSName`` subject alternative name should be specified as part of the ``node_certificate.subject_alt_names`` configuration entry. Once a DNS has been setup it will be possible to connect to the node over TLS by using the node's domain name. From 2132e1748dc35cbd1876df9a597da40201c0868d Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Wed, 27 Apr 2022 16:12:23 +0000 Subject: [PATCH 21/24] fmt --- tests/lts_compatibility.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/lts_compatibility.py b/tests/lts_compatibility.py index 478e70fd7781..f6f8f1bef218 100644 --- a/tests/lts_compatibility.py +++ b/tests/lts_compatibility.py @@ -360,7 +360,9 @@ def run_code_upgrade_from( # from a snapshot. See https://github.com/microsoft/ccf/issues/3613. In such case, # we only verify that the ledger can be parsed, even if some chunks are duplicated. # This can go once 2.0 is released. - insecure_ledger_verification = from_major_version == 1 and primary.version_after("ccf-2.0.0-rc7") + insecure_ledger_verification = ( + from_major_version == 1 and primary.version_after("ccf-2.0.0-rc7") + ) network.get_latest_ledger_public_state( insecure=insecure_ledger_verification ) From 8db9b7663e9f662084334b06ce8cdd79d9bbe882 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Wed, 27 Apr 2022 16:15:19 +0000 Subject: [PATCH 22/24] Cleanup --- src/kv/generic_serialise_wrapper.h | 2 -- src/node/ledger_secrets.h | 8 -------- src/node/node_state.h | 2 -- 3 files changed, 12 deletions(-) diff --git a/src/kv/generic_serialise_wrapper.h b/src/kv/generic_serialise_wrapper.h index 0c8901766d97..8d3b88769c17 100644 --- a/src/kv/generic_serialise_wrapper.h +++ b/src/kv/generic_serialise_wrapper.h @@ -348,8 +348,6 @@ namespace kv public_reader.init(data_public, public_domain_length); read_public_header(); - LOG_FAIL_FMT("version: {}, entry_type: {}", version, entry_type); - // If the domain is public only, skip the decryption and only return the // public data (integrity will be verified at the next signature entry) if ( diff --git a/src/node/ledger_secrets.h b/src/node/ledger_secrets.h index af8ba3b924f0..b2a793afcfc0 100644 --- a/src/node/ledger_secrets.h +++ b/src/node/ledger_secrets.h @@ -37,9 +37,6 @@ namespace ccf LedgerSecretPtr get_secret_for_version( kv::Version version, bool historical_hint = false) { - LOG_FAIL_FMT( - "Get secret for version: {}, hint: {}", version, historical_hint); - if (ledger_secrets.empty()) { LOG_FAIL_FMT("Ledger secrets map is empty"); @@ -48,7 +45,6 @@ namespace ccf if (!historical_hint && last_used_secret_it.has_value()) { - LOG_FAIL_FMT("Here"); // Fast path for non-historical queries as both primary and backup nodes // encrypt/decrypt transactions in order, it is sufficient to keep an // iterator on the last used secret to access ledger secrets in constant @@ -59,11 +55,9 @@ namespace ccf version >= std::next(last_used_secret_it_)->first) { // Across a rekey, start using the next key - LOG_FAIL_FMT("There!"); ++last_used_secret_it_; } - LOG_FAIL_FMT("Returning secret at {}", last_used_secret_it_->first); return last_used_secret_it_->second; } @@ -93,8 +87,6 @@ namespace ccf last_used_secret_it = std::prev(search); } - LOG_FAIL_FMT("[SLOW] Returning secret at {}", std::prev(search)->first); - return std::prev(search)->second; } diff --git a/src/node/node_state.h b/src/node/node_state.h index 97254687b28c..32e644e9477f 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -514,8 +514,6 @@ namespace ccf auto j = serdes::unpack(data, serdes::Pack::Text); - LOG_FAIL_FMT("Join response: {}", j.dump()); - JoinNetworkNodeToNode::Out resp; try { From 21c6dbbb14a1101415147bc788b23b8a64ec58f4 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Thu, 28 Apr 2022 09:24:46 +0000 Subject: [PATCH 23/24] Be more resilient to errors during snapshots copy --- tests/infra/remote.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/infra/remote.py b/tests/infra/remote.py index 78ee11252042..d3ff2e615afe 100644 --- a/tests/infra/remote.py +++ b/tests/infra/remote.py @@ -935,11 +935,25 @@ def get_snapshots(self): return os.path.join(self.common_dir, self.snapshot_dir_name) def get_committed_snapshots(self, pre_condition_func=lambda src_dir, _: True): - self.remote.get( - self.snapshot_dir_name, - self.common_dir, - pre_condition_func=pre_condition_func, - ) + # It is possible that snapshots are committed while the copy is happening + # so retry a reasonable number of times. + max_retry_count = 5 + retry_count = 0 + while retry_count < max_retry_count: + try: + self.remote.get( + self.snapshot_dir_name, + self.common_dir, + pre_condition_func=pre_condition_func, + ) + break + except Exception as e: + LOG.warning( + f"Error copying committed snapshots from {self.snapshot_dir_name}: {e}. Retrying..." + ) + retry_count += 1 + time.sleep(0.1) + return os.path.join(self.common_dir, self.snapshot_dir_name) def log_path(self): From 645b5a1eb25c2e8644926be1dc099e0069927ec3 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Thu, 28 Apr 2022 14:16:11 +0000 Subject: [PATCH 24/24] fmt --- tests/infra/remote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/infra/remote.py b/tests/infra/remote.py index d3ff2e615afe..db675570ba8c 100644 --- a/tests/infra/remote.py +++ b/tests/infra/remote.py @@ -935,7 +935,7 @@ def get_snapshots(self): return os.path.join(self.common_dir, self.snapshot_dir_name) def get_committed_snapshots(self, pre_condition_func=lambda src_dir, _: True): - # It is possible that snapshots are committed while the copy is happening + # It is possible that snapshots are committed while the copy is happening # so retry a reasonable number of times. max_retry_count = 5 retry_count = 0