Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cluster: Parse unknown self-test checks #22831

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions src/v/cluster/self_test_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "base/seastarx.h"
#include "base/vlog.h"
#include "cluster/logger.h"
#include "json/document.h"
#include "ssx/future-util.h"

#include <seastar/core/abort_source.hh>
Expand Down Expand Up @@ -51,6 +52,25 @@ ss::future<> self_test_backend::stop() {
co_await std::move(f);
}

void self_test_backend::parse_unknown_checks(
std::vector<cloudcheck_opts>& ctos,
std::vector<unknown_check>& unknown_checks) {
for (auto it = unknown_checks.begin(); it != unknown_checks.end();) {
if (it->test_type == "cloud") {
json::Document doc;
if (doc.Parse(it->test_json.c_str()).HasParseError()) {
++it;
continue;
}
const auto& obj = doc.GetObject();
ctos.push_back(cluster::cloudcheck_opts::from_json(obj));
it = unknown_checks.erase(it);
WillemKauf marked this conversation as resolved.
Show resolved Hide resolved
} else {
++it;
}
}
}

ss::future<std::vector<self_test_result>> self_test_backend::do_start_test(
std::vector<diskcheck_opts> dtos,
std::vector<netcheck_opts> ntos,
Expand All @@ -59,6 +79,8 @@ ss::future<std::vector<self_test_result>> self_test_backend::do_start_test(
auto gate_holder = _gate.hold();
std::vector<self_test_result> results;

parse_unknown_checks(ctos, unknown_checks);

_stage = self_test_stage::disk;
for (auto& dto : dtos) {
try {
Expand Down Expand Up @@ -185,8 +207,8 @@ get_status_response self_test_backend::start_test(start_test_request req) {
_prev_run = get_status_response{
.id = id,
.status = self_test_status::idle,
.stage = _stage,
.results = std::move(results)};
.results = std::move(results),
.stage = _stage};
});
}).finally([units = std::move(units)] {});
} else {
Expand Down
11 changes: 11 additions & 0 deletions src/v/cluster/self_test_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,17 @@ class self_test_backend {
ss::future<netcheck_response> netcheck(model::node_id, iobuf&&);

private:
// In the case that the controller node is of a redpanda version lower than
// the current node, some self-test checks may have been pushed back into
// the "unknown_checks" vector in the request. We will attempt to parse the
// test json on this node instead, if we recognize it. As of versions >=
// v24.2.x, this will only affect the cloudcheck options. This function
// modifies the in/out parameters: [ctos, unknown_checks]. This function
// will need amending in the event that a new self-test is added.
void parse_unknown_checks(
std::vector<cloudcheck_opts>& ctos,
std::vector<unknown_check>& unknown_checks);

ss::future<std::vector<self_test_result>> do_start_test(
std::vector<diskcheck_opts> dtos,
std::vector<netcheck_opts> ntos,
Expand Down
3 changes: 2 additions & 1 deletion src/v/cluster/self_test_frontend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,9 @@ ss::future<uuid_t> self_test_frontend::start_test(
.id = test_id,
.dtos = std::move(req.dtos),
.ntos = std::move(new_ntos),
.unknown_checks = std::move(req.unknown_checks),
.ctos = std::move(req.ctos),
.unknown_checks = std::move(req.unknown_checks)});
});
});
co_return test_id;
}
Expand Down
12 changes: 8 additions & 4 deletions src/v/cluster/self_test_rpc_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,15 +341,19 @@ struct empty_request
struct start_test_request
: serde::envelope<
start_test_request,
serde::version<1>,
serde::version<2>,
serde::compat_version<0>> {
using rpc_adl_exempt = std::true_type;

uuid_t id;
std::vector<diskcheck_opts> dtos;
std::vector<netcheck_opts> ntos;
std::vector<cloudcheck_opts> ctos;
std::vector<unknown_check> unknown_checks;
std::vector<cloudcheck_opts> ctos;

auto serde_fields() {
return std::tie(id, dtos, ntos, unknown_checks, ctos);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillemKauf i'm confused about what is going on here, and i can't figure out the answer from the commit message.

looking at the git history i see when version was 0, the order of serde fields in the struct was d,n,c. then it looks like it was bumped to 1 when the unknown checks were appended, which seems correct.

but then here, the commit appears to claim that a bug is being fixed (i'm not sure what the bug was), and also, the ordering of the fields was changed and that's not allowed even when bumping the version.


friend std::ostream&
operator<<(std::ostream& o, const start_test_request& r) {
Expand All @@ -374,14 +378,14 @@ struct start_test_request
struct get_status_response
: serde::envelope<
get_status_response,
serde::version<0>,
serde::version<1>,
serde::compat_version<0>> {
using rpc_adl_exempt = std::true_type;

uuid_t id{};
self_test_status status{};
self_test_stage stage{};
std::vector<self_test_result> results;
self_test_stage stage{};
Comment on lines -377 to +387
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this one seems more problematic because the ordering prior to this commit was both wrong, but also probably shipped in a release?


friend std::ostream&
operator<<(std::ostream& o, const get_status_response& r) {
Expand Down
103 changes: 92 additions & 11 deletions tests/rptest/tests/self_test_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@

import re
import time
from collections import defaultdict
from rptest.services.cluster import cluster
from rptest.tests.end_to_end import EndToEndTest
from rptest.tests.redpanda_test import RedpandaTest
from rptest.clients.rpk import RpkTool
from rptest.services.admin import Admin
from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST, SISettings
from rptest.services.redpanda_installer import RedpandaVersionLine
from rptest.services.redpanda_installer import InstallOptions
from ducktape.utils.util import wait_until
from ducktape.mark import matrix
from rptest.utils.functional import flat_map
Expand Down Expand Up @@ -221,9 +224,9 @@ def test_self_test_unknown_test_type(self):
cloud_storage_enable_remote_read=True,
cloud_storage_enable_remote_write=True))

#Attempt to run with an unknown test type "pandatest"
#and possibly unknown "cloud" test.
#The rest of the tests should proceed as normal.
# Attempt to run with an unknown test type "pandatest"
# and possibly unknown "cloud" test.
# The rest of the tests should proceed as normal.
request_json = {
'tests': [{
'type': 'pandatest'
Expand All @@ -236,32 +239,32 @@ def test_self_test_unknown_test_type(self):
}]
}

#Manually invoke self test admin endpoint.
# Manually invoke self test admin endpoint.
self.redpanda._admin._request('POST',
'debug/self_test/start',
json=request_json)

#Populate list of unknown reports.
# Populate list of unknown reports.
unknown_report_types = ['pandatest']
redpanda_versions = [
self.redpanda.get_version_int_tuple(node)
for node in self.redpanda.nodes
]

#All nodes should have the same version of
#Redpanda running.
# All nodes should have the same version of
# Redpanda running.
assert len(set(redpanda_versions)) == 1

#Cloudcheck was introduced in 24.2.1.
#Expect that it will be unknown to nodes running
#earlier versions of redpanda.
# Cloudcheck was introduced in 24.2.1.
# Expect that it will be unknown to nodes running
# earlier versions of redpanda.
if redpanda_versions[0] < (24, 2, 1):
unknown_report_types.append('cloud')

# Wait for self test completion.
node_reports = self.wait_for_self_test_completion()

#Assert reports are passing, with the exception of unknown tests.
# Assert reports are passing, with the exception of unknown tests.
reports = flat_map(lambda node: node['results'], node_reports)
assert len(reports) > 0
for report in reports:
Expand All @@ -270,3 +273,81 @@ def test_self_test_unknown_test_type(self):
else:
assert 'error' not in report
assert 'warning' not in report

@cluster(num_nodes=3)
def test_self_test_mixed_node_controller_lower_version(self):
"""Assert the self test still runs when the controller node
is of a lower version than the rest of the nodes in the cluster.
The upgraded follower nodes should be able to parse the "unknown"
checks (currently just the cloudcheck), and then run and return
their results to the controller node."""
num_nodes = 3

install_opts = InstallOptions(version=RedpandaVersionLine((24, 1)),
num_to_upgrade=2)
self.start_redpanda(
num_nodes=num_nodes,
si_settings=SISettings(test_context=self.test_context),
install_opts=install_opts)

# Attempt to run with a possibly unknown "cloud" test.
# The controller, which is of a lower version than the other nodes in the cluster,
# doesn't recognize "cloud" as a test, but the other nodes should.
request_json = {
'tests': [{
'type': 'cloud',
'backoff_ms': 100,
'timeout_ms': 5000
}]
}

redpanda_versions = {
i: self.redpanda.get_version_int_tuple(node)
for (i, node) in enumerate(self.redpanda.nodes)
}

controller_node_index = min(redpanda_versions,
key=redpanda_versions.get)
controller_node_id = controller_node_index + 1
# Make sure that the lowest version node is the controller.
self.redpanda._admin.partition_transfer_leadership(
'redpanda', 'controller', 0, controller_node_id)
wait_until(lambda: self.redpanda._admin.get_partition_leader(
namespace="redpanda", topic="controller", partition=0) ==
controller_node_id,
timeout_sec=10,
backoff_sec=1,
err_msg="Leadership did not stabilize")

# Manually invoke self test admin endpoint, using the lowest version node as the target.
self.redpanda._admin._request(
'POST',
'debug/self_test/start',
json=request_json,
node=self.redpanda.nodes[controller_node_index])

# Wait for self test completion.
node_reports = self.wait_for_self_test_completion()

unknown_checks_map = defaultdict(set)
for node, version in redpanda_versions.items():
node_id = node + 1
# Cloudcheck was introduced in 24.2.1.
# Expect that it will be unknown to nodes running
# earlier versions of redpanda.
if version < (24, 2, 1):
unknown_checks_map[node_id].add('cloud')

# Assert reports are passing, with the exception of unknown tests.
assert len(node_reports) > 0
for report in node_reports:
node = report['node_id']
results = report['results']
# Results shouldn't be empty, even for unknown checks.
assert len(results) > 0
for result in results:
if result['test_type'] in unknown_checks_map[node]:
assert 'error' in result
else:
assert 'error' not in result
assert 'warning' not in result
Loading