Skip to content

Commit

Permalink
Merge pull request #22993 from bharathv/241x_sync_race
Browse files Browse the repository at this point in the history
[backport] [v24.1.x] rm_stm: fix race during leadership transfer
  • Loading branch information
mmaslankaprv authored Aug 22, 2024
2 parents 78d2169 + 12df2f9 commit d7a656e
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 24 deletions.
41 changes: 22 additions & 19 deletions src/v/cluster/rm_stm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -974,19 +974,12 @@ rm_stm::do_mark_expired(model::producer_identity pid) {
co_return std::error_code(co_await do_try_abort_old_tx(pid));
}

ss::future<result<kafka_result>> rm_stm::do_sync_and_transactional_replicate(
ss::future<result<kafka_result>> rm_stm::transactional_replicate(
model::term_id synced_term,
producer_ptr producer,
model::batch_identity bid,
model::record_batch_reader rdr,
ssx::semaphore_units units) {
if (!co_await sync(_sync_timeout)) {
vlog(
_ctx_log.trace,
"processing name:replicate_tx pid:{} => stale leader",
bid.pid);
co_return errc::not_leader;
}
auto synced_term = _insync_term;
auto result = co_await do_transactional_replicate(
synced_term, producer, bid, std::move(rdr));
if (!result) {
Expand Down Expand Up @@ -1105,31 +1098,34 @@ ss::future<result<kafka_result>> rm_stm::transactional_replicate(
if (!check_tx_permitted()) {
co_return errc::generic_tx_error;
}
if (!co_await sync(_sync_timeout)) {
vlog(
_ctx_log.trace,
"processing name:replicate_tx pid:{} => stale leader",
bid.pid);
co_return errc::not_leader;
}
auto synced_term = _insync_term;
auto [producer, _] = maybe_create_producer(bid.pid);
co_return co_await producer
->run_with_lock([&](ssx::semaphore_units units) {
return do_sync_and_transactional_replicate(
producer, bid, std::move(rdr), std::move(units));
return transactional_replicate(
synced_term, producer, bid, std::move(rdr), std::move(units));
})
.finally([this, producer] {
_producer_state_manager.local().touch(*producer, _vcluster_id);
});
}

ss::future<result<kafka_result>> rm_stm::do_sync_and_idempotent_replicate(
ss::future<result<kafka_result>> rm_stm::idempotent_replicate(
model::term_id synced_term,
producer_ptr producer,
model::batch_identity bid,
model::record_batch_reader br,
raft::replicate_options opts,
ss::lw_shared_ptr<available_promise<>> enqueued,
ssx::semaphore_units units,
producer_previously_known known_producer) {
if (!co_await sync(_sync_timeout)) {
// it's ok not to set enqueued on early return because
// the safety check in replicate_in_stages sets it automatically
co_return errc::not_leader;
}
auto synced_term = _insync_term;
auto result = co_await do_idempotent_replicate(
synced_term,
producer,
Expand Down Expand Up @@ -1256,10 +1252,17 @@ ss::future<result<kafka_result>> rm_stm::idempotent_replicate(
raft::replicate_options opts,
ss::lw_shared_ptr<available_promise<>> enqueued) {
try {
if (!co_await sync(_sync_timeout)) {
// it's ok not to set enqueued on early return because
// the safety check in replicate_in_stages sets it automatically
co_return errc::not_leader;
}
auto synced_term = _insync_term;
auto [producer, known_producer] = maybe_create_producer(bid.pid);
co_return co_await producer
->run_with_lock([&, known_producer](ssx::semaphore_units units) {
return do_sync_and_idempotent_replicate(
return idempotent_replicate(
synced_term,
producer,
bid,
std::move(br),
Expand Down
12 changes: 7 additions & 5 deletions src/v/cluster/rm_stm.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,8 @@ class rm_stm final : public raft::persisted_stm<> {
ss::future<result<kafka_result>> transactional_replicate(
model::batch_identity, model::record_batch_reader);

ss::future<result<kafka_result>> do_sync_and_transactional_replicate(
ss::future<result<kafka_result>> transactional_replicate(
model::term_id,
producer_ptr,
model::batch_identity,
model::record_batch_reader,
Expand All @@ -331,23 +332,24 @@ class rm_stm final : public raft::persisted_stm<> {
raft::replicate_options,
ss::lw_shared_ptr<available_promise<>>);

ss::future<result<kafka_result>> do_idempotent_replicate(
ss::future<result<kafka_result>> idempotent_replicate(
model::term_id,
producer_ptr,
model::batch_identity,
model::record_batch_reader,
raft::replicate_options,
ss::lw_shared_ptr<available_promise<>>,
ssx::semaphore_units&,
ssx::semaphore_units,
producer_previously_known);

ss::future<result<kafka_result>> do_sync_and_idempotent_replicate(
ss::future<result<kafka_result>> do_idempotent_replicate(
model::term_id,
producer_ptr,
model::batch_identity,
model::record_batch_reader,
raft::replicate_options,
ss::lw_shared_ptr<available_promise<>>,
ssx::semaphore_units,
ssx::semaphore_units&,
producer_previously_known);

ss::future<result<kafka_result>> replicate_msg(
Expand Down

0 comments on commit d7a656e

Please sign in to comment.