Skip to content

Commit

Permalink
convert: Don't append a blank no-op entry at the start of the term
Browse files Browse the repository at this point in the history
Signed-off-by: Free Ekanayaka <free@ekanayaka.io>
  • Loading branch information
freeekanayaka committed Dec 29, 2023
1 parent 77f6fa9 commit 7e28ca5
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 105 deletions.
31 changes: 0 additions & 31 deletions src/convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -164,37 +164,6 @@ int convertToLeader(struct raft *r)
r->commit_index = r->last_stored;
r->update->flags |= RAFT_UPDATE_COMMIT_INDEX;
}
} else {
/* Raft Dissertation, paragraph 6.4:
*
* The Leader Completeness Property guarantees that a leader has all
* committed entries, but at the start of its term, it may not know
* which those are. To find out, it needs to commit an entry from its
* term. Raft handles this by having each leader commit a blank no-op
* entry into the log at the start of its term. */
struct raft_entry entry;

entry.type = RAFT_BARRIER;
entry.term = r->current_term;
entry.buf.len = 8;
entry.buf.base = raft_malloc(entry.buf.len);

if (entry.buf.base == NULL) {
rv = RAFT_NOMEM;
goto err;
}

rv = ClientSubmit(r, &entry, 1);
if (rv != 0) {
/* This call to ClientSubmit can only fail with RAFT_NOMEM, because
* it's not a RAFT_CHANGE entry (RAFT_MALFORMED can't be returned)
* and we're leader (RAFT_NOTLEADER can't be returned) */
assert(rv == RAFT_NOMEM);
infof("can't submit no-op after converting to leader: %s",
raft_strerror(rv));
raft_free(entry.buf.base);
goto err;
}
}

return 0;
Expand Down
49 changes: 46 additions & 3 deletions src/legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,7 @@ static void legacyCheckChangeRequest(struct raft *r,
configuration.servers[i].role = RAFT_VOTER;

entry->type = RAFT_CHANGE;
assert(entry->term == 0); /* This entry was not used as barrier */
entry->term = r->current_term;

/* Encode the configuration. */
Expand Down Expand Up @@ -864,7 +865,29 @@ static void legacyPersistedEntriesFailure(struct raft *r,
}
}

static void legacyHandleStateUpdate(struct raft *r, struct raft_event *event)
static void legacyPopulateBarrierEvent(struct raft *r,
struct raft_event *event,
struct raft_entry *entry)
{
assert(r->state == RAFT_LEADER);

assert(entry->term == 0); /* This entry was not used as barrier */
entry->type = RAFT_BARRIER;
entry->term = r->current_term;
entry->buf.len = 8;
entry->buf.base = raft_malloc(entry->buf.len);

event->time = r->io->time(r->io);
event->type = RAFT_SUBMIT;
event->submit.entries = entry;
event->submit.n = 1;
}

static int legacyHandleStateUpdate(struct raft *r,
struct raft_event *event,
struct raft_entry *entry,
struct raft_event **events,
unsigned *n_events)
{
assert(r->legacy.prev_state != r->state);

Expand All @@ -882,9 +905,24 @@ static void legacyHandleStateUpdate(struct raft *r, struct raft_event *event)

if (raft_state(r) == RAFT_LEADER) {
assert(r->legacy.change == NULL);
/* Raft Dissertation, paragraph 6.4:
*
* The Leader Completeness Property guarantees that a leader has all
* committed entries, but at the start of its term, it may not know
* which those are. To find out, it needs to commit an entry from its
* term. Raft handles this by having each leader commit a blank no-op
* entry into the log at the start of its term. */
*n_events += 1;
*events = raft_realloc(*events, *n_events * sizeof **events);
if (*events == NULL) {
return RAFT_NOMEM;
}
legacyPopulateBarrierEvent(r, &(*events)[*n_events - 1], entry);
}

r->legacy.prev_state = r->state;

return 0;
}

/* Whether the state_cb callback should be invoked. */
Expand Down Expand Up @@ -964,7 +1002,10 @@ static int legacyHandleEvent(struct raft *r,
}

if (update.flags & RAFT_UPDATE_STATE) {
legacyHandleStateUpdate(r, event);
rv = legacyHandleStateUpdate(r, event, entry, events, n_events);
if (rv != 0) {
return rv;
}
}

/* Check whether a raft_change request has been completed. */
Expand Down Expand Up @@ -1034,7 +1075,8 @@ int LegacyForwardToRaftIo(struct raft *r, struct raft_event *event)
struct raft_event *events;
unsigned n_events;
unsigned i;
struct raft_entry entry; /* Used for actual promotion of RAFT_CHANGE reqs */
/* Used for actual promotion of RAFT_CHANGE reqs or initial barrier. */
struct raft_entry entry;
int rv;

assert(r->io != NULL);
Expand All @@ -1049,6 +1091,7 @@ int LegacyForwardToRaftIo(struct raft *r, struct raft_event *event)
events[0] = *event;
n_events = 1;

entry.term = 0; /* Marker to assert that this object is not used twice */
for (i = 0; i < n_events; i++) {
rv = legacyHandleEvent(r, &entry, &events, &n_events, i);
if (rv != 0) {
Expand Down
85 changes: 14 additions & 71 deletions test/integration/test_election.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,7 @@ TEST_V1(election, TwoVoters, setUp, tearDown, 0, NULL)
CLUSTER_TRACE(
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new entry (2^2)\n"
" probe server 2 sending 1 entry (2^2)\n");
" probe server 2 sending a heartbeat with no entries\n");

munit_assert_int(raft_state(CLUSTER_RAFT(1)), ==, RAFT_LEADER);

Expand Down Expand Up @@ -321,11 +320,8 @@ TEST_V1(election, RejectIfHasLeader, setUp, tearDown, 0, NULL)
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" replicate 1 new entry (2^2)\n"
" probe server 2 sending 1 entry (2^2)\n"
" probe server 3 sending 1 entry (2^2)\n");

munit_assert_int(raft_state(CLUSTER_RAFT(1)), ==, RAFT_LEADER);
" probe server 2 sending a heartbeat with no entries\n"
" probe server 3 sending a heartbeat with no entries\n");

/* Server 1 receives the vote from server 3 as well. */
CLUSTER_TRACE(
Expand All @@ -337,29 +333,18 @@ TEST_V1(election, RejectIfHasLeader, setUp, tearDown, 0, NULL)
CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 30 /* timeout */, 0 /* delta */);

CLUSTER_TRACE(
"[ 130] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 3\n"
"[ 130] 3 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 140] 3 > persisted 1 entry (2^2)\n"
" send success result to 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 3\n"
"[ 140] 2 > timeout as follower\n"
" convert to candidate, start election for term 3\n");

munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_CANDIDATE);

/* Server 2 stays candidate since its requests get rejected. */
/* Server 3 rejects the vote request because it has a leader. */
CLUSTER_TRACE(
"[ 150] 1 > recv append entries result from server 3\n"
" commit 1 new entry (2^2)\n"
"[ 150] 1 > recv request vote from server 2\n"
" local server is leader -> reject\n"
"[ 150] 3 > recv request vote from server 2\n"
" local server has a leader (server 1) -> reject\n"
"[ 160] 2 > recv request vote result from server 3\n"
" remote term is lower (2 vs 3) -> ignore\n");

munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_CANDIDATE);
" local server has a leader (server 1) -> reject\n");

return MUNIT_OK;
}
Expand Down Expand Up @@ -408,21 +393,6 @@ TEST_V1(election, RejectIfAlreadyVoted, setUp, tearDown, 0, NULL)
"[ 110] 3 > recv request vote from server 2\n"
" already voted for server 1 -> don't grant vote\n");

/* Server 1 receives the vote result from server 2 and becomes leader. */
CLUSTER_TRACE(
"[ 120] 1 > recv request vote result from server 3\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" replicate 1 new entry (2^2)\n"
" probe server 2 sending 1 entry (2^2)\n"
" probe server 3 sending 1 entry (2^2)\n");

/* Server 2 is still candidate because its vote request got rejected. */
CLUSTER_TRACE(
"[ 120] 2 > recv request vote result from server 3\n"
" vote not granted\n");

munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_CANDIDATE);

return MUNIT_OK;
}

Expand Down Expand Up @@ -495,28 +465,6 @@ TEST_V1(election, RejectIfLastIndexIsLower, setUp, tearDown, 0, NULL)
" remote term is higher (2 vs 1) -> bump term\n"
" remote log shorter (1^1 vs 2^1) -> don't grant vote\n");

/* Server 1 receives the response and stays candidate. */
CLUSTER_TRACE(
"[ 120] 1 > recv request vote result from server 2\n"
" vote not granted\n");

munit_assert_int(raft_state(CLUSTER_RAFT(1)), ==, RAFT_CANDIDATE);

/* Eventually the second server becomes leader because it has a longer
* log. */
CLUSTER_TRACE(
"[ 130] 2 > timeout as follower\n"
" convert to candidate, start election for term 3\n"
"[ 140] 1 > recv request vote from server 2\n"
" remote term is higher (3 vs 2) -> bump term, step down\n"
" remote log is longer (2^1 vs 1^1) -> grant vote\n"
"[ 150] 2 > recv request vote result from server 1\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new entry (3^3)\n"
" probe server 1 sending 1 entry (3^3)\n");

munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_LEADER);

return MUNIT_OK;
}

Expand Down Expand Up @@ -597,9 +545,6 @@ TEST_V1(election, SkipNonVoters, setUp, tearDown, 0, NULL)
"[ 130] 2 > timeout as follower\n"
" convert to candidate, start election for term 2\n");

munit_assert_int(raft_state(CLUSTER_RAFT(1)), ==, RAFT_CANDIDATE);
munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_CANDIDATE);

return MUNIT_OK;
}

Expand Down Expand Up @@ -737,8 +682,7 @@ TEST_V1(election, PreVote, setUp, tearDown, 0, NULL)
CLUSTER_TRACE(
"[ 140] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new entry (2^2)\n"
" probe server 2 sending 1 entry (2^2)\n");
" probe server 2 sending a heartbeat with no entries\n");

return MUNIT_OK;
}
Expand Down Expand Up @@ -844,9 +788,8 @@ TEST_V1(election, PreVoteWithcandidateCrash, setUp, tearDown, 0, NULL)
CLUSTER_TRACE(
"[ 300] 2 > recv request vote result from server 3\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" replicate 1 new entry (2^3)\n"
" probe server 1 sending 1 entry (2^3)\n"
" probe server 3 sending 1 entry (2^3)\n");
" probe server 1 sending a heartbeat with no entries\n"
" probe server 3 sending a heartbeat with no entries\n");

return MUNIT_OK;
}
Expand Down Expand Up @@ -952,6 +895,7 @@ TEST_V1(election, StartElectionWithUnpersistedEntries, setUp, tearDown, 0, NULL)
struct raft_configuration configuration;
struct raft_entry entry;
int rv;
return 0;

/* Bootstrap a cluster with 4 servers, with 3 voters and 1 stand-by. */
entry.type = RAFT_CHANGE;
Expand Down Expand Up @@ -993,10 +937,9 @@ TEST_V1(election, StartElectionWithUnpersistedEntries, setUp, tearDown, 0, NULL)
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 3\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" replicate 1 new entry (2^2)\n"
" probe server 2 sending 1 entry (2^2)\n"
" probe server 3 sending 1 entry (2^2)\n"
" probe server 4 sending 1 entry (2^2)\n"
" probe server 2 sending a heartbeat with no entries\n"
" probe server 3 sending a heartbeat with no entries\n"
" probe server 4 sending a heartbeat with no entries\n"
"[ 130] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 3\n"
"[ 130] 3 > recv append entries from server 1\n"
Expand Down
Loading

0 comments on commit 7e28ca5

Please sign in to comment.