Skip to content

Commit

Permalink
replicaset: reconnect after fiber kill
Browse files Browse the repository at this point in the history
Currently if we kill net.box's fibers the connection goes into
'error_reconnect' state. However, it's not reconnecting anymore.

This patch introduces reconnecting in that case. It should be used
wisely, though. Killing a fiber doesn't happen instantly, and if the
user doesn't wait until the fiber's status is 'dead' and makes a request
immediately, an exception will probably be thrown, as the fiber can die
in the middle of the request.

So, after killing the fiber, wait until it's really dead and make a
request only after that.

Closes tarantool#341
  • Loading branch information
Serpentian committed Jul 14, 2022
1 parent 96b6d27 commit d484e09
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 1 deletion.
35 changes: 35 additions & 0 deletions test/router-luatest/router_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -545,3 +545,38 @@ g.test_enable_disable = function(g)
-- we don't want this server to interfere with subsequent tests
g.router_1:drop()
end

--
-- gh-341: explicit kill of net.box's fibers.
--
-- Scenario:
--   1. make a read-write 'echo' call through the router and check it works;
--   2. kill the fiber stored in the `_fiber` field of the router's
--      connection to the replica and wait until the fiber is dead;
--   3. repeat the call and check it works again.
--
g.test_explicit_fiber_kill = function(g)
    t.run_only_if(vutil.version_is_at_least(2, 10, 1))

    local rs_uuid = g.replica_1_a:replicaset_uuid()
    local replica_uuid = g.replica_1_a:instance_uuid()
    local bid = vtest.storage_first_bucket(g.replica_1_a)

    -- Send 'echo' with the argument 1 via callrw on the router and check
    -- that the call returned 1 and no error.
    local function assert_echo_works()
        local res, err = g.router:exec(function(bucket_id)
            return ivshard.router.callrw(bucket_id, 'echo', {1},
                                         {timeout = iwait_timeout})
        end, {bid})
        t.assert_equals(err, nil, 'no error')
        t.assert_equals(res, 1, 'good result')
    end

    assert_echo_works()

    -- Kill fiber and wait until it's dead. Without waiting the fiber can die
    -- during request, which will cause an exception to be thrown.
    g.router:exec(function(uuid, instance_uuid)
        local rs = ivshard.router.static.replicasets[uuid]
        local f = rs.replicas[instance_uuid].conn._fiber
        assert(f ~= nil)

        ifiber.kill(f:id())
        -- Bound the wait: if the fiber never reaches the 'dead' status, the
        -- test has to fail with a clear message instead of hanging forever.
        local deadline = ifiber.clock() + iwait_timeout
        while f:status() ~= 'dead' do
            assert(ifiber.clock() < deadline,
                   'killed net.box fiber is still not dead')
            ifiber.yield()
        end
    end, {rs_uuid, replica_uuid})

    assert_echo_works()
end
4 changes: 3 additions & 1 deletion vshard/replicaset.lua
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,9 @@ end
--
local function replicaset_connect_to_replica(replicaset, replica)
local conn = replica.conn
if not conn or conn.state == 'closed' then
-- gh-341: additionally check if connection's fiber was explicitly killed
if not conn or conn.state == 'closed' or (conn.state == 'error_reconnect'
and conn._fiber and conn._fiber:status() == 'dead') then
conn = netbox.connect(replica.uri, {
reconnect_after = consts.RECONNECT_TIMEOUT,
wait_connected = false
Expand Down

0 comments on commit d484e09

Please sign in to comment.