Skip to content

Commit

Permalink
replicaset: reconnect after fiber kill
Browse files Browse the repository at this point in the history
Currently if we kill net.box's fibers the connection goes into
'error_reconnect' state. However, it's not reconnecting anymore.

This patch introduces reconnecting in that case. It should be used
wisely, though. Killing a fiber doesn't happen instantly: if the
user doesn't wait until the fiber's status is 'dead' and makes a request
immediately, an exception will probably be thrown, as the fiber can die
in the middle of the request.

So, after fiber kill wait until it's really dead and make a request
only after that.

Closes tarantool#341
  • Loading branch information
Serpentian committed Jul 15, 2022
1 parent 96b6d27 commit 83b034f
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 1 deletion.
1 change: 1 addition & 0 deletions test/instances/router.lua
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ local helpers = require('test.luatest_helpers')
_G.ifiber = require('fiber')
_G.imsgpack = require('msgpack')
_G.ivtest = require('test.luatest_helpers.vtest')
_G.ivutil = require('vshard.util')
_G.iwait_timeout = _G.ivtest.wait_timeout

-- Do not load entire vshard into the global namespace to catch errors when code
Expand Down
41 changes: 41 additions & 0 deletions test/router-luatest/router_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -545,3 +545,44 @@ g.test_enable_disable = function(g)
-- we don't want this server to interfere with subsequent tests
g.router_1:drop()
end

g.test_explicit_fiber_kill = function(g)
    --
    -- Kill the router's net.box fibers and wait until the master connections
    -- report the kill. Without waiting the fiber can die during a request,
    -- which will cause an exception to be thrown.
    --
    local rs_uuids = {g.replica_1_a:replicaset_uuid(),
                      g.replica_2_a:replicaset_uuid()}

    g.router:exec(function(uuids)
        for id, f in pairs(ifiber.info()) do
            if f.name:endswith('(net.box)') then
                ifiber.kill(id)
            end
        end

        -- Capture the connection objects right away: later reconnects may
        -- replace replica.conn, but these references keep pointing at the
        -- connections whose fibers were just killed.
        local replicasets = ivshard.router.static.replicasets
        local conn_1 = replicasets[uuids[1]].master.conn
        local conn_2 = replicasets[uuids[2]].master.conn
        assert(conn_1 and conn_2)

        -- Killing a fiber does not take effect instantly, so poll until BOTH
        -- connections are reported as killed. The wait is bounded so that the
        -- test fails with a clear message instead of hanging forever.
        local deadline = ifiber.clock() + iwait_timeout
        while not (ivutil.conn_fiber_killed(conn_1) and
                   ivutil.conn_fiber_killed(conn_2)) do
            if ifiber.clock() > deadline then
                error('net.box fibers were not reported as killed in time')
            end
            ifiber.sleep(0.01)
        end
    end, {rs_uuids})

    -- Check that all replicasets are accessible again: the first request to
    -- each of them is expected to restore the connection.
    local bids = {vtest.storage_first_bucket(g.replica_1_a),
                  vtest.storage_first_bucket(g.replica_2_a)}

    for _, bid in ipairs(bids) do
        local res, err = g.router:exec(function(bucket_id)
            return ivshard.router.callrw(bucket_id, 'echo', {1},
                                         {timeout = iwait_timeout})
        end, {bid})

        t.assert_equals(err, nil, 'no error')
        t.assert_equals(res, 1, 'good result')
    end
end
3 changes: 2 additions & 1 deletion vshard/replicaset.lua
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,8 @@ end
--
local function replicaset_connect_to_replica(replicaset, replica)
local conn = replica.conn
if not conn or conn.state == 'closed' then
-- gh-341: additionally check if connection's fiber was explicitly killed
if not conn or conn.state == 'closed' or util.conn_fiber_killed(conn) then
conn = netbox.connect(replica.uri, {
reconnect_after = consts.RECONNECT_TIMEOUT,
wait_connected = false
Expand Down
10 changes: 10 additions & 0 deletions vshard/util.lua
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,15 @@ local function fiber_is_self_canceled()
return not pcall(fiber.testcancel)
end

--
-- Check if connection's fiber was explicitly killed.
--
-- The connection is treated as killed when its state is 'error' or
-- 'error_reconnect' and the saved error is the 'fiber is cancelled' message,
-- stored either as a plain string or in the `message` field of an error
-- object.
--
-- @param conn Connection object with `state` and `error` fields.
-- @return true if the connection failed due to fiber cancellation,
--         false otherwise.
--
local function conn_fiber_killed(conn)
    local state = conn.state
    if state ~= 'error_reconnect' and state ~= 'error' then
        return false
    end
    local err = conn.error
    if err == nil then
        -- Error state without a saved error: there is nothing to compare,
        -- and indexing nil below would raise.
        return false
    end
    local msg = 'fiber is cancelled'
    return err == msg or err.message == msg
end

--
-- Get min tuple from the index with the given key.
--
Expand Down Expand Up @@ -357,6 +366,7 @@ return {
table_extend = table_extend,
fiber_cond_wait = fiber_cond_wait,
fiber_is_self_canceled = fiber_is_self_canceled,
conn_fiber_killed = conn_fiber_killed,
index_min = index_min,
index_has = index_has,
feature = feature,
Expand Down

0 comments on commit 83b034f

Please sign in to comment.