replicaset: reconnect after fiber kill
Currently, if we kill the worker fiber of a connection that was
initialized with the 'reconnect_after' option, the connection goes into
the 'error_reconnect' or 'error' state (depending on the tarantool
version). Reconnecting doesn't happen in either case, and the only way
for the user to return the router to working order is a reload or
manual restoration of the connections.

This patch introduces reconnecting in that case. It should be used
wisely, though. Killing a fiber doesn't happen instantly: if the user
makes a request immediately, without waiting until the fiber's status
is 'dead', an exception will probably be thrown, as the fiber can die
in the middle of the request.
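
For illustration, a caller-side wait could look like the sketch below.
It is a hypothetical helper, not part of the patch, and it relies on
the internal conn._fiber field, which exists only in newer netbox:

    local fiber = require('fiber')

    -- Hypothetical helper: block until the connection's worker fiber
    -- is actually dead, so the next request hits the reconnect path
    -- instead of dying mid-flight.
    local function wait_worker_dead(conn, timeout)
        local deadline = fiber.clock() + timeout
        while conn._fiber ~= nil and conn._fiber:status() ~= 'dead' do
            if fiber.clock() > deadline then
                return false
            end
            fiber.sleep(0.01)
        end
        return true
    end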

However, reconnecting doesn't happen automatically on tarantool 2.10.0,
as there's no way to determine whether the fiber is dead other than
checking the connection's error message, which is bad practice: such a
check can easily be triggered by a false positive.
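
For illustration only, such an error-message check might look like the
fragile sketch below (hypothetical, not part of the patch; it assumes
conn.error holds the last error text and that a killed worker produces
the standard 'fiber is cancelled' message):

    -- Deliberately fragile: any unrelated error whose message contains
    -- 'fiber is cancelled' triggers a false positive, which is why the
    -- patch avoids this approach.
    local function worker_looks_killed(conn)
        return conn.error ~= nil and
               conn.error:find('fiber is cancelled', 1, true) ~= nil
    end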

Closes tarantool#341
Serpentian committed Aug 11, 2022
1 parent f8ab617 commit 4df147f
Showing 4 changed files with 83 additions and 1 deletion.
34 changes: 34 additions & 0 deletions test/router-luatest/router_test.lua
@@ -545,3 +545,37 @@ g.test_enable_disable = function(g)
    -- we don't want this server to interfere with subsequent tests
    g.router_1:drop()
end

g.test_explicit_fiber_kill = function(g)
    t.run_only_if(not (vutil.version_is_at_least(2, 10, 0, 'beta', 2, 0) and
                       vutil.version_is_at_most(2, 10, 0, nil, 0, 0)))

    local bids = vtest.cluster_exec_each_master(g, function()
        return _G.get_first_bucket()
    end)

    -- Kill the worker fibers of the connections to the masters.
    g.router:exec(function()
        for _, rs in pairs(ivshard.router.static.replicasets) do
            -- Explicitly kill the connection's worker fiber. The
            -- on_push callback is executed inside this fiber.
            pcall(function()
                rs.master.conn:eval([[
                    box.session.push(nil)
                ]], {}, {on_push = function()
                    ifiber.self():cancel()
                end})
            end)
        end
    end)

    -- Check that the replicasets are still accessible.
    g.router:exec(function(bids)
        for _, bid in ipairs(bids) do
            local res, err = ivshard.router.callrw(bid, 'echo', {1},
                                                   {timeout = iwait_timeout})
            ilt.assert_equals(res, 1)
            ilt.assert_equals(err, nil)
        end
    end, {bids})
end
41 changes: 40 additions & 1 deletion vshard/replicaset.lua
@@ -144,6 +144,45 @@ local function netbox_wait_connected(conn, timeout)
return timeout
end

--
-- Check if the connection is dead and won't be restored automatically.
-- Even though the replicaset's connections are initialized with the
-- 'reconnect_after' option, there are cases when no reconnect will be
-- done (e.g. if the connection was explicitly cancelled or its worker
-- fiber was killed). In these situations we need to reestablish the
-- connection manually, as a missing connection to a replicaset is a
-- critical problem which makes part of the data inaccessible via the
-- router.
--
-- Reconnecting after an explicit fiber kill doesn't happen on
-- tarantool 2.10.0, as there's no way to determine whether the fiber
-- is dead other than checking the connection's error message, which
-- is bad practice: such a check can easily be triggered by a false
-- positive.
--
-- Note: the function expects the conn to be non-nil.
--
local function netbox_is_conn_dead(conn)
    -- Fast path - conn is almost always 'active'.
    if conn.state == 'active' then
        return false
    end
    if conn.state == 'error' or conn.state == 'closed' then
        return true
    end
    if conn.state ~= 'error_reconnect' then
        -- The connection is fetching the schema or doing auth, which
        -- means it is not dead and shouldn't be reinitialized.
        return false
    end
    if not conn._fiber then
        -- The fiber field is not present in old netbox, which
        -- correctly reports the state as 'error' when its fiber is
        -- killed, so such connections are caught by the check above.
        return false
    end
    -- The connection won't reconnect on its own if its worker fiber
    -- is dead.
    return conn._fiber:status() == 'dead'
end

--
-- Check if the replica is not in backoff. It also serves as an update - if the
-- replica still has an old backoff timestamp, it is cleared. This way of
@@ -170,7 +209,7 @@ end
--
local function replicaset_connect_to_replica(replicaset, replica)
    local conn = replica.conn
-    if not conn or conn.state == 'closed' then
+    if not conn or netbox_is_conn_dead(conn) then
        conn = netbox.connect(replica.uri, {
            reconnect_after = consts.RECONNECT_TIMEOUT,
            wait_connected = false
4 changes: 4 additions & 0 deletions vshard/storage/init.lua
@@ -935,8 +935,12 @@ local function recovery_step_by_type(type)
            end
            goto continue
        end
        log.info('Sending recovery callrw')
        local remote_bucket, err =
            destination:callrw('vshard.storage.bucket_stat', {bucket_id})
        if remote_bucket then
            log.info(remote_bucket.status)
        end
        -- Check if it is not a bucket error, and this result can
        -- not be used to recover anything. Try later.
        if not remote_bucket and (not err or err.type ~= 'ShardingError' or
5 changes: 5 additions & 0 deletions vshard/util.lua
@@ -197,6 +197,10 @@ if not version_is_at_least(1, 10, 1) then
    error("VShard supports only Tarantool >= 1.10.1")
end

local function version_is_at_most(...)
    return tnt_version <= lversion.new(...)
end

--
-- Copy @a src table. Fiber yields every @a interval keys copied. Does not give
-- any guarantees on what is the result when the source table is changed during
Expand Down Expand Up @@ -353,6 +357,7 @@ return {
async_task = async_task,
internal = M,
version_is_at_least = version_is_at_least,
version_is_at_most = version_is_at_most,
table_copy_yield = table_copy_yield,
table_minus_yield = table_minus_yield,
table_extend = table_extend,
