diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index 182e4181..054e7f21 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -545,3 +545,37 @@ g.test_enable_disable = function(g) -- we don't want this server to interfere with subsequent tests g.router_1:drop() end + +g.test_explicit_fiber_kill = function(g) + t.run_only_if(not (vutil.version_is_at_least(2, 10, 0, 'beta', 2, 0) and + vutil.version_is_at_most(2, 10, 0, nil, 0, 0))) + + local bids = vtest.cluster_exec_each_master(g, function() + return _G.get_first_bucket() + end) + + -- Kill worker fibers of connections to masters + g.router:exec(function() + for _, rs in ipairs(ivshard.router.static.replicasets) do + -- Explicitly kill the connection's worker fiber. + -- Callback for pushes is executed inside this fiber. + pcall(function() + rs.master.conn:eval([[ + box.session.push(nil) + ]], {}, {on_push = function() + ifiber.self():cancel() + end}) + end) + end + end) + + -- Check that the replicaset is still accessible + g.router:exec(function(bids) + for _, bid in ipairs(bids) do + local res, err = ivshard.router.callrw(bid, 'echo', {1}, + {timeout = iwait_timeout}) + ilt.assert(res, 1) + ilt.assert(err, nil) + end + end, {bids}) +end diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua index 56cb12e6..d711a1bb 100644 --- a/vshard/replicaset.lua +++ b/vshard/replicaset.lua @@ -144,6 +144,45 @@ local function netbox_wait_connected(conn, timeout) return timeout end +-- +-- Check if the connection is dead and it won't be restored automatically. +-- Even though replicaset's connections are initialized with the option +-- 'reconnect_after', there's cases, when no reconnect will be done (e.g. +-- if the conn was explicitly cancelled or the connection's worker fiber +-- was killed). In these situations we need to reestablish it manually as +-- the missing connection to some replicaset is a critical problem which +-- leads to the inability of the user to access some part of a data via +-- router. +-- +-- Reconnecting after explicit fiber kill doesn't happen in tarantool 2.10.0 +-- as there's no way to determine if the fiber is dead other than checking +-- the error message of the connection, which is not a good practice as this +-- check can be easily trigerred false-positively. +-- +-- Note: the function expects the conn to be non-nil. +-- +local function netbox_is_conn_dead(conn) + -- Fast path - conn is almost always 'active'. + if conn.state == 'active' then + return false + end + if conn.state == 'error' or conn.state == 'closed' then + return true + end + if conn.state ~= 'error_reconnect' then + -- The connection is fetching schema or doing auth, which + -- means it is not dead and shoudn't be reinitialized. + return false + end + if not conn._fiber then + -- The fiber field is not present in old netbox, which correctly + -- reports state as 'error' when its fiber got killed. It would + -- be filtered out above then. + return false + end + return conn._fiber:status() ~= "dead" +end + -- -- Check if the replica is not in backoff. It also serves as an update - if the -- replica still has an old backoff timestamp, it is cleared. This way of @@ -170,7 +209,7 @@ end -- local function replicaset_connect_to_replica(replicaset, replica) local conn = replica.conn - if not conn or conn.state == 'closed' then + if not conn or netbox_is_conn_dead(conn) then conn = netbox.connect(replica.uri, { reconnect_after = consts.RECONNECT_TIMEOUT, wait_connected = false diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index 066d6365..433b8bb4 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -935,8 +935,12 @@ local function recovery_step_by_type(type) end goto continue end + log.info('Sending recovery callrw') local remote_bucket, err = destination:callrw('vshard.storage.bucket_stat', {bucket_id}) + if remote_bucket then + log.info(remote_bucket.status) + end -- Check if it is not a bucket error, and this result can -- not be used to recovery anything. Try later. if not remote_bucket and (not err or err.type ~= 'ShardingError' or diff --git a/vshard/util.lua b/vshard/util.lua index a7f911d0..70373398 100644 --- a/vshard/util.lua +++ b/vshard/util.lua @@ -197,6 +197,10 @@ if not version_is_at_least(1, 10, 1) then error("VShard supports only Tarantool >= 1.10.1") end +local function version_is_at_most(...) + return tnt_version <= lversion.new(...) +end + -- -- Copy @a src table. Fiber yields every @a interval keys copied. Does not give -- any guarantees on what is the result when the source table is changed during @@ -353,6 +357,7 @@ return { async_task = async_task, internal = M, version_is_at_least = version_is_at_least, + version_is_at_most = version_is_at_most, table_copy_yield = table_copy_yield, table_minus_yield = table_minus_yield, table_extend = table_extend,