Skip to content

Commit

Permalink
replicaset: reconnect after fiber kill
Browse files Browse the repository at this point in the history
Currently if we kill the worker fiber of the connection, which was
initialized with 'reconnect_after' option, this connection goes into
'error_reconnect' or 'error' state (depends on tarantool version).
Reconnecting doesn't happen in both cases and the only way for user
to return router to working order is reloading or manual restoring of
the connections.

This patch introduces reconnecting in that case. It should be used
wisely, though. Fiber's killing doesn't happen instantly, and if the
user doesn't wait until the fiber's status is 'dead' and makes the
request immediately, an exception will probably be thrown, as the fiber
can die in the middle of the request.

So, after fiber kill wait until it's really dead and make a request
only after that.

Closes tarantool#341
  • Loading branch information
Serpentian committed Jul 29, 2022
1 parent ce4c0a0 commit 8630569
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 1 deletion.
38 changes: 38 additions & 0 deletions test/router-luatest/router_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -545,3 +545,41 @@ g.test_enable_disable = function(g)
-- we don't want this server to interfere with subsequent tests
g.router_1:drop()
end

g.test_explicit_fiber_kill = function(g)
    -- Collect the replicaset UUID and one bucket id for every master.
    local master_replicas = {g.replica_1_a, g.replica_2_a}
    local uuids, bucket_ids = {}, {}
    for i = 1, #master_replicas do
        local master = master_replicas[i]
        uuids[i] = master:replicaset_uuid()
        bucket_ids[i] = vtest.storage_first_bucket(master)
    end

    -- Cancel the worker fiber of each master connection. The on_push
    -- callback is executed inside the worker fiber, so cancelling the
    -- current fiber from there kills the worker.
    g.router:exec(function(uuids)
        for _, uuid in ipairs(uuids) do
            local master_conn =
                ivshard.router.static.replicasets[uuid].master.conn
            -- The eval fails once its fiber dies - swallow that error.
            pcall(function()
                master_conn:eval([[
                    box.session.push(nil)
                ]], {}, {on_push = function()
                    ifiber.self():cancel()
                end})
            end)
        end
    end, {uuids})

    -- The connections must have been reestablished - requests through
    -- the router still succeed for both replicasets.
    local results = g.router:exec(function(bucket_ids)
        local results = {}
        for i, bucket_id in ipairs(bucket_ids) do
            results[i] = ivshard.router.callrw(bucket_id, 'echo', {1},
                                               {timeout = iwait_timeout})
        end
        return results
    end, {bucket_ids})
    t.assert(results[1] == 1 and results[2] == 1)
end
71 changes: 70 additions & 1 deletion vshard/replicaset.lua
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,70 @@ local function netbox_wait_connected(conn, timeout)
return timeout
end

local netbox_is_conn_dead

if not (util.version_is_at_least(2, 10, 0, 'beta', 2, 0) and
        util.version_is_at_most(2, 10, 0, nil, 0, 0)) then

--
-- Check if the connection is dead and it won't be restored automatically.
-- Even though replicaset's connections are initialized with the option
-- 'reconnect_after', there's cases, when no reconnect will be done (e.g.
-- if the conn was explicitly cancelled or the connection's worker fiber
-- was killed). In these situations we need to reestablish it manually as
-- the missing connection to some replicaset is a critical problem which
-- leads to the inability of the user to access some part of a data via
-- router.
--
-- Note: the function expects the conn to be non-nil.
--
netbox_is_conn_dead = function(conn)
    assert(conn)
    if conn.state == 'error' or conn.state == 'closed' then
        return true
    end
    -- The 'error_reconnect' state cannot be filtered out right away:
    -- on some tarantool versions no reconnect is actually done after
    -- the worker fiber got killed, even though the state claims the
    -- connection is going to reconnect.
    if conn.state ~= 'error_reconnect' then
        -- The connection is fetching schema or doing auth, which
        -- means it is not dead and shouldn't be reinitialized.
        return false
    end
    if not conn._fiber then
        -- The fiber field is not present in old netbox, which correctly
        -- reports state as 'error' when its fiber got killed. It would
        -- be filtered out above then.
        return false
    end
    -- A dead worker fiber means the reconnect will never happen - the
    -- connection has to be recreated manually.
    return conn._fiber:status() == 'dead'
end

else -- 2.10.0

--
-- Tarantool 2.10.0 has a bug: after worker fiber kill connection's state is
-- 'error_reconnect', but it doesn't reconnect anymore. The only way we can
-- catch this situation is error string comparison as the connection doesn't
-- have _fiber in itself.
--
netbox_is_conn_dead = function(conn)
    assert(conn)
    if conn.state == 'error' or conn.state == 'closed' then
        return true
    end

    if conn.state ~= 'error_reconnect' then
        -- The connection is fetching schema or doing auth, which
        -- means it is not dead and shouldn't be reinitialized.
        return false
    end

    -- 'fiber is cancelled' is the error netbox reports after its worker
    -- fiber was killed - no reconnect will follow despite the state.
    return conn.error == 'fiber is cancelled'
end

end

--
-- Check if the replica is not in backoff. It also serves as an update - if the
-- replica still has an old backoff timestamp, it is cleared. This way of
Expand All @@ -170,7 +234,12 @@ end
--
local function replicaset_connect_to_replica(replicaset, replica)
local conn = replica.conn
if not conn or conn.state == 'closed' then
-- Fast path - conn is almost always 'active'.
if conn and conn.state == 'active' then
return conn
end

if not conn or netbox_is_conn_dead(conn) then
conn = netbox.connect(replica.uri, {
reconnect_after = consts.RECONNECT_TIMEOUT,
wait_connected = false
Expand Down
5 changes: 5 additions & 0 deletions vshard/util.lua
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ if not version_is_at_least(1, 10, 1) then
error("VShard supports only Tarantool >= 1.10.1")
end

--
-- Check that the running tarantool version does not exceed the given one.
-- Accepts the same arguments as lversion.new().
--
local function version_is_at_most(...)
    local upper_bound = lversion.new(...)
    return tnt_version <= upper_bound
end

--
-- Copy @a src table. Fiber yields every @a interval keys copied. Does not give
-- any guarantees on what is the result when the source table is changed during
Expand Down Expand Up @@ -352,6 +356,7 @@ return {
async_task = async_task,
internal = M,
version_is_at_least = version_is_at_least,
version_is_at_most = version_is_at_most,
table_copy_yield = table_copy_yield,
table_minus_yield = table_minus_yield,
table_extend = table_extend,
Expand Down

0 comments on commit 8630569

Please sign in to comment.