Skip to content

Commit

Permalink
replicaset: introduce name validation
Browse files Browse the repository at this point in the history
In case of name identification, no UUID may be passed at all, so we
cannot verify only UUID, when connecting to storage. It seems impossible
to extend the current net.box greeting by exposing net_box.conn.name to
it, as iproto greeting doesn't have enough free space to save 64 bit
instance name. So, we should deal with name validation on vshard side.

For this, conn.vconnect is introduced. It's asynchronous vshard
greeting, saved inside netbox connection. It stores future object and
additional info, needed for its work. Future is initialized, when the
connection is established (inside netbox_on_connect). The connection
cannot be considered "connected" until vconnect is properly validated.

Currently only instance_name is validated inside conn.vconnect.

Closes #426

NO_DOC=internal
  • Loading branch information
Serpentian authored and Gerold103 committed Dec 19, 2023
1 parent 9d52e1e commit 6364056
Show file tree
Hide file tree
Showing 7 changed files with 381 additions and 9 deletions.
12 changes: 10 additions & 2 deletions test/replicaset-luatest/replicaset_3_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ local t = require('luatest')
local vreplicaset = require('vshard.replicaset')
local vtest = require('test.luatest_helpers.vtest')
local verror = require('vshard.error')
local vutil = require('vshard.util')

local small_timeout_opts = {timeout = 0.05}
local timeout_opts = {timeout = vtest.wait_timeout}
Expand Down Expand Up @@ -247,6 +248,7 @@ test_group.test_locate_master_when_no_conn_object = function(g)
end

test_group.test_named_replicaset = function(g)
t.run_only_if(vutil.feature.persistent_names)
local new_cfg_template = table.deepcopy(cfg_template)
new_cfg_template.identification_mode = 'name_as_key'
new_cfg_template.sharding['replicaset'] = new_cfg_template.sharding[1]
Expand All @@ -267,9 +269,15 @@ test_group.test_named_replicaset = function(g)
t.assert_equals(rs.id, rs.name)
t.assert_equals(replica_1_a.id, replica_1_a.name)

-- Just to be sure, that it works.
-- Name is not set, name mismatch error.
local ret, err = rs:callrw('get_uuid', {}, {timeout = 5})
t.assert_equals(err.name, 'INSTANCE_NAME_MISMATCH')
t.assert_equals(ret, nil)

-- Set name, everything works from now on.
g.replica_1_a:exec(function() box.cfg{instance_name = 'replica_1_a'} end)
local uuid_a = g.replica_1_a:instance_uuid()
local ret, err = rs:callrw('get_uuid', {}, timeout_opts)
ret, err = rs:callrw('get_uuid', {}, timeout_opts)
t.assert_equals(err, nil)
t.assert_equals(ret, uuid_a)

Expand Down
189 changes: 189 additions & 0 deletions test/replicaset-luatest/vconnect_test.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
local fiber = require('fiber')
local t = require('luatest')
local vreplicaset = require('vshard.replicaset')
local vtest = require('test.luatest_helpers.vtest')
local vutil = require('vshard.util')
local verror = require('vshard.error')

local small_timeout_opts = {timeout = 0.01}
local timeout_opts = {timeout = vtest.wait_timeout}

local test_group = t.group('vconnect')

local cfg_template = {
sharding = {
replicaset = {
replicas = {
replica = {
master = true,
},
},
},
},
bucket_count = 20,
test_user_grant_range = 'super',
replication_timeout = 0.1,
identification_mode = 'name_as_key',
}
local global_cfg

test_group.before_all(function(g)
t.run_only_if(vutil.feature.persistent_names)
global_cfg = vtest.config_new(cfg_template)
vtest.cluster_new(g, global_cfg)
vtest.cluster_bootstrap(g, global_cfg)
vtest.cluster_wait_vclock_all(g)
end)

test_group.after_all(function(g)
g.cluster:stop()
end)

--
-- Test, that conn_vconnect_wait fails to get correct
-- result. Connection should be closed.
--
test_group.test_vconnect_no_result = function(g)
local _, rs = next(vreplicaset.buildall(global_cfg))
g.replica:exec(function()
rawset(_G, '_call', ivshard.storage._call)
ivshard.storage._call = nil
end)

-- Drop connection in order to make replicaset to recreate it.
rs.master.conn = nil
local ret, err = rs:callrw('get_uuid', {}, timeout_opts)
t.assert_str_contains(err.message, "_call' is not defined")
t.assert_equals(ret, nil)
-- Critical error, connection should be closed.
t.assert_equals(rs.master.conn.state, 'closed')

g.replica:exec(function()
ivshard.storage._call = _G._call
end)
end

--
-- Test, that conn_vconnect_wait fails, when future is nil.
--
test_group.test_vconnect_no_future = function(g)
local _, rs = next(vreplicaset.buildall(global_cfg))
g.replica:exec(function()
rawset(_G, '_call', ivshard.storage._call)
rawset(_G, 'do_sleep', true)
-- Future should not appear at all.
ivshard.storage._call = function()
while _G.do_sleep do
ifiber.sleep(0.1)
end
end
end)

rs.master.conn = nil
local ret, err = rs:callrw('get_uuid', {}, small_timeout_opts)
t.assert(verror.is_timeout(err))
t.assert_equals(ret, nil)
t.assert_not_equals(rs.master.conn.state, 'closed')

g.replica:exec(function()
_G.do_sleep = false
ivshard.storage._call = _G._call
end)
end

--
-- Test, that conn_vconnect_check fails, when future's result is nil.
--
test_group.test_vconnect_check_no_future = function(g)
local _, rs = next(vreplicaset.buildall(global_cfg))
g.replica:exec(function()
rawset(_G, '_call', ivshard.storage._call)
ivshard.storage._call = nil
end)

rs.master.conn = nil
local opts = table.deepcopy(timeout_opts)
opts.is_async = true
t.helpers.retrying({}, function()
-- It may be VHANDSHAKE_NOT_COMPLETE error, when future
-- is not ready. But at the end it must be the actual error.
local ret, err = rs:callrw('get_uuid', {}, opts)
t.assert_str_contains(err.message, "_call' is not defined")
t.assert_equals(ret, nil)
t.assert_equals(rs.master.conn.state, 'closed')
end)

g.replica:exec(function()
ivshard.storage._call = _G._call
end)
end

--
-- 1. Change name and stop replica.
-- 2. Wait for error_reconnect timeout.
-- 3. Assert, that on reconnect name change is noticed.
--
test_group.test_vconnect_on_reconnect = function(g)
local _, rs = next(vreplicaset.buildall(global_cfg))
t.assert_not_equals(rs:connect_master(), nil)
-- Configuration to use after restart.
local new_cfg = table.deepcopy(global_cfg)
local cfg_rs = new_cfg.sharding.replicaset
cfg_rs.replicas.bad = cfg_rs.replicas.replica
cfg_rs.replicas.replica = nil

g.replica:exec(function()
box.cfg{instance_name = 'bad', force_recovery = true}
end)
g.replica:stop()
t.helpers.retrying({}, function()
t.assert_equals(rs.master.conn.state, 'error_reconnect')
end)

-- Replica cannot be started with incorrect name, change box.cfg.
g.replica.box_cfg.instance_name = 'bad'
g.replica:start()
vtest.cluster_cfg(g, new_cfg)
local ret, err = rs:callrw('get_uuid', {}, timeout_opts)
t.assert_equals(err.name, 'INSTANCE_NAME_MISMATCH')
t.assert_equals(ret, nil)
t.assert_equals(rs.master.conn.state, 'closed')

g.replica:exec(function()
box.cfg{instance_name = 'replica', force_recovery = true}
end)
vtest.cluster_cfg(g, global_cfg)
end

--
-- Test, that async call doesn't yield and immediately fails.
--
test_group.test_async_no_yield = function(g)
local _, rs = next(vreplicaset.buildall(global_cfg))
g.replica:exec(function()
rawset(_G, '_call', ivshard.storage._call)
rawset(_G, 'do_sleep', true)
-- Future should not appear at all.
ivshard.storage._call = function()
while _G.do_sleep do
ifiber.sleep(0.1)
end
end
end)

local opts = table.deepcopy(timeout_opts)
opts.is_async = true
local csw1 = fiber.self():csw()
local ret, err = rs:callrw('get_uuid', {}, opts)
local csw2 = fiber.self():csw()
-- Waiting for #456 to be fixed.
t.assert_equals(csw2, csw1 + 1)
t.assert_str_contains(err.name, 'VHANDSHAKE_NOT_COMPLETE')
t.assert_equals(ret, nil)
t.assert_not_equals(rs.master.conn.state, 'closed')

g.replica:exec(function()
_G.do_sleep = false
ivshard.storage._call = _G._call
end)
end
8 changes: 4 additions & 4 deletions test/storage/storage.result
Original file line number Diff line number Diff line change
Expand Up @@ -1014,16 +1014,16 @@ vshard.storage.internal.errinj.ERRINJ_RECOVERY_PAUSE = false
--
-- Internal info function.
--
vshard.storage._call('info')
vshard.storage._call('info').is_master
---
- is_master: true
- true
...
_ = test_run:switch('storage_1_b')
---
...
vshard.storage._call('info')
vshard.storage._call('info').is_master
---
- is_master: false
- false
...
--
-- gh-123, gh-298: storage auto-enable/disable depending on instance state.
Expand Down
4 changes: 2 additions & 2 deletions test/storage/storage.test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -327,9 +327,9 @@ vshard.storage.internal.errinj.ERRINJ_RECOVERY_PAUSE = false
--
-- Internal info function.
--
vshard.storage._call('info')
vshard.storage._call('info').is_master
_ = test_run:switch('storage_1_b')
vshard.storage._call('info')
vshard.storage._call('info').is_master

--
-- gh-123, gh-298: storage auto-enable/disable depending on instance state.
Expand Down
10 changes: 10 additions & 0 deletions vshard/error.lua
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,16 @@ local error_message_template = {
msg = 'Bucket %s update is invalid: %s',
args = {'bucket_id', 'reason'},
},
[40] = {
name = 'VHANDSHAKE_NOT_COMPLETE',
msg = 'Handshake with %s have not been completed yet',
args = {'replica'},
},
[41] = {
name = 'INSTANCE_NAME_MISMATCH',
msg = 'Mismatch server name: expected "%s", but got "%s"',
args = {'expected_name', 'actual_name'},
},
}

--
Expand Down
Loading

0 comments on commit 6364056

Please sign in to comment.