fix 3 flaky tests in failure schedule (#6846)
Fixed 3 flaky tests in the failure schedule that were causing flakiness in other
tests because node and group sequence ids changed during node
addition and removal.
aykut-bozkurt authored Apr 13, 2023
1 parent 9ba7069 commit 3286ec5
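The fix follows one pattern throughout: after a test adds and removes nodes, restart the node and group id sequences so later tests in the schedule see deterministic ids again. A minimal sketch of that pattern, assembled from the statements in the diffs below (the RESTART value is illustrative and depends on which id the rest of the schedule expects next):

-- reset cluster to a deterministic state after add/remove-node churn
ALTER SEQUENCE pg_dist_node_nodeid_seq RESTART 2;
ALTER SEQUENCE pg_dist_groupid_seq RESTART 2;
-- the next added node now gets nodeid 2 regardless of earlier churn
SELECT master_add_node('localhost', :worker_2_proxy_port);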
Showing 6 changed files with 36 additions and 32 deletions.
4 changes: 3 additions & 1 deletion src/test/regress/expected/failure_add_disable_node.out
@@ -187,6 +187,8 @@ ORDER BY placementid;
(1 row)

-- reset cluster to original state
ALTER SEQUENCE pg_dist_node_nodeid_seq RESTART 2;
ALTER SEQUENCE pg_dist_groupid_seq RESTART 2;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
@@ -196,7 +198,7 @@ SELECT citus.mitmproxy('conn.allow()');
SELECT master_add_node('localhost', :worker_2_proxy_port);
master_add_node
---------------------------------------------------------------------
4
2
(1 row)

-- verify node is added
@@ -12,6 +12,8 @@ SET citus.shard_count TO 2;
SET citus.shard_replication_factor TO 1;
SET citus.max_adaptive_executor_pool_size TO 1;
SELECT pg_backend_pid() as pid \gset
ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 222222;
ALTER SEQUENCE pg_catalog.pg_dist_placement_placementid_seq RESTART 333333;
-- make sure coordinator is in the metadata
SELECT citus_set_coordinator_host('localhost', 57636);
citus_set_coordinator_host
@@ -189,8 +191,8 @@ SELECT create_distributed_table_concurrently('table_1', 'id');
SELECT * FROM pg_dist_shard WHERE logicalrelid = 'table_1'::regclass;
logicalrelid | shardid | shardstorage | shardminvalue | shardmaxvalue
---------------------------------------------------------------------
table_1 | 1880080 | t | -2147483648 | -1
table_1 | 1880081 | t | 0 | 2147483647
table_1 | 222247 | t | -2147483648 | -1
table_1 | 222248 | t | 0 | 2147483647
(2 rows)

DROP SCHEMA create_dist_tbl_con CASCADE;
@@ -201,3 +203,5 @@ SELECT citus_remove_node('localhost', 57636);

(1 row)

ALTER SEQUENCE pg_dist_node_nodeid_seq RESTART 3;
ALTER SEQUENCE pg_dist_groupid_seq RESTART 3;
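The expected-output file above pins shard and placement ids the same way: restarting pg_dist_shardid_seq and pg_dist_placement_placementid_seq makes the ids printed from pg_dist_shard stable across reruns. A sketch of that piece, using the statements and values from the diff (the expected shard ids 222247 and 222248 follow because earlier statements in the test consume some values from the restarted sequence):

-- pin shard and placement ids so the expected output stays stable
ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 222222;
ALTER SEQUENCE pg_catalog.pg_dist_placement_placementid_seq RESTART 333333;
-- with citus.shard_count = 2, the distributed table's two shards get
-- predictable ids near the restart value instead of whatever earlier
-- tests left behind in the sequence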
42 changes: 17 additions & 25 deletions src/test/regress/expected/failure_mx_metadata_sync_multi_trans.out
@@ -597,8 +597,8 @@ ERROR: connection not open
SELECT * FROM pg_dist_node ORDER BY nodeport;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
---------------------------------------------------------------------
4 | 4 | localhost | 9060 | default | f | t | primary | default | f | t
6 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
2 | 2 | localhost | 9060 | default | f | t | primary | default | f | t
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
(3 rows)

@@ -626,24 +626,14 @@ UPDATE dist1 SET id = :failed_node_val WHERE id = :failed_node_val;
-- Show that we can still delete from a shard at the node from coordinator
DELETE FROM dist1 WHERE id = :failed_node_val;
-- Show that DDL would still propagate to the node
SET client_min_messages TO NOTICE;
SET citus.log_remote_commands TO 1;
CREATE SCHEMA dummy;
NOTICE: issuing BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;SELECT assign_distributed_transaction_id(xx, xx, 'xxxxxxx');
NOTICE: issuing BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;SELECT assign_distributed_transaction_id(xx, xx, 'xxxxxxx');
NOTICE: issuing SET citus.enable_ddl_propagation TO 'off'
NOTICE: issuing CREATE SCHEMA dummy
NOTICE: issuing SET citus.enable_ddl_propagation TO 'on'
NOTICE: issuing SET citus.enable_ddl_propagation TO 'off'
NOTICE: issuing CREATE SCHEMA dummy
NOTICE: issuing SET citus.enable_ddl_propagation TO 'on'
NOTICE: issuing WITH distributed_object_data(typetext, objnames, objargs, distargumentindex, colocationid, force_delegation) AS (VALUES ('schema', ARRAY['dummy']::text[], ARRAY[]::text[], -1, 0, false)) SELECT citus_internal_add_object_metadata(typetext, objnames, objargs, distargumentindex::int, colocationid::int, force_delegation::bool) FROM distributed_object_data;
NOTICE: issuing PREPARE TRANSACTION 'citus_xx_xx_xx_xx'
NOTICE: issuing PREPARE TRANSACTION 'citus_xx_xx_xx_xx'
NOTICE: issuing COMMIT PREPARED 'citus_xx_xx_xx_xx'
NOTICE: issuing COMMIT PREPARED 'citus_xx_xx_xx_xx'
SET citus.log_remote_commands TO 0;
SET client_min_messages TO ERROR;
SELECT * FROM run_command_on_workers($$SELECT nspname FROM pg_namespace WHERE nspname = 'dummy'$$);
nodename | nodeport | success | result
---------------------------------------------------------------------
localhost | 9060 | t | dummy
localhost | 57637 | t | dummy
(2 rows)

-- Successfully activate the node after many failures
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
@@ -654,32 +644,32 @@ SELECT citus.mitmproxy('conn.allow()');
SELECT citus_activate_node('localhost', :worker_2_proxy_port);
citus_activate_node
---------------------------------------------------------------------
4
2
(1 row)

-- Activate the node once more to verify it works again with already synced metadata
SELECT citus_activate_node('localhost', :worker_2_proxy_port);
citus_activate_node
---------------------------------------------------------------------
4
2
(1 row)

-- Show node metadata info on worker2 and coordinator after success
\c - - - :worker_2_port
SELECT * FROM pg_dist_node ORDER BY nodeport;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
---------------------------------------------------------------------
4 | 4 | localhost | 9060 | default | t | t | primary | default | t | t
6 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
(3 rows)

\c - - - :master_port
SELECT * FROM pg_dist_node ORDER BY nodeport;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
---------------------------------------------------------------------
4 | 4 | localhost | 9060 | default | t | t | primary | default | t | t
6 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
(3 rows)

@@ -701,3 +691,5 @@ SELECT citus_remove_node('localhost', :master_port);

(1 row)

ALTER SEQUENCE pg_dist_node_nodeid_seq RESTART 3;
ALTER SEQUENCE pg_dist_groupid_seq RESTART 3;
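The metadata sync test above also drops the brittle check that matched citus.log_remote_commands NOTICE output (which embeds distributed transaction ids and depends on message ordering) in favor of a direct result check. A sketch of the check the diff keeps, taken from the test itself:

-- Show that DDL would still propagate to the node
CREATE SCHEMA dummy;
-- verify the schema exists on both workers without parsing NOTICE output
SELECT * FROM run_command_on_workers($$SELECT nspname FROM pg_namespace WHERE nspname = 'dummy'$$);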
2 changes: 2 additions & 0 deletions src/test/regress/sql/failure_add_disable_node.sql
@@ -97,6 +97,8 @@ WHERE s.logicalrelid = 'user_table'::regclass AND n.isactive
ORDER BY placementid;

-- reset cluster to original state
ALTER SEQUENCE pg_dist_node_nodeid_seq RESTART 2;
ALTER SEQUENCE pg_dist_groupid_seq RESTART 2;
SELECT citus.mitmproxy('conn.allow()');
SELECT master_add_node('localhost', :worker_2_proxy_port);

@@ -15,6 +15,9 @@ SET citus.shard_replication_factor TO 1;
SET citus.max_adaptive_executor_pool_size TO 1;
SELECT pg_backend_pid() as pid \gset

ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 222222;
ALTER SEQUENCE pg_catalog.pg_dist_placement_placementid_seq RESTART 333333;

-- make sure coordinator is in the metadata
SELECT citus_set_coordinator_host('localhost', 57636);

@@ -108,3 +111,5 @@ SELECT * FROM pg_dist_shard WHERE logicalrelid = 'table_1'::regclass;
DROP SCHEMA create_dist_tbl_con CASCADE;
SET search_path TO default;
SELECT citus_remove_node('localhost', 57636);
ALTER SEQUENCE pg_dist_node_nodeid_seq RESTART 3;
ALTER SEQUENCE pg_dist_groupid_seq RESTART 3;
7 changes: 3 additions & 4 deletions src/test/regress/sql/failure_mx_metadata_sync_multi_trans.sql
@@ -260,11 +260,8 @@ UPDATE dist1 SET id = :failed_node_val WHERE id = :failed_node_val;
DELETE FROM dist1 WHERE id = :failed_node_val;

-- Show that DDL would still propagate to the node
SET client_min_messages TO NOTICE;
SET citus.log_remote_commands TO 1;
CREATE SCHEMA dummy;
SET citus.log_remote_commands TO 0;
SET client_min_messages TO ERROR;
SELECT * FROM run_command_on_workers($$SELECT nspname FROM pg_namespace WHERE nspname = 'dummy'$$);

-- Successfully activate the node after many failures
SELECT citus.mitmproxy('conn.allow()');
@@ -285,3 +282,5 @@ SELECT citus.mitmproxy('conn.allow()');
DROP ROLE foo1;
DROP ROLE foo2;
SELECT citus_remove_node('localhost', :master_port);
ALTER SEQUENCE pg_dist_node_nodeid_seq RESTART 3;
ALTER SEQUENCE pg_dist_groupid_seq RESTART 3;
