Skip to content

Commit

Permalink
Fix flakyness in citus_split_shard_by_split_points_deferred_drop
Browse files Browse the repository at this point in the history
In CI we would sometimes get this failure:
```diff
 -- The original shard is marked for deferred drop with policy_type = 2.
 -- The previous shard should be dropped at the beginning of the second split call
 SELECT * from pg_dist_cleanup;
  record_id | operation_id | object_type |                               object_name                                | node_group_id | policy_type
 -----------+--------------+-------------+--------------------------------------------------------------------------+---------------+-------------
+        60 |          778 |           3 | citus_shard_split_slot_18_21216_778                                      |            16 |           0
        512 |          778 |           1 | citus_split_shard_by_split_points_deferred_schema.table_to_split_8981001 |            16 |           2
-(1 row)
+(2 rows)
```

Replication slots sometimes cannot be deleted right away. Which is hard
to resolve, but luckily we can filter these cleanup records out easily
by filtering by policy_type.

While debugging this issue I learnt that we did not use
GetNextCleanupRecordId in all places where we created cleanup
records. This caused test failures when running tests multiple times,
when they set citus.next_cleanup_record_id. So this PR fixes that too.
  • Loading branch information
JelteF committed Apr 3, 2023
1 parent 697bb55 commit 612b763
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 12 deletions.
9 changes: 3 additions & 6 deletions src/backend/distributed/operations/shard_cleaner.c
Original file line number Diff line number Diff line change
Expand Up @@ -514,19 +514,16 @@ InsertCleanupRecordInSubtransaction(CleanupObject objectType,
*/
Assert(CurrentOperationId != INVALID_OPERATION_ID);

StringInfo sequenceName = makeStringInfo();
appendStringInfo(sequenceName, "%s.%s",
PG_CATALOG,
CLEANUPRECORDID_SEQUENCE_NAME);
uint64 recordId = GetNextCleanupRecordId();

StringInfo command = makeStringInfo();
appendStringInfo(command,
"INSERT INTO %s.%s "
" (record_id, operation_id, object_type, object_name, node_group_id, policy_type) "
" VALUES ( nextval('%s'), " UINT64_FORMAT ", %d, %s, %d, %d)",
" VALUES ( %lu, " UINT64_FORMAT ", %d, %s, %d, %d)",
PG_CATALOG,
PG_DIST_CLEANUP,
sequenceName->data,
recordId,
CurrentOperationId,
objectType,
quote_literal_cstr(objectName),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ SELECT pg_catalog.citus_split_shard_by_split_points(

-- The original shard is marked for deferred drop with policy_type = 2.
-- The previous shard should be dropped at the beginning of the second split call
SELECT * from pg_dist_cleanup;
SELECT * FROM pg_dist_cleanup WHERE policy_type = 2;
record_id | operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
512 | 778 | 1 | citus_split_shard_by_split_points_deferred_schema.table_to_split_8981001 | 16 | 2
526 | 778 | 1 | citus_split_shard_by_split_points_deferred_schema.table_to_split_8981001 | 16 | 2
(1 row)

-- One of the physical shards should not be deleted, the other one should.
Expand All @@ -90,8 +90,12 @@ SELECT relname FROM pg_class where relname LIKE '%table_to_split_%' AND relkind

-- Perform deferred drop cleanup.
\c - postgres - :master_port
CALL citus_cleanup_orphaned_resources();
NOTICE: cleaned up 1 orphaned resources
SELECT public.wait_for_resource_cleanup();
wait_for_resource_cleanup
---------------------------------------------------------------------

(1 row)

-- Clenaup has been done.
SELECT * from pg_dist_cleanup;
record_id | operation_id | object_type | object_name | node_group_id | policy_type
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ SELECT pg_catalog.citus_split_shard_by_split_points(

-- The original shard is marked for deferred drop with policy_type = 2.
-- The previous shard should be dropped at the beginning of the second split call
SELECT * from pg_dist_cleanup;
SELECT * FROM pg_dist_cleanup WHERE policy_type = 2;

-- One of the physical shards should not be deleted, the other one should.
\c - - - :worker_1_port
Expand All @@ -62,7 +62,7 @@ SELECT relname FROM pg_class where relname LIKE '%table_to_split_%' AND relkind

-- Perform deferred drop cleanup.
\c - postgres - :master_port
CALL citus_cleanup_orphaned_resources();
SELECT public.wait_for_resource_cleanup();

-- Clenaup has been done.
SELECT * from pg_dist_cleanup;
Expand Down

0 comments on commit 612b763

Please sign in to comment.