From b71d9d4718710e5db6c161e94dd94c1c5c68f9d0 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 28 Nov 2024 11:05:49 +0100 Subject: [PATCH] database_dump: Fix `num_no_build` import issue The `num_no_build` column in the database is `NOT NULL`, but since we don't export it in the database dump it will show up as an empty value and thus fail to get imported correctly. This commit temporarily drops the `NOT NULL` constraint during the data import, backfills the data from the `num` column, and then adds the constraint again. --- .../src/dump-import.sql.j2 | 43 +++++++++++++++++++ ...e_dump__tests__sql_scripts@import.sql.snap | 43 +++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/crates/crates_io_database_dump/src/dump-import.sql.j2 b/crates/crates_io_database_dump/src/dump-import.sql.j2 index ad79cb69559..5c5cbc31e33 100644 --- a/crates/crates_io_database_dump/src/dump-import.sql.j2 +++ b/crates/crates_io_database_dump/src/dump-import.sql.j2 @@ -17,13 +17,56 @@ BEGIN; {%- endfor %} -- Enable this trigger so that `crates.textsearchable_index_col` can be excluded from the export + ALTER TABLE "crates" ENABLE TRIGGER "trigger_crates_tsvector_update"; + -- Drop the `NOT NULL` and uniqueness constraints on `num_no_build` column + -- while importing the data. + + alter table "versions" + alter column "num_no_build" drop not null; + + drop index versions_crate_id_num_no_build_uindex; + -- Import the CSV data. {% for table in tables %} \copy "{{table.name}}" ({{table.columns}}) FROM 'data/{{table.name}}.csv' WITH CSV HEADER {%- endfor %} + -- Backfill the `num_no_build` column with the correct values. + + update versions + set num_no_build = split_part(num, '+', 1); + + with duplicates as ( + -- find all versions that have the same `crate_id` and `num_no_build` + select crate_id, num_no_build, array_agg(num ORDER BY id) as nums + from versions + group by crate_id, num_no_build + having count(*) > 1 + ), + duplicates_to_update as ( + -- for each group of duplicates, update all versions except the one that + -- doesn't have "build metadata", or the first one that was published if + -- all versions have "build metadata" + select crate_id, num_no_build, unnest(case when array_position(nums, num_no_build) IS NOT NULL then array_remove(nums, num_no_build) else nums[2:] end) as num + from duplicates + ) + update versions + set num_no_build = duplicates_to_update.num + from duplicates_to_update + where versions.crate_id = duplicates_to_update.crate_id + and versions.num = duplicates_to_update.num; + + -- Set the `num_no_build` column to `NOT NULL` again after the backfill is + -- done and add the unique constraint. + + alter table "versions" + alter column "num_no_build" set not null; + + create unique index versions_crate_id_num_no_build_uindex + on versions (crate_id, num_no_build); + -- Drop the defaults again. {% for table in tables -%} {% for cd in table.column_defaults %} diff --git a/crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@import.sql.snap b/crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@import.sql.snap index 2fa830a2922..7c55606dd6e 100644 --- a/crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@import.sql.snap +++ b/crates/crates_io_database_dump/src/snapshots/crates_io_database_dump__tests__sql_scripts@import.sql.snap @@ -45,8 +45,17 @@ BEGIN; TRUNCATE "version_downloads" RESTART IDENTITY CASCADE; -- Enable this trigger so that `crates.textsearchable_index_col` can be excluded from the export + ALTER TABLE "crates" ENABLE TRIGGER "trigger_crates_tsvector_update"; + -- Drop the `NOT NULL` and uniqueness constraints on `num_no_build` column + -- while importing the data. + + alter table "versions" + alter column "num_no_build" drop not null; + + drop index versions_crate_id_num_no_build_uindex; + -- Import the CSV data. \copy "categories" ("category", "crates_cnt", "created_at", "description", "id", "path", "slug") FROM 'data/categories.csv' WITH CSV HEADER @@ -65,6 +74,40 @@ BEGIN; \copy "dependencies" ("crate_id", "default_features", "explicit_name", "features", "id", "kind", "optional", "req", "target", "version_id") FROM 'data/dependencies.csv' WITH CSV HEADER \copy "version_downloads" ("date", "downloads", "version_id") FROM 'data/version_downloads.csv' WITH CSV HEADER + -- Backfill the `num_no_build` column with the correct values. + + update versions + set num_no_build = split_part(num, '+', 1); + + with duplicates as ( + -- find all versions that have the same `crate_id` and `num_no_build` + select crate_id, num_no_build, array_agg(num ORDER BY id) as nums + from versions + group by crate_id, num_no_build + having count(*) > 1 + ), + duplicates_to_update as ( + -- for each group of duplicates, update all versions except the one that + -- doesn't have "build metadata", or the first one that was published if + -- all versions have "build metadata" + select crate_id, num_no_build, unnest(case when array_position(nums, num_no_build) IS NOT NULL then array_remove(nums, num_no_build) else nums[2:] end) as num + from duplicates + ) + update versions + set num_no_build = duplicates_to_update.num + from duplicates_to_update + where versions.crate_id = duplicates_to_update.crate_id + and versions.num = duplicates_to_update.num; + + -- Set the `num_no_build` column to `NOT NULL` again after the backfill is + -- done and add the unique constraint. + + alter table "versions" + alter column "num_no_build" set not null; + + create unique index versions_crate_id_num_no_build_uindex + on versions (crate_id, num_no_build); + -- Drop the defaults again. ALTER TABLE "users" ALTER COLUMN "gh_access_token" DROP DEFAULT;