Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add unique indexes for containers tables #15308

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
class CleanUpDuplicatesInContainersTables < ActiveRecord::Migration[5.0]
class ContainerBuild < ActiveRecord::Base; end
class ContainerBuildPod < ActiveRecord::Base; end
class ContainerGroup < ActiveRecord::Base
self.inheritance_column = :_type_disabled
end
class ContainerLimit < ActiveRecord::Base; end
class ContainerNode < ActiveRecord::Base
self.inheritance_column = :_type_disabled
end
class ContainerProject < ActiveRecord::Base; end
class ContainerQuota < ActiveRecord::Base; end
class ContainerReplicator < ActiveRecord::Base; end
class ContainerRoute < ActiveRecord::Base; end
class ContainerService < ActiveRecord::Base; end
class ContainerTemplate < ActiveRecord::Base; end
class PersistentVolumeClaim < ActiveRecord::Base; end

class ContainerComponentStatus < ActiveRecord::Base; end
class ContainerImage < ActiveRecord::Base; end
class ContainerImageRegistry < ActiveRecord::Base; end

class ContainerCondition < ActiveRecord::Base; end
class SecurityContext < ActiveRecord::Base; end
class ContainerEnvVar < ActiveRecord::Base; end
class ContainerLimitItem < ActiveRecord::Base; end
class ContainerPortConfig < ActiveRecord::Base; end
class ContainerQuotaItem < ActiveRecord::Base; end
class ContainerServicePortConfig < ActiveRecord::Base; end
class ContainerTemplateParameter < ActiveRecord::Base; end
class ContainerVolume < ActiveRecord::Base; end
class CustomAttribute < ActiveRecord::Base; end

class ContainerDefinition < ActiveRecord::Base; end
class Container < ActiveRecord::Base
self.inheritance_column = :_type_disabled
end

def duplicate_data_query_returning_batches(model, unique_index_columns)
model.group(unique_index_columns)
.select("#{unique_index_columns.join(", ")}, min(id) AS first_duplicate, array_agg(id) AS duplicate_ids")
.having("COUNT(id) > 1")
end

def cleanup_duplicate_data_batch(model, unique_index_columns)
duplicate_data_query_returning_batches(model, unique_index_columns).each do |duplicate|
# TODO(lsmola) do I need to do some merging of occurrences, e.g. reconnecting metrics, events, etc.?
# TODO(lsmola) calling .destroy so it cascade deletes will be expensive
model.where(:id => duplicate.duplicate_ids[1..--1]).delete_all
end
end

def duplicate_data_query_returning_min_id(model, unique_index_columns)
model.group(unique_index_columns).select("min(id)")
end

def cleanup_duplicate_data_delete_all(model, unique_index_columns)
model.where.not(:id => duplicate_data_query_returning_min_id(model, unique_index_columns)).delete_all
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kbrock hum, seems like the group_by query on the Unique Key candidates takes like 10 minute for 130k records, I think mainly because those are not indexed. :-)

So should I first add normal indexes, run clean up, remove indexes and add unique indexes instead of them? Or we'll just wait? Or should we do the cleanup as a rake script, that needs to be run when failing to add Unique DB index?

end

UNIQUE_INDEXES_FOR_MODELS = {
# Just having :ems_id & :ems_ref
ContainerBuild => [:ems_id, :ems_ref],
ContainerBuildPod => [:ems_id, :ems_ref],
ContainerGroup => [:ems_id, :ems_ref],
ContainerLimit => [:ems_id, :ems_ref],
ContainerNode => [:ems_id, :ems_ref],
ContainerProject => [:ems_id, :ems_ref],
ContainerQuota => [:ems_id, :ems_ref],
ContainerReplicator => [:ems_id, :ems_ref],
ContainerRoute => [:ems_id, :ems_ref],
ContainerService => [:ems_id, :ems_ref],
ContainerTemplate => [:ems_id, :ems_ref],
PersistentVolumeClaim => [:ems_id, :ems_ref],
# Having :ems_id but not ems_ref
ContainerComponentStatus => [:ems_id, :name],
ContainerImage => [:ems_id, :image_ref, :container_image_registry_id],
ContainerImageRegistry => [:ems_id, :host, :port],
# Nested tables, not having :ems_id and the foreign_key is a part of the unique index
ContainerCondition => [:container_entity_id, :container_entity_type, :name],
SecurityContext => [:resource_id, :resource_type],
ContainerEnvVar => [:container_definition_id, :name, :value, :field_path],
ContainerLimitItem => [:container_limit_id, :resource, :item_type],
ContainerPortConfig => [:container_definition_id, :ems_ref],
ContainerQuotaItem => [:container_quota_id, :resource],
ContainerServicePortConfig => [:container_service_id, :ems_ref, :protocol],
ContainerTemplateParameter => [:container_template_id, :name],
ContainerVolume => [:parent_id, :parent_type, :name],
CustomAttribute => [:resource_id, :resource_type, :name, :unique_name, :section, :source],
# Questionable
ContainerDefinition => [:ems_id, :ems_ref],
Container => [:ems_id, :ems_ref]
}.freeze

def up
UNIQUE_INDEXES_FOR_MODELS.each do |model, unique_indexes_columns|
say_with_time("Cleanup duplicate data for model #{model}") do
cleanup_duplicate_data_delete_all(model, unique_indexes_columns)
end
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
class AddUniqueIndexesToContainersTables < ActiveRecord::Migration[5.0]
def change
# Just having :ems_id & :ems_ref
add_index :container_builds, [:ems_id, :ems_ref], :unique => true
add_index :container_build_pods, [:ems_id, :ems_ref], :unique => true
add_index :container_groups, [:ems_id, :ems_ref], :unique => true
add_index :container_limits, [:ems_id, :ems_ref], :unique => true
add_index :container_nodes, [:ems_id, :ems_ref], :unique => true
add_index :container_projects, [:ems_id, :ems_ref], :unique => true
add_index :container_quotas, [:ems_id, :ems_ref], :unique => true
add_index :container_replicators, [:ems_id, :ems_ref], :unique => true
add_index :container_routes, [:ems_id, :ems_ref], :unique => true
add_index :container_services, [:ems_id, :ems_ref], :unique => true
add_index :container_templates, [:ems_id, :ems_ref], :unique => true
add_index :persistent_volume_claims, [:ems_id, :ems_ref], :unique => true

# Having :ems_id but not ems_ref
add_index :container_component_statuses,
[:ems_id, :name],
:unique => true
add_index :container_images,
[:ems_id, :image_ref, :container_image_registry_id],
:unique => true,
:name => "index_container_images_unique_multi_column"
add_index :container_image_registries,
[:ems_id, :host, :port],
:unique => true

# Nested tables, not having :ems_id and the foreign_key is a part of the unique index
add_index :container_conditions,
[:container_entity_id, :container_entity_type, :name],
:unique => true,
:name => "index_container_conditions_unique_multi_column"
add_index :security_contexts,
[:resource_id, :resource_type],
:unique => true,
:name => "index_security_contexts_unique_multi_column"
add_index :container_env_vars,
[:container_definition_id, :name, :value, :field_path],
:unique => true,
:name => "index_container_env_vars_unique_multi_column"
add_index :container_limit_items,
[:container_limit_id, :resource, :item_type],
:unique => true,
:name => "index_container_limit_items_unique_multi_column"
add_index :container_port_configs,
[:container_definition_id, :ems_ref],
:unique => true,
:name => "index_container_port_configs_unique_multi_column"
add_index :container_quota_items,
[:container_quota_id, :resource],
:unique => true
add_index :container_service_port_configs,
[:container_service_id, :ems_ref, :protocol], # FIXME(lsmola) I see duplicate ems_refs, because protocol is not part of it
:unique => true,
:name => "index_container_service_port_configs_unique_multi_column"
add_index :container_template_parameters,
[:container_template_id, :name],
:unique => true,
:name => "index_container_template_parameters_unique_multi_column"
add_index :container_volumes,
[:parent_id, :parent_type, :name],
:unique => true # FIXME(lsmola) has unused :ems_ref
add_index :custom_attributes,
[:resource_id, :resource_type, :name, :unique_name, :section, :source],
:unique => true,
:name => "index_custom_attributes_parameters_unique_multi_column"

# FIXME(lsmola) questionable, these were modeled as nested, but they have :ems_id & :ems_ref
# Is ems_ref unique? we were saving these under container_group
add_index :container_definitions, [:ems_id, :ems_ref], :unique => true
# Is ems_ref unique? we were saving these under container_definition
add_index :containers, [:ems_id, :ems_ref], :unique => true
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
require_migration

describe CleanUpDuplicatesInContainersTables do
def model_unique_keys(model)
models[model]
end

def models
described_class::UNIQUE_INDEXES_FOR_MODELS
end

def model_stub(model)
migration_stub(model.to_s.to_sym)
end

def create_test_data(model)
build_data(model, 10, "string_1")
build_data(model, 10, "string_1")
build_data(model, 10, "string_2")
build_data(model, 11, "string_1")
build_data(model, 11, "string_1")
build_data(model, 11, "string_1")
build_data(model, 11, "string_1")
build_data(model, 12, "string_1")
build_data(model, nil, "string_1")
build_data(model, nil, "string_1")
end

def build_data(model, foreign_key_value, string_value)
data_values = model_unique_keys(model).each_with_object({}) do |key, obj|
obj[key] = build_value(key, foreign_key_value, string_value)
end
model.create!(data_values)
end

def build_value(key, foreign_key_value, string_value)
if key.to_s.ends_with?("id")
foreign_key_value
else
string_value
end
end

def analyze(model)
original_values = {}
duplicate_values = {}
model.all.each do |record|
index = record.attributes.symbolize_keys.slice(*model_unique_keys(model))
if original_values[index]
duplicate_values[index] << record.id
else
original_values[index] = record.id
duplicate_values[index] ||= []
end
end

return original_values, duplicate_values
end

def assert_before_migration_test_data(model, original_values, duplicate_values)
expect(model.count).to eq(10)

# Check there are 5 duplicates in the data
expect(original_values.count).to eq(5)
expect(duplicate_values.count).to eq(5)

# Check that original values ids are the min or all duplicated ids
original_values.each do |key, value|
expect((duplicate_values[key] << value).min).to eq value
end
end

def assert_after_migration_test_data(model, original_values, duplicate_values)
expect(model.count).to eq(5)

model.all.each do |record|
expect(original_values[record.attributes.symbolize_keys.slice(*model_unique_keys(model))]).to eq record.id
end
end

migration_context :up do
it "manually checks we clean up duplicates in ContainerBuild build model" do
model = CleanUpDuplicatesInContainersTables::ContainerBuild
create_test_data(model)

expect(model.pluck(*model_unique_keys(model))).to(
match_array(
[
[10, "string_1"],
[10, "string_1"],
[10, "string_2"],
[11, "string_1"],
[11, "string_1"],
[11, "string_1"],
[11, "string_1"],
[12, "string_1"],
[nil, "string_1"],
[nil, "string_1"]
]
)
)

migrate

expect(model.pluck(*model_unique_keys(model))).to(
match_array(
[
[10, "string_1"],
[10, "string_2"],
[11, "string_1"],
[12, "string_1"],
[nil, "string_1"]
]
)
)
end

described_class::UNIQUE_INDEXES_FOR_MODELS.keys.each do |model|
context "with model #{model}" do
it "checks that the duplicate values are cleaned up" do
create_test_data(model)

original_values, duplicate_values = analyze(model)
assert_before_migration_test_data(model, original_values, duplicate_values)
migrate
assert_after_migration_test_data(model, original_values, duplicate_values)
end
end
end
end
end