From 33c9a8bf365c8125a4f6b25d19de0d220e4f4640 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Tue, 17 Dec 2019 19:06:47 +0100 Subject: [PATCH 01/33] Version column sanitization (WIP) See #15231 --- app/models/data_import.rb | 20 +++++++-- app/models/synchronization/adapter.rb | 3 +- app/models/table.rb | 54 +++++++++++------------- services/importer/lib/importer/column.rb | 42 ++++++++++++++++++ 4 files changed, 85 insertions(+), 34 deletions(-) diff --git a/app/models/data_import.rb b/app/models/data_import.rb index 88b46b19fea4..5c089b88deea 100644 --- a/app/models/data_import.rb +++ b/app/models/data_import.rb @@ -24,6 +24,7 @@ require_relative '../../services/importer/lib/importer/overviews' require_relative '../../services/importer/lib/importer/connector_runner' require_relative '../../services/importer/lib/importer/exceptions' +require_relative '../../services/importer/lib/importer/column' require_dependency 'carto/tracking/events' require_dependency 'carto/valid_table_name_proposer' @@ -119,8 +120,9 @@ def after_initialize # For the old dashboard def before_create if self.from_common_data? - self.extra_options = self.extra_options.merge({:common_data => true}) + extra_options.merge! common_data: true end + extra_options.merge! column_sanitization_version: CartoDB::Importer2::Column::CURRENT_COLUMN_SANITIZATION_VERSION end def before_save @@ -426,6 +428,10 @@ def final_state? [STATE_COMPLETE, STATE_FAILURE, STATE_STUCK].include?(state) end + def column_sanitization_version + extra_options[:column_sanitization_version] || CartoDB::Importer2::Column::INITIAL_COLUMN_SANITIZATION_VERSION + end + private def dispatch @@ -574,12 +580,18 @@ def overwrite_strategy? collision_strategy == Carto::DataImportConstants::COLLISION_STRATEGY_OVERWRITE end + def sanitize_columns(table_name) - Table.sanitize_columns(table_name, { + # TODO: is this called before table is registered? otherwise we should use Table#sanitize_columns + Table.sanitize_columns( + table_name, + column_sanitization_version, + { connection: current_user.in_database, database_schema: current_user.database_schema, - reserved_words: CartoDB::Importer2::Column::RESERVED_WORDS - }) + reserved_words: CartoDB::Importer2::Column::RESERVED_WORDS # TODO: review this + } + ) end def migrate_existing(imported_name) diff --git a/app/models/synchronization/adapter.rb b/app/models/synchronization/adapter.rb index b099965078da..1d4081af2b35 100644 --- a/app/models/synchronization/adapter.rb +++ b/app/models/synchronization/adapter.rb @@ -276,7 +276,8 @@ def import_cleanup(schema_name, table_name) user.db_service.in_database_direct_connection(statement_timeout: STATEMENT_TIMEOUT) do |user_database| # For consistency with regular imports, also eases testing - Table.sanitize_columns(table_name, {database_schema: schema_name, connection: user_database}) + # The sanitization of @table_name is applied to the newly imported table_name + @user.tables.where(name: @table_name).sanitize_columns(table_name: table_name, database_schema: schema_name, connection: user_database) # When tables are created using ogr2ogr they are added a ogc_fid or gid primary key # In that case: diff --git a/app/models/table.rb b/app/models/table.rb index 16e3f30f9b67..a700b0cb87d9 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -11,6 +11,7 @@ require_relative './visualization/table_blender' require_relative '../../services/importer/lib/importer/query_batcher' require_relative '../../services/importer/lib/importer/cartodbfy_time' +require_relative '../../services/importer/lib/importer/column' require_relative '../../services/datasources/lib/datasources/decorators/factory' require_relative '../../services/table-geocoder/lib/internal-geocoder/latitude_longitude' require_relative '../model_factories/layer_factory' @@ -227,7 +228,7 @@ def import_to_cartodb(uniname = nil) uniname = get_valid_name(name ? name : migrate_existing_table) unless uniname # Make sure column names are sanitized. Make it consistently. - self.class.sanitize_columns(uniname, {database_schema: owner.database_schema, connection: owner.in_database}) + sanitize_columns(table_name: uniname, database_schema: owner.database_schema, connection: owner.in_database) # with table #{uniname} table created now run migrator to CartoDBify hash_in = ::SequelRails.configuration.environment_for(Rails.env).merge( @@ -1303,14 +1304,31 @@ def get_valid_name(contendent) Carto::ValidTableNameProposer.new.propose_valid_table_name(contendent, taken_names: user_table_names) end - def self.sanitize_columns(table_name, options={}) + def column_sanitization_version + if @data_import + @data_import.column_sanitization_version + else + CartoDB::Importer2::Column::NO_COLUMN_SANITIZATION_VERSION + end + end + + # Apply the sanitization defined for this table. + # If the table_name parameter is passed, the sanitization which was originally applied to self + # is applied to the table so named. + # If no table_name parameters is paseed the normalization is applied to self. + # This allows re-applying the same normalization to imported tables. + def sanitize_columns(table_name: name, **options) + self.class.sanitize_columns(table_name, column_sanitization_version, options) + end + + def self.sanitize_columns(table_name, column_sanitization_version, options={}) connection = options.fetch(:connection) database_schema = options.fetch(:database_schema, 'public') connection.schema(table_name, schema: database_schema, reload: true).each do |column| column_name = column[0].to_s column_type = column[1][:db_type] - column_name = ensure_column_has_valid_name(table_name, column_name, options) + column_name = ensure_column_has_valid_name(table_name, column_name, column_sanitization_version, options) if column_type == 'unknown' CartoDB::ColumnTypecaster.new( user_database: connection, @@ -1323,11 +1341,11 @@ def self.sanitize_columns(table_name, options={}) end end - def self.ensure_column_has_valid_name(table_name, column_name, options={}) + def self.ensure_column_has_valid_name(table_name, column_name, column_sanitization_version, options={}) connection = options.fetch(:connection) database_schema = options.fetch(:database_schema, 'public') - valid_column_name = get_valid_column_name(table_name, column_name, options) + valid_column_name = get_valid_column_name(table_name, column_name, column_sanitization_version, options) if valid_column_name != column_name connection.run(%Q{ALTER TABLE "#{database_schema}"."#{table_name}" RENAME COLUMN "#{column_name}" TO "#{valid_column_name}";}) end @@ -1335,30 +1353,8 @@ def self.ensure_column_has_valid_name(table_name, column_name, options={}) valid_column_name end - def self.get_valid_column_name(table_name, candidate_column_name, options={}) - reserved_words = options.fetch(:reserved_words, []) - - existing_names = get_column_names(table_name, options) - [candidate_column_name] - - candidate_column_name = 'untitled_column' if candidate_column_name.blank? - candidate_column_name = candidate_column_name.to_s.squish - - # Subsequent characters can be letters, underscores or digits - candidate_column_name = candidate_column_name.gsub(/[^a-z0-9]/,'_').gsub(/_{2,}/, '_') - - # Valid names start with a letter or an underscore - candidate_column_name = "column_#{candidate_column_name}" unless candidate_column_name[/^[a-z_]{1}/] - - # Avoid collisions - count = 1 - new_column_name = candidate_column_name - while existing_names.include?(new_column_name) || reserved_words.include?(new_column_name.upcase) - suffix = "_#{count}" - new_column_name = candidate_column_name[0..PG_IDENTIFIER_MAX_LENGTH-suffix.length] + suffix - count += 1 - end - - new_column_name + def self.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, options={}) + CartoDB::Importer2::Column.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, get_column_names(table_name, options), options) end def self.get_column_names(table_name, options={}) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index 2228bf26f7d3..003c9d0cd4fa 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -33,6 +33,10 @@ class Column FORMAT CONTROLLER ACTION } + NO_COLUMN_SANITIZATION_VERSION = 0 + INITIAL_COLUMN_SANITIZATION_VERSION = 1 + CURRENT_COLUMN_SANITIZATION_VERSION = 2 + def initialize(db, table_name, column_name, user, schema = DEFAULT_SCHEMA, job = nil, logger = nil, capture_exceptions = true) @job = job || Job.new({logger: logger}) @db = db @@ -278,6 +282,44 @@ def unsupported?(name) name !~ /^[a-zA-Z_]/ end + def self.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, column_names, options={}) + + return candidate_column_name if column_sanitization_version == NO_COLUMN_SANITIZATION_VERSION + + if column_sanitization_version == INITIAL_COLUMN_SANITIZATION_VERSION + reserved_words = options.fetch(:reserved_words, []) # FIXME: use RESERVED_WORDS ? + + existing_names = column_names - [candidate_column_name] + + candidate_column_name = 'untitled_column' if candidate_column_name.blank? + candidate_column_name = candidate_column_name.to_s.squish + + # Subsequent characters can be letters, underscores or digits + candidate_column_name = candidate_column_name.gsub(/[^a-z0-9]/,'_').gsub(/_{2,}/, '_') + + # Valid names start with a letter or an underscore + candidate_column_name = "column_#{candidate_column_name}" unless candidate_column_name[/^[a-z_]{1}/] + + # Avoid collisions + count = 1 + new_column_name = candidate_column_name + while existing_names.include?(new_column_name) || reserved_words.include?(new_column_name.upcase) + suffix = "_#{count}" + new_column_name = candidate_column_name[0..PG_IDENTIFIER_MAX_LENGTH-suffix.length] + suffix + count += 1 + end + + new_column_name + elsif column_sanitization_version == 1 + + # TODO: use StringSanitizer / DB::Sanitize (handle special characters, maximum length, ...) + candidate_column_name.downcase + + else + raise "Invalid column sanitization version #{column_sanitization_version.inspect}" + end + end + private attr_reader :job, :db, :table_name, :column_name, :schema From ae5622b25cb3155bd0edbe34394976797d291899 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 09:25:37 +0100 Subject: [PATCH 02/33] Fix extra_options modiication Note that the extra_options accessor canot be used to modify the underlying column --- app/models/data_import.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/models/data_import.rb b/app/models/data_import.rb index 5c089b88deea..a1b444773a9f 100644 --- a/app/models/data_import.rb +++ b/app/models/data_import.rb @@ -119,10 +119,10 @@ def after_initialize # New ones are already tracked during the data_import create inside the controller # For the old dashboard def before_create - if self.from_common_data? - extra_options.merge! common_data: true + if from_common_data? + self.extra_options = extra_options.merge(common_data: true) end - extra_options.merge! column_sanitization_version: CartoDB::Importer2::Column::CURRENT_COLUMN_SANITIZATION_VERSION + self.extra_options = extra_options.merge(column_sanitization_version: CartoDB::Importer2::Column::CURRENT_COLUMN_SANITIZATION_VERSION) end def before_save From 2f1a1e91087864dd59809e2a7fc3f772633d2a41 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 10:38:38 +0100 Subject: [PATCH 03/33] Remove reserved_words options --- app/models/data_import.rb | 3 +-- app/models/table.rb | 2 +- services/importer/lib/importer/column.rb | 5 +++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/app/models/data_import.rb b/app/models/data_import.rb index a1b444773a9f..cec818541a3a 100644 --- a/app/models/data_import.rb +++ b/app/models/data_import.rb @@ -588,8 +588,7 @@ def sanitize_columns(table_name) column_sanitization_version, { connection: current_user.in_database, - database_schema: current_user.database_schema, - reserved_words: CartoDB::Importer2::Column::RESERVED_WORDS # TODO: review this + database_schema: current_user.database_schema } ) end diff --git a/app/models/table.rb b/app/models/table.rb index a700b0cb87d9..25ad15f1391f 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -1354,7 +1354,7 @@ def self.ensure_column_has_valid_name(table_name, column_name, column_sanitizati end def self.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, options={}) - CartoDB::Importer2::Column.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, get_column_names(table_name, options), options) + CartoDB::Importer2::Column.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, get_column_names(table_name, options)) end def self.get_column_names(table_name, options={}) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index 003c9d0cd4fa..817e7c453bc4 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -282,12 +282,13 @@ def unsupported?(name) name !~ /^[a-zA-Z_]/ end - def self.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, column_names, options={}) + def self.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, column_names) return candidate_column_name if column_sanitization_version == NO_COLUMN_SANITIZATION_VERSION if column_sanitization_version == INITIAL_COLUMN_SANITIZATION_VERSION - reserved_words = options.fetch(:reserved_words, []) # FIXME: use RESERVED_WORDS ? + # NOTE: originally not all uses of this sanitization version applied reserved words + reserved_words = RESERVED_WORDS existing_names = column_names - [candidate_column_name] From 4d278462da4d2818a4d70dbc8f874cc88dea62dd Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 10:39:08 +0100 Subject: [PATCH 04/33] Fix sanitization version --- services/importer/lib/importer/column.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index 817e7c453bc4..a92f9bc4a129 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -311,7 +311,7 @@ def self.get_valid_column_name(table_name, candidate_column_name, column_sanitiz end new_column_name - elsif column_sanitization_version == 1 + elsif column_sanitization_version == 2 # TODO: use StringSanitizer / DB::Sanitize (handle special characters, maximum length, ...) candidate_column_name.downcase From 40585a93a5cce977fffa8081494ddd06fec60bf4 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 10:39:44 +0100 Subject: [PATCH 05/33] Use existing column sanitization --- services/importer/lib/importer/column.rb | 19 ++++++++++++------- services/importer/spec/unit/column_spec.rb | 8 ++++---- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index a92f9bc4a129..69d6c10b3040 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -269,19 +269,24 @@ def sanitize end def sanitized_name - name = StringSanitizer.new.sanitize(column_name.to_s) - return name unless reserved?(name) || unsupported?(name) - "_#{name}" + self.class.sanitize_name column_name end - def reserved?(name) + def self.reserved?(name) RESERVED_WORDS.include?(name.upcase) end - def unsupported?(name) + def self.unsupported?(name) name !~ /^[a-zA-Z_]/ end + def self.sanitize_name(column_name) + # TODO: take into account Postgres identifier length limitations; use DB::Sanitize ? + name = StringSanitizer.new.sanitize(column_name.to_s) + return name unless reserved?(name) || unsupported?(name) + "_#{name}" + end + def self.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, column_names) return candidate_column_name if column_sanitization_version == NO_COLUMN_SANITIZATION_VERSION @@ -313,8 +318,8 @@ def self.get_valid_column_name(table_name, candidate_column_name, column_sanitiz new_column_name elsif column_sanitization_version == 2 - # TODO: use StringSanitizer / DB::Sanitize (handle special characters, maximum length, ...) - candidate_column_name.downcase + sanitize_name(candidate_column_name) + # TODO: Avoid collisions? else raise "Invalid column sanitization version #{column_sanitization_version.inspect}" diff --git a/services/importer/spec/unit/column_spec.rb b/services/importer/spec/unit/column_spec.rb index 5435119819e5..4bcefb105cc9 100644 --- a/services/importer/spec/unit/column_spec.rb +++ b/services/importer/spec/unit/column_spec.rb @@ -243,15 +243,15 @@ describe '#reserved?' do it 'returns true if name is a reserved keyword' do - @column.reserved?('select').should eq true - @column.reserved?('bogus').should eq false + Column.reserved?('select').should eq true + Column.reserved?('bogus').should eq false end end #reserved? describe '#unsupported?' do it 'returns true if name is not supported by Postgres' do - @column.unsupported?('9name').should eq true - @column.unsupported?('name9').should eq false + Column.unsupported?('9name').should eq true + Column.unsupported?('name9').should eq false end end #unsupported? From 5cc3e119ecb0b0c26045337871144f3d47fb2d42 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 10:40:08 +0100 Subject: [PATCH 06/33] Fix fetching Table --- app/models/synchronization/adapter.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/models/synchronization/adapter.rb b/app/models/synchronization/adapter.rb index 1d4081af2b35..47f6dd3cc673 100644 --- a/app/models/synchronization/adapter.rb +++ b/app/models/synchronization/adapter.rb @@ -277,7 +277,8 @@ def import_cleanup(schema_name, table_name) # For consistency with regular imports, also eases testing # The sanitization of @table_name is applied to the newly imported table_name - @user.tables.where(name: @table_name).sanitize_columns(table_name: table_name, database_schema: schema_name, connection: user_database) + table = ::Table.new(name: @table_name, user_id: @user.id) + table.sanitize_columns(table_name: table_name, database_schema: schema_name, connection: user_database) # When tables are created using ogr2ogr they are added a ogc_fid or gid primary key # In that case: From e282e04efc48cb0aa45d5b659b33ac03acdd75d9 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 10:40:22 +0100 Subject: [PATCH 07/33] Make method public --- app/models/table.rb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/app/models/table.rb b/app/models/table.rb index 25ad15f1391f..f613aef8de86 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -1222,6 +1222,15 @@ def update_bounding_box SequelRails.connection.run(update_sql) end + # Apply the sanitization defined for this table. + # If the table_name parameter is passed, the sanitization which was originally applied to self + # is applied to the table so named. + # If no table_name parameters is paseed the normalization is applied to self. + # This allows re-applying the same normalization to imported tables. + def sanitize_columns(table_name: name, **options) + self.class.sanitize_columns(table_name, column_sanitization_version, options) + end + private def valid_cartodb_id_candidate?(col_name) @@ -1312,15 +1321,6 @@ def column_sanitization_version end end - # Apply the sanitization defined for this table. - # If the table_name parameter is passed, the sanitization which was originally applied to self - # is applied to the table so named. - # If no table_name parameters is paseed the normalization is applied to self. - # This allows re-applying the same normalization to imported tables. - def sanitize_columns(table_name: name, **options) - self.class.sanitize_columns(table_name, column_sanitization_version, options) - end - def self.sanitize_columns(table_name, column_sanitization_version, options={}) connection = options.fetch(:connection) database_schema = options.fetch(:database_schema, 'public') From 69ce1079ef53c23de633217f7ba6a275b5508a91 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 10:44:12 +0100 Subject: [PATCH 08/33] Add NEWS entry --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index b9cb4268e8ca..527cffd40632 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,7 @@ Development ([#15266](https://github.com/CartoDB/cartodb/pull/15266)) ### Bug fixes / enhancements +* New versioned sanitization of column names ([#15326](https://github.com/CartoDB/cartodb/issues/15326)) - Avoid warnings when running test in parallel with an empty environment - Improve concurrent Ghost Tables syncs handling ([#15272](https://github.com/CartoDB/cartodb/pull/15272)) - Fix consent screen in OAuth apps without user ([#15247](https://github.com/CartoDB/cartodb/pull/15247)) From 77228e5d4ab37f8d3efd74bba1983e3fb68fc90f Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 23:19:16 +0100 Subject: [PATCH 09/33] Refactor sanitization functions --- app/models/table.rb | 22 +-- config/initializers/carto_db.rb | 2 - config/initializers/string.rb | 144 ------------------ lib/carto/db/sanitize.rb | 35 +++-- lib/importer/lib/cartodb-migrator/migrator.rb | 4 +- lib/tasks/db_maintenance.rake | 3 +- services/importer/lib/importer/column.rb | 35 ++--- .../importer/lib/importer/string_sanitizer.rb | 136 +++++++++++++---- spec/models/table_spec.rb | 4 +- 9 files changed, 160 insertions(+), 225 deletions(-) diff --git a/app/models/table.rb b/app/models/table.rb index f613aef8de86..aad2a0f2a173 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -38,13 +38,6 @@ class Table GEOMETRY_TYPES_PRESENT_CACHING_TIMEOUT = 24.hours STATEMENT_TIMEOUT = 1.hour * 1000 - # See http://www.postgresql.org/docs/9.5/static/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS - PG_IDENTIFIER_MAX_LENGTH = 63 - - # @see services/importer/lib/importer/column.rb -> RESERVED_WORDS - # @see config/initializers/carto_db.rb -> RESERVED_COLUMN_NAMES - RESERVED_COLUMN_NAMES = %w(oid tableoid xmin cmin xmax cmax ctid ogc_fid).freeze - DEFAULT_THE_GEOM_TYPE = 'geometry' VALID_GEOMETRY_TYPES = %W{ geometry multipolygon point multilinestring } @@ -677,10 +670,11 @@ def make_sequel_compatible(attributes) end def add_column!(options) - raise CartoDB::InvalidColumnName if RESERVED_COLUMN_NAMES.include?(options[:name]) || options[:name] =~ /^[0-9]/ + raise CartoDB::InvalidColumnName if CartoDB::Importer2::Column.reserved_or_unsupported?(options[:name]) type = options[:type].convert_to_db_type cartodb_type = options[:type].convert_to_cartodb_type - column_name = options[:name].to_s.sanitize_column_name + # FIXME: consider CartoDB::Importer2::Column.get_valid_column_name with CURRENT_COLUMN_SANITIZATION_VERSION + column_name = CartoDB::Importer2::Column.sanitize_name(options[:name].to_s) owner.in_database.add_column name, column_name, type update_cdb_tablemetadata @@ -701,12 +695,12 @@ def drop_column!(options) end def modify_column!(options) - old_name = options.fetch(:name, '').to_s.sanitize - new_name = options.fetch(:new_name, '').to_s.sanitize + # FIXME: consider CartoDB::Importer2::Column.get_valid_column_name with CURRENT_COLUMN_SANITIZATION_VERSION + old_name = CartoDB::Importer2::Column.sanitize_name(options.fetch(:name, '').to_s) + new_name = CartoDB::Importer2::Column.sanitize_name(options.fetch(:new_name, '').to_s) raise 'This column cannot be modified' if CARTODB_COLUMNS.include?(old_name.to_s) if new_name.present? && new_name != old_name - new_name = new_name.sanitize_column_name rename_column(old_name, new_name) end @@ -732,7 +726,7 @@ def rename_column(old_name, new_name='') raise 'Please provide a column name' if new_name.empty? raise 'This column cannot be renamed' if CARTODB_COLUMNS.include?(old_name.to_s) - if new_name =~ /^[0-9]/ || RESERVED_COLUMN_NAMES.include?(new_name) || CARTODB_COLUMNS.include?(new_name) + if CartoDB::Importer2::Column.reserved_or_unsupported?(new_name) || CARTODB_COLUMNS.include?(new_name) raise CartoDB::InvalidColumnName, 'That column name is reserved, please choose a different one' end @@ -1436,7 +1430,7 @@ def create_table_in_database! sanitized_force_schema = force_schema.split(',').map do |column| # Convert existing primary key into a unique key if column =~ /\A\s*\"([^\"]+)\"(.*)\z/ - "#{$1.sanitize} #{$2.gsub(/primary\s+key/i,'UNIQUE')}" + "#{CartoDB::Importer2::Column.sanitize_name $1} #{$2.gsub(/primary\s+key/i,'UNIQUE')}" else column.gsub(/primary\s+key/i,'UNIQUE') end diff --git a/config/initializers/carto_db.rb b/config/initializers/carto_db.rb index 2c7ae745f7b8..8a8f05aec8e4 100644 --- a/config/initializers/carto_db.rb +++ b/config/initializers/carto_db.rb @@ -20,8 +20,6 @@ module CartoDB SURROGATE_NAMESPACE_PUBLIC_PAGES = 'rp'.freeze SURROGATE_NAMESPACE_VIZJSON = 'rj'.freeze - RESERVED_COLUMN_NAMES = %w(FORMAT CONTROLLER ACTION oid tableoid xmin cmin xmax cmax ctid ogc_fid).freeze - LAST_BLOG_POSTS_FILE_PATH = "#{Rails.root}/public/system/last_blog_posts.html" # Helper method to encapsulate Rails full URL generation compatible with our subdomainless mode diff --git a/config/initializers/string.rb b/config/initializers/string.rb index bc1d4cb680b9..4d63372c5d0c 100644 --- a/config/initializers/string.rb +++ b/config/initializers/string.rb @@ -3,137 +3,10 @@ def self.random(length=10) ('a'..'z').sort_by {rand}[0,length].join end - def normalize - str = self.downcase - return '' if str.blank? - n = str.force_encoding("UTF-8") - n.gsub!(/[àáâãäåāă]/, 'a') - n.gsub!(/æ/, 'ae') - n.gsub!(/[ďđ]/, 'd') - n.gsub!(/[çćčĉċ]/, 'c') - n.gsub!(/[èéêëēęěĕė]/, 'e') - n.gsub!(/ƒ/, 'f') - n.gsub!(/[ĝğġģ]/, 'g') - n.gsub!(/[ĥħ]/, 'h') - n.gsub!(/[ììíîïīĩĭ]/, 'i') - n.gsub!(/[įıijĵ]/, 'j') - n.gsub!(/[ķĸ]/, 'k') - n.gsub!(/[łľĺļŀ]/, 'l') - n.gsub!(/[ñńňņʼnŋ]/, 'n') - n.gsub!(/[òóôõöøōőŏŏ]/, 'o') - n.gsub!(/œ/, 'oe') - n.gsub!(/ą/, 'q') - n.gsub!(/[ŕřŗ]/, 'r') - n.gsub!(/[śšşŝș]/, 's') - n.gsub!(/[ťţŧț]/, 't') - n.gsub!(/[ùúûüūůűŭũų]/, 'u') - n.gsub!(/ŵ/, 'w') - n.gsub!(/[ýÿŷ]/, 'y') - n.gsub!(/[žżź]/, 'z') - n.gsub!(/[ÀÁÂÃÄÅĀĂ]/i, 'A') - n.gsub!(/Æ/i, 'AE') - n.gsub!(/[ĎĐ]/i, 'D') - n.gsub!(/[ÇĆČĈĊ]/i, 'C') - n.gsub!(/[ÈÉÊËĒĘĚĔĖ]/i, 'E') - n.gsub!(/Ƒ/i, 'F') - n.gsub!(/[ĜĞĠĢ]/i, 'G') - n.gsub!(/[ĤĦ]/i, 'H') - n.gsub!(/[ÌÌÍÎÏĪĨĬ]/i, 'I') - n.gsub!(/[IJĴ]/i, 'J') - n.gsub!(/[Ķĸ]/i, 'J') - n.gsub!(/[ŁĽĹĻĿ]/i, 'L') - n.gsub!(/[ÑŃŇŅʼnŊ]/i, 'N') - n.gsub!(/[ÒÓÔÕÖØŌŐŎŎ]/i, 'O') - n.gsub!(/Œ/i, 'OE') - n.gsub!(/Ą/i, 'Q') - n.gsub!(/[ŔŘŖ]/i, 'R') - n.gsub!(/[ŚŠŞŜȘ]/i, 'S') - n.gsub!(/[ŤŢŦȚ]/i, 'T') - n.gsub!(/[ÙÚÛÜŪŮŰŬŨŲ]/i, 'U') - n.gsub!(/Ŵ/i, 'W') - n.gsub!(/[ÝŸŶ]/i, 'Y') - n.gsub!(/[ŽŻŹ]/i, 'Z') - n.gsub!(/A/i, 'A') - n.gsub!(/а/i, 'a') - n.gsub!(/Б/i, 'B') - n.gsub!(/б/i, 'b') - n.gsub!(/В/i, 'V') - n.gsub!(/в/i, 'v') - n.gsub!(/Г/i, 'G') - n.gsub!(/г/i, 'g') - n.gsub!(/Д/i, 'D') - n.gsub!(/д/i, 'd') - n.gsub!(/Е/i, 'E') - n.gsub!(/е/i, 'e') - n.gsub!(/Ё/i, 'Yo') - n.gsub!(/ё/i, 'yo') - n.gsub!(/Ж/i, 'Zh') - n.gsub!(/ж/i, 'zh') - n.gsub!(/З/i, 'Z') - n.gsub!(/з/i, 'z') - n.gsub!(/И/i, 'I') - n.gsub!(/и/i, 'i') - n.gsub!(/Й/i, 'J') - n.gsub!(/й/i, 'j') - n.gsub!(/К/i, 'K') - n.gsub!(/к/i, 'k') - n.gsub!(/Л/i, 'L') - n.gsub!(/л/i, 'l') - n.gsub!(/М/i, 'M') - n.gsub!(/м/i, 'm') - n.gsub!(/Н/i, 'N') - n.gsub!(/н/i, 'n') - n.gsub!(/О/i, 'O') - n.gsub!(/о/i, 'o') - n.gsub!(/П/i, 'P') - n.gsub!(/п/i, 'p') - n.gsub!(/Р/i, 'R') - n.gsub!(/р/i, 'r') - n.gsub!(/С/i, 'S') - n.gsub!(/с/i, 's') - n.gsub!(/Т/i, 'T') - n.gsub!(/т/i, 't') - n.gsub!(/У/i, 'U') - n.gsub!(/у/i, 'u') - n.gsub!(/Ф/i, 'F') - n.gsub!(/ф/i, 'f') - n.gsub!(/Х/i, 'X') - n.gsub!(/х/i, 'x') - n.gsub!(/Ц/i, 'Cz') - n.gsub!(/ц/i, 'cz') - n.gsub!(/Ч/i, 'Ch') - n.gsub!(/ч/i, 'ch') - n.gsub!(/Ш/i, 'Sh') - n.gsub!(/ш/i, 'sh') - n.gsub!(/Щ/i, 'Shh') - n.gsub!(/щ/i, 'shh') - n.gsub!(/Ъ/i, '') - n.gsub!(/ъ/i, '') - n.gsub!(/Ы/i, 'Y') - n.gsub!(/ы/i, 'y') - n.gsub!(/Ь/i, '') - n.gsub!(/ь/i, '') - n.gsub!(/Э/i, 'E') - n.gsub!(/э/i, 'e') - n.gsub!(/Ю/i, 'Yu') - n.gsub!(/ю/i, 'yu') - n.gsub!(/Я/i, 'Ya') - n.gsub!(/я/i, 'ya') - n - end - - def sanitize - return if self.blank? - self.gsub(/<[^>]+>/m,'').normalize.downcase.gsub(/&.+?;/,'-'). - gsub(/[^a-z0-9 _-]/,'-').strip.gsub(/\s+/,'-').gsub(/-+/,'-'). - gsub(/-/,' ').strip.gsub(/ /,'-').gsub(/-/,'_') - end - def strip_tags self.gsub(/<[^>]+>/m,'').strip end - def get_cartodb_types { "number" => ["smallint", /numeric\(\d+,\d+\)/, "integer", "bigint", "decimal", "numeric", "double precision", "serial", "big serial", "real"], @@ -178,21 +51,4 @@ def convert_to_cartodb_type self.downcase end end - - def sanitize_sql - self.gsub(/\\/, '\&\&').gsub(/'/, "''") - end - - def sanitize_column_name - temporal_name = sanitize || '' - - if temporal_name !~ /^[a-zA-Z_]/ || - Carto::DB::Sanitize::RESERVED_WORDS.include?(downcase) || - CartoDB::RESERVED_COLUMN_NAMES.include?(upcase) - return '_' + temporal_name - else - temporal_name - end - end - end diff --git a/lib/carto/db/sanitize.rb b/lib/carto/db/sanitize.rb index 30fa54a89548..3f6b93a4af2a 100644 --- a/lib/carto/db/sanitize.rb +++ b/lib/carto/db/sanitize.rb @@ -4,6 +4,7 @@ module Sanitize PREFIX_REPLACEMENT = 'table_'.freeze SUFFIX_REPLACEMENT = '_t'.freeze CHARACTER_REPLACEMENT = '_'.freeze + # See https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS MAX_IDENTIFIER_LENGTH = 63 DISALLOWED_STARTING_CHARACTERS_REGEX = /^[^a-z]+/ DISALLOWED_CHARACTERS_REGEX = /[^a-z|_|0-9]/ @@ -11,14 +12,29 @@ module Sanitize SYSTEM_TABLE_NAMES = %w(spatial_ref_sys geography_columns geometry_columns raster_columns raster_overviews cdb_tablemetadata geometry raster).freeze RESERVED_TABLE_NAMES = %w(layergroup all public).freeze - RESERVED_WORDS = %w(all analyse analyze and any array as asc asymmetric authorization between binary both - case cast check collate column constraint create cross current_date current_role - current_time current_timestamp current_user default deferrable desc distinct do else - end except false for foreign freeze from full grant group having ilike in initially - inner intersect into is isnull join leading left like limit localtime localtimestamp - natural new not notnull null off offset old on only or order outer overlaps placing - primary references right select session_user similar some symmetric table then to - trailing true union unique user using verbose when where xmin xmax).freeze + + # See https://www.postgresql.org/docs/current/ddl-system-columns.html + SYSTEM_COLUMN_NAMES = %w(tableoid xmin cmin xmax cmax ctid).freeze + + # See https://www.postgresql.org/docs/current/sql-keywords-appendix.html + RESERVED_WORDS = %w(all analyse analyze and any array as asc asymmetric authorization binary both + case cast check collate collation column concurrently constraint create cross + current_catalog current_date current_role current_schema current_time + current_timestamp current_user default deferrable desc distinct do else end + except false fetch for foreign freeze from full grant group having ilike in + initially inner intersect into is isnull join lateral leading left like limit + localtime localtimestamp natural not notnull null offset on only or order outer + overlaps placing primary references returning right select session_user similar + some symmetric table tablesample then to trailing true union unique user using + variadic verbose when where window with).freeze + + ADDITIONAL_RESERVED_COLUMNS = %w(ogc_fid).freeze + + # FIXME we have been reserving these name in columns but I don't know the reason ¯\_(ツ)_/¯ + ADDITIONAL_WORDS = %w(between new off old format controller action).freeze + + RESERVED_COLUMN_NAMES = SYSTEM_COLUMN_NAMES + RESERVED_WORDS + ADDITIONAL_RESERVED_COLUMNS + ADDITIONAL_WORDS + def self.append_with_truncate_and_sanitize(identifier, suffix) suffix_length = suffix.length @@ -58,7 +74,8 @@ def self.sanitize_identifier(identifier) sanitized_identifier = sanitized_identifier[0..(MAX_IDENTIFIER_LENGTH - 1)] # Append _t if is a reserved word or reserved table name - if (RESERVED_WORDS + RESERVED_TABLE_NAMES + SYSTEM_TABLE_NAMES).map(&:downcase).include?(sanitized_identifier) + # NOTE: strictly, we don't need to avoid ADDITIONAL_WORDS for table names + if (RESERVED_WORDS + RESERVED_TABLE_NAMES + SYSTEM_TABLE_NAMES + ADDITIONAL_WORDS).map(&:downcase).include?(sanitized_identifier) sanitized_identifier += SUFFIX_REPLACEMENT end diff --git a/lib/importer/lib/cartodb-migrator/migrator.rb b/lib/importer/lib/cartodb-migrator/migrator.rb index 8088e32386a4..3f8d4a1c36c2 100644 --- a/lib/importer/lib/cartodb-migrator/migrator.rb +++ b/lib/importer/lib/cartodb-migrator/migrator.rb @@ -84,14 +84,14 @@ def sanitize_columns! def sanitize(column_names) columns_to_sanitize = column_names.select do |column_name| - column_name != column_name.sanitize_column_name + column_name != CartoDB::Importer2::Column.sanitize_name(column_name) end correct_columns = column_names - columns_to_sanitize sanitization_map = Hash[ columns_to_sanitize.map { |column_name| - [column_name, column_name.sanitize_column_name] + [column_name, CartoDB::Importer2::Column.sanitize_name(column_name)] } ] diff --git a/lib/tasks/db_maintenance.rake b/lib/tasks/db_maintenance.rake index 55eb5109f7ce..52c3c1c000e2 100644 --- a/lib/tasks/db_maintenance.rake +++ b/lib/tasks/db_maintenance.rake @@ -1026,7 +1026,8 @@ namespace :cartodb do tables = JSON.parse(File.read(args['table_definition_json_path'].to_s)) u.in_database({as: :superuser, no_cartodb_in_schema: true}) do |db| db.transaction do - server_name = "oracle_#{args[:oracle_url].sanitize}_#{Time.now.to_i}" + name = CartoDB::Importer2::StringSanitizer.sanitize({args[:oracle_url], cyrillic: true) + server_name = "oracle_#{name}_#{Time.now.to_i}" db.run('CREATE EXTENSION oracle_fdw') unless db.fetch(%Q{ SELECT count(*) FROM pg_extension WHERE extname='oracle_fdw' }).first[:count] > 0 diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index 69d6c10b3040..c617cbe591ee 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -5,6 +5,8 @@ require_relative './exceptions' require_relative './query_batcher' +require_relative '../../../../lib/carto/db/sanitize' + module CartoDB module Importer2 class Column @@ -16,22 +18,8 @@ class Column KML_POINT_RE = // DEFAULT_SCHEMA = 'cdb_importer' DIRECT_STATEMENT_TIMEOUT = 1.hour * 1000 - # @see config/initializers/carto_db.rb -> POSTGRESQL_RESERVED_WORDS - RESERVED_WORDS = %w{ ALL ANALYSE ANALYZE AND ANY ARRAY AS ASC ASYMMETRIC - AUTHORIZATION BETWEEN BINARY BOTH CASE CAST CHECK - COLLATE COLUMN CONSTRAINT CREATE CROSS CURRENT_DATE - CURRENT_ROLE CURRENT_TIME CURRENT_TIMESTAMP - CURRENT_USER DEFAULT DEFERRABLE DESC DISTINCT DO - ELSE END EXCEPT FALSE FOR FOREIGN FREEZE FROM FULL - GRANT GROUP HAVING ILIKE IN INITIALLY INNER INTERSECT - INTO IS ISNULL JOIN LEADING LEFT LIKE LIMIT LOCALTIME - LOCALTIMESTAMP NATURAL NEW NOT NOTNULL NULL OFF - OFFSET OLD ON ONLY OR ORDER OUTER OVERLAPS PLACING - PRIMARY REFERENCES RIGHT SELECT SESSION_USER SIMILAR - SOME SYMMETRIC TABLE THEN TO TRAILING TRUE UNION - UNIQUE USER USING VERBOSE WHEN WHERE XMIN XMAX - FORMAT CONTROLLER ACTION - } + RESERVED_COLUMN_NAMES = Carto::DB::Sanitize::RESERVED_COLUMN_NAMES + PG_IDENTIFIER_MAX_LENGTH = Carto::DB::Sanitize::MAX_IDENTIFIER_LENGTH NO_COLUMN_SANITIZATION_VERSION = 0 INITIAL_COLUMN_SANITIZATION_VERSION = 1 @@ -273,17 +261,20 @@ def sanitized_name end def self.reserved?(name) - RESERVED_WORDS.include?(name.upcase) + RESERVED_COLUMN_NAMES.include?(name.upcase) end def self.unsupported?(name) name !~ /^[a-zA-Z_]/ end + def self.reserved_or_unsupported?(name) + reserved?(name) || unsupported?(name) + end + def self.sanitize_name(column_name) - # TODO: take into account Postgres identifier length limitations; use DB::Sanitize ? - name = StringSanitizer.new.sanitize(column_name.to_s) - return name unless reserved?(name) || unsupported?(name) + name = StringSanitizer.new.sanitize(column_name.to_s, cyrillic: false) + return name unless reserved_or_unsupported?(name) "_#{name}" end @@ -293,7 +284,7 @@ def self.get_valid_column_name(table_name, candidate_column_name, column_sanitiz if column_sanitization_version == INITIAL_COLUMN_SANITIZATION_VERSION # NOTE: originally not all uses of this sanitization version applied reserved words - reserved_words = RESERVED_WORDS + reserved_words = RESERVED_COLUMN_NAMES existing_names = column_names - [candidate_column_name] @@ -319,7 +310,7 @@ def self.get_valid_column_name(table_name, candidate_column_name, column_sanitiz elsif column_sanitization_version == 2 sanitize_name(candidate_column_name) - # TODO: Avoid collisions? + # TODO: Avoid collisions & apply PG_IDENTIFIER_MAX_LENGTH limit?; else raise "Invalid column sanitization version #{column_sanitization_version.inspect}" diff --git a/services/importer/lib/importer/string_sanitizer.rb b/services/importer/lib/importer/string_sanitizer.rb index b66bbc01b669..2730eb82ac60 100644 --- a/services/importer/lib/importer/string_sanitizer.rb +++ b/services/importer/lib/importer/string_sanitizer.rb @@ -1,11 +1,11 @@ module CartoDB module Importer2 class StringSanitizer - def normalize(string) + def normalize(string, cyrillic: false) return '' if string.nil? || string.empty? - n = string.downcase.force_encoding("UTF-8") - n.gsub!(/[àáâãäåāă]/, 'a') + n = string.force_encoding("UTF-8") + n.gsub!(/[àáâãäåāă]/, 'a') n.gsub!(/æ/, 'ae') n.gsub!(/[ďđ]/, 'd') n.gsub!(/[çćčĉċ]/, 'c') @@ -28,36 +28,114 @@ def normalize(string) n.gsub!(/ŵ/, 'w') n.gsub!(/[ýÿŷ]/, 'y') n.gsub!(/[žżź]/, 'z') - n.gsub!(/[ÀÁÂÃÄÅĀĂ]/i, 'A') - n.gsub!(/Æ/i, 'AE') - n.gsub!(/[ĎĐ]/i, 'D') - n.gsub!(/[ÇĆČĈĊ]/i, 'C') - n.gsub!(/[ÈÉÊËĒĘĚĔĖ]/i, 'E') - n.gsub!(/Ƒ/i, 'F') - n.gsub!(/[ĜĞĠĢ]/i, 'G') - n.gsub!(/[ĤĦ]/i, 'H') - n.gsub!(/[ÌÌÍÎÏĪĨĬ]/i, 'I') - n.gsub!(/[IJĴ]/i, 'J') - n.gsub!(/[Ķĸ]/i, 'J') - n.gsub!(/[ŁĽĹĻĿ]/i, 'L') - n.gsub!(/[ÑŃŇŅʼnŊ]/i, 'M') - n.gsub!(/[ÒÓÔÕÖØŌŐŎŎ]/i, 'N') - n.gsub!(/Œ/i, 'OE') - n.gsub!(/Ą/i, 'Q') - n.gsub!(/[ŔŘŖ]/i, 'R') - n.gsub!(/[ŚŠŞŜȘ]/i, 'S') - n.gsub!(/[ŤŢŦȚ]/i, 'T') - n.gsub!(/[ÙÚÛÜŪŮŰŬŨŲ]/i, 'U') - n.gsub!(/Ŵ/i, 'W') - n.gsub!(/[ÝŸŶ]/i, 'Y') - n.gsub!(/[ŽŻŹ]/i, 'Z') + n.gsub!(/[ÀÁÂÃÄÅĀĂ]/, 'A') + n.gsub!(/Æ/, 'AE') + n.gsub!(/[ĎĐ]/, 'D') + n.gsub!(/[ÇĆČĈĊ]/, 'C') + n.gsub!(/[ÈÉÊËĒĘĚĔĖ]/, 'E') + n.gsub!(/Ƒ/i, 'F') + n.gsub!(/[ĜĞĠĢ]/, 'G') + n.gsub!(/[ĤĦ]/i, 'H') + n.gsub!(/[ÌÌÍÎÏĪĨĬ]/, 'I') + n.gsub!(/[IJĴ]/, 'J') + n.gsub!(/[Ķĸ]/, 'K') + n.gsub!(/[ŁĽĹĻĿ]/, 'L') + n.gsub!(/[ÑŃŇŅʼnŊ]/, 'N') + n.gsub!(/[ÒÓÔÕÖØŌŐŎŎ]/, 'O') + n.gsub!(/Œ/, 'OE') + n.gsub!(/Ą/, 'Q') + n.gsub!(/[ŔŘŖ]/, 'R') + n.gsub!(/[ŚŠŞŜȘ]/, 'S') + n.gsub!(/[ŤŢŦȚ]/, 'T') + n.gsub!(/[ÙÚÛÜŪŮŰŬŨŲ]/, 'U') + n.gsub!(/Ŵ/, 'W') + n.gsub!(/[ÝŸŶ]/, 'Y') + n.gsub!(/[ŽŻŹ]/, 'Z') + if cyrillic + n.gsub!(/Б/, 'B') + n.gsub!(/б/, 'b') + n.gsub!(/В/, 'V') + n.gsub!(/в/, 'v') + n.gsub!(/Г/, 'G') + n.gsub!(/г/, 'g') + n.gsub!(/Д/, 'D') + n.gsub!(/д/, 'd') + n.gsub!(/Е/, 'E') + n.gsub!(/е/, 'e') + n.gsub!(/Ё/, 'Yo') + n.gsub!(/ё/, 'yo') + n.gsub!(/Ж/, 'Zh') + n.gsub!(/ж/, 'zh') + n.gsub!(/З/, 'Z') + n.gsub!(/з/, 'z') + n.gsub!(/И/, 'I') + n.gsub!(/и/, 'i') + n.gsub!(/Й/, 'J') + n.gsub!(/й/, 'j') + n.gsub!(/К/, 'K') + n.gsub!(/к/, 'k') + n.gsub!(/Л/, 'L') + n.gsub!(/л/, 'l') + n.gsub!(/М/, 'M') + n.gsub!(/м/, 'm') + n.gsub!(/Н/, 'N') + n.gsub!(/н/, 'n') + n.gsub!(/О/, 'O') + n.gsub!(/о/, 'o') + n.gsub!(/П/, 'P') + n.gsub!(/п/, 'p') + n.gsub!(/Р/, 'R') + n.gsub!(/р/, 'r') + n.gsub!(/С/, 'S') + n.gsub!(/с/, 's') + n.gsub!(/Т/, 'T') + n.gsub!(/т/, 't') + n.gsub!(/У/, 'U') + n.gsub!(/у/, 'u') + n.gsub!(/Ф/, 'F') + n.gsub!(/ф/, 'f') + n.gsub!(/Х/, 'X') + n.gsub!(/х/, 'x') + n.gsub!(/Ц/, 'Cz') + n.gsub!(/ц/, 'cz') + n.gsub!(/Ч/, 'Ch') + n.gsub!(/ч/, 'ch') + n.gsub!(/Ш/, 'Sh') + n.gsub!(/ш/, 'sh') + n.gsub!(/Щ/, 'Shh') + n.gsub!(/щ/, 'shh') + n.gsub!(/Ъ/, '') + n.gsub!(/ъ/, '') + n.gsub!(/Ы/, 'Y') + n.gsub!(/ы/, 'y') + n.gsub!(/Ь/, '') + n.gsub!(/ь/, '') + n.gsub!(/Э/, 'E') + n.gsub!(/э/, 'e') + n.gsub!(/Ю/, 'Yu') + n.gsub!(/ю/, 'yu') + n.gsub!(/Я/, 'Ya') + n.gsub!(/я/, 'ya') + end n end #normalize - def sanitize(string) - return '' if string.nil? || string.empty? + def legacy_sanitize(string) + return '' if string.nil? || string.empty? + normalize(string.downcase.gsub(/<[^>]+>/m,''), cyrillic: false) + .gsub(/&.+?;/,'-') + .gsub(/[^a-z0-9 _-]/,'-').strip + .gsub(/\s+/,'-') + .gsub(/-+/,'-') + .gsub(/-/,' ').strip + .gsub(/ /,'-') + .gsub(/-/,'_') + end - normalize(string.gsub(/<[^>]+>/m,'')) + def sanitize(string, cyrillic: false) + return '' if string.nil? || string.empty? + normalize(string.gsub(/<[^>]+>/m,''), cyrillic: cyrillic) + .downcase .gsub(/&.+?;/,'-') .gsub(/[^a-z0-9 _-]/,'-').strip .gsub(/\s+/,'-') diff --git a/spec/models/table_spec.rb b/spec/models/table_spec.rb index 3a73f24e2c54..9a34aabb041a 100644 --- a/spec/models/table_spec.rb +++ b/spec/models/table_spec.rb @@ -552,7 +552,7 @@ def expect_save_to_fail_validation(table) table.name = 'Wadus table #23' table.save table.reload - table.name.should == "Wadus table #23".sanitize + table.name.should == CartoDB::Importer2::StringSanitizer.sanitize("Wadus table #23") @user.in_database do |user_database| user_database.table_exists?('wadus_table'.to_sym).should be_false user_database.table_exists?('wadus_table_23'.to_sym).should be_true @@ -561,7 +561,7 @@ def expect_save_to_fail_validation(table) table.name = '' table.save table.reload - table.name.should == "Wadus table #23".sanitize + table.name.should == CartoDB::Importer2::StringSanitizer.sanitize("Wadus table #23") @user.in_database do |user_database| user_database.table_exists?('wadus_table_23'.to_sym).should be_true end From 198a30ecfa027f783f65ff99ce9658d937bbeca6 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Wed, 18 Dec 2019 23:30:43 +0100 Subject: [PATCH 10/33] Fix syntax --- lib/tasks/db_maintenance.rake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tasks/db_maintenance.rake b/lib/tasks/db_maintenance.rake index 52c3c1c000e2..305f3cc37340 100644 --- a/lib/tasks/db_maintenance.rake +++ b/lib/tasks/db_maintenance.rake @@ -1026,7 +1026,7 @@ namespace :cartodb do tables = JSON.parse(File.read(args['table_definition_json_path'].to_s)) u.in_database({as: :superuser, no_cartodb_in_schema: true}) do |db| db.transaction do - name = CartoDB::Importer2::StringSanitizer.sanitize({args[:oracle_url], cyrillic: true) + name = CartoDB::Importer2::StringSanitizer.sanitize(args[:oracle_url], cyrillic: true) server_name = "oracle_#{name}_#{Time.now.to_i}" db.run('CREATE EXTENSION oracle_fdw') unless db.fetch(%Q{ SELECT count(*) FROM pg_extension WHERE extname='oracle_fdw' From ab4db5a6b5aa02f415704867f14631bc67b4b778 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 00:02:36 +0100 Subject: [PATCH 11/33] Fix reserved name detection --- services/importer/lib/importer/column.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index c617cbe591ee..3f3c98fb6967 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -261,7 +261,7 @@ def sanitized_name end def self.reserved?(name) - RESERVED_COLUMN_NAMES.include?(name.upcase) + RESERVED_COLUMN_NAMES.include?(name.downcase) end def self.unsupported?(name) @@ -300,7 +300,7 @@ def self.get_valid_column_name(table_name, candidate_column_name, column_sanitiz # Avoid collisions count = 1 new_column_name = candidate_column_name - while existing_names.include?(new_column_name) || reserved_words.include?(new_column_name.upcase) + while existing_names.include?(new_column_name) || reserved_words.include?(new_column_name.downcase) suffix = "_#{count}" new_column_name = candidate_column_name[0..PG_IDENTIFIER_MAX_LENGTH-suffix.length] + suffix count += 1 From 86a9ed2b047e7a786a935a1569166925ba2ea566 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 13:30:23 +0100 Subject: [PATCH 12/33] Tests for legacy sanitization --- services/importer/spec/unit/column_spec.rb | 60 ++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/services/importer/spec/unit/column_spec.rb b/services/importer/spec/unit/column_spec.rb index 4bcefb105cc9..9090604fdb29 100644 --- a/services/importer/spec/unit/column_spec.rb +++ b/services/importer/spec/unit/column_spec.rb @@ -345,5 +345,65 @@ def random_kml_multi_record ogc_fid: 1 } end #random_kml_multi_record + + + LEGACY_SANITIZATION_EXAMPLES = { + "abc" => "abc", + "abc xyz" => "abc_xyz", + "2abc" => "column_2abc", + "Abc" => "_bc", + "\u0432\u044b\u0445\u043b\u043e\u043f\u044b \u0430\u0432\u0442\u043e\u0442\u0440\u0430\u043d\u0441\u043f\u043e\u0440\u0442\u04302" => "_2", + "\u043d\u0435\u0443\u0441\u0442\u0430\u043d\u043e\u0432\u043b\u0435\u043d\u043d\u044b\u0439 \u0438\u0441\u0442\u043e\u0447\u043d\u0438\u043a2" => "_2", + "\u0432\u044b\u0431\u0440\u043e\u0441\u044b \u043f\u0440\u0435\u0434\u043f\u0440\u0438\u044f\u0442\u0438\u04392" => "_2", + "\u013ar" => "_r", + "CONVERT(BlueNumber USING utf8)" => '_lue_umber_utf8_', + "is growing site fenced?" => "is_growing_site_fenced_", + "if it\u2019s a community garden, is it collective or allotment?" => "if_it_s_a_community_garden_is_it_collective_or_allotment_", + "Paddock" => "_addock", + "Date Due" => "_ate_ue", + "__5" => "_5", + "__1" => "_1", + "tel\u00e9fono" => "tel_fono", + ":@computed_region_wvic_k925" => "_computed_region_wvic_k925", + "\u0420\u0435\u0433\u0438\u043e\u043d" => "_", + "\u043d\u0435\u0443\u0441\u0442\u0430\u043d\u043e\u0432\u043b\u0435\u043d\u043d\u044b\u0439 \u0438\u0441\u0442\u043e\u0447\u043d\u0438\u043a" => "_", + "> min" => "_min", + "12_ schedule of visits.0" => "column_12_schedule_of_visits_0", + "previous rent (\u00a3 per sq ft)" => "previous_rent_per_sq_ft_", + "description/\u540d\u7a31" => "description_", + "description/\u5730\u5740" => "description_", + "@relations" => "_relations", + "EntityName" => "_ntity_ame", + "_ injured" => "_injured", + "trips 11 _ 15 miles" => "trips_11_15_miles" + } + + LEGACY_SANITIZATION_COLS = { + ['выбросы предприятий2', 'выхлопы автотранспорта2', 'неустановленный источник2'] => ['_2', '_2_1', '_2_2'], + ["description/\u540d\u7a31", "description/\u5730\u5740"] => ["description_", "description__1"] + } + + describe '.get_valid_column_name' do + + it 'can apply legacy sanitization to single columns' do + LEGACY_SANITIZATION_EXAMPLES.each do |input_name, output_name| + name = Column::get_valid_column_name('xxx', input_name, Column::INITIAL_COLUMN_SANITIZATION_VERSION, []) + name.should eq output_name + end + end + + it 'can apply legacy sanitization to multiple columns' do + LEGACY_SANITIZATION_COLS.each do |input_columns, output_columns| + columns = [] + input_columns.zip(output_columns).each do |input_column, output_column| + column = Column::get_valid_column_name('xxx', input_column, Column::INITIAL_COLUMN_SANITIZATION_VERSION, columns) + columns << column + column.should eq output_column + end + end + end + + end # .get_valid_column_name + end # Column From 4b7be3001c601608160282d62061db84b2c8719d Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 17:39:28 +0100 Subject: [PATCH 13/33] Remove unnecessary parameter --- app/models/table.rb | 4 ++-- services/importer/lib/importer/column.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/models/table.rb b/app/models/table.rb index aad2a0f2a173..a2fbc1f2e612 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -1339,7 +1339,7 @@ def self.ensure_column_has_valid_name(table_name, column_name, column_sanitizati connection = options.fetch(:connection) database_schema = options.fetch(:database_schema, 'public') - valid_column_name = get_valid_column_name(table_name, column_name, column_sanitization_version, options) + valid_column_name = get_valid_column_name(column_name, column_sanitization_version, options) if valid_column_name != column_name connection.run(%Q{ALTER TABLE "#{database_schema}"."#{table_name}" RENAME COLUMN "#{column_name}" TO "#{valid_column_name}";}) end @@ -1348,7 +1348,7 @@ def self.ensure_column_has_valid_name(table_name, column_name, column_sanitizati end def self.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, options={}) - CartoDB::Importer2::Column.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, get_column_names(table_name, options)) + CartoDB::Importer2::Column.get_valid_column_name(candidate_column_name, column_sanitization_version, get_column_names(table_name, options)) end def self.get_column_names(table_name, options={}) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index 3f3c98fb6967..3bd8f0f6eb9e 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -278,7 +278,7 @@ def self.sanitize_name(column_name) "_#{name}" end - def self.get_valid_column_name(table_name, candidate_column_name, column_sanitization_version, column_names) + def self.get_valid_column_name(candidate_column_name, column_sanitization_version, column_names) return candidate_column_name if column_sanitization_version == NO_COLUMN_SANITIZATION_VERSION From c24ea38ee2df581df2f917e10afd51401212341f Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 17:39:46 +0100 Subject: [PATCH 14/33] More sanitization tests --- services/importer/spec/unit/column_spec.rb | 76 +++++++++++++++++++++- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/services/importer/spec/unit/column_spec.rb b/services/importer/spec/unit/column_spec.rb index 9090604fdb29..8c314cd87bf8 100644 --- a/services/importer/spec/unit/column_spec.rb +++ b/services/importer/spec/unit/column_spec.rb @@ -375,7 +375,11 @@ def random_kml_multi_record "@relations" => "_relations", "EntityName" => "_ntity_ame", "_ injured" => "_injured", - "trips 11 _ 15 miles" => "trips_11_15_miles" + "trips 11 _ 15 miles" => "trips_11_15_miles", + "as" => "as", + "any" => "any", + "xmin" => "xmin", + "action" => "action", } LEGACY_SANITIZATION_COLS = { @@ -383,11 +387,52 @@ def random_kml_multi_record ["description/\u540d\u7a31", "description/\u5730\u5740"] => ["description_", "description__1"] } + VERSION_2_SANITIZATION_EXAMPLES = { + "abc" => "abc", + "abc xyz" => "abc_xyz", + "2abc" => "_2abc", + "Abc" => "abc", + "выхлопы автотранспорта2" => "vyxlopy_vtotr_nsport_2", + "неустановленный источник2" => "neust_novlennyj_istochnik2", + "выбросы предприятий2" => "vybrosy_predpriyatij2", + "ĺr" => "lr", + "CONVERT(BlueNumber USING utf8)" => "convert_bluenumber_using_utf8", + "is growing site fenced?" => "is_growing_site_fenced", + "if it’s a community garden, is it collective or allotment?" => "if_it_s_a_community_garden_is_it_collective_or_allotment", + "Paddock" => "paddock", + "Date Due" => "date_due", + "__5" => "__5", + "__1" => "__1", + "teléfono" => "telefono", + ":@computed_region_wvic_k925" => "computed_region_wvic_k925", + "Регион" => "region", + "неустановленный источник" => "neust_novlennyj_istochnik", + "> min" => "min", + "12_ schedule of visits.0" => "_12__schedule_of_visits_0", + "previous rent (£ per sq ft)" => "previous_rent_per_sq_ft", + "description/名稱" => "description", + "description/地址" => "description", + "@relations" => "relations", + "EntityName" => "entityname", + "_ injured" => "__injured", + "trips 11 _ 15 miles" => "trips_11___15_miles", + "as" => "_as", + "any" => "_any", + "xmin" => "_xmin", + "action" => "_action", + } + + VERSION_2_SANITIZATION_COLS = { + ['выбросы предприятий2', 'выхлопы автотранспорта2', 'неустановленный источник2'] => ['vybrosy_predpriyatij2', 'vyxlopy_vtotr_nsport_2', 'neust_novlennyj_istochnik2'], + ["description/\u540d\u7a31", "description/\u5730\u5740"] => ["description", "description_1"], + ["abc", "Abc", "aBc", "ABC"] => ["abc", "abc_1", "abc_2", "abc_3"] + } + describe '.get_valid_column_name' do it 'can apply legacy sanitization to single columns' do LEGACY_SANITIZATION_EXAMPLES.each do |input_name, output_name| - name = Column::get_valid_column_name('xxx', input_name, Column::INITIAL_COLUMN_SANITIZATION_VERSION, []) + name = Column::get_valid_column_name(input_name, Column::INITIAL_COLUMN_SANITIZATION_VERSION, []) name.should eq output_name end end @@ -396,13 +441,38 @@ def random_kml_multi_record LEGACY_SANITIZATION_COLS.each do |input_columns, output_columns| columns = [] input_columns.zip(output_columns).each do |input_column, output_column| - column = Column::get_valid_column_name('xxx', input_column, Column::INITIAL_COLUMN_SANITIZATION_VERSION, columns) + column = Column::get_valid_column_name(input_column, Column::INITIAL_COLUMN_SANITIZATION_VERSION, columns) columns << column column.should eq output_column end end end + it 'can apply sanitization v2 to single columns' do + VERSION_2_SANITIZATION_EXAMPLES.each do |input_name, output_name| + name = Column::get_valid_column_name(input_name, 2, []) + name.should eq output_name + end + end + + it 'v2 sanitization is idempotent' do + VERSION_2_SANITIZATION_EXAMPLES.each_key do |input_name| + first_name = Column::get_valid_column_name(input_name, 2, []) + second_name = Column::get_valid_column_name(first_name, 2, []) + second_name.should eq first_name + end + end + + it 'can apply v2 sanitization to multiple columns' do + VERSION_2_SANITIZATION_COLS.each do |input_columns, output_columns| + columns = [] + input_columns.zip(output_columns).each do |input_column, output_column| + column = Column::get_valid_column_name(input_column, 2, columns) + columns << column + column.should eq output_column + end + end + end end # .get_valid_column_name end # Column From b0a65b4ca71247e7d037e0f93ff311828238f610 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 17:40:04 +0100 Subject: [PATCH 15/33] Implement v2 sanitization --- services/importer/lib/importer/column.rb | 38 ++++++++++--------- .../importer/lib/importer/string_sanitizer.rb | 10 ++--- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index 3bd8f0f6eb9e..fbf731cf8fb5 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -273,7 +273,7 @@ def self.reserved_or_unsupported?(name) end def self.sanitize_name(column_name) - name = StringSanitizer.new.sanitize(column_name.to_s, cyrillic: false) + name = StringSanitizer.new.sanitize(column_name.to_s, transliterate_cyrillic: true) return name unless reserved_or_unsupported?(name) "_#{name}" end @@ -282,11 +282,12 @@ def self.get_valid_column_name(candidate_column_name, column_sanitization_versio return candidate_column_name if column_sanitization_version == NO_COLUMN_SANITIZATION_VERSION + existing_names = column_names - [candidate_column_name] + if column_sanitization_version == INITIAL_COLUMN_SANITIZATION_VERSION # NOTE: originally not all uses of this sanitization version applied reserved words - reserved_words = RESERVED_COLUMN_NAMES - - existing_names = column_names - [candidate_column_name] + # reserved_words = RESERVED_COLUMN_NAMES + reserved_words = [] candidate_column_name = 'untitled_column' if candidate_column_name.blank? candidate_column_name = candidate_column_name.to_s.squish @@ -297,21 +298,11 @@ def self.get_valid_column_name(candidate_column_name, column_sanitization_versio # Valid names start with a letter or an underscore candidate_column_name = "column_#{candidate_column_name}" unless candidate_column_name[/^[a-z_]{1}/] - # Avoid collisions - count = 1 - new_column_name = candidate_column_name - while existing_names.include?(new_column_name) || reserved_words.include?(new_column_name.downcase) - suffix = "_#{count}" - new_column_name = candidate_column_name[0..PG_IDENTIFIER_MAX_LENGTH-suffix.length] + suffix - count += 1 - end - - new_column_name + avoid_collisions(candidate_column_name, existing_names, reserved_words) elsif column_sanitization_version == 2 - - sanitize_name(candidate_column_name) - # TODO: Avoid collisions & apply PG_IDENTIFIER_MAX_LENGTH limit?; - + new_column_name = sanitize_name(candidate_column_name) + new_column_name = [0, PG_IDENTIFIER_MAX_LENGTH] if new_column_name.size > PG_IDENTIFIER_MAX_LENGTH + avoid_collisions(new_column_name, existing_names, RESERVED_COLUMN_NAMES) else raise "Invalid column sanitization version #{column_sanitization_version.inspect}" end @@ -319,6 +310,17 @@ def self.get_valid_column_name(candidate_column_name, column_sanitization_versio private + def self.avoid_collisions(name, existing_names, reserved_words, max_length=PG_IDENTIFIER_MAX_LENGTH) + count = 1 + new_name = name + while existing_names.include?(new_name) || reserved_words.include?(new_name.downcase) + suffix = "_#{count}" + new_name = name[0..max_length-suffix.length] + suffix + count += 1 + end + new_name + end + attr_reader :job, :db, :table_name, :column_name, :schema def qualified_table_name diff --git a/services/importer/lib/importer/string_sanitizer.rb b/services/importer/lib/importer/string_sanitizer.rb index 2730eb82ac60..feb8c597d032 100644 --- a/services/importer/lib/importer/string_sanitizer.rb +++ b/services/importer/lib/importer/string_sanitizer.rb @@ -1,7 +1,7 @@ module CartoDB module Importer2 class StringSanitizer - def normalize(string, cyrillic: false) + def normalize(string, transliterate_cyrillic: false) return '' if string.nil? || string.empty? n = string.force_encoding("UTF-8") @@ -51,7 +51,7 @@ def normalize(string, cyrillic: false) n.gsub!(/Ŵ/, 'W') n.gsub!(/[ÝŸŶ]/, 'Y') n.gsub!(/[ŽŻŹ]/, 'Z') - if cyrillic + if transliterate_cyrillic n.gsub!(/Б/, 'B') n.gsub!(/б/, 'b') n.gsub!(/В/, 'V') @@ -122,7 +122,7 @@ def normalize(string, cyrillic: false) def legacy_sanitize(string) return '' if string.nil? || string.empty? - normalize(string.downcase.gsub(/<[^>]+>/m,''), cyrillic: false) + normalize(string.downcase.gsub(/<[^>]+>/m,''), transliterate_cyrillic: false) .gsub(/&.+?;/,'-') .gsub(/[^a-z0-9 _-]/,'-').strip .gsub(/\s+/,'-') @@ -132,9 +132,9 @@ def legacy_sanitize(string) .gsub(/-/,'_') end - def sanitize(string, cyrillic: false) + def sanitize(string, transliterate_cyrillic: false) return '' if string.nil? || string.empty? - normalize(string.gsub(/<[^>]+>/m,''), cyrillic: cyrillic) + normalize(string.gsub(/<[^>]+>/m,''), transliterate_cyrillic: transliterate_cyrillic) .downcase .gsub(/&.+?;/,'-') .gsub(/[^a-z0-9 _-]/,'-').strip From 9e2dc6739a3b72147fd73ae0180865a3326aec8c Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 18:28:14 +0100 Subject: [PATCH 16/33] Rename cyrillic parameter --- lib/tasks/db_maintenance.rake | 2 +- services/importer/lib/importer/column.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/tasks/db_maintenance.rake b/lib/tasks/db_maintenance.rake index 305f3cc37340..537a8eb5199f 100644 --- a/lib/tasks/db_maintenance.rake +++ b/lib/tasks/db_maintenance.rake @@ -1026,7 +1026,7 @@ namespace :cartodb do tables = JSON.parse(File.read(args['table_definition_json_path'].to_s)) u.in_database({as: :superuser, no_cartodb_in_schema: true}) do |db| db.transaction do - name = CartoDB::Importer2::StringSanitizer.sanitize(args[:oracle_url], cyrillic: true) + name = CartoDB::Importer2::StringSanitizer.sanitize(args[:oracle_url], transliterate_cyrillic: true) server_name = "oracle_#{name}_#{Time.now.to_i}" db.run('CREATE EXTENSION oracle_fdw') unless db.fetch(%Q{ SELECT count(*) FROM pg_extension WHERE extname='oracle_fdw' diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index fbf731cf8fb5..ea214ec2bb88 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -273,7 +273,7 @@ def self.reserved_or_unsupported?(name) end def self.sanitize_name(column_name) - name = StringSanitizer.new.sanitize(column_name.to_s, transliterate_cyrillic: true) + name = StringSanitizer.sanitize(column_name.to_s, transliterate_cyrillic: true) return name unless reserved_or_unsupported?(name) "_#{name}" end From 777f7551b8ec8833edc46fad5e2468fa2d556a2b Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 18:29:00 +0100 Subject: [PATCH 17/33] Add rejected columns to preserve behaviour when adding columns --- app/models/table.rb | 2 +- lib/carto/db/sanitize.rb | 4 +++- services/importer/lib/importer/column.rb | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/app/models/table.rb b/app/models/table.rb index a2fbc1f2e612..3906a771f5c3 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -670,7 +670,7 @@ def make_sequel_compatible(attributes) end def add_column!(options) - raise CartoDB::InvalidColumnName if CartoDB::Importer2::Column.reserved_or_unsupported?(options[:name]) + raise CartoDB::InvalidColumnName if CartoDB::Importer2::Column.rejected?(options[:name]) type = options[:type].convert_to_db_type cartodb_type = options[:type].convert_to_cartodb_type # FIXME: consider CartoDB::Importer2::Column.get_valid_column_name with CURRENT_COLUMN_SANITIZATION_VERSION diff --git a/lib/carto/db/sanitize.rb b/lib/carto/db/sanitize.rb index 3f6b93a4af2a..ed24646b8175 100644 --- a/lib/carto/db/sanitize.rb +++ b/lib/carto/db/sanitize.rb @@ -28,7 +28,9 @@ module Sanitize some symmetric table tablesample then to trailing true union unique user using variadic verbose when where window with).freeze - ADDITIONAL_RESERVED_COLUMNS = %w(ogc_fid).freeze + ADDITIONAL_RESERVED_COLUMNS = %w(oid ogc_fid).freeze + + REJECTED_COLUMN_NAMES = (SYSTEM_COLUMN_NAMES + ADDITIONAL_RESERVED_COLUMNS).freeze # FIXME we have been reserving these name in columns but I don't know the reason ¯\_(ツ)_/¯ ADDITIONAL_WORDS = %w(between new off old format controller action).freeze diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index ea214ec2bb88..88b6542e5cda 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -21,6 +21,8 @@ class Column RESERVED_COLUMN_NAMES = Carto::DB::Sanitize::RESERVED_COLUMN_NAMES PG_IDENTIFIER_MAX_LENGTH = Carto::DB::Sanitize::MAX_IDENTIFIER_LENGTH + REJECTED_COLUMN_NAMES = Carto::DB::Sanitize::REJECTED_COLUMN_NAMES + NO_COLUMN_SANITIZATION_VERSION = 0 INITIAL_COLUMN_SANITIZATION_VERSION = 1 CURRENT_COLUMN_SANITIZATION_VERSION = 2 @@ -278,6 +280,10 @@ def self.sanitize_name(column_name) "_#{name}" end + def self.rejected?(name) + REJECTED_COLUMN_NAMES.include?(name) || name =~ /^[0-9]/ + end + def self.get_valid_column_name(candidate_column_name, column_sanitization_version, column_names) return candidate_column_name if column_sanitization_version == NO_COLUMN_SANITIZATION_VERSION From 03bea3473d63952a7c9f2a904bc097c64c1a552b Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 18:29:22 +0100 Subject: [PATCH 18/33] Make StringSanitizer methods module functions --- services/importer/lib/importer/string_sanitizer.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/importer/lib/importer/string_sanitizer.rb b/services/importer/lib/importer/string_sanitizer.rb index feb8c597d032..50165fb7d0cc 100644 --- a/services/importer/lib/importer/string_sanitizer.rb +++ b/services/importer/lib/importer/string_sanitizer.rb @@ -1,6 +1,8 @@ module CartoDB module Importer2 - class StringSanitizer + module StringSanitizer + module_function + def normalize(string, transliterate_cyrillic: false) return '' if string.nil? || string.empty? From 0527f3104473dc09c406b934819bbf8a54e23847 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 19:13:57 +0100 Subject: [PATCH 19/33] Fix bug --- app/models/table.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/table.rb b/app/models/table.rb index 3906a771f5c3..27bc8ef159cb 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -1339,7 +1339,7 @@ def self.ensure_column_has_valid_name(table_name, column_name, column_sanitizati connection = options.fetch(:connection) database_schema = options.fetch(:database_schema, 'public') - valid_column_name = get_valid_column_name(column_name, column_sanitization_version, options) + valid_column_name = get_valid_column_name(table_name, column_name, column_sanitization_version, options) if valid_column_name != column_name connection.run(%Q{ALTER TABLE "#{database_schema}"."#{table_name}" RENAME COLUMN "#{column_name}" TO "#{valid_column_name}";}) end From b2e30b5f2e8bb6018cf268e8f7ea460fab5c7135 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 19:15:19 +0100 Subject: [PATCH 20/33] Fix column modification --- app/models/table.rb | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/app/models/table.rb b/app/models/table.rb index 27bc8ef159cb..ec0c2f83e803 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -695,16 +695,19 @@ def drop_column!(options) end def modify_column!(options) + old_name = options.fetch(:name, '').to_s + new_name = options.fetch(:new_name, '').to_s + # FIXME: consider CartoDB::Importer2::Column.get_valid_column_name with CURRENT_COLUMN_SANITIZATION_VERSION - old_name = CartoDB::Importer2::Column.sanitize_name(options.fetch(:name, '').to_s) - new_name = CartoDB::Importer2::Column.sanitize_name(options.fetch(:new_name, '').to_s) - raise 'This column cannot be modified' if CARTODB_COLUMNS.include?(old_name.to_s) + old_sanitized_name = CartoDB::Importer2::Column.sanitize_name(old_name) + raise 'This column cannot be modified' if CARTODB_COLUMNS.include?(old_sanitized_name.to_s) if new_name.present? && new_name != old_name - rename_column(old_name, new_name) + new_sanitized_name = CartoDB::Importer2::Column.sanitize_name(new_name) + rename_column(old_sanitized_name, new_sanitized_name) end - column_name = (new_name.present? ? new_name : old_name) + column_name = (new_name.present? ? new_sanitized_name : old_sanitized_name) convert_column_datatype(owner.in_database, name, column_name, options[:type]) column_type = column_type_for(column_name) @@ -1316,6 +1319,7 @@ def column_sanitization_version end def self.sanitize_columns(table_name, column_sanitization_version, options={}) + connection = options.fetch(:connection) database_schema = options.fetch(:database_schema, 'public') From cc788b3534bbd9bb084bb345b34c0f819d56efa3 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 19:15:47 +0100 Subject: [PATCH 21/33] Preserve sanitization behaviour for multiple underscores --- services/importer/lib/importer/column.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index 88b6542e5cda..2a4a3a9c31f9 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -306,7 +306,7 @@ def self.get_valid_column_name(candidate_column_name, column_sanitization_versio avoid_collisions(candidate_column_name, existing_names, reserved_words) elsif column_sanitization_version == 2 - new_column_name = sanitize_name(candidate_column_name) + new_column_name = sanitize_name(candidate_column_name).gsub(/_{2,}/, '_') new_column_name = [0, PG_IDENTIFIER_MAX_LENGTH] if new_column_name.size > PG_IDENTIFIER_MAX_LENGTH avoid_collisions(new_column_name, existing_names, RESERVED_COLUMN_NAMES) else From c553e4320d37487fd446bbb2d948903b78c2d47e Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 19:35:59 +0100 Subject: [PATCH 22/33] Adapt tests to new multiple underscore behaviour --- services/importer/spec/unit/column_spec.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/services/importer/spec/unit/column_spec.rb b/services/importer/spec/unit/column_spec.rb index 8c314cd87bf8..1aeb94ad8174 100644 --- a/services/importer/spec/unit/column_spec.rb +++ b/services/importer/spec/unit/column_spec.rb @@ -401,21 +401,21 @@ def random_kml_multi_record "if it’s a community garden, is it collective or allotment?" => "if_it_s_a_community_garden_is_it_collective_or_allotment", "Paddock" => "paddock", "Date Due" => "date_due", - "__5" => "__5", - "__1" => "__1", + "__5" => "_5", + "__1" => "_1", "teléfono" => "telefono", ":@computed_region_wvic_k925" => "computed_region_wvic_k925", "Регион" => "region", "неустановленный источник" => "neust_novlennyj_istochnik", "> min" => "min", - "12_ schedule of visits.0" => "_12__schedule_of_visits_0", + "12_ schedule of visits.0" => "_12_schedule_of_visits_0", "previous rent (£ per sq ft)" => "previous_rent_per_sq_ft", "description/名稱" => "description", "description/地址" => "description", "@relations" => "relations", "EntityName" => "entityname", - "_ injured" => "__injured", - "trips 11 _ 15 miles" => "trips_11___15_miles", + "_ injured" => "_injured", + "trips 11 _ 15 miles" => "trips_11_15_miles", "as" => "_as", "any" => "_any", "xmin" => "_xmin", From bcf74c973e4d6e1dedb3ccea6a398d198bc9d328 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 19:36:12 +0100 Subject: [PATCH 23/33] Neew sanitization tests --- services/importer/spec/unit/column_spec.rb | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/services/importer/spec/unit/column_spec.rb b/services/importer/spec/unit/column_spec.rb index 1aeb94ad8174..fc396ab6ff43 100644 --- a/services/importer/spec/unit/column_spec.rb +++ b/services/importer/spec/unit/column_spec.rb @@ -425,7 +425,9 @@ def random_kml_multi_record VERSION_2_SANITIZATION_COLS = { ['выбросы предприятий2', 'выхлопы автотранспорта2', 'неустановленный источник2'] => ['vybrosy_predpriyatij2', 'vyxlopy_vtotr_nsport_2', 'neust_novlennyj_istochnik2'], ["description/\u540d\u7a31", "description/\u5730\u5740"] => ["description", "description_1"], - ["abc", "Abc", "aBc", "ABC"] => ["abc", "abc_1", "abc_2", "abc_3"] + ["abc", "Abc", "aBc", "ABC"] => ["abc", "abc_1", "abc_2", "abc_3"], + ["_", "_", "_", "_"] => ["_", "_1", "_2", "_3"] + } describe '.get_valid_column_name' do @@ -473,6 +475,22 @@ def random_kml_multi_record end end end + + it 'multiple column sanitization is idempotent' do + VERSION_2_SANITIZATION_COLS.each_key do |input_columns| + columns1 = [] + input_columns.each do |input_column| + column1 = Column::get_valid_column_name(input_column, 2, columns1) + columns1 << column1 + end + columns2 = [] + columns1.each do |input_column| + column2 = Column::get_valid_column_name(input_column, 2, columns2) + columns2 << column2 + column2.should eq input_column + end + end + end end # .get_valid_column_name end # Column From 952b0bf34746de663fd2d60defafd32c35b70fa9 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 19:36:28 +0100 Subject: [PATCH 24/33] Bugfixing --- spec/models/table_spec.rb | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spec/models/table_spec.rb b/spec/models/table_spec.rb index 9a34aabb041a..1a69483ce2fc 100644 --- a/spec/models/table_spec.rb +++ b/spec/models/table_spec.rb @@ -2219,23 +2219,28 @@ def check_query_geometry(query, schema) describe 'self.get_valid_column_name' do it 'returns the same candidate name if it is ok' do Table.expects(:get_column_names).once.returns(%w{a b c}) - Table.get_valid_column_name('dummy_table_name', 'a').should == 'a' + version = CartoDB::Importer2::Column::CURRENT_COLUMN_SANITIZATION_VERSION + Table.get_valid_column_name('dummy_table_name', 'a', version).should == 'a' end it 'returns an alternative name if using a reserved word' do Table.expects(:get_column_names).once.returns(%w{column b c}) + version = CartoDB::Importer2::Column::CURRENT_COLUMN_SANITIZATION_VERSION Table.get_valid_column_name( 'dummy_table_name', 'column', - reserved_words: %w{ INSERT SELECT COLUMN }).should == 'column_1' + version + ).should == 'column_1' end it 'avoids collisions when a renamed column already exists' do Table.expects(:get_column_names).once.returns(%w{column_1 column c}) + version = CartoDB::Importer2::Column::CURRENT_COLUMN_SANITIZATION_VERSION Table.get_valid_column_name( 'dummy_table_name', 'column', - reserved_words: %w{ INSERT SELECT COLUMN }).should == 'column_2' + version + ).should == 'column_2' end end From 4183bbf53cc7b3f835bcc1c42761ba3e0bf0e74e Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 21:56:07 +0100 Subject: [PATCH 25/33] Remove test case --- services/importer/spec/unit/column_spec.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/importer/spec/unit/column_spec.rb b/services/importer/spec/unit/column_spec.rb index fc396ab6ff43..b105a93e7fd1 100644 --- a/services/importer/spec/unit/column_spec.rb +++ b/services/importer/spec/unit/column_spec.rb @@ -425,9 +425,7 @@ def random_kml_multi_record VERSION_2_SANITIZATION_COLS = { ['выбросы предприятий2', 'выхлопы автотранспорта2', 'неустановленный источник2'] => ['vybrosy_predpriyatij2', 'vyxlopy_vtotr_nsport_2', 'neust_novlennyj_istochnik2'], ["description/\u540d\u7a31", "description/\u5730\u5740"] => ["description", "description_1"], - ["abc", "Abc", "aBc", "ABC"] => ["abc", "abc_1", "abc_2", "abc_3"], - ["_", "_", "_", "_"] => ["_", "_1", "_2", "_3"] - + ["abc", "Abc", "aBc", "ABC"] => ["abc", "abc_1", "abc_2", "abc_3"] } describe '.get_valid_column_name' do @@ -470,7 +468,9 @@ def random_kml_multi_record columns = [] input_columns.zip(output_columns).each do |input_column, output_column| column = Column::get_valid_column_name(input_column, 2, columns) + puts "--- ADDING COL #{column}" columns << column + puts " >> #{columns.inspect}" column.should eq output_column end end From 6b481f8dcdfb8a7efbcbf74c8b9f4f3a42703c00 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 21:56:38 +0100 Subject: [PATCH 26/33] Test using ActiveRecord transliteration --- services/importer/lib/importer/column.rb | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index 2a4a3a9c31f9..da0519e354fd 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -285,7 +285,6 @@ def self.rejected?(name) end def self.get_valid_column_name(candidate_column_name, column_sanitization_version, column_names) - return candidate_column_name if column_sanitization_version == NO_COLUMN_SANITIZATION_VERSION existing_names = column_names - [candidate_column_name] @@ -309,6 +308,19 @@ def self.get_valid_column_name(candidate_column_name, column_sanitization_versio new_column_name = sanitize_name(candidate_column_name).gsub(/_{2,}/, '_') new_column_name = [0, PG_IDENTIFIER_MAX_LENGTH] if new_column_name.size > PG_IDENTIFIER_MAX_LENGTH avoid_collisions(new_column_name, existing_names, RESERVED_COLUMN_NAMES) + elsif column_sanitization_version == 3 + # experimental sanitization + # this can be configured using the locale files for the current (I18n.locale) locale; + # for example, for I18n.locale == :en we could add this to config/locales/en.yml + # en: + # i18n: + # transliterate: + # rule: + # Ж: Zh + # ж: zh + new_column_name = candidate_column_name.parameterize.tr('-','_') + new_column_name = [0, PG_IDENTIFIER_MAX_LENGTH] if new_column_name.size > PG_IDENTIFIER_MAX_LENGTH + avoid_collisions(new_column_name, existing_names, RESERVED_COLUMN_NAMES) else raise "Invalid column sanitization version #{column_sanitization_version.inspect}" end From e1b2e154de6b1e8b3a22dc05751fdcadc986d95b Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 23:33:24 +0100 Subject: [PATCH 27/33] Adjust tests to new sanitization --- spec/models/table_spec.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spec/models/table_spec.rb b/spec/models/table_spec.rb index 1a69483ce2fc..9ac1576a7aa9 100644 --- a/spec/models/table_spec.rb +++ b/spec/models/table_spec.rb @@ -2230,17 +2230,17 @@ def check_query_geometry(query, schema) 'dummy_table_name', 'column', version - ).should == 'column_1' + ).should == '_column' end it 'avoids collisions when a renamed column already exists' do - Table.expects(:get_column_names).once.returns(%w{column_1 column c}) + Table.expects(:get_column_names).once.returns(%w{_column column c}) version = CartoDB::Importer2::Column::CURRENT_COLUMN_SANITIZATION_VERSION Table.get_valid_column_name( 'dummy_table_name', 'column', version - ).should == 'column_2' + ).should == '_column_1' end end From 1ff4270612bf58787b35e1556eda56e40768dcb8 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 19 Dec 2019 23:34:54 +0100 Subject: [PATCH 28/33] Fix reserved column names Note that ogc_fid shouldn't be altered since it is handled by Table#import_cleanup --- lib/carto/db/sanitize.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/carto/db/sanitize.rb b/lib/carto/db/sanitize.rb index ed24646b8175..c5c75d1c5525 100644 --- a/lib/carto/db/sanitize.rb +++ b/lib/carto/db/sanitize.rb @@ -35,7 +35,7 @@ module Sanitize # FIXME we have been reserving these name in columns but I don't know the reason ¯\_(ツ)_/¯ ADDITIONAL_WORDS = %w(between new off old format controller action).freeze - RESERVED_COLUMN_NAMES = SYSTEM_COLUMN_NAMES + RESERVED_WORDS + ADDITIONAL_RESERVED_COLUMNS + ADDITIONAL_WORDS + RESERVED_COLUMN_NAMES = SYSTEM_COLUMN_NAMES + RESERVED_WORDS + ADDITIONAL_WORDS def self.append_with_truncate_and_sanitize(identifier, suffix) From d537a80ca0ca23c70ab4f86af96e62ac3e0c2014 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Fri, 20 Dec 2019 16:16:45 +0100 Subject: [PATCH 29/33] Allow asigning user_id in Table constructor Synchronization::Member#table expects user_id to be assignable in the Table constructor (and now also Synchronization::Adapter#import_cleanup expects it) --- app/models/table.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/app/models/table.rb b/app/models/table.rb index ec0c2f83e803..ae52c90a00cc 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -53,6 +53,7 @@ def initialize(args = {}) else @user_table = args[:user_table] end + self.user_id = args[:user_id] if args[:user_id].present? # TODO: this probably makes sense only if user_table is not passed as argument @user_table.set_service(self) end From 4eb840acf3bd51a0261f6e3ebbf80328a7f4ce46 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Fri, 20 Dec 2019 16:20:19 +0100 Subject: [PATCH 30/33] Fix access to Table data_import from column_sanitization_version Sanitization of colum names was being accidentally ommited in Synchronization::Adapter#import_cleanup This wasn't actually too bad because the sanitization was peformed after cartodbfication by Synchronization::Adapter#setup_table. The cause of this was that ::Table relies on its @data_import for the sanitization version, and since it hadn't been assigned in the Table instance used, it used the version 0 which means "no sanitization". This also had another side effect: we have been using double sanitization in syncs (legacy version 1) possibly altering column names (double underscores caused by anti-collision suffixes). So this commit restores that behaviour for v1 syncs (although a less accidental way of achieving the effect would be desiderable, such as preventing double sanitization and altering v1 sanitization to remove double underscores after collision prevention). --- app/models/synchronization/adapter.rb | 6 ++++++ app/models/table.rb | 21 ++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/app/models/synchronization/adapter.rb b/app/models/synchronization/adapter.rb index 47f6dd3cc673..8bb920f25f72 100644 --- a/app/models/synchronization/adapter.rb +++ b/app/models/synchronization/adapter.rb @@ -277,7 +277,13 @@ def import_cleanup(schema_name, table_name) # For consistency with regular imports, also eases testing # The sanitization of @table_name is applied to the newly imported table_name + # This should not be necessary, since setup_table, called after cartodbfication + # also perfomes column sanitization via Table#import_to_cartodb + # but is kept because there may be existing syncs for which this double sanitization + # (version 1 sanitization which wasn't idemponent) had the effect of altering some + # satinizated names (e.g. __1 -> _1). table = ::Table.new(name: @table_name, user_id: @user.id) + # we could as well: table = Carto::UserTable.find(@user.tables.where(name: @table_name).first.id).service table.sanitize_columns(table_name: table_name, database_schema: schema_name, connection: user_database) # When tables are created using ogr2ogr they are added a ogc_fid or gid primary key diff --git a/app/models/table.rb b/app/models/table.rb index ae52c90a00cc..b03609028a3b 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -209,14 +209,17 @@ def self.table_and_schema(table_name) end end + def data_import + @data_import ||= DataImport.where(id: @user_table.data_import_id).first || DataImport.new(user_id: owner.id) + end + ## Callbacks def import_to_cartodb(uniname = nil) - @data_import ||= DataImport.where(id: @user_table.data_import_id).first || DataImport.new(user_id: owner.id) if migrate_existing_table.present? || uniname - @data_import.data_type = DataImport::TYPE_EXTERNAL_TABLE if @data_import.data_type.nil? - @data_import.data_source = migrate_existing_table || uniname - @data_import.save + data_import.data_type = DataImport::TYPE_EXTERNAL_TABLE if data_import.data_type.nil? + data_import.data_source = migrate_existing_table || uniname + data_import.save # ensure unique name, also ensures self.name can override any imported table name uniname = get_valid_name(name ? name : migrate_existing_table) unless uniname @@ -237,12 +240,12 @@ def import_to_cartodb(uniname = nil) :debug => Rails.env.development?, :remaining_quota => owner.remaining_quota, :remaining_tables => owner.remaining_table_quota, - :data_import_id => @data_import.id + :data_import_id => data_import.id ).symbolize_keys importer = CartoDB::Migrator.new(hash_in) imported_name = importer.migrate! - @data_import.reload - @data_import.save + data_import.reload + data_import.save imported_name end end @@ -1312,8 +1315,8 @@ def get_valid_name(contendent) end def column_sanitization_version - if @data_import - @data_import.column_sanitization_version + if data_import + data_import.column_sanitization_version else CartoDB::Importer2::Column::NO_COLUMN_SANITIZATION_VERSION end From 1726064a1fa82b406e5903afad8869d855530b2e Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Mon, 13 Jan 2020 15:46:58 +0100 Subject: [PATCH 31/33] Code simplification --- app/models/table.rb | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/app/models/table.rb b/app/models/table.rb index b03609028a3b..13a54558d64c 100644 --- a/app/models/table.rb +++ b/app/models/table.rb @@ -1315,11 +1315,7 @@ def get_valid_name(contendent) end def column_sanitization_version - if data_import - data_import.column_sanitization_version - else - CartoDB::Importer2::Column::NO_COLUMN_SANITIZATION_VERSION - end + data_import&.column_sanitization_version || CartoDB::Importer2::Column::NO_COLUMN_SANITIZATION_VERSION end def self.sanitize_columns(table_name, column_sanitization_version, options={}) From c0d773e39453fcb5b313b67296bc3c3d41f57b4f Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Thu, 16 Jan 2020 19:01:03 +0100 Subject: [PATCH 32/33] Separate methods per sanitization version --- services/importer/lib/importer/column.rb | 52 +++++++++++++++--------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/services/importer/lib/importer/column.rb b/services/importer/lib/importer/column.rb index da0519e354fd..d0b9cf9c7427 100644 --- a/services/importer/lib/importer/column.rb +++ b/services/importer/lib/importer/column.rb @@ -290,25 +290,42 @@ def self.get_valid_column_name(candidate_column_name, column_sanitization_versio existing_names = column_names - [candidate_column_name] if column_sanitization_version == INITIAL_COLUMN_SANITIZATION_VERSION - # NOTE: originally not all uses of this sanitization version applied reserved words - # reserved_words = RESERVED_COLUMN_NAMES - reserved_words = [] + get_valid_column_name_v1(candidate_column_name, existing_names) + elsif column_sanitization_version == 2 + get_valid_column_name_v2(candidate_column_name, existing_names) + elsif column_sanitization_version == 3 + get_valid_column_name_v3(candidate_column_name, existing_names) + else + raise "Invalid column sanitization version #{column_sanitization_version.inspect}" + end + end - candidate_column_name = 'untitled_column' if candidate_column_name.blank? - candidate_column_name = candidate_column_name.to_s.squish + private - # Subsequent characters can be letters, underscores or digits - candidate_column_name = candidate_column_name.gsub(/[^a-z0-9]/,'_').gsub(/_{2,}/, '_') + def self.get_valid_column_name_v1(candidate_column_name, existing_names) + # NOTE: originally not all uses of this sanitization version applied reserved words + # reserved_words = RESERVED_COLUMN_NAMES + reserved_words = [] - # Valid names start with a letter or an underscore - candidate_column_name = "column_#{candidate_column_name}" unless candidate_column_name[/^[a-z_]{1}/] + candidate_column_name = 'untitled_column' if candidate_column_name.blank? + candidate_column_name = candidate_column_name.to_s.squish - avoid_collisions(candidate_column_name, existing_names, reserved_words) - elsif column_sanitization_version == 2 - new_column_name = sanitize_name(candidate_column_name).gsub(/_{2,}/, '_') - new_column_name = [0, PG_IDENTIFIER_MAX_LENGTH] if new_column_name.size > PG_IDENTIFIER_MAX_LENGTH - avoid_collisions(new_column_name, existing_names, RESERVED_COLUMN_NAMES) - elsif column_sanitization_version == 3 + # Subsequent characters can be letters, underscores or digits + candidate_column_name = candidate_column_name.gsub(/[^a-z0-9]/,'_').gsub(/_{2,}/, '_') + + # Valid names start with a letter or an underscore + candidate_column_name = "column_#{candidate_column_name}" unless candidate_column_name[/^[a-z_]{1}/] + + avoid_collisions(candidate_column_name, existing_names, reserved_words) + end + + def self.get_valid_column_name_v2(candidate_column_name, existing_names) + new_column_name = sanitize_name(candidate_column_name).gsub(/_{2,}/, '_') + new_column_name = [0, PG_IDENTIFIER_MAX_LENGTH] if new_column_name.size > PG_IDENTIFIER_MAX_LENGTH + avoid_collisions(new_column_name, existing_names, RESERVED_COLUMN_NAMES) + end + + def self.get_valid_column_name_v3(candidate_column_name, existing_names) # experimental sanitization # this can be configured using the locale files for the current (I18n.locale) locale; # for example, for I18n.locale == :en we could add this to config/locales/en.yml @@ -321,13 +338,8 @@ def self.get_valid_column_name(candidate_column_name, column_sanitization_versio new_column_name = candidate_column_name.parameterize.tr('-','_') new_column_name = [0, PG_IDENTIFIER_MAX_LENGTH] if new_column_name.size > PG_IDENTIFIER_MAX_LENGTH avoid_collisions(new_column_name, existing_names, RESERVED_COLUMN_NAMES) - else - raise "Invalid column sanitization version #{column_sanitization_version.inspect}" - end end - private - def self.avoid_collisions(name, existing_names, reserved_words, max_length=PG_IDENTIFIER_MAX_LENGTH) count = 1 new_name = name From 8f7c420a0adcaea6ce02008f777db30c1f23419b Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Fri, 17 Jan 2020 09:58:26 +0100 Subject: [PATCH 33/33] Fix NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 1172a6a9743d..a66abe442203 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,7 @@ sudo make install - None yet ### Bug fixes / enhancements +- New versioned sanitization of column names ([#15326](https://github.com/CartoDB/cartodb/issues/15326)) - Change Catalog dropdown placeholders (([#15335](https://github.com/CartoDB/cartodb/issues/15335))) - Fix /embed_map for kuviz ([#15360](https://github.com/CartoDB/cartodb/pull/15360)) - Avoid extra calls when counting number of likes of each visualization ([#15349](https://github.com/CartoDB/cartodb/pull/15349)) @@ -53,7 +54,6 @@ sudo make install * Fixed issue while creating a new user's database: force to alter extension always as template_postgis may have the current version defined and the extension won't be installed ### Bug fixes / enhancements -* New versioned sanitization of column names ([#15326](https://github.com/CartoDB/cartodb/issues/15326)) * Avoid warnings when running test in parallel with an empty environment * Improve concurrent Ghost Tables syncs handling ([#15272](https://github.com/CartoDB/cartodb/pull/15272)) * Fix consent screen in OAuth apps without user ([#15247](https://github.com/CartoDB/cartodb/pull/15247))