diff --git a/aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java b/aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java index 007fb8f07b9c..d7b57156b3c3 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java +++ b/aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java @@ -103,6 +103,16 @@ public class AwsProperties implements Serializable { public static final String GLUE_CATALOG_SKIP_ARCHIVE = "glue.skip-archive"; public static final boolean GLUE_CATALOG_SKIP_ARCHIVE_DEFAULT = false; + /** + * If Glue should skip name validations + * It is recommended to stick to Glue best practice in + * https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html to make sure operations are Hive compatible. + * This is only added for users that have existing conventions using non-standard characters. When database name + * and table name validation are skipped, there is no guarantee that downstream systems would all support the names. + */ + public static final String GLUE_CATALOG_SKIP_NAME_VALIDATION = "glue.skip-name-validation"; + public static final boolean GLUE_CATALOG_SKIP_NAME_VALIDATION_DEFAULT = false; + /** * If set, GlueCatalog will use Lake Formation for access control. * For more credential vending details, see: https://docs.aws.amazon.com/lake-formation/latest/dg/api-overview.html. @@ -375,6 +385,7 @@ public class AwsProperties implements Serializable { private String glueCatalogId; private boolean glueCatalogSkipArchive; + private boolean glueCatalogSkipNameValidation; private boolean glueLakeFormationEnabled; private String dynamoDbTableName; @@ -399,6 +410,7 @@ public AwsProperties() { this.glueCatalogId = null; this.glueCatalogSkipArchive = GLUE_CATALOG_SKIP_ARCHIVE_DEFAULT; + this.glueCatalogSkipNameValidation = GLUE_CATALOG_SKIP_NAME_VALIDATION_DEFAULT; this.glueLakeFormationEnabled = GLUE_LAKEFORMATION_ENABLED_DEFAULT; this.dynamoDbTableName = DYNAMODB_TABLE_NAME_DEFAULT; @@ -417,6 +429,8 @@ public AwsProperties(Map properties) { this.glueCatalogId = properties.get(GLUE_CATALOG_ID); this.glueCatalogSkipArchive = PropertyUtil.propertyAsBoolean(properties, AwsProperties.GLUE_CATALOG_SKIP_ARCHIVE, AwsProperties.GLUE_CATALOG_SKIP_ARCHIVE_DEFAULT); + this.glueCatalogSkipNameValidation = PropertyUtil.propertyAsBoolean(properties, + AwsProperties.GLUE_CATALOG_SKIP_NAME_VALIDATION, AwsProperties.GLUE_CATALOG_SKIP_NAME_VALIDATION_DEFAULT); this.glueLakeFormationEnabled = PropertyUtil.propertyAsBoolean(properties, GLUE_LAKEFORMATION_ENABLED, GLUE_LAKEFORMATION_ENABLED_DEFAULT); @@ -512,11 +526,18 @@ public void setGlueCatalogId(String id) { public boolean glueCatalogSkipArchive() { return glueCatalogSkipArchive; } - public void setGlueCatalogSkipArchive(boolean skipArchive) { this.glueCatalogSkipArchive = skipArchive; } + public boolean glueCatalogSkipNameValidation() { + return glueCatalogSkipNameValidation; + } + + public void setGlueCatalogSkipNameValidation(boolean glueCatalogSkipNameValidation) { + this.glueCatalogSkipNameValidation = glueCatalogSkipNameValidation; + } + public boolean glueLakeFormationEnabled() { return glueLakeFormationEnabled; } diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java index 2c6c46528b00..88c94c12d2b5 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java @@ -201,9 +201,9 @@ protected TableOperations newTableOps(TableIdentifier tableIdentifier) { Map tableSpecificCatalogProperties = ImmutableMap.builder() .putAll(catalogProperties) .put(AwsProperties.LAKE_FORMATION_DB_NAME, - IcebergToGlueConverter.getDatabaseName(tableIdentifier)) + IcebergToGlueConverter.getDatabaseName(tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) .put(AwsProperties.LAKE_FORMATION_TABLE_NAME, - IcebergToGlueConverter.getTableName(tableIdentifier)) + IcebergToGlueConverter.getTableName(tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) .build(); // FileIO initialization depends on tableSpecificCatalogProperties, so a new FileIO is initialized each time return new GlueTableOperations(glue, lockManager, catalogName, awsProperties, @@ -224,7 +224,7 @@ protected TableOperations newTableOps(TableIdentifier tableIdentifier) { protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { // check if value is set in database GetDatabaseResponse response = glue.getDatabase(GetDatabaseRequest.builder() - .name(IcebergToGlueConverter.getDatabaseName(tableIdentifier)) + .name(IcebergToGlueConverter.getDatabaseName(tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) .build()); String dbLocationUri = response.database().locationUri(); if (dbLocationUri != null) { @@ -234,7 +234,7 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { return String.format( "%s/%s.db/%s", warehousePath, - IcebergToGlueConverter.getDatabaseName(tableIdentifier), + IcebergToGlueConverter.getDatabaseName(tableIdentifier, awsProperties.glueCatalogSkipNameValidation()), tableIdentifier.name()); } @@ -247,7 +247,7 @@ public List listTables(Namespace namespace) { do { GetTablesResponse response = glue.getTables(GetTablesRequest.builder() .catalogId(awsProperties.glueCatalogId()) - .databaseName(IcebergToGlueConverter.toDatabaseName(namespace)) + .databaseName(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) .nextToken(nextToken) .build()); nextToken = response.nextToken(); @@ -276,7 +276,8 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { TableMetadata lastMetadata = ops.current(); glue.deleteTable(DeleteTableRequest.builder() .catalogId(awsProperties.glueCatalogId()) - .databaseName(IcebergToGlueConverter.getDatabaseName(identifier)) + .databaseName(IcebergToGlueConverter.getDatabaseName( + identifier, awsProperties.glueCatalogSkipNameValidation())) .name(identifier.name()) .build()); LOG.info("Successfully dropped table {} from Glue", identifier); @@ -309,10 +310,11 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { } // keep metadata Table fromTable = null; - String fromTableDbName = IcebergToGlueConverter.getDatabaseName(from); - String fromTableName = IcebergToGlueConverter.getTableName(from); - String toTableDbName = IcebergToGlueConverter.getDatabaseName(to); - String toTableName = IcebergToGlueConverter.getTableName(to); + String fromTableDbName = IcebergToGlueConverter.getDatabaseName( + from, awsProperties.glueCatalogSkipNameValidation()); + String fromTableName = IcebergToGlueConverter.getTableName(from, awsProperties.glueCatalogSkipNameValidation()); + String toTableDbName = IcebergToGlueConverter.getDatabaseName(to, awsProperties.glueCatalogSkipNameValidation()); + String toTableName = IcebergToGlueConverter.getTableName(to, awsProperties.glueCatalogSkipNameValidation()); try { GetTableResponse response = glue.getTable(GetTableRequest.builder() .catalogId(awsProperties.glueCatalogId()) @@ -359,7 +361,8 @@ public void createNamespace(Namespace namespace, Map metadata) { try { glue.createDatabase(CreateDatabaseRequest.builder() .catalogId(awsProperties.glueCatalogId()) - .databaseInput(IcebergToGlueConverter.toDatabaseInput(namespace, metadata)) + .databaseInput(IcebergToGlueConverter.toDatabaseInput( + namespace, metadata, awsProperties.glueCatalogSkipNameValidation())) .build()); LOG.info("Created namespace: {}", namespace); } catch (software.amazon.awssdk.services.glue.model.AlreadyExistsException e) { @@ -400,7 +403,8 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac @Override public Map loadNamespaceMetadata(Namespace namespace) throws NoSuchNamespaceException { - String databaseName = IcebergToGlueConverter.toDatabaseName(namespace); + String databaseName = IcebergToGlueConverter.toDatabaseName( + namespace, awsProperties.glueCatalogSkipNameValidation()); try { Database database = glue.getDatabase(GetDatabaseRequest.builder() .catalogId(awsProperties.glueCatalogId()) @@ -434,7 +438,7 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept GetTablesResponse response = glue.getTables(GetTablesRequest.builder() .catalogId(awsProperties.glueCatalogId()) - .databaseName(IcebergToGlueConverter.toDatabaseName(namespace)) + .databaseName(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) .build()); if (response.hasTableList() && !response.tableList().isEmpty()) { @@ -450,7 +454,7 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept glue.deleteDatabase(DeleteDatabaseRequest.builder() .catalogId(awsProperties.glueCatalogId()) - .name(IcebergToGlueConverter.toDatabaseName(namespace)) + .name(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) .build()); LOG.info("Dropped namespace: {}", namespace); // Always successful, otherwise exception is thrown @@ -464,8 +468,9 @@ public boolean setProperties(Namespace namespace, Map properties newProperties.putAll(properties); glue.updateDatabase(UpdateDatabaseRequest.builder() .catalogId(awsProperties.glueCatalogId()) - .name(IcebergToGlueConverter.toDatabaseName(namespace)) - .databaseInput(IcebergToGlueConverter.toDatabaseInput(namespace, newProperties)) + .name(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) + .databaseInput(IcebergToGlueConverter.toDatabaseInput( + namespace, newProperties, awsProperties.glueCatalogSkipNameValidation())) .build()); LOG.debug("Successfully set properties {} for {}", properties.keySet(), namespace); // Always successful, otherwise exception is thrown @@ -481,8 +486,9 @@ public boolean removeProperties(Namespace namespace, Set properties) thr glue.updateDatabase(UpdateDatabaseRequest.builder() .catalogId(awsProperties.glueCatalogId()) - .name(IcebergToGlueConverter.toDatabaseName(namespace)) - .databaseInput(IcebergToGlueConverter.toDatabaseInput(namespace, metadata)) + .name(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) + .databaseInput(IcebergToGlueConverter.toDatabaseInput( + namespace, metadata, awsProperties.glueCatalogSkipNameValidation())) .build()); LOG.debug("Successfully removed properties {} from {}", properties, namespace); // Always successful, otherwise exception is thrown diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueTableOperations.java b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueTableOperations.java index cac680279dae..df61e4b166bf 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueTableOperations.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueTableOperations.java @@ -74,8 +74,10 @@ class GlueTableOperations extends BaseMetastoreTableOperations { FileIO fileIO, TableIdentifier tableIdentifier) { this.glue = glue; this.awsProperties = awsProperties; - this.databaseName = IcebergToGlueConverter.getDatabaseName(tableIdentifier); - this.tableName = IcebergToGlueConverter.getTableName(tableIdentifier); + this.databaseName = IcebergToGlueConverter.getDatabaseName( + tableIdentifier, awsProperties.glueCatalogSkipNameValidation()); + this.tableName = IcebergToGlueConverter.getTableName( + tableIdentifier, awsProperties.glueCatalogSkipNameValidation()); this.fullTableName = String.format("%s.%s.%s", catalogName, databaseName, tableName); this.commitLockEntityId = String.format("%s.%s", databaseName, tableName); this.fileIO = fileIO; diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/IcebergToGlueConverter.java b/aws/src/main/java/org/apache/iceberg/aws/glue/IcebergToGlueConverter.java index ba71c191ca08..765ca501d621 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/IcebergToGlueConverter.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/IcebergToGlueConverter.java @@ -105,31 +105,41 @@ static void validateNamespace(Namespace namespace) { /** * Validate and convert Iceberg namespace to Glue database name - * @param namespace Iceberg namespace + * + * @param namespace Iceberg namespace + * @param skipNameValidation should skip name validation * @return database name */ - static String toDatabaseName(Namespace namespace) { - validateNamespace(namespace); + static String toDatabaseName(Namespace namespace, boolean skipNameValidation) { + if (!skipNameValidation) { + validateNamespace(namespace); + } + return namespace.level(0); } /** * Validate and get Glue database name from Iceberg TableIdentifier - * @param tableIdentifier Iceberg table identifier + * + * @param tableIdentifier Iceberg table identifier + * @param skipNameValidation should skip name validation * @return database name */ - static String getDatabaseName(TableIdentifier tableIdentifier) { - return toDatabaseName(tableIdentifier.namespace()); + static String getDatabaseName(TableIdentifier tableIdentifier, boolean skipNameValidation) { + return toDatabaseName(tableIdentifier.namespace(), skipNameValidation); } /** * Validate and convert Iceberg name to Glue DatabaseInput - * @param namespace Iceberg namespace - * @param metadata metadata map + * + * @param namespace Iceberg namespace + * @param metadata metadata map + * @param skipNameValidation should skip name validation * @return Glue DatabaseInput */ - static DatabaseInput toDatabaseInput(Namespace namespace, Map metadata) { - DatabaseInput.Builder builder = DatabaseInput.builder().name(toDatabaseName(namespace)); + static DatabaseInput toDatabaseInput(Namespace namespace, Map metadata, boolean skipNameValidation) { + DatabaseInput.Builder builder = DatabaseInput.builder().name(toDatabaseName(namespace, + skipNameValidation)); Map parameters = Maps.newHashMap(); metadata.forEach((k, v) -> { if (GLUE_DB_DESCRIPTION_KEY.equals(k)) { @@ -167,11 +177,16 @@ static void validateTableName(String tableName) { /** * Validate and get Glue table name from Iceberg TableIdentifier - * @param tableIdentifier table identifier + * + * @param tableIdentifier table identifier + * @param skipNameValidation should skip name validation * @return table name */ - static String getTableName(TableIdentifier tableIdentifier) { - validateTableName(tableIdentifier.name()); + static String getTableName(TableIdentifier tableIdentifier, boolean skipNameValidation) { + if (!skipNameValidation) { + validateTableName(tableIdentifier.name()); + } + return tableIdentifier.name(); } diff --git a/aws/src/test/java/org/apache/iceberg/aws/glue/TestIcebergToGlueConverter.java b/aws/src/test/java/org/apache/iceberg/aws/glue/TestIcebergToGlueConverter.java index ec67af4597cb..ac9c9950cd41 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/glue/TestIcebergToGlueConverter.java +++ b/aws/src/test/java/org/apache/iceberg/aws/glue/TestIcebergToGlueConverter.java @@ -27,6 +27,7 @@ import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableProperties; import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -49,7 +50,7 @@ public class TestIcebergToGlueConverter { @Test public void testToDatabaseName() { - Assert.assertEquals("db", IcebergToGlueConverter.toDatabaseName(Namespace.of("db"))); + Assert.assertEquals("db", IcebergToGlueConverter.toDatabaseName(Namespace.of("db"), false)); } @Test @@ -64,7 +65,30 @@ public void testToDatabaseNameFailure() { AssertHelpers.assertThrows("bad namespace name", ValidationException.class, "Cannot convert namespace", - () -> IcebergToGlueConverter.toDatabaseName(name) + () -> IcebergToGlueConverter.toDatabaseName(name, false) + ); + } + } + + @Test + public void testSkipNamespaceValidation() { + List acceptableNames = Lists.newArrayList( + Namespace.of("db-1"), + Namespace.of("db-1-1-1")); + for (Namespace name : acceptableNames) { + Assert.assertEquals(name.toString(), IcebergToGlueConverter.toDatabaseName(name, true) + ); + } + } + + @Test + public void testSkipTableNameValidation() { + List acceptableIdentifiers = Lists.newArrayList( + TableIdentifier.parse("db.a-1"), + TableIdentifier.parse("db.a-1-1"), + TableIdentifier.parse("db.a#1")); + for (TableIdentifier identifier : acceptableIdentifiers) { + Assert.assertEquals(identifier.name(), IcebergToGlueConverter.getTableName(identifier, true) ); } } @@ -75,7 +99,7 @@ public void testToDatabaseInput() { IcebergToGlueConverter.GLUE_DB_DESCRIPTION_KEY, "description", IcebergToGlueConverter.GLUE_DB_LOCATION_KEY, "s3://location", "key", "val"); - DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties); + DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); Assert.assertEquals("Location should be set", "s3://location", databaseInput.locationUri()); Assert.assertEquals("Description should be set", "description", databaseInput.description()); Assert.assertEquals("Parameters should be set", ImmutableMap.of("key", "val"), databaseInput.parameters()); @@ -89,7 +113,7 @@ public void testToDatabaseInputNoParameter() { .parameters(ImmutableMap.of()) .build(); Namespace namespace = Namespace.of("db"); - Assert.assertEquals(input, IcebergToGlueConverter.toDatabaseInput(namespace, ImmutableMap.of())); + Assert.assertEquals(input, IcebergToGlueConverter.toDatabaseInput(namespace, ImmutableMap.of(), false)); } @Test @@ -97,7 +121,7 @@ public void testToDatabaseInputEmptyLocation() { Map properties = ImmutableMap.of( IcebergToGlueConverter.GLUE_DB_DESCRIPTION_KEY, "description", "key", "val"); - DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties); + DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); Assert.assertNull("Location should not be set", databaseInput.locationUri()); Assert.assertEquals("Description should be set", "description", databaseInput.description()); Assert.assertEquals("Parameters should be set", ImmutableMap.of("key", "val"), databaseInput.parameters()); @@ -109,7 +133,7 @@ public void testToDatabaseInputEmptyDescription() { Map properties = ImmutableMap.of( IcebergToGlueConverter.GLUE_DB_LOCATION_KEY, "s3://location", "key", "val"); - DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties); + DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); Assert.assertEquals("Location should be set", "s3://location", databaseInput.locationUri()); Assert.assertNull("Description should not be set", databaseInput.description()); Assert.assertEquals("Parameters should be set", ImmutableMap.of("key", "val"), databaseInput.parameters()); diff --git a/docs/integrations/aws.md b/docs/integrations/aws.md index d82c192684b2..72211cfd70ef 100644 --- a/docs/integrations/aws.md +++ b/docs/integrations/aws.md @@ -166,6 +166,7 @@ just like what is shown in the [enabling AWS integration](#enabling-aws-integrat More details about loading the catalog can be found in individual engine pages, such as [Spark](../spark-configuration/#loading-a-custom-catalog) and [Flink](../flink/#creating-catalogs-and-using-catalogs). #### Glue Catalog ID + There is a unique Glue metastore in each AWS account and each AWS region. By default, `GlueCatalog` chooses the Glue metastore to use based on the user's default AWS client credential and region setup. You can specify the Glue catalog ID through `glue.id` catalog property to point to a Glue catalog in a different AWS account. @@ -180,6 +181,14 @@ However, if you are streaming data to Iceberg, this will easily create a lot of Therefore, it is recommended to turn off the archive feature in Glue by setting `glue.skip-archive` to `true`. For more details, please read [Glue Quotas](https://docs.aws.amazon.com/general/latest/gr/glue.html) and the [UpdateTable API](https://docs.aws.amazon.com/glue/latest/webapi/API_UpdateTable.html). +#### Skip Name Validation + +Allow user to skip name validation for table name and namespaces. +It is recommended to stick to Glue best practice in +https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html to make sure operations are Hive compatible. +This is only added for users that have existing conventions using non-standard characters. When database name +and table name validation are skipped, there is no guarantee that downstream systems would all support the names. + #### Optimistic Locking By default, Iceberg uses Glue's optimistic locking for concurrent updates to a table.