Skip to content

Commit b77b362

Browse files
ljfgemrzhang10
authored andcommitted
Move 'Hive Metadata Scan: Support case insensitive name mapping' (PR 52) to hivelink-core (linkedin#102)
1 parent 85b8699 commit b77b362

File tree

8 files changed

+77
-34
lines changed

8 files changed

+77
-34
lines changed

build.gradle

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,9 @@ project(':iceberg-hivelink') {
528528
exclude group: 'com.zaxxer', module: 'HikariCP'
529529
}
530530

531+
testCompile project(':iceberg-data')
532+
testCompile project(':iceberg-orc')
533+
531534
// By default, hive-exec is a fat/uber jar and it exports a guava library
532535
// that's really old. We use the core classifier to be able to override our guava
533536
// version. Luckily, hive-exec seems to work okay so far with this version of guava

core/src/main/java/org/apache/iceberg/TableMetadata.java

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -130,18 +130,6 @@ static TableMetadata newTableMetadata(Schema schema,
130130
.build();
131131
}
132132

133-
public static TableMetadata newTableMetadataWithoutFreshIds(Schema schema,
134-
PartitionSpec spec,
135-
String location,
136-
Map<String, String> properties) {
137-
return new TableMetadata(null, DEFAULT_TABLE_FORMAT_VERSION, UUID.randomUUID().toString(), location,
138-
INITIAL_SEQUENCE_NUMBER, System.currentTimeMillis(),
139-
-1, schema, INITIAL_SPEC_ID, ImmutableList.of(spec),
140-
SortOrder.unsorted().orderId(), ImmutableList.of(SortOrder.unsorted()),
141-
ImmutableMap.copyOf(properties), -1, ImmutableList.of(),
142-
ImmutableList.of(), ImmutableList.of());
143-
}
144-
145133
public static class SnapshotLogEntry implements HistoryEntry {
146134
private final long timestampMillis;
147135
private final long snapshotId;
@@ -257,7 +245,7 @@ public String toString() {
257245
private final List<MetadataUpdate> changes;
258246

259247
@SuppressWarnings("checkstyle:CyclomaticComplexity")
260-
TableMetadata(String metadataFileLocation,
248+
TableMetadata(InputFile file,
261249
int formatVersion,
262250
String uuid,
263251
String location,

core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,6 @@ public static NameMapping create(Schema schema) {
5454
return new NameMapping(TypeUtil.visit(schema, CreateMapping.INSTANCE));
5555
}
5656

57-
/**
58-
* Create a name-based mapping for a schema.
59-
* <p>
60-
* The mapping returned by this method will use the schema's name for each field.
61-
*
62-
* @param schema a {@link Schema}
63-
* @param caseSensitive whether names should be matched case sensitively
64-
* @return a {@link NameMapping} initialized with the schema's fields and names
65-
*/
66-
public static NameMapping create(Schema schema, boolean caseSensitive) {
67-
return new NameMapping(TypeUtil.visit(schema, CreateMapping.INSTANCE), caseSensitive);
68-
}
69-
7057
/**
7158
* Update a name-based mapping using changes to a schema.
7259
*
@@ -259,8 +246,8 @@ private static <S, T> S visit(MappedFields mapping, Visitor<S, T> visitor) {
259246
return visitor.fields(mapping, fieldResults);
260247
}
261248

262-
private static class CreateMapping extends TypeUtil.SchemaVisitor<MappedFields> {
263-
private static final CreateMapping INSTANCE = new CreateMapping();
249+
public static class CreateMapping extends TypeUtil.SchemaVisitor<MappedFields> {
250+
public static final CreateMapping INSTANCE = new CreateMapping();
264251

265252
private CreateMapping() {
266253
}

core/src/main/java/org/apache/iceberg/mapping/NameMapping.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ public static NameMapping of(MappedFields fields) {
5454
this(mapping, true);
5555
}
5656

57-
NameMapping(MappedFields mapping, boolean caseSensitive) {
57+
public NameMapping(MappedFields mapping, boolean caseSensitive) {
5858
this.mapping = mapping;
5959
this.caseSensitive = caseSensitive;
6060
lazyFieldsById();

hivelink-core/src/main/java/org/apache/iceberg/hivelink/core/LegacyHiveTableOperations.java

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.List;
2929
import java.util.Map;
3030
import java.util.Set;
31+
import java.util.UUID;
3132
import java.util.stream.Collectors;
3233
import org.apache.hadoop.conf.Configuration;
3334
import org.apache.hadoop.fs.FileStatus;
@@ -40,6 +41,7 @@
4041
import org.apache.iceberg.Metrics;
4142
import org.apache.iceberg.PartitionSpec;
4243
import org.apache.iceberg.Schema;
44+
import org.apache.iceberg.SortOrder;
4345
import org.apache.iceberg.StructLike;
4446
import org.apache.iceberg.TableMetadata;
4547
import org.apache.iceberg.TableProperties;
@@ -51,12 +53,13 @@
5153
import org.apache.iceberg.hadoop.HadoopFileIO;
5254
import org.apache.iceberg.hive.HiveClientPool;
5355
import org.apache.iceberg.hivelink.core.utils.FileSystemUtils;
56+
import org.apache.iceberg.hivelink.core.utils.MappingUtil;
5457
import org.apache.iceberg.io.FileIO;
5558
import org.apache.iceberg.io.LocationProvider;
56-
import org.apache.iceberg.mapping.MappingUtil;
5759
import org.apache.iceberg.mapping.NameMappingParser;
5860
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
5961
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
62+
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
6063
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
6164
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
6265
import org.apache.iceberg.types.Types;
@@ -68,6 +71,9 @@
6871
public class LegacyHiveTableOperations extends BaseMetastoreTableOperations {
6972

7073
private static final Logger LOG = LoggerFactory.getLogger(LegacyHiveTableOperations.class);
74+
private static final long INITIAL_SEQUENCE_NUMBER = 0;
75+
private static final int DEFAULT_TABLE_FORMAT_VERSION = 1;
76+
private static final int INITIAL_SPEC_ID = 0;
7177

7278
private final HiveClientPool metaClients;
7379
private final String databaseName;
@@ -105,7 +111,7 @@ protected void doRefresh() {
105111
// Provide a case insensitive name mapping for Hive tables
106112
tableProperties.put(TableProperties.DEFAULT_NAME_MAPPING,
107113
NameMappingParser.toJson(MappingUtil.create(schema, false)));
108-
TableMetadata metadata = TableMetadata.newTableMetadataWithoutFreshIds(schema, spec,
114+
TableMetadata metadata = newTableMetadataWithoutFreshIds(schema, spec,
109115
hiveTable.getSd().getLocation(), tableProperties);
110116
setCurrentMetadata(metadata);
111117
} catch (TException e) {
@@ -279,4 +285,16 @@ public String metadataFileLocation(String filename) {
279285
public LocationProvider locationProvider() {
280286
throw new UnsupportedOperationException("Writes not supported for Hive tables without Iceberg metadata");
281287
}
288+
289+
private TableMetadata newTableMetadataWithoutFreshIds(Schema schema,
290+
PartitionSpec spec,
291+
String location,
292+
Map<String, String> properties) {
293+
return new TableMetadata(null, DEFAULT_TABLE_FORMAT_VERSION, UUID.randomUUID().toString(), location,
294+
INITIAL_SEQUENCE_NUMBER, System.currentTimeMillis(),
295+
-1, schema, INITIAL_SPEC_ID, ImmutableList.of(spec),
296+
SortOrder.unsorted().orderId(), ImmutableList.of(SortOrder.unsorted()),
297+
ImmutableMap.copyOf(properties), -1, ImmutableList.of(),
298+
ImmutableList.of(), ImmutableList.of());
299+
}
282300
}

hivelink-core/src/main/java/org/apache/iceberg/hivelink/core/schema/MergeHiveSchemaWithAvro.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
* 2. Copies field names, nullability, default value, field props from the Avro schema
4545
* 3. Copies field type from the Hive schema.
4646
* TODO: We should also handle some cases of type promotion where the types in Avro are potentially more correct
47-
* e.g.BINARY in Hive -> FIXED in Avro, STRING in Hive -> ENUM in Avro, etc
47+
* e.g.BINARY in Hive to FIXED in Avro, STRING in Hive to ENUM in Avro, etc
4848
* 4. Retains fields found only in the Hive schema; Ignores fields found only in the Avro schema
4949
* 5. Fields found only in Hive schema are represented as optional fields in the resultant Avro schema
5050
* 6. For fields found only in Hive schema, field names are sanitized to make them compatible with Avro identifier spec
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.iceberg.hivelink.core.utils;
21+
22+
import org.apache.iceberg.Schema;
23+
import org.apache.iceberg.mapping.MappedFields;
24+
import org.apache.iceberg.mapping.NameMapping;
25+
import org.apache.iceberg.types.TypeUtil;
26+
27+
public class MappingUtil {
28+
29+
private MappingUtil() {
30+
}
31+
32+
/**
33+
* Create a name-based mapping for a schema.
34+
* <p>
35+
* The mapping returned by this method will use the schema's name for each field.
36+
*
37+
* @param schema a {@link Schema}
38+
* @param caseSensitive whether names should be matched case sensitively
39+
* @return a {@link NameMapping} initialized with the schema's fields and names
40+
*/
41+
public static NameMapping create(Schema schema, boolean caseSensitive) {
42+
final MappedFields mappedFields = TypeUtil.visit(schema,
43+
org.apache.iceberg.mapping.MappingUtil.CreateMapping.INSTANCE);
44+
return new NameMapping(mappedFields, caseSensitive);
45+
}
46+
}

data/src/test/java/org/apache/iceberg/data/TestReadFileWithCaseMismatch.java renamed to hivelink-core/src/test/java/org/apache/iceberg/hivelink/core/TestReadFileWithCaseMismatch.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* under the License.
1818
*/
1919

20-
package org.apache.iceberg.data;
20+
package org.apache.iceberg.hivelink.core;
2121

2222
import java.io.Closeable;
2323
import java.io.File;
@@ -35,10 +35,11 @@
3535
import org.apache.iceberg.Files;
3636
import org.apache.iceberg.Schema;
3737
import org.apache.iceberg.avro.Avro;
38+
import org.apache.iceberg.data.Record;
3839
import org.apache.iceberg.data.avro.DataReader;
3940
import org.apache.iceberg.data.orc.GenericOrcReader;
41+
import org.apache.iceberg.hivelink.core.utils.MappingUtil;
4042
import org.apache.iceberg.io.CloseableIterable;
41-
import org.apache.iceberg.mapping.MappingUtil;
4243
import org.apache.iceberg.mapping.NameMapping;
4344
import org.apache.iceberg.orc.ORC;
4445
import org.apache.iceberg.relocated.com.google.common.collect.Lists;

0 commit comments

Comments
 (0)