2 changes: 2 additions & 0 deletions .gitignore
@@ -52,3 +52,5 @@ derby.log

# Python stuff
python/.mypy_cache/
+
+hive2-metastore/src
104 changes: 93 additions & 11 deletions build.gradle
@@ -362,6 +362,87 @@ project(':iceberg-hive-metastore') {
}
}


+project(':iceberg-hive2-metastore') {
+
+  task copyFilesFromHive3(type: Copy) {
+    copy {
+      from('../hive-metastore/src/main/java/')
+      into 'src/main/java/'
+    }
+    copy {
+      from('../hive-metastore/src/main/resources/')
+      into 'src/main/resources/'
+    }
+    copy {
+      from('../hive-metastore/src/test/java/')
+      into 'src/test/java/'
+    }
+    copy {
+      from('../hive-metastore/src/test/resources/')
+      into 'src/test/resources/'
+    }
+  }
+
+  dependencies {
+
+    compile project(':iceberg-core')
+
+    compileOnly "org.apache.avro:avro"
+
+    compileOnly("org.apache.hive:hive-metastore:2.3.7") {
+      exclude group: 'org.apache.avro', module: 'avro'
+      exclude group: 'org.slf4j', module: 'slf4j-log4j12'
+      exclude group: 'org.pentaho' // missing dependency
+      exclude group: 'org.apache.hbase'
+      exclude group: 'org.apache.logging.log4j'
+      exclude group: 'co.cask.tephra'
+      exclude group: 'com.google.code.findbugs', module: 'jsr305'
+      exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all'
+      exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet'
+      exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle'
+      exclude group: 'com.tdunning', module: 'json'
+      exclude group: 'javax.transaction', module: 'transaction-api'
+      exclude group: 'com.zaxxer', module: 'HikariCP'
+    }
+
+    testCompile("org.apache.hive:hive-exec:2.3.7") {
+      exclude group: 'org.apache.avro', module: 'avro'
+      exclude group: 'org.slf4j', module: 'slf4j-log4j12'
+      exclude group: 'org.pentaho' // missing dependency
+      exclude group: 'org.apache.hive', module: 'hive-llap-tez'
+      exclude group: 'org.apache.logging.log4j'
+      exclude group: 'com.google.protobuf', module: 'protobuf-java'
+      exclude group: 'org.apache.calcite'
+      exclude group: 'org.apache.calcite.avatica'
+      exclude group: 'com.google.code.findbugs', module: 'jsr305'
+    }
+
+    testCompile("org.apache.hive:hive-metastore:2.3.7") {
+      exclude group: 'org.apache.avro', module: 'avro'
+      exclude group: 'org.slf4j', module: 'slf4j-log4j12'
+      exclude group: 'org.pentaho' // missing dependency
+      exclude group: 'org.apache.hbase'
+      exclude group: 'org.apache.logging.log4j'
+      exclude group: 'co.cask.tephra'
+      exclude group: 'com.google.code.findbugs', module: 'jsr305'
+      exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all'
+      exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet'
+      exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle'
+      exclude group: 'com.tdunning', module: 'json'
+      exclude group: 'javax.transaction', module: 'transaction-api'
+      exclude group: 'com.zaxxer', module: 'HikariCP'
+    }
+
+    compileOnly("org.apache.hadoop:hadoop-client:2.7.3") {
+      exclude group: 'org.apache.avro', module: 'avro'
+      exclude group: 'org.slf4j', module: 'slf4j-log4j12'
+    }
+
+    testCompile project(path: ':iceberg-api', configuration: 'testArtifacts')
+  }
+}

project(':iceberg-mr') {
configurations {
testCompile {
@@ -404,7 +485,7 @@ project(':iceberg-mr') {
testCompile("org.apache.calcite:calcite-core")
testCompile("com.esotericsoftware:kryo-shaded:4.0.2")
testCompile("com.fasterxml.jackson.core:jackson-annotations:2.6.5")
testCompile("com.klarna:hiverunner:5.2.1") {
testCompile("com.klarna:hiverunner:6.0.1") {
exclude group: 'javax.jms', module: 'jms'
exclude group: 'org.apache.hive', module: 'hive-exec'
exclude group: 'org.codehaus.jettison', module: 'jettison'
@@ -549,7 +630,7 @@ project(':iceberg-spark') {
compile project(':iceberg-orc')
compile project(':iceberg-parquet')
compile project(':iceberg-arrow')
-compile project(':iceberg-hive-metastore')
+compile project(':iceberg-hive2-metastore')
compile project(':iceberg-arrow')

compileOnly "org.apache.avro:avro"
@@ -562,7 +643,7 @@ project(':iceberg-spark') {
testCompile("org.apache.hadoop:hadoop-minicluster") {
exclude group: 'org.apache.avro', module: 'avro'
}
-testCompile project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
+testCompile project(path: ':iceberg-hive2-metastore', configuration: 'testArtifacts')
testCompile project(path: ':iceberg-api', configuration: 'testArtifacts')
testCompile project(path: ':iceberg-data', configuration: 'testArtifacts')
}
@@ -603,8 +684,8 @@ if (jdkVersion == '8') {
compile project(':iceberg-orc')
compile project(':iceberg-parquet')
compile project(':iceberg-arrow')
-compile project(':iceberg-hive-metastore')
-compile project(':iceberg-spark')
+compile project(':iceberg-hive2-metastore')
+compile project(":iceberg-spark")
Contributor:
Small nit: why the change from single quotes?

Collaborator (Author):
Yes, it was unintentional. Thanks for the catch.

compileOnly "org.apache.avro:avro"
compileOnly("org.apache.spark:spark-hive_2.11") {
@@ -613,12 +694,13 @@ if (jdkVersion == '8') {

testCompile project(path: ':iceberg-spark', configuration: 'testArtifacts')

testCompile "org.apache.hadoop:hadoop-hdfs::tests"
testCompile "org.apache.hadoop:hadoop-common::tests"
testCompile("org.apache.hadoop:hadoop-minicluster") {
testCompile "org.apache.hadoop:hadoop-hdfs:2.7.3"
testCompile "org.apache.hadoop:hadoop-common:2.7.3"
testCompile("org.apache.hadoop:hadoop-minicluster:2.7.3") {
exclude group: 'org.apache.avro', module: 'avro'
}
testCompile project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')

testCompile project(path: ':iceberg-hive2-metastore', configuration: 'testArtifacts')
testCompile project(path: ':iceberg-api', configuration: 'testArtifacts')
}

@@ -709,7 +791,7 @@ project(':iceberg-spark3') {
compile project(':iceberg-orc')
compile project(':iceberg-parquet')
compile project(':iceberg-arrow')
-compile project(':iceberg-hive-metastore')
+compile project(':iceberg-hive2-metastore')
compile project(':iceberg-spark')

compileOnly "org.apache.avro:avro"
@@ -725,7 +807,7 @@ project(':iceberg-spark3') {
testCompile("org.apache.hadoop:hadoop-minicluster") {
exclude group: 'org.apache.avro', module: 'avro'
}
-testCompile project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
+testCompile project(path: ':iceberg-hive2-metastore', configuration: 'testArtifacts')
testCompile project(path: ':iceberg-api', configuration: 'testArtifacts')
}

@@ -24,6 +24,7 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.lang.reflect.Method;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
@@ -80,6 +81,15 @@ public void stop() {
if (hiveLocalDir != null) {
hiveLocalDir.delete();
}

+    // remove raw store if exists
+    try {
+      Method cleanupRawStore = HiveMetaStore.class.getDeclaredMethod("cleanupRawStore");
+      cleanupRawStore.setAccessible(true);
+      cleanupRawStore.invoke(null);
+    } catch (Exception e) {
+      // no op
+    }
}

public HiveConf hiveConf() {
@@ -127,7 +127,7 @@ private static Object leafToLiteral(PredicateLeaf leaf) {
case FLOAT:
return leaf.getLiteral();
case DATE:
-return daysFromTimestamp((Timestamp) leaf.getLiteral());
+return daysFromDate((Date) leaf.getLiteral());
Contributor:
I think we should check the type returned by getLiteral and handle that here. Then we won't need separate code for different versions.

Collaborator (Author):
Good point, I'll make that change.
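
A version-agnostic branch along those lines might look like this sketch (a hypothetical helper, not code from this PR; it assumes a DATE leaf should yield epoch days, matching the daysFromDate/daysFromTimestamp converters around it):

import java.sql.Date;
import java.sql.Timestamp;

// Sketch: one DATE-leaf path for both Hive versions, branching on the
// literal's runtime type (Hive 2 surfaces java.sql.Timestamp here, Hive 3
// surfaces java.sql.Date, per the before/after lines in this diff).
static int dateLeafToEpochDays(Object literal) {
  if (literal instanceof Date) {
    return (int) ((Date) literal).toLocalDate().toEpochDay();
  }
  if (literal instanceof Timestamp) {
    return (int) ((Timestamp) literal).toLocalDateTime().toLocalDate().toEpochDay();
  }
  throw new UnsupportedOperationException("Unsupported date literal: " + literal.getClass());
}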

case TIMESTAMP:
return microsFromTimestamp((Timestamp) LITERAL_FIELD.get(leaf));
case DECIMAL:
@@ -81,6 +81,11 @@ public void configureInputJobProperties(TableDesc tableDesc, Map<String, String>
map.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(table.schema()));
}

+  @Override
+  public void configureInputJobCredentials(TableDesc tableDesc, Map<String, String> secrets) {
+
+  }

@Override
public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> map) {

@@ -19,9 +19,9 @@

package org.apache.iceberg.mr.hive.serde.objectinspector;

-import java.sql.Date;
import java.time.LocalDate;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.serde2.io.DateWritableV2;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
@@ -42,17 +42,17 @@ private IcebergDateObjectInspector() {

@Override
public Date getPrimitiveJavaObject(Object o) {
-return o == null ? null : Date.valueOf((LocalDate) o);
+return o == null ? null : Date.valueOf(o.toString());
Contributor:
Date/time values must be converted directly and not via string. Conversions like this will lead to time zone bugs.

Collaborator (Author):
Good point! I will fix the conversion logic to avoid using toString. As for timezones, the expected LocalDate/LocalDateTime objects in these inspectors contain no timezone information or any offsets.
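
A direct, string-free conversion could look like this sketch (assuming Hive 3's org.apache.hadoop.hive.common.type.Date exposes ofEpochDay, and that the inspector receives java.time.LocalDate values as elsewhere in this diff):

import java.time.LocalDate;
import org.apache.hadoop.hive.common.type.Date;

// Sketch: LocalDate -> Hive 3 Date via epoch days, with no String round trip.
// Neither type carries a time zone, so the conversion is lossless.
static Date toHiveDate(Object o) {
  return o == null ? null : Date.ofEpochDay((int) ((LocalDate) o).toEpochDay());
}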

}

@Override
-public DateWritable getPrimitiveWritableObject(Object o) {
-return o == null ? null : new DateWritable(DateTimeUtil.daysFromDate((LocalDate) o));
+public DateWritableV2 getPrimitiveWritableObject(Object o) {
+return o == null ? null : new DateWritableV2(DateTimeUtil.daysFromDate((LocalDate) o));
Contributor:
Can we control the type returned by the object inspector through a config or by detecting whether DateWritableV2 exists in the classpath? Then we can use the same code for both 2.x and 3.x.

Collaborator (Author):
Unfortunately there was a breaking change in the DateObjectInspector and TimestampObjectInspector interfaces in Hive3. Since we are implementing these interfaces, there has to be a separate Hive2- and Hive3-compatible version of these implementations. I will open a new PR where the Hive2- and Hive3-specific parts of MR are separated into distinct modules.
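
For reference, the classpath detection suggested above would be a one-time probe along these lines (a sketch; as the author notes, it cannot help here because the implemented interface signatures themselves differ between Hive 2 and Hive 3):

// Sketch: detect Hive 3 by probing for a class only Hive 3 ships.
static final boolean HIVE3_ON_CLASSPATH = detectHive3();

static boolean detectHive3() {
  try {
    Class.forName("org.apache.hadoop.hive.serde2.io.DateWritableV2");
    return true;
  } catch (ClassNotFoundException e) {
    return false;
  }
}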

}

@Override
public Object copyObject(Object o) {
-return o == null ? null : new Date(((Date) o).getTime());
+return o == null ? null : Date.valueOf(o.toString());
Contributor:
Are you sure that toString is the appropriate method to call here to get the time from the date?

toString is usually unsafe in these situations, especially for a function that takes in Object. At the least, if an object were passed in previously that could not be cast to java.sql.Date, the user would get a decent stack trace explaining that to them. To me, it seems like the stack trace's value to developers has decreased here, but perhaps I'm not understanding the semantics of org.apache.hadoop.hive.common.type.Date.valueOf.

Additionally, I think we tend to prefer java.sql.Date as it is the more portable representation across the many query engines that need to be supported, and I believe java.sql.Date is expressed in the documentation as the physical type for a logical Date. Again, somebody please correct me if I'm wrong.

Contributor:
Good catch! The data object used by Hive internally to store Date has changed, so instanceof should be used to handle the differences between the two Date implementations.

Collaborator (Author):
Good point! As @pvary mentioned, with Hive3, date and timestamp storage has changed and requires using types from the hive.common package instead of java.sql. I will fix the conversion logic to avoid calling toString.
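
An instanceof-based copyObject along the lines described above might be sketched as follows (hypothetical; assumes Hive 3's Date exposes toEpochDay/ofEpochDay, and keeps the Hive 2 java.sql.Date path for comparison):

// Sketch: copy either Date implementation without a toString round trip.
public Object copyObject(Object o) {
  if (o == null) {
    return null;
  }
  if (o instanceof java.sql.Date) {
    // Hive 2 representation: copy via epoch millis.
    return new java.sql.Date(((java.sql.Date) o).getTime());
  }
  if (o instanceof org.apache.hadoop.hive.common.type.Date) {
    // Hive 3 representation: copy via epoch days.
    org.apache.hadoop.hive.common.type.Date d = (org.apache.hadoop.hive.common.type.Date) o;
    return org.apache.hadoop.hive.common.type.Date.ofEpochDay(d.toEpochDay());
  }
  throw new IllegalArgumentException("Unexpected Date implementation: " + o.getClass());
}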

}

}
@@ -19,10 +19,10 @@

package org.apache.iceberg.mr.hive.serde.objectinspector;

-import java.sql.Timestamp;
import java.time.LocalDateTime;
import java.time.OffsetDateTime;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
@@ -57,13 +57,13 @@ private IcebergTimestampObjectInspector() {

@Override
public Timestamp getPrimitiveJavaObject(Object o) {
-return o == null ? null : Timestamp.valueOf(toLocalDateTime(o));
+return o == null ? null : Timestamp.valueOf(o.toString());
}

@Override
-public TimestampWritable getPrimitiveWritableObject(Object o) {
+public TimestampWritableV2 getPrimitiveWritableObject(Object o) {
Timestamp ts = getPrimitiveJavaObject(o);
-return ts == null ? null : new TimestampWritable(ts);
+return ts == null ? null : new TimestampWritableV2(ts);
}

@Override
Expand All @@ -73,7 +73,7 @@ public Object copyObject(Object o) {
}

Timestamp ts = (Timestamp) o;
-Timestamp copy = new Timestamp(ts.getTime());
+Timestamp copy = new Timestamp(ts);
copy.setNanos(ts.getNanos());
return copy;
}
@@ -19,9 +19,9 @@

package org.apache.iceberg.mr.hive.serde.objectinspector;

-import java.sql.Date;
import java.time.LocalDate;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.serde2.io.DateWritableV2;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
@@ -42,7 +42,7 @@ public void testIcebergDateObjectInspector() {
Assert.assertEquals(TypeInfoFactory.dateTypeInfo.getTypeName(), oi.getTypeName());

Assert.assertEquals(Date.class, oi.getJavaPrimitiveClass());
-Assert.assertEquals(DateWritable.class, oi.getPrimitiveWritableClass());
+Assert.assertEquals(DateWritableV2.class, oi.getPrimitiveWritableClass());

Assert.assertNull(oi.copyObject(null));
Assert.assertNull(oi.getPrimitiveJavaObject(null));
@@ -52,7 +52,7 @@ public void testIcebergDateObjectInspector() {
Date date = Date.valueOf("2020-01-01");

Assert.assertEquals(date, oi.getPrimitiveJavaObject(local));
-Assert.assertEquals(new DateWritable(date), oi.getPrimitiveWritableObject(local));
+Assert.assertEquals(new DateWritableV2(date), oi.getPrimitiveWritableObject(local));

Date copy = (Date) oi.copyObject(date);

@@ -19,9 +19,9 @@

package org.apache.iceberg.mr.hive.serde.objectinspector;

-import java.sql.Timestamp;
import java.time.LocalDateTime;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
@@ -42,7 +42,7 @@ public void testIcebergTimestampObjectInspector() {
Assert.assertEquals(TypeInfoFactory.timestampTypeInfo.getTypeName(), oi.getTypeName());

Assert.assertEquals(Timestamp.class, oi.getJavaPrimitiveClass());
-Assert.assertEquals(TimestampWritable.class, oi.getPrimitiveWritableClass());
+Assert.assertEquals(TimestampWritableV2.class, oi.getPrimitiveWritableClass());

Assert.assertNull(oi.copyObject(null));
Assert.assertNull(oi.getPrimitiveJavaObject(null));
@@ -52,7 +52,7 @@ public void testIcebergTimestampObjectInspector() {
Timestamp ts = Timestamp.valueOf("2020-01-01 00:00:00");

Assert.assertEquals(ts, oi.getPrimitiveJavaObject(local));
-Assert.assertEquals(new TimestampWritable(ts), oi.getPrimitiveWritableObject(local));
+Assert.assertEquals(new TimestampWritableV2(ts), oi.getPrimitiveWritableObject(local));

Timestamp copy = (Timestamp) oi.copyObject(ts);

2 changes: 2 additions & 0 deletions settings.gradle
@@ -34,6 +34,7 @@ include 'spark3'
include 'spark3-runtime'
include 'pig'
include 'hive-metastore'
+include 'hive2-metastore'

project(':api').name = 'iceberg-api'
project(':common').name = 'iceberg-common'
@@ -51,6 +52,7 @@ project(':spark3').name = 'iceberg-spark3'
project(':spark3-runtime').name = 'iceberg-spark3-runtime'
project(':pig').name = 'iceberg-pig'
project(':hive-metastore').name = 'iceberg-hive-metastore'
+project(':hive2-metastore').name = 'iceberg-hive2-metastore'

if (JavaVersion.current() == JavaVersion.VERSION_1_8) {
include 'spark2'
8 changes: 4 additions & 4 deletions versions.props
@@ -1,9 +1,9 @@
org.slf4j:* = 1.7.25
org.apache.avro:avro = 1.9.2
org.apache.flink:* = 1.11.0
-org.apache.hadoop:* = 2.7.3
-org.apache.hive:hive-metastore = 2.3.7
-org.apache.hive:hive-serde = 2.3.7
+org.apache.hadoop:* = 3.1.0
+org.apache.hive:hive-metastore = 3.1.2
+org.apache.hive:hive-serde = 3.1.2
org.apache.orc:* = 1.6.3
org.apache.parquet:* = 1.11.0
org.apache.spark:spark-hive_2.11 = 2.4.6
@@ -21,4 +21,4 @@ com.github.stephenc.findbugs:findbugs-annotations = 1.3.9-1
# test deps
junit:junit = 4.12
org.mockito:mockito-core = 1.10.19
-org.apache.hive:hive-exec = 2.3.7
+org.apache.hive:hive-exec = 3.1.2