From 8d9dde00fcd4e415b3ee5864852a4d4d462cd904 Mon Sep 17 00:00:00 2001 From: Alex Levenson Date: Wed, 29 Jul 2015 20:11:52 -0700 Subject: [PATCH 1/5] Fixes for PARQUET-350, PARQUET-348, PARQUET-346, PARQUET-345 --- .../org/apache/parquet/CorruptStatistics.java | 30 +++++++++++++------ .../apache/parquet/thrift/ThriftMetaData.java | 3 +- .../parquet/thrift/ThriftRecordConverter.java | 19 +++++++++--- .../thrift/ThriftSchemaConvertVisitor.java | 23 +++++++++----- .../parquet/thrift/ThriftSchemaConverter.java | 6 ++++ 5 files changed, 58 insertions(+), 23 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java index e2b8114e20..3869cdac48 100644 --- a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java @@ -18,6 +18,8 @@ */ package org.apache.parquet; +import java.util.concurrent.atomic.AtomicBoolean; + import org.apache.parquet.SemanticVersion.SemanticVersionParseException; import org.apache.parquet.VersionParser.ParsedVersion; import org.apache.parquet.VersionParser.VersionParseException; @@ -31,6 +33,8 @@ * and thus it's statistics should be ignored / not trusted. */ public class CorruptStatistics { + private static final AtomicBoolean alreadyLogged = new AtomicBoolean(false); + private static final Log LOG = Log.getLog(CorruptStatistics.class); // the version in which the bug described by jira: PARQUET-251 was fixed @@ -52,7 +56,7 @@ public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName if (Strings.isNullOrEmpty(createdBy)) { // created_by is not populated, which could have been caused by // parquet-mr during the same time as PARQUET-251, see PARQUET-297 - LOG.info("Ignoring statistics because created_by is null or empty! See PARQUET-251 and PARQUET-297"); + warnOnce("Ignoring statistics because created_by is null or empty! See PARQUET-251 and PARQUET-297"); return true; } @@ -65,16 +69,16 @@ public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName } if (Strings.isNullOrEmpty(version.version)) { - LOG.warn("Ignoring statistics because created_by did not contain a semver (see PARQUET-251): " + createdBy); + warnOnce("Ignoring statistics because created_by did not contain a semver (see PARQUET-251): " + createdBy); return true; } SemanticVersion semver = SemanticVersion.parse(version.version); if (semver.compareTo(PARQUET_251_FIXED_VERSION) < 0) { - LOG.info("Ignoring statistics because this file was created prior to " + warnOnce("Ignoring statistics because this file was created prior to " + PARQUET_251_FIXED_VERSION - + ", see PARQUET-251" ); + + ", see PARQUET-251"); return true; } @@ -83,22 +87,30 @@ public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName } catch (RuntimeException e) { // couldn't parse the created_by field, log what went wrong, don't trust the stats, // but don't make this fatal. - warnParseError(createdBy, e); + warnParseErrorOnce(createdBy, e); return true; } catch (SemanticVersionParseException e) { // couldn't parse the created_by field, log what went wrong, don't trust the stats, // but don't make this fatal. - warnParseError(createdBy, e); + warnParseErrorOnce(createdBy, e); return true; } catch (VersionParseException e) { // couldn't parse the created_by field, log what went wrong, don't trust the stats, // but don't make this fatal. - warnParseError(createdBy, e); + warnParseErrorOnce(createdBy, e); return true; } } - private static void warnParseError(String createdBy, Throwable e) { - LOG.warn("Ignoring statistics because created_by could not be parsed (see PARQUET-251): " + createdBy, e); + private static void warnParseErrorOnce(String createdBy, Throwable e) { + if(!alreadyLogged.getAndSet(true)) { + LOG.warn("Ignoring statistics because created_by could not be parsed (see PARQUET-251): " + createdBy, e); + } + } + + private static void warnOnce(String message) { + if(!alreadyLogged.getAndSet(true)) { + LOG.warn(message); + } } } diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java index f0a9624669..3b0f8f0c96 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java @@ -128,7 +128,6 @@ public static Set getThriftClassNames(Map> fileMetad @Override public String toString() { - return "ThriftMetaData" + toExtraMetaData(); + return String.format("ThriftMetaData(thriftClassName: %s, descriptor: %s", thriftClassName, descriptor.toJSON()); } - } diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java index ec0f4ff245..a7c5922685 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java @@ -24,6 +24,7 @@ import java.util.List; import java.util.Map; +import org.apache.parquet.io.ParquetDecodingException; import org.apache.thrift.TException; import org.apache.thrift.protocol.TField; import org.apache.thrift.protocol.TList; @@ -432,11 +433,12 @@ public ByteBuffer readBinary() throws TException { class FieldEnumConverter extends PrimitiveConverter { private final List events; - - private Map enumLookup = new HashMap(); + private final Map enumLookup = new HashMap(); + private final ThriftField field; public FieldEnumConverter(List events, ThriftField field) { this.events = events; + this.field = field; final Iterable values = ((EnumType)field.getType()).getValues(); for (EnumValue enumValue : values) { enumLookup.put(Binary.fromString(enumValue.getName()), enumValue.getId()); @@ -445,7 +447,16 @@ public FieldEnumConverter(List events, ThriftField field) { @Override public void addBinary(final Binary value) { - final int id = enumLookup.get(value); + final Integer id = enumLookup.get(value); + + if (id == null) { + throw new ParquetDecodingException("Unrecognized enum value: " + + value.toStringUsingUTF8() + + " known values: " + + enumLookup + + " in " + this.field); + } + events.add(new ParquetProtocol("readI32() enum") { @Override public int readI32() throws TException { @@ -794,7 +805,7 @@ public ThriftRecordConverter(ThriftReader thriftReader, String name, MessageT this.thriftReader = thriftReader; this.protocol = new ParquetReadProtocol(); this.thriftType = thriftType; - MessageType fullSchema = new ThriftSchemaConverter().convert(thriftType); + MessageType fullSchema = new ThriftSchemaConverter().convertWithoutProjection(thriftType); missingRequiredFieldsInProjection = hasMissingRequiredFieldInGroupType(requestedParquetSchema, fullSchema); this.structConverter = new StructConverter(rootEvents, requestedParquetSchema, new ThriftField(name, (short)0, Requirement.REQUIRED, thriftType)); } diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java index 2c05c301b5..88effc50e0 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java @@ -76,16 +76,23 @@ class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor { private final FieldProjectionFilter fieldProjectionFilter; private final boolean doProjection; + private final boolean keepOneOfEachUnion; - private ThriftSchemaConvertVisitor(FieldProjectionFilter fieldProjectionFilter, boolean doProjection) { + private ThriftSchemaConvertVisitor(FieldProjectionFilter fieldProjectionFilter, boolean doProjection, boolean keepOneOfEachUnion) { this.fieldProjectionFilter = checkNotNull(fieldProjectionFilter, "fieldProjectionFilter"); this.doProjection = doProjection; + this.keepOneOfEachUnion = keepOneOfEachUnion; } + @Deprecated public static MessageType convert(StructType struct, FieldProjectionFilter filter) { + return convert(struct, filter, true); + } + + public static MessageType convert(StructType struct, FieldProjectionFilter filter, boolean keepOneOfEachUnion) { State state = new State(new FieldsPath(), REPEATED, "ParquetSchema"); - ConvertedField converted = struct.accept(new ThriftSchemaConvertVisitor(filter, true), state); + ConvertedField converted = struct.accept(new ThriftSchemaConvertVisitor(filter, true, keepOneOfEachUnion), state); if (!converted.isKeep()) { throw new ThriftProjectionException("No columns have been selected"); @@ -134,7 +141,7 @@ public ConvertedField visit(MapType mapType, State state) { if (doProjection) { ConvertedField fullConvKey = keyField .getType() - .accept(new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false), keyState); + .accept(new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false, keepOneOfEachUnion), keyState); if (!fullConvKey.asKeep().getType().equals(convertedKey.asKeep().getType())) { throw new ThriftProjectionException("Cannot select only a subset of the fields in a map key, " + @@ -160,7 +167,7 @@ public ConvertedField visit(MapType mapType, State state) { // keep only the key, not the value ConvertedField sentinelValue = - valueField.getType().accept(new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true), valueState); + valueField.getType().accept(new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true, keepOneOfEachUnion), valueState); Type mapField = mapType( state.repetition, @@ -181,7 +188,7 @@ private ConvertedField visitListLike(ThriftField listLike, State state, boolean if (isSet && doProjection) { ConvertedField fullConv = listLike .getType() - .accept(new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false), childState); + .accept(new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false, keepOneOfEachUnion), childState); if (!converted.asKeep().getType().equals(fullConv.asKeep().getType())) { throw new ThriftProjectionException("Cannot select only a subset of the fields in a set, " + "for path " + state.path); @@ -210,7 +217,7 @@ public ConvertedField visit(StructType structType, State state) { // special care is taken when converting unions, // because we are actually both converting + projecting in // one pass, and unions need special handling when projecting. - final boolean isUnion = isUnion(structType.getStructOrUnionType()); + final boolean needsToKeepOneOfEachUnion = keepOneOfEachUnion && isUnion(structType.getStructOrUnionType()); boolean hasSentinelUnionColumns = false; boolean hasNonSentinelUnionColumns = false; @@ -223,7 +230,7 @@ public ConvertedField visit(StructType structType, State state) { ConvertedField converted = child.getType().accept(this, childState); - if (isUnion && !converted.isKeep()) { + if (!converted.isKeep() && needsToKeepOneOfEachUnion) { // user is not keeping this "kind" of union, but we still need // to keep at least one of the primitives of this union around. // in order to know what "kind" of union each record is. @@ -232,7 +239,7 @@ public ConvertedField visit(StructType structType, State state) { // re-do the recursion, with a new projection filter that keeps only // the first primitive it encounters ConvertedField firstPrimitive = child.getType().accept( - new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true), childState); + new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true, keepOneOfEachUnion), childState); convertedChildren.add(firstPrimitive.asKeep().getType().withId(child.getFieldId())); hasSentinelUnionColumns = true; diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java index 95d998b78c..5879e945fa 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java @@ -62,6 +62,12 @@ public MessageType convert(StructType struct) { return messageType; } + public MessageType convertWithoutProjection(StructType struct) { + MessageType messageType = ThriftSchemaConvertVisitor.convert(struct, FieldProjectionFilter.ALL_COLUMNS, false); + fieldProjectionFilter.assertNoUnmatchedPatterns(); + return messageType; + } + public static > StructOrUnionType structOrUnionType(Class klass) { return TUnion.class.isAssignableFrom(klass) ? StructOrUnionType.UNION : StructOrUnionType.STRUCT; } From e26dc0c9b5647cfb38cefe487ba63c1404a42a47 Mon Sep 17 00:00:00 2001 From: Alex Levenson Date: Wed, 29 Jul 2015 21:04:17 -0700 Subject: [PATCH 2/5] Add tests --- .../apache/parquet/thrift/ThriftMetaData.java | 2 +- .../parquet/thrift/ThriftRecordConverter.java | 30 +++---- .../parquet/thrift/ThriftSchemaConverter.java | 2 +- .../parquet/thrift/TestThriftMetaData.java | 37 +++++++++ .../thrift/TestThriftRecordConverter.java | 83 +++++++++++++++++++ .../StructWithUnionV1NoStructOrUnionMeta.json | 49 +++++++++++ 6 files changed, 186 insertions(+), 17 deletions(-) create mode 100644 parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java create mode 100644 parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java create mode 100644 parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java index 3b0f8f0c96..a89f8d97c1 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java @@ -128,6 +128,6 @@ public static Set getThriftClassNames(Map> fileMetad @Override public String toString() { - return String.format("ThriftMetaData(thriftClassName: %s, descriptor: %s", thriftClassName, descriptor.toJSON()); + return String.format("ThriftMetaData(thriftClassName: %s, descriptor: %s)", thriftClassName, descriptor); } } diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java index a7c5922685..f2ef145e4e 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java @@ -63,7 +63,7 @@ */ public class ThriftRecordConverter extends RecordMaterializer { - final ParquetProtocol readFieldEnd = new ParquetProtocol("readFieldEnd()") { + final static ParquetProtocol readFieldEnd = new ParquetProtocol("readFieldEnd()") { @Override public void readFieldEnd() throws TException { } @@ -76,7 +76,7 @@ public void readFieldEnd() throws TException { * @author Julien Le Dem * */ - class PrimitiveFieldHandler extends PrimitiveConverter { + static class PrimitiveFieldHandler extends PrimitiveConverter { private final PrimitiveConverter delegate; private final List events; @@ -155,7 +155,7 @@ public void addLong(long value) { * @author Julien Le Dem * */ - class GroupFieldhandler extends GroupConverter { + static class GroupFieldhandler extends GroupConverter { private final GroupConverter delegate; private final List events; @@ -204,7 +204,7 @@ interface Counter { * @author Julien Le Dem * */ - class GroupCounter extends GroupConverter implements Counter { + static class GroupCounter extends GroupConverter implements Counter { private final GroupConverter delegate; private int count; @@ -247,7 +247,7 @@ public int getCount() { * @author Julien Le Dem * */ - class PrimitiveCounter extends PrimitiveConverter implements Counter { + static class PrimitiveCounter extends PrimitiveConverter implements Counter { private final PrimitiveConverter delegate; private int count; @@ -310,7 +310,7 @@ public int getCount() { * @author Julien Le Dem * */ - class FieldPrimitiveConverter extends PrimitiveConverter { + static class FieldPrimitiveConverter extends PrimitiveConverter { private final List events; private ThriftTypeID type; @@ -401,7 +401,7 @@ public long readI64() throws TException { * @author Julien Le Dem * */ - class FieldStringConverter extends PrimitiveConverter { + static class FieldStringConverter extends PrimitiveConverter { private final List events; @@ -430,7 +430,7 @@ public ByteBuffer readBinary() throws TException { * @author Julien Le Dem * */ - class FieldEnumConverter extends PrimitiveConverter { + static class FieldEnumConverter extends PrimitiveConverter { private final List events; private final Map enumLookup = new HashMap(); @@ -472,7 +472,7 @@ public int readI32() throws TException { * @author Julien Le Dem * */ - class MapConverter extends GroupConverter { + static class MapConverter extends GroupConverter { private final GroupCounter child; private final List mapEvents = new ArrayList(); @@ -534,7 +534,7 @@ public TMap readMapBegin() throws TException { * @author Julien Le Dem * */ - class MapKeyValueConverter extends GroupConverter { + static class MapKeyValueConverter extends GroupConverter { private Converter keyConverter; private Converter valueConverter; @@ -572,7 +572,7 @@ public void end() { * @author Julien Le Dem * */ - class SetConverter extends CollectionConverter { + static class SetConverter extends CollectionConverter { final ParquetProtocol readSetEnd = new ParquetProtocol("readSetEnd()") { @Override @@ -609,7 +609,7 @@ void collectionEnd() { * @author Julien Le Dem * */ - class ListConverter extends CollectionConverter { + static class ListConverter extends CollectionConverter { final ParquetProtocol readListEnd = new ParquetProtocol("readListEnd()") { @Override @@ -646,7 +646,7 @@ void collectionEnd() { * @author Julien Le Dem * */ - abstract class CollectionConverter extends GroupConverter { + static abstract class CollectionConverter extends GroupConverter { private final Converter child; private final Counter childCounter; @@ -707,7 +707,7 @@ public void end() { * @author Julien Le Dem * */ - class StructConverter extends GroupConverter { + static class StructConverter extends GroupConverter { private final int schemaSize; @@ -874,7 +874,7 @@ public GroupConverter getRootConverter() { return structConverter; } - private Converter newConverter(List events, Type type, ThriftField field) { + private static Converter newConverter(List events, Type type, ThriftField field) { switch (field.getType().getType()) { case LIST: return new ListConverter(events, type.asGroupType(), field); diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java index 5879e945fa..900abcfc9d 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java @@ -57,7 +57,7 @@ public MessageType convert(Class> thriftClass) { } public MessageType convert(StructType struct) { - MessageType messageType = ThriftSchemaConvertVisitor.convert(struct, fieldProjectionFilter); + MessageType messageType = ThriftSchemaConvertVisitor.convert(struct, fieldProjectionFilter, true); fieldProjectionFilter.assertNoUnmatchedPatterns(); return messageType; } diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java new file mode 100644 index 0000000000..87ac1d4c9e --- /dev/null +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java @@ -0,0 +1,37 @@ +package org.apache.parquet.thrift; + +import java.util.ArrayList; + +import org.apache.parquet.thrift.struct.ThriftField; +import org.apache.parquet.thrift.struct.ThriftType.StructType; +import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class TestThriftMetaData { + + /** + * Previously, ThriftMetaData.toString would try to instantiate thriftClassName, + * but there is no guarantee that that class is on the classpath, and it is in fact + * normal for that to be the case (for example, when a file was written with TBase objects + * but is being read with scrooge objects). + * + * See PARQUET-345 + */ + @Test + public void testToStringDoesNotThrow() { + + StructType descriptor = new StructType(new ArrayList(), StructOrUnionType.STRUCT); + ThriftMetaData tmd = new ThriftMetaData("non existent class!!!", descriptor); + assertEquals("ThriftMetaData(thriftClassName: non existent class!!!, descriptor: {\n" + + " \"id\" : \"STRUCT\",\n" + + " \"children\" : [ ],\n" + + " \"structOrUnionType\" : \"STRUCT\"\n" + + "})", tmd.toString()); + + tmd = new ThriftMetaData("non existent class!!!", null); + assertEquals("ThriftMetaData(thriftClassName: non existent class!!!, descriptor: null)", tmd.toString()); + + } +} diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java new file mode 100644 index 0000000000..e451389e99 --- /dev/null +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java @@ -0,0 +1,83 @@ +package org.apache.parquet.thrift; + +import java.io.File; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; + +import org.apache.parquet.Files; +import org.apache.parquet.Strings; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.thrift.ThriftRecordConverter.FieldEnumConverter; +import org.apache.parquet.thrift.struct.ThriftField; +import org.apache.parquet.thrift.struct.ThriftField.Requirement; +import org.apache.parquet.thrift.struct.ThriftType; +import org.apache.parquet.thrift.struct.ThriftType.EnumType; +import org.apache.parquet.thrift.struct.ThriftType.EnumValue; +import org.apache.parquet.thrift.struct.ThriftType.StructType; +import org.apache.parquet.thrift.test.compat.StructWithUnionV1; +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TProtocol; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class TestThriftRecordConverter { + @Test + public void testUnknownEnumThrowsGoodException() throws Exception { + EnumType et = new EnumType(Arrays.asList(new EnumValue(77, "hello"))); + ThriftField field = new ThriftField("name", (short) 1, Requirement.REQUIRED, et); + + ArrayList events = new ArrayList(); + + FieldEnumConverter conv = new FieldEnumConverter(events, field); + + conv.addBinary(Binary.fromString("hello")); + + assertEquals(1, events.size()); + assertEquals(77, events.get(0).readI32()); + + try { + conv.addBinary(Binary.fromString("FAKE_ENUM_VALUE")); + fail("this should throw"); + } catch (ParquetDecodingException e) { + assertEquals("Unrecognized enum value: FAKE_ENUM_VALUE known values: {Binary{\"hello\"}=77} in {\n" + + " \"name\" : \"name\",\n" + + " \"fieldId\" : 1,\n" + + " \"requirement\" : \"REQUIRED\",\n" + + " \"type\" : {\n" + + " \"id\" : \"ENUM\",\n" + + " \"values\" : [ {\n" + + " \"id\" : 77,\n" + + " \"name\" : \"hello\"\n" + + " } ]\n" + + " }\n" + + "}", e.getMessage()); + } + } + + @Test + public void constructorDoesNotRequireStructOrUnionTypeMeta() throws Exception { + String jsonWithNoStructOrUnionMeta = Strings.join( + Files.readAllLines( + new File("parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json"), + Charset.forName("UTF-8")), "\n"); + + StructType noStructOrUnionMeta = (StructType) ThriftType.fromJSON(jsonWithNoStructOrUnionMeta); + + // this used to throw, see PARQUET-346 + new ThriftRecordConverter( + new ThriftReader() { + @Override + public StructWithUnionV1 readOneRecord(TProtocol protocol) throws TException { + return null; + } + }, + "name", + new ThriftSchemaConverter().convert(StructWithUnionV1.class), + noStructOrUnionMeta + ); + } +} diff --git a/parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json b/parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json new file mode 100644 index 0000000000..ac42b76a6e --- /dev/null +++ b/parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json @@ -0,0 +1,49 @@ +{ + "id" : "STRUCT", + "children" : [ { + "name" : "name", + "fieldId" : 1, + "requirement" : "REQUIRED", + "type" : { + "id" : "STRING" + } + }, { + "name" : "aUnion", + "fieldId" : 2, + "requirement" : "REQUIRED", + "type" : { + "id" : "STRUCT", + "children" : [ { + "name" : "aString", + "fieldId" : 1, + "requirement" : "DEFAULT", + "type" : { + "id" : "STRUCT", + "children" : [ { + "name" : "s", + "fieldId" : 1, + "requirement" : "REQUIRED", + "type" : { + "id" : "STRING" + } + } ] + } + }, { + "name" : "aLong", + "fieldId" : 2, + "requirement" : "DEFAULT", + "type" : { + "id" : "STRUCT", + "children" : [ { + "name" : "l", + "fieldId" : 1, + "requirement" : "REQUIRED", + "type" : { + "id" : "I64" + } + } ] + } + } ] + } + } ] +} \ No newline at end of file From d9d5dad2aa41c5ae682cb53706ba82899f390232 Mon Sep 17 00:00:00 2001 From: Alex Levenson Date: Wed, 29 Jul 2015 21:33:53 -0700 Subject: [PATCH 3/5] add license headers --- .../parquet/thrift/TestThriftMetaData.java | 18 ++++++++++++++++++ .../thrift/TestThriftRecordConverter.java | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java index 87ac1d4c9e..e7f42ce248 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.apache.parquet.thrift; import java.util.ArrayList; diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java index e451389e99..fddea8524e 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.apache.parquet.thrift; import java.io.File; From 376343e48684f1ca4cdab979df77699671db08ae Mon Sep 17 00:00:00 2001 From: Alex Levenson Date: Wed, 29 Jul 2015 23:03:21 -0700 Subject: [PATCH 4/5] Fix test --- .../org/apache/parquet/thrift/TestThriftRecordConverter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java index fddea8524e..1619dd57e8 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java @@ -80,7 +80,7 @@ public void testUnknownEnumThrowsGoodException() throws Exception { public void constructorDoesNotRequireStructOrUnionTypeMeta() throws Exception { String jsonWithNoStructOrUnionMeta = Strings.join( Files.readAllLines( - new File("parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json"), + new File("src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json"), Charset.forName("UTF-8")), "\n"); StructType noStructOrUnionMeta = (StructType) ThriftType.fromJSON(jsonWithNoStructOrUnionMeta); From 9b5cb0ee16dba4d02233d0bf92cfe6153df6b262 Mon Sep 17 00:00:00 2001 From: Alex Levenson Date: Thu, 30 Jul 2015 18:31:35 -0700 Subject: [PATCH 5/5] Add comments, cleanup some minor use of ThriftSchemaConverter --- .../scrooge/ScroogeStructConverterTest.java | 5 ++--- .../thrift/AbstractThriftWriteSupport.java | 3 +-- .../parquet/hadoop/thrift/TBaseWriteSupport.java | 3 +-- .../hadoop/thrift/ThriftBytesWriteSupport.java | 5 ++--- .../parquet/thrift/ThriftRecordConverter.java | 2 +- .../parquet/thrift/ThriftSchemaConverter.java | 16 ++++++++++++---- .../thrift/struct/CompatibilityRunner.java | 2 +- .../parquet/thrift/TestProtocolReadToWrite.java | 14 +++++++------- .../thrift/TestThriftToPigCompatibility.java | 2 +- .../thrift/struct/CompatibilityCheckerTest.java | 2 +- 10 files changed, 29 insertions(+), 25 deletions(-) diff --git a/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java b/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java index 648634c3a3..8acbf965f7 100644 --- a/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java +++ b/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java @@ -61,15 +61,14 @@ public class ScroogeStructConverterTest { */ private void shouldConvertConsistentlyWithThriftStructConverter(Class scroogeClass) throws ClassNotFoundException { Class> thriftClass = (Class>)Class.forName(scroogeClass.getName().replaceFirst("org.apache.parquet.scrooge.test", "org.apache.parquet.thrift.test")); - ThriftType.StructType structFromThriftSchemaConverter = new ThriftSchemaConverter().toStructType(thriftClass); + ThriftType.StructType structFromThriftSchemaConverter = ThriftSchemaConverter.toStructType(thriftClass); ThriftType.StructType structFromScroogeSchemaConverter = new ScroogeStructConverter().convert(scroogeClass); assertEquals(toParquetSchema(structFromThriftSchemaConverter), toParquetSchema(structFromScroogeSchemaConverter)); } private MessageType toParquetSchema(ThriftType.StructType struct) { - ThriftSchemaConverter sc = new ThriftSchemaConverter(); - return sc.convert(struct); + return ThriftSchemaConverter.convertWithoutProjection(struct); } @Test diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java index 91350cc2f7..5f210d3280 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java @@ -84,8 +84,7 @@ protected void init(Class thriftClass) { this.thriftClass = thriftClass; this.thriftStruct = getThriftStruct(); - ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter(); - this.schema = thriftSchemaConverter.convert(thriftStruct); + this.schema = ThriftSchemaConverter.convertWithoutProjection(thriftStruct); final Map extraMetaData = new ThriftMetaData(thriftClass.getName(), thriftStruct).toExtraMetaData(); // adding the Pig schema as it would have been mapped from thrift diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java index 2b3b3109ee..b45727829d 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java @@ -47,8 +47,7 @@ public TBaseWriteSupport(Class thriftClass) { @Override protected StructType getThriftStruct() { - ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter(); - return thriftSchemaConverter.toStructType((Class>)thriftClass); + return ThriftSchemaConverter.toStructType(thriftClass); } @Override diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java index 6caecbc3c4..6db769ecb7 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java @@ -108,9 +108,8 @@ public WriteContext init(Configuration configuration) { } else { thriftClass = TBaseWriteSupport.getThriftClass(configuration); } - ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter(); - this.thriftStruct = thriftSchemaConverter.toStructType(thriftClass); - this.schema = thriftSchemaConverter.convert(thriftStruct); + this.thriftStruct = ThriftSchemaConverter.toStructType(thriftClass); + this.schema = ThriftSchemaConverter.convertWithoutProjection(thriftStruct); if (buffered) { readToWrite = new BufferedProtocolReadToWrite(thriftStruct, errorHandler); } else { diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java index f2ef145e4e..e18b0e6d17 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java @@ -805,7 +805,7 @@ public ThriftRecordConverter(ThriftReader thriftReader, String name, MessageT this.thriftReader = thriftReader; this.protocol = new ParquetReadProtocol(); this.thriftType = thriftType; - MessageType fullSchema = new ThriftSchemaConverter().convertWithoutProjection(thriftType); + MessageType fullSchema = ThriftSchemaConverter.convertWithoutProjection(thriftType); missingRequiredFieldsInProjection = hasMissingRequiredFieldInGroupType(requestedParquetSchema, fullSchema); this.structConverter = new StructConverter(rootEvents, requestedParquetSchema, new ThriftField(name, (short)0, Requirement.REQUIRED, thriftType)); } diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java index 900abcfc9d..98820c37ee 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java @@ -56,16 +56,24 @@ public MessageType convert(Class> thriftClass) { return convert(toStructType(thriftClass)); } + /** + * struct is assumed to contain valid structOrUnionType metadata when used with this method. + * This method may throw if structOrUnionType is unknown. + * + * Use convertWithoutProjection below to convert a StructType to MessageType + */ public MessageType convert(StructType struct) { MessageType messageType = ThriftSchemaConvertVisitor.convert(struct, fieldProjectionFilter, true); fieldProjectionFilter.assertNoUnmatchedPatterns(); return messageType; } - public MessageType convertWithoutProjection(StructType struct) { - MessageType messageType = ThriftSchemaConvertVisitor.convert(struct, FieldProjectionFilter.ALL_COLUMNS, false); - fieldProjectionFilter.assertNoUnmatchedPatterns(); - return messageType; + /** + * struct is not required to have known structOrUnionType, which is useful + * for converting a StructType from an (older) file schema to a MessageType + */ + public static MessageType convertWithoutProjection(StructType struct) { + return ThriftSchemaConvertVisitor.convert(struct, FieldProjectionFilter.ALL_COLUMNS, false); } public static > StructOrUnionType structOrUnionType(Class klass) { diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java index 0d05e76c20..b8d577d4a6 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java @@ -95,7 +95,7 @@ private static void generateJson(LinkedList arguments) throws ClassNotFo String className = arguments.pollFirst(); String storedPath = arguments.pollFirst(); File storeDir = new File(storedPath); - ThriftType.StructType structType = new ThriftSchemaConverter().toStructType((Class>) Class.forName(className)); + ThriftType.StructType structType = ThriftSchemaConverter.toStructType((Class>) Class.forName(className)); ObjectMapper mapper = new ObjectMapper(); String fileName = catName + ".json"; diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java index e7be3eabaa..ba27166ae3 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java @@ -92,7 +92,7 @@ public void testMapSet() throws Exception { private void writeReadCompare(TBase a) throws TException, InstantiationException, IllegalAccessException { - ProtocolPipe[] pipes = {new ProtocolReadToWrite(), new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType((Class>)a.getClass()))}; + ProtocolPipe[] pipes = {new ProtocolReadToWrite(), new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType((Class>)a.getClass()))}; for (ProtocolPipe p : pipes) { final ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); @@ -110,7 +110,7 @@ public void testIncompatibleSchemaRecord() throws Exception { //handler will rethrow the exception for verifying purpose CountingErrorHandler countingHandler = new CountingErrorHandler(); - BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(AddressBook.class), countingHandler); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(AddressBook.class), countingHandler); final ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); @@ -134,7 +134,7 @@ public void testIncompatibleSchemaRecord() throws Exception { @Test public void testUnrecognizedUnionMemberSchema() throws Exception { CountingErrorHandler countingHandler = new CountingErrorHandler(); - BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructWithUnionV1.class), countingHandler); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructWithUnionV1.class), countingHandler); final ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); StructWithUnionV1 validUnion = new StructWithUnionV1("a valid struct", UnionV1.aLong(new ALong(17L))); @@ -164,7 +164,7 @@ public void testUnrecognizedUnionMemberSchema() throws Exception { @Test public void testUnionWithExtraOrNoValues() throws Exception { CountingErrorHandler countingHandler = new CountingErrorHandler(); - BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructWithUnionV2.class), countingHandler); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructWithUnionV2.class), countingHandler); ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); @@ -229,7 +229,7 @@ public void testUnionWithExtraOrNoValues() throws Exception { @Test public void testEnumMissingSchema() throws Exception { CountingErrorHandler countingHandler = new CountingErrorHandler(); - BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructWithEnum.class), countingHandler); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructWithEnum.class), countingHandler); final ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); StructWithMoreEnum enumDefinedInOldDefinition = new StructWithMoreEnum(NumberEnumWithMoreValue.THREE); @@ -268,7 +268,7 @@ public void handleFieldIgnored(TField field) { fieldIgnoredCount++; } }; - BufferedProtocolReadToWrite structForRead = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructV3.class), countingHandler); + BufferedProtocolReadToWrite structForRead = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructV3.class), countingHandler); //Data has an extra field of type struct final ByteArrayOutputStream in = new ByteArrayOutputStream(); @@ -306,7 +306,7 @@ public void handleFieldIgnored(TField field) { } }; - BufferedProtocolReadToWrite structForRead = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructWithIndexStartsFrom4.class), countingHandler); + BufferedProtocolReadToWrite structForRead = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructWithIndexStartsFrom4.class), countingHandler); //Data has an extra field of type struct final ByteArrayOutputStream in = new ByteArrayOutputStream(); diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java index f45f1759e7..c320f71378 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java @@ -154,7 +154,7 @@ public static > void validateSameTupleAsEB(T o) throws TExc final Class class1 = (Class) o.getClass(); final MessageType schema = thriftSchemaConverter.convert(class1); - final StructType structType = thriftSchemaConverter.toStructType(class1); + final StructType structType = ThriftSchemaConverter.toStructType(class1); final ThriftToPig thriftToPig = new ThriftToPig(class1); final Schema pigSchema = thriftToPig.toSchema(); final TupleRecordMaterializer tupleRecordConverter = new TupleRecordMaterializer(schema, pigSchema, true); diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java index f07aa9dab5..df034bafc1 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java @@ -116,7 +116,7 @@ public void testEmptyStruct() { } private ThriftType.StructType struct(Class thriftClass) { - return new ThriftSchemaConverter().toStructType(thriftClass); + return ThriftSchemaConverter.toStructType(thriftClass); } private CompatibilityReport getCompatibilityReport(Class oldClass, Class newClass) {