Skip to content

Parquet-mr: Protobuf 3 support #1981

@asfimport

Description

@asfimport

Does parquet-mr support Protobuf version 3? I've applied the following patch and the tests are failing mostly due to optional vs required.

diff --git a/parquet-protobuf/pom.xml b/parquet-protobuf/pom.xml
index b3e4e50..aa67423 100644
--- a/parquet-protobuf/pom.xml
+++ b/parquet-protobuf/pom.xml
@@ -31,7 +31,7 @@
 
   <properties>
     <elephant-bird.version>4.4</elephant-bird.version>
-    <protobuf.version>2.5.0</protobuf.version>
+    <protobuf.version>3.0.0-beta-4</protobuf.version>
   </properties>
 
 
diff --git a/parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoInputOutputFormatTest.java b/parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoInputOutputFormatTest.java
index 5c6ebca..7e2557f 100644
--- a/parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoInputOutputFormatTest.java
+++ b/parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoInputOutputFormatTest.java
@@ -88,7 +88,7 @@ public class ProtoInputOutputFormatTest {
 
 
     //test that only requested fields were deserialized
-    assertTrue(readDocument.hasDocId());
+    assertTrue(readDocument.getDocId() == 12345);
     assertTrue("Found data outside projection.", readDocument.getNameCount() == 0);
   }
 
diff --git a/parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoRecordConverterTest.java b/parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoRecordConverterTest.java
index 5318bd2..1cbb972 100644
--- a/parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoRecordConverterTest.java
+++ b/parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoRecordConverterTest.java
@@ -183,16 +183,16 @@ public class ProtoRecordConverterTest {
     TestProtobuf.InnerMessage third = result.getInner(2);
 
     assertEquals("First inner", first.getOne());
-    assertFalse(first.hasTwo());
-    assertFalse(first.hasThree());
+    assertEquals(first.getTwo(), "");
+    assertEquals(first.getThree(), "");
 
     assertEquals("Second inner", second.getTwo());
-    assertFalse(second.hasOne());
-    assertFalse(second.hasThree());
+    assertEquals(second.getOne(), "");
+    assertEquals(second.getThree(), "");
 
     assertEquals("Third inner", third.getThree());
-    assertFalse(third.hasOne());
-    assertFalse(third.hasTwo());
+    assertEquals(third.getOne(), "");
+    assertEquals(third.getTwo(), "");
   }
 
 
diff --git a/parquet-protobuf/src/test/resources/TestProtobuf.proto b/parquet-protobuf/src/test/resources/TestProtobuf.proto
index afa0f63..caf7926 100644
--- a/parquet-protobuf/src/test/resources/TestProtobuf.proto
+++ b/parquet-protobuf/src/test/resources/TestProtobuf.proto
@@ -9,7 +9,7 @@
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
-// Unless required by applicable law or agreed to in writing,
+// Unless by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
@@ -17,6 +17,8 @@
 // under the License.
 //
 
+syntax = "proto3";
+
 package TestProtobuf;
 
 option java_package = "org.apache.parquet.proto.test";
@@ -25,17 +27,18 @@ option java_package = "org.apache.parquet.proto.test";
 // messages but groups were deprecated.
 
 message Document {
-    required int64 DocId = 1;
-    optional Links links = 32;
-    repeated group Name = 24 {
+    int64 DocId = 1;
+    Links links = 32;
+    message Name  {
         repeated Language name = 4;
-        optional string url = 5;
+        string url = 5;
     }
+    repeated Name name = 24;
 }
 
 message Language {
-    required string code = 12;
-    optional string Country = 14;
+    string code = 12;
+    string Country = 14;
 }
 
 message Links {
@@ -47,42 +50,43 @@ message Links {
 // begin - protocol buffers for ProtoSchemaConverterTest
 
  message SchemaConverterSimpleMessage {
-     optional int32 someId = 3;
+     int32 someId = 3;
  }
 
  message SchemaConverterAllDatatypes {
-     optional double optionalDouble = 1;
-     optional float optionalFloat = 2;
-     optional int32 optionalInt32 = 3;
-     optional int64 optionalInt64 = 4;
-     optional uint32 optionalUInt32 = 5;
-     optional uint64 optionalUInt64 = 6;
-     optional sint32 optionalSInt32 = 7;
-     optional sint64 optionalSInt64 = 8;
-     optional fixed32 optionalFixed32 = 9;
-     optional fixed64 optionalFixed64 = 10;
-     optional sfixed32 optionalSFixed32 = 11;
-     optional sfixed64 optionalSFixed64 = 12;
-     optional bool optionalBool = 13;
-     optional string optionalString = 14;
-     optional bytes optionalBytes = 15;
-     optional SchemaConverterSimpleMessage optionalMessage = 16;
-     optional group PbGroup  = 17 {
-       optional int32 groupInt = 2;
+     double optionalDouble = 1;
+     float optionalFloat = 2;
+     int32 optionalInt32 = 3;
+     int64 optionalInt64 = 4;
+     uint32 optionalUInt32 = 5;
+     uint64 optionalUInt64 = 6;
+     sint32 optionalSInt32 = 7;
+     sint64 optionalSInt64 = 8;
+     fixed32 optionalFixed32 = 9;
+     fixed64 optionalFixed64 = 10;
+     sfixed32 optionalSFixed32 = 11;
+     sfixed64 optionalSFixed64 = 12;
+     bool optionalBool = 13;
+     string optionalString = 14;
+     bytes optionalBytes = 15;
+     SchemaConverterSimpleMessage optionalMessage = 16;
+     message PbGroup {
+       int32 groupInt = 2;
      }
+     PbGroup pbGroup = 17;
     enum TestEnum {
         FIRST = 0;
         SECOND = 1;
     }
-    optional TestEnum optionalEnum = 18;
+    TestEnum optionalEnum = 18;
  }
 
  message SchemaConverterRepetition {
-     optional int32 optionalPrimitive = 1;
-     required int32 requiredPrimitive = 2;
+     int32 optionalPrimitive = 1;
+     int32 requiredPrimitive = 2;
      repeated int32 repeatedPrimitive = 3;
-     optional SchemaConverterSimpleMessage optionalMessage = 7;
-     required SchemaConverterSimpleMessage requiredMessage = 8;
+     SchemaConverterSimpleMessage optionalMessage = 7;
+     SchemaConverterSimpleMessage requiredMessage = 8;
      repeated SchemaConverterSimpleMessage repeatedMessage = 9;
  }
 
@@ -92,22 +96,22 @@ message Links {
 //begin protocol buffers for ProtoInputOutputFormatTest
 
 message InputOutputMsgFormat {
-    optional int32 someId = 3;
+    int32 someId = 3;
 }
 
 message IOFormatMessage {
-    optional double optionalDouble = 1;
+    double optionalDouble = 1;
     repeated string repeatedString = 2;
-    optional InputOutputMsgFormat msg = 3;
+    InputOutputMsgFormat msg = 3;
  }
 
 //end protocol buffers for ProtoInputOutputFormatTest
 
 
 message InnerMessage {
-    optional string one = 1;
-    optional string two = 2;
-    optional string three = 3;
+    string one = 1;
+    string two = 2;
+    string three = 3;
 }
 
 message TopMessage {
@@ -115,7 +119,7 @@ message TopMessage {
 }
 
 message MessageA {
-    optional InnerMessage inner = 123;
+    InnerMessage inner = 123;
 }
 
 message RepeatedIntMessage {
@@ -129,11 +133,11 @@ message HighIndexMessage {
 //custom proto class - ProtoInputOutputFormatTest
 
 message FirstCustomClassMessage {
-    optional string string = 11;
+    string string = 11;
 }
 
 message SecondCustomClassMessage {
-    optional string string = 11;
+    string string = 11;
 }
 
 //please place your unit test Protocol Buffer definitions here.

Reporter: Wael Nasreddine / @kalbasit
Assignee: Wael Nasreddine / @kalbasit

Note: This issue was originally created as PARQUET-665. Please see the migration documentation for further details.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions