From 42187d7f8cefca61ea4f6f8d35d944143226cbc8 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 15 Feb 2025 00:55:11 +0000 Subject: [PATCH 01/89] Avro producers - interface and base class --- .../avro/producers/BaseAvroProducer.java | 56 +++++++++++++++++++ .../adapter/avro/producers/Producer.java | 51 +++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java new file mode 100644 index 0000000000..0102a52865 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import org.apache.arrow.vector.FieldVector; + +/** + * Base class for avro producers. + * + * @param vector type. 
+ */ +public abstract class BaseAvroProducer implements Producer { + + protected T vector; + protected int currentIndex; + + /** + * Constructs a base avro producer. + * + * @param vector the vector to produce. + */ + protected BaseAvroProducer(T vector) { + this.vector = vector; + } + + @Override + public void close() throws Exception { + vector.close(); + } + + @Override + public boolean resetValueVector(T vector) { + this.vector = vector; + this.currentIndex = 0; + return true; + } + + @Override + public FieldVector getVector() { + return vector; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java new file mode 100644 index 0000000000..897ae609da --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.adapter.avro.producers; + +import org.apache.arrow.vector.FieldVector; +import org.apache.avro.io.Encoder; + +import java.io.IOException; + +public interface Producer extends AutoCloseable { + + /** + * Produce a specific type value from the vector and write it to avro encoder. + * + * @param encoder avro encoder to write data + * @throws IOException on error + */ + void produce(Encoder encoder) throws IOException; + + /** + * Close this produce + */ + @Override + void close() throws Exception; + + /** + * Reset the vector within producer + * + * @return true if reset is successful, false if reset is not needed. + */ + boolean resetValueVector(T vector); + + /** + * Get the vector within the producer. + */ + FieldVector getVector(); +} From c97c968b0fab6087cf19e53871f244a5c2ffec61 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 15 Feb 2025 00:55:34 +0000 Subject: [PATCH 02/89] Avro producers - basic types --- .../avro/producers/AvroBooleanProducer.java | 44 +++++++++++++++++ .../avro/producers/AvroBytesProducer.java | 48 +++++++++++++++++++ .../avro/producers/AvroDoubleProducer.java | 41 ++++++++++++++++ .../avro/producers/AvroFixedProducer.java | 48 +++++++++++++++++++ .../avro/producers/AvroFloatProducer.java | 41 ++++++++++++++++ .../avro/producers/AvroIntProducer.java | 41 ++++++++++++++++ .../avro/producers/AvroLongProducer.java | 41 ++++++++++++++++ .../avro/producers/AvroNullProducer.java | 42 ++++++++++++++++ .../avro/producers/AvroStringProducer.java | 47 ++++++++++++++++++ 9 files changed, 393 insertions(+) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java create mode 100644 
adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java new file mode 100644 index 0000000000..c6af1558d6 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; + +import org.apache.arrow.adapter.avro.consumers.BaseAvroConsumer; +import org.apache.arrow.vector.BitVector; +import org.apache.avro.io.Decoder; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces boolean values from a {@link BitVector}, + * writes data to an Avro encoder. + */ +public class AvroBooleanProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroBooleanProducer. + */ + public AvroBooleanProducer(BitVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + int bitValue = vector.get(currentIndex++); + encoder.writeBoolean(bitValue != 0); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java new file mode 100644 index 0000000000..c7c7fe6f9e --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces byte array values from a {@link VarBinaryVector}, + * writes data to an Avro encoder. + */ +public class AvroBytesProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroBytesProducer. + */ + public AvroBytesProducer(VarBinaryVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + // The nio ByteBuffer is created once per call, but underlying data is not copied + long offset = vector.getStartOffset(currentIndex); + int length = vector.getEndOffset(currentIndex); + ByteBuffer nioBuffer = vector.getDataBuffer().nioBuffer(offset, length); + encoder.writeBytes(nioBuffer); + currentIndex++; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java new file mode 100644 index 0000000000..8b85e7ec92 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; + +import org.apache.arrow.vector.Float8Vector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces double values from a {@link Float8Vector}, + * writes data to an Avro encoder. + */ +public class AvroDoubleProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroDoubleProducer. + */ + public AvroDoubleProducer(Float8Vector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + encoder.writeDouble(vector.get(currentIndex++)); + } +} \ No newline at end of file diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java new file mode 100644 index 0000000000..77b4d46275 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces fixed-size binary values from a {@link FixedSizeBinaryVector}, + * writes data to an Avro encoder. + */ +public class AvroFixedProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroFixedProducer. + */ + public AvroFixedProducer(FixedSizeBinaryVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + // The nio ByteBuffer is created once per call, but underlying data is not copied + long offset = (long) currentIndex * vector.getByteWidth(); + int length = vector.getByteWidth(); + ByteBuffer nioBuffer = vector.getDataBuffer().nioBuffer(offset, length); + encoder.writeBytes(nioBuffer); + currentIndex++; + } +} \ No newline at end of file diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java new file mode 100644 index 0000000000..e36f5ecb76 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; + +import org.apache.arrow.vector.Float4Vector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces float values from a {@link Float4Vector}, + * writes data to an Avro encoder. + */ +public class AvroFloatProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroFloatProducer. + */ + public AvroFloatProducer(Float4Vector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + encoder.writeFloat(vector.get(currentIndex++)); + } +} \ No newline at end of file diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java new file mode 100644 index 0000000000..20fc97a617 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import org.apache.arrow.vector.IntVector; +import org.apache.avro.io.Encoder; + +import java.io.IOException; + +/** + * Producer that produces int values from an {@link IntVector}, + * writes data to an avro encoder. + */ +public class AvroIntProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroIntProducer. + */ + public AvroIntProducer(IntVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + encoder.writeInt(vector.get(currentIndex++)); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java new file mode 100644 index 0000000000..02ba7542f8 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; + +import org.apache.arrow.vector.BigIntVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces long values from a {@link BigIntVector}, + * writes data to an Avro encoder. + */ +public class AvroLongProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroLongProducer. + */ + public AvroLongProducer(BigIntVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + encoder.writeLong(vector.get(currentIndex++)); + } +} \ No newline at end of file diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java new file mode 100644 index 0000000000..7d6771ac06 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; + +import org.apache.arrow.vector.NullVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces null values from a {@link NullVector}, + * writes data to an Avro encoder. + */ +public class AvroNullProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroNullProducer. + */ + public AvroNullProducer(NullVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + encoder.writeNull(); + currentIndex++; + } +} \ No newline at end of file diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java new file mode 100644 index 0000000000..db3dc78b07 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.arrow.vector.VarCharVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces string values from a {@link VarCharVector}, + * writes data to an Avro encoder. + */ +public class AvroStringProducer extends BaseAvroProducer { + + /** + * Instantiate an AvroStringProducer. + */ + public AvroStringProducer(VarCharVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + // The nio ByteBuffer is created once per call, but underlying data is not copied + long offset = vector.getStartOffset(currentIndex); + int length = vector.getEndOffset(currentIndex); + ByteBuffer nioBuffer = vector.getDataBuffer().nioBuffer(offset, length); + encoder.writeBytes(nioBuffer); + currentIndex++; + } +} \ No newline at end of file From 2b4d6eae2e58f57571f929ff77ecce7d52930634 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 15 Feb 2025 01:34:22 +0000 Subject: [PATCH 03/89] Avro producers - logical types --- .../producers/logical/AvroDateProducer.java | 41 ++++++++++ .../logical/AvroDecimalProducer.java | 79 +++++++++++++++++++ .../logical/AvroTimeMicroProducer.java | 40 ++++++++++ .../logical/AvroTimeMillisProducer.java | 40 ++++++++++ .../logical/AvroTimestampMicroProducer.java | 40 ++++++++++ .../logical/AvroTimestampMillisProducer.java | 40 ++++++++++ 6 files changed, 280 insertions(+) create mode 100644 
adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMillisProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java new file mode 100644 index 0000000000..4ca6c53693 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; + +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.vector.DateDayVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces date values from a {@link DateDayVector}, + * writes data to an Avro encoder. + */ +public class AvroDateProducer extends BaseAvroProducer { + + /** Instantiate an AvroDateProducer. */ + public AvroDateProducer(DateDayVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + encoder.writeInt(vector.get(currentIndex++)); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java new file mode 100644 index 0000000000..da53f7d7da --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; + +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.DecimalVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces decimal values from a {@link DecimalVector}, + * writes data to an Avro encoder. + */ +public abstract class AvroDecimalProducer extends BaseAvroProducer { + + /** Instantiate an AvroDecimalProducer. */ + public AvroDecimalProducer(DecimalVector vector) { + super(vector); + } + + /** Producer for decimal logical type with original bytes type. */ + public static class BytesDecimalProducer extends AvroDecimalProducer { + + private final byte[] reuseBytes; + + /** Instantiate a BytesDecimalConsumer. */ + public BytesDecimalProducer(DecimalVector vector) { + super(vector); + Preconditions.checkArgument(vector.getTypeWidth() <= 16, "Decimal bytes length should <= 16."); + reuseBytes = new byte[vector.getTypeWidth()]; + } + + @Override + public void produce(Encoder encoder) throws IOException { + long offset = (long) currentIndex * vector.getTypeWidth(); + vector.getDataBuffer().getBytes(offset, reuseBytes); + encoder.writeBytes(reuseBytes); + currentIndex++; + } + } + + /** Producer for decimal logical type with original fixed type. */ + public static class FixedDecimalProducer extends AvroDecimalProducer { + + private final byte[] reuseBytes; + + /** Instantiate a FixedDecimalConsumer. 
*/ + public FixedDecimalProducer(DecimalVector vector, int size) { + super(vector); + Preconditions.checkArgument(size <= 16, "Decimal bytes length should <= 16."); + reuseBytes = new byte[vector.getTypeWidth()]; + } + + @Override + public void produce(Encoder encoder) throws IOException { + long offset = (long) currentIndex * vector.getTypeWidth(); + vector.getDataBuffer().getBytes(offset, reuseBytes); + encoder.writeFixed(reuseBytes); + currentIndex++; + } + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java new file mode 100644 index 0000000000..f765b8285a --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces time (microseconds) values from a {@link TimeMicroVector}, + * writes data to an Avro encoder. + */ +public class AvroTimeMicroProducer extends BaseAvroProducer { + + /** Instantiate an AvroTimeMicroProducer. */ + public AvroTimeMicroProducer(TimeMicroVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + encoder.writeLong(vector.get(currentIndex++)); + } +} \ No newline at end of file diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java new file mode 100644 index 0000000000..59b781c636 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.arrow.adapter.avro.producers.logical;

import java.io.IOException;
import org.apache.arrow.adapter.avro.producers.BaseAvroProducer;
import org.apache.arrow.vector.TimeMilliVector;
import org.apache.avro.io.Encoder;

/**
 * Producer that reads time-of-day values with millisecond precision from a
 * {@link TimeMilliVector} and writes them to an Avro encoder as ints.
 */
public class AvroTimeMillisProducer extends BaseAvroProducer<TimeMilliVector> {

  /** Instantiate an AvroTimeMillisProducer. */
  public AvroTimeMillisProducer(TimeMilliVector vector) {
    super(vector);
  }

  @Override
  public void produce(Encoder encoder) throws IOException {
    // Emit the value at the current position, then advance by one
    int value = vector.get(currentIndex);
    encoder.writeInt(value);
    currentIndex++;
  }
}
package org.apache.arrow.adapter.avro.producers.logical;

import java.io.IOException;
import org.apache.arrow.adapter.avro.producers.BaseAvroProducer;
import org.apache.arrow.vector.TimeStampMicroVector;
import org.apache.avro.io.Encoder;

/**
 * Producer that produces timestamp (microseconds) values from a {@link TimeStampMicroVector},
 * writes data to an Avro encoder.
 *
 * <p>Values are written as Avro longs. NOTE(review): whether these are zone-naive
 * (local-timestamp-micros) or UTC-based (timestamp-micros) is not visible here — confirm
 * against the Avro schema generation.
 */
public class AvroTimestampMicroProducer extends BaseAvroProducer<TimeStampMicroVector> {

  /** Instantiate an AvroTimestampMicroProducer. */
  public AvroTimestampMicroProducer(TimeStampMicroVector vector) {
    super(vector);
  }

  @Override
  public void produce(Encoder encoder) throws IOException {
    // Emits the value at the current position and advances the position by one
    encoder.writeLong(vector.get(currentIndex++));
  }
}
package org.apache.arrow.adapter.avro.producers.logical;

import java.io.IOException;
import org.apache.arrow.adapter.avro.producers.BaseAvroProducer;
import org.apache.arrow.vector.TimeStampMilliVector;
import org.apache.avro.io.Encoder;

/**
 * Producer that produces timestamp (milliseconds) values from a {@link TimeStampMilliVector},
 * writes data to an Avro encoder.
 *
 * <p>Values are written as Avro longs. NOTE(review): whether these are zone-naive
 * (local-timestamp-millis) or UTC-based (timestamp-millis) is not visible here — confirm
 * against the Avro schema generation.
 */
public class AvroTimestampMillisProducer extends BaseAvroProducer<TimeStampMilliVector> {

  /** Instantiate an AvroTimestampMillisProducer. */
  public AvroTimestampMillisProducer(TimeStampMilliVector vector) {
    super(vector);
  }

  @Override
  public void produce(Encoder encoder) throws IOException {
    // Emits the value at the current position and advances the position by one
    encoder.writeLong(vector.get(currentIndex++));
  }
}
package org.apache.arrow.adapter.avro.producers;

import java.io.IOException;
import java.util.List;

import org.apache.arrow.util.AutoCloseables;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.avro.io.Encoder;

/** Composite producer which holds all producers. It manages the produce and cleanup process. */
public class CompositeAvroProducer implements AutoCloseable {

  // One producer per top-level field, in field order
  private final List<Producer<? extends FieldVector>> producers;

  public CompositeAvroProducer(List<Producer<? extends FieldVector>> producers) {
    this.producers = producers;
  }

  public List<Producer<? extends FieldVector>> getProducers() {
    return producers;
  }

  /** Produce encoder data. Writes one row: each producer emits its current value in turn. */
  public void produce(Encoder encoder) throws IOException {
    for (Producer<? extends FieldVector> producer : producers) {
      producer.produce(encoder);
    }
  }

  /** Reset the vectors of the producers with the given {@link VectorSchemaRoot}. */
  @SuppressWarnings({"unchecked", "rawtypes"})
  public void resetProducerVectors(VectorSchemaRoot root) {
    // This method assumes that the VSR matches the constructed set of producers
    int index = 0;
    for (Producer producer : producers) {
      // Only move on to the next field vector when the producer accepted the reset;
      // a producer that reports false is re-offered the same vector's successor position
      if (producer.resetValueVector(root.getFieldVectors().get(index))) {
        index++;
      }
    }
  }

  @Override
  public void close() {
    // clean up: closes every held producer; failures are wrapped as unchecked
    try {
      AutoCloseables.close(producers);
    } catch (Exception e) {
      throw new RuntimeException("Error occurs in close.", e);
    }
  }
}
a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java @@ -31,7 +31,7 @@ public abstract class AvroDecimalProducer extends BaseAvroProducer { /** Instantiate an AvroDecimalProducer. */ - public AvroDecimalProducer(DecimalVector vector) { + protected AvroDecimalProducer(DecimalVector vector) { super(vector); } From 01d7c52c70a6b3b7d9b447135e0c60e978f7f47a Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 15 Feb 2025 01:51:31 +0000 Subject: [PATCH 07/89] Use byte[] for fixed producer, matching fixed consumer --- .../adapter/avro/producers/AvroFixedProducer.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java index 77b4d46275..b4d5a8ca8d 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java @@ -18,7 +18,6 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; -import java.nio.ByteBuffer; import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.avro.io.Encoder; @@ -29,20 +28,21 @@ */ public class AvroFixedProducer extends BaseAvroProducer { + private final byte[] reuseBytes; + /** * Instantiate an AvroFixedProducer. 
*/ public AvroFixedProducer(FixedSizeBinaryVector vector) { super(vector); + reuseBytes = new byte[vector.getByteWidth()]; } @Override public void produce(Encoder encoder) throws IOException { - // The nio ByteBuffer is created once per call, but underlying data is not copied long offset = (long) currentIndex * vector.getByteWidth(); - int length = vector.getByteWidth(); - ByteBuffer nioBuffer = vector.getDataBuffer().nioBuffer(offset, length); - encoder.writeBytes(nioBuffer); + vector.getDataBuffer().getBytes(offset, reuseBytes); + encoder.writeBytes(reuseBytes); currentIndex++; } } \ No newline at end of file From 560f49f2a07e492c4132b6d576ba2b8bf713bac1 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 15 Feb 2025 02:09:26 +0000 Subject: [PATCH 08/89] Add the AVRO enum producer --- .../avro/producers/AvroEnumProducer.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java new file mode 100644 index 0000000000..cddff52821 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
package org.apache.arrow.adapter.avro.producers;

import java.io.IOException;

import org.apache.arrow.vector.IntVector;
import org.apache.avro.io.Encoder;

/**
 * Producer that produces enum values from a dictionary-encoded {@link IntVector},
 * writes data to an Avro encoder.
 */
public class AvroEnumProducer extends BaseAvroProducer<IntVector> {

  /** Instantiate an AvroEnumProducer. */
  public AvroEnumProducer(IntVector vector) {
    super(vector);
  }

  @Override
  public void produce(Encoder encoder) throws IOException {
    // The dictionary index at the current position is written as the Avro enum ordinal
    int ordinal = vector.get(currentIndex);
    currentIndex += 1;
    encoder.writeEnum(ordinal);
  }
}
void setPosition(int index) { + currentIndex = index; + } + @Override public void close() throws Exception { vector.close(); @@ -50,7 +60,7 @@ public boolean resetValueVector(T vector) { } @Override - public FieldVector getVector() { + public T getVector() { return vector; } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java index 897ae609da..362a5f378a 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java @@ -31,6 +31,12 @@ public interface Producer extends AutoCloseable { */ void produce(Encoder encoder) throws IOException; + /** Skip null value in the vector by setting reader position + 1. */ + void skipNull(); + + /** Set the position to read value from vector. */ + void setPosition(int index); + /** * Close this produce */ @@ -47,5 +53,5 @@ public interface Producer extends AutoCloseable { /** * Get the vector within the producer. 
*/ - FieldVector getVector(); + T getVector(); } From 9d3bbe9d020b6609754f00db5e9ef040d6a176c0 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 15 Feb 2025 02:44:40 +0000 Subject: [PATCH 10/89] Add a specialized nullable producer --- .../avro/producers/AvroNullableProducer.java | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java new file mode 100644 index 0000000000..9fded0492d --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.arrow.adapter.avro.producers;

import org.apache.arrow.vector.FieldVector;
import org.apache.avro.io.Encoder;

import java.io.IOException;

/**
 * Producer wrapper that handles nullable fields by emitting an Avro union branch index
 * followed by either a null marker or the delegate's value.
 *
 * <p>NOTE(review): this assumes the value type is union branch 0 and null is branch 1, and
 * writes the branch with {@code writeInt} (which matches {@code BinaryEncoder.writeIndex}
 * behavior only) — confirm against the Avro schema generation and consider
 * {@code Encoder.writeIndex}.
 *
 * @param <T> the vector type handled by the wrapped delegate producer
 */
public class AvroNullableProducer<T extends FieldVector> extends BaseAvroProducer<T> {

  private final Producer<T> delegate;

  /** Instantiate an AvroNullableProducer wrapping the given delegate. */
  public AvroNullableProducer(Producer<T> delegate) {
    super(delegate.getVector());
    this.delegate = delegate;
  }

  @Override
  public void produce(Encoder encoder) throws IOException {
    if (vector.isNull(currentIndex)) {
      encoder.writeInt(1);
      encoder.writeNull();
      delegate.skipNull();
    } else {
      encoder.writeInt(0);
      delegate.produce(encoder);
    }
    currentIndex++;
  }

  @Override
  public void skipNull() {
    // Fix: keep the delegate's position in sync with this wrapper's position
    delegate.skipNull();
    super.skipNull();
  }

  @Override
  public void setPosition(int index) {
    // Fix: forward repositioning to the delegate as well, otherwise the wrapper's
    // null checks and the delegate's value reads drift out of sync
    delegate.setPosition(index);
    super.setPosition(index);
  }

  @Override
  public void close() throws Exception {
    delegate.close();
  }

  @Override
  public boolean resetValueVector(T vector) {
    // Fix: previously only the delegate was reset, leaving this wrapper's vector
    // reference and position stale after a VSR reload
    boolean result = delegate.resetValueVector(vector);
    super.resetValueVector(vector);
    return result;
  }

  @Override
  public T getVector() {
    return delegate.getVector();
  }
}
a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java @@ -18,9 +18,7 @@ import java.io.IOException; -import org.apache.arrow.adapter.avro.consumers.BaseAvroConsumer; import org.apache.arrow.vector.BitVector; -import org.apache.avro.io.Decoder; import org.apache.avro.io.Encoder; /** diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java index 8b85e7ec92..72d29246b5 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java @@ -38,4 +38,4 @@ public AvroDoubleProducer(Float8Vector vector) { public void produce(Encoder encoder) throws IOException { encoder.writeDouble(vector.get(currentIndex++)); } -} \ No newline at end of file +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java index b4d5a8ca8d..1b1ed9fbe8 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java @@ -45,4 +45,4 @@ public void produce(Encoder encoder) throws IOException { encoder.writeBytes(reuseBytes); currentIndex++; } -} \ No newline at end of file +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java index e36f5ecb76..7306f2cd27 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java +++ 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java @@ -38,4 +38,4 @@ public AvroFloatProducer(Float4Vector vector) { public void produce(Encoder encoder) throws IOException { encoder.writeFloat(vector.get(currentIndex++)); } -} \ No newline at end of file +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java index 02ba7542f8..7242075ccb 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java @@ -38,4 +38,4 @@ public AvroLongProducer(BigIntVector vector) { public void produce(Encoder encoder) throws IOException { encoder.writeLong(vector.get(currentIndex++)); } -} \ No newline at end of file +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java index 7d6771ac06..d02e006d34 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java @@ -39,4 +39,4 @@ public void produce(Encoder encoder) throws IOException { encoder.writeNull(); currentIndex++; } -} \ No newline at end of file +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java index db3dc78b07..a5751bc9e6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java @@ -44,4 +44,4 @@ public void produce(Encoder encoder) throws IOException { 
encoder.writeBytes(nioBuffer); currentIndex++; } -} \ No newline at end of file +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java index f765b8285a..a64173ae7c 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java @@ -37,4 +37,4 @@ public AvroTimeMicroProducer(TimeMicroVector vector) { public void produce(Encoder encoder) throws IOException { encoder.writeLong(vector.get(currentIndex++)); } -} \ No newline at end of file +} From 6a6804b2b447c4519e3fb3352eee475664150c40 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 25 Feb 2025 17:38:31 +0000 Subject: [PATCH 12/89] Fix comment warnings --- .../arrow/adapter/avro/producers/AvroIntProducer.java | 2 +- .../apache/arrow/adapter/avro/producers/Producer.java | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java index 20fc97a617..0db84d60e9 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java @@ -23,7 +23,7 @@ /** * Producer that produces int values from an {@link IntVector}, - * writes data to an avro encoder + * writes data to an avro encoder. 
*/ public class AvroIntProducer extends BaseAvroProducer { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java index 362a5f378a..64bd95dbc5 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java @@ -21,6 +21,11 @@ import java.io.IOException; +/** + * Interface that is used to produce values to avro encoder. + * + * @param The vector within producer or its delegate, used for partially produce purpose. + */ public interface Producer extends AutoCloseable { /** @@ -38,13 +43,13 @@ public interface Producer extends AutoCloseable { void setPosition(int index); /** - * Close this produce + * Close this producer. */ @Override void close() throws Exception; /** - * Reset the vector within producer + * Reset the vector within producer. * * @return true if reset is successful, false if reset is not needed. */ From ef60f394fcd8f81c6facf561d6e1c1b0017d812e Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 25 Feb 2025 22:39:18 +0000 Subject: [PATCH 13/89] Utils to create producers for primitive and logical types --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java new file mode 100644 index 0000000000..f26539a659 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.adapter.avro; + +import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; +import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; +import org.apache.arrow.adapter.avro.producers.AvroDoubleProducer; +import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; +import org.apache.arrow.adapter.avro.producers.AvroFloatProducer; +import org.apache.arrow.adapter.avro.producers.AvroIntProducer; +import org.apache.arrow.adapter.avro.producers.AvroLongProducer; +import org.apache.arrow.adapter.avro.producers.AvroNullProducer; +import org.apache.arrow.adapter.avro.producers.AvroNullableProducer; +import org.apache.arrow.adapter.avro.producers.AvroStringProducer; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; +import org.apache.arrow.adapter.avro.producers.Producer; +import org.apache.arrow.adapter.avro.producers.logical.AvroDateProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroDecimalProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMicroProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMillisProducer; +import 
package org.apache.arrow.adapter.avro;

import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer;
import org.apache.arrow.adapter.avro.producers.AvroBytesProducer;
import org.apache.arrow.adapter.avro.producers.AvroDoubleProducer;
import org.apache.arrow.adapter.avro.producers.AvroFixedProducer;
import org.apache.arrow.adapter.avro.producers.AvroFloatProducer;
import org.apache.arrow.adapter.avro.producers.AvroIntProducer;
import org.apache.arrow.adapter.avro.producers.AvroLongProducer;
import org.apache.arrow.adapter.avro.producers.AvroNullProducer;
import org.apache.arrow.adapter.avro.producers.AvroNullableProducer;
import org.apache.arrow.adapter.avro.producers.AvroStringProducer;
import org.apache.arrow.adapter.avro.producers.BaseAvroProducer;
import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer;
import org.apache.arrow.adapter.avro.producers.Producer;
import org.apache.arrow.adapter.avro.producers.logical.AvroDateProducer;
import org.apache.arrow.adapter.avro.producers.logical.AvroDecimalProducer;
import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMicroProducer;
import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMillisProducer;
import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMicroProducer;
import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMillisProducer;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.BitVector;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.DateDayVector;
import org.apache.arrow.vector.DecimalVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.FixedSizeBinaryVector;
import org.apache.arrow.vector.Float4Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.NullVector;
import org.apache.arrow.vector.TimeMicroVector;
import org.apache.arrow.vector.TimeMilliVector;
import org.apache.arrow.vector.TimeStampMicroVector;
import org.apache.arrow.vector.TimeStampMilliVector;
import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.types.Types;

import java.util.ArrayList;
import java.util.List;

/** Utility methods to create Avro producers for Arrow vectors. */
public class ArrowToAvroUtils {

  private ArrowToAvroUtils() {
    // Fix: utility class should not be instantiable
  }

  /**
   * Create a composite Avro producer for a set of field vectors (typically the root set of a VSR).
   *
   * @param vectors The vectors that will be used to produce Avro data
   * @return The resulting composite Avro producer
   */
  public static CompositeAvroProducer createCompositeProducer(List<FieldVector> vectors) {

    List<Producer<? extends FieldVector>> producers = new ArrayList<>(vectors.size());

    for (FieldVector vector : vectors) {
      BaseAvroProducer<? extends FieldVector> producer = createProducer(vector);
      producers.add(producer);
    }

    return new CompositeAvroProducer(producers);
  }

  // Nullability is taken from the vector's field metadata
  private static BaseAvroProducer<? extends FieldVector> createProducer(FieldVector vector) {
    boolean nullable = vector.getField().isNullable();
    return createProducer(vector, nullable);
  }

  /**
   * Build a producer for one vector.
   *
   * @param vector the vector to read from (must not be null)
   * @param nullable whether to wrap the producer so null entries emit the null union branch
   * @throws UnsupportedOperationException for Arrow types with no Avro encoding yet
   */
  private static BaseAvroProducer<? extends FieldVector> createProducer(
      FieldVector vector, boolean nullable) {

    Preconditions.checkNotNull(vector, "Arrow vector object can't be null");

    if (nullable) {
      // Recurse once with nullable = false to build the concrete producer, then wrap it
      final BaseAvroProducer<? extends FieldVector> innerProducer = createProducer(vector, false);
      return new AvroNullableProducer<>(innerProducer);
    }

    final Types.MinorType minorType = vector.getMinorType();

    switch (minorType) {

      // Primitive types with direct mapping to Avro

      case NULL:
        return new AvroNullProducer((NullVector) vector);
      case BIT:
        return new AvroBooleanProducer((BitVector) vector);
      case INT:
        return new AvroIntProducer((IntVector) vector);
      case BIGINT:
        return new AvroLongProducer((BigIntVector) vector);
      case FLOAT4:
        return new AvroFloatProducer((Float4Vector) vector);
      case FLOAT8:
        return new AvroDoubleProducer((Float8Vector) vector);
      case VARBINARY:
        return new AvroBytesProducer((VarBinaryVector) vector);
      case FIXEDSIZEBINARY:
        return new AvroFixedProducer((FixedSizeBinaryVector) vector);
      case VARCHAR:
        return new AvroStringProducer((VarCharVector) vector);

      // Logical types

      case DECIMAL:
        return new AvroDecimalProducer.FixedDecimalProducer(
            (DecimalVector) vector, DecimalVector.TYPE_WIDTH);
      case DATEDAY:
        return new AvroDateProducer((DateDayVector) vector);
      case TIMEMILLI:
        return new AvroTimeMillisProducer((TimeMilliVector) vector);
      case TIMEMICRO:
        return new AvroTimeMicroProducer((TimeMicroVector) vector);
      case TIMESTAMPMILLI:
        return new AvroTimestampMillisProducer((TimeStampMilliVector) vector);
      case TIMESTAMPMICRO:
        return new AvroTimestampMicroProducer((TimeStampMicroVector) vector);

      // Not all Arrow types are supported for encoding (yet)!

      default:
        String error =
            String.format(
                "Encoding Arrow type %s to Avro is not currently supported", minorType.name());
        throw new UnsupportedOperationException(error);
    }
  }
}
org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.types.Types; import java.util.ArrayList; @@ -132,6 +134,19 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu case TIMESTAMPMICRO: return new AvroTimestampMicroProducer((TimeStampMicroVector) vector); + // Complex types + + case STRUCT: + + StructVector structVector = (StructVector) vector; + List childVectors = structVector.getChildrenFromFields(); + Producer[] childProducers = new Producer[childVectors.size()]; + for (int i = 0; i < childVectors.size(); i++) { + FieldVector childVector = childVectors.get(i); + childProducers[i] = createProducer(childVector, childVector.getField().isNullable()); + } + return new AvroStructProducer(structVector, childProducers); + // Not all Arrow types are supported for encoding (yet)! default: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java new file mode 100644 index 0000000000..aa37c6e482 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.adapter.avro.producers; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.avro.io.Encoder; + +import java.io.IOException; + +/** + * Producer which produces nested record type values to avro encoder. Read the data from + * {@link org.apache.arrow.vector.complex.StructVector}. + */ +public class AvroStructProducer extends BaseAvroProducer { + + private final Producer[] delegates; + + /** Instantiate a AvroStructProducer. */ + public AvroStructProducer(StructVector vector, Producer[] delegates) { + super(vector); + this.delegates = delegates; + } + + @Override + public void produce(Encoder encoder) throws IOException { + + for (Producer delegate : delegates) { + delegate.produce(encoder); + } + + currentIndex++; + } + + @Override + @SuppressWarnings("unchecked") + public boolean resetValueVector(StructVector vector) { + + for (int i = 0; i < delegates.length; i++) { + Producer delegate = (Producer) delegates[i]; + delegate.resetValueVector(vector.getChildrenFromFields().get(i)); + } + + return super.resetValueVector(vector); + } +} From f0d54876b56c8cd6e17f7b287aa5959482753469 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 25 Feb 2025 22:57:02 +0000 Subject: [PATCH 15/89] Add producer for arrays type --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 9 +++ .../avro/producers/AvroArraysProducer.java | 65 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 2060d6a00a..e7853e8681 100644 --- 
a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -17,6 +17,7 @@ package org.apache.arrow.adapter.avro; +import org.apache.arrow.adapter.avro.producers.AvroArraysProducer; import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; import org.apache.arrow.adapter.avro.producers.AvroDoubleProducer; @@ -54,6 +55,7 @@ import org.apache.arrow.vector.TimeStampMilliVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.types.Types; @@ -147,6 +149,13 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu } return new AvroStructProducer(structVector, childProducers); + case LIST: + + ListVector listVector = (ListVector) vector; + FieldVector itemVector = listVector.getDataVector(); + Producer itemProducer = createProducer(itemVector, itemVector.getField().isNullable()); + return new AvroArraysProducer(listVector, itemProducer); + // Not all Arrow types are supported for encoding (yet)! default: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java new file mode 100644 index 0000000000..b646515405 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.avro.io.Encoder; + +/** + * Producer which produces array type values to an Avro encoder. + * Reads the data from a {@link ListVector}. + */ +public class AvroArraysProducer extends BaseAvroProducer<ListVector> { + + private final Producer<? extends FieldVector> delegate; + + /** Instantiate an AvroArraysProducer. 
*/ + public AvroArraysProducer(ListVector vector, Producer delegate) { + super(vector); + this.delegate = delegate; + } + + @Override + public void produce(Encoder encoder) throws IOException { + + int startOffset = vector.getOffsetBuffer().getInt(currentIndex * (long) Integer.BYTES); + int endOffset = vector.getOffsetBuffer().getInt((currentIndex + 1) * (long) Integer.BYTES); + int nItems = endOffset - startOffset; + + encoder.writeArrayStart(); + encoder.setItemCount(nItems); + + for (int i = 0; i < nItems; i++) { + encoder.startItem(); + delegate.produce(encoder); + } + + encoder.writeArrayEnd(); + currentIndex++; + } + + @Override + @SuppressWarnings("unchecked") + public boolean resetValueVector(ListVector vector) { + ((Producer) delegate).resetValueVector(vector.getDataVector()); + return super.resetValueVector(vector); + } +} From d4c2bea777978e6c43f2124789a83cac21b06019 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 25 Feb 2025 23:01:39 +0000 Subject: [PATCH 16/89] Add producer for map type --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 13 ++++ .../avro/producers/AvroMapProducer.java | 64 +++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index e7853e8681..1e47729331 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -25,6 +25,7 @@ import org.apache.arrow.adapter.avro.producers.AvroFloatProducer; import org.apache.arrow.adapter.avro.producers.AvroIntProducer; import org.apache.arrow.adapter.avro.producers.AvroLongProducer; +import org.apache.arrow.adapter.avro.producers.AvroMapProducer; import 
org.apache.arrow.adapter.avro.producers.AvroNullProducer; import org.apache.arrow.adapter.avro.producers.AvroNullableProducer; import org.apache.arrow.adapter.avro.producers.AvroStringProducer; @@ -56,6 +57,7 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.types.Types; @@ -156,6 +158,17 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu Producer itemProducer = createProducer(itemVector, itemVector.getField().isNullable()); return new AvroArraysProducer(listVector, itemProducer); + case MAP: + + MapVector mapVector = (MapVector) vector; + StructVector entryVector = (StructVector) mapVector.getDataVector(); + VarCharVector keyVector = (VarCharVector) entryVector.getChildrenFromFields().get(0); + FieldVector valueVector = entryVector.getChildrenFromFields().get(1); + Producer keyProducer = new AvroStringProducer(keyVector); + Producer valueProducer = createProducer(valueVector, valueVector.getField().isNullable()); + Producer entryProducer = new AvroStructProducer(entryVector, new Producer[] {keyProducer, valueProducer}); + return new AvroMapProducer(mapVector, entryProducer); + // Not all Arrow types are supported for encoding (yet)! default: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java new file mode 100644 index 0000000000..6780dda84d --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.adapter.avro.producers; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.avro.io.Encoder; + +import java.io.IOException; + +/** + * Producer which produces map type values to an Avro encoder. Reads the data from a {@link MapVector}. + */ +public class AvroMapProducer extends BaseAvroProducer<MapVector> { + + private final Producer<? extends FieldVector> delegate; + + /** Instantiate an AvroMapProducer. 
*/ + public AvroMapProducer(MapVector vector, Producer delegate) { + super(vector); + this.delegate = delegate; + } + + @Override + public void produce(Encoder encoder) throws IOException { + + int startOffset = vector.getOffsetBuffer().getInt(currentIndex * (long) Integer.BYTES); + int endOffset = vector.getOffsetBuffer().getInt((currentIndex + 1) * (long) Integer.BYTES); + int nEntries = endOffset - startOffset; + + encoder.writeMapStart(); + encoder.setItemCount(nEntries); + + for (int i = 0; i < nEntries; i++) { + encoder.startItem(); + delegate.produce(encoder); + } + + encoder.writeMapEnd(); + currentIndex++; + } + + @Override + @SuppressWarnings("unchecked") + public boolean resetValueVector(MapVector vector) { + ((Producer) delegate).resetValueVector(vector.getDataVector()); + return super.resetValueVector(vector); + } +} From 1452137ff3301b17b9bdbe3ddba6780bdf8a7d6b Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 25 Feb 2025 23:13:48 +0000 Subject: [PATCH 17/89] Override methods in composite producers --- .../avro/producers/AvroArraysProducer.java | 18 ++++++++++++ .../avro/producers/AvroMapProducer.java | 18 ++++++++++++ .../avro/producers/AvroNullableProducer.java | 16 +++++++++-- .../avro/producers/AvroStructProducer.java | 28 +++++++++++++++++-- 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java index b646515405..a8422af173 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java @@ -56,10 +56,28 @@ public void produce(Encoder encoder) throws IOException { currentIndex++; } + @Override + public void skipNull() { + delegate.skipNull(); + super.skipNull(); + } + + @Override + public void setPosition(int index) 
{ + delegate.setPosition(index); + super.setPosition(index); + } + @Override @SuppressWarnings("unchecked") public boolean resetValueVector(ListVector vector) { ((Producer) delegate).resetValueVector(vector.getDataVector()); return super.resetValueVector(vector); } + + @Override + public void close() throws Exception { + delegate.close(); + super.close(); + } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java index 6780dda84d..2c2a15c9ab 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java @@ -55,10 +55,28 @@ public void produce(Encoder encoder) throws IOException { currentIndex++; } + @Override + public void skipNull() { + delegate.skipNull(); + super.skipNull(); + } + + @Override + public void setPosition(int index) { + delegate.setPosition(index); + super.setPosition(index); + } + @Override @SuppressWarnings("unchecked") public boolean resetValueVector(MapVector vector) { ((Producer) delegate).resetValueVector(vector.getDataVector()); return super.resetValueVector(vector); } + + @Override + public void close() throws Exception { + delegate.close(); + super.close(); + } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java index 9fded0492d..afcb874ae5 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java @@ -47,8 +47,15 @@ public void produce(Encoder encoder) throws IOException { } @Override - public void close() throws Exception { - delegate.close(); + public void skipNull() { + 
delegate.skipNull(); + super.skipNull(); + } + + @Override + public void setPosition(int index) { + delegate.setPosition(index); + super.setPosition(index); } @Override @@ -60,4 +67,9 @@ public boolean resetValueVector(T vector) { public T getVector() { return delegate.getVector(); } + + @Override + public void close() throws Exception { + delegate.close(); + } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java index aa37c6e482..3d4fc25beb 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java @@ -40,22 +40,44 @@ public AvroStructProducer(StructVector vector, Producer[] @Override public void produce(Encoder encoder) throws IOException { - for (Producer delegate : delegates) { + for (Producer delegate : delegates) { delegate.produce(encoder); } currentIndex++; } + @Override + public void skipNull() { + for (Producer delegate : delegates) { + delegate.skipNull(); + } + super.skipNull(); + } + + @Override + public void setPosition(int index) { + for (Producer delegate : delegates) { + delegate.setPosition(index); + } + super.setPosition(index); + } + @Override @SuppressWarnings("unchecked") public boolean resetValueVector(StructVector vector) { - for (int i = 0; i < delegates.length; i++) { Producer delegate = (Producer) delegates[i]; delegate.resetValueVector(vector.getChildrenFromFields().get(i)); } - return super.resetValueVector(vector); } + + @Override + public void close() throws Exception { + for (Producer delegate : delegates) { + delegate.close(); + } + super.close(); + } } From 8588d5abe8acb8730cf296c3dc7a09b4787de322 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 25 Feb 2025 23:17:49 +0000 Subject: [PATCH 18/89] Do not allow skipNull() on nullable producer 
--- .../arrow/adapter/avro/producers/AvroNullableProducer.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java index afcb874ae5..30262edeca 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java @@ -48,8 +48,9 @@ public void produce(Encoder encoder) throws IOException { @Override public void skipNull() { - delegate.skipNull(); - super.skipNull(); + // Should never be called on nullable producer + // Calling produce() will skipNull() on the delegate + throw new UnsupportedOperationException(); } @Override From eedf57500f8d98847a94f4210922e26698c1814e Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Wed, 26 Feb 2025 00:41:38 +0000 Subject: [PATCH 19/89] Add a union type --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 21 +++- .../avro/producers/AvroUnionsProducer.java | 112 ++++++++++++++++++ 2 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 1e47729331..5a6fe79f7b 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -30,6 +30,7 @@ import org.apache.arrow.adapter.avro.producers.AvroNullableProducer; import org.apache.arrow.adapter.avro.producers.AvroStringProducer; import org.apache.arrow.adapter.avro.producers.AvroStructProducer; +import org.apache.arrow.adapter.avro.producers.AvroUnionsProducer; 
import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; import org.apache.arrow.adapter.avro.producers.Producer; @@ -59,6 +60,7 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.Types; import java.util.ArrayList; @@ -93,13 +95,15 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu Preconditions.checkNotNull(vector, "Arrow vector object can't be null"); - if (nullable) { + final Types.MinorType minorType = vector.getMinorType(); + + // Avro understands nullable types as a union of type | null + // Most nullable fields in a VSR will not be unions, so provide a special wrapper + if (nullable && minorType != Types.MinorType.UNION) { final BaseAvroProducer innerProducer = createProducer(vector, false); return new AvroNullableProducer<>(innerProducer); } - final Types.MinorType minorType = vector.getMinorType(); - switch (minorType) { // Primitive types with direct mapping to Avro @@ -169,6 +173,17 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu Producer entryProducer = new AvroStructProducer(entryVector, new Producer[] {keyProducer, valueProducer}); return new AvroMapProducer(mapVector, entryProducer); + case UNION: + + UnionVector unionVector = (UnionVector) vector; + List unionChildVectors = unionVector.getChildrenFromFields(); + Producer[] unionChildProducers = new Producer[unionChildVectors.size()]; + for (int i = 0; i < unionChildVectors.size(); i++) { + FieldVector unionChildVector = unionChildVectors.get(i); + unionChildProducers[i] = createProducer(unionChildVector, /* nullable = */ false); // Do not nest union types + } + return new AvroUnionsProducer(unionVector, unionChildProducers); + // Not all Arrow types are supported for 
encoding (yet)! default: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java new file mode 100644 index 0000000000..86333e0d70 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.adapter.avro.producers; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.avro.io.Encoder; + +import java.io.IOException; +import java.util.List; + +/** + * Producer which produces union type values to an Avro encoder. Reads the data from + * {@link org.apache.arrow.vector.complex.UnionVector}. + */ +public class AvroUnionsProducer extends BaseAvroProducer<UnionVector> { + + private final Producer<? extends FieldVector>[] delegates; + private final UnionMode unionMode; + private final int nullTypeIndex; + + /** Instantiate an AvroUnionsProducer. 
*/ + public AvroUnionsProducer(UnionVector vector, Producer[] delegates) { + super(vector); + this.delegates = delegates; + this.unionMode = vector.getMinorType() == Types.MinorType.DENSEUNION ? UnionMode.Dense : UnionMode.Sparse; + this.nullTypeIndex = findNullTypeIndex(); + } + + private int findNullTypeIndex() { + List childVectors = vector.getChildrenFromFields(); + for (int i = 0; i < childVectors.size(); i++) { + if (childVectors.get(i).getMinorType() == Types.MinorType.NULL) { + return i; + } + } + // For nullable unions with no explicit null type, a null type is appended to the schema + return childVectors.size(); + } + + @Override + public void produce(Encoder encoder) throws IOException { + + if (vector.isNull(currentIndex)) { + encoder.writeInt(nullTypeIndex); + encoder.writeNull(); + } + else { + + int typeIndex = vector.getTypeValue(currentIndex); + int typeVectorIndex; + + if (unionMode == UnionMode.Dense) { + typeVectorIndex = vector.getOffsetBuffer().getInt(currentIndex * (long) Integer.BYTES); + } + else { + typeVectorIndex = currentIndex; + } + + FieldVector typeVector = vector.getChildrenFromFields().get(typeIndex); + + if (typeVector.isNull(typeVectorIndex)) { + encoder.writeInt(nullTypeIndex); + encoder.writeNull(); + } + else { + Producer delegate = delegates[typeIndex]; + encoder.writeInt(typeIndex); + delegate.setPosition(typeVectorIndex); + delegate.produce(encoder); + } + } + + currentIndex++; + } + + @Override + @SuppressWarnings("unchecked") + public boolean resetValueVector(UnionVector vector) { + boolean result = true; + for (int i = 0; i < delegates.length; i++) { + Producer delegate = (Producer) delegates[i]; + result &= delegate.resetValueVector(vector.getChildrenFromFields().get(i)); + } + return result & super.resetValueVector(vector); + } + + @Override + public void close() throws Exception { + for (Producer delegate : delegates) { + delegate.close(); + } + super.close(); + } +} From 2a7c7ee14abd8f16cf8769c8967f18a3cd32d8c9 Mon 
Sep 17 00:00:00 2001 From: Martin Traverse Date: Wed, 26 Feb 2025 02:11:59 +0000 Subject: [PATCH 20/89] Very basic first test case --- .../adapter/avro/TestWriteReadAvroRecord.java | 94 ++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java index c318214f5c..b05e68c287 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java @@ -16,11 +16,24 @@ */ package org.apache.arrow.adapter.avro; -import static org.junit.jupiter.api.Assertions.assertEquals; - import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; + +import org.apache.arrow.adapter.avro.consumers.CompositeAvroConsumer; +import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.file.DataFileWriter; @@ -28,11 +41,17 @@ import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryDecoder; +import org.apache.avro.io.BinaryEncoder; import org.apache.avro.io.DatumReader; import 
org.apache.avro.io.DatumWriter; +import org.apache.avro.io.DecoderFactory; +import org.apache.avro.io.EncoderFactory; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import static org.junit.jupiter.api.Assertions.*; + public class TestWriteReadAvroRecord { @TempDir public static File TMP; @@ -82,4 +101,75 @@ public void testWriteAndRead() throws Exception { assertEquals(7, deUser2.get("favorite_number")); assertEquals("red", deUser2.get("favorite_color").toString()); } + + @Test + public void testWriteAndReadVSR() throws Exception { + + BufferAllocator allocator = new RootAllocator(); + + List fields = new ArrayList<>(); + fields.add(new Field("name", new FieldType(false, ArrowType.Utf8.INSTANCE, null), null)); + fields.add(new Field("favorite_number", new FieldType(true, new ArrowType.Int(32, true), null), null)); + fields.add(new Field("favorite_color", new FieldType(true, ArrowType.Utf8.INSTANCE, null), null)); + + VarCharVector nameVector = new VarCharVector(fields.get(0), allocator); + nameVector.allocateNew(2); + nameVector.set(0, "Alyssa".getBytes(StandardCharsets.UTF_8)); + nameVector.set(1, "Ben".getBytes(StandardCharsets.UTF_8)); + + IntVector favNumberVector = new IntVector(fields.get(1), allocator); + favNumberVector.allocateNew(2); + favNumberVector.set(0, 256); + favNumberVector.set(1, 7); + + VarCharVector favColorVector = new VarCharVector(fields.get(2), allocator); + favColorVector.allocateNew(2); + favColorVector.setNull(0); + favColorVector.set(1, "red".getBytes(StandardCharsets.UTF_8)); + + List vectors = new ArrayList<>(); + vectors.add(nameVector); + vectors.add(favNumberVector); + vectors.add(favColorVector); + + File dataFile = new File(TMP, "test_vsr.avro"); + Schema schema = AvroTestBase.getSchema("test.avsc"); + AvroToArrowConfig config = new AvroToArrowConfigBuilder(allocator).build(); + + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + + BinaryEncoder encoder = new 
EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + + producer.produce(encoder); + producer.produce(encoder); + } + + List roundTripFields = new ArrayList<>(); + List roundTripVectors = new ArrayList<>(); + + try (FileInputStream fis = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = new DecoderFactory().directBinaryDecoder(fis, null); + CompositeAvroConsumer consumer = AvroToArrowUtils.createCompositeConsumer(schema, config); + + consumer.getConsumers().forEach(c -> roundTripFields.add(c.getVector().getField())); + consumer.getConsumers().forEach(c -> roundTripVectors.add(c.getVector())); + consumer.consume(decoder); + consumer.consume(decoder); + } + + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, 2); + VectorSchemaRoot roundTripRoot = new VectorSchemaRoot(roundTripFields, roundTripVectors, 2); + + assertEquals(root.getRowCount(), roundTripRoot.getRowCount()); + + for (int row = 0; row < 2; row++) { + for (int col = 0; col < 3; col++) { + FieldVector vector = root.getVector(col); + FieldVector roundTripVector = roundTripRoot.getVector(col); + assertEquals(vector.getObject(row), roundTripVector.getObject(row)); + } + } + } } From 2299a47f6550375386e5dc6327ffc523d6e68df3 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Wed, 26 Feb 2025 02:12:09 +0000 Subject: [PATCH 21/89] Fixes --- .../apache/arrow/adapter/avro/ArrowToAvroUtils.java | 2 +- .../adapter/avro/producers/AvroStringProducer.java | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 5a6fe79f7b..97336cd31c 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -180,7 +180,7 @@ 
private static BaseAvroProducer createProducer(FieldVector vector, boolean nu Producer[] unionChildProducers = new Producer[unionChildVectors.size()]; for (int i = 0; i < unionChildVectors.size(); i++) { FieldVector unionChildVector = unionChildVectors.get(i); - unionChildProducers[i] = createProducer(unionChildVector, /* nullable = */ false); // Do not nest union types + unionChildProducers[i] = createProducer(unionChildVector, /* nullable = */ false); // Do not nest union types } return new AvroUnionsProducer(unionVector, unionChildProducers); diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java index a5751bc9e6..dd04ecc21c 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java @@ -37,11 +37,15 @@ public AvroStringProducer(VarCharVector vector) { @Override public void produce(Encoder encoder) throws IOException { + + int start = vector.getStartOffset(currentIndex); + int end = vector.getEndOffset(currentIndex); + int length = end - start; + // The nio ByteBuffer is created once per call, but underlying data is not copied - long offset = vector.getStartOffset(currentIndex); - int length = vector.getEndOffset(currentIndex); - ByteBuffer nioBuffer = vector.getDataBuffer().nioBuffer(offset, length); + ByteBuffer nioBuffer = vector.getDataBuffer().nioBuffer(start, length); encoder.writeBytes(nioBuffer); + currentIndex++; } } From c4cfe29878defaabb66c4ee038b6a8c52de9eabf Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Wed, 26 Feb 2025 17:42:36 +0000 Subject: [PATCH 22/89] Maven spotless checks --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 48 +++++++++---------- .../avro/producers/AvroArraysProducer.java | 6 +-- .../avro/producers/AvroBooleanProducer.java | 8 +--- 
.../avro/producers/AvroBytesProducer.java | 10 ++-- .../avro/producers/AvroDoubleProducer.java | 8 +--- .../avro/producers/AvroEnumProducer.java | 6 +-- .../avro/producers/AvroFixedProducer.java | 10 ++-- .../avro/producers/AvroFloatProducer.java | 8 +--- .../avro/producers/AvroIntProducer.java | 12 ++--- .../avro/producers/AvroLongProducer.java | 8 +--- .../avro/producers/AvroMapProducer.java | 8 +--- .../avro/producers/AvroNullProducer.java | 10 +--- .../avro/producers/AvroNullableProducer.java | 13 ++--- .../avro/producers/AvroStringProducer.java | 9 ++-- .../avro/producers/AvroStructProducer.java | 8 ++-- .../avro/producers/AvroUnionsProducer.java | 25 +++++----- .../avro/producers/CompositeAvroProducer.java | 2 - .../adapter/avro/producers/Producer.java | 11 ++--- .../producers/logical/AvroDateProducer.java | 5 +- .../logical/AvroDecimalProducer.java | 9 ++-- .../logical/AvroTimeMicroProducer.java | 5 +- .../logical/AvroTimeMillisProducer.java | 5 +- .../logical/AvroTimestampMicroProducer.java | 1 - .../logical/AvroTimestampMillisProducer.java | 1 - .../adapter/avro/TestWriteReadAvroRecord.java | 14 +++--- 25 files changed, 94 insertions(+), 156 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 97336cd31c..68e10eece0 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -14,9 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.arrow.adapter.avro; +import java.util.ArrayList; +import java.util.List; import org.apache.arrow.adapter.avro.producers.AvroArraysProducer; import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; @@ -41,8 +42,8 @@ import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMicroProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMillisProducer; import org.apache.arrow.util.Preconditions; -import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.FieldVector; @@ -63,9 +64,6 @@ import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.Types; -import java.util.ArrayList; -import java.util.List; - public class ArrowToAvroUtils { /** @@ -76,14 +74,14 @@ public class ArrowToAvroUtils { */ public static CompositeAvroProducer createCompositeProducer(List vectors) { - List> producers = new ArrayList<>(vectors.size()); + List> producers = new ArrayList<>(vectors.size()); - for (FieldVector vector : vectors) { - BaseAvroProducer producer = createProducer(vector); - producers.add(producer); - } + for (FieldVector vector : vectors) { + BaseAvroProducer producer = createProducer(vector); + producers.add(producer); + } - return new CompositeAvroProducer(producers); + return new CompositeAvroProducer(producers); } private static BaseAvroProducer createProducer(FieldVector vector) { @@ -106,7 +104,7 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu switch (minorType) { - // Primitive types with direct mapping to Avro + // Primitive types with direct mapping to Avro case NULL: return new AvroNullProducer((NullVector) vector); @@ -127,10 +125,11 @@ private static BaseAvroProducer 
createProducer(FieldVector vector, boolean nu case VARCHAR: return new AvroStringProducer((VarCharVector) vector); - // Logical types + // Logical types case DECIMAL: - return new AvroDecimalProducer.FixedDecimalProducer((DecimalVector) vector, DecimalVector.TYPE_WIDTH); + return new AvroDecimalProducer.FixedDecimalProducer( + (DecimalVector) vector, DecimalVector.TYPE_WIDTH); case DATEDAY: return new AvroDateProducer((DateDayVector) vector); case TIMEMILLI: @@ -142,10 +141,9 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu case TIMESTAMPMICRO: return new AvroTimestampMicroProducer((TimeStampMicroVector) vector); - // Complex types + // Complex types case STRUCT: - StructVector structVector = (StructVector) vector; List childVectors = structVector.getChildrenFromFields(); Producer[] childProducers = new Producer[childVectors.size()]; @@ -156,38 +154,40 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu return new AvroStructProducer(structVector, childProducers); case LIST: - ListVector listVector = (ListVector) vector; FieldVector itemVector = listVector.getDataVector(); Producer itemProducer = createProducer(itemVector, itemVector.getField().isNullable()); return new AvroArraysProducer(listVector, itemProducer); case MAP: - MapVector mapVector = (MapVector) vector; StructVector entryVector = (StructVector) mapVector.getDataVector(); VarCharVector keyVector = (VarCharVector) entryVector.getChildrenFromFields().get(0); FieldVector valueVector = entryVector.getChildrenFromFields().get(1); Producer keyProducer = new AvroStringProducer(keyVector); - Producer valueProducer = createProducer(valueVector, valueVector.getField().isNullable()); - Producer entryProducer = new AvroStructProducer(entryVector, new Producer[] {keyProducer, valueProducer}); + Producer valueProducer = + createProducer(valueVector, valueVector.getField().isNullable()); + Producer entryProducer = + new AvroStructProducer(entryVector, new 
Producer[] {keyProducer, valueProducer}); return new AvroMapProducer(mapVector, entryProducer); case UNION: - UnionVector unionVector = (UnionVector) vector; List unionChildVectors = unionVector.getChildrenFromFields(); Producer[] unionChildProducers = new Producer[unionChildVectors.size()]; for (int i = 0; i < unionChildVectors.size(); i++) { FieldVector unionChildVector = unionChildVectors.get(i); - unionChildProducers[i] = createProducer(unionChildVector, /* nullable = */ false); // Do not nest union types + unionChildProducers[i] = + createProducer(unionChildVector, /* nullable = */ false); // Do not nest union types } return new AvroUnionsProducer(unionVector, unionChildProducers); - // Not all Arrow types are supported for encoding (yet)! + // Not all Arrow types are supported for encoding (yet)! default: - String error = String.format("Encoding Arrow type %s to Avro is not currently supported", minorType.name()); + String error = + String.format( + "Encoding Arrow type %s to Avro is not currently supported", minorType.name()); throw new UnsupportedOperationException(error); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java index a8422af173..07e8a66d75 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java @@ -14,18 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers; import java.io.IOException; - import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.avro.io.Encoder; /** - * Producer which produces array type values to an Avro encoder. - * Writes the data from a {@link ListVector}. 
+ * Producer which produces array type values to an Avro encoder. Writes the data from a {@link + * ListVector}. */ public class AvroArraysProducer extends BaseAvroProducer { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java index 1a165ade09..523ddf110d 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBooleanProducer.java @@ -17,19 +17,15 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; - import org.apache.arrow.vector.BitVector; import org.apache.avro.io.Encoder; /** - * Producer that produces boolean values from a {@link BitVector}, - * writes data to an Avro encoder. + * Producer that produces boolean values from a {@link BitVector}, writes data to an Avro encoder. */ public class AvroBooleanProducer extends BaseAvroProducer { - /** - * Instantiate am AvroBooleanProducer. - */ + /** Instantiate am AvroBooleanProducer. */ public AvroBooleanProducer(BitVector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java index c7c7fe6f9e..5d16d52da6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java @@ -14,24 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.arrow.adapter.avro.producers; import java.io.IOException; import java.nio.ByteBuffer; - import org.apache.arrow.vector.VarBinaryVector; import org.apache.avro.io.Encoder; /** - * Producer that produces byte array values from a {@link VarBinaryVector}, - * writes data to an Avro encoder. + * Producer that produces byte array values from a {@link VarBinaryVector}, writes data to an Avro + * encoder. */ public class AvroBytesProducer extends BaseAvroProducer { - /** - * Instantiate an AvroBytesProducer. - */ + /** Instantiate an AvroBytesProducer. */ public AvroBytesProducer(VarBinaryVector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java index 72d29246b5..c770b9845f 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java @@ -17,19 +17,15 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; - import org.apache.arrow.vector.Float8Vector; import org.apache.avro.io.Encoder; /** - * Producer that produces double values from a {@link Float8Vector}, - * writes data to an Avro encoder. + * Producer that produces double values from a {@link Float8Vector}, writes data to an Avro encoder. */ public class AvroDoubleProducer extends BaseAvroProducer { - /** - * Instantiate an AvroDoubleProducer. - */ + /** Instantiate an AvroDoubleProducer. 
*/ public AvroDoubleProducer(Float8Vector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java index cddff52821..068566493e 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java @@ -14,17 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers; import java.io.IOException; - import org.apache.arrow.vector.IntVector; import org.apache.avro.io.Encoder; /** - * Producer that produces enum values from a dictionary-encoded {@link IntVector}, - * writes data to an Avro encoder. + * Producer that produces enum values from a dictionary-encoded {@link IntVector}, writes data to an + * Avro encoder. */ public class AvroEnumProducer extends BaseAvroProducer { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java index 1b1ed9fbe8..294eac6db6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java @@ -14,25 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers; import java.io.IOException; - import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.avro.io.Encoder; /** - * Producer that produces fixed-size binary values from a {@link FixedSizeBinaryVector}, - * writes data to an Avro encoder. 
+ * Producer that produces fixed-size binary values from a {@link FixedSizeBinaryVector}, writes data + * to an Avro encoder. */ public class AvroFixedProducer extends BaseAvroProducer { private final byte[] reuseBytes; - /** - * Instantiate an AvroFixedProducer. - */ + /** Instantiate an AvroFixedProducer. */ public AvroFixedProducer(FixedSizeBinaryVector vector) { super(vector); reuseBytes = new byte[vector.getByteWidth()]; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java index 7306f2cd27..f714f4534e 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java @@ -17,19 +17,15 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; - import org.apache.arrow.vector.Float4Vector; import org.apache.avro.io.Encoder; /** - * Producer that produces float values from a {@link Float4Vector}, - * writes data to an Avro encoder. + * Producer that produces float values from a {@link Float4Vector}, writes data to an Avro encoder. */ public class AvroFloatProducer extends BaseAvroProducer { - /** - * Instantiate an AvroFloatProducer. - */ + /** Instantiate an AvroFloatProducer. 
*/ public AvroFloatProducer(Float4Vector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java index 0db84d60e9..7899108b99 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java @@ -16,20 +16,14 @@ */ package org.apache.arrow.adapter.avro.producers; +import java.io.IOException; import org.apache.arrow.vector.IntVector; import org.apache.avro.io.Encoder; -import java.io.IOException; - -/** - * Producer that produces int values from an {@link IntVector}, - * writes data to an avro encoder. - */ +/** Producer that produces int values from an {@link IntVector}, writes data to an avro encoder. */ public class AvroIntProducer extends BaseAvroProducer { - /** - * Instantiate an AvroIntConsumer. - */ + /** Instantiate an AvroIntConsumer. */ public AvroIntProducer(IntVector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java index 7242075ccb..1767c7f94a 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java @@ -17,19 +17,15 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; - import org.apache.arrow.vector.BigIntVector; import org.apache.avro.io.Encoder; /** - * Producer that produces long values from a {@link BigIntVector}, - * writes data to an Avro encoder. + * Producer that produces long values from a {@link BigIntVector}, writes data to an Avro encoder. 
*/ public class AvroLongProducer extends BaseAvroProducer { - /** - * Instantiate an AvroLongProducer. - */ + /** Instantiate an AvroLongProducer. */ public AvroLongProducer(BigIntVector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java index 2c2a15c9ab..6141b53922 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java @@ -14,18 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers; +import java.io.IOException; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.avro.io.Encoder; -import java.io.IOException; - -/** - * Producer which produces map type values to avro encoder. Write the data to {@link MapVector}. - */ +/** Producer which produces map type values to avro encoder. Write the data to {@link MapVector}. */ public class AvroMapProducer extends BaseAvroProducer { private final Producer delegate; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java index d02e006d34..1bd1e891f1 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullProducer.java @@ -17,19 +17,13 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; - import org.apache.arrow.vector.NullVector; import org.apache.avro.io.Encoder; -/** - * Producer that produces null values from a {@link NullVector}, - * writes data to an Avro encoder. 
- */ +/** Producer that produces null values from a {@link NullVector}, writes data to an Avro encoder. */ public class AvroNullProducer extends BaseAvroProducer { - /** - * Instantiate an AvroNullProducer. - */ + /** Instantiate an AvroNullProducer. */ public AvroNullProducer(NullVector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java index 30262edeca..cc081341c0 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java @@ -14,19 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers; +import java.io.IOException; import org.apache.arrow.vector.FieldVector; import org.apache.avro.io.Encoder; -import java.io.IOException; - - +/** + * Producer wrapper which producers nullable types to an avro encoder. Write the data to the + * underlying {@link FieldVector}. + */ public class AvroNullableProducer extends BaseAvroProducer { private final Producer delegate; + /** Instantiate a AvroNullableProducer. 
*/ public AvroNullableProducer(Producer delegate) { super(delegate.getVector()); this.delegate = delegate; @@ -38,8 +40,7 @@ public void produce(Encoder encoder) throws IOException { encoder.writeInt(1); encoder.writeNull(); delegate.skipNull(); - } - else { + } else { encoder.writeInt(0); delegate.produce(encoder); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java index dd04ecc21c..19e165cd13 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStringProducer.java @@ -18,19 +18,16 @@ import java.io.IOException; import java.nio.ByteBuffer; - import org.apache.arrow.vector.VarCharVector; import org.apache.avro.io.Encoder; /** - * Producer that produces string values from a {@link VarCharVector}, - * writes data to an Avro encoder. + * Producer that produces string values from a {@link VarCharVector}, writes data to an Avro + * encoder. */ public class AvroStringProducer extends BaseAvroProducer { - /** - * Instantiate an AvroStringProducer. - */ + /** Instantiate an AvroStringProducer. */ public AvroStringProducer(VarCharVector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java index 3d4fc25beb..906909a6b8 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java @@ -14,18 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.arrow.adapter.avro.producers; +import java.io.IOException; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.avro.io.Encoder; -import java.io.IOException; - /** - * Producer which produces nested record type values to avro encoder. Read the data from - * {@link org.apache.arrow.vector.complex.StructVector}. + * Producer which produces nested record type values to avro encoder. Read the data from {@link + * org.apache.arrow.vector.complex.StructVector}. */ public class AvroStructProducer extends BaseAvroProducer { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java index 86333e0d70..b61f0af83e 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java @@ -14,21 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers; +import java.io.IOException; +import java.util.List; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.UnionMode; import org.apache.avro.io.Encoder; -import java.io.IOException; -import java.util.List; - /** - * Producer which produces unions type values to avro encoder. Write the data to - * {@link org.apache.arrow.vector.complex.UnionVector}. + * Producer which produces unions type values to avro encoder. Write the data to {@link + * org.apache.arrow.vector.complex.UnionVector}. 
*/ public class AvroUnionsProducer extends BaseAvroProducer { @@ -40,7 +38,11 @@ public class AvroUnionsProducer extends BaseAvroProducer { public AvroUnionsProducer(UnionVector vector, Producer[] delegates) { super(vector); this.delegates = delegates; - this.unionMode = vector.getMinorType() == Types.MinorType.DENSEUNION ? UnionMode.Dense : UnionMode.Sparse; + if (vector.getMinorType() == Types.MinorType.DENSEUNION) { + this.unionMode = UnionMode.Dense; + } else { + this.unionMode = UnionMode.Sparse; + } this.nullTypeIndex = findNullTypeIndex(); } @@ -61,16 +63,14 @@ public void produce(Encoder encoder) throws IOException { if (vector.isNull(currentIndex)) { encoder.writeInt(nullTypeIndex); encoder.writeNull(); - } - else { + } else { int typeIndex = vector.getTypeValue(currentIndex); int typeVectorIndex; if (unionMode == UnionMode.Dense) { typeVectorIndex = vector.getOffsetBuffer().getInt(currentIndex * (long) Integer.BYTES); - } - else { + } else { typeVectorIndex = currentIndex; } @@ -79,8 +79,7 @@ public void produce(Encoder encoder) throws IOException { if (typeVector.isNull(typeVectorIndex)) { encoder.writeInt(nullTypeIndex); encoder.writeNull(); - } - else { + } else { Producer delegate = delegates[typeIndex]; encoder.writeInt(typeIndex); delegate.setPosition(typeVectorIndex); diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java index 1333627594..3f02f1b5f0 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java @@ -14,12 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.arrow.adapter.avro.producers; import java.io.IOException; import java.util.List; - import org.apache.arrow.util.AutoCloseables; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.VectorSchemaRoot; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java index 64bd95dbc5..7f751528d5 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java @@ -16,11 +16,10 @@ */ package org.apache.arrow.adapter.avro.producers; +import java.io.IOException; import org.apache.arrow.vector.FieldVector; import org.apache.avro.io.Encoder; -import java.io.IOException; - /** * Interface that is used to produce values to avro encoder. * @@ -42,9 +41,7 @@ public interface Producer extends AutoCloseable { /** Set the position to read value from vector. */ void setPosition(int index); - /** - * Close this producer. - */ + /** Close this producer. */ @Override void close() throws Exception; @@ -55,8 +52,6 @@ public interface Producer extends AutoCloseable { */ boolean resetValueVector(T vector); - /** - * Get the vector within the producer. - */ + /** Get the vector within the producer. */ T getVector(); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java index 4ca6c53693..17729b3923 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java @@ -14,18 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.arrow.adapter.avro.producers.logical; import java.io.IOException; - import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.vector.DateDayVector; import org.apache.avro.io.Encoder; /** - * Producer that produces date values from a {@link DateDayVector}, - * writes data to an Avro encoder. + * Producer that produces date values from a {@link DateDayVector}, writes data to an Avro encoder. */ public class AvroDateProducer extends BaseAvroProducer { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java index 9d8314741b..aaa5d546b6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java @@ -14,19 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers.logical; import java.io.IOException; - import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.DecimalVector; import org.apache.avro.io.Encoder; /** - * Producer that produces decimal values from a {@link DecimalVector}, - * writes data to an Avro encoder. + * Producer that produces decimal values from a {@link DecimalVector}, writes data to an Avro + * encoder. */ public abstract class AvroDecimalProducer extends BaseAvroProducer { @@ -43,7 +41,8 @@ public static class BytesDecimalProducer extends AvroDecimalProducer { /** Instantiate a BytesDecimalConsumer. 
*/ public BytesDecimalProducer(DecimalVector vector) { super(vector); - Preconditions.checkArgument(vector.getTypeWidth() <= 16, "Decimal bytes length should <= 16."); + Preconditions.checkArgument( + vector.getTypeWidth() <= 16, "Decimal bytes length should <= 16."); reuseBytes = new byte[vector.getTypeWidth()]; } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java index a64173ae7c..2da5df730a 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers.logical; import java.io.IOException; @@ -23,8 +22,8 @@ import org.apache.avro.io.Encoder; /** - * Producer that produces time (microseconds) values from a {@link TimeMicroVector}, - * writes data to an Avro encoder. + * Producer that produces time (microseconds) values from a {@link TimeMicroVector}, writes data to + * an Avro encoder. */ public class AvroTimeMicroProducer extends BaseAvroProducer { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java index 59b781c636..39a57712a2 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.arrow.adapter.avro.producers.logical; import java.io.IOException; @@ -23,8 +22,8 @@ import org.apache.avro.io.Encoder; /** - * Producer that produces time (milliseconds) values from a {@link TimeMilliVector}, - * writes data to an Avro encoder. + * Producer that produces time (milliseconds) values from a {@link TimeMilliVector}, writes data to + * an Avro encoder. */ public class AvroTimeMillisProducer extends BaseAvroProducer { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java index b772c94138..4569ed977f 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro.producers.logical; import java.io.IOException; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMillisProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMillisProducer.java index a015e0fa29..837f33b32f 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMillisProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMillisProducer.java @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.arrow.adapter.avro.producers.logical; import java.io.IOException; diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java index b05e68c287..5152b3e981 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java @@ -16,13 +16,14 @@ */ package org.apache.arrow.adapter.avro; +import static org.junit.jupiter.api.Assertions.assertEquals; + import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; - import org.apache.arrow.adapter.avro.consumers.CompositeAvroConsumer; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; import org.apache.arrow.memory.BufferAllocator; @@ -50,8 +51,6 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import static org.junit.jupiter.api.Assertions.*; - public class TestWriteReadAvroRecord { @TempDir public static File TMP; @@ -106,11 +105,14 @@ public void testWriteAndRead() throws Exception { public void testWriteAndReadVSR() throws Exception { BufferAllocator allocator = new RootAllocator(); + FieldType stringNotNull = new FieldType(false, ArrowType.Utf8.INSTANCE, null); + FieldType stringNull = new FieldType(true, ArrowType.Utf8.INSTANCE, null); + FieldType intN32Null = new FieldType(true, new ArrowType.Int(32, true), null); List fields = new ArrayList<>(); - fields.add(new Field("name", new FieldType(false, ArrowType.Utf8.INSTANCE, null), null)); - fields.add(new Field("favorite_number", new FieldType(true, new ArrowType.Int(32, true), null), null)); - fields.add(new Field("favorite_color", new FieldType(true, ArrowType.Utf8.INSTANCE, null), null)); + fields.add(new Field("name", 
stringNotNull, null)); + fields.add(new Field("favorite_number", intN32Null, null)); + fields.add(new Field("favorite_color", stringNull, null)); VarCharVector nameVector = new VarCharVector(fields.get(0), allocator); nameVector.allocateNew(2); From a79ae6958d7d2f63ba286e2e46d9955bab96de81 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Fri, 28 Feb 2025 17:23:01 +0000 Subject: [PATCH 23/89] Remove AutoClosable interface on producers --- .../adapter/avro/producers/AvroArraysProducer.java | 6 ------ .../adapter/avro/producers/AvroMapProducer.java | 6 ------ .../avro/producers/AvroNullableProducer.java | 7 ++----- .../adapter/avro/producers/AvroStructProducer.java | 8 -------- .../adapter/avro/producers/AvroUnionsProducer.java | 8 -------- .../adapter/avro/producers/BaseAvroProducer.java | 5 ----- .../avro/producers/CompositeAvroProducer.java | 13 +------------ .../arrow/adapter/avro/producers/Producer.java | 6 +----- 8 files changed, 4 insertions(+), 55 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java index 07e8a66d75..8ba68814aa 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java @@ -72,10 +72,4 @@ public boolean resetValueVector(ListVector vector) { ((Producer) delegate).resetValueVector(vector.getDataVector()); return super.resetValueVector(vector); } - - @Override - public void close() throws Exception { - delegate.close(); - super.close(); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java index 6141b53922..848c85ff96 100644 --- 
a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java @@ -69,10 +69,4 @@ public boolean resetValueVector(MapVector vector) { ((Producer) delegate).resetValueVector(vector.getDataVector()); return super.resetValueVector(vector); } - - @Override - public void close() throws Exception { - delegate.close(); - super.close(); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java index cc081341c0..53e2542335 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java @@ -23,6 +23,8 @@ /** * Producer wrapper which producers nullable types to an avro encoder. Write the data to the * underlying {@link FieldVector}. + * + * @param The vector within producer or its delegate, used for partially produce purpose. 
*/ public class AvroNullableProducer extends BaseAvroProducer { @@ -69,9 +71,4 @@ public boolean resetValueVector(T vector) { public T getVector() { return delegate.getVector(); } - - @Override - public void close() throws Exception { - delegate.close(); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java index 906909a6b8..e5d8dac7e0 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java @@ -70,12 +70,4 @@ public boolean resetValueVector(StructVector vector) { } return super.resetValueVector(vector); } - - @Override - public void close() throws Exception { - for (Producer delegate : delegates) { - delegate.close(); - } - super.close(); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java index b61f0af83e..7f0323cac3 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java @@ -100,12 +100,4 @@ public boolean resetValueVector(UnionVector vector) { } return result & super.resetValueVector(vector); } - - @Override - public void close() throws Exception { - for (Producer delegate : delegates) { - delegate.close(); - } - super.close(); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java index 2bdde080c4..abc7ebd263 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java +++ 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java @@ -47,11 +47,6 @@ public void setPosition(int index) { currentIndex = index; } - @Override - public void close() throws Exception { - vector.close(); - } - @Override public boolean resetValueVector(T vector) { this.vector = vector; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java index 3f02f1b5f0..a39ae2d3f5 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java @@ -18,13 +18,12 @@ import java.io.IOException; import java.util.List; -import org.apache.arrow.util.AutoCloseables; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.avro.io.Encoder; /** Composite producer which holds all producers. It manages the produce and cleanup process. 
*/ -public class CompositeAvroProducer implements AutoCloseable { +public class CompositeAvroProducer { private final List> producers; @@ -54,14 +53,4 @@ public void resetProducerVectors(VectorSchemaRoot root) { } } } - - @Override - public void close() { - // clean up - try { - AutoCloseables.close(producers); - } catch (Exception e) { - throw new RuntimeException("Error occurs in close.", e); - } - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java index 7f751528d5..983fc41cfe 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java @@ -25,7 +25,7 @@ * * @param The vector within producer or its delegate, used for partially produce purpose. */ -public interface Producer extends AutoCloseable { +public interface Producer { /** * Produce a specific type value from the vector and write it to avro encoder. @@ -41,10 +41,6 @@ public interface Producer extends AutoCloseable { /** Set the position to read value from vector. */ void setPosition(int index); - /** Close this producer. */ - @Override - void close() throws Exception; - /** * Reset the vector within producer. 
* From a171da1e3912f7f00d7167ff082fc71cbeaabf1c Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 1 Mar 2025 01:59:59 +0000 Subject: [PATCH 24/89] First pass of Avro schema builder --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 474 +++++++++++++++++- 1 file changed, 469 insertions(+), 5 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 68e10eece0..cbfc7d9c7b 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -62,10 +62,478 @@ import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; public class ArrowToAvroUtils { + public static final String GENERIC_RECORD_TYPE_NAME = "GenericRecord"; + + /** + * Create an Avro record schema for a given list of Arrow fields. + * + *

This method currently performs the following type mapping from Arrow data types to the + * corresponding Avro data types. + * + * 

    + *
  • ArrowType.Null --> NULL + *
  • ArrowType.Bool --> BOOLEAN + *
  • ArrowType.Int(64 bit, unsigned 32 bit) --> LONG + *
  • ArrowType.Int(signed 32 bit, < 32 bit) --> INT + *
  • ArrowType.FloatingPoint(double) --> DOUBLE + *
  • ArrowType.FloatingPoint(single, half) --> FLOAT + *
  • ArrowType.Utf8 --> STRING + *
  • ArrowType.LargeUtf8 --> STRING + *
  • ArrowType.Binary --> BYTES + *
  • ArrowType.LargeBinary --> BYTES + *
  • ArrowType.FixedSizeBinary --> FIXED + *
  • ArrowType.Decimal --> decimal (FIXED) + *
  • ArrowType.Date --> date (INT) + *
  • ArrowType.Time (bit width <= 32) --> time-millis (INT) + *
  • ArrowType.Time (bit width > 32) --> time-micros (LONG) + *
  • ArrowType.Timestamp (NANOSECONDS, TZ != NULL) --> timestamp-nanos (LONG) + *
  • ArrowType.Timestamp (MICROSECONDS, TZ != NULL) --> timestamp-micros (LONG) + *
  • ArrowType.Timestamp (MILLISECONDS | SECONDS, TZ != NULL) --> timestamp-millis (LONG) + *
  • ArrowType.Timestamp (NANOSECONDS, TZ == NULL) --> local-timestamp-nanos (LONG) + *
  • ArrowType.Timestamp (MICROSECONDS, TZ == NULL) --> local-timestamp-micros (LONG) + *
  • ArrowType.Timestamp (MILLISECONDS | SECONDS, TZ == NULL) --> local-timestamp-millis (LONG) + *
  • ArrowType.Duration --> duration (FIXED) + *
  • ArrowType.Interval --> duration (FIXED) + *
  • ArrowType.Struct --> record + *
  • ArrowType.List --> array + *
  • ArrowType.LargeList --> array + *
  • ArrowType.FixedSizeList --> array + *
  • ArrowType.Map --> map + *
  • ArrowType.Union --> union + *
+ * + *

Nullable fields are represented as a union of [null | base-type]. Special treatment is given + * to nullability of unions - a union is considered nullable if the union field is nullable or any + * of its child fields are nullable. The schema for a nullable union will always contain a null + * type, and none of the direct child types will be nullable. + * + * 

List fields must contain precisely one child field, which may be nullable. Map fields must + * contain precisely two child fields, the key field and the value field. The key field must + * always be of type STRING (Utf8) and cannot be nullable. The value can be of any type and may be + * nullable. Record types must contain at least one child field and cannot contain multiple fields + * with the same name + * + * @param arrowFields The arrow fields used to generate the Avro schema + * @param typeName Name of the top level Avro record type + * @param namespace Namespace of the top level Avro record type + * @return An Avro record schema for the given list of fields, with the specified name and + * namespace + */ + public static Schema createAvroSchema(List arrowFields, String typeName, String namespace) { + SchemaBuilder.RecordBuilder assembler = + SchemaBuilder.record(typeName).namespace(namespace); + return buildRecordSchema(assembler, arrowFields, namespace); + } + + /** Overload provided for convenience, sets namespace = null. */ + public static Schema createAvroSchema(List arrowFields, String typeName) { + return createAvroSchema(arrowFields, typeName, null); + } + + /** Overload provided for convenience, sets name = GENERIC_RECORD_TYPE_NAME. 
*/ + public static Schema createAvroSchema(List arrowFields) { + return createAvroSchema(arrowFields, GENERIC_RECORD_TYPE_NAME); + } + + private static T buildRecordSchema( + SchemaBuilder.RecordBuilder builder, List fields, String namespace) { + if (fields.isEmpty()) { + throw new IllegalArgumentException("Record field must have at least one child field"); + } + SchemaBuilder.FieldAssembler assembler = builder.fields(); + for (Field field : fields) { + assembler = buildFieldSchema(assembler, field, namespace); + } + return assembler.endRecord(); + } + + private static SchemaBuilder.FieldAssembler buildFieldSchema( + SchemaBuilder.FieldAssembler assembler, Field field, String namespace) { + + SchemaBuilder.FieldTypeBuilder builder = assembler.name(field.getName()).type(); + + // Nullable unions need special handling, since union types cannot be directly nested + if (field.getType().getTypeID() != ArrowType.ArrowTypeID.Union) { + boolean unionNullable = field.getChildren().stream().anyMatch(Field::isNullable); + if (field.isNullable() && !unionNullable) { + SchemaBuilder.UnionAccumulator> union = + builder.unionOf().nullType(); + return addTypesToUnion(union, field.getChildren(), namespace).nullDefault(); + } else { + Field headType = field.getChildren().get(0); + List tailTypes = field.getChildren().subList(1, field.getChildren().size()); + SchemaBuilder.UnionAccumulator> union = + buildUnionFieldSchema(builder.unionOf(), headType, namespace); + return addTypesToUnion(union, tailTypes, namespace).noDefault(); + } + } else if (field.isNullable()) { + return buildBaseTypeSchema(builder.optional(), field, namespace); + } else { + return buildBaseFieldSchema(builder, field, namespace); + } + } + + private static T buildArraySchema( + SchemaBuilder.ArrayBuilder builder, Field listField, String namespace) { + if (listField.getChildren().size() != 1) { + throw new IllegalArgumentException("List field must have exactly one child field"); + } + Field itemField = 
listField.getChildren().get(0); + return buildTypeSchema(builder.items(), itemField, namespace); + } + + private static T buildMapSchema( + SchemaBuilder.MapBuilder builder, Field mapField, String namespace) { + if (mapField.getChildren().size() != 2) { + throw new IllegalArgumentException("Map field must have exactly two child fields"); + } + Field keyField = mapField.getChildren().get(0); + Field valueField = mapField.getChildren().get(1); + if (keyField.getType().getTypeID() != ArrowType.ArrowTypeID.Utf8 || keyField.isNullable()) { + throw new IllegalArgumentException( + "Map keys must be of type string and cannot be nullable for conversion to Avro"); + } + return buildTypeSchema(builder.values(), valueField, namespace); + } + + private static T buildTypeSchema( + SchemaBuilder.TypeBuilder builder, Field field, String namespace) { + + // Nullable unions need special handling, since union types cannot be directly nested + if (field.getType().getTypeID() != ArrowType.ArrowTypeID.Union) { + boolean unionNullable = field.getChildren().stream().anyMatch(Field::isNullable); + if (field.isNullable() && !unionNullable) { + SchemaBuilder.UnionAccumulator union = builder.unionOf().nullType(); + return addTypesToUnion(union, field.getChildren(), namespace); + } else { + Field headType = field.getChildren().get(0); + List tailTypes = field.getChildren().subList(1, field.getChildren().size()); + SchemaBuilder.UnionAccumulator union = + buildBaseTypeSchema(builder.unionOf(), headType, namespace); + return addTypesToUnion(union, tailTypes, namespace); + } + } else if (field.isNullable()) { + return buildBaseTypeSchema(builder.nullable(), field, namespace); + } else { + return buildBaseTypeSchema(builder, field, namespace); + } + } + + private static T buildBaseTypeSchema( + SchemaBuilder.BaseTypeBuilder builder, Field field, String namespace) { + + ArrowType.ArrowTypeID typeID = field.getType().getTypeID(); + + switch (typeID) { + case Null: + return builder.nullType(); + + 
case Bool: + return builder.booleanType(); + + case Int: + ArrowType.Int intType = (ArrowType.Int) field.getType(); + if (intType.getBitWidth() > 32 || (intType.getBitWidth() == 32 && !intType.getIsSigned())) { + return builder.longType(); + } else { + return builder.intType(); + } + + case FloatingPoint: + ArrowType.FloatingPoint floatType = (ArrowType.FloatingPoint) field.getType(); + if (floatType.getPrecision() == FloatingPointPrecision.DOUBLE) { + return builder.doubleType(); + } else { + return builder.floatType(); + } + + case Utf8: + case LargeUtf8: + return builder.stringType(); + + case Binary: + case LargeBinary: + return builder.bytesType(); + + case FixedSizeBinary: + ArrowType.FixedSizeBinary fixedType = (ArrowType.FixedSizeBinary) field.getType(); + String fixedTypeName = field.getName(); + int fixedTypeWidth = fixedType.getByteWidth(); + return builder.fixed(fixedTypeName).size(fixedTypeWidth); + + case Decimal: + ArrowType.Decimal decimalType = (ArrowType.Decimal) field.getType(); + return builder + .fixed(field.getName()) + .prop("logicalType", "decimal") + .prop("precision", decimalType.getPrecision()) + .prop("scale", decimalType.getScale()) + .size(decimalType.getBitWidth() / 8); + + case Date: + return builder.intBuilder().prop("logicalType", "date").endInt(); + + case Time: + ArrowType.Time timeType = (ArrowType.Time) field.getType(); + if (timeType.getBitWidth() <= 32) { + return builder.intBuilder().prop("logicalType", "time-millis").endInt(); + } else { + return builder.longBuilder().prop("logicalType", "time-micros").endLong(); + } + + case Timestamp: + ArrowType.Timestamp timestampType = (ArrowType.Timestamp) field.getType(); + String timestampLogicalType = timestampLogicalType(timestampType); + return builder.longBuilder().prop("logicalType", timestampLogicalType).endLong(); + + case Duration: + case Interval: + return builder.fixed(field.getName()).prop("logicalType", "duration").size(12); + + case Struct: + String childNamespace = + 
namespace == null ? field.getName() : namespace + "." + field.getName(); + return buildRecordSchema( + builder.record(field.getName()), field.getChildren(), childNamespace); + + case List: + case LargeList: + case FixedSizeList: + return buildArraySchema(builder.array(), field, namespace); + + case Map: + return buildMapSchema(builder.map(), field, namespace); + + default: + throw new IllegalArgumentException( + "Element type not supported for Avro conversion: " + typeID.name()); + } + } + + private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( + SchemaBuilder.BaseFieldTypeBuilder builder, Field field, String namespace) { + + ArrowType.ArrowTypeID typeID = field.getType().getTypeID(); + + switch (typeID) { + case Null: + return builder.nullType().noDefault(); + + case Bool: + return builder.booleanType().noDefault(); + + case Int: + ArrowType.Int intType = (ArrowType.Int) field.getType(); + if (intType.getBitWidth() > 32 || (intType.getBitWidth() == 32 && !intType.getIsSigned())) { + return builder.longType().noDefault(); + } else { + return builder.intType().noDefault(); + } + + case FloatingPoint: + ArrowType.FloatingPoint floatType = (ArrowType.FloatingPoint) field.getType(); + if (floatType.getPrecision() == FloatingPointPrecision.DOUBLE) { + return builder.doubleType().noDefault(); + } else { + return builder.floatType().noDefault(); + } + + case Utf8: + case LargeUtf8: + return builder.stringType().noDefault(); + + case Binary: + case LargeBinary: + return builder.bytesType().noDefault(); + + case FixedSizeBinary: + ArrowType.FixedSizeBinary fixedType = (ArrowType.FixedSizeBinary) field.getType(); + return builder.fixed(field.getName()).size(fixedType.getByteWidth()).noDefault(); + + case Decimal: + ArrowType.Decimal decimalType = (ArrowType.Decimal) field.getType(); + return builder + .fixed(field.getName()) + .prop("logicalType", "decimal") + .prop("precision", decimalType.getPrecision()) + .prop("scale", decimalType.getScale()) + 
.size(decimalType.getBitWidth() / 8) + .noDefault(); + + case Date: + return builder.intBuilder().prop("logicalType", "date").endInt().noDefault(); + + case Time: + ArrowType.Time timeType = (ArrowType.Time) field.getType(); + if (timeType.getBitWidth() <= 32) { + return builder.intBuilder().prop("logicalType", "time-millis").endInt().noDefault(); + } else { + return builder.longBuilder().prop("logicalType", "time-micros").endLong().noDefault(); + } + + case Timestamp: + ArrowType.Timestamp timestampType = (ArrowType.Timestamp) field.getType(); + String timestampLogicalType = timestampLogicalType(timestampType); + return builder + .longBuilder() + .prop("logicalType", timestampLogicalType) + .endLong() + .noDefault(); + + case Duration: + case Interval: + return builder.fixed(field.getName()).prop("logicalType", "duration").size(12).noDefault(); + + case Struct: + String childNamespace = + namespace == null ? field.getName() : namespace + "." + field.getName(); + return buildRecordSchema( + builder.record(field.getName()), field.getChildren(), childNamespace) + .noDefault(); + + case List: + case LargeList: + case FixedSizeList: + return buildArraySchema(builder.array(), field, namespace).noDefault(); + + case Map: + return buildMapSchema(builder.map(), field, namespace).noDefault(); + + default: + throw new IllegalArgumentException( + "Field type not supported for Avro conversion: " + typeID.name()); + } + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + private static + SchemaBuilder.UnionAccumulator> buildUnionFieldSchema( + SchemaBuilder.UnionFieldTypeBuilder builder, Field field, String namespace) { + + ArrowType.ArrowTypeID typeID = field.getType().getTypeID(); + + switch (typeID) { + case Null: + return (SchemaBuilder.UnionAccumulator) builder.nullType(); + + case Bool: + return (SchemaBuilder.UnionAccumulator) builder.booleanType(); + + case Int: + ArrowType.Int intType = (ArrowType.Int) field.getType(); + if (intType.getBitWidth() > 32 || 
(intType.getBitWidth() == 32 && !intType.getIsSigned())) { + return (SchemaBuilder.UnionAccumulator) builder.longType(); + } else { + return (SchemaBuilder.UnionAccumulator) builder.intType(); + } + + case FloatingPoint: + ArrowType.FloatingPoint floatType = (ArrowType.FloatingPoint) field.getType(); + if (floatType.getPrecision() == FloatingPointPrecision.DOUBLE) { + return (SchemaBuilder.UnionAccumulator) builder.doubleType(); + } else { + return (SchemaBuilder.UnionAccumulator) builder.floatType(); + } + + case Utf8: + case LargeUtf8: + return (SchemaBuilder.UnionAccumulator) builder.stringType(); + + case Binary: + case LargeBinary: + return (SchemaBuilder.UnionAccumulator) builder.bytesType(); + + case FixedSizeBinary: + ArrowType.FixedSizeBinary fixedType = (ArrowType.FixedSizeBinary) field.getType(); + String fixedTypeName = field.getName(); + int fixedTypeWidth = fixedType.getByteWidth(); + return (SchemaBuilder.UnionAccumulator) builder.fixed(fixedTypeName).size(fixedTypeWidth); + + case Decimal: + ArrowType.Decimal decimalType = (ArrowType.Decimal) field.getType(); + return (SchemaBuilder.UnionAccumulator) + builder + .fixed(field.getName()) + .prop("logicalType", "decimal") + .prop("precision", decimalType.getPrecision()) + .prop("scale", decimalType.getScale()) + .size(decimalType.getBitWidth() / 8); + + case Date: + return (SchemaBuilder.UnionAccumulator) + builder.intBuilder().prop("logicalType", "date").endInt(); + + case Time: + ArrowType.Time timeType = (ArrowType.Time) field.getType(); + if (timeType.getBitWidth() <= 32) { + return (SchemaBuilder.UnionAccumulator) + builder.intBuilder().prop("logicalType", "time-millis").endInt(); + } else { + return (SchemaBuilder.UnionAccumulator) + builder.longBuilder().prop("logicalType", "time-micros").endLong(); + } + + case Timestamp: + ArrowType.Timestamp timestampType = (ArrowType.Timestamp) field.getType(); + String timestampLogicalType = timestampLogicalType(timestampType); + return 
(SchemaBuilder.UnionAccumulator) + builder.longBuilder().prop("logicalType", timestampLogicalType).endLong(); + + case Duration: + case Interval: + return (SchemaBuilder.UnionAccumulator) + builder.fixed(field.getName()).prop("logicalType", "duration").size(12); + + case Struct: + String childNamespace = + namespace == null ? field.getName() : namespace + "." + field.getName(); + return (SchemaBuilder.UnionAccumulator) + buildRecordSchema(builder.record(field.getName()), field.getChildren(), childNamespace); + + case List: + case LargeList: + case FixedSizeList: + return (SchemaBuilder.UnionAccumulator) buildArraySchema(builder.array(), field, namespace); + + case Map: + return (SchemaBuilder.UnionAccumulator) buildMapSchema(builder.map(), field, namespace); + + default: + throw new IllegalArgumentException( + "Union member type not supported for Avro conversion: " + typeID.name()); + } + } + + private static T addTypesToUnion( + SchemaBuilder.UnionAccumulator accumulator, List unionFields, String namespace) { + for (var field : unionFields) { + accumulator = buildBaseTypeSchema(accumulator.and(), field, namespace); + } + return accumulator.endUnion(); + } + + private static String timestampLogicalType(ArrowType.Timestamp timestampType) { + boolean zoneAware = timestampType.getTimezone() != null; + if (timestampType.getUnit() == TimeUnit.NANOSECOND) { + return zoneAware ? "timestamp-nanos" : "local-timestamp-nanos"; + } else if (timestampType.getUnit() == TimeUnit.MICROSECOND) { + return zoneAware ? "timestamp-micros" : "local-timestamp-micros"; + } else { + // Timestamp in seconds will be cast to milliseconds, Avro does not support seconds + return zoneAware ? "timestamp-millis" : "local-timestamp-millis"; + } + } + /** * Create a composite Avro producer for a set of field vectors (typically the root set of a VSR). 
* @@ -103,9 +571,6 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu } switch (minorType) { - - // Primitive types with direct mapping to Avro - case NULL: return new AvroNullProducer((NullVector) vector); case BIT: @@ -182,9 +647,8 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu } return new AvroUnionsProducer(unionVector, unionChildProducers); - // Not all Arrow types are supported for encoding (yet)! - default: + // Not all Arrow types are supported for encoding (yet)! String error = String.format( "Encoding Arrow type %s to Avro is not currently supported", minorType.name()); From b3bad23cd09ae44bbeffe40d4a8ec6c8bdf32cc9 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 1 Mar 2025 02:05:08 +0000 Subject: [PATCH 25/89] Include create schema in high level RT test --- .../adapter/avro/TestWriteReadAvroRecord.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java index 5152b3e981..c3fd9c2baf 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java @@ -50,6 +50,8 @@ import org.apache.avro.io.EncoderFactory; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; public class TestWriteReadAvroRecord { @@ -101,8 +103,9 @@ public void testWriteAndRead() throws Exception { assertEquals("red", deUser2.get("favorite_color").toString()); } - @Test - public void testWriteAndReadVSR() throws Exception { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testWriteAndReadVSR(boolean useSchemaFile) throws Exception { 
BufferAllocator allocator = new RootAllocator(); FieldType stringNotNull = new FieldType(false, ArrowType.Utf8.INSTANCE, null); @@ -134,8 +137,11 @@ public void testWriteAndReadVSR() throws Exception { vectors.add(favNumberVector); vectors.add(favColorVector); + Schema schema =useSchemaFile + ? AvroTestBase.getSchema("test.avsc") + : ArrowToAvroUtils.createAvroSchema(fields); + File dataFile = new File(TMP, "test_vsr.avro"); - Schema schema = AvroTestBase.getSchema("test.avsc"); AvroToArrowConfig config = new AvroToArrowConfigBuilder(allocator).build(); try (FileOutputStream fos = new FileOutputStream(dataFile)) { @@ -145,6 +151,8 @@ public void testWriteAndReadVSR() throws Exception { producer.produce(encoder); producer.produce(encoder); + + encoder.flush(); } List roundTripFields = new ArrayList<>(); From 8bdb17fba76b52006aea2d2afee15449b04af02e Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 1 Mar 2025 02:05:14 +0000 Subject: [PATCH 26/89] Fixes in create schema logic --- .../org/apache/arrow/adapter/avro/ArrowToAvroUtils.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index cbfc7d9c7b..fd214e4402 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -163,7 +163,7 @@ private static SchemaBuilder.FieldAssembler buildFieldSchema( SchemaBuilder.FieldTypeBuilder builder = assembler.name(field.getName()).type(); // Nullable unions need special handling, since union types cannot be directly nested - if (field.getType().getTypeID() != ArrowType.ArrowTypeID.Union) { + if (field.getType().getTypeID() == ArrowType.ArrowTypeID.Union) { boolean unionNullable = field.getChildren().stream().anyMatch(Field::isNullable); if (field.isNullable() && 
!unionNullable) { SchemaBuilder.UnionAccumulator> union = @@ -177,7 +177,7 @@ private static SchemaBuilder.FieldAssembler buildFieldSchema( return addTypesToUnion(union, tailTypes, namespace).noDefault(); } } else if (field.isNullable()) { - return buildBaseTypeSchema(builder.optional(), field, namespace); + return buildBaseFieldSchema(builder.nullable(), field, namespace); } else { return buildBaseFieldSchema(builder, field, namespace); } @@ -210,7 +210,7 @@ private static T buildTypeSchema( SchemaBuilder.TypeBuilder builder, Field field, String namespace) { // Nullable unions need special handling, since union types cannot be directly nested - if (field.getType().getTypeID() != ArrowType.ArrowTypeID.Union) { + if (field.getType().getTypeID() == ArrowType.ArrowTypeID.Union) { boolean unionNullable = field.getChildren().stream().anyMatch(Field::isNullable); if (field.isNullable() && !unionNullable) { SchemaBuilder.UnionAccumulator union = builder.unionOf().nullType(); From 22b339f42024fdbe338bde8c044940ca602a6096 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 1 Mar 2025 02:09:56 +0000 Subject: [PATCH 27/89] Spotless fixes --- .../org/apache/arrow/adapter/avro/ArrowToAvroUtils.java | 3 ++- .../apache/arrow/adapter/avro/TestWriteReadAvroRecord.java | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index fd214e4402..439c30f724 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -129,7 +129,8 @@ public class ArrowToAvroUtils { * @return An Avro record schema for the given list of fields, with the specified name and * namespace */ - public static Schema createAvroSchema(List arrowFields, String typeName, String namespace) { + public static Schema 
createAvroSchema( + List arrowFields, String typeName, String namespace) { SchemaBuilder.RecordBuilder assembler = SchemaBuilder.record(typeName).namespace(namespace); return buildRecordSchema(assembler, arrowFields, namespace); diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java index c3fd9c2baf..76e58a75ae 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/TestWriteReadAvroRecord.java @@ -137,9 +137,10 @@ public void testWriteAndReadVSR(boolean useSchemaFile) throws Exception { vectors.add(favNumberVector); vectors.add(favColorVector); - Schema schema =useSchemaFile - ? AvroTestBase.getSchema("test.avsc") - : ArrowToAvroUtils.createAvroSchema(fields); + Schema schema = + useSchemaFile + ? AvroTestBase.getSchema("test.avsc") + : ArrowToAvroUtils.createAvroSchema(fields); File dataFile = new File(TMP, "test_vsr.avro"); AvroToArrowConfig config = new AvroToArrowConfigBuilder(allocator).build(); From 275e9507e3d079cd3d4d0e6ccb35aee57a049d40 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 1 Mar 2025 03:10:39 +0000 Subject: [PATCH 28/89] Simplify decimal producer - always output fixed width (this is what Arrow has internally) --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 3 +- .../logical/AvroDecimalProducer.java | 55 ++++--------------- 2 files changed, 12 insertions(+), 46 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 439c30f724..92cf9f6a1e 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -594,8 +594,7 @@ private static BaseAvroProducer 
createProducer(FieldVector vector, boolean nu // Logical types case DECIMAL: - return new AvroDecimalProducer.FixedDecimalProducer( - (DecimalVector) vector, DecimalVector.TYPE_WIDTH); + return new AvroDecimalProducer((DecimalVector) vector); case DATEDAY: return new AvroDateProducer((DateDayVector) vector); case TIMEMILLI: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java index aaa5d546b6..9cbe871719 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java @@ -18,7 +18,6 @@ import java.io.IOException; import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; -import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.DecimalVector; import org.apache.avro.io.Encoder; @@ -26,53 +25,21 @@ * Producer that produces decimal values from a {@link DecimalVector}, writes data to an Avro * encoder. */ -public abstract class AvroDecimalProducer extends BaseAvroProducer { +public class AvroDecimalProducer extends BaseAvroProducer { + + private final byte[] reuseBytes; /** Instantiate an AvroDecimalProducer. */ - protected AvroDecimalProducer(DecimalVector vector) { + public AvroDecimalProducer(DecimalVector vector) { super(vector); + reuseBytes = new byte[vector.getTypeWidth()]; } - /** Producer for decimal logical type with original bytes type. */ - public static class BytesDecimalProducer extends AvroDecimalProducer { - - private final byte[] reuseBytes; - - /** Instantiate a BytesDecimalConsumer. 
*/ - public BytesDecimalProducer(DecimalVector vector) { - super(vector); - Preconditions.checkArgument( - vector.getTypeWidth() <= 16, "Decimal bytes length should <= 16."); - reuseBytes = new byte[vector.getTypeWidth()]; - } - - @Override - public void produce(Encoder encoder) throws IOException { - long offset = (long) currentIndex * vector.getTypeWidth(); - vector.getDataBuffer().getBytes(offset, reuseBytes); - encoder.writeBytes(reuseBytes); - currentIndex++; - } - } - - /** Producer for decimal logical type with original fixed type. */ - public static class FixedDecimalProducer extends AvroDecimalProducer { - - private final byte[] reuseBytes; - - /** Instantiate a FixedDecimalConsumer. */ - public FixedDecimalProducer(DecimalVector vector, int size) { - super(vector); - Preconditions.checkArgument(size <= 16, "Decimal bytes length should <= 16."); - reuseBytes = new byte[vector.getTypeWidth()]; - } - - @Override - public void produce(Encoder encoder) throws IOException { - long offset = (long) currentIndex * vector.getTypeWidth(); - vector.getDataBuffer().getBytes(offset, reuseBytes); - encoder.writeFixed(reuseBytes); - currentIndex++; - } + @Override + public void produce(Encoder encoder) throws IOException { + long offset = (long) currentIndex * vector.getTypeWidth(); + vector.getDataBuffer().getBytes(offset, reuseBytes); + encoder.writeFixed(reuseBytes); + currentIndex++; } } From d58c2ce87b255ab803a636e8b24d6bbc9c9951e6 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 1 Mar 2025 10:13:41 +0000 Subject: [PATCH 29/89] Update int, long and fixed producers to support logical types --- .../avro/producers/AvroFixedProducer.java | 16 +++++++++++--- .../avro/producers/AvroIntProducer.java | 22 ++++++++++++++++--- .../avro/producers/AvroLongProducer.java | 18 +++++++++++++-- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java index 294eac6db6..4ca4f39dce 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java @@ -17,26 +17,36 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; +import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.avro.io.Encoder; /** * Producer that produces fixed-size binary values from a {@link FixedSizeBinaryVector}, writes data * to an Avro encoder. + * + *

Logical types are also supported, for vectors derived from {@link BaseFixedWidthVector} where + * the internal representation is fixed width bytes and requires no conversion. */ -public class AvroFixedProducer extends BaseAvroProducer { +public class AvroFixedProducer extends BaseAvroProducer { private final byte[] reuseBytes; /** Instantiate an AvroFixedProducer. */ public AvroFixedProducer(FixedSizeBinaryVector vector) { super(vector); - reuseBytes = new byte[vector.getByteWidth()]; + reuseBytes = new byte[vector.getTypeWidth()]; + } + + /** Protected constructor for logical types with a fixed width representation. */ + protected AvroFixedProducer(BaseFixedWidthVector vector) { + super(vector); + reuseBytes = new byte[vector.getTypeWidth()]; } @Override public void produce(Encoder encoder) throws IOException { - long offset = (long) currentIndex * vector.getByteWidth(); + long offset = (long) currentIndex * vector.getTypeWidth(); vector.getDataBuffer().getBytes(offset, reuseBytes); encoder.writeBytes(reuseBytes); currentIndex++; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java index 7899108b99..1a61d2d5e4 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java @@ -17,19 +17,35 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; +import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.IntVector; import org.apache.avro.io.Encoder; -/** Producer that produces int values from an {@link IntVector}, writes data to an avro encoder. */ -public class AvroIntProducer extends BaseAvroProducer { +/** + * Producer that produces int values from an {@link IntVector}, writes data to an avro encoder. + * + *

Logical types are also supported, for vectors derived from {@link BaseFixedWidthVector} where + * the internal representation matches IntVector and requires no conversion. + */ -public class AvroIntProducer extends BaseAvroProducer { +public class AvroIntProducer extends BaseAvroProducer { /** Instantiate an AvroIntConsumer. */ public AvroIntProducer(IntVector vector) { super(vector); } + /** Protected constructor for logical types with an integer representation. */ + protected AvroIntProducer(BaseFixedWidthVector vector) { + super(vector); + if (vector.getTypeWidth() != IntVector.TYPE_WIDTH) { + throw new IllegalArgumentException( + "AvroIntProducer requires type width = " + IntVector.TYPE_WIDTH); + } + } + @Override public void produce(Encoder encoder) throws IOException { - encoder.writeInt(vector.get(currentIndex++)); + int value = vector.getDataBuffer().getInt(currentIndex * (long) IntVector.TYPE_WIDTH); + encoder.writeInt(value); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java index 1767c7f94a..2047639697 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java @@ -17,21 +17,35 @@ package org.apache.arrow.adapter.avro.producers; import java.io.IOException; +import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BigIntVector; import org.apache.avro.io.Encoder; /** * Producer that produces long values from a {@link BigIntVector}, writes data to an Avro encoder. + * + *

Logical types are also supported, for vectors derived from {@link BaseFixedWidthVector} where + * the internal representation matches BigIntVector and requires no conversion. */ -public class AvroLongProducer extends BaseAvroProducer { +public class AvroLongProducer extends BaseAvroProducer { /** Instantiate an AvroLongProducer. */ public AvroLongProducer(BigIntVector vector) { super(vector); } + /** Protected constructor for logical types with a long representation. */ + protected AvroLongProducer(BaseFixedWidthVector vector) { + super(vector); + if (vector.getTypeWidth() != BigIntVector.TYPE_WIDTH) { + throw new IllegalArgumentException( + "AvroLongProducer requires type width = " + BigIntVector.TYPE_WIDTH); + } + } + @Override public void produce(Encoder encoder) throws IOException { - encoder.writeLong(vector.get(currentIndex++)); + long value = vector.getDataBuffer().getLong(currentIndex * (long) BigIntVector.TYPE_WIDTH); + encoder.writeLong(value); } } From f8c219137a2e02cf5b6b2fd8bdd0ade68e0c16e4 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 1 Mar 2025 10:31:13 +0000 Subject: [PATCH 30/89] Support for decimal and decimal 256 as logical types on top of the fixed width producer --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 4 +++ .../logical/AvroDecimal256Producer.java | 34 +++++++++++++++++++ .../logical/AvroDecimalProducer.java | 17 ++-------- 3 files changed, 41 insertions(+), 14 deletions(-) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 92cf9f6a1e..d006618462 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -37,6 +37,7 @@ import 
org.apache.arrow.adapter.avro.producers.Producer; import org.apache.arrow.adapter.avro.producers.logical.AvroDateProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroDecimalProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroDecimal256Producer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMicroProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMillisProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMicroProducer; @@ -46,6 +47,7 @@ import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.Decimal256Vector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float4Vector; @@ -595,6 +597,8 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu case DECIMAL: return new AvroDecimalProducer((DecimalVector) vector); + case DECIMAL256: + return new AvroDecimal256Producer((Decimal256Vector) vector); case DATEDAY: return new AvroDateProducer((DateDayVector) vector); case TIMEMILLI: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java new file mode 100644 index 0000000000..f8cf377dc5 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; +import org.apache.arrow.vector.Decimal256Vector; + +/** + * Producer that produces decimal values from a {@link Decimal256Vector}, writes data to an Avro + * encoder. + */ +public class AvroDecimal256Producer extends AvroFixedProducer { + + // Decimal stored as fixed width bytes, matches Avro decimal encoding + + /** Instantiate an AvroDecimal256Producer. */ + public AvroDecimal256Producer(Decimal256Vector vector) { + super(vector); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java index 9cbe871719..b8f4a2a2e5 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java @@ -16,30 +16,19 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.io.IOException; -import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; import org.apache.arrow.vector.DecimalVector; -import org.apache.avro.io.Encoder; /** * Producer that produces decimal values from a {@link DecimalVector}, writes data to an Avro * encoder.
*/ -public class AvroDecimalProducer extends BaseAvroProducer { +public class AvroDecimalProducer extends AvroFixedProducer { - private final byte[] reuseBytes; + // Decimal stored as fixed width bytes, matches Avro decimal encoding /** Instantiate an AvroDecimalProducer. */ public AvroDecimalProducer(DecimalVector vector) { super(vector); - reuseBytes = new byte[vector.getTypeWidth()]; - } - - @Override - public void produce(Encoder encoder) throws IOException { - long offset = (long) currentIndex * vector.getTypeWidth(); - vector.getDataBuffer().getBytes(offset, reuseBytes); - encoder.writeFixed(reuseBytes); - currentIndex++; } } From 5af9db54e9cf7d79e5cb64aa474567d48b2d291c Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sat, 1 Mar 2025 10:34:05 +0000 Subject: [PATCH 31/89] Support for date day and date millis as logical types, millis uses value conversion --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 25 +++-------- ...Producer.java => AvroDateDayProducer.java} | 15 +++---- .../logical/AvroDateMilliProducer.java | 45 +++++++++++++++++++ 3 files changed, 56 insertions(+), 29 deletions(-) rename adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/{AvroDateProducer.java => AvroDateDayProducer.java} (74%) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index d006618462..42bd5668be 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -35,7 +35,8 @@ import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; import org.apache.arrow.adapter.avro.producers.Producer; -import 
org.apache.arrow.adapter.avro.producers.logical.AvroDateProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroDateDayProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroDateMilliProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroDecimalProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroDecimal256Producer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMicroProducer; @@ -43,23 +44,7 @@ import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMicroProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMillisProducer; import org.apache.arrow.util.Preconditions; -import org.apache.arrow.vector.BigIntVector; -import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateDayVector; -import org.apache.arrow.vector.DecimalVector; -import org.apache.arrow.vector.Decimal256Vector; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.FixedSizeBinaryVector; -import org.apache.arrow.vector.Float4Vector; -import org.apache.arrow.vector.Float8Vector; -import org.apache.arrow.vector.IntVector; -import org.apache.arrow.vector.NullVector; -import org.apache.arrow.vector.TimeMicroVector; -import org.apache.arrow.vector.TimeMilliVector; -import org.apache.arrow.vector.TimeStampMicroVector; -import org.apache.arrow.vector.TimeStampMilliVector; -import org.apache.arrow.vector.VarBinaryVector; -import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.*; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; @@ -600,7 +585,9 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu case DECIMAL256: return new AvroDecimal256Producer((Decimal256Vector) vector); case DATEDAY: - return new AvroDateProducer((DateDayVector) vector); + return new AvroDateDayProducer((DateDayVector) vector); + case 
DATEMILLI: + return new AvroDateMilliProducer((DateMilliVector) vector); case TIMEMILLI: return new AvroTimeMillisProducer((TimeMilliVector) vector); case TIMEMICRO: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateDayProducer.java similarity index 74% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateDayProducer.java index 17729b3923..36680fb196 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateDayProducer.java @@ -16,23 +16,18 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.io.IOException; -import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.adapter.avro.producers.AvroIntProducer; import org.apache.arrow.vector.DateDayVector; -import org.apache.avro.io.Encoder; /** * Producer that produces date values from a {@link DateDayVector}, writes data to an Avro encoder. */ -public class AvroDateProducer extends BaseAvroProducer { +public class AvroDateDayProducer extends AvroIntProducer { + + // Date stored as integer number of days, matches Avro date type /** Instantiate an AvroDateProducer. 
*/ - public AvroDateProducer(DateDayVector vector) { + public AvroDateDayProducer(DateDayVector vector) { super(vector); } - - @Override - public void produce(Encoder encoder) throws IOException { - encoder.writeInt(vector.get(currentIndex++)); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java new file mode 100644 index 0000000000..9d5cf46c0a --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces date values from a {@link DateMilliVector}, writes data to an Avro + * encoder. 
+ */ +public class AvroDateMilliProducer extends BaseAvroProducer { + + // Convert milliseconds to days for Avro date type + + private static final long MILLIS_PER_DAY = 86400000; + + /** Instantiate an AvroDateMilliProducer. */ + public AvroDateMilliProducer(DateMilliVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + long millis = vector.getDataBuffer().getLong(currentIndex * (long) DateMilliVector.TYPE_WIDTH); + long days = millis / MILLIS_PER_DAY; + encoder.writeInt((int) days); + } +} From 8100f8d0830467e18985aaffa913297468203f72 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 10:27:15 +0000 Subject: [PATCH 32/89] Fix missing counter increments --- .../org/apache/arrow/adapter/avro/producers/AvroIntProducer.java | 1 + .../apache/arrow/adapter/avro/producers/AvroLongProducer.java | 1 + .../adapter/avro/producers/logical/AvroDateMilliProducer.java | 1 + 3 files changed, 3 insertions(+) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java index 1a61d2d5e4..9407e5c9de 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java @@ -47,5 +47,6 @@ protected AvroIntProducer(BaseFixedWidthVector vector) { public void produce(Encoder encoder) throws IOException { int value = vector.getDataBuffer().getInt(currentIndex * (long) IntVector.TYPE_WIDTH); encoder.writeInt(value); + currentIndex++; } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java index 2047639697..c98a54c2e8 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java +++ 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java @@ -47,5 +47,6 @@ protected AvroLongProducer(BaseFixedWidthVector vector) { public void produce(Encoder encoder) throws IOException { long value = vector.getDataBuffer().getLong(currentIndex * (long) BigIntVector.TYPE_WIDTH); encoder.writeLong(value); + currentIndex++; } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java index 9d5cf46c0a..45628c42c5 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java @@ -41,5 +41,6 @@ public void produce(Encoder encoder) throws IOException { long millis = vector.getDataBuffer().getLong(currentIndex * (long) DateMilliVector.TYPE_WIDTH); long days = millis / MILLIS_PER_DAY; encoder.writeInt((int) days); + currentIndex++; } } From be27d6e64c25f414855efdac891712cc4e2a51b6 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 10:45:36 +0000 Subject: [PATCH 33/89] Support all type width / unit combinations for time logical type --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 46 ++++++++++++++---- .../logical/AvroTimeMicroProducer.java | 13 ++--- ...oducer.java => AvroTimeMilliProducer.java} | 15 ++---- .../logical/AvroTimeNanoProducer.java | 46 ++++++++++++++++++ .../logical/AvroTimeSecProducer.java | 47 +++++++++++++++++++ 5 files changed, 139 insertions(+), 28 deletions(-) rename adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/{AvroTimeMillisProducer.java => AvroTimeMilliProducer.java} (73%) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeNanoProducer.java create mode 100644 
adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 42bd5668be..06f97c6989 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -37,14 +37,35 @@ import org.apache.arrow.adapter.avro.producers.Producer; import org.apache.arrow.adapter.avro.producers.logical.AvroDateDayProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroDateMilliProducer; -import org.apache.arrow.adapter.avro.producers.logical.AvroDecimalProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroDecimal256Producer; +import org.apache.arrow.adapter.avro.producers.logical.AvroDecimalProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMicroProducer; -import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMillisProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimeMilliProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimeNanoProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimeSecProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMicroProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMillisProducer; import org.apache.arrow.util.Preconditions; -import org.apache.arrow.vector.*; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import 
org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; @@ -81,8 +102,8 @@ public class ArrowToAvroUtils { *

  • ArrowType.FixedSizeBinary --> FIXED *
  • ArrowType.Decimal --> decimal (FIXED) *
  • ArrowType.Date --> date (INT) - *
  • ArrowType.Time (bit width <= 32) --> time-millis (INT) - *
  • ArrowType.Time (bit width > 32) --> time-micros (LONG) + *
  • ArrowType.Time (MILLI) --> time-millis (INT) + *
  • ArrowType.Time (SEC | MICRO | NANO) --> time-micros (LONG) *
  • ArrowType.Timestamp (NANOSECONDS, TZ != NULL) --> time-nanos (LONG) *
  • ArrowType.Timestamp (MICROSECONDS, TZ != NULL) --> time-micros (LONG) *
  • ArrowType.Timestamp (MILLISECONDS | SECONDS, TZ != NULL) --> time-millis (LONG) @@ -273,9 +294,10 @@ private static T buildBaseTypeSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); - if (timeType.getBitWidth() <= 32) { + if (timeType.getUnit() == TimeUnit.MILLISECOND) { return builder.intBuilder().prop("logicalType", "time-millis").endInt(); } else { + // All other time types (sec, micro, nano) are encoded as time-micros (LONG) return builder.longBuilder().prop("logicalType", "time-micros").endLong(); } @@ -363,9 +385,10 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); - if (timeType.getBitWidth() <= 32) { + if (timeType.getUnit() == TimeUnit.MILLISECOND) { return builder.intBuilder().prop("logicalType", "time-millis").endInt().noDefault(); } else { + // All other time types (sec, micro, nano) are encoded as time-micros (LONG) return builder.longBuilder().prop("logicalType", "time-micros").endLong().noDefault(); } @@ -463,11 +486,12 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); - if (timeType.getBitWidth() <= 32) { + if (timeType.getUnit() == TimeUnit.MILLISECOND) { return (SchemaBuilder.UnionAccumulator) builder.intBuilder().prop("logicalType", "time-millis").endInt(); } else { return (SchemaBuilder.UnionAccumulator) + // All other time types (sec, micro, nano) are encoded as time-micros (LONG) builder.longBuilder().prop("logicalType", "time-micros").endLong(); } @@ -588,10 +612,14 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu return new AvroDateDayProducer((DateDayVector) vector); case DATEMILLI: return new AvroDateMilliProducer((DateMilliVector) vector); + case TIMESEC: + return new AvroTimeSecProducer((TimeSecVector) vector); case TIMEMILLI: - return new AvroTimeMillisProducer((TimeMilliVector) vector); + return 
new AvroTimeMilliProducer((TimeMilliVector) vector); case TIMEMICRO: return new AvroTimeMicroProducer((TimeMicroVector) vector); + case TIMENANO: + return new AvroTimeNanoProducer((TimeNanoVector) vector); case TIMESTAMPMILLI: return new AvroTimestampMillisProducer((TimeStampMilliVector) vector); case TIMESTAMPMICRO: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java index 2da5df730a..36ed458d5c 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java @@ -16,24 +16,19 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.io.IOException; -import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.adapter.avro.producers.AvroLongProducer; import org.apache.arrow.vector.TimeMicroVector; -import org.apache.avro.io.Encoder; /** * Producer that produces time (microseconds) values from a {@link TimeMicroVector}, writes data to * an Avro encoder. */ -public class AvroTimeMicroProducer extends BaseAvroProducer { +public class AvroTimeMicroProducer extends AvroLongProducer { + + // Time in microseconds stored as long, matches Avro time-micros type /** Instantiate an AvroTimeMicroProducer. 
*/ public AvroTimeMicroProducer(TimeMicroVector vector) { super(vector); } - - @Override - public void produce(Encoder encoder) throws IOException { - encoder.writeLong(vector.get(currentIndex++)); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMilliProducer.java similarity index 73% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMilliProducer.java index 39a57712a2..c33e3aa174 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMillisProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMilliProducer.java @@ -16,24 +16,19 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.io.IOException; -import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.adapter.avro.producers.AvroIntProducer; import org.apache.arrow.vector.TimeMilliVector; -import org.apache.avro.io.Encoder; /** * Producer that produces time (milliseconds) values from a {@link TimeMilliVector}, writes data to * an Avro encoder. */ -public class AvroTimeMillisProducer extends BaseAvroProducer { +public class AvroTimeMilliProducer extends AvroIntProducer { + + // Time in milliseconds stored as integer, matches Avro time-millis type /** Instantiate an AvroTimeMillisProducer. 
*/ - public AvroTimeMillisProducer(TimeMilliVector vector) { + public AvroTimeMilliProducer(TimeMilliVector vector) { super(vector); } - - @Override - public void produce(Encoder encoder) throws IOException { - encoder.writeInt(vector.get(currentIndex++)); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeNanoProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeNanoProducer.java new file mode 100644 index 0000000000..7034dbbb50 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeNanoProducer.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that converts nanoseconds from a {@link TimeNanoVector} and produces time (microseconds) + * values, writes data to an Avro encoder. 
+ */ +public class AvroTimeNanoProducer extends BaseAvroProducer { + + // Convert nanoseconds to microseconds for Avro time-micros (LONG) type + // Range is 1000 times less than for microseconds, so the type will fit (with loss of precision) + + private static final long NANOS_PER_MICRO = 1000; + + public AvroTimeNanoProducer(TimeNanoVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + long nanos = vector.getDataBuffer().getLong(currentIndex * (long) TimeNanoVector.TYPE_WIDTH); + long micros = nanos / NANOS_PER_MICRO; + encoder.writeLong(micros); + currentIndex++; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java new file mode 100644 index 0000000000..443a4af8ab --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that converts seconds from a {@link TimeSecVector} and produces time (microseconds) + * values, writes data to an Avro encoder. + */ +public class AvroTimeSecProducer extends BaseAvroProducer { + + // Convert seconds to microseconds for Avro time-micros (LONG) type + // Range is 1000 times more than for milliseconds, so won't fit into time-millis (INT) + + private static final long MICROS_PER_SECOND = 1000; + + /** Instantiate an AvroTimeSecProducer. */ + public AvroTimeSecProducer(TimeSecVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + int seconds = vector.getDataBuffer().getInt(currentIndex * (long) TimeSecVector.TYPE_WIDTH); + long micros = seconds * MICROS_PER_SECOND; + encoder.writeLong(micros); + currentIndex++; + } +} From 7b6d3e02556d1b9e642abc4de8abeaad9efb2fb8 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 10:45:48 +0000 Subject: [PATCH 34/89] Update comment for date milli producer --- .../adapter/avro/producers/logical/AvroDateMilliProducer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java index 45628c42c5..0ce0beaca6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java @@ -22,8 +22,8 @@ import org.apache.avro.io.Encoder; /** - * Producer that produces date values from a {@link DateMilliVector}, writes data to an Avro 
- * encoder. + * Producer that converts days in milliseconds from a {@link DateMilliVector} and produces date + * (INT) values, writes data to an Avro encoder. */ public class AvroDateMilliProducer extends BaseAvroProducer { From 2a5bffb0375e699cd86be18d3d04b52e585ae168 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 12:47:00 +0000 Subject: [PATCH 35/89] Producers for local timestamps --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 12 ++++- .../logical/AvroTimestampMicroProducer.java | 15 ++---- ...r.java => AvroTimestampMilliProducer.java} | 19 +++---- .../logical/AvroTimestampNanoProducer.java | 34 ++++++++++++ .../logical/AvroTimestampSecProducer.java | 52 +++++++++++++++++++ 5 files changed, 108 insertions(+), 24 deletions(-) rename adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/{AvroTimestampMillisProducer.java => AvroTimestampMilliProducer.java} (63%) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 06f97c6989..a06326cb75 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -44,7 +44,9 @@ import org.apache.arrow.adapter.avro.producers.logical.AvroTimeNanoProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimeSecProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMicroProducer; -import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMillisProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMilliProducer; +import 
org.apache.arrow.adapter.avro.producers.logical.AvroTimestampNanoProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampSecProducer; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; @@ -64,6 +66,8 @@ import org.apache.arrow.vector.TimeSecVector; import org.apache.arrow.vector.TimeStampMicroVector; import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.ListVector; @@ -620,10 +624,14 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu return new AvroTimeMicroProducer((TimeMicroVector) vector); case TIMENANO: return new AvroTimeNanoProducer((TimeNanoVector) vector); + case TIMESTAMPSEC: + return new AvroTimestampSecProducer((TimeStampSecVector) vector); case TIMESTAMPMILLI: - return new AvroTimestampMillisProducer((TimeStampMilliVector) vector); + return new AvroTimestampMilliProducer((TimeStampMilliVector) vector); case TIMESTAMPMICRO: return new AvroTimestampMicroProducer((TimeStampMicroVector) vector); + case TIMESTAMPNANO: + return new AvroTimestampNanoProducer((TimeStampNanoVector) vector); // Complex types diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java index 4569ed977f..688a9cf55a 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java @@ -16,24 +16,19 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.io.IOException; 
-import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.adapter.avro.producers.AvroLongProducer; import org.apache.arrow.vector.TimeStampMicroVector; -import org.apache.avro.io.Encoder; /** - * Producer that produces timestamp (microseconds) values from a {@link TimeStampMicroVector}, + * Producer that produces local timestamp (microseconds) values from a {@link TimeStampMicroVector}, * writes data to an Avro encoder. */ -public class AvroTimestampMicroProducer extends BaseAvroProducer { +public class AvroTimestampMicroProducer extends AvroLongProducer { + + // Local timestamp in epoch microseconds stored as long, matches Avro local-timestamp-micros type /** Instantiate an AvroTimestampMicroProducer. */ public AvroTimestampMicroProducer(TimeStampMicroVector vector) { super(vector); } - - @Override - public void produce(Encoder encoder) throws IOException { - encoder.writeLong(vector.get(currentIndex++)); - } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMillisProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliProducer.java similarity index 63% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMillisProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliProducer.java index 837f33b32f..c9265ad719 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMillisProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliProducer.java @@ -16,24 +16,19 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.io.IOException; -import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.adapter.avro.producers.AvroLongProducer; import 
org.apache.arrow.vector.TimeStampMilliVector; -import org.apache.avro.io.Encoder; /** - * Producer that produces timestamp (milliseconds) values from a {@link TimeStampMilliVector}, + * Producer that produces local timestamp (milliseconds) values from a {@link TimeStampMilliVector}, * writes data to an Avro encoder. */ -public class AvroTimestampMillisProducer extends BaseAvroProducer { +public class AvroTimestampMilliProducer extends AvroLongProducer { - /** Instantiate an AvroTimestampMillisProducer. */ - public AvroTimestampMillisProducer(TimeStampMilliVector vector) { - super(vector); - } + // Local timestamp in epoch milliseconds stored as long, matches Avro local-timestamp-millis type - @Override - public void produce(Encoder encoder) throws IOException { - encoder.writeLong(vector.get(currentIndex++)); + /** Instantiate an AvroTimestampMilliProducer. */ + public AvroTimestampMilliProducer(TimeStampMilliVector vector) { + super(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoProducer.java new file mode 100644 index 0000000000..1c503e569a --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoProducer.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import org.apache.arrow.adapter.avro.producers.AvroLongProducer; +import org.apache.arrow.vector.TimeStampNanoVector; + +/** + * Producer that produces local timestamp (nanoseconds) values from a {@link TimeStampNanoVector}, + * writes data to an Avro encoder. + */ +public class AvroTimestampNanoProducer extends AvroLongProducer { + + // Local timestamp in epoch nanoseconds stored as long, matches Avro local-timestamp-nanos type + + /** Instantiate an AvroTimestampNanoProducer. */ + public AvroTimestampNanoProducer(TimeStampNanoVector vector) { + super(vector); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java new file mode 100644 index 0000000000..6a5ec4085e --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that converts epoch seconds from a {@link TimeStampSecVector} and produces local time + * (milliseconds) values, writes data to an Avro encoder. + */ +public class AvroTimestampSecProducer extends BaseAvroProducer { + + // Convert epoch seconds to milliseconds for Avro local-timestamp-millis type + // Check for overflow and raise an exception + + private static final long MILLIS_PER_SECOND = 1000; + private static final long OVERFLOW_LIMIT = Long.MAX_VALUE / MILLIS_PER_SECOND; + + /** Instantiate an AvroTimestampSecProducer. 
*/ + public AvroTimestampSecProducer(TimeStampSecVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + long seconds = + vector.getDataBuffer().getLong(currentIndex * (long) TimeStampSecVector.TYPE_WIDTH); + if (Math.abs(seconds) > OVERFLOW_LIMIT) { + throw new ArithmeticException("Timestamp value is too large for Avro encoding"); + } + long millis = seconds * MILLIS_PER_SECOND; + encoder.writeLong(millis); + currentIndex++; + } +} From fab4e7b444a2306d6b85d84741a4cef156c28b15 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 12:47:13 +0000 Subject: [PATCH 36/89] Fixes for time producers --- .../adapter/avro/producers/logical/AvroTimeMilliProducer.java | 2 +- .../adapter/avro/producers/logical/AvroTimeSecProducer.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMilliProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMilliProducer.java index c33e3aa174..2a452e75a4 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMilliProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMilliProducer.java @@ -27,7 +27,7 @@ public class AvroTimeMilliProducer extends AvroIntProducer { // Time in milliseconds stored as integer, matches Avro time-millis type - /** Instantiate an AvroTimeMillisProducer. */ + /** Instantiate an AvroTimeMilliProducer. 
*/ public AvroTimeMilliProducer(TimeMilliVector vector) { super(vector); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java index 443a4af8ab..87ebbb04c3 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java @@ -30,7 +30,7 @@ public class AvroTimeSecProducer extends BaseAvroProducer { // Convert seconds to microseconds for Avro time-micros (LONG) type // Range is 1000 times more than for milliseconds, so won't fit into time-millis (INT) - private static final long MICROS_PER_SECOND = 1000; + private static final long MICROS_PER_SECOND = 1000000; /** Instantiate an AvroTimeSecProducer. */ public AvroTimeSecProducer(TimeSecVector vector) { From c523f40aa6db3b7b8a3b5136daae2e695399c339 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 14:23:38 +0000 Subject: [PATCH 37/89] Support for zone-aware timestamp types --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 16 +++++ .../logical/AvroTimestampMicroTzProducer.java | 49 +++++++++++++++ .../logical/AvroTimestampMilliTzProducer.java | 41 +++++++++++++ .../logical/AvroTimestampNanoTzProducer.java | 47 +++++++++++++++ .../logical/AvroTimestampSecProducer.java | 2 +- .../logical/AvroTimestampSecTzProducer.java | 59 +++++++++++++++++++ .../logical/BaseTimestampTzProducer.java | 58 ++++++++++++++++++ 7 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroTzProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliTzProducer.java create mode 100644 
adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoTzProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/BaseTimestampTzProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index a06326cb75..02c06c4c9d 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -44,9 +44,13 @@ import org.apache.arrow.adapter.avro.producers.logical.AvroTimeNanoProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimeSecProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMicroProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMicroTzProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMilliProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampMilliTzProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampNanoProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampNanoTzProducer; import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampSecProducer; +import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampSecTzProducer; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; @@ -64,9 +68,13 @@ import org.apache.arrow.vector.TimeMilliVector; import org.apache.arrow.vector.TimeNanoVector; import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; import org.apache.arrow.vector.TimeStampMicroVector; +import 
org.apache.arrow.vector.TimeStampMilliTZVector; import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; import org.apache.arrow.vector.TimeStampSecVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; @@ -632,6 +640,14 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu return new AvroTimestampMicroProducer((TimeStampMicroVector) vector); case TIMESTAMPNANO: return new AvroTimestampNanoProducer((TimeStampNanoVector) vector); + case TIMESTAMPSECTZ: + return new AvroTimestampSecTzProducer((TimeStampSecTZVector) vector); + case TIMESTAMPMILLITZ: + return new AvroTimestampMilliTzProducer((TimeStampMilliTZVector) vector); + case TIMESTAMPMICROTZ: + return new AvroTimestampMicroTzProducer((TimeStampMicroTZVector) vector); + case TIMESTAMPNANOTZ: + return new AvroTimestampNanoTzProducer((TimeStampNanoTZVector) vector); // Complex types diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroTzProducer.java new file mode 100644 index 0000000000..ee80d73fda --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroTzProducer.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.time.Instant; +import java.time.ZoneId; +import org.apache.arrow.vector.TimeStampMicroTZVector; + +/** + * Producer that converts timestamps in zone-aware epoch microseconds from a {@link + * TimeStampMicroTZVector} and produces UTC timestamp (microseconds) values, writes data to an Avro + * encoder. + */ +public class AvroTimestampMicroTzProducer extends BaseTimestampTzProducer { + + private static final long MICROS_PER_SECOND = 1000000; + private static final long NANOS_PER_MICRO = 1000; + + /** Instantiate an AvroTimestampMicroTzProducer. */ + public AvroTimestampMicroTzProducer(TimeStampMicroTZVector vector) { + super(vector, vector.getTimeZone(), MICROS_PER_SECOND); + } + + @Override + protected long convertToUtc(long tzValue, ZoneId zoneId) { + // For negative values, e.g. -.5 seconds = -1 second + .5 in micros + long tzSeconds = tzValue >= 0 ? 
tzValue / MICROS_PER_SECOND : tzValue / MICROS_PER_SECOND - 1; + long tzMicro = tzValue % MICROS_PER_SECOND; + Instant utcInstant = + Instant.ofEpochSecond(tzSeconds, tzMicro * NANOS_PER_MICRO).atZone(zoneId).toInstant(); + long utcSeconds = utcInstant.getEpochSecond(); + long utcMicro = utcInstant.getNano() / NANOS_PER_MICRO; + return utcSeconds * MICROS_PER_SECOND + utcMicro; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliTzProducer.java new file mode 100644 index 0000000000..cfde9612af --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliTzProducer.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.time.Instant; +import java.time.ZoneId; +import org.apache.arrow.vector.TimeStampMilliTZVector; + +/** + * Producer that converts timestamps in zone-aware epoch milliseconds from a {@link + * TimeStampMilliTZVector} and produces UTC timestamp (milliseconds) values, writes data to an Avro + * encoder. + */ +public class AvroTimestampMilliTzProducer extends BaseTimestampTzProducer { + + private static final long MILLIS_PER_SECOND = 1000; + + /** Instantiate an AvroTimestampMilliTzProducer. */ + public AvroTimestampMilliTzProducer(TimeStampMilliTZVector vector) { + super(vector, vector.getTimeZone(), MILLIS_PER_SECOND); + } + + @Override + protected long convertToUtc(long tzValue, ZoneId zoneId) { + return Instant.ofEpochMilli(tzValue).atZone(zoneId).toInstant().toEpochMilli(); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoTzProducer.java new file mode 100644 index 0000000000..168ab7171d --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoTzProducer.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.time.Instant; +import java.time.ZoneId; +import org.apache.arrow.vector.TimeStampNanoTZVector; + +/** + * Producer that converts timestamps in zone-aware epoch nanoseconds from a {@link + * TimeStampNanoTZVector} and produces UTC timestamp (nanoseconds) values, writes data to an Avro + * encoder. + */ +public class AvroTimestampNanoTzProducer extends BaseTimestampTzProducer { + + private static final long NANOS_PER_SECOND = 1000000000; + + /** Instantiate an AvroTimestampNanoTzProducer. */ + public AvroTimestampNanoTzProducer(TimeStampNanoTZVector vector) { + super(vector, vector.getTimeZone(), NANOS_PER_SECOND); + } + + @Override + protected long convertToUtc(long tzValue, ZoneId zoneId) { + // For negative values, e.g. -.5 seconds = -1 second + .5 in nanos + long tzSeconds = tzValue >= 0 ? tzValue / NANOS_PER_SECOND : tzValue / NANOS_PER_SECOND - 1; + long tzNano = tzValue % NANOS_PER_SECOND; + Instant utcInstant = Instant.ofEpochSecond(tzSeconds, tzNano).atZone(zoneId).toInstant(); + long utcSeconds = utcInstant.getEpochSecond(); + long utcNano = utcInstant.getNano(); + return utcSeconds * NANOS_PER_SECOND + utcNano; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java index 6a5ec4085e..ebc29d9b34 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java @@ -27,7 +27,7 @@ */ public class AvroTimestampSecProducer extends BaseAvroProducer { - // Convert epoch seconds to milliseconds for Avro local-timestamp-millis type + // Avro does not 
support timestamps in seconds, so convert to local-timestamp-millis type // Check for overflow and raise an exception private static final long MILLIS_PER_SECOND = 1000; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java new file mode 100644 index 0000000000..b04939b788 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; +import java.time.Instant; +import java.time.ZoneId; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that converts timestamps in zone-aware epoch seconds from a {@link TimeStampSecTZVector} + * and produces UTC timestamp (millisecond) values, writes data to an Avro encoder. 
+ */ +public class AvroTimestampSecTzProducer extends BaseTimestampTzProducer { + + // Avro does not support timestamps in seconds, so convert to timestamp-millis type + // Check for overflow and raise an exception + + private static final long MILLIS_PER_SECOND = 1000; + private static final long OVERFLOW_LIMIT = Long.MAX_VALUE / MILLIS_PER_SECOND; + + /** Instantiate an AvroTimestampSecTzProducer. */ + public AvroTimestampSecTzProducer(TimeStampSecTZVector vector) { + super(vector, vector.getTimeZone(), 1); + } + + @Override + protected long convertToUtc(long tzValue, ZoneId zoneId) { + return Instant.ofEpochSecond(tzValue).atZone(zoneId).toInstant().getEpochSecond(); + } + + @Override + public void produce(Encoder encoder) throws IOException { + long tzSeconds = + vector.getDataBuffer().getLong(currentIndex * (long) TimeStampVector.TYPE_WIDTH); + long utcSeconds = fixedOffsetFlag ? tzSeconds + fixedOffset : convertToUtc(tzSeconds, zoneId); + if (Math.abs(utcSeconds) > OVERFLOW_LIMIT) { + throw new ArithmeticException("Timestamp value is too large for Avro encoding"); + } + long utcMillis = utcSeconds * MILLIS_PER_SECOND; + encoder.writeLong(utcMillis); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/BaseTimestampTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/BaseTimestampTzProducer.java new file mode 100644 index 0000000000..cffc3b219e --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/BaseTimestampTzProducer.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers.logical; + +import java.io.IOException; +import java.time.ZoneId; +import java.time.ZoneOffset; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; +import org.apache.arrow.vector.TimeStampVector; +import org.apache.avro.io.Encoder; + +abstract class BaseTimestampTzProducer + extends BaseAvroProducer { + + // Convert TZ values to UTC to encode Avro timestamp types + // Where possible, used a fixed offset (zero for UTC or missing zone info) + // Conversion using named zones is more expensive and depends on the concrete type + + protected final ZoneId zoneId; + protected final boolean fixedOffsetFlag; + protected final long fixedOffset; + + protected abstract long convertToUtc(long tzValue, ZoneId zoneId); + + protected BaseTimestampTzProducer(T vector, String zoneName, long offsetMultiplier) { + super(vector); + zoneId = zoneName != null ? ZoneId.of(zoneName) : ZoneOffset.UTC; + if (zoneId instanceof ZoneOffset) { + ZoneOffset offset = (ZoneOffset) zoneId; + fixedOffsetFlag = true; + fixedOffset = (long) offset.getTotalSeconds() * offsetMultiplier; + } else { + fixedOffsetFlag = false; + fixedOffset = 0; + } + } + + @Override + public void produce(Encoder encoder) throws IOException { + long tzValue = vector.getDataBuffer().getLong(currentIndex * (long) TimeStampVector.TYPE_WIDTH); + long utcValue = fixedOffsetFlag ? 
tzValue + fixedOffset : convertToUtc(tzValue, zoneId); + encoder.writeLong(utcValue); + } +} From f1ac4014376eb9fcb0e6539f55989387738319eb Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 14:28:09 +0000 Subject: [PATCH 38/89] Remove duration types from schema (not supported yet) --- .../apache/arrow/adapter/avro/ArrowToAvroUtils.java | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 02c06c4c9d..bb68c4f269 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -318,10 +318,6 @@ private static T buildBaseTypeSchema( String timestampLogicalType = timestampLogicalType(timestampType); return builder.longBuilder().prop("logicalType", timestampLogicalType).endLong(); - case Duration: - case Interval: - return builder.fixed(field.getName()).prop("logicalType", "duration").size(12); - case Struct: String childNamespace = namespace == null ? field.getName() : namespace + "." + field.getName(); @@ -413,10 +409,6 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( .endLong() .noDefault(); - case Duration: - case Interval: - return builder.fixed(field.getName()).prop("logicalType", "duration").size(12).noDefault(); - case Struct: String childNamespace = namespace == null ? field.getName() : namespace + "." + field.getName(); @@ -513,11 +505,6 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( return (SchemaBuilder.UnionAccumulator) builder.longBuilder().prop("logicalType", timestampLogicalType).endLong(); - case Duration: - case Interval: - return (SchemaBuilder.UnionAccumulator) - builder.fixed(field.getName()).prop("logicalType", "duration").size(12); - case Struct: String childNamespace = namespace == null ? 
field.getName() : namespace + "." + field.getName(); From 6af475e5286ea91e20051d3e871f134122da6336 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 14:36:04 +0000 Subject: [PATCH 39/89] Support all type widths for floating point types --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 12 ++++-- .../avro/producers/AvroFloat2Producer.java | 40 +++++++++++++++++++ ...tProducer.java => AvroFloat4Producer.java} | 10 +++-- ...eProducer.java => AvroFloat8Producer.java} | 10 +++-- 4 files changed, 60 insertions(+), 12 deletions(-) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat2Producer.java rename adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/{AvroFloatProducer.java => AvroFloat4Producer.java} (78%) rename adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/{AvroDoubleProducer.java => AvroFloat8Producer.java} (79%) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index bb68c4f269..96a2971c14 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -21,9 +21,10 @@ import org.apache.arrow.adapter.avro.producers.AvroArraysProducer; import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; -import org.apache.arrow.adapter.avro.producers.AvroDoubleProducer; import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; -import org.apache.arrow.adapter.avro.producers.AvroFloatProducer; +import org.apache.arrow.adapter.avro.producers.AvroFloat2Producer; +import org.apache.arrow.adapter.avro.producers.AvroFloat4Producer; +import org.apache.arrow.adapter.avro.producers.AvroFloat8Producer; import org.apache.arrow.adapter.avro.producers.AvroIntProducer; import 
org.apache.arrow.adapter.avro.producers.AvroLongProducer; import org.apache.arrow.adapter.avro.producers.AvroMapProducer; @@ -60,6 +61,7 @@ import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; @@ -590,10 +592,12 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu return new AvroIntProducer((IntVector) vector); case BIGINT: return new AvroLongProducer((BigIntVector) vector); + case FLOAT2: + return new AvroFloat2Producer((Float2Vector) vector); case FLOAT4: - return new AvroFloatProducer((Float4Vector) vector); + return new AvroFloat4Producer((Float4Vector) vector); case FLOAT8: - return new AvroDoubleProducer((Float8Vector) vector); + return new AvroFloat8Producer((Float8Vector) vector); case VARBINARY: return new AvroBytesProducer((VarBinaryVector) vector); case FIXEDSIZEBINARY: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat2Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat2Producer.java new file mode 100644 index 0000000000..07e5ea3591 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat2Producer.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import org.apache.arrow.memory.util.Float16; +import org.apache.arrow.vector.Float2Vector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces float values from a {@link Float2Vector}, writes data to an Avro encoder. + */ +public class AvroFloat2Producer extends BaseAvroProducer { + + /** Instantiate an AvroFloat2Producer. */ + public AvroFloat2Producer(Float2Vector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + short rawValue = vector.getDataBuffer().getShort(currentIndex * (long) Float2Vector.TYPE_WIDTH); + encoder.writeFloat(Float16.toFloat(rawValue)); + currentIndex++; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat4Producer.java similarity index 78% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat4Producer.java index f714f4534e..5121ba3a11 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloatProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat4Producer.java @@ -23,15 +23,17 @@ /** * Producer that produces float values from a {@link Float4Vector}, writes data to an Avro encoder. 
*/ -public class AvroFloatProducer extends BaseAvroProducer { +public class AvroFloat4Producer extends BaseAvroProducer { - /** Instantiate an AvroFloatProducer. */ - public AvroFloatProducer(Float4Vector vector) { + /** Instantiate an AvroFloat4Producer. */ + public AvroFloat4Producer(Float4Vector vector) { super(vector); } @Override public void produce(Encoder encoder) throws IOException { - encoder.writeFloat(vector.get(currentIndex++)); + float value = vector.getDataBuffer().getFloat(currentIndex * (long) Float4Vector.TYPE_WIDTH); + encoder.writeFloat(value); + currentIndex++; } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat8Producer.java similarity index 79% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat8Producer.java index c770b9845f..05fca750ba 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDoubleProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFloat8Producer.java @@ -23,15 +23,17 @@ /** * Producer that produces double values from a {@link Float8Vector}, writes data to an Avro encoder. */ -public class AvroDoubleProducer extends BaseAvroProducer { +public class AvroFloat8Producer extends BaseAvroProducer { - /** Instantiate an AvroDoubleProducer. */ - public AvroDoubleProducer(Float8Vector vector) { + /** Instantiate an AvroFloat8Producer. 
*/ + public AvroFloat8Producer(Float8Vector vector) { super(vector); } @Override public void produce(Encoder encoder) throws IOException { - encoder.writeDouble(vector.get(currentIndex++)); + double value = vector.getDataBuffer().getDouble(currentIndex * (long) Float8Vector.TYPE_WIDTH); + encoder.writeDouble(value); + currentIndex++; } } From 8f82d0fe18b64394728286d889ead2086510aeb8 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 14:49:45 +0000 Subject: [PATCH 40/89] Support all type widths for signed integer types --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 12 +++++- ...gProducer.java => AvroBigIntProducer.java} | 10 ++--- .../avro/producers/AvroIntProducer.java | 2 +- .../avro/producers/AvroSmallIntProducer.java | 39 +++++++++++++++++++ .../avro/producers/AvroTinyIntProducer.java | 39 +++++++++++++++++++ .../logical/AvroTimeMicroProducer.java | 4 +- .../logical/AvroTimestampMicroProducer.java | 4 +- .../logical/AvroTimestampMilliProducer.java | 4 +- .../logical/AvroTimestampNanoProducer.java | 4 +- 9 files changed, 102 insertions(+), 16 deletions(-) rename adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/{AvroLongProducer.java => AvroBigIntProducer.java} (84%) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroSmallIntProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroTinyIntProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 96a2971c14..5e52c89f6e 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.arrow.adapter.avro.producers.AvroArraysProducer; +import 
org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; @@ -26,12 +27,13 @@ import org.apache.arrow.adapter.avro.producers.AvroFloat4Producer; import org.apache.arrow.adapter.avro.producers.AvroFloat8Producer; import org.apache.arrow.adapter.avro.producers.AvroIntProducer; -import org.apache.arrow.adapter.avro.producers.AvroLongProducer; import org.apache.arrow.adapter.avro.producers.AvroMapProducer; import org.apache.arrow.adapter.avro.producers.AvroNullProducer; import org.apache.arrow.adapter.avro.producers.AvroNullableProducer; +import org.apache.arrow.adapter.avro.producers.AvroSmallIntProducer; import org.apache.arrow.adapter.avro.producers.AvroStringProducer; import org.apache.arrow.adapter.avro.producers.AvroStructProducer; +import org.apache.arrow.adapter.avro.producers.AvroTinyIntProducer; import org.apache.arrow.adapter.avro.producers.AvroUnionsProducer; import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; @@ -66,6 +68,7 @@ import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeMilliVector; import org.apache.arrow.vector.TimeNanoVector; @@ -78,6 +81,7 @@ import org.apache.arrow.vector.TimeStampNanoVector; import org.apache.arrow.vector.TimeStampSecTZVector; import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.ListVector; @@ -588,10 +592,14 @@ private static BaseAvroProducer 
createProducer(FieldVector vector, boolean nu return new AvroNullProducer((NullVector) vector); case BIT: return new AvroBooleanProducer((BitVector) vector); + case TINYINT: + return new AvroTinyIntProducer((TinyIntVector) vector); + case SMALLINT: + return new AvroSmallIntProducer((SmallIntVector) vector); case INT: return new AvroIntProducer((IntVector) vector); case BIGINT: - return new AvroLongProducer((BigIntVector) vector); + return new AvroBigIntProducer((BigIntVector) vector); case FLOAT2: return new AvroFloat2Producer((Float2Vector) vector); case FLOAT4: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBigIntProducer.java similarity index 84% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBigIntProducer.java index c98a54c2e8..9712e157c6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroLongProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBigIntProducer.java @@ -27,19 +27,19 @@ *

    Logical types are also supported, for vectors derived from {@link BaseFixedWidthVector} where * the internal representation matches BigIntVector and requires no conversion. */ -public class AvroLongProducer extends BaseAvroProducer { +public class AvroBigIntProducer extends BaseAvroProducer { - /** Instantiate an AvroLongProducer. */ - public AvroLongProducer(BigIntVector vector) { + /** Instantiate an AvroBigIntProducer. */ + public AvroBigIntProducer(BigIntVector vector) { super(vector); } /** Protected constructor for logical types with a long representation. */ - protected AvroLongProducer(BaseFixedWidthVector vector) { + protected AvroBigIntProducer(BaseFixedWidthVector vector) { super(vector); if (vector.getTypeWidth() != BigIntVector.TYPE_WIDTH) { throw new IllegalArgumentException( - "AvroLongProducer requires type width = " + BigIntVector.TYPE_WIDTH); + "AvroBigIntProducer requires type width = " + BigIntVector.TYPE_WIDTH); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java index 9407e5c9de..4c9cc9b71a 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroIntProducer.java @@ -39,7 +39,7 @@ protected AvroIntProducer(BaseFixedWidthVector vector) { super(vector); if (vector.getTypeWidth() != IntVector.TYPE_WIDTH) { throw new IllegalArgumentException( - "AvroLongProducer requires type width = " + IntVector.TYPE_WIDTH); + "AvroIntProducer requires type width = " + IntVector.TYPE_WIDTH); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroSmallIntProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroSmallIntProducer.java new file mode 100644 index 0000000000..9c37750d9f --- /dev/null +++ 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroSmallIntProducer.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces int values from an {@link SmallIntVector}, writes data to an avro encoder. + */ +public class AvroSmallIntProducer extends BaseAvroProducer { + + /** Instantiate an AvroSmallIntProducer. 
*/ + public AvroSmallIntProducer(SmallIntVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + short value = vector.getDataBuffer().getShort(currentIndex * (long) SmallIntVector.TYPE_WIDTH); + encoder.writeInt(value); + currentIndex++; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroTinyIntProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroTinyIntProducer.java new file mode 100644 index 0000000000..30a80e5094 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroTinyIntProducer.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces int values from an {@link TinyIntVector}, writes data to an avro encoder. + */ +public class AvroTinyIntProducer extends BaseAvroProducer { + + /** Instantiate an AvroTinyIntProducer. 
*/ + public AvroTinyIntProducer(TinyIntVector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + byte value = vector.getDataBuffer().getByte(currentIndex * (long) TinyIntVector.TYPE_WIDTH); + encoder.writeInt(value); + currentIndex++; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java index 36ed458d5c..203d102034 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeMicroProducer.java @@ -16,14 +16,14 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import org.apache.arrow.adapter.avro.producers.AvroLongProducer; +import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.vector.TimeMicroVector; /** * Producer that produces time (microseconds) values from a {@link TimeMicroVector}, writes data to * an Avro encoder. 
*/ -public class AvroTimeMicroProducer extends AvroLongProducer { +public class AvroTimeMicroProducer extends AvroBigIntProducer { // Time in microseconds stored as long, matches Avro time-micros type diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java index 688a9cf55a..4e744b5e76 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroProducer.java @@ -16,14 +16,14 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import org.apache.arrow.adapter.avro.producers.AvroLongProducer; +import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.vector.TimeStampMicroVector; /** * Producer that produces local timestamp (microseconds) values from a {@link TimeStampMicroVector}, * writes data to an Avro encoder. 
*/ -public class AvroTimestampMicroProducer extends AvroLongProducer { +public class AvroTimestampMicroProducer extends AvroBigIntProducer { // Local timestamp in epoch microseconds stored as long, matches Avro local-timestamp-micros type diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliProducer.java index c9265ad719..e71acff220 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliProducer.java @@ -16,14 +16,14 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import org.apache.arrow.adapter.avro.producers.AvroLongProducer; +import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.vector.TimeStampMilliVector; /** * Producer that produces local timestamp (milliseconds) values from a {@link TimeStampMilliVector}, * writes data to an Avro encoder. 
*/ -public class AvroTimestampMilliProducer extends AvroLongProducer { +public class AvroTimestampMilliProducer extends AvroBigIntProducer { // Local timestamp in epoch milliseconds stored as long, matches Avro local-timestamp-millis type diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoProducer.java index 1c503e569a..9e172ea91e 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoProducer.java @@ -16,14 +16,14 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import org.apache.arrow.adapter.avro.producers.AvroLongProducer; +import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.vector.TimeStampNanoVector; /** * Producer that produces local timestamp (nanoseconds) values from a {@link TimeStampNanoVector}, * writes data to an Avro encoder. 
*/ -public class AvroTimestampNanoProducer extends AvroLongProducer { +public class AvroTimestampNanoProducer extends AvroBigIntProducer { // Local timestamp in epoch nanoseconds stored as long, matches Avro local-timestamp-nanos type From c3620ca933a13601b8ec17fff739e6ef48ad8ed5 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 15:06:55 +0000 Subject: [PATCH 41/89] Add support for all type widths of unsigned integers --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 16 +++++++ .../avro/producers/AvroUint1Producer.java | 38 +++++++++++++++++ .../avro/producers/AvroUint2Producer.java | 38 +++++++++++++++++ .../avro/producers/AvroUint4Producer.java | 40 ++++++++++++++++++ .../avro/producers/AvroUint8Producer.java | 42 +++++++++++++++++++ 5 files changed, 174 insertions(+) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint1Producer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint2Producer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint4Producer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint8Producer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 5e52c89f6e..750bbc792f 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -34,6 +34,10 @@ import org.apache.arrow.adapter.avro.producers.AvroStringProducer; import org.apache.arrow.adapter.avro.producers.AvroStructProducer; import org.apache.arrow.adapter.avro.producers.AvroTinyIntProducer; +import org.apache.arrow.adapter.avro.producers.AvroUint1Producer; +import org.apache.arrow.adapter.avro.producers.AvroUint2Producer; +import 
org.apache.arrow.adapter.avro.producers.AvroUint4Producer; +import org.apache.arrow.adapter.avro.producers.AvroUint8Producer; import org.apache.arrow.adapter.avro.producers.AvroUnionsProducer; import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; @@ -82,6 +86,10 @@ import org.apache.arrow.vector.TimeStampSecTZVector; import org.apache.arrow.vector.TimeStampSecVector; import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.ListVector; @@ -600,6 +608,14 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu return new AvroIntProducer((IntVector) vector); case BIGINT: return new AvroBigIntProducer((BigIntVector) vector); + case UINT1: + return new AvroUint1Producer((UInt1Vector) vector); + case UINT2: + return new AvroUint2Producer((UInt2Vector) vector); + case UINT4: + return new AvroUint4Producer((UInt4Vector) vector); + case UINT8: + return new AvroUint8Producer((UInt8Vector) vector); case FLOAT2: return new AvroFloat2Producer((Float2Vector) vector); case FLOAT4: diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint1Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint1Producer.java new file mode 100644 index 0000000000..59ce44b662 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint1Producer.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.avro.io.Encoder; + +/** Producer that produces int values from a {@link UInt1Vector}, writes data to an avro encoder. */ +public class AvroUint1Producer extends BaseAvroProducer { + + /** Instantiate an AvroUint1Producer. */ + public AvroUint1Producer(UInt1Vector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + byte unsigned = vector.getDataBuffer().getByte(currentIndex * (long) UInt1Vector.TYPE_WIDTH); + int signed = unsigned & 0xff; + encoder.writeInt(signed); + currentIndex++; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint2Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint2Producer.java new file mode 100644 index 0000000000..8dde176cf6 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint2Producer.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.avro.io.Encoder; + +/** Producer that produces int values from a {@link UInt2Vector}, writes data to an avro encoder. */ +public class AvroUint2Producer extends BaseAvroProducer { + + /** Instantiate an AvroUint2Producer. */ + public AvroUint2Producer(UInt2Vector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + short unsigned = vector.getDataBuffer().getShort(currentIndex * (long) UInt2Vector.TYPE_WIDTH); + int signed = unsigned & 0xffff; + encoder.writeInt(signed); + currentIndex++; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint4Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint4Producer.java new file mode 100644 index 0000000000..d02ffc55f3 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint4Producer.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces long values from a {@link UInt4Vector}, writes data to an avro encoder. + */ +public class AvroUint4Producer extends BaseAvroProducer { + + /** Instantiate an AvroUint4Producer. */ + public AvroUint4Producer(UInt4Vector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + int unsigned = vector.getDataBuffer().getInt(currentIndex * (long) UInt4Vector.TYPE_WIDTH); + long signed = unsigned & 0xffffffffL; + encoder.writeLong(signed); + currentIndex++; + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint8Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint8Producer.java new file mode 100644 index 0000000000..819b4d4140 --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint8Producer.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces long values from a {@link UInt8Vector}, writes data to an avro encoder. + */ +public class AvroUint8Producer extends BaseAvroProducer { + + /** Instantiate an AvroUint8Producer. */ + public AvroUint8Producer(UInt8Vector vector) { + super(vector); + } + + @Override + public void produce(Encoder encoder) throws IOException { + long unsigned = vector.getDataBuffer().getLong(currentIndex * (long) UInt8Vector.TYPE_WIDTH); + if (unsigned < 0) { + throw new ArithmeticException("Unsigned long value is too large for Avro encoding"); + } + encoder.writeLong(unsigned); + currentIndex++; + } +} From c223ffb8ccd4f001713232faee41edebeb0d875c Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 16:21:12 +0000 Subject: [PATCH 42/89] Support list and fixed list, do not support large list for now --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 16 +++-- .../producers/AvroFixedSizeListProducer.java | 69 +++++++++++++++++++ ...aysProducer.java => AvroListProducer.java} | 18 ++--- 3 files changed, 87 insertions(+), 16 deletions(-) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java rename 
adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/{AvroArraysProducer.java => AvroListProducer.java} (79%) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 750bbc792f..0d8b6639df 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -18,15 +18,16 @@ import java.util.ArrayList; import java.util.List; -import org.apache.arrow.adapter.avro.producers.AvroArraysProducer; import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; +import org.apache.arrow.adapter.avro.producers.AvroFixedSizeListProducer; import org.apache.arrow.adapter.avro.producers.AvroFloat2Producer; import org.apache.arrow.adapter.avro.producers.AvroFloat4Producer; import org.apache.arrow.adapter.avro.producers.AvroFloat8Producer; import org.apache.arrow.adapter.avro.producers.AvroIntProducer; +import org.apache.arrow.adapter.avro.producers.AvroListProducer; import org.apache.arrow.adapter.avro.producers.AvroMapProducer; import org.apache.arrow.adapter.avro.producers.AvroNullProducer; import org.apache.arrow.adapter.avro.producers.AvroNullableProducer; @@ -92,6 +93,7 @@ import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; @@ -339,7 +341,6 @@ private static T buildBaseTypeSchema( builder.record(field.getName()), field.getChildren(), 
childNamespace); case List: - case LargeList: case FixedSizeList: return buildArraySchema(builder.array(), field, namespace); @@ -431,7 +432,6 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( .noDefault(); case List: - case LargeList: case FixedSizeList: return buildArraySchema(builder.array(), field, namespace).noDefault(); @@ -526,7 +526,6 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( buildRecordSchema(builder.record(field.getName()), field.getChildren(), childNamespace); case List: - case LargeList: case FixedSizeList: return (SchemaBuilder.UnionAccumulator) buildArraySchema(builder.array(), field, namespace); @@ -680,7 +679,14 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu ListVector listVector = (ListVector) vector; FieldVector itemVector = listVector.getDataVector(); Producer itemProducer = createProducer(itemVector, itemVector.getField().isNullable()); - return new AvroArraysProducer(listVector, itemProducer); + return new AvroListProducer(listVector, itemProducer); + + case FIXED_SIZE_LIST: + FixedSizeListVector fixedListVector = (FixedSizeListVector) vector; + FieldVector fixedItemVector = fixedListVector.getDataVector(); + Producer fixedItemProducer = + createProducer(fixedItemVector, fixedItemVector.getField().isNullable()); + return new AvroFixedSizeListProducer(fixedListVector, fixedItemProducer); case MAP: MapVector mapVector = (MapVector) vector; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java new file mode 100644 index 0000000000..0e144de0ca --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import java.io.IOException; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.avro.io.Encoder; + +/** + * Producer that produces array values from a {@link FixedSizeListVector}, writes data to an avro + * encoder. + */ +public class AvroFixedSizeListProducer extends BaseAvroProducer { + + private final Producer delegate; + + /** Instantiate an AvroFixedSizeListProducer. 
 */ + public AvroFixedSizeListProducer( + FixedSizeListVector vector, Producer delegate) { + super(vector); + this.delegate = delegate; + } + + @Override + public void produce(Encoder encoder) throws IOException { + + encoder.writeArrayStart(); + encoder.setItemCount(vector.getListSize()); + + for (int i = 0; i < vector.getListSize(); i++) { + encoder.startItem(); + delegate.produce(encoder); + } + + encoder.writeArrayEnd(); + currentIndex++; + } + + // Do not override skipNull(), the delegate vector will not hold data + + @Override + public void setPosition(int index) { + int delegateOffset = vector.getOffsetBuffer().getInt(index * (long) Integer.BYTES); + delegate.setPosition(delegateOffset); + super.setPosition(index); + } + + @Override + @SuppressWarnings("unchecked") + public boolean resetValueVector(FixedSizeListVector vector) { + ((Producer) delegate).resetValueVector(vector.getDataVector()); + return super.resetValueVector(vector); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java similarity index 79% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java index 8ba68814aa..6dda502a32 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroArraysProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java @@ -22,15 +22,14 @@ import org.apache.avro.io.Encoder; /** - * Producer which produces array type values to an Avro encoder. Writes the data from a {@link - * ListVector}. + * Producer that produces array values from a {@link ListVector}, writes data to an avro encoder. 
 */ -public class AvroArraysProducer extends BaseAvroProducer { +public class AvroListProducer extends BaseAvroProducer { private final Producer delegate; - /** Instantiate an ArraysProducer. */ - public AvroArraysProducer(ListVector vector, Producer delegate) { + /** Instantiate an AvroListProducer. */ + public AvroListProducer(ListVector vector, Producer delegate) { super(vector); this.delegate = delegate; } @@ -54,15 +53,12 @@ public void produce(Encoder encoder) throws IOException { currentIndex++; } - @Override - public void skipNull() { - delegate.skipNull(); - super.skipNull(); - } + // Do not override skipNull(), the delegate vector will not hold data @Override public void setPosition(int index) { - delegate.setPosition(index); + int delegateOffset = vector.getOffsetBuffer().getInt(index * (long) Integer.BYTES); + delegate.setPosition(delegateOffset); super.setPosition(index); } From b974053b0590816c59f981c308f42120ebd7fa60 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 16:22:33 +0000 Subject: [PATCH 43/89] Do not include large types in schema generation --- .../org/apache/arrow/adapter/avro/ArrowToAvroUtils.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 0d8b6639df..21fc11a873 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -295,11 +295,9 @@ private static T buildBaseTypeSchema( } case Utf8: - case LargeUtf8: return builder.stringType(); case Binary: - case LargeBinary: return builder.bytesType(); case FixedSizeBinary: @@ -382,11 +380,9 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( } case Utf8: - case LargeUtf8: return builder.stringType().noDefault(); case Binary: - case LargeBinary: return 
builder.bytesType().noDefault(); case FixedSizeBinary: @@ -475,11 +471,9 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( } case Utf8: - case LargeUtf8: return (SchemaBuilder.UnionAccumulator) builder.stringType(); case Binary: - case LargeBinary: return (SchemaBuilder.UnionAccumulator) builder.bytesType(); case FixedSizeBinary: From 0a468ff8c928a243f529088b7aeef83bab9cf49c Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 16:42:22 +0000 Subject: [PATCH 44/89] Include support for dense union --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 17 +++++++-- .../producers/AvroDenseUnionProducer.java | 36 +++++++++++++++++++ .../avro/producers/AvroUnionProducer.java | 35 ++++++++++++++++++ ...nsProducer.java => BaseUnionProducer.java} | 24 ++++++------- 4 files changed, 97 insertions(+), 15 deletions(-) create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDenseUnionProducer.java create mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionProducer.java rename adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/{AvroUnionsProducer.java => BaseUnionProducer.java} (81%) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 21fc11a873..e677623756 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -21,6 +21,7 @@ import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; +import org.apache.arrow.adapter.avro.producers.AvroDenseUnionProducer; import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; import 
org.apache.arrow.adapter.avro.producers.AvroFixedSizeListProducer; import org.apache.arrow.adapter.avro.producers.AvroFloat2Producer; @@ -39,7 +40,7 @@ import org.apache.arrow.adapter.avro.producers.AvroUint2Producer; import org.apache.arrow.adapter.avro.producers.AvroUint4Producer; import org.apache.arrow.adapter.avro.producers.AvroUint8Producer; -import org.apache.arrow.adapter.avro.producers.AvroUnionsProducer; +import org.apache.arrow.adapter.avro.producers.AvroUnionProducer; import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; import org.apache.arrow.adapter.avro.producers.Producer; @@ -93,6 +94,7 @@ import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; @@ -703,7 +705,18 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu unionChildProducers[i] = createProducer(unionChildVector, /* nullable = */ false); // Do not nest union types } - return new AvroUnionsProducer(unionVector, unionChildProducers); + return new AvroUnionProducer(unionVector, unionChildProducers); + + case DENSEUNION: + DenseUnionVector denseUnionVector = (DenseUnionVector) vector; + List denseChildVectors = denseUnionVector.getChildrenFromFields(); + Producer[] denseChildProducers = new Producer[denseChildVectors.size()]; + for (int i = 0; i < denseChildVectors.size(); i++) { + FieldVector denseChildVector = denseChildVectors.get(i); + denseChildProducers[i] = + createProducer(denseChildVector, /* nullable = */ false); // Do not nest union types + } + return new AvroDenseUnionProducer(denseUnionVector, denseChildProducers); default: // Not all Arrow types are supported for encoding (yet)! 
diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDenseUnionProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDenseUnionProducer.java new file mode 100644 index 0000000000..1735c72e4e --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDenseUnionProducer.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import org.apache.arrow.vector.complex.DenseUnionVector; + +/** + * Producer which produces union values from a {@link DenseUnionVector}, writes data to an avro + * encoder. + */ +public class AvroDenseUnionProducer extends BaseUnionProducer { + + /** Instantiate an AvroDenseUnionProducer. 
*/ + public AvroDenseUnionProducer(DenseUnionVector vector, Producer[] delegates) { + super(vector, delegates); + } + + @Override + protected int getCurrentTypeIndex() { + return vector.getTypeId(currentIndex); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionProducer.java new file mode 100644 index 0000000000..dfe82d821e --- /dev/null +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionProducer.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro.producers; + +import org.apache.arrow.vector.complex.UnionVector; + +/** + * Producer which produces union values from a {@link UnionVector}, writes data to an avro encoder. + */ +public class AvroUnionProducer extends BaseUnionProducer { + + /** Instantiate an AvroUnionProducer. 
*/ + public AvroUnionProducer(UnionVector vector, Producer[] delegates) { + super(vector, delegates); + } + + @Override + protected int getCurrentTypeIndex() { + return vector.getTypeValue(currentIndex); + } +} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java similarity index 81% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java index 7f0323cac3..e5295fb292 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionsProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java @@ -19,23 +19,22 @@ import java.io.IOException; import java.util.List; import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.UnionMode; import org.apache.avro.io.Encoder; -/** - * Producer which produces unions type values to avro encoder. Write the data to {@link - * org.apache.arrow.vector.complex.UnionVector}. - */ -public class AvroUnionsProducer extends BaseAvroProducer { +abstract class BaseUnionProducer extends BaseAvroProducer { + + // Logic is substantially the same for union and dense union, just dense union resolves offsets + // For methods not available on FieldVector some calls are delegate to the child class private final Producer[] delegates; private final UnionMode unionMode; private final int nullTypeIndex; - /** Instantiate an AvroUnionsProducer. 
*/ - public AvroUnionsProducer(UnionVector vector, Producer[] delegates) { + protected abstract int getCurrentTypeIndex(); + + public BaseUnionProducer(T vector, Producer[] delegates) { super(vector); this.delegates = delegates; if (vector.getMinorType() == Types.MinorType.DENSEUNION) { @@ -43,11 +42,10 @@ public AvroUnionsProducer(UnionVector vector, Producer[] delegates) { } else { this.unionMode = UnionMode.Sparse; } - this.nullTypeIndex = findNullTypeIndex(); + this.nullTypeIndex = findNullTypeIndex(vector.getChildrenFromFields()); } - private int findNullTypeIndex() { - List childVectors = vector.getChildrenFromFields(); + protected int findNullTypeIndex(List childVectors) { for (int i = 0; i < childVectors.size(); i++) { if (childVectors.get(i).getMinorType() == Types.MinorType.NULL) { return i; @@ -65,7 +63,7 @@ public void produce(Encoder encoder) throws IOException { encoder.writeNull(); } else { - int typeIndex = vector.getTypeValue(currentIndex); + int typeIndex = getCurrentTypeIndex(); int typeVectorIndex; if (unionMode == UnionMode.Dense) { @@ -92,7 +90,7 @@ public void produce(Encoder encoder) throws IOException { @Override @SuppressWarnings("unchecked") - public boolean resetValueVector(UnionVector vector) { + public boolean resetValueVector(T vector) { boolean result = true; for (int i = 0; i < delegates.length; i++) { Producer delegate = (Producer) delegates[i]; From 904b978902756019bfbb8310e273cf9239715402 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 2 Mar 2025 16:43:27 +0000 Subject: [PATCH 45/89] Rename fixed size binary producer to match convention --- .../org/apache/arrow/adapter/avro/ArrowToAvroUtils.java | 4 ++-- ...ixedProducer.java => AvroFixedSizeBinaryProducer.java} | 8 ++++---- .../avro/producers/logical/AvroDecimal256Producer.java | 4 ++-- .../avro/producers/logical/AvroDecimalProducer.java | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) rename 
adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/{AvroFixedProducer.java => AvroFixedSizeBinaryProducer.java} (86%) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index e677623756..677c9f1031 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -22,7 +22,7 @@ import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; import org.apache.arrow.adapter.avro.producers.AvroDenseUnionProducer; -import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; +import org.apache.arrow.adapter.avro.producers.AvroFixedSizeBinaryProducer; import org.apache.arrow.adapter.avro.producers.AvroFixedSizeListProducer; import org.apache.arrow.adapter.avro.producers.AvroFloat2Producer; import org.apache.arrow.adapter.avro.producers.AvroFloat4Producer; @@ -620,7 +620,7 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu case VARBINARY: return new AvroBytesProducer((VarBinaryVector) vector); case FIXEDSIZEBINARY: - return new AvroFixedProducer((FixedSizeBinaryVector) vector); + return new AvroFixedSizeBinaryProducer((FixedSizeBinaryVector) vector); case VARCHAR: return new AvroStringProducer((VarCharVector) vector); diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeBinaryProducer.java similarity index 86% rename from adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java rename to adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeBinaryProducer.java index 4ca4f39dce..eadb95d6f1 100644 --- 
a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeBinaryProducer.java @@ -28,18 +28,18 @@ *

    Logical types are also supported, for vectors derived from {@link BaseFixedWidthVector} where * the internal representation is fixed width bytes and requires no conversion. */ -public class AvroFixedProducer extends BaseAvroProducer { +public class AvroFixedSizeBinaryProducer extends BaseAvroProducer { private final byte[] reuseBytes; - /** Instantiate an AvroFixedProducer. */ - public AvroFixedProducer(FixedSizeBinaryVector vector) { + /** Instantiate an AvroFixedSizeBinaryProducer. */ + public AvroFixedSizeBinaryProducer(FixedSizeBinaryVector vector) { super(vector); reuseBytes = new byte[vector.getTypeWidth()]; } /** Protected constructor for logical types with a fixed width representation. */ - protected AvroFixedProducer(BaseFixedWidthVector vector) { + protected AvroFixedSizeBinaryProducer(BaseFixedWidthVector vector) { super(vector); reuseBytes = new byte[vector.getTypeWidth()]; } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java index f8cf377dc5..76067a496d 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java @@ -16,14 +16,14 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; +import org.apache.arrow.adapter.avro.producers.AvroFixedSizeBinaryProducer; import org.apache.arrow.vector.Decimal256Vector; /** * Producer that produces decimal values from a {@link Decimal256Vector}, writes data to an Avro * encoder. 
*/ -public class AvroDecimal256Producer extends AvroFixedProducer { +public class AvroDecimal256Producer extends AvroFixedSizeBinaryProducer { // Decimal stored as fixed width bytes, matches Avro decimal encoding diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java index b8f4a2a2e5..033b45b7cc 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java @@ -16,14 +16,14 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import org.apache.arrow.adapter.avro.producers.AvroFixedProducer; +import org.apache.arrow.adapter.avro.producers.AvroFixedSizeBinaryProducer; import org.apache.arrow.vector.DecimalVector; /** * Producer that produces decimal values from a {@link DecimalVector}, writes data to an Avro * encoder. 
*/ -public class AvroDecimalProducer extends AvroFixedProducer { +public class AvroDecimalProducer extends AvroFixedSizeBinaryProducer { // Decimal stored as fixed width bytes, matches Avro decimal encoding From 68667491e65c4f259c8de53c06f0ea613542e9f6 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 11:21:52 +0000 Subject: [PATCH 46/89] Remove boolean return flag on resetValueVector --- .../adapter/avro/producers/AvroFixedSizeListProducer.java | 3 +-- .../arrow/adapter/avro/producers/AvroListProducer.java | 3 +-- .../arrow/adapter/avro/producers/AvroMapProducer.java | 3 +-- .../adapter/avro/producers/AvroNullableProducer.java | 4 ++-- .../arrow/adapter/avro/producers/AvroStructProducer.java | 3 +-- .../arrow/adapter/avro/producers/BaseAvroProducer.java | 3 +-- .../arrow/adapter/avro/producers/BaseUnionProducer.java | 6 ++---- .../adapter/avro/producers/CompositeAvroProducer.java | 4 +--- .../org/apache/arrow/adapter/avro/producers/Producer.java | 8 ++------ 9 files changed, 12 insertions(+), 25 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java index 0e144de0ca..bf299253db 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java @@ -62,8 +62,7 @@ public void setPosition(int index) { @Override @SuppressWarnings("unchecked") - public boolean resetValueVector(FixedSizeListVector vector) { + public void resetValueVector(FixedSizeListVector vector) { ((Producer) delegate).resetValueVector(vector.getDataVector()); - return super.resetValueVector(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java index 6dda502a32..1aa61fa6c6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java @@ -64,8 +64,7 @@ public void setPosition(int index) { @Override @SuppressWarnings("unchecked") - public boolean resetValueVector(ListVector vector) { + public void resetValueVector(ListVector vector) { ((Producer) delegate).resetValueVector(vector.getDataVector()); - return super.resetValueVector(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java index 848c85ff96..d95bbe99c9 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java @@ -65,8 +65,7 @@ public void setPosition(int index) { @Override @SuppressWarnings("unchecked") - public boolean resetValueVector(MapVector vector) { + public void resetValueVector(MapVector vector) { ((Producer) delegate).resetValueVector(vector.getDataVector()); - return super.resetValueVector(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java index 53e2542335..2729e9ece1 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java @@ -63,8 +63,8 @@ public void setPosition(int index) { } @Override - public boolean resetValueVector(T vector) { - return delegate.resetValueVector(vector); + public void resetValueVector(T vector) { + 
delegate.resetValueVector(vector); } @Override diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java index e5d8dac7e0..de34284b7e 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java @@ -63,11 +63,10 @@ public void setPosition(int index) { @Override @SuppressWarnings("unchecked") - public boolean resetValueVector(StructVector vector) { + public void resetValueVector(StructVector vector) { for (int i = 0; i < delegates.length; i++) { Producer delegate = (Producer) delegates[i]; delegate.resetValueVector(vector.getChildrenFromFields().get(i)); } - return super.resetValueVector(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java index abc7ebd263..892a02f77a 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java @@ -48,10 +48,9 @@ public void setPosition(int index) { } @Override - public boolean resetValueVector(T vector) { + public void resetValueVector(T vector) { this.vector = vector; this.currentIndex = 0; - return true; } @Override diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java index e5295fb292..5370cca052 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java @@ -90,12 +90,10 @@ public void 
produce(Encoder encoder) throws IOException { @Override @SuppressWarnings("unchecked") - public boolean resetValueVector(T vector) { - boolean result = true; + public void resetValueVector(T vector) { for (int i = 0; i < delegates.length; i++) { Producer delegate = (Producer) delegates[i]; - result &= delegate.resetValueVector(vector.getChildrenFromFields().get(i)); + delegate.resetValueVector(vector.getChildrenFromFields().get(i)); } - return result & super.resetValueVector(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java index a39ae2d3f5..d1ed506108 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/CompositeAvroProducer.java @@ -48,9 +48,7 @@ public void resetProducerVectors(VectorSchemaRoot root) { // This method assumes that the VSR matches the constructed set of producers int index = 0; for (Producer producer : producers) { - if (producer.resetValueVector(root.getFieldVectors().get(index))) { - index++; - } + producer.resetValueVector(root.getFieldVectors().get(index)); } } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java index 983fc41cfe..aed2543348 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/Producer.java @@ -41,12 +41,8 @@ public interface Producer { /** Set the position to read value from vector. */ void setPosition(int index); - /** - * Reset the vector within producer. - * - * @return true if reset is successful, false if reset is not needed. 
- */ - boolean resetValueVector(T vector); + /** Reset the vector within producer. */ + void resetValueVector(T vector); /** Get the vector within the producer. */ T getVector(); From 1ace4d2bc76c4ad3c8cced4651f9b1a30f2bd1f7 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 11:56:04 +0000 Subject: [PATCH 47/89] Remove time zone conversion from timestamp producers (both Arrow and Avro are already in UTC) --- .../logical/AvroTimestampMicroTzProducer.java | 28 +++------ .../logical/AvroTimestampMilliTzProducer.java | 20 +++---- .../logical/AvroTimestampNanoTzProducer.java | 26 +++------ .../logical/AvroTimestampSecProducer.java | 4 +- .../logical/AvroTimestampSecTzProducer.java | 21 +++---- .../logical/BaseTimestampTzProducer.java | 58 ------------------- 6 files changed, 31 insertions(+), 126 deletions(-) delete mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/BaseTimestampTzProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroTzProducer.java index ee80d73fda..ece7482303 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroTzProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMicroTzProducer.java @@ -16,34 +16,20 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.time.Instant; -import java.time.ZoneId; +import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.vector.TimeStampMicroTZVector; /** - * Producer that converts timestamps in zone-aware epoch microseconds from a {@link - * TimeStampMicroTZVector} and produces UTC timestamp (microseconds) values, writes data to an Avro - * encoder. 
+ * Producer that produces UTC timestamp (microseconds) values from a {@link TimeStampMicroTZVector}, + * writes data to an Avro encoder. */ -public class AvroTimestampMicroTzProducer extends BaseTimestampTzProducer { +public class AvroTimestampMicroTzProducer extends AvroBigIntProducer { - private static final long MICROS_PER_SECOND = 1000000; - private static final long NANOS_PER_MICRO = 1000; + // UTC timestamp in epoch microseconds stored as long, matches Avro timestamp-micros type + // Both Arrow and Avro store zone-aware times in UTC so zone conversion is not needed /** Instantiate an AvroTimestampMicroTzProducer. */ public AvroTimestampMicroTzProducer(TimeStampMicroTZVector vector) { - super(vector, vector.getTimeZone(), MICROS_PER_SECOND); - } - - @Override - protected long convertToUtc(long tzValue, ZoneId zoneId) { - // For negative values, e.g. -.5 seconds = -1 second + .5 in micros - long tzSeconds = tzValue >= 0 ? tzValue / MICROS_PER_SECOND : tzValue / MICROS_PER_SECOND - 1; - long tzMicro = tzValue % MICROS_PER_SECOND; - Instant utcInstant = - Instant.ofEpochSecond(tzSeconds, tzMicro * NANOS_PER_MICRO).atZone(zoneId).toInstant(); - long utcSeconds = utcInstant.getEpochSecond(); - long utcMicro = utcInstant.getNano() / NANOS_PER_MICRO; - return utcSeconds * MICROS_PER_SECOND + utcMicro; + super(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliTzProducer.java index cfde9612af..b1b55fca78 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliTzProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampMilliTzProducer.java @@ -16,26 +16,20 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.time.Instant; -import java.time.ZoneId; +import 
org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.vector.TimeStampMilliTZVector; /** - * Producer that converts timestamps in zone-aware epoch milliseconds from a {@link - * TimeStampMilliTZVector} and produces UTC timestamp (milliseconds) values, writes data to an Avro - * encoder. + * Producer that produces UTC timestamp (milliseconds) values from a {@link TimeStampMilliTZVector}, + * writes data to an Avro encoder. */ -public class AvroTimestampMilliTzProducer extends BaseTimestampTzProducer { +public class AvroTimestampMilliTzProducer extends AvroBigIntProducer { - private static final long MILLIS_PER_SECOND = 1000; + // UTC timestamp in epoch milliseconds stored as long, matches Avro timestamp-millis type + // Both Arrow and Avro store zone-aware times in UTC so zone conversion is not needed /** Instantiate an AvroTimestampMilliTzProducer. */ public AvroTimestampMilliTzProducer(TimeStampMilliTZVector vector) { - super(vector, vector.getTimeZone(), MILLIS_PER_SECOND); - } - - @Override - protected long convertToUtc(long tzValue, ZoneId zoneId) { - return Instant.ofEpochMilli(tzValue).atZone(zoneId).toInstant().toEpochMilli(); + super(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoTzProducer.java index 168ab7171d..ae261d8396 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoTzProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampNanoTzProducer.java @@ -16,32 +16,20 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import java.time.Instant; -import java.time.ZoneId; +import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.vector.TimeStampNanoTZVector; /** - * Producer that converts timestamps in 
zone-aware epoch nanoseconds from a {@link - * TimeStampNanoTZVector} and produces UTC timestamp (nanoseconds) values, writes data to an Avro - * encoder. + * Producer that produces UTC timestamp (nanoseconds) values from a {@link TimeStampNanoTZVector}, + * writes data to an Avro encoder. */ -public class AvroTimestampNanoTzProducer extends BaseTimestampTzProducer { +public class AvroTimestampNanoTzProducer extends AvroBigIntProducer { - private static final long NANOS_PER_SECOND = 1000000000; + // UTC timestamp in epoch nanoseconds stored as long, matches Avro timestamp-nanos type + // Both Arrow and Avro store zone-aware times in UTC so zone conversion is not needed /** Instantiate an AvroTimestampNanoTzProducer. */ public AvroTimestampNanoTzProducer(TimeStampNanoTZVector vector) { - super(vector, vector.getTimeZone(), NANOS_PER_SECOND); - } - - @Override - protected long convertToUtc(long tzValue, ZoneId zoneId) { - // For negative values, e.g. -.5 seconds = -1 second + .5 in nanos - long tzSeconds = tzValue >= 0 ? 
tzValue / NANOS_PER_SECOND : tzValue / NANOS_PER_SECOND - 1; - long tzNano = tzValue % NANOS_PER_SECOND; - Instant utcInstant = Instant.ofEpochSecond(tzSeconds, tzNano).atZone(zoneId).toInstant(); - long utcSeconds = utcInstant.getEpochSecond(); - long utcNano = utcInstant.getNano(); - return utcSeconds * NANOS_PER_SECOND + utcNano; + super(vector); } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java index ebc29d9b34..a6ade2d19b 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecProducer.java @@ -22,8 +22,8 @@ import org.apache.avro.io.Encoder; /** - * Producer that converts epoch seconds from a {@link TimeStampSecVector} and produces local time - * (milliseconds) values, writes data to an Avro encoder. + * Producer that converts epoch seconds from a {@link TimeStampSecVector} and produces local + * timestamp (milliseconds) values, writes data to an Avro encoder. 
*/ public class AvroTimestampSecProducer extends BaseAvroProducer { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java index b04939b788..f524e59810 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java @@ -17,39 +17,34 @@ package org.apache.arrow.adapter.avro.producers.logical; import java.io.IOException; -import java.time.Instant; -import java.time.ZoneId; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.vector.TimeStampSecTZVector; import org.apache.arrow.vector.TimeStampVector; import org.apache.avro.io.Encoder; /** - * Producer that converts timestamps in zone-aware epoch seconds from a {@link TimeStampSecTZVector} - * and produces UTC timestamp (millisecond) values, writes data to an Avro encoder. + * Producer that converts epoch seconds from a {@link TimeStampSecTZVector} and produces UTC + * timestamp (milliseconds) values, writes data to an Avro encoder. */ -public class AvroTimestampSecTzProducer extends BaseTimestampTzProducer { +public class AvroTimestampSecTzProducer extends BaseAvroProducer { // Avro does not support timestamps in seconds, so convert to timestamp-millis type // Check for overflow and raise an exception + // Both Arrow and Avro store zone-aware times in UTC so zone conversion is not needed + private static final long MILLIS_PER_SECOND = 1000; private static final long OVERFLOW_LIMIT = Long.MAX_VALUE / MILLIS_PER_SECOND; /** Instantiate an AvroTimestampSecTzProducer. 
*/ public AvroTimestampSecTzProducer(TimeStampSecTZVector vector) { - super(vector, vector.getTimeZone(), 1); - } - - @Override - protected long convertToUtc(long tzValue, ZoneId zoneId) { - return Instant.ofEpochSecond(tzValue).atZone(zoneId).toInstant().getEpochSecond(); + super(vector); } @Override public void produce(Encoder encoder) throws IOException { - long tzSeconds = + long utcSeconds = vector.getDataBuffer().getLong(currentIndex * (long) TimeStampVector.TYPE_WIDTH); - long utcSeconds = fixedOffsetFlag ? tzSeconds + fixedOffset : convertToUtc(tzSeconds, zoneId); if (Math.abs(utcSeconds) > OVERFLOW_LIMIT) { throw new ArithmeticException("Timestamp value is too large for Avro encoding"); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/BaseTimestampTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/BaseTimestampTzProducer.java deleted file mode 100644 index cffc3b219e..0000000000 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/BaseTimestampTzProducer.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.arrow.adapter.avro.producers.logical; - -import java.io.IOException; -import java.time.ZoneId; -import java.time.ZoneOffset; -import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; -import org.apache.arrow.vector.TimeStampVector; -import org.apache.avro.io.Encoder; - -abstract class BaseTimestampTzProducer - extends BaseAvroProducer { - - // Convert TZ values to UTC to encode Avro timestamp types - // Where possible, used a fixed offset (zero for UTC or missing zone info) - // Conversion using named zones is more expensive and depends on the concrete type - - protected final ZoneId zoneId; - protected final boolean fixedOffsetFlag; - protected final long fixedOffset; - - protected abstract long convertToUtc(long tzValue, ZoneId zoneId); - - protected BaseTimestampTzProducer(T vector, String zoneName, long offsetMultiplier) { - super(vector); - zoneId = zoneName != null ? ZoneId.of(zoneName) : ZoneOffset.UTC; - if (zoneId instanceof ZoneOffset) { - ZoneOffset offset = (ZoneOffset) zoneId; - fixedOffsetFlag = true; - fixedOffset = (long) offset.getTotalSeconds() * offsetMultiplier; - } else { - fixedOffsetFlag = false; - fixedOffset = 0; - } - } - - @Override - public void produce(Encoder encoder) throws IOException { - long tzValue = vector.getDataBuffer().getLong(currentIndex * (long) TimeStampVector.TYPE_WIDTH); - long utcValue = fixedOffsetFlag ? 
tzValue + fixedOffset : convertToUtc(tzValue, zoneId); - encoder.writeLong(utcValue); - } -} From f81389a9df0c4bbeee725c763d968bfac65298d4 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 12:06:56 +0000 Subject: [PATCH 48/89] Use Java functions to handle conversion of unsigned int types --- .../arrow/adapter/avro/producers/AvroUint1Producer.java | 4 ++-- .../arrow/adapter/avro/producers/AvroUint2Producer.java | 4 ++-- .../arrow/adapter/avro/producers/AvroUint4Producer.java | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint1Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint1Producer.java index 59ce44b662..83cbc9ef8e 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint1Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint1Producer.java @@ -31,8 +31,8 @@ public AvroUint1Producer(UInt1Vector vector) { @Override public void produce(Encoder encoder) throws IOException { byte unsigned = vector.getDataBuffer().getByte(currentIndex * (long) UInt1Vector.TYPE_WIDTH); - int signed = unsigned & 0xff; - encoder.writeInt(signed); + int unsignedInt = Byte.toUnsignedInt(unsigned); + encoder.writeInt(unsignedInt); currentIndex++; } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint2Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint2Producer.java index 8dde176cf6..1e30c82cd2 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint2Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint2Producer.java @@ -31,8 +31,8 @@ public AvroUint2Producer(UInt2Vector vector) { @Override public void produce(Encoder encoder) throws IOException { short unsigned = vector.getDataBuffer().getShort(currentIndex * (long) 
UInt2Vector.TYPE_WIDTH); - int signed = unsigned & 0xffff; - encoder.writeInt(signed); + int unsignedInt = Short.toUnsignedInt(unsigned); + encoder.writeInt(unsignedInt); currentIndex++; } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint4Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint4Producer.java index d02ffc55f3..63f78429dd 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint4Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUint4Producer.java @@ -33,8 +33,8 @@ public AvroUint4Producer(UInt4Vector vector) { @Override public void produce(Encoder encoder) throws IOException { int unsigned = vector.getDataBuffer().getInt(currentIndex * (long) UInt4Vector.TYPE_WIDTH); - long signed = unsigned & 0xffffffffL; - encoder.writeLong(signed); + long unsignedLong = Integer.toUnsignedLong(unsigned); + encoder.writeLong(unsignedLong); currentIndex++; } } From e58b9c279649b43e1e4006aa100d3d7468386ce8 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 13:27:57 +0000 Subject: [PATCH 49/89] Schema conversion tests for primitive and logical types --- .../adapter/avro/ArrowToAvroSchemaTest.java | 785 ++++++++++++++++++ 1 file changed, 785 insertions(+) create mode 100644 adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java new file mode 100644 index 0000000000..72ae03e181 --- /dev/null +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java @@ -0,0 +1,785 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.adapter.avro; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Arrays; +import java.util.List; +import org.apache.arrow.vector.types.DateUnit; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.avro.Schema; +import org.junit.jupiter.api.Test; + +public class ArrowToAvroSchemaTest { + + // Schema conversion for primitive types, nullable and non-nullable + + @Test + public void testConvertNullType() { + List fields = + Arrays.asList(new Field("nullType", FieldType.notNullable(new ArrowType.Null()), null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(1, schema.getFields().size()); + + assertEquals(Schema.Type.NULL, schema.getField("nullType").schema().getType()); + } + + @Test + public void testConvertBooleanTypes() { + List fields = + Arrays.asList( + new Field("nullableBool", FieldType.nullable(new ArrowType.Bool()), null), + new Field("nonNullableBool", FieldType.notNullable(new 
ArrowType.Bool()), null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(2, schema.getFields().size()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableBool").schema().getType()); + assertEquals(2, schema.getField("nullableBool").schema().getTypes().size()); + assertEquals( + Schema.Type.BOOLEAN, schema.getField("nullableBool").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableBool").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.BOOLEAN, schema.getField("nonNullableBool").schema().getType()); + } + + @Test + public void testConvertIntegralTypes() { + List fields = + Arrays.asList( + new Field("nullableInt8", FieldType.nullable(new ArrowType.Int(8, true)), null), + new Field("nonNullableInt8", FieldType.notNullable(new ArrowType.Int(8, true)), null), + new Field("nullableUInt8", FieldType.nullable(new ArrowType.Int(8, false)), null), + new Field("nonNullableUInt8", FieldType.notNullable(new ArrowType.Int(8, false)), null), + new Field("nullableInt16", FieldType.nullable(new ArrowType.Int(16, true)), null), + new Field("nonNullableInt16", FieldType.notNullable(new ArrowType.Int(16, true)), null), + new Field("nullableUInt16", FieldType.nullable(new ArrowType.Int(16, false)), null), + new Field( + "nonNullableUInt16", FieldType.notNullable(new ArrowType.Int(16, false)), null), + new Field("nullableInt32", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("nonNullableInt32", FieldType.notNullable(new ArrowType.Int(32, true)), null), + new Field("nullableUInt32", FieldType.nullable(new ArrowType.Int(32, false)), null), + new Field( + "nonNullableUInt32", FieldType.notNullable(new ArrowType.Int(32, false)), null), + new Field("nullableInt64", FieldType.nullable(new ArrowType.Int(64, true)), null), + new Field("nonNullableInt64", FieldType.notNullable(new ArrowType.Int(64, 
true)), null), + new Field("nullableUInt64", FieldType.nullable(new ArrowType.Int(64, false)), null), + new Field( + "nonNullableUInt64", FieldType.notNullable(new ArrowType.Int(64, false)), null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(16, schema.getFields().size()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableInt8").schema().getType()); + assertEquals(2, schema.getField("nullableInt8").schema().getTypes().size()); + assertEquals( + Schema.Type.INT, schema.getField("nullableInt8").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableInt8").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.INT, schema.getField("nonNullableInt8").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableUInt8").schema().getType()); + assertEquals(2, schema.getField("nullableUInt8").schema().getTypes().size()); + assertEquals( + Schema.Type.INT, schema.getField("nullableUInt8").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableUInt8").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.INT, schema.getField("nonNullableUInt8").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableInt16").schema().getType()); + assertEquals(2, schema.getField("nullableInt16").schema().getTypes().size()); + assertEquals( + Schema.Type.INT, schema.getField("nullableInt16").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableInt16").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.INT, schema.getField("nonNullableInt16").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableUInt16").schema().getType()); + assertEquals(2, schema.getField("nullableUInt16").schema().getTypes().size()); + assertEquals( + 
Schema.Type.INT, schema.getField("nullableUInt16").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableUInt16").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.INT, schema.getField("nonNullableUInt16").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableInt32").schema().getType()); + assertEquals(2, schema.getField("nullableInt32").schema().getTypes().size()); + assertEquals( + Schema.Type.INT, schema.getField("nullableInt32").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableInt32").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.INT, schema.getField("nonNullableInt32").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableUInt32").schema().getType()); + assertEquals(2, schema.getField("nullableUInt32").schema().getTypes().size()); + assertEquals( + Schema.Type.LONG, schema.getField("nullableUInt32").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableUInt32").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.LONG, schema.getField("nonNullableUInt32").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableInt64").schema().getType()); + assertEquals(2, schema.getField("nullableInt64").schema().getTypes().size()); + assertEquals( + Schema.Type.LONG, schema.getField("nullableInt64").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableInt64").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.LONG, schema.getField("nonNullableInt64").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableUInt64").schema().getType()); + assertEquals(2, schema.getField("nullableUInt64").schema().getTypes().size()); + assertEquals( + Schema.Type.LONG, 
schema.getField("nullableUInt64").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableUInt64").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.LONG, schema.getField("nonNullableUInt64").schema().getType()); + } + + @Test + public void testConvertFloatingPointTypes() { + List fields = + Arrays.asList( + new Field( + "nullableFloat16", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.HALF)), + null), + new Field( + "nonNullableFloat16", + FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.HALF)), + null), + new Field( + "nullableFloat32", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + null), + new Field( + "nonNullableFloat32", + FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + null), + new Field( + "nullableFloat64", + FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + null), + new Field( + "nonNullableFloat64", + FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(6, schema.getFields().size()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableFloat16").schema().getType()); + assertEquals(2, schema.getField("nullableFloat16").schema().getTypes().size()); + assertEquals( + Schema.Type.FLOAT, schema.getField("nullableFloat16").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableFloat16").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.FLOAT, schema.getField("nonNullableFloat16").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableFloat32").schema().getType()); + assertEquals(2, schema.getField("nullableFloat32").schema().getTypes().size()); + assertEquals( + 
Schema.Type.FLOAT, schema.getField("nullableFloat32").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableFloat32").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.FLOAT, schema.getField("nonNullableFloat32").schema().getType()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableFloat64").schema().getType()); + assertEquals(2, schema.getField("nullableFloat64").schema().getTypes().size()); + assertEquals( + Schema.Type.DOUBLE, + schema.getField("nullableFloat64").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableFloat64").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.DOUBLE, schema.getField("nonNullableFloat64").schema().getType()); + } + + @Test + public void testConvertStringTypes() { + List fields = + Arrays.asList( + new Field("nullableUtf8", FieldType.nullable(new ArrowType.Utf8()), null), + new Field("nonNullableUtf8", FieldType.notNullable(new ArrowType.Utf8()), null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(2, schema.getFields().size()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableUtf8").schema().getType()); + assertEquals(2, schema.getField("nullableUtf8").schema().getTypes().size()); + assertEquals( + Schema.Type.STRING, schema.getField("nullableUtf8").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableUtf8").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.STRING, schema.getField("nonNullableUtf8").schema().getType()); + } + + @Test + public void testConvertBinaryTypes() { + List fields = + Arrays.asList( + new Field("nullableBinary", FieldType.nullable(new ArrowType.Binary()), null), + new Field("nonNullableBinary", FieldType.notNullable(new ArrowType.Binary()), null)); + + Schema schema = 
ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(2, schema.getFields().size()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableBinary").schema().getType()); + assertEquals(2, schema.getField("nullableBinary").schema().getTypes().size()); + assertEquals( + Schema.Type.BYTES, schema.getField("nullableBinary").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableBinary").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.BYTES, schema.getField("nonNullableBinary").schema().getType()); + } + + @Test + public void testConvertFixedSizeBinaryTypes() { + List fields = + Arrays.asList( + new Field( + "nullableFixedSizeBinary", + FieldType.nullable(new ArrowType.FixedSizeBinary(10)), + null), + new Field( + "nonNullableFixedSizeBinary", + FieldType.notNullable(new ArrowType.FixedSizeBinary(10)), + null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(2, schema.getFields().size()); + + assertEquals(Schema.Type.UNION, schema.getField("nullableFixedSizeBinary").schema().getType()); + assertEquals(2, schema.getField("nullableFixedSizeBinary").schema().getTypes().size()); + assertEquals( + Schema.Type.FIXED, + schema.getField("nullableFixedSizeBinary").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableFixedSizeBinary").schema().getTypes().get(1).getType()); + assertEquals( + Schema.Type.FIXED, schema.getField("nonNullableFixedSizeBinary").schema().getType()); + } + + // Schema conversion for logical types, nullable and non-nullable + + @Test + public void testConvertDecimalTypes() { + List fields = + Arrays.asList( + new Field( + "nullableDecimal128", FieldType.nullable(new ArrowType.Decimal(10, 2, 128)), null), + new Field( + "nonNullableDecimal1281", + 
FieldType.notNullable(new ArrowType.Decimal(10, 2, 128)), + null), + new Field( + "nonNullableDecimal1282", + FieldType.notNullable(new ArrowType.Decimal(15, 5, 128)), + null), + new Field( + "nonNullableDecimal1283", + FieldType.notNullable(new ArrowType.Decimal(20, 10, 128)), + null), + new Field( + "nullableDecimal256", FieldType.nullable(new ArrowType.Decimal(20, 4, 256)), null), + new Field( + "nonNullableDecimal2561", + FieldType.notNullable(new ArrowType.Decimal(20, 4, 256)), + null), + new Field( + "nonNullableDecimal2562", + FieldType.notNullable(new ArrowType.Decimal(25, 8, 256)), + null), + new Field( + "nonNullableDecimal2563", + FieldType.notNullable(new ArrowType.Decimal(30, 15, 256)), + null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(8, schema.getFields().size()); + + // Assertions for nullableDecimal128 + assertEquals(Schema.Type.UNION, schema.getField("nullableDecimal128").schema().getType()); + assertEquals(2, schema.getField("nullableDecimal128").schema().getTypes().size()); + Schema nullableDecimal128Schema = + schema.getField("nullableDecimal128").schema().getTypes().get(0); + assertEquals(Schema.Type.FIXED, nullableDecimal128Schema.getType()); + assertEquals("decimal", nullableDecimal128Schema.getProp("logicalType")); + assertEquals(10, nullableDecimal128Schema.getObjectProp("precision")); + assertEquals(2, nullableDecimal128Schema.getObjectProp("scale")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableDecimal128").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableDecimal1281 + Schema nonNullableDecimal1281Schema = schema.getField("nonNullableDecimal1281").schema(); + assertEquals(Schema.Type.FIXED, nonNullableDecimal1281Schema.getType()); + assertEquals("decimal", nonNullableDecimal1281Schema.getProp("logicalType")); + assertEquals(10, nonNullableDecimal1281Schema.getObjectProp("precision")); + 
assertEquals(2, nonNullableDecimal1281Schema.getObjectProp("scale")); + + // Assertions for nonNullableDecimal1282 + Schema nonNullableDecimal1282Schema = schema.getField("nonNullableDecimal1282").schema(); + assertEquals(Schema.Type.FIXED, nonNullableDecimal1282Schema.getType()); + assertEquals("decimal", nonNullableDecimal1282Schema.getProp("logicalType")); + assertEquals(15, nonNullableDecimal1282Schema.getObjectProp("precision")); + assertEquals(5, nonNullableDecimal1282Schema.getObjectProp("scale")); + + // Assertions for nonNullableDecimal1283 + Schema nonNullableDecimal1283Schema = schema.getField("nonNullableDecimal1283").schema(); + assertEquals(Schema.Type.FIXED, nonNullableDecimal1283Schema.getType()); + assertEquals("decimal", nonNullableDecimal1283Schema.getProp("logicalType")); + assertEquals(20, nonNullableDecimal1283Schema.getObjectProp("precision")); + assertEquals(10, nonNullableDecimal1283Schema.getObjectProp("scale")); + + // Assertions for nullableDecimal256 + assertEquals(Schema.Type.UNION, schema.getField("nullableDecimal256").schema().getType()); + assertEquals(2, schema.getField("nullableDecimal256").schema().getTypes().size()); + Schema nullableDecimal256Schema = + schema.getField("nullableDecimal256").schema().getTypes().get(0); + assertEquals(Schema.Type.FIXED, nullableDecimal256Schema.getType()); + assertEquals("decimal", nullableDecimal256Schema.getProp("logicalType")); + assertEquals(20, nullableDecimal256Schema.getObjectProp("precision")); + assertEquals(4, nullableDecimal256Schema.getObjectProp("scale")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableDecimal256").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableDecimal2561 + Schema nonNullableDecimal2561Schema = schema.getField("nonNullableDecimal2561").schema(); + assertEquals(Schema.Type.FIXED, nonNullableDecimal2561Schema.getType()); + assertEquals("decimal", nonNullableDecimal2561Schema.getProp("logicalType")); + assertEquals(20, 
nonNullableDecimal2561Schema.getObjectProp("precision")); + assertEquals(4, nonNullableDecimal2561Schema.getObjectProp("scale")); + + // Assertions for nonNullableDecimal2562 + Schema nonNullableDecimal2562Schema = schema.getField("nonNullableDecimal2562").schema(); + assertEquals(Schema.Type.FIXED, nonNullableDecimal2562Schema.getType()); + assertEquals("decimal", nonNullableDecimal2562Schema.getProp("logicalType")); + assertEquals(25, nonNullableDecimal2562Schema.getObjectProp("precision")); + assertEquals(8, nonNullableDecimal2562Schema.getObjectProp("scale")); + + // Assertions for nonNullableDecimal2563 + Schema nonNullableDecimal2563Schema = schema.getField("nonNullableDecimal2563").schema(); + assertEquals(Schema.Type.FIXED, nonNullableDecimal2563Schema.getType()); + assertEquals("decimal", nonNullableDecimal2563Schema.getProp("logicalType")); + assertEquals(30, nonNullableDecimal2563Schema.getObjectProp("precision")); + assertEquals(15, nonNullableDecimal2563Schema.getObjectProp("scale")); + } + + @Test + public void testConvertDateTypes() { + List fields = + Arrays.asList( + new Field( + "nullableDateDay", FieldType.nullable(new ArrowType.Date(DateUnit.DAY)), null), + new Field( + "nonNullableDateDay", + FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), + null), + new Field( + "nullableDateMilli", + FieldType.nullable(new ArrowType.Date(DateUnit.MILLISECOND)), + null), + new Field( + "nonNullableDateMilli", + FieldType.notNullable(new ArrowType.Date(DateUnit.MILLISECOND)), + null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(4, schema.getFields().size()); + + // Assertions for nullableDateDay + assertEquals(Schema.Type.UNION, schema.getField("nullableDateDay").schema().getType()); + assertEquals(2, schema.getField("nullableDateDay").schema().getTypes().size()); + Schema nullableDateDaySchema = 
schema.getField("nullableDateDay").schema().getTypes().get(0); + assertEquals(Schema.Type.INT, nullableDateDaySchema.getType()); + assertEquals("date", nullableDateDaySchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, schema.getField("nullableDateDay").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableDateDay + Schema nonNullableDateDaySchema = schema.getField("nonNullableDateDay").schema(); + assertEquals(Schema.Type.INT, nonNullableDateDaySchema.getType()); + assertEquals("date", nonNullableDateDaySchema.getProp("logicalType")); + + // Assertions for nullableDateMilli + assertEquals(Schema.Type.UNION, schema.getField("nullableDateMilli").schema().getType()); + assertEquals(2, schema.getField("nullableDateMilli").schema().getTypes().size()); + Schema nullableDateMilliSchema = + schema.getField("nullableDateMilli").schema().getTypes().get(0); + assertEquals(Schema.Type.INT, nullableDateMilliSchema.getType()); + assertEquals("date", nullableDateMilliSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableDateMilli").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableDateMilli + Schema nonNullableDateMilliSchema = schema.getField("nonNullableDateMilli").schema(); + assertEquals(Schema.Type.INT, nonNullableDateMilliSchema.getType()); + assertEquals("date", nonNullableDateMilliSchema.getProp("logicalType")); + } + + @Test + public void testConvertTimeTypes() { + List fields = + Arrays.asList( + new Field( + "nullableTimeSec", + FieldType.nullable(new ArrowType.Time(TimeUnit.SECOND, 32)), + null), + new Field( + "nonNullableTimeSec", + FieldType.notNullable(new ArrowType.Time(TimeUnit.SECOND, 32)), + null), + new Field( + "nullableTimeMillis", + FieldType.nullable(new ArrowType.Time(TimeUnit.MILLISECOND, 32)), + null), + new Field( + "nonNullableTimeMillis", + FieldType.notNullable(new ArrowType.Time(TimeUnit.MILLISECOND, 32)), + null), + new Field( + "nullableTimeMicros", 
+ FieldType.nullable(new ArrowType.Time(TimeUnit.MICROSECOND, 64)), + null), + new Field( + "nonNullableTimeMicros", + FieldType.notNullable(new ArrowType.Time(TimeUnit.MICROSECOND, 64)), + null), + new Field( + "nullableTimeNanos", + FieldType.nullable(new ArrowType.Time(TimeUnit.NANOSECOND, 64)), + null), + new Field( + "nonNullableTimeNanos", + FieldType.notNullable(new ArrowType.Time(TimeUnit.NANOSECOND, 64)), + null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(8, schema.getFields().size()); + + // Assertions for nullableTimeSec + assertEquals(Schema.Type.UNION, schema.getField("nullableTimeSec").schema().getType()); + assertEquals(2, schema.getField("nullableTimeSec").schema().getTypes().size()); + Schema nullableTimeSecSchema = schema.getField("nullableTimeSec").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimeSecSchema.getType()); + assertEquals("time-micros", nullableTimeSecSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, schema.getField("nullableTimeSec").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimeSec + Schema nonNullableTimeSecSchema = schema.getField("nonNullableTimeSec").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimeSecSchema.getType()); + assertEquals("time-micros", nonNullableTimeSecSchema.getProp("logicalType")); + + // Assertions for nullableTimeMillis + assertEquals(Schema.Type.UNION, schema.getField("nullableTimeMillis").schema().getType()); + assertEquals(2, schema.getField("nullableTimeMillis").schema().getTypes().size()); + Schema nullableTimeMillisSchema = + schema.getField("nullableTimeMillis").schema().getTypes().get(0); + assertEquals(Schema.Type.INT, nullableTimeMillisSchema.getType()); + assertEquals("time-millis", nullableTimeMillisSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + 
schema.getField("nullableTimeMillis").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimeMillis + Schema nonNullableTimeMillisSchema = schema.getField("nonNullableTimeMillis").schema(); + assertEquals(Schema.Type.INT, nonNullableTimeMillisSchema.getType()); + assertEquals("time-millis", nonNullableTimeMillisSchema.getProp("logicalType")); + + // Assertions for nullableTimeMicros + assertEquals(Schema.Type.UNION, schema.getField("nullableTimeMicros").schema().getType()); + assertEquals(2, schema.getField("nullableTimeMicros").schema().getTypes().size()); + Schema nullableTimeMicrosSchema = + schema.getField("nullableTimeMicros").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimeMicrosSchema.getType()); + assertEquals("time-micros", nullableTimeMicrosSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableTimeMicros").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimeMicros + Schema nonNullableTimeMicrosSchema = schema.getField("nonNullableTimeMicros").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimeMicrosSchema.getType()); + assertEquals("time-micros", nonNullableTimeMicrosSchema.getProp("logicalType")); + + // Assertions for nullableTimeNanos + assertEquals(Schema.Type.UNION, schema.getField("nullableTimeNanos").schema().getType()); + assertEquals(2, schema.getField("nullableTimeNanos").schema().getTypes().size()); + Schema nullableTimeNanosSchema = + schema.getField("nullableTimeNanos").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimeNanosSchema.getType()); + assertEquals("time-micros", nullableTimeNanosSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableTimeNanos").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimeNanos + Schema nonNullableTimeNanosSchema = schema.getField("nonNullableTimeNanos").schema(); + assertEquals(Schema.Type.LONG, 
nonNullableTimeNanosSchema.getType()); + assertEquals("time-micros", nonNullableTimeNanosSchema.getProp("logicalType")); + } + + @Test + public void testConvertZoneAwareTimestampTypes() { + List fields = + Arrays.asList( + new Field( + "nullableTimestampSecTz", + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.SECOND, "UTC")), + null), + new Field( + "nonNullableTimestampSecTz", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.SECOND, "UTC")), + null), + new Field( + "nullableTimestampMillisTz", + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + null), + new Field( + "nonNullableTimestampMillisTz", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + null), + new Field( + "nullableTimestampMicrosTz", + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC")), + null), + new Field( + "nonNullableTimestampMicrosTz", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC")), + null), + new Field( + "nullableTimestampNanosTz", + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC")), + null), + new Field( + "nonNullableTimestampNanosTz", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC")), + null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(8, schema.getFields().size()); + + // Assertions for nullableTimestampSecTz + assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampSecTz").schema().getType()); + assertEquals(2, schema.getField("nullableTimestampSecTz").schema().getTypes().size()); + Schema nullableTimestampSecTzSchema = + schema.getField("nullableTimestampSecTz").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimestampSecTzSchema.getType()); + assertEquals("timestamp-millis", nullableTimestampSecTzSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + 
schema.getField("nullableTimestampSecTz").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampSecTz + Schema nonNullableTimestampSecTzSchema = schema.getField("nonNullableTimestampSecTz").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimestampSecTzSchema.getType()); + assertEquals("timestamp-millis", nonNullableTimestampSecTzSchema.getProp("logicalType")); + + // Assertions for nullableTimestampMillisTz + assertEquals( + Schema.Type.UNION, schema.getField("nullableTimestampMillisTz").schema().getType()); + assertEquals(2, schema.getField("nullableTimestampMillisTz").schema().getTypes().size()); + Schema nullableTimestampMillisTzSchema = + schema.getField("nullableTimestampMillisTz").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimestampMillisTzSchema.getType()); + assertEquals("timestamp-millis", nullableTimestampMillisTzSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableTimestampMillisTz").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampMillisTz + Schema nonNullableTimestampMillisTzSchema = + schema.getField("nonNullableTimestampMillisTz").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimestampMillisTzSchema.getType()); + assertEquals("timestamp-millis", nonNullableTimestampMillisTzSchema.getProp("logicalType")); + + // Assertions for nullableTimestampMicrosTz + assertEquals( + Schema.Type.UNION, schema.getField("nullableTimestampMicrosTz").schema().getType()); + assertEquals(2, schema.getField("nullableTimestampMicrosTz").schema().getTypes().size()); + Schema nullableTimestampMicrosTzSchema = + schema.getField("nullableTimestampMicrosTz").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimestampMicrosTzSchema.getType()); + assertEquals("timestamp-micros", nullableTimestampMicrosTzSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + 
schema.getField("nullableTimestampMicrosTz").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampMicrosTz + Schema nonNullableTimestampMicrosTzSchema = + schema.getField("nonNullableTimestampMicrosTz").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimestampMicrosTzSchema.getType()); + assertEquals("timestamp-micros", nonNullableTimestampMicrosTzSchema.getProp("logicalType")); + + // Assertions for nullableTimestampNanosTz + assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampNanosTz").schema().getType()); + assertEquals(2, schema.getField("nullableTimestampNanosTz").schema().getTypes().size()); + Schema nullableTimestampNanosTzSchema = + schema.getField("nullableTimestampNanosTz").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimestampNanosTzSchema.getType()); + assertEquals("timestamp-nanos", nullableTimestampNanosTzSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableTimestampNanosTz").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampNanosTz + Schema nonNullableTimestampNanosTzSchema = + schema.getField("nonNullableTimestampNanosTz").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimestampNanosTzSchema.getType()); + assertEquals("timestamp-nanos", nonNullableTimestampNanosTzSchema.getProp("logicalType")); + } + + @Test + public void testConvertLocalTimestampTypes() { + List fields = + Arrays.asList( + new Field( + "nullableTimestampSec", + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.SECOND, null)), + null), + new Field( + "nonNullableTimestampSec", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.SECOND, null)), + null), + new Field( + "nullableTimestampMillis", + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, null)), + null), + new Field( + "nonNullableTimestampMillis", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, null)), + null), + new Field( + 
"nullableTimestampMicros", + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MICROSECOND, null)), + null), + new Field( + "nonNullableTimestampMicros", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.MICROSECOND, null)), + null), + new Field( + "nullableTimestampNanos", + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.NANOSECOND, null)), + null), + new Field( + "nonNullableTimestampNanos", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.NANOSECOND, null)), + null)); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(8, schema.getFields().size()); + + // Assertions for nullableTimestampSec + assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampSec").schema().getType()); + assertEquals(2, schema.getField("nullableTimestampSec").schema().getTypes().size()); + Schema nullableTimestampSecSchema = + schema.getField("nullableTimestampSec").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimestampSecSchema.getType()); + assertEquals("local-timestamp-millis", nullableTimestampSecSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableTimestampSec").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampSec + Schema nonNullableTimestampSecSchema = schema.getField("nonNullableTimestampSec").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimestampSecSchema.getType()); + assertEquals("local-timestamp-millis", nonNullableTimestampSecSchema.getProp("logicalType")); + + // Assertions for nullableTimestampMillis + assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampMillis").schema().getType()); + assertEquals(2, schema.getField("nullableTimestampMillis").schema().getTypes().size()); + Schema nullableTimestampMillisSchema = + schema.getField("nullableTimestampMillis").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, 
nullableTimestampMillisSchema.getType()); + assertEquals("local-timestamp-millis", nullableTimestampMillisSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableTimestampMillis").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampMillis + Schema nonNullableTimestampMillisSchema = + schema.getField("nonNullableTimestampMillis").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimestampMillisSchema.getType()); + assertEquals("local-timestamp-millis", nonNullableTimestampMillisSchema.getProp("logicalType")); + + // Assertions for nullableTimestampMicros + assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampMicros").schema().getType()); + assertEquals(2, schema.getField("nullableTimestampMicros").schema().getTypes().size()); + Schema nullableTimestampMicrosSchema = + schema.getField("nullableTimestampMicros").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimestampMicrosSchema.getType()); + assertEquals("local-timestamp-micros", nullableTimestampMicrosSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableTimestampMicros").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampMicros + Schema nonNullableTimestampMicrosSchema = + schema.getField("nonNullableTimestampMicros").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimestampMicrosSchema.getType()); + assertEquals("local-timestamp-micros", nonNullableTimestampMicrosSchema.getProp("logicalType")); + + // Assertions for nullableTimestampNanos + assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampNanos").schema().getType()); + assertEquals(2, schema.getField("nullableTimestampNanos").schema().getTypes().size()); + Schema nullableTimestampNanosSchema = + schema.getField("nullableTimestampNanos").schema().getTypes().get(0); + assertEquals(Schema.Type.LONG, nullableTimestampNanosSchema.getType()); + 
assertEquals("local-timestamp-nanos", nullableTimestampNanosSchema.getProp("logicalType")); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableTimestampNanos").schema().getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampNanos + Schema nonNullableTimestampNanosSchema = schema.getField("nonNullableTimestampNanos").schema(); + assertEquals(Schema.Type.LONG, nonNullableTimestampNanosSchema.getType()); + assertEquals("local-timestamp-nanos", nonNullableTimestampNanosSchema.getProp("logicalType")); + } +} From 8f6d1839e51e53c7f040ce62cddb145a9ce78549 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 14:27:21 +0000 Subject: [PATCH 50/89] Fix schema building for map and record types --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 677c9f1031..0ed01a366a 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -189,7 +189,7 @@ private static T buildRecordSchema( if (fields.isEmpty()) { throw new IllegalArgumentException("Record field must have at least one child field"); } - SchemaBuilder.FieldAssembler assembler = builder.fields(); + SchemaBuilder.FieldAssembler assembler = builder.namespace(namespace).fields(); for (Field field : fields) { assembler = buildFieldSchema(assembler, field, namespace); } @@ -233,11 +233,15 @@ private static T buildArraySchema( private static T buildMapSchema( SchemaBuilder.MapBuilder builder, Field mapField, String namespace) { - if (mapField.getChildren().size() != 2) { - throw new IllegalArgumentException("Map field must have exactly two child fields"); + if (mapField.getChildren().size() != 1) { + throw new 
IllegalArgumentException("Map field must have exactly one child field"); } - Field keyField = mapField.getChildren().get(0); - Field valueField = mapField.getChildren().get(1); + Field entriesField = mapField.getChildren().get(0); + if (mapField.getChildren().size() != 1) { + throw new IllegalArgumentException("Map entries must have exactly two child fields"); + } + Field keyField = entriesField.getChildren().get(0); + Field valueField = entriesField.getChildren().get(1); if (keyField.getType().getTypeID() != ArrowType.ArrowTypeID.Utf8 || keyField.isNullable()) { throw new IllegalArgumentException( "Map keys must be of type string and cannot be nullable for conversion to Avro"); From 9de9a984fa2091a99a4761d6714064994b0c0247 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 14:53:18 +0000 Subject: [PATCH 51/89] Fix schema building for unions with null members --- .../java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 0ed01a366a..409e34da61 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -204,7 +204,7 @@ private static SchemaBuilder.FieldAssembler buildFieldSchema( // Nullable unions need special handling, since union types cannot be directly nested if (field.getType().getTypeID() == ArrowType.ArrowTypeID.Union) { boolean unionNullable = field.getChildren().stream().anyMatch(Field::isNullable); - if (field.isNullable() && !unionNullable) { + if (unionNullable) { SchemaBuilder.UnionAccumulator> union = builder.unionOf().nullType(); return addTypesToUnion(union, field.getChildren(), namespace).nullDefault(); @@ -255,7 +255,7 @@ private static T buildTypeSchema( // Nullable unions 
need special handling, since union types cannot be directly nested if (field.getType().getTypeID() == ArrowType.ArrowTypeID.Union) { boolean unionNullable = field.getChildren().stream().anyMatch(Field::isNullable); - if (field.isNullable() && !unionNullable) { + if (unionNullable) { SchemaBuilder.UnionAccumulator union = builder.unionOf().nullType(); return addTypesToUnion(union, field.getChildren(), namespace); } else { From fe395914472576aadf9c3eb76c9ccd1feb5cc867 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 15:34:54 +0000 Subject: [PATCH 52/89] Schema tests for complex types containing simple types --- .../adapter/avro/ArrowToAvroSchemaTest.java | 611 +++++++++++++++++- 1 file changed, 606 insertions(+), 5 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java index 72ae03e181..68bd8852a5 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.UnionMode; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -283,14 +284,17 @@ public void testConvertFixedSizeBinaryTypes() { assertEquals(Schema.Type.UNION, schema.getField("nullableFixedSizeBinary").schema().getType()); assertEquals(2, schema.getField("nullableFixedSizeBinary").schema().getTypes().size()); - assertEquals( - Schema.Type.FIXED, - schema.getField("nullableFixedSizeBinary").schema().getTypes().get(0).getType()); + Schema nullableFixedSizeBinarySchema = + 
schema.getField("nullableFixedSizeBinary").schema().getTypes().get(0); + assertEquals(Schema.Type.FIXED, nullableFixedSizeBinarySchema.getType()); + assertEquals(10, nullableFixedSizeBinarySchema.getFixedSize()); assertEquals( Schema.Type.NULL, schema.getField("nullableFixedSizeBinary").schema().getTypes().get(1).getType()); - assertEquals( - Schema.Type.FIXED, schema.getField("nonNullableFixedSizeBinary").schema().getType()); + Schema nonNullableFixedSizeBinarySchema = + schema.getField("nullableFixedSizeBinary").schema().getTypes().get(0); + assertEquals(Schema.Type.FIXED, nonNullableFixedSizeBinarySchema.getType()); + assertEquals(10, nonNullableFixedSizeBinarySchema.getFixedSize()); } // Schema conversion for logical types, nullable and non-nullable @@ -339,6 +343,7 @@ public void testConvertDecimalTypes() { Schema nullableDecimal128Schema = schema.getField("nullableDecimal128").schema().getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimal128Schema.getType()); + assertEquals(16, nullableDecimal128Schema.getFixedSize()); assertEquals("decimal", nullableDecimal128Schema.getProp("logicalType")); assertEquals(10, nullableDecimal128Schema.getObjectProp("precision")); assertEquals(2, nullableDecimal128Schema.getObjectProp("scale")); @@ -349,6 +354,7 @@ public void testConvertDecimalTypes() { // Assertions for nonNullableDecimal1281 Schema nonNullableDecimal1281Schema = schema.getField("nonNullableDecimal1281").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1281Schema.getType()); + assertEquals(16, nonNullableDecimal1281Schema.getFixedSize()); assertEquals("decimal", nonNullableDecimal1281Schema.getProp("logicalType")); assertEquals(10, nonNullableDecimal1281Schema.getObjectProp("precision")); assertEquals(2, nonNullableDecimal1281Schema.getObjectProp("scale")); @@ -356,6 +362,7 @@ public void testConvertDecimalTypes() { // Assertions for nonNullableDecimal1282 Schema nonNullableDecimal1282Schema = 
schema.getField("nonNullableDecimal1282").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1282Schema.getType()); + assertEquals(16, nonNullableDecimal1282Schema.getFixedSize()); assertEquals("decimal", nonNullableDecimal1282Schema.getProp("logicalType")); assertEquals(15, nonNullableDecimal1282Schema.getObjectProp("precision")); assertEquals(5, nonNullableDecimal1282Schema.getObjectProp("scale")); @@ -363,6 +370,7 @@ public void testConvertDecimalTypes() { // Assertions for nonNullableDecimal1283 Schema nonNullableDecimal1283Schema = schema.getField("nonNullableDecimal1283").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1283Schema.getType()); + assertEquals(16, nonNullableDecimal1283Schema.getFixedSize()); assertEquals("decimal", nonNullableDecimal1283Schema.getProp("logicalType")); assertEquals(20, nonNullableDecimal1283Schema.getObjectProp("precision")); assertEquals(10, nonNullableDecimal1283Schema.getObjectProp("scale")); @@ -373,6 +381,7 @@ public void testConvertDecimalTypes() { Schema nullableDecimal256Schema = schema.getField("nullableDecimal256").schema().getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimal256Schema.getType()); + assertEquals(32, nullableDecimal256Schema.getFixedSize()); assertEquals("decimal", nullableDecimal256Schema.getProp("logicalType")); assertEquals(20, nullableDecimal256Schema.getObjectProp("precision")); assertEquals(4, nullableDecimal256Schema.getObjectProp("scale")); @@ -383,6 +392,7 @@ public void testConvertDecimalTypes() { // Assertions for nonNullableDecimal2561 Schema nonNullableDecimal2561Schema = schema.getField("nonNullableDecimal2561").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2561Schema.getType()); + assertEquals(32, nonNullableDecimal2561Schema.getFixedSize()); assertEquals("decimal", nonNullableDecimal2561Schema.getProp("logicalType")); assertEquals(20, nonNullableDecimal2561Schema.getObjectProp("precision")); assertEquals(4, 
nonNullableDecimal2561Schema.getObjectProp("scale")); @@ -390,6 +400,7 @@ public void testConvertDecimalTypes() { // Assertions for nonNullableDecimal2562 Schema nonNullableDecimal2562Schema = schema.getField("nonNullableDecimal2562").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2562Schema.getType()); + assertEquals(32, nonNullableDecimal2562Schema.getFixedSize()); assertEquals("decimal", nonNullableDecimal2562Schema.getProp("logicalType")); assertEquals(25, nonNullableDecimal2562Schema.getObjectProp("precision")); assertEquals(8, nonNullableDecimal2562Schema.getObjectProp("scale")); @@ -397,6 +408,7 @@ public void testConvertDecimalTypes() { // Assertions for nonNullableDecimal2563 Schema nonNullableDecimal2563Schema = schema.getField("nonNullableDecimal2563").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2563Schema.getType()); + assertEquals(32, nonNullableDecimal2563Schema.getFixedSize()); assertEquals("decimal", nonNullableDecimal2563Schema.getProp("logicalType")); assertEquals(30, nonNullableDecimal2563Schema.getObjectProp("precision")); assertEquals(15, nonNullableDecimal2563Schema.getObjectProp("scale")); @@ -782,4 +794,593 @@ public void testConvertLocalTimestampTypes() { assertEquals(Schema.Type.LONG, nonNullableTimestampNanosSchema.getType()); assertEquals("local-timestamp-nanos", nonNullableTimestampNanosSchema.getProp("logicalType")); } + + // Schema conversion for complex types, where the contents are primitive and logical types + + @Test + public void testConvertListTypes() { + List fields = + Arrays.asList( + new Field( + "nullableIntList", + FieldType.nullable(new ArrowType.List()), + Arrays.asList( + new Field("item", FieldType.nullable(new ArrowType.Int(32, true)), null))), + new Field( + "nullableDoubleList", + FieldType.nullable(new ArrowType.List()), + Arrays.asList( + new Field( + "item", + FieldType.notNullable( + new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + null))), + new Field( + 
"nonNullableDecimalList", + FieldType.notNullable(new ArrowType.List()), + Arrays.asList( + new Field( + "item", FieldType.nullable(new ArrowType.Decimal(10, 2, 128)), null))), + new Field( + "nonNullableTimestampList", + FieldType.notNullable(new ArrowType.List()), + Arrays.asList( + new Field( + "item", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + null)))); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(4, schema.getFields().size()); + + // Assertions for nullableIntList + assertEquals(Schema.Type.UNION, schema.getField("nullableIntList").schema().getType()); + assertEquals(2, schema.getField("nullableIntList").schema().getTypes().size()); + assertEquals( + Schema.Type.ARRAY, schema.getField("nullableIntList").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableIntList").schema().getTypes().get(1).getType()); + Schema nullableIntListItemSchema = + schema.getField("nullableIntList").schema().getTypes().get(0).getElementType(); + assertEquals(Schema.Type.UNION, nullableIntListItemSchema.getType()); + assertEquals(2, nullableIntListItemSchema.getTypes().size()); + assertEquals(Schema.Type.INT, nullableIntListItemSchema.getTypes().get(0).getType()); + assertEquals(Schema.Type.NULL, nullableIntListItemSchema.getTypes().get(1).getType()); + + // Assertions for nullableDoubleList + assertEquals(Schema.Type.UNION, schema.getField("nullableDoubleList").schema().getType()); + assertEquals(2, schema.getField("nullableDoubleList").schema().getTypes().size()); + assertEquals( + Schema.Type.ARRAY, + schema.getField("nullableDoubleList").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableDoubleList").schema().getTypes().get(1).getType()); + Schema nullableDoubleListItemSchema = + 
schema.getField("nullableDoubleList").schema().getTypes().get(0).getElementType(); + assertEquals(Schema.Type.DOUBLE, nullableDoubleListItemSchema.getType()); + + // Assertions for nonNullableDecimalList + assertEquals(Schema.Type.ARRAY, schema.getField("nonNullableDecimalList").schema().getType()); + Schema nonNullableDecimalListItemSchema = + schema.getField("nonNullableDecimalList").schema().getElementType(); + assertEquals(Schema.Type.UNION, nonNullableDecimalListItemSchema.getType()); + assertEquals(2, nonNullableDecimalListItemSchema.getTypes().size()); + Schema nullableDecimalSchema = nonNullableDecimalListItemSchema.getTypes().get(0); + assertEquals(Schema.Type.FIXED, nullableDecimalSchema.getType()); + assertEquals(16, nullableDecimalSchema.getFixedSize()); + assertEquals("decimal", nullableDecimalSchema.getProp("logicalType")); + assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); + assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); + assertEquals(Schema.Type.NULL, nonNullableDecimalListItemSchema.getTypes().get(1).getType()); + + // Assertions for nonNullableTimestampList + assertEquals(Schema.Type.ARRAY, schema.getField("nonNullableTimestampList").schema().getType()); + Schema nonNullableTimestampListItemSchema = + schema.getField("nonNullableTimestampList").schema().getElementType(); + assertEquals(Schema.Type.LONG, nonNullableTimestampListItemSchema.getType()); + assertEquals("timestamp-millis", nonNullableTimestampListItemSchema.getProp("logicalType")); + } + + @Test + public void testConvertFixedSizeListTypes() { + List fields = + Arrays.asList( + new Field( + "nullableFixedSizeIntList", + FieldType.nullable(new ArrowType.FixedSizeList(3)), + Arrays.asList( + new Field("item", FieldType.nullable(new ArrowType.Int(32, true)), null))), + new Field( + "nullableFixedSizeDoubleList", + FieldType.nullable(new ArrowType.FixedSizeList(3)), + Arrays.asList( + new Field( + "item", + FieldType.notNullable( + new 
ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + null))), + new Field( + "nonNullableFixedSizeDecimalList", + FieldType.notNullable(new ArrowType.FixedSizeList(3)), + Arrays.asList( + new Field( + "item", FieldType.nullable(new ArrowType.Decimal(10, 2, 128)), null))), + new Field( + "nonNullableFixedSizeTimestampList", + FieldType.notNullable(new ArrowType.FixedSizeList(3)), + Arrays.asList( + new Field( + "item", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + null)))); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(4, schema.getFields().size()); + + // Assertions for nullableFixedSizeIntList + assertEquals(Schema.Type.UNION, schema.getField("nullableFixedSizeIntList").schema().getType()); + assertEquals(2, schema.getField("nullableFixedSizeIntList").schema().getTypes().size()); + assertEquals( + Schema.Type.ARRAY, + schema.getField("nullableFixedSizeIntList").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableFixedSizeIntList").schema().getTypes().get(1).getType()); + Schema nullableFixedSizeIntListItemSchema = + schema.getField("nullableFixedSizeIntList").schema().getTypes().get(0).getElementType(); + assertEquals(Schema.Type.UNION, nullableFixedSizeIntListItemSchema.getType()); + assertEquals(2, nullableFixedSizeIntListItemSchema.getTypes().size()); + assertEquals(Schema.Type.INT, nullableFixedSizeIntListItemSchema.getTypes().get(0).getType()); + assertEquals(Schema.Type.NULL, nullableFixedSizeIntListItemSchema.getTypes().get(1).getType()); + + // Assertions for nullableFixedSizeDoubleList + assertEquals( + Schema.Type.UNION, schema.getField("nullableFixedSizeDoubleList").schema().getType()); + assertEquals(2, schema.getField("nullableFixedSizeDoubleList").schema().getTypes().size()); + assertEquals( + Schema.Type.ARRAY, + 
schema.getField("nullableFixedSizeDoubleList").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableFixedSizeDoubleList").schema().getTypes().get(1).getType()); + Schema nullableFixedSizeDoubleListItemSchema = + schema.getField("nullableFixedSizeDoubleList").schema().getTypes().get(0).getElementType(); + assertEquals(Schema.Type.DOUBLE, nullableFixedSizeDoubleListItemSchema.getType()); + + // Assertions for nonNullableFixedSizeDecimalList + assertEquals( + Schema.Type.ARRAY, schema.getField("nonNullableFixedSizeDecimalList").schema().getType()); + Schema nonNullableFixedSizeDecimalListItemSchema = + schema.getField("nonNullableFixedSizeDecimalList").schema().getElementType(); + assertEquals(Schema.Type.UNION, nonNullableFixedSizeDecimalListItemSchema.getType()); + assertEquals(2, nonNullableFixedSizeDecimalListItemSchema.getTypes().size()); + Schema nullableDecimalSchema = nonNullableFixedSizeDecimalListItemSchema.getTypes().get(0); + assertEquals(Schema.Type.FIXED, nullableDecimalSchema.getType()); + assertEquals(16, nullableDecimalSchema.getFixedSize()); + assertEquals("decimal", nullableDecimalSchema.getProp("logicalType")); + assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); + assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); + assertEquals( + Schema.Type.NULL, nonNullableFixedSizeDecimalListItemSchema.getTypes().get(1).getType()); + + // Assertions for nonNullableFixedSizeTimestampList + assertEquals( + Schema.Type.ARRAY, schema.getField("nonNullableFixedSizeTimestampList").schema().getType()); + Schema nonNullableFixedSizeTimestampListItemSchema = + schema.getField("nonNullableFixedSizeTimestampList").schema().getElementType(); + assertEquals(Schema.Type.LONG, nonNullableFixedSizeTimestampListItemSchema.getType()); + assertEquals( + "timestamp-millis", nonNullableFixedSizeTimestampListItemSchema.getProp("logicalType")); + } + + @Test + public void testConvertMapTypes() { + List 
fields = + Arrays.asList( + new Field( + "nullableMapWithNullableInt", + FieldType.nullable(new ArrowType.Map(false)), + Arrays.asList( + new Field( + "entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList( + new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null), + new Field( + "value", FieldType.nullable(new ArrowType.Int(32, true)), null))))), + new Field( + "nullableMapWithNonNullableDouble", + FieldType.nullable(new ArrowType.Map(false)), + Arrays.asList( + new Field( + "entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList( + new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null), + new Field( + "value", + FieldType.notNullable( + new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + null))))), + new Field( + "nonNullableMapWithNullableDecimal", + FieldType.notNullable(new ArrowType.Map(false)), + Arrays.asList( + new Field( + "entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList( + new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null), + new Field( + "value", + FieldType.nullable(new ArrowType.Decimal(10, 2, 128)), + null))))), + new Field( + "nonNullableMapWithNonNullableTimestamp", + FieldType.notNullable(new ArrowType.Map(false)), + Arrays.asList( + new Field( + "entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList( + new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null), + new Field( + "value", + FieldType.notNullable( + new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + null)))))); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(4, schema.getFields().size()); + + // Assertions for nullableMapWithNullableInt + assertEquals( + Schema.Type.UNION, schema.getField("nullableMapWithNullableInt").schema().getType()); + assertEquals(2, schema.getField("nullableMapWithNullableInt").schema().getTypes().size()); + 
assertEquals( + Schema.Type.MAP, + schema.getField("nullableMapWithNullableInt").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableMapWithNullableInt").schema().getTypes().get(1).getType()); + Schema nullableMapWithNullableIntValueSchema = + schema.getField("nullableMapWithNullableInt").schema().getTypes().get(0).getValueType(); + assertEquals(Schema.Type.UNION, nullableMapWithNullableIntValueSchema.getType()); + assertEquals(2, nullableMapWithNullableIntValueSchema.getTypes().size()); + assertEquals( + Schema.Type.INT, nullableMapWithNullableIntValueSchema.getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, nullableMapWithNullableIntValueSchema.getTypes().get(1).getType()); + + // Assertions for nullableMapWithNonNullableDouble + assertEquals( + Schema.Type.UNION, schema.getField("nullableMapWithNonNullableDouble").schema().getType()); + assertEquals(2, schema.getField("nullableMapWithNonNullableDouble").schema().getTypes().size()); + assertEquals( + Schema.Type.MAP, + schema.getField("nullableMapWithNonNullableDouble").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableMapWithNonNullableDouble").schema().getTypes().get(1).getType()); + Schema nullableMapWithNonNullableDoubleValueSchema = + schema + .getField("nullableMapWithNonNullableDouble") + .schema() + .getTypes() + .get(0) + .getValueType(); + assertEquals(Schema.Type.DOUBLE, nullableMapWithNonNullableDoubleValueSchema.getType()); + + // Assertions for nonNullableMapWithNullableDecimal + assertEquals( + Schema.Type.MAP, schema.getField("nonNullableMapWithNullableDecimal").schema().getType()); + Schema nonNullableMapWithNullableDecimalValueSchema = + schema.getField("nonNullableMapWithNullableDecimal").schema().getValueType(); + assertEquals(Schema.Type.UNION, nonNullableMapWithNullableDecimalValueSchema.getType()); + assertEquals(2, 
nonNullableMapWithNullableDecimalValueSchema.getTypes().size()); + Schema nullableDecimalSchema = nonNullableMapWithNullableDecimalValueSchema.getTypes().get(0); + assertEquals(Schema.Type.FIXED, nullableDecimalSchema.getType()); + assertEquals(16, nullableDecimalSchema.getFixedSize()); + assertEquals("decimal", nullableDecimalSchema.getProp("logicalType")); + assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); + assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); + assertEquals( + Schema.Type.NULL, nonNullableMapWithNullableDecimalValueSchema.getTypes().get(1).getType()); + + // Assertions for nonNullableMapWithNonNullableTimestamp + assertEquals( + Schema.Type.MAP, + schema.getField("nonNullableMapWithNonNullableTimestamp").schema().getType()); + Schema nonNullableMapWithNonNullableTimestampValueSchema = + schema.getField("nonNullableMapWithNonNullableTimestamp").schema().getValueType(); + assertEquals(Schema.Type.LONG, nonNullableMapWithNonNullableTimestampValueSchema.getType()); + assertEquals( + "timestamp-millis", + nonNullableMapWithNonNullableTimestampValueSchema.getProp("logicalType")); + } + + @Test + public void testConvertRecordTypes() { + List fields = + Arrays.asList( + new Field( + "nullableRecord", + FieldType.nullable(new ArrowType.Struct()), + Arrays.asList( + new Field("field1", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field( + "field2", + FieldType.notNullable( + new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + null), + new Field( + "field3", FieldType.nullable(new ArrowType.Decimal(10, 2, 128)), null), + new Field( + "field4", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + null))), + new Field( + "nonNullableRecord", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList( + new Field("field1", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field( + "field2", + FieldType.notNullable( + new 
ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + null), + new Field( + "field3", FieldType.nullable(new ArrowType.Decimal(10, 2, 128)), null), + new Field( + "field4", + FieldType.notNullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + null)))); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(2, schema.getFields().size()); + + // Assertions for nullableRecord + assertEquals(Schema.Type.UNION, schema.getField("nullableRecord").schema().getType()); + assertEquals(2, schema.getField("nullableRecord").schema().getTypes().size()); + assertEquals( + Schema.Type.RECORD, schema.getField("nullableRecord").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, schema.getField("nullableRecord").schema().getTypes().get(1).getType()); + Schema nullableRecordSchema = schema.getField("nullableRecord").schema().getTypes().get(0); + assertEquals(4, nullableRecordSchema.getFields().size()); + assertEquals( + Schema.Type.INT, + nullableRecordSchema.getField("field1").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, + nullableRecordSchema.getField("field1").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.DOUBLE, nullableRecordSchema.getField("field2").schema().getType()); + assertEquals( + Schema.Type.FIXED, + nullableRecordSchema.getField("field3").schema().getTypes().get(0).getType()); + assertEquals( + 16, nullableRecordSchema.getField("field3").schema().getTypes().get(0).getFixedSize()); + assertEquals( + "decimal", + nullableRecordSchema.getField("field3").schema().getTypes().get(0).getProp("logicalType")); + assertEquals( + 10, + nullableRecordSchema + .getField("field3") + .schema() + .getTypes() + .get(0) + .getObjectProp("precision")); + assertEquals( + 2, + nullableRecordSchema.getField("field3").schema().getTypes().get(0).getObjectProp("scale")); + assertEquals( + Schema.Type.NULL, + 
nullableRecordSchema.getField("field3").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.LONG, nullableRecordSchema.getField("field4").schema().getType()); + assertEquals( + "timestamp-millis", + nullableRecordSchema.getField("field4").schema().getProp("logicalType")); + + // Assertions for nonNullableRecord + assertEquals(Schema.Type.RECORD, schema.getField("nonNullableRecord").schema().getType()); + Schema nonNullableRecordSchema = schema.getField("nonNullableRecord").schema(); + assertEquals(4, nonNullableRecordSchema.getFields().size()); + assertEquals( + Schema.Type.INT, + nonNullableRecordSchema.getField("field1").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.NULL, + nonNullableRecordSchema.getField("field1").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.DOUBLE, nonNullableRecordSchema.getField("field2").schema().getType()); + assertEquals( + Schema.Type.FIXED, + nonNullableRecordSchema.getField("field3").schema().getTypes().get(0).getType()); + assertEquals( + 16, nullableRecordSchema.getField("field3").schema().getTypes().get(0).getFixedSize()); + assertEquals( + "decimal", + nonNullableRecordSchema + .getField("field3") + .schema() + .getTypes() + .get(0) + .getProp("logicalType")); + assertEquals( + 10, + nonNullableRecordSchema + .getField("field3") + .schema() + .getTypes() + .get(0) + .getObjectProp("precision")); + assertEquals( + 2, + nonNullableRecordSchema + .getField("field3") + .schema() + .getTypes() + .get(0) + .getObjectProp("scale")); + assertEquals( + Schema.Type.NULL, + nonNullableRecordSchema.getField("field3").schema().getTypes().get(1).getType()); + assertEquals(Schema.Type.LONG, nonNullableRecordSchema.getField("field4").schema().getType()); + assertEquals( + "timestamp-millis", + nonNullableRecordSchema.getField("field4").schema().getProp("logicalType")); + } + + @Test + public void testConvertUnionTypes() { + List fields = + Arrays.asList( + new Field( + "sparseUnionField", + 
FieldType.nullable( + new ArrowType.Union( + UnionMode.Sparse, + new int[] { + ArrowType.ArrowTypeID.Int.getFlatbufID(), + ArrowType.ArrowTypeID.FloatingPoint.getFlatbufID(), + ArrowType.ArrowTypeID.Utf8.getFlatbufID() + })), + Arrays.asList( + new Field( + "intMember", FieldType.notNullable(new ArrowType.Int(32, true)), null), + new Field( + "floatMember", + FieldType.notNullable( + new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + null), + new Field("stringMember", FieldType.notNullable(new ArrowType.Utf8()), null))), + new Field( + "denseUnionField", + FieldType.nullable( + new ArrowType.Union( + UnionMode.Dense, + new int[] { + ArrowType.ArrowTypeID.Int.getFlatbufID(), + ArrowType.ArrowTypeID.FloatingPoint.getFlatbufID(), + ArrowType.ArrowTypeID.Utf8.getFlatbufID() + })), + Arrays.asList( + new Field( + "intMember", FieldType.notNullable(new ArrowType.Int(32, true)), null), + new Field( + "floatMember", + FieldType.notNullable( + new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + null), + new Field("stringMember", FieldType.notNullable(new ArrowType.Utf8()), null))), + new Field( + "nullableSparseUnionField", + FieldType.nullable( + new ArrowType.Union( + UnionMode.Sparse, + new int[] { + ArrowType.ArrowTypeID.Int.getFlatbufID(), + ArrowType.ArrowTypeID.FloatingPoint.getFlatbufID(), + ArrowType.ArrowTypeID.Utf8.getFlatbufID() + })), + Arrays.asList( + new Field( + "nullableIntMember", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field( + "nullableFloatMember", + FieldType.nullable( + new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + null), + new Field( + "nullableStringMember", FieldType.nullable(new ArrowType.Utf8()), null))), + new Field( + "nullableDenseUnionField", + FieldType.nullable( + new ArrowType.Union( + UnionMode.Dense, + new int[] { + ArrowType.ArrowTypeID.Int.getFlatbufID(), + ArrowType.ArrowTypeID.FloatingPoint.getFlatbufID(), + ArrowType.ArrowTypeID.Utf8.getFlatbufID() + })), + 
Arrays.asList( + new Field( + "nullableIntMember", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field( + "nullableFloatMember", + FieldType.nullable( + new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + null), + new Field( + "nullableStringMember", FieldType.nullable(new ArrowType.Utf8()), null)))); + + Schema schema = ArrowToAvroUtils.createAvroSchema(fields, "TestRecord"); + + assertEquals(Schema.Type.RECORD, schema.getType()); + assertEquals(4, schema.getFields().size()); + + // Assertions for sparseUnionField + assertEquals(Schema.Type.UNION, schema.getField("sparseUnionField").schema().getType()); + assertEquals(3, schema.getField("sparseUnionField").schema().getTypes().size()); + assertEquals( + Schema.Type.INT, schema.getField("sparseUnionField").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.FLOAT, + schema.getField("sparseUnionField").schema().getTypes().get(1).getType()); + assertEquals( + Schema.Type.STRING, + schema.getField("sparseUnionField").schema().getTypes().get(2).getType()); + + // Assertions for denseUnionField + assertEquals(Schema.Type.UNION, schema.getField("denseUnionField").schema().getType()); + assertEquals(3, schema.getField("denseUnionField").schema().getTypes().size()); + assertEquals( + Schema.Type.INT, schema.getField("denseUnionField").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.FLOAT, schema.getField("denseUnionField").schema().getTypes().get(1).getType()); + assertEquals( + Schema.Type.STRING, + schema.getField("denseUnionField").schema().getTypes().get(2).getType()); + + // Assertions for sparseUnionField + assertEquals(Schema.Type.UNION, schema.getField("nullableSparseUnionField").schema().getType()); + assertEquals(4, schema.getField("nullableSparseUnionField").schema().getTypes().size()); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableSparseUnionField").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.INT, + 
schema.getField("nullableSparseUnionField").schema().getTypes().get(1).getType()); + assertEquals( + Schema.Type.FLOAT, + schema.getField("nullableSparseUnionField").schema().getTypes().get(2).getType()); + assertEquals( + Schema.Type.STRING, + schema.getField("nullableSparseUnionField").schema().getTypes().get(3).getType()); + + // Assertions for denseUnionField + assertEquals(Schema.Type.UNION, schema.getField("nullableDenseUnionField").schema().getType()); + assertEquals(4, schema.getField("nullableDenseUnionField").schema().getTypes().size()); + assertEquals( + Schema.Type.NULL, + schema.getField("nullableDenseUnionField").schema().getTypes().get(0).getType()); + assertEquals( + Schema.Type.INT, + schema.getField("nullableDenseUnionField").schema().getTypes().get(1).getType()); + assertEquals( + Schema.Type.FLOAT, + schema.getField("nullableDenseUnionField").schema().getTypes().get(2).getType()); + assertEquals( + Schema.Type.STRING, + schema.getField("nullableDenseUnionField").schema().getTypes().get(3).getType()); + } } From 0e362f021b12e385c49c111124908f58efc5fc96 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 18:00:01 +0000 Subject: [PATCH 53/89] Arrow to avro data tests for integers --- .../adapter/avro/ArrowToAvroDataTest.java | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java new file mode 100644 index 0000000000..238eecb0aa --- /dev/null +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -0,0 +1,275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.adapter.avro; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.util.Arrays; +import java.util.List; +import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryDecoder; +import org.apache.avro.io.BinaryEncoder; +import org.apache.avro.io.DecoderFactory; +import org.apache.avro.io.EncoderFactory; 
+import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import static org.junit.jupiter.api.Assertions.*; + +public class ArrowToAvroDataTest { + + @TempDir + public static File TMP; + + @Test + public void testWriteIntegers() throws Exception { + + // Field definitions + FieldType int8Field = new FieldType(false, new ArrowType.Int(8, true), null); + FieldType int16Field = new FieldType(false, new ArrowType.Int(16, true), null); + FieldType int32Field = new FieldType(false, new ArrowType.Int(32, true), null); + FieldType int64Field = new FieldType(false, new ArrowType.Int(64, true), null); + FieldType uint8Field = new FieldType(false, new ArrowType.Int(8, false), null); + FieldType uint16Field = new FieldType(false, new ArrowType.Int(16, false), null); + FieldType uint32Field = new FieldType(false, new ArrowType.Int(32, false), null); + FieldType uint64Field = new FieldType(false, new ArrowType.Int(64, false), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + TinyIntVector int8Vector = new TinyIntVector(new Field("int8", int8Field, null), allocator); + SmallIntVector int16Vector = new SmallIntVector(new Field("int16", int16Field, null), allocator); + IntVector int32Vector = new IntVector(new Field("int32", int32Field, null), allocator); + BigIntVector int64Vector = new BigIntVector(new Field("int64", int64Field, null), allocator); + UInt1Vector uint8Vector = new UInt1Vector(new Field("uint8", uint8Field, null), allocator); + UInt2Vector uint16Vector = new UInt2Vector(new Field("uint16", uint16Field, null), allocator); + UInt4Vector uint32Vector = new UInt4Vector(new Field("uint32", uint32Field, null), allocator); + UInt8Vector uint64Vector = new UInt8Vector(new Field("uint64", uint64Field, null), allocator); + + // Set up VSR + List vectors = Arrays.asList( + int8Vector, int16Vector, int32Vector, int64Vector, + uint8Vector, uint16Vector, uint32Vector, uint64Vector); + + int rowCount = 12; + + try 
(VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + for (int i = 0; i < 10; i++) { + int8Vector.set(i, 11 * i * (i % 2 == 0 ? 1 : -1)); + int16Vector.set(i, 63 * i * (i % 2 == 0 ? 1 : -1)); + int32Vector.set(i, 513 * i * (i % 2 == 0 ? 1 : -1)); + int64Vector.set(i, 3791L * i * (i % 2 == 0 ? 1 : -1)); + uint8Vector.set(i, 11 * i); + uint16Vector.set(i, 63 * i); + uint32Vector.set(i, 513 * i); + uint64Vector.set(i, 3791L * i); + } + + // Min values + int8Vector.set(10, Byte.MIN_VALUE); + int16Vector.set(10, Short.MIN_VALUE); + int32Vector.set(10, Integer.MIN_VALUE); + int64Vector.set(10, Long.MIN_VALUE); + uint8Vector.set(10, 0); + uint16Vector.set(10, 0); + uint32Vector.set(10, 0); + uint64Vector.set(10, 0); + + // Max values + int8Vector.set(11, Byte.MAX_VALUE); + int16Vector.set(11, Short.MAX_VALUE); + int32Vector.set(11, Integer.MAX_VALUE); + int64Vector.set(11, Long.MAX_VALUE); + uint8Vector.set(11, 0xff); + uint16Vector.set(11, 0xffff); + uint32Vector.set(11, 0xffffffff); + uint64Vector.set(11, Long.MAX_VALUE); // Max that can be encoded + + File dataFile = new File(TMP, "testWriteIntegers.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check 
values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals((int) int8Vector.get(row), record.get("int8")); + assertEquals((int) int16Vector.get(row), record.get("int16")); + assertEquals(int32Vector.get(row), record.get("int32")); + assertEquals(int64Vector.get(row), record.get("int64")); + assertEquals(Byte.toUnsignedInt(uint8Vector.get(row)), record.get("uint8")); + assertEquals(Short.toUnsignedInt((short) uint16Vector.get(row)), record.get("uint16")); + assertEquals(Integer.toUnsignedLong(uint32Vector.get(row)), record.get("uint32")); + assertEquals(uint64Vector.get(row), record.get("uint64")); + } + } + } + } + + @Test + public void testWriteNullableIntegers() throws Exception { + + // Field definitions + FieldType int8Field = new FieldType(true, new ArrowType.Int(8, true), null); + FieldType int16Field = new FieldType(true, new ArrowType.Int(16, true), null); + FieldType int32Field = new FieldType(true, new ArrowType.Int(32, true), null); + FieldType int64Field = new FieldType(true, new ArrowType.Int(64, true), null); + FieldType uint8Field = new FieldType(true, new ArrowType.Int(8, false), null); + FieldType uint16Field = new FieldType(true, new ArrowType.Int(16, false), null); + FieldType uint32Field = new FieldType(true, new ArrowType.Int(32, false), null); + FieldType uint64Field = new FieldType(true, new ArrowType.Int(64, false), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + TinyIntVector int8Vector = new TinyIntVector(new Field("int8", int8Field, null), allocator); + SmallIntVector int16Vector = new SmallIntVector(new Field("int16", int16Field, null), allocator); + IntVector int32Vector = new IntVector(new Field("int32", int32Field, null), allocator); + BigIntVector int64Vector = new BigIntVector(new Field("int64", int64Field, null), allocator); + UInt1Vector uint8Vector = new UInt1Vector(new Field("uint8", uint8Field, null), allocator); + UInt2Vector 
uint16Vector = new UInt2Vector(new Field("uint16", uint16Field, null), allocator); + UInt4Vector uint32Vector = new UInt4Vector(new Field("uint32", uint32Field, null), allocator); + UInt8Vector uint64Vector = new UInt8Vector(new Field("uint64", uint64Field, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList( + int8Vector, int16Vector, int32Vector, int64Vector, + uint8Vector, uint16Vector, uint32Vector, uint64Vector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Null values + int8Vector.setNull(0); + int16Vector.setNull(0); + int32Vector.setNull(0); + int64Vector.setNull(0); + uint8Vector.setNull(0); + uint16Vector.setNull(0); + uint32Vector.setNull(0); + uint64Vector.setNull(0); + + // Zero values + int8Vector.set(1, 0); + int16Vector.set(1, 0); + int32Vector.set(1, 0); + int64Vector.set(1, 0); + uint8Vector.set(1, 0); + uint16Vector.set(1, 0); + uint32Vector.set(1, 0); + uint64Vector.set(1, 0); + + // Non-zero values + int8Vector.set(2, Byte.MAX_VALUE); + int16Vector.set(2, Short.MAX_VALUE); + int32Vector.set(2, Integer.MAX_VALUE); + int64Vector.set(2, Long.MAX_VALUE); + uint8Vector.set(2, Byte.MAX_VALUE); + uint16Vector.set(2, Short.MAX_VALUE); + uint32Vector.set(2, Integer.MAX_VALUE); + uint64Vector.set(2, Long.MAX_VALUE); + + File dataFile = new File(TMP, "testWriteNullableIntegers.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new 
GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("int8")); + assertNull(record.get("int16")); + assertNull(record.get("int32")); + assertNull(record.get("int64")); + assertNull(record.get("uint8")); + assertNull(record.get("uint16")); + assertNull(record.get("uint32")); + assertNull(record.get("uint64")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals((int) int8Vector.get(row), record.get("int8")); + assertEquals((int) int16Vector.get(row), record.get("int16")); + assertEquals(int32Vector.get(row), record.get("int32")); + assertEquals(int64Vector.get(row), record.get("int64")); + assertEquals(Byte.toUnsignedInt(uint8Vector.get(row)), record.get("uint8")); + assertEquals(Short.toUnsignedInt((short) uint16Vector.get(row)), record.get("uint16")); + assertEquals(Integer.toUnsignedLong(uint32Vector.get(row)), record.get("uint32")); + assertEquals(uint64Vector.get(row), record.get("uint64")); + } + } + } + } +} From 67c5abf6a99ad47c7c6398df5ed6f4b1811ae22c Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 18:30:35 +0000 Subject: [PATCH 54/89] Add tests for null, boolean and floating types --- .../adapter/avro/ArrowToAvroDataTest.java | 348 +++++++++++++++++- 1 file changed, 339 insertions(+), 9 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 238eecb0aa..9f9fb1e991 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -26,9 +26,15 @@ import 
org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.Float16; import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.Float2Vector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TinyIntVector; import org.apache.arrow.vector.UInt1Vector; @@ -36,6 +42,7 @@ import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -56,6 +63,175 @@ public class ArrowToAvroDataTest { @TempDir public static File TMP; + @Test + public void testWriteNullColumn() throws Exception { + + // Field definition + FieldType nullField = new FieldType(false, new ArrowType.Null(), null); + + // Create empty vector + NullVector nullVector = new NullVector(new Field("nullColumn", nullField, null)); + + int rowCount = 10; + + // Set up VSR + List vectors = Arrays.asList(nullVector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set all values to null + for (int row = 0; row < rowCount; row++) { + nullVector.setNull(row); + } + + File dataFile = new File(TMP, "testWriteNullColumn.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); 
+ CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertNull(record.get("nullColumn")); + } + } + } + } + + @Test + public void testWriteBooleans() throws Exception { + + // Field definition + FieldType booleanField = new FieldType(false, new ArrowType.Bool(), null); + + // Create empty vector + BufferAllocator allocator = new RootAllocator(); + BitVector booleanVector = new BitVector(new Field("boolean", booleanField, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(booleanVector); + int rowCount = 10; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + for (int row = 0; row < rowCount; row++) { + booleanVector.set(row, row % 2 == 0 ? 
1 : 0); + } + + File dataFile = new File(TMP, "testWriteBooleans.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(booleanVector.get(row) == 1, record.get("boolean")); + } + } + } + } + + @Test + public void testWriteNullableBooleans() throws Exception { + + // Field definition + FieldType booleanField = new FieldType(true, new ArrowType.Bool(), null); + + // Create empty vector + BufferAllocator allocator = new RootAllocator(); + BitVector booleanVector = new BitVector(new Field("boolean", booleanField, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(booleanVector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Null value + booleanVector.setNull(0); + + // False value + booleanVector.set(1, 0); + + // True value + booleanVector.set(2, 1); + + File dataFile = new File(TMP, "testWriteNullableBooleans.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, 
null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("boolean")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(booleanVector.get(row) == 1, record.get("boolean")); + } + } + } + } + @Test public void testWriteIntegers() throws Exception { @@ -93,15 +269,15 @@ public void testWriteIntegers() throws Exception { root.allocateNew(); // Set test data - for (int i = 0; i < 10; i++) { - int8Vector.set(i, 11 * i * (i % 2 == 0 ? 1 : -1)); - int16Vector.set(i, 63 * i * (i % 2 == 0 ? 1 : -1)); - int32Vector.set(i, 513 * i * (i % 2 == 0 ? 1 : -1)); - int64Vector.set(i, 3791L * i * (i % 2 == 0 ? 1 : -1)); - uint8Vector.set(i, 11 * i); - uint16Vector.set(i, 63 * i); - uint32Vector.set(i, 513 * i); - uint64Vector.set(i, 3791L * i); + for (int row = 0; row < 10; row++) { + int8Vector.set(row, 11 * row * (row % 2 == 0 ? 1 : -1)); + int16Vector.set(row, 63 * row * (row % 2 == 0 ? 1 : -1)); + int32Vector.set(row, 513 * row * (row % 2 == 0 ? 1 : -1)); + int64Vector.set(row, 3791L * row * (row % 2 == 0 ? 
1 : -1)); + uint8Vector.set(row, 11 * row); + uint16Vector.set(row, 63 * row); + uint32Vector.set(row, 513 * row); + uint64Vector.set(row, 3791L * row); } // Min values @@ -272,4 +448,158 @@ record = datumReader.read(record, decoder); } } } + + @Test + public void testWriteFloatingPoints() throws Exception { + + // Field definitions + FieldType float16Field = new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.HALF), null); + FieldType float32Field = new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null); + FieldType float64Field = new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + Float2Vector float16Vector = new Float2Vector(new Field("float16", float16Field, null), allocator); + Float4Vector float32Vector = new Float4Vector(new Field("float32", float32Field, null), allocator); + Float8Vector float64Vector = new Float8Vector(new Field("float64", float64Field, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(float16Vector, float32Vector, float64Vector); + int rowCount = 15; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + for (int row = 0; row < 10; row++) { + float16Vector.set(row, Float16.toFloat16(3.6f * row * (row % 2 == 0 ? 1.0f : -1.0f))); + float32Vector.set(row, 37.6f * row * (row % 2 == 0 ? 1 : -1)); + float64Vector.set(row, 37.6d * row * (row % 2 == 0 ? 
1 : -1)); + } + + float16Vector.set(10, Float16.toFloat16(Float.MIN_VALUE)); + float32Vector.set(10, Float.MIN_VALUE); + float64Vector.set(10, Double.MIN_VALUE); + + float16Vector.set(11, Float16.toFloat16(Float.MAX_VALUE)); + float32Vector.set(11, Float.MAX_VALUE); + float64Vector.set(11, Double.MAX_VALUE); + + float16Vector.set(12, Float16.toFloat16(Float.NaN)); + float32Vector.set(12, Float.NaN); + float64Vector.set(12, Double.NaN); + + float16Vector.set(13, Float16.toFloat16(Float.POSITIVE_INFINITY)); + float32Vector.set(13, Float.POSITIVE_INFINITY); + float64Vector.set(13, Double.POSITIVE_INFINITY); + + float16Vector.set(14, Float16.toFloat16(Float.NEGATIVE_INFINITY)); + float32Vector.set(14, Float.NEGATIVE_INFINITY); + float64Vector.set(14, Double.NEGATIVE_INFINITY); + + File dataFile = new File(TMP, "testWriteFloatingPoints.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(float16Vector.getValueAsFloat(row), record.get("float16")); + assertEquals(float32Vector.get(row), record.get("float32")); + assertEquals(float64Vector.get(row), record.get("float64")); + } + } + } + } + + @Test + public void 
testWriteNullableFloatingPoints() throws Exception { + + // Field definitions + FieldType float16Field = new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.HALF), null); + FieldType float32Field = new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null); + FieldType float64Field = new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + Float2Vector float16Vector = new Float2Vector(new Field("float16", float16Field, null), allocator); + Float4Vector float32Vector = new Float4Vector(new Field("float32", float32Field, null), allocator); + Float8Vector float64Vector = new Float8Vector(new Field("float64", float64Field, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(float16Vector, float32Vector, float64Vector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + root.setRowCount(rowCount); + root.allocateNew(); + + // Null values + float16Vector.setNull(0); + float32Vector.setNull(0); + float64Vector.setNull(0); + + // Zero values + float16Vector.setSafeWithPossibleTruncate(1, 0.0f); + float32Vector.set(1, 0.0f); + float64Vector.set(1, 0.0); + + // Non-zero values + float16Vector.setSafeWithPossibleTruncate(2, 1.0f); + float32Vector.set(2, 1.0f); + float64Vector.set(2, 1.0); + + File dataFile = new File(TMP, "testWriteNullableFloatingPoints.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = 
ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("float16")); + assertNull(record.get("float32")); + assertNull(record.get("float64")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(float16Vector.getValueAsFloat(row), record.get("float16")); + assertEquals(float32Vector.get(row), record.get("float32")); + assertEquals(float64Vector.get(row), record.get("float64")); + } + } + } + } } From acaa6d98ac846138fabb47084dd6e7191b20b522 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 19:35:31 +0000 Subject: [PATCH 55/89] Tests for remaining primitive types --- .../adapter/avro/ArrowToAvroDataTest.java | 255 ++++++++++++++++++ 1 file changed, 255 insertions(+) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 9f9fb1e991..cc63acc98c 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -21,6 +21,7 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; @@ -30,6 +31,7 @@ import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float2Vector; import 
org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -41,12 +43,15 @@ import org.apache.arrow.vector.UInt2Vector; import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.BinaryDecoder; @@ -550,6 +555,7 @@ public void testWriteNullableFloatingPoints() throws Exception { List vectors = Arrays.asList(float16Vector, float32Vector, float64Vector); try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + root.setRowCount(rowCount); root.allocateNew(); @@ -585,6 +591,7 @@ public void testWriteNullableFloatingPoints() throws Exception { GenericDatumReader datumReader = new GenericDatumReader<>(schema); try (InputStream inputStream = new FileInputStream(dataFile)) { + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); // Read and check values @@ -602,4 +609,252 @@ record = datumReader.read(record, decoder); } } } + + @Test + public void testWriteStrings() throws Exception { + + // Field definition + FieldType stringField = new FieldType(false, new ArrowType.Utf8(), null); + + // Create empty vector + BufferAllocator allocator = new RootAllocator(); + VarCharVector stringVector = new VarCharVector(new Field("string", stringField, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(stringVector); + int rowCount = 5; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); 
+ root.allocateNew(); + + // Set test data + stringVector.setSafe(0, "Hello world!".getBytes()); + stringVector.setSafe(1, "<%**\r\n\t\\abc\0$$>".getBytes()); + stringVector.setSafe(2, "你好世界".getBytes()); + stringVector.setSafe(3, "مرحبا بالعالم".getBytes()); + stringVector.setSafe(4, "(P ∧ P ⇒ Q) ⇒ Q".getBytes()); + + File dataFile = new File(TMP, "testWriteStrings.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(stringVector.getObject(row).toString(), record.get("string").toString()); + } + } + } + } + + @Test + public void testWriteNullableStrings() throws Exception { + + // Field definition + FieldType stringField = new FieldType(true, new ArrowType.Utf8(), null); + + // Create empty vector + BufferAllocator allocator = new RootAllocator(); + VarCharVector stringVector = new VarCharVector(new Field("string", stringField, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(stringVector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + stringVector.setNull(0); + 
stringVector.setSafe(1, "".getBytes()); + stringVector.setSafe(2, "not empty".getBytes()); + + File dataFile = new File(TMP, "testWriteNullableStrings.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("string")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(stringVector.getObject(row).toString(), record.get("string").toString()); + } + } + } + } + + @Test + public void testWriteBinary() throws Exception { + + // Field definition + FieldType binaryField = new FieldType(false, new ArrowType.Binary(), null); + FieldType fixedField = new FieldType(false, new ArrowType.FixedSizeBinary(5), null); + + // Create empty vector + BufferAllocator allocator = new RootAllocator(); + VarBinaryVector binaryVector = new VarBinaryVector(new Field("binary", binaryField, null), allocator); + FixedSizeBinaryVector fixedVector = new FixedSizeBinaryVector(new Field("fixed", fixedField, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(binaryVector, fixedVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set 
test data + binaryVector.setSafe(0, new byte[]{1, 2, 3}); + binaryVector.setSafe(1, new byte[]{4, 5, 6, 7}); + binaryVector.setSafe(2, new byte[]{8, 9}); + + fixedVector.setSafe(0, new byte[]{1, 2, 3, 4, 5}); + fixedVector.setSafe(1, new byte[]{4, 5, 6, 7, 8}); + fixedVector.setSafe(2, new byte[]{8, 9, 10, 11, 12}); + + File dataFile = new File(TMP, "testWriteBinary.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + ByteBuffer buf = ((ByteBuffer) record.get("binary")); + byte[] bytes = new byte[buf.remaining()]; + buf.get(bytes); + System.out.print(record); + byte[] fixedBytes = ((GenericData.Fixed) record.get("fixed")).bytes(); + assertArrayEquals(binaryVector.getObject(row), bytes); + assertArrayEquals(fixedVector.getObject(row), fixedBytes); + } + } + } + } + + @Test + public void testWriteNullableBinary() throws Exception { + + // Field definition + FieldType binaryField = new FieldType(true, new ArrowType.Binary(), null); + FieldType fixedField = new FieldType(true, new ArrowType.FixedSizeBinary(5), null); + + // Create empty vector + BufferAllocator allocator = new RootAllocator(); + VarBinaryVector binaryVector = new 
VarBinaryVector(new Field("binary", binaryField, null), allocator); + FixedSizeBinaryVector fixedVector = new FixedSizeBinaryVector(new Field("fixed", fixedField, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(binaryVector, fixedVector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + binaryVector.setNull(0); + binaryVector.setSafe(1, new byte[]{}); + binaryVector.setSafe(2, new byte[]{10, 11, 12}); + + fixedVector.setNull(0); + fixedVector.setSafe(1, new byte[]{0, 0, 0, 0, 0}); + fixedVector.setSafe(2, new byte[]{10, 11, 12, 13, 14}); + + File dataFile = new File(TMP, "testWriteNullableBinary.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("binary")); + assertNull(record.get("fixed")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + ByteBuffer buf = ((ByteBuffer) record.get("binary")); + byte[] bytes = new byte[buf.remaining()]; + buf.get(bytes); + byte[] fixedBytes = ((GenericData.Fixed) record.get("fixed")).bytes(); + assertArrayEquals(binaryVector.getObject(row), bytes); + 
assertArrayEquals(fixedVector.getObject(row), fixedBytes); + } + } + } + } } From 8d85c4362c3cf58f5b8d6cfaf669a0f4ee6ba8f5 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 19:35:37 +0000 Subject: [PATCH 56/89] Producer fixes --- .../apache/arrow/adapter/avro/producers/AvroBytesProducer.java | 3 ++- .../adapter/avro/producers/AvroFixedSizeBinaryProducer.java | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java index 5d16d52da6..e1fe6dddc6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroBytesProducer.java @@ -36,7 +36,8 @@ public AvroBytesProducer(VarBinaryVector vector) { public void produce(Encoder encoder) throws IOException { // The nio ByteBuffer is created once per call, but underlying data is not copied long offset = vector.getStartOffset(currentIndex); - int length = vector.getEndOffset(currentIndex); + long endOffset = vector.getEndOffset(currentIndex); + int length = (int) (endOffset - offset); ByteBuffer nioBuffer = vector.getDataBuffer().nioBuffer(offset, length); encoder.writeBytes(nioBuffer); currentIndex++; diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeBinaryProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeBinaryProducer.java index eadb95d6f1..9fb877cfa0 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeBinaryProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeBinaryProducer.java @@ -48,7 +48,7 @@ protected AvroFixedSizeBinaryProducer(BaseFixedWidthVector vector) { public void produce(Encoder encoder) throws IOException { long 
offset = (long) currentIndex * vector.getTypeWidth(); vector.getDataBuffer().getBytes(offset, reuseBytes); - encoder.writeBytes(reuseBytes); + encoder.writeFixed(reuseBytes); currentIndex++; } } From 2995bc7c01dcd4bd7ae11e105ba7f4e905f1044f Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 21:47:01 +0000 Subject: [PATCH 57/89] Tests for decimal types --- .../adapter/avro/ArrowToAvroDataTest.java | 173 +++++++++++++++++- 1 file changed, 172 insertions(+), 1 deletion(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index cc63acc98c..fd478b5aed 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -21,6 +21,8 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; +import java.math.BigDecimal; +import java.math.RoundingMode; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; @@ -30,6 +32,8 @@ import org.apache.arrow.memory.util.Float16; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.Decimal256Vector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float2Vector; @@ -50,6 +54,8 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.avro.Conversions; +import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; @@ -68,6 +74,8 @@ public class ArrowToAvroDataTest { @TempDir public static File TMP; + // Data 
production for primitive types, nullable and non-nullable + @Test public void testWriteNullColumn() throws Exception { @@ -780,7 +788,6 @@ record = datumReader.read(record, decoder); ByteBuffer buf = ((ByteBuffer) record.get("binary")); byte[] bytes = new byte[buf.remaining()]; buf.get(bytes); - System.out.print(record); byte[] fixedBytes = ((GenericData.Fixed) record.get("fixed")).bytes(); assertArrayEquals(binaryVector.getObject(row), bytes); assertArrayEquals(fixedVector.getObject(row), fixedBytes); @@ -857,4 +864,168 @@ record = datumReader.read(record, decoder); } } } + + // Data production for logical types, nullable and non-nullable + + @Test + public void testWriteDecimals() throws Exception { + + // Field definitions + FieldType decimal128Field1 = new FieldType(false, new ArrowType.Decimal(38, 10, 128), null); + FieldType decimal128Field2 = new FieldType(false, new ArrowType.Decimal(38, 5, 128), null); + FieldType decimal256Field1 = new FieldType(false, new ArrowType.Decimal(76, 20, 256), null); + FieldType decimal256Field2 = new FieldType(false, new ArrowType.Decimal(76, 10, 256), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + DecimalVector decimal128Vector1 = new DecimalVector(new Field("decimal128_1", decimal128Field1, null), allocator); + DecimalVector decimal128Vector2 = new DecimalVector(new Field("decimal128_2", decimal128Field2, null), allocator); + Decimal256Vector decimal256Vector1 = new Decimal256Vector(new Field("decimal256_1", decimal256Field1, null), allocator); + Decimal256Vector decimal256Vector2 = new Decimal256Vector(new Field("decimal256_2", decimal256Field2, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(decimal128Vector1, decimal128Vector2, decimal256Vector1, decimal256Vector2); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + decimal128Vector1.setSafe(0, new 
BigDecimal("12345.67890").setScale(10, RoundingMode.UNNECESSARY)); + decimal128Vector1.setSafe(1, new BigDecimal("98765.43210").setScale(10, RoundingMode.UNNECESSARY)); + decimal128Vector1.setSafe(2, new BigDecimal("54321.09876").setScale(10, RoundingMode.UNNECESSARY)); + + decimal128Vector2.setSafe(0, new BigDecimal("12345.67890").setScale(5, RoundingMode.UNNECESSARY)); + decimal128Vector2.setSafe(1, new BigDecimal("98765.43210").setScale(5, RoundingMode.UNNECESSARY)); + decimal128Vector2.setSafe(2, new BigDecimal("54321.09876").setScale(5, RoundingMode.UNNECESSARY)); + + decimal256Vector1.setSafe(0, new BigDecimal("12345678901234567890.12345678901234567890").setScale(20, RoundingMode.UNNECESSARY)); + decimal256Vector1.setSafe(1, new BigDecimal("98765432109876543210.98765432109876543210").setScale(20, RoundingMode.UNNECESSARY)); + decimal256Vector1.setSafe(2, new BigDecimal("54321098765432109876.54321098765432109876").setScale(20, RoundingMode.UNNECESSARY)); + + decimal256Vector2.setSafe(0, new BigDecimal("12345678901234567890.1234567890").setScale(10, RoundingMode.UNNECESSARY)); + decimal256Vector2.setSafe(1, new BigDecimal("98765432109876543210.9876543210").setScale(10, RoundingMode.UNNECESSARY)); + decimal256Vector2.setSafe(2, new BigDecimal("54321098765432109876.5432109876").setScale(10, RoundingMode.UNNECESSARY)); + + File dataFile = new File(TMP, "testWriteDecimals.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + 
try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(decimal128Vector1.getObject(row), decodeFixedDecimal(record, "decimal128_1")); + assertEquals(decimal128Vector2.getObject(row), decodeFixedDecimal(record, "decimal128_2")); + assertEquals(decimal256Vector1.getObject(row), decodeFixedDecimal(record, "decimal256_1")); + assertEquals(decimal256Vector2.getObject(row), decodeFixedDecimal(record, "decimal256_2")); + } + } + } + } + + @Test + public void testWriteNullableDecimals() throws Exception { + + // Field definitions + FieldType decimal128Field1 = new FieldType(true, new ArrowType.Decimal(38, 10, 128), null); + FieldType decimal128Field2 = new FieldType(true, new ArrowType.Decimal(38, 5, 128), null); + FieldType decimal256Field1 = new FieldType(true, new ArrowType.Decimal(76, 20, 256), null); + FieldType decimal256Field2 = new FieldType(true, new ArrowType.Decimal(76, 10, 256), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + DecimalVector decimal128Vector1 = new DecimalVector(new Field("decimal128_1", decimal128Field1, null), allocator); + DecimalVector decimal128Vector2 = new DecimalVector(new Field("decimal128_2", decimal128Field2, null), allocator); + Decimal256Vector decimal256Vector1 = new Decimal256Vector(new Field("decimal256_1", decimal256Field1, null), allocator); + Decimal256Vector decimal256Vector2 = new Decimal256Vector(new Field("decimal256_2", decimal256Field2, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(decimal128Vector1, decimal128Vector2, decimal256Vector1, decimal256Vector2); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set 
test data + decimal128Vector1.setNull(0); + decimal128Vector1.setSafe(1, BigDecimal.ZERO.setScale(10, RoundingMode.UNNECESSARY)); + decimal128Vector1.setSafe(2, new BigDecimal("12345.67890").setScale(10, RoundingMode.UNNECESSARY)); + + decimal128Vector2.setNull(0); + decimal128Vector2.setSafe(1, BigDecimal.ZERO.setScale(5, RoundingMode.UNNECESSARY)); + decimal128Vector2.setSafe(2, new BigDecimal("98765.43210").setScale(5, RoundingMode.UNNECESSARY)); + + decimal256Vector1.setNull(0); + decimal256Vector1.setSafe(1, BigDecimal.ZERO.setScale(20, RoundingMode.UNNECESSARY)); + decimal256Vector1.setSafe(2, new BigDecimal("12345678901234567890.12345678901234567890").setScale(20, RoundingMode.UNNECESSARY)); + + decimal256Vector2.setNull(0); + decimal256Vector2.setSafe(1, BigDecimal.ZERO.setScale(10, RoundingMode.UNNECESSARY)); + decimal256Vector2.setSafe(2, new BigDecimal("98765432109876543210.9876543210").setScale(10, RoundingMode.UNNECESSARY)); + + File dataFile = new File(TMP, "testWriteNullableDecimals.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("decimal128_1")); + assertNull(record.get("decimal128_2")); + assertNull(record.get("decimal256_1")); + 
assertNull(record.get("decimal256_2")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(decimal128Vector1.getObject(row), decodeFixedDecimal(record, "decimal128_1")); + assertEquals(decimal128Vector2.getObject(row), decodeFixedDecimal(record, "decimal128_2")); + assertEquals(decimal256Vector1.getObject(row), decodeFixedDecimal(record, "decimal256_1")); + assertEquals(decimal256Vector2.getObject(row), decodeFixedDecimal(record, "decimal256_2")); + } + } + } + } + + private static BigDecimal decodeFixedDecimal(GenericRecord record, String fieldName) { + GenericData.Fixed fixed = (GenericData.Fixed) record.get(fieldName); + var logicalType = LogicalTypes.fromSchema(fixed.getSchema()); + return new Conversions.DecimalConversion().fromFixed(fixed, fixed.getSchema(), logicalType); + } } From c91e976c51915f9779c3cb67b447ff0ca93aa4f9 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 22:06:22 +0000 Subject: [PATCH 58/89] Test for logical date type --- .../adapter/avro/ArrowToAvroDataTest.java | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index fd478b5aed..e964fab61d 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -24,6 +24,7 @@ import java.math.BigDecimal; import java.math.RoundingMode; import java.nio.ByteBuffer; +import java.time.LocalDate; import java.util.Arrays; import java.util.List; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; @@ -32,6 +33,8 @@ import org.apache.arrow.memory.util.Float16; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import 
org.apache.arrow.vector.DateMilliVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Decimal256Vector; import org.apache.arrow.vector.FieldVector; @@ -50,6 +53,7 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; @@ -1028,4 +1032,130 @@ private static BigDecimal decodeFixedDecimal(GenericRecord record, String fieldN var logicalType = LogicalTypes.fromSchema(fixed.getSchema()); return new Conversions.DecimalConversion().fromFixed(fixed, fixed.getSchema(), logicalType); } + + @Test + public void testWriteDates() throws Exception { + + // Field definitions + FieldType dateDayField = new FieldType(false, new ArrowType.Date(DateUnit.DAY), null); + FieldType dateMillisField = new FieldType(false, new ArrowType.Date(DateUnit.MILLISECOND), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + DateDayVector dateDayVector = new DateDayVector(new Field("dateDay", dateDayField, null), allocator); + DateMilliVector dateMillisVector = new DateMilliVector(new Field("dateMillis", dateMillisField, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(dateDayVector, dateMillisVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + dateDayVector.setSafe(0, (int) LocalDate.now().toEpochDay()); + dateDayVector.setSafe(1, (int) LocalDate.now().toEpochDay() + 1); + dateDayVector.setSafe(2, (int) LocalDate.now().toEpochDay() + 2); + + dateMillisVector.setSafe(0, LocalDate.now().toEpochDay() * 86400000L); + dateMillisVector.setSafe(1, (LocalDate.now().toEpochDay() + 1) * 86400000L); + 
dateMillisVector.setSafe(2, (LocalDate.now().toEpochDay() + 2) * 86400000L); + + File dataFile = new File(TMP, "testWriteDates.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(dateDayVector.get(row), record.get("dateDay")); + assertEquals(dateMillisVector.get(row), ((long) (Integer) record.get("dateMillis")) * 86400000L); + } + } + } + } + + @Test + public void testWriteNullableDates() throws Exception { + + // Field definitions + FieldType dateDayField = new FieldType(true, new ArrowType.Date(DateUnit.DAY), null); + FieldType dateMillisField = new FieldType(true, new ArrowType.Date(DateUnit.MILLISECOND), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + DateDayVector dateDayVector = new DateDayVector(new Field("dateDay", dateDayField, null), allocator); + DateMilliVector dateMillisVector = new DateMilliVector(new Field("dateMillis", dateMillisField, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(dateDayVector, dateMillisVector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + 
root.allocateNew(); + + // Set test data + dateDayVector.setNull(0); + dateDayVector.setSafe(1, 0); + dateDayVector.setSafe(2, (int) LocalDate.now().toEpochDay()); + + dateMillisVector.setNull(0); + dateMillisVector.setSafe(1, 0); + dateMillisVector.setSafe(2, LocalDate.now().toEpochDay() * 86400000L); + + File dataFile = new File(TMP, "testWriteNullableDates.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("dateDay")); + assertNull(record.get("dateMillis")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(dateDayVector.get(row), record.get("dateDay")); + assertEquals(dateMillisVector.get(row), ((long) (Integer) record.get("dateMillis")) * 86400000L); + } + } + } + } } From 534acb8773abe6627af0994b716e81679de36905 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 22:37:52 +0000 Subject: [PATCH 59/89] Add tests for time data types --- .../adapter/avro/ArrowToAvroDataTest.java | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java 
b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index e964fab61d..85cdd262d3 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -45,6 +45,10 @@ import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeNanoVector; import org.apache.arrow.vector.TinyIntVector; import org.apache.arrow.vector.UInt1Vector; import org.apache.arrow.vector.UInt2Vector; @@ -55,6 +59,7 @@ import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -1158,4 +1163,160 @@ record = datumReader.read(record, decoder); } } } + + @Test + public void testWriteTimes() throws Exception { + + // Field definitions + FieldType timeSecField = new FieldType(false, new ArrowType.Time(TimeUnit.SECOND, 32), null); + FieldType timeMillisField = new FieldType(false, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null); + FieldType timeMicrosField = new FieldType(false, new ArrowType.Time(TimeUnit.MICROSECOND, 64), null); + FieldType timeNanosField = new FieldType(false, new ArrowType.Time(TimeUnit.NANOSECOND, 64), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + TimeSecVector timeSecVector = new TimeSecVector(new Field("timeSec", timeSecField, null), allocator); + TimeMilliVector timeMillisVector = new TimeMilliVector(new Field("timeMillis", timeMillisField, 
null), allocator); + TimeMicroVector timeMicrosVector = new TimeMicroVector(new Field("timeMicros", timeMicrosField, null), allocator); + TimeNanoVector timeNanosVector = new TimeNanoVector(new Field("timeNanos", timeNanosField, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(timeSecVector, timeMillisVector, timeMicrosVector, timeNanosVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + timeSecVector.setSafe(0, (int) (System.currentTimeMillis() / 1000)); + timeSecVector.setSafe(1, (int) (System.currentTimeMillis() / 1000 - 1)); + timeSecVector.setSafe(2, (int) (System.currentTimeMillis() / 1000 - 2)); + + timeMillisVector.setSafe(0, (int) System.currentTimeMillis()); + timeMillisVector.setSafe(1, (int) System.currentTimeMillis() - 1000); + timeMillisVector.setSafe(2, (int) System.currentTimeMillis() - 2000); + + timeMicrosVector.setSafe(0, System.currentTimeMillis() * 1000); + timeMicrosVector.setSafe(1, (System.currentTimeMillis() - 1000) * 1000); + timeMicrosVector.setSafe(2, (System.currentTimeMillis() - 2000) * 1000); + + timeNanosVector.setSafe(0, System.currentTimeMillis() * 1000000); + timeNanosVector.setSafe(1, (System.currentTimeMillis() - 1000) * 1000000); + timeNanosVector.setSafe(2, (System.currentTimeMillis() - 2000) * 1000000); + + File dataFile = new File(TMP, "testWriteTimes.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + 
GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(timeSecVector.get(row), (int) ((long) record.get("timeSec") / 1000000)); + assertEquals(timeMillisVector.get(row), record.get("timeMillis")); + assertEquals(timeMicrosVector.get(row), record.get("timeMicros")); + assertEquals(timeNanosVector.get(row), (long) record.get("timeNanos") * 1000); + } + } + } + } + + @Test + public void testWriteNullableTimes() throws Exception { + + // Field definitions + FieldType timeSecField = new FieldType(true, new ArrowType.Time(TimeUnit.SECOND, 32), null); + FieldType timeMillisField = new FieldType(true, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null); + FieldType timeMicrosField = new FieldType(true, new ArrowType.Time(TimeUnit.MICROSECOND, 64), null); + FieldType timeNanosField = new FieldType(true, new ArrowType.Time(TimeUnit.NANOSECOND, 64), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + TimeSecVector timeSecVector = new TimeSecVector(new Field("timeSec", timeSecField, null), allocator); + TimeMilliVector timeMillisVector = new TimeMilliVector(new Field("timeMillis", timeMillisField, null), allocator); + TimeMicroVector timeMicrosVector = new TimeMicroVector(new Field("timeMicros", timeMicrosField, null), allocator); + TimeNanoVector timeNanosVector = new TimeNanoVector(new Field("timeNanos", timeNanosField, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(timeSecVector, timeMillisVector, timeMicrosVector, timeNanosVector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + 
timeSecVector.setNull(0); + timeSecVector.setSafe(1, 0); + timeSecVector.setSafe(2, (int) (System.currentTimeMillis() / 1000)); + + timeMillisVector.setNull(0); + timeMillisVector.setSafe(1, 0); + timeMillisVector.setSafe(2, (int) System.currentTimeMillis()); + + timeMicrosVector.setNull(0); + timeMicrosVector.setSafe(1, 0); + timeMicrosVector.setSafe(2, System.currentTimeMillis() * 1000); + + timeNanosVector.setNull(0); + timeNanosVector.setSafe(1, 0); + timeNanosVector.setSafe(2, System.currentTimeMillis() * 1000000); + + File dataFile = new File(TMP, "testWriteNullableTimes.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("timeSec")); + assertNull(record.get("timeMillis")); + assertNull(record.get("timeMicros")); + assertNull(record.get("timeNanos")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(timeSecVector.get(row), (int) ((long) record.get("timeSec") / 1000000)); + assertEquals(timeMillisVector.get(row), record.get("timeMillis")); + assertEquals(timeMicrosVector.get(row), record.get("timeMicros")); + assertEquals(timeNanosVector.get(row), (long) record.get("timeNanos") * 1000); + } + } + } + } } 
From d21c658db50f66ef8eaaf6596d62c900e8f6fb12 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 23:14:31 +0000 Subject: [PATCH 60/89] Add tests for timestamp types --- .../adapter/avro/ArrowToAvroDataTest.java | 354 +++++++++++++++++- 1 file changed, 338 insertions(+), 16 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 85cdd262d3..378c2661bf 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -24,7 +24,9 @@ import java.math.BigDecimal; import java.math.RoundingMode; import java.nio.ByteBuffer; +import java.time.Instant; import java.time.LocalDate; +import java.time.ZonedDateTime; import java.util.Arrays; import java.util.List; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; @@ -45,6 +47,14 @@ import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; import org.apache.arrow.vector.TimeSecVector; import org.apache.arrow.vector.TimeMilliVector; import org.apache.arrow.vector.TimeMicroVector; @@ -1190,21 +1200,21 @@ public void testWriteTimes() throws Exception { root.allocateNew(); // Set test data - timeSecVector.setSafe(0, (int) (System.currentTimeMillis() / 1000)); - timeSecVector.setSafe(1, (int) (System.currentTimeMillis() / 1000 - 1)); - 
timeSecVector.setSafe(2, (int) (System.currentTimeMillis() / 1000 - 2)); + timeSecVector.setSafe(0, ZonedDateTime.now().toLocalTime().toSecondOfDay()); + timeSecVector.setSafe(1, ZonedDateTime.now().toLocalTime().toSecondOfDay() - 1); + timeSecVector.setSafe(2, ZonedDateTime.now().toLocalTime().toSecondOfDay() - 2); - timeMillisVector.setSafe(0, (int) System.currentTimeMillis()); - timeMillisVector.setSafe(1, (int) System.currentTimeMillis() - 1000); - timeMillisVector.setSafe(2, (int) System.currentTimeMillis() - 2000); + timeMillisVector.setSafe(0, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000)); + timeMillisVector.setSafe(1, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000) - 1000); + timeMillisVector.setSafe(2, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000) - 2000); - timeMicrosVector.setSafe(0, System.currentTimeMillis() * 1000); - timeMicrosVector.setSafe(1, (System.currentTimeMillis() - 1000) * 1000); - timeMicrosVector.setSafe(2, (System.currentTimeMillis() - 2000) * 1000); + timeMicrosVector.setSafe(0, ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000); + timeMicrosVector.setSafe(1, ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000 - 1000000); + timeMicrosVector.setSafe(2, ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000 - 2000000); - timeNanosVector.setSafe(0, System.currentTimeMillis() * 1000000); - timeNanosVector.setSafe(1, (System.currentTimeMillis() - 1000) * 1000000); - timeNanosVector.setSafe(2, (System.currentTimeMillis() - 2000) * 1000000); + timeNanosVector.setSafe(0, ZonedDateTime.now().toLocalTime().toNanoOfDay()); + timeNanosVector.setSafe(1, ZonedDateTime.now().toLocalTime().toNanoOfDay() - 1000000000); + timeNanosVector.setSafe(2, ZonedDateTime.now().toLocalTime().toNanoOfDay() - 2000000000); File dataFile = new File(TMP, "testWriteTimes.avro"); @@ -1268,19 +1278,19 @@ public void testWriteNullableTimes() throws Exception { // Set test data timeSecVector.setNull(0); 
timeSecVector.setSafe(1, 0); - timeSecVector.setSafe(2, (int) (System.currentTimeMillis() / 1000)); + timeSecVector.setSafe(2, ZonedDateTime.now().toLocalTime().toSecondOfDay()); timeMillisVector.setNull(0); timeMillisVector.setSafe(1, 0); - timeMillisVector.setSafe(2, (int) System.currentTimeMillis()); + timeMillisVector.setSafe(2, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000)); timeMicrosVector.setNull(0); timeMicrosVector.setSafe(1, 0); - timeMicrosVector.setSafe(2, System.currentTimeMillis() * 1000); + timeMicrosVector.setSafe(2, ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000); timeNanosVector.setNull(0); timeNanosVector.setSafe(1, 0); - timeNanosVector.setSafe(2, System.currentTimeMillis() * 1000000); + timeNanosVector.setSafe(2, ZonedDateTime.now().toLocalTime().toNanoOfDay()); File dataFile = new File(TMP, "testWriteNullableTimes.avro"); @@ -1319,4 +1329,316 @@ record = datumReader.read(record, decoder); } } } + + @Test + public void testWriteZoneAwareTimestamps() throws Exception { + + // Field definitions + FieldType timestampSecField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.SECOND, "UTC"), null); + FieldType timestampMillisField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"), null); + FieldType timestampMicrosField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"), null); + FieldType timestampNanosField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC"), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + TimeStampSecTZVector timestampSecVector = new TimeStampSecTZVector(new Field("timestampSec", timestampSecField, null), allocator); + TimeStampMilliTZVector timestampMillisVector = new TimeStampMilliTZVector(new Field("timestampMillis", timestampMillisField, null), allocator); + TimeStampMicroTZVector timestampMicrosVector = new TimeStampMicroTZVector(new Field("timestampMicros",
timestampMicrosField, null), allocator); + TimeStampNanoTZVector timestampNanosVector = new TimeStampNanoTZVector(new Field("timestampNanos", timestampNanosField, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + timestampSecVector.setSafe(0, (int) Instant.now().getEpochSecond()); + timestampSecVector.setSafe(1, (int) Instant.now().getEpochSecond() - 1); + timestampSecVector.setSafe(2, (int) Instant.now().getEpochSecond() - 2); + + timestampMillisVector.setSafe(0, (int) Instant.now().toEpochMilli()); + timestampMillisVector.setSafe(1, (int) Instant.now().toEpochMilli() - 1000); + timestampMillisVector.setSafe(2, (int) Instant.now().toEpochMilli() - 2000); + + timestampMicrosVector.setSafe(0, Instant.now().toEpochMilli() * 1000); + timestampMicrosVector.setSafe(1, (Instant.now().toEpochMilli() - 1000) * 1000); + timestampMicrosVector.setSafe(2, (Instant.now().toEpochMilli() - 2000) * 1000); + + timestampNanosVector.setSafe(0, Instant.now().toEpochMilli() * 1000000); + timestampNanosVector.setSafe(1, (Instant.now().toEpochMilli() - 1000) * 1000000); + timestampNanosVector.setSafe(2, (Instant.now().toEpochMilli() - 2000) * 1000000); + + File dataFile = new File(TMP, "testWriteZoneAwareTimestamps.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = 
ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); + assertEquals(timestampMillisVector.get(row), (int) (long) record.get("timestampMillis")); + assertEquals(timestampMicrosVector.get(row), record.get("timestampMicros")); + assertEquals(timestampNanosVector.get(row), record.get("timestampNanos")); + } + } + } + } + + @Test + public void testWriteNullableZoneAwareTimestamps() throws Exception { + + // Field definitions + FieldType timestampSecField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.SECOND, "UTC"), null); + FieldType timestampMillisField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"), null); + FieldType timestampMicrosField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"), null); + FieldType timestampNanosField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC"), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + TimeStampSecTZVector timestampSecVector = new TimeStampSecTZVector(new Field("timestampSec", timestampSecField, null), allocator); + TimeStampMilliTZVector timestampMillisVector = new TimeStampMilliTZVector(new Field("timestampMillis", timestampMillisField, null), allocator); + TimeStampMicroTZVector timestampMicrosVector = new TimeStampMicroTZVector(new Field("timestampMicros", timestampMicrosField, null), allocator); + TimeStampNanoTZVector timestampNanosVector = new TimeStampNanoTZVector(new Field("timestampNanos", timestampNanosField, null), 
allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + timestampSecVector.setNull(0); + timestampSecVector.setSafe(1, 0); + timestampSecVector.setSafe(2, (int) Instant.now().getEpochSecond()); + + timestampMillisVector.setNull(0); + timestampMillisVector.setSafe(1, 0); + timestampMillisVector.setSafe(2, (int) Instant.now().toEpochMilli()); + + timestampMicrosVector.setNull(0); + timestampMicrosVector.setSafe(1, 0); + timestampMicrosVector.setSafe(2, Instant.now().toEpochMilli() * 1000); + + timestampNanosVector.setNull(0); + timestampNanosVector.setSafe(1, 0); + timestampNanosVector.setSafe(2, Instant.now().toEpochMilli() * 1000000); + + File dataFile = new File(TMP, "testWriteNullableZoneAwareTimestamps.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("timestampSec")); + assertNull(record.get("timestampMillis")); + assertNull(record.get("timestampMicros")); + assertNull(record.get("timestampNanos")); + + for 
(int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); + assertEquals(timestampMillisVector.get(row), (int) (long) record.get("timestampMillis")); + assertEquals(timestampMicrosVector.get(row), record.get("timestampMicros")); + assertEquals(timestampNanosVector.get(row), record.get("timestampNanos")); + } + } + } + } + + @Test + public void testWriteLocalTimestamps() throws Exception { + + // Field definitions + FieldType timestampSecField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.SECOND, null), null); + FieldType timestampMillisField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.MILLISECOND, null), null); + FieldType timestampMicrosField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.MICROSECOND, null), null); + FieldType timestampNanosField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.NANOSECOND, null), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + TimeStampSecVector timestampSecVector = new TimeStampSecVector(new Field("timestampSec", timestampSecField, null), allocator); + TimeStampMilliVector timestampMillisVector = new TimeStampMilliVector(new Field("timestampMillis", timestampMillisField, null), allocator); + TimeStampMicroVector timestampMicrosVector = new TimeStampMicroVector(new Field("timestampMicros", timestampMicrosField, null), allocator); + TimeStampNanoVector timestampNanosVector = new TimeStampNanoVector(new Field("timestampNanos", timestampNanosField, null), allocator); + + // Set up VSR + List vectors = Arrays.asList(timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + timestampSecVector.setSafe(0, (int) Instant.now().getEpochSecond()); + 
timestampSecVector.setSafe(1, (int) Instant.now().getEpochSecond() - 1); + timestampSecVector.setSafe(2, (int) Instant.now().getEpochSecond() - 2); + + timestampMillisVector.setSafe(0, (int) Instant.now().toEpochMilli()); + timestampMillisVector.setSafe(1, (int) Instant.now().toEpochMilli() - 1000); + timestampMillisVector.setSafe(2, (int) Instant.now().toEpochMilli() - 2000); + + timestampMicrosVector.setSafe(0, Instant.now().toEpochMilli() * 1000); + timestampMicrosVector.setSafe(1, (Instant.now().toEpochMilli() - 1000) * 1000); + timestampMicrosVector.setSafe(2, (Instant.now().toEpochMilli() - 2000) * 1000); + + timestampNanosVector.setSafe(0, Instant.now().toEpochMilli() * 1000000); + timestampNanosVector.setSafe(1, (Instant.now().toEpochMilli() - 1000) * 1000000); + timestampNanosVector.setSafe(2, (Instant.now().toEpochMilli() - 2000) * 1000000); + + File dataFile = new File(TMP, "testWriteLocalTimestamps.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); + assertEquals(timestampMillisVector.get(row), (int) (long)
record.get("timestampMillis")); + assertEquals(timestampMicrosVector.get(row), record.get("timestampMicros")); + assertEquals(timestampNanosVector.get(row), record.get("timestampNanos")); + } + } + } + } + + @Test + public void testWriteNullableLocalTimestamps() throws Exception { + + // Field definitions + FieldType timestampSecField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.SECOND, null), null); + FieldType timestampMillisField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.MILLISECOND, null), null); + FieldType timestampMicrosField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.MICROSECOND, null), null); + FieldType timestampNanosField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.NANOSECOND, null), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + TimeStampSecVector timestampSecVector = new TimeStampSecVector(new Field("timestampSec", timestampSecField, null), allocator); + TimeStampMilliVector timestampMillisVector = new TimeStampMilliVector(new Field("timestampMillis", timestampMillisField, null), allocator); + TimeStampMicroVector timestampMicrosVector = new TimeStampMicroVector(new Field("timestampMicros", timestampMicrosField, null), allocator); + TimeStampNanoVector timestampNanosVector = new TimeStampNanoVector(new Field("timestampNanos", timestampNanosField, null), allocator); + + int rowCount = 3; + + // Set up VSR + List vectors = Arrays.asList(timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + timestampSecVector.setNull(0); + timestampSecVector.setSafe(1, 0); + timestampSecVector.setSafe(2, (int) Instant.now().getEpochSecond()); + + timestampMillisVector.setNull(0); + timestampMillisVector.setSafe(1, 0); + timestampMillisVector.setSafe(2, (int) Instant.now().toEpochMilli()); + + 
timestampMicrosVector.setNull(0); + timestampMicrosVector.setSafe(1, 0); + timestampMicrosVector.setSafe(2, Instant.now().toEpochMilli() * 1000); + + timestampNanosVector.setNull(0); + timestampNanosVector.setSafe(1, 0); + timestampNanosVector.setSafe(2, Instant.now().toEpochMilli() * 1000000); + + File dataFile = new File(TMP, "testWriteNullableLocalTimestamps.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + + // Read and check values + GenericRecord record = datumReader.read(null, decoder); + assertNull(record.get("timestampSec")); + assertNull(record.get("timestampMillis")); + assertNull(record.get("timestampMicros")); + assertNull(record.get("timestampNanos")); + + for (int row = 1; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); + assertEquals(timestampMillisVector.get(row), (int) (long) record.get("timestampMillis")); + assertEquals(timestampMicrosVector.get(row), record.get("timestampMicros")); + assertEquals(timestampNanosVector.get(row), record.get("timestampNanos")); + } + } + } + } } From 3c26e5d576b757478bd19c71e8aace227a54630f Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 23:17:39 +0000 Subject: [PATCH 61/89] Fix
missing increment in timestamp sec TZ producer --- .../avro/producers/logical/AvroTimestampSecTzProducer.java | 1 + 1 file changed, 1 insertion(+) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java index f524e59810..bd6cc14dad 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimestampSecTzProducer.java @@ -50,5 +50,6 @@ public void produce(Encoder encoder) throws IOException { } long utcMillis = utcSeconds * MILLIS_PER_SECOND; encoder.writeLong(utcMillis); + currentIndex++; } } From e9e2954f8aefc984c538006cec2c8a8b23e36014 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 23:18:25 +0000 Subject: [PATCH 62/89] Throw an error on overflow in date milli producer --- .../adapter/avro/producers/logical/AvroDateMilliProducer.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java index 0ce0beaca6..a64bb3a021 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDateMilliProducer.java @@ -40,6 +40,9 @@ public AvroDateMilliProducer(DateMilliVector vector) { public void produce(Encoder encoder) throws IOException { long millis = vector.getDataBuffer().getLong(currentIndex * (long) DateMilliVector.TYPE_WIDTH); long days = millis / MILLIS_PER_DAY; + if (days > (long) Integer.MAX_VALUE || days < (long) Integer.MIN_VALUE) { + throw new ArithmeticException("Date value is too large for 
Avro encoding"); + } encoder.writeInt((int) days); currentIndex++; } From 94c214b2713711f4a7f1e2da2bb1503e24883076 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 23:19:35 +0000 Subject: [PATCH 63/89] Include negative values in decimal tests --- .../apache/arrow/adapter/avro/ArrowToAvroDataTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 378c2661bf..ef92f3c91e 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -913,19 +913,19 @@ public void testWriteDecimals() throws Exception { // Set test data decimal128Vector1.setSafe(0, new BigDecimal("12345.67890").setScale(10, RoundingMode.UNNECESSARY)); - decimal128Vector1.setSafe(1, new BigDecimal("98765.43210").setScale(10, RoundingMode.UNNECESSARY)); + decimal128Vector1.setSafe(1, new BigDecimal("-98765.43210").setScale(10, RoundingMode.UNNECESSARY)); decimal128Vector1.setSafe(2, new BigDecimal("54321.09876").setScale(10, RoundingMode.UNNECESSARY)); decimal128Vector2.setSafe(0, new BigDecimal("12345.67890").setScale(5, RoundingMode.UNNECESSARY)); - decimal128Vector2.setSafe(1, new BigDecimal("98765.43210").setScale(5, RoundingMode.UNNECESSARY)); + decimal128Vector2.setSafe(1, new BigDecimal("-98765.43210").setScale(5, RoundingMode.UNNECESSARY)); decimal128Vector2.setSafe(2, new BigDecimal("54321.09876").setScale(5, RoundingMode.UNNECESSARY)); decimal256Vector1.setSafe(0, new BigDecimal("12345678901234567890.12345678901234567890").setScale(20, RoundingMode.UNNECESSARY)); - decimal256Vector1.setSafe(1, new BigDecimal("98765432109876543210.98765432109876543210").setScale(20, RoundingMode.UNNECESSARY)); + decimal256Vector1.setSafe(1, new 
BigDecimal("-98765432109876543210.98765432109876543210").setScale(20, RoundingMode.UNNECESSARY)); decimal256Vector1.setSafe(2, new BigDecimal("54321098765432109876.54321098765432109876").setScale(20, RoundingMode.UNNECESSARY)); decimal256Vector2.setSafe(0, new BigDecimal("12345678901234567890.1234567890").setScale(10, RoundingMode.UNNECESSARY)); - decimal256Vector2.setSafe(1, new BigDecimal("98765432109876543210.9876543210").setScale(10, RoundingMode.UNNECESSARY)); + decimal256Vector2.setSafe(1, new BigDecimal("-98765432109876543210.9876543210").setScale(10, RoundingMode.UNNECESSARY)); decimal256Vector2.setSafe(2, new BigDecimal("54321098765432109876.5432109876").setScale(10, RoundingMode.UNNECESSARY)); File dataFile = new File(TMP, "testWriteDecimals.avro"); From ce81b1419b3ae66e24443638e495a5f695f46e09 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 23:28:51 +0000 Subject: [PATCH 64/89] Revise logic for decimal producers --- .../logical/AvroDecimal256Producer.java | 19 ++++++++++-- .../logical/AvroDecimalProducer.java | 31 +++++++++++++++++-- 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java index 76067a496d..737c8caf66 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java @@ -16,19 +16,32 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import org.apache.arrow.adapter.avro.producers.AvroFixedSizeBinaryProducer; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.vector.Decimal256Vector; +import org.apache.avro.io.Encoder; + +import java.io.IOException; +import java.math.BigDecimal; /** * Producer that 
produces decimal values from a {@link Decimal256Vector}, writes data to an Avro * encoder. */ -public class AvroDecimal256Producer extends AvroFixedSizeBinaryProducer { +public class AvroDecimal256Producer extends BaseAvroProducer { + + // Logic is the same as for DecimalVector (128 bit) - // Decimal stored as fixed width bytes, matches Avro decimal encoding + byte[] encodedBytes = new byte[Decimal256Vector.TYPE_WIDTH]; /** Instantiate an AvroDecimalProducer. */ public AvroDecimal256Producer(Decimal256Vector vector) { super(vector); } + + @Override + public void produce(Encoder encoder) throws IOException { + BigDecimal value = vector.getObject(currentIndex++); + AvroDecimalProducer.encodeDecimal(value, encodedBytes); + encoder.writeFixed(encodedBytes); + } } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java index 033b45b7cc..d4e55d55aa 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java @@ -16,19 +16,44 @@ */ package org.apache.arrow.adapter.avro.producers.logical; -import org.apache.arrow.adapter.avro.producers.AvroFixedSizeBinaryProducer; +import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.util.DecimalUtility; +import org.apache.avro.io.Encoder; + +import java.io.IOException; +import java.math.BigDecimal; /** * Producer that produces decimal values from a {@link DecimalVector}, writes data to an Avro * encoder. 
*/ -public class AvroDecimalProducer extends AvroFixedSizeBinaryProducer { +public class AvroDecimalProducer extends BaseAvroProducer { + + // Arrow stores decimals with native endianness, but Avro requires big endian + // Writing the Arrow representation as fixed bytes fails on little-end machines + // Instead, we replicate the big endian logic explicitly here + // See DecimalUtility.writeByteArrayToArrowBufHelper - // Decimal stored as fixed width bytes, matches Avro decimal encoding + byte[] encodedBytes = new byte[DecimalVector.TYPE_WIDTH]; /** Instantiate an AvroDecimalProducer. */ public AvroDecimalProducer(DecimalVector vector) { super(vector); } + + @Override + public void produce(Encoder encoder) throws IOException { + // Use getObject() to go back to a BigDecimal then re-encode + BigDecimal value = vector.getObject(currentIndex++); + encodeDecimal(value, encodedBytes); + encoder.writeFixed(encodedBytes); + } + + static void encodeDecimal(BigDecimal value, byte[] encodedBytes) { + byte[] valueBytes = value.unscaledValue().toByteArray(); + byte[] padding = valueBytes[0] < 0 ? 
DecimalUtility.minus_one : DecimalUtility.zeroes; + System.arraycopy(padding, 0, encodedBytes, 0, encodedBytes.length - valueBytes.length); + System.arraycopy(valueBytes, 0, encodedBytes, encodedBytes.length - valueBytes.length, valueBytes.length); + } } From 20bb46f888859762282b715f00d8b73113609249 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Sun, 23 Mar 2025 23:40:02 +0000 Subject: [PATCH 65/89] Use time-millis for time (sec) vectors instead of time-micros (consistent with timestamp and should not roll over for time of day) --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 10 +++++----- .../producers/logical/AvroTimeSecProducer.java | 15 ++++++++++----- .../arrow/adapter/avro/ArrowToAvroDataTest.java | 4 ++-- .../arrow/adapter/avro/ArrowToAvroSchemaTest.java | 8 ++++---- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 409e34da61..b36fd32f99 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -132,8 +132,8 @@ public class ArrowToAvroUtils { *

  • ArrowType.FixedSizeBinary --> FIXED *
  • ArrowType.Decimal --> decimal (FIXED) *
  • ArrowType.Date --> date (INT) - *
  • ArrowType.Time (MILLI) --> time-millis (INT) - *
  • ArrowType.Time (SEC | MICRO | NANO) --> time-micros (LONG) + *
  • ArrowType.Time (SEC | MILLI) --> time-millis (INT) + *
  • ArrowType.Time (MICRO | NANO) --> time-micros (LONG) *
  • ArrowType.Timestamp (NANOSECONDS, TZ != NULL) --> time-nanos (LONG) *
  • ArrowType.Timestamp (MICROSECONDS, TZ != NULL) --> time-micros (LONG) *
  • ArrowType.Timestamp (MILLISECONDS | SECONDS, TZ != NULL) --> time-millis (LONG) @@ -326,7 +326,7 @@ private static T buildBaseTypeSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); - if (timeType.getUnit() == TimeUnit.MILLISECOND) { + if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { return builder.intBuilder().prop("logicalType", "time-millis").endInt(); } else { // All other time types (sec, micro, nano) are encoded as time-micros (LONG) @@ -410,7 +410,7 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); - if (timeType.getUnit() == TimeUnit.MILLISECOND) { + if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { return builder.intBuilder().prop("logicalType", "time-millis").endInt().noDefault(); } else { // All other time types (sec, micro, nano) are encoded as time-micros (LONG) @@ -504,7 +504,7 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); - if (timeType.getUnit() == TimeUnit.MILLISECOND) { + if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { return (SchemaBuilder.UnionAccumulator) builder.intBuilder().prop("logicalType", "time-millis").endInt(); } else { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java index 87ebbb04c3..951605b6c3 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroTimeSecProducer.java @@ -27,10 +27,12 @@ */ public class AvroTimeSecProducer extends BaseAvroProducer { - // Convert seconds to microseconds for 
Avro time-micros (LONG) type - // Range is 1000 times more than for milliseconds, so won't fit into time-millis (INT) + // Convert seconds to milliseconds for Avro time-millis (INT) type + // INT is enough to cover the number of milliseconds in a day + // So overflows should not happen if values are valid times of day - private static final long MICROS_PER_SECOND = 1000000; + private static final int MILLIS_PER_SECOND = 1000; + private static final long OVERFLOW_LIMIT = Integer.MAX_VALUE / 1000; /** Instantiate an AvroTimeSecProducer. */ public AvroTimeSecProducer(TimeSecVector vector) { @@ -40,8 +42,11 @@ public AvroTimeSecProducer(TimeSecVector vector) { @Override public void produce(Encoder encoder) throws IOException { int seconds = vector.getDataBuffer().getInt(currentIndex * (long) TimeSecVector.TYPE_WIDTH); - long micros = seconds * MICROS_PER_SECOND; - encoder.writeLong(micros); + if (Math.abs(seconds) > OVERFLOW_LIMIT) { + throw new ArithmeticException("Time value is too large for Avro encoding"); + } + int millis = seconds * MILLIS_PER_SECOND; + encoder.writeInt(millis); currentIndex++; } } diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index ef92f3c91e..133218242e 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -1240,7 +1240,7 @@ public void testWriteTimes() throws Exception { // Read and check values for (int row = 0; row < rowCount; row++) { record = datumReader.read(record, decoder); - assertEquals(timeSecVector.get(row), (int) ((long) record.get("timeSec") / 1000000)); + assertEquals(timeSecVector.get(row), (int) (record.get("timeSec")) / 1000); assertEquals(timeMillisVector.get(row), record.get("timeMillis")); assertEquals(timeMicrosVector.get(row), record.get("timeMicros")); 
assertEquals(timeNanosVector.get(row), (long) record.get("timeNanos") * 1000); @@ -1321,7 +1321,7 @@ public void testWriteNullableTimes() throws Exception { for (int row = 1; row < rowCount; row++) { record = datumReader.read(record, decoder); - assertEquals(timeSecVector.get(row), (int) ((long) record.get("timeSec") / 1000000)); + assertEquals(timeSecVector.get(row), ((int) record.get("timeSec") / 1000)); assertEquals(timeMillisVector.get(row), record.get("timeMillis")); assertEquals(timeMicrosVector.get(row), record.get("timeMicros")); assertEquals(timeNanosVector.get(row), (long) record.get("timeNanos") * 1000); diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java index 68bd8852a5..7228c27996 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java @@ -515,15 +515,15 @@ public void testConvertTimeTypes() { assertEquals(Schema.Type.UNION, schema.getField("nullableTimeSec").schema().getType()); assertEquals(2, schema.getField("nullableTimeSec").schema().getTypes().size()); Schema nullableTimeSecSchema = schema.getField("nullableTimeSec").schema().getTypes().get(0); - assertEquals(Schema.Type.LONG, nullableTimeSecSchema.getType()); - assertEquals("time-micros", nullableTimeSecSchema.getProp("logicalType")); + assertEquals(Schema.Type.INT, nullableTimeSecSchema.getType()); + assertEquals("time-millis", nullableTimeSecSchema.getProp("logicalType")); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeSec").schema().getTypes().get(1).getType()); // Assertions for nonNullableTimeSec Schema nonNullableTimeSecSchema = schema.getField("nonNullableTimeSec").schema(); - assertEquals(Schema.Type.LONG, nonNullableTimeSecSchema.getType()); - assertEquals("time-micros", 
nonNullableTimeSecSchema.getProp("logicalType")); + assertEquals(Schema.Type.INT, nonNullableTimeSecSchema.getType()); + assertEquals("time-millis", nonNullableTimeSecSchema.getProp("logicalType")); // Assertions for nullableTimeMillis assertEquals(Schema.Type.UNION, schema.getField("nullableTimeMillis").schema().getType()); From 976ac8538d2074babac3c9fd3e30d2a13689d189 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 01:10:33 +0000 Subject: [PATCH 66/89] Tests for list and fixed list --- .../adapter/avro/ArrowToAvroDataTest.java | 411 ++++++++++++++++++ 1 file changed, 411 insertions(+) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 133218242e..1f17e5186c 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -45,6 +45,8 @@ import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeStampMicroTZVector; @@ -1641,4 +1643,413 @@ record = datumReader.read(record, decoder); } } } + + // Data production for containers of primitive and logical types, nullable and non-nullable + + @Test + public void testWriteLists() throws Exception { + + // Field definitions + FieldType intListField = new FieldType(false, new ArrowType.List(), null); + FieldType stringListField = new FieldType(false, new ArrowType.List(), null); + FieldType dateListField = new FieldType(false, new ArrowType.List(), null); + + Field intField = new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), 
null); + Field stringField = new Field("item", FieldType.notNullable(new ArrowType.Utf8()), null); + Field dateField = new Field("item", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + ListVector intListVector = new ListVector("intList", allocator, intListField, null); + ListVector stringListVector = new ListVector("stringList", allocator, stringListField, null); + ListVector dateListVector = new ListVector("dateList", allocator, dateListField, null); + + intListVector.initializeChildrenFromFields(Arrays.asList(intField)); + stringListVector.initializeChildrenFromFields(Arrays.asList(stringField)); + dateListVector.initializeChildrenFromFields(Arrays.asList(dateField)); + + // Set up VSR + List vectors = Arrays.asList(intListVector, stringListVector, dateListVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data for intList + for (int i = 0, offset = 0; i < rowCount; i++) { + intListVector.startNewValue(i); + IntVector indDataVector = (IntVector) intListVector.getDataVector(); + for (int j = 0; j < 5 - i; j++) { + indDataVector.set(offset + j, j); + } + intListVector.endValue(i, 5 - i); + offset += 5 - i; + } + + // Set test data for stringList + for (int i = 0, offset = 0; i < rowCount; i++) { + stringListVector.startNewValue(i); + VarCharVector varCharVector = (VarCharVector) stringListVector.getDataVector(); + for (int j = 0; j < 5 - i; j++) { + varCharVector.setSafe(offset + j, ("string" + j).getBytes()); + } + stringListVector.endValue(i, 5 - i); + offset += 5 - i; + } + + // Set test data for dateList + for (int i = 0, offset = 0; i < rowCount; i++) { + dateListVector.startNewValue(i); + DateDayVector dateVector = (DateDayVector) dateListVector.getDataVector(); + for (int j = 0; j < 5 - i; j++) { + dateVector.setSafe(offset + j, (int) 
LocalDate.now().plusDays(j).toEpochDay()); + } + dateListVector.endValue(i, 5 - i); + offset += 5 - i; + } + + File dataFile = new File(TMP, "testWriteLists.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(intListVector.getObject(row), record.get("intList")); + assertEquals(dateListVector.getObject(row), record.get("dateList")); + // Handle conversion from Arrow Text type + List vectorList = stringListVector.getObject(row); + List recordList = (List) record.get("stringList"); + assertEquals(vectorList.size(), recordList.size()); + for (int i = 0; i < vectorList.size(); i++) { + assertEquals(vectorList.get(i).toString(), recordList.get(i).toString()); + } + } + } + } + } + + @Test + public void testWriteNullableLists() throws Exception { + + // Field definitions + FieldType nullListType = new FieldType(true, new ArrowType.List(), null); + FieldType nonNullListType = new FieldType(false, new ArrowType.List(), null); + + Field nullFieldType = new Field("item", FieldType.nullable(new ArrowType.Int(32, true)), null); + Field nonNullFieldType = new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), null); 
+ + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + ListVector nullEntriesVector = new ListVector("nullEntriesVector", allocator, nonNullListType, null); + ListVector nullListVector = new ListVector("nullListVector", allocator, nullListType, null); + ListVector nullBothVector = new ListVector("nullBothVector", allocator, nullListType, null); + + nullEntriesVector.initializeChildrenFromFields(Arrays.asList(nullFieldType)); + nullListVector.initializeChildrenFromFields(Arrays.asList(nonNullFieldType)); + nullBothVector.initializeChildrenFromFields(Arrays.asList(nullFieldType)); + + + // Set up VSR + List vectors = Arrays.asList(nullEntriesVector, nullListVector, nullBothVector); + int rowCount = 4; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data for nullEntriesVector + IntVector nullEntriesData = (IntVector) nullEntriesVector.getDataVector(); + nullEntriesVector.startNewValue(0); + nullEntriesData.setNull(0); + nullEntriesVector.endValue(0, 1); + nullEntriesVector.startNewValue(1); + nullEntriesData.set(1, 0); + nullEntriesVector.endValue(1, 1); + nullEntriesVector.startNewValue(2); + nullEntriesData.set(2, 1); + nullEntriesVector.endValue(2, 1); + nullEntriesVector.startNewValue(3); + nullEntriesData.set(3, 2); + nullEntriesVector.endValue(3, 1); + + // Set test data for nullListVector + IntVector nullListData = (IntVector) nullListVector.getDataVector(); + nullListVector.setNull(0); + nullListVector.startNewValue(1); + nullListData.set(0, 0); + nullListVector.endValue(1, 1); + nullListVector.startNewValue(2); + nullListData.set(1, 1); + nullListVector.endValue(2, 1); + nullListVector.startNewValue(3); + nullListData.set(2, 2); + nullListVector.endValue(3, 1); + + // Set test data for nullBothVector + IntVector nullBothData = (IntVector) nullBothVector.getDataVector(); + nullBothVector.setNull(0); + nullBothVector.startNewValue(1); + 
nullBothData.setNull(0); + nullBothVector.endValue(1, 1); + nullBothVector.startNewValue(2); + nullBothData.set(1, 0); + nullBothVector.endValue(2, 1); + nullBothVector.startNewValue(3); + nullBothData.set(2, 1); + nullBothVector.endValue(3, 1); + + File dataFile = new File(TMP, "testWriteNullableLists.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + for (String list : Arrays.asList("nullEntriesVector", "nullListVector", "nullBothVector")) { + ListVector vector = (ListVector) root.getVector(list); + Object recordField = record.get(list); + if (vector.isNull(row)) { + assertNull(recordField); + } else { + assertEquals(vector.getObject(row), recordField); + } + } + } + } + } + } + + @Test + public void testWriteFixedLists() throws Exception { + + // Field definitions + FieldType intListField = new FieldType(false, new ArrowType.FixedSizeList(5), null); + FieldType stringListField = new FieldType(false, new ArrowType.FixedSizeList(5), null); + FieldType dateListField = new FieldType(false, new ArrowType.FixedSizeList(5), null); + + Field intField = new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field 
stringField = new Field("item", FieldType.notNullable(new ArrowType.Utf8()), null); + Field dateField = new Field("item", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + FixedSizeListVector intListVector = new FixedSizeListVector("intList", allocator, intListField, null); + FixedSizeListVector stringListVector = new FixedSizeListVector("stringList", allocator, stringListField, null); + FixedSizeListVector dateListVector = new FixedSizeListVector("dateList", allocator, dateListField, null); + + intListVector.initializeChildrenFromFields(Arrays.asList(intField)); + stringListVector.initializeChildrenFromFields(Arrays.asList(stringField)); + dateListVector.initializeChildrenFromFields(Arrays.asList(dateField)); + + // Set up VSR + List vectors = Arrays.asList(intListVector, stringListVector, dateListVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data for intList + for (int i = 0, offset = 0; i < rowCount; i++) { + intListVector.startNewValue(i); + IntVector indDataVector = (IntVector) intListVector.getDataVector(); + for (int j = 0; j < 5; j++) { + indDataVector.set(offset + j, j); + } + offset += 5; + } + + // Set test data for stringList + for (int i = 0, offset = 0; i < rowCount; i++) { + stringListVector.startNewValue(i); + VarCharVector varCharVector = (VarCharVector) stringListVector.getDataVector(); + for (int j = 0; j < 5; j++) { + varCharVector.setSafe(offset + j, ("string" + j).getBytes()); + } + offset += 5; + } + + // Set test data for dateList + for (int i = 0, offset = 0; i < rowCount; i++) { + dateListVector.startNewValue(i); + DateDayVector dateVector = (DateDayVector) dateListVector.getDataVector(); + for (int j = 0; j < 5; j++) { + dateVector.setSafe(offset + j, (int) LocalDate.now().plusDays(j).toEpochDay()); + } + offset += 5; + } + + 
File dataFile = new File(TMP, "testWriteFixedLists.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertEquals(intListVector.getObject(row), record.get("intList")); + assertEquals(dateListVector.getObject(row), record.get("dateList")); + // Handle conversion from Arrow Text type + List vectorList = stringListVector.getObject(row); + List recordList = (List) record.get("stringList"); + assertEquals(vectorList.size(), recordList.size()); + for (int i = 0; i < vectorList.size(); i++) { + assertEquals(vectorList.get(i).toString(), recordList.get(i).toString()); + } + } + } + } + } + + @Test + public void testWriteNullableFixedLists() throws Exception { + + // Field definitions + FieldType nullListType = new FieldType(true, new ArrowType.FixedSizeList(1), null); + FieldType nonNullListType = new FieldType(false, new ArrowType.FixedSizeList(1), null); + + Field nullFieldType = new Field("item", FieldType.nullable(new ArrowType.Int(32, true)), null); + Field nonNullFieldType = new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), null); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + 
FixedSizeListVector nullEntriesVector = new FixedSizeListVector("nullEntriesVector", allocator, nonNullListType, null); + FixedSizeListVector nullListVector = new FixedSizeListVector("nullListVector", allocator, nullListType, null); + FixedSizeListVector nullBothVector = new FixedSizeListVector("nullBothVector", allocator, nullListType, null); + + nullEntriesVector.initializeChildrenFromFields(Arrays.asList(nullFieldType)); + nullListVector.initializeChildrenFromFields(Arrays.asList(nonNullFieldType)); + nullBothVector.initializeChildrenFromFields(Arrays.asList(nullFieldType)); + + + // Set up VSR + List vectors = Arrays.asList(nullEntriesVector, nullListVector, nullBothVector); + int rowCount = 4; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data for nullEntriesVector + IntVector nullEntriesData = (IntVector) nullEntriesVector.getDataVector(); + nullEntriesVector.startNewValue(0); + nullEntriesData.setNull(0); + nullEntriesVector.startNewValue(1); + nullEntriesData.set(1, 0); + nullEntriesVector.startNewValue(2); + nullEntriesData.set(2, 1); + nullEntriesVector.startNewValue(3); + nullEntriesData.set(3, 2); + + // Set test data for nullListVector + IntVector nullListData = (IntVector) nullListVector.getDataVector(); + nullListVector.setNull(0); + nullListVector.startNewValue(1); + nullListData.set(1, 0); + nullListVector.startNewValue(2); + nullListData.set(2, 1); + nullListVector.startNewValue(3); + nullListData.set(3, 2); + + // Set test data for nullBothVector + IntVector nullBothData = (IntVector) nullBothVector.getDataVector(); + nullBothVector.setNull(0); + nullBothVector.startNewValue(1); + nullBothData.setNull(1); + nullBothVector.startNewValue(2); + nullBothData.set(2, 0); + nullBothVector.startNewValue(3); + nullBothData.set(3, 1); + + File dataFile = new File(TMP, "testWriteNullableFixedLists.avro"); + + // Write an AVRO block using the producer classes + try 
(FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + for (String list : Arrays.asList("nullEntriesVector", "nullListVector", "nullBothVector")) { + FixedSizeListVector vector = (FixedSizeListVector) root.getVector(list); + Object recordField = record.get(list); + if (vector.isNull(row)) { + assertNull(recordField); + } else { + assertEquals(vector.getObject(row), recordField); + } + } + } + } + } + } } From aa03bdd1bd86cfba3590189378e29e26187f7426 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 01:10:48 +0000 Subject: [PATCH 67/89] Producer fixes for fixed size list --- .../adapter/avro/producers/AvroFixedSizeListProducer.java | 7 ++++++- .../arrow/adapter/avro/producers/AvroListProducer.java | 2 +- .../arrow/adapter/avro/producers/AvroNullableProducer.java | 6 +++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java index bf299253db..fba7f441fc 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java +++ 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java @@ -51,7 +51,12 @@ public void produce(Encoder encoder) throws IOException { currentIndex++; } - // Do not override skipNull(), the delegate delegate vector will not hold data + @Override + public void skipNull() { + // Keep fixed sized child in sync + delegate.skipNull(); + super.skipNull(); + } @Override public void setPosition(int index) { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java index 1aa61fa6c6..38745be6ab 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java @@ -53,7 +53,7 @@ public void produce(Encoder encoder) throws IOException { currentIndex++; } - // Do not override skipNull(), the delegate delegate vector will not hold data + // Do not override skipNull(), delegate will not have an entry if the list is null @Override public void setPosition(int index) { diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java index 2729e9ece1..9d956772c2 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java @@ -51,9 +51,9 @@ public void produce(Encoder encoder) throws IOException { @Override public void skipNull() { - // Should never be called on nullable producer - // Calling produce() will skipNull() on the delegate - throw new UnsupportedOperationException(); + // Can be called by containers of nullable types + delegate.skipNull(); + currentIndex++; } @Override From 
5ccef030d9af89b1ffc0f3ad8aa9bec4e62efed6 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 02:21:49 +0000 Subject: [PATCH 68/89] Test for map types --- .../adapter/avro/ArrowToAvroDataTest.java | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 1f17e5186c..e8dc49eecd 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -28,7 +28,9 @@ import java.time.LocalDate; import java.time.ZonedDateTime; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; @@ -47,6 +49,7 @@ import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeStampMicroTZVector; @@ -69,12 +72,15 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.writer.BaseWriter; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.JsonStringArrayList; +import 
org.apache.arrow.vector.util.JsonStringHashMap; import org.apache.avro.Conversions; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; @@ -85,6 +91,7 @@ import org.apache.avro.io.BinaryEncoder; import org.apache.avro.io.DecoderFactory; import org.apache.avro.io.EncoderFactory; +import org.apache.avro.util.Utf8; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -2052,4 +2059,140 @@ record = datumReader.read(record, decoder); } } } + + @Test + public void testWriteNonNullableMap() throws Exception { + + // Field definitions + FieldType intMapField = new FieldType(false, new ArrowType.Map(false), null); + FieldType stringMapField = new FieldType(false, new ArrowType.Map(false), null); + FieldType dateMapField = new FieldType(false, new ArrowType.Map(false), null); + + Field keyField = new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null); + Field intField = new Field("value", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field stringField = new Field("value", FieldType.notNullable(new ArrowType.Utf8()), null); + Field dateField = new Field("value", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); + + Field intEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, intField)); + Field stringEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, stringField)); + Field dateEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, dateField)); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + MapVector intMapVector = new MapVector("intMap", allocator, intMapField, null); + MapVector stringMapVector = new MapVector("stringMap", allocator, stringMapField, null); + MapVector dateMapVector = new MapVector("dateMap", allocator, dateMapField, null); + + 
intMapVector.initializeChildrenFromFields(Arrays.asList(intEntryField)); + stringMapVector.initializeChildrenFromFields(Arrays.asList(stringEntryField)); + dateMapVector.initializeChildrenFromFields(Arrays.asList(dateEntryField)); + + // Set up VSR + List vectors = Arrays.asList(intMapVector, stringMapVector, dateMapVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data for intList + BaseWriter.MapWriter writer = intMapVector.getWriter(); + for (int i = 0; i < rowCount; i++) { + writer.startMap(); + for (int j = 0; j < 5 - i; j++) { + writer.startEntry(); + writer.key().varChar().writeVarChar("key" + j); + writer.value().integer().writeInt(j); + writer.endEntry(); + } + writer.endMap(); + } + + // Set test data for stringList + BaseWriter.MapWriter stringWriter = stringMapVector.getWriter(); + for (int i = 0; i < rowCount; i++) { + stringWriter.startMap(); + for (int j = 0; j < 5 - i; j++) { + stringWriter.startEntry(); + stringWriter.key().varChar().writeVarChar("key" + j); + stringWriter.value().varChar().writeVarChar("string" + j); + stringWriter.endEntry(); + } + stringWriter.endMap(); + } + + // Set test data for dateList + BaseWriter.MapWriter dateWriter = dateMapVector.getWriter(); + for (int i = 0; i < rowCount; i++) { + dateWriter.startMap(); + for (int j = 0; j < 5 - i; j++) { + dateWriter.startEntry(); + dateWriter.key().varChar().writeVarChar("key" + j); + dateWriter.value().dateDay().writeDateDay((int) LocalDate.now().plusDays(j).toEpochDay()); + dateWriter.endEntry(); + } + dateWriter.endMap(); + } + + File dataFile = new File(TMP, "testWriteNonNullableMap.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = 
ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + Map intMap = convertMap(intMapVector.getObject(row)); + Map stringMap = convertMap(stringMapVector.getObject(row)); + Map dateMap = convertMap(dateMapVector.getObject(row)); + compareMaps(intMap, (Map) record.get("intMap")); + compareMaps(stringMap, (Map) record.get("stringMap")); + compareMaps(dateMap, (Map) record.get("dateMap")); + } + } + } + } + + private Map convertMap(List entryList) { + + Map map = new HashMap<>(); + JsonStringArrayList structList = (JsonStringArrayList) entryList; + for (Object entry : structList) { + JsonStringHashMap structEntry = (JsonStringHashMap) entry; + String key = structEntry.get(MapVector.KEY_NAME).toString(); + Object value = structEntry.get(MapVector.VALUE_NAME); + map.put(key, value); + } + return map; + } + + private void compareMaps(Map expected, Map actual) { + assertEquals(expected.size(), actual.size()); + for (Object key : actual.keySet()) { + assertTrue(expected.containsKey(key.toString())); + Object actualValue = actual.get(key); + if (actualValue instanceof Utf8) { + assertEquals(expected.get(key.toString()).toString(), actualValue.toString()); + } else { + assertEquals(expected.get(key.toString()), actual.get(key)); + } + } + } } From a7ca15142229e8aaf787d50745e5b8da77504b89 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 02:42:28 +0000 
Subject: [PATCH 69/89] Test for nullable map types --- .../adapter/avro/ArrowToAvroDataTest.java | 149 ++++++++++++++++-- 1 file changed, 139 insertions(+), 10 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index e8dc49eecd..ad39d7aaa4 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -2061,7 +2061,7 @@ record = datumReader.read(record, decoder); } @Test - public void testWriteNonNullableMap() throws Exception { + public void testWriteMap() throws Exception { // Field definitions FieldType intMapField = new FieldType(false, new ArrowType.Map(false), null); @@ -2135,7 +2135,7 @@ public void testWriteNonNullableMap() throws Exception { dateWriter.endMap(); } - File dataFile = new File(TMP, "testWriteNonNullableMap.avro"); + File dataFile = new File(TMP, "testWriteMap.avro"); // Write an AVRO block using the producer classes try (FileOutputStream fos = new FileOutputStream(dataFile)) { @@ -2170,8 +2170,132 @@ record = datumReader.read(record, decoder); } } + @Test + public void testWriteNullableMap() throws Exception { + + // Field definitions + FieldType nullMapType = new FieldType(true, new ArrowType.Map(false), null); + FieldType nonNullMapType = new FieldType(false, new ArrowType.Map(false), null); + + Field keyField = new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null); + Field nullFieldType = new Field("value", FieldType.nullable(new ArrowType.Int(32, true)), null); + Field nonNullFieldType = new Field("value", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field nullEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, nullFieldType)); + Field nonNullEntryField = new Field("entries", 
FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, nonNullFieldType)); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + MapVector nullEntriesVector = new MapVector("nullEntriesVector", allocator, nonNullMapType, null); + MapVector nullMapVector = new MapVector("nullMapVector", allocator, nullMapType, null); + MapVector nullBothVector = new MapVector("nullBothVector", allocator, nullMapType, null); + + nullEntriesVector.initializeChildrenFromFields(Arrays.asList(nullEntryField)); + nullMapVector.initializeChildrenFromFields(Arrays.asList(nonNullEntryField)); + nullBothVector.initializeChildrenFromFields(Arrays.asList(nullEntryField)); + + // Set up VSR + List vectors = Arrays.asList(nullEntriesVector, nullMapVector, nullBothVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data for intList + BaseWriter.MapWriter writer = nullEntriesVector.getWriter(); + writer.startMap(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key0"); + writer.value().integer().writeNull(); + writer.endEntry(); + writer.endMap(); + writer.startMap(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key1"); + writer.value().integer().writeInt(0); + writer.endEntry(); + writer.endMap(); + writer.startMap(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key2"); + writer.value().integer().writeInt(1); + writer.endEntry(); + writer.endMap(); + + // Set test data for stringList + BaseWriter.MapWriter nullMapWriter = nullMapVector.getWriter(); + nullMapWriter.writeNull(); + nullMapWriter.startMap(); + nullMapWriter.startEntry(); + nullMapWriter.key().varChar().writeVarChar("key1"); + nullMapWriter.value().integer().writeInt(0); + nullMapWriter.endEntry(); + nullMapWriter.endMap(); + nullMapWriter.startMap(); + nullMapWriter.startEntry(); + nullMapWriter.key().varChar().writeVarChar("key2"); 
+ nullMapWriter.value().integer().writeInt(1); + nullMapWriter.endEntry(); + nullMapWriter.endMap(); + + // Set test data for dateList + BaseWriter.MapWriter nullBothWriter = nullBothVector.getWriter(); + nullBothWriter.writeNull(); + nullBothWriter.startMap(); + nullBothWriter.startEntry(); + nullBothWriter.key().varChar().writeVarChar("key1"); + nullBothWriter.value().integer().writeNull(); + nullBothWriter.endEntry(); + nullBothWriter.endMap(); + nullBothWriter.startMap(); + nullBothWriter.startEntry(); + nullBothWriter.key().varChar().writeVarChar("key2"); + nullBothWriter.value().integer().writeInt(0); + nullBothWriter.endEntry(); + nullBothWriter.endMap(); + + File dataFile = new File(TMP, "testWriteNullableMap.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + Map intMap = convertMap(nullEntriesVector.getObject(row)); + Map stringMap = convertMap(nullMapVector.getObject(row)); + Map dateMap = convertMap(nullBothVector.getObject(row)); + compareMaps(intMap, (Map) record.get("nullEntriesVector")); + compareMaps(stringMap, (Map) record.get("nullMapVector")); + compareMaps(dateMap, (Map) record.get("nullBothVector")); + } + } 
+ } + } + private Map convertMap(List entryList) { + if (entryList == null) { + return null; + } + Map map = new HashMap<>(); JsonStringArrayList structList = (JsonStringArrayList) entryList; for (Object entry : structList) { @@ -2184,14 +2308,19 @@ private Map convertMap(List entryList) { } private void compareMaps(Map expected, Map actual) { - assertEquals(expected.size(), actual.size()); - for (Object key : actual.keySet()) { - assertTrue(expected.containsKey(key.toString())); - Object actualValue = actual.get(key); - if (actualValue instanceof Utf8) { - assertEquals(expected.get(key.toString()).toString(), actualValue.toString()); - } else { - assertEquals(expected.get(key.toString()), actual.get(key)); + if (expected == null) { + assertNull(actual); + } + else { + assertEquals(expected.size(), actual.size()); + for (Object key : actual.keySet()) { + assertTrue(expected.containsKey(key.toString())); + Object actualValue = actual.get(key); + if (actualValue instanceof Utf8) { + assertEquals(expected.get(key.toString()).toString(), actualValue.toString()); + } else { + assertEquals(expected.get(key.toString()), actual.get(key)); + } } } } From f26a2e1fa96dd33b8ab91458aacbc58f017761c5 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 02:44:40 +0000 Subject: [PATCH 70/89] Apply spotless --- .../logical/AvroDecimal256Producer.java | 5 +- .../logical/AvroDecimalProducer.java | 8 +- .../adapter/avro/ArrowToAvroDataTest.java | 551 ++++++++++++------ 3 files changed, 378 insertions(+), 186 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java index 737c8caf66..f72aa6d9e0 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java +++ 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimal256Producer.java @@ -16,13 +16,12 @@ */ package org.apache.arrow.adapter.avro.producers.logical; +import java.io.IOException; +import java.math.BigDecimal; import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.vector.Decimal256Vector; import org.apache.avro.io.Encoder; -import java.io.IOException; -import java.math.BigDecimal; - /** * Producer that produces decimal values from a {@link Decimal256Vector}, writes data to an Avro * encoder. diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java index d4e55d55aa..51ad7c7200 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/logical/AvroDecimalProducer.java @@ -16,14 +16,13 @@ */ package org.apache.arrow.adapter.avro.producers.logical; +import java.io.IOException; +import java.math.BigDecimal; import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.util.DecimalUtility; import org.apache.avro.io.Encoder; -import java.io.IOException; -import java.math.BigDecimal; - /** * Producer that produces decimal values from a {@link DecimalVector}, writes data to an Avro * encoder. @@ -54,6 +53,7 @@ static void encodeDecimal(BigDecimal value, byte[] encodedBytes) { byte[] valueBytes = value.unscaledValue().toByteArray(); byte[] padding = valueBytes[0] < 0 ? 
DecimalUtility.minus_one : DecimalUtility.zeroes; System.arraycopy(padding, 0, encodedBytes, 0, encodedBytes.length - valueBytes.length); - System.arraycopy(valueBytes, 0, encodedBytes, encodedBytes.length - valueBytes.length, valueBytes.length); + System.arraycopy( + valueBytes, 0, encodedBytes, encodedBytes.length - valueBytes.length, valueBytes.length); } } diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index ad39d7aaa4..9beed2969b 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -14,9 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.arrow.adapter.avro; +import static org.junit.jupiter.api.Assertions.*; + import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -39,19 +40,20 @@ import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; -import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float2Vector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; -import org.apache.arrow.vector.complex.FixedSizeListVector; -import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import 
org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; import org.apache.arrow.vector.TimeStampMicroTZVector; import org.apache.arrow.vector.TimeStampMicroVector; import org.apache.arrow.vector.TimeStampMilliTZVector; @@ -60,10 +62,6 @@ import org.apache.arrow.vector.TimeStampNanoVector; import org.apache.arrow.vector.TimeStampSecTZVector; import org.apache.arrow.vector.TimeStampSecVector; -import org.apache.arrow.vector.TimeSecVector; -import org.apache.arrow.vector.TimeMilliVector; -import org.apache.arrow.vector.TimeMicroVector; -import org.apache.arrow.vector.TimeNanoVector; import org.apache.arrow.vector.TinyIntVector; import org.apache.arrow.vector.UInt1Vector; import org.apache.arrow.vector.UInt2Vector; @@ -72,6 +70,9 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.writer.BaseWriter; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; @@ -95,12 +96,9 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import static org.junit.jupiter.api.Assertions.*; - public class ArrowToAvroDataTest { - @TempDir - public static File TMP; + @TempDir public static File TMP; // Data production for primitive types, nullable and non-nullable @@ -289,7 +287,8 @@ public void testWriteIntegers() throws Exception { // Create empty vectors BufferAllocator allocator = new RootAllocator(); TinyIntVector int8Vector = new TinyIntVector(new Field("int8", int8Field, null), allocator); - SmallIntVector int16Vector = new SmallIntVector(new Field("int16", int16Field, null), allocator); + SmallIntVector int16Vector = + new SmallIntVector(new Field("int16", int16Field, null), 
allocator); IntVector int32Vector = new IntVector(new Field("int32", int32Field, null), allocator); BigIntVector int64Vector = new BigIntVector(new Field("int64", int64Field, null), allocator); UInt1Vector uint8Vector = new UInt1Vector(new Field("uint8", uint8Field, null), allocator); @@ -298,9 +297,16 @@ public void testWriteIntegers() throws Exception { UInt8Vector uint64Vector = new UInt8Vector(new Field("uint64", uint64Field, null), allocator); // Set up VSR - List vectors = Arrays.asList( - int8Vector, int16Vector, int32Vector, int64Vector, - uint8Vector, uint16Vector, uint32Vector, uint64Vector); + List vectors = + Arrays.asList( + int8Vector, + int16Vector, + int32Vector, + int64Vector, + uint8Vector, + uint16Vector, + uint32Vector, + uint64Vector); int rowCount = 12; @@ -394,7 +400,8 @@ public void testWriteNullableIntegers() throws Exception { // Create empty vectors BufferAllocator allocator = new RootAllocator(); TinyIntVector int8Vector = new TinyIntVector(new Field("int8", int8Field, null), allocator); - SmallIntVector int16Vector = new SmallIntVector(new Field("int16", int16Field, null), allocator); + SmallIntVector int16Vector = + new SmallIntVector(new Field("int16", int16Field, null), allocator); IntVector int32Vector = new IntVector(new Field("int32", int32Field, null), allocator); BigIntVector int64Vector = new BigIntVector(new Field("int64", int64Field, null), allocator); UInt1Vector uint8Vector = new UInt1Vector(new Field("uint8", uint8Field, null), allocator); @@ -405,9 +412,16 @@ public void testWriteNullableIntegers() throws Exception { int rowCount = 3; // Set up VSR - List vectors = Arrays.asList( - int8Vector, int16Vector, int32Vector, int64Vector, - uint8Vector, uint16Vector, uint32Vector, uint64Vector); + List vectors = + Arrays.asList( + int8Vector, + int16Vector, + int32Vector, + int64Vector, + uint8Vector, + uint16Vector, + uint32Vector, + uint64Vector); try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { @@ -494,15 +508,21 
@@ record = datumReader.read(record, decoder); public void testWriteFloatingPoints() throws Exception { // Field definitions - FieldType float16Field = new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.HALF), null); - FieldType float32Field = new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null); - FieldType float64Field = new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null); + FieldType float16Field = + new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.HALF), null); + FieldType float32Field = + new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null); + FieldType float64Field = + new FieldType(false, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - Float2Vector float16Vector = new Float2Vector(new Field("float16", float16Field, null), allocator); - Float4Vector float32Vector = new Float4Vector(new Field("float32", float32Field, null), allocator); - Float8Vector float64Vector = new Float8Vector(new Field("float64", float64Field, null), allocator); + Float2Vector float16Vector = + new Float2Vector(new Field("float16", float16Field, null), allocator); + Float4Vector float32Vector = + new Float4Vector(new Field("float32", float32Field, null), allocator); + Float8Vector float64Vector = + new Float8Vector(new Field("float64", float64Field, null), allocator); // Set up VSR List vectors = Arrays.asList(float16Vector, float32Vector, float64Vector); @@ -575,15 +595,21 @@ record = datumReader.read(record, decoder); public void testWriteNullableFloatingPoints() throws Exception { // Field definitions - FieldType float16Field = new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.HALF), null); - FieldType float32Field = new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null); - FieldType float64Field 
= new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null); + FieldType float16Field = + new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.HALF), null); + FieldType float32Field = + new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null); + FieldType float64Field = + new FieldType(true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - Float2Vector float16Vector = new Float2Vector(new Field("float16", float16Field, null), allocator); - Float4Vector float32Vector = new Float4Vector(new Field("float32", float32Field, null), allocator); - Float8Vector float64Vector = new Float8Vector(new Field("float64", float64Field, null), allocator); + Float2Vector float16Vector = + new Float2Vector(new Field("float16", float16Field, null), allocator); + Float4Vector float32Vector = + new Float4Vector(new Field("float32", float32Field, null), allocator); + Float8Vector float64Vector = + new Float8Vector(new Field("float64", float64Field, null), allocator); int rowCount = 3; @@ -654,7 +680,8 @@ public void testWriteStrings() throws Exception { // Create empty vector BufferAllocator allocator = new RootAllocator(); - VarCharVector stringVector = new VarCharVector(new Field("string", stringField, null), allocator); + VarCharVector stringVector = + new VarCharVector(new Field("string", stringField, null), allocator); // Set up VSR List vectors = Arrays.asList(stringVector); @@ -710,7 +737,8 @@ public void testWriteNullableStrings() throws Exception { // Create empty vector BufferAllocator allocator = new RootAllocator(); - VarCharVector stringVector = new VarCharVector(new Field("string", stringField, null), allocator); + VarCharVector stringVector = + new VarCharVector(new Field("string", stringField, null), allocator); int rowCount = 3; @@ -768,8 +796,10 @@ public void testWriteBinary() throws Exception { // Create 
empty vector BufferAllocator allocator = new RootAllocator(); - VarBinaryVector binaryVector = new VarBinaryVector(new Field("binary", binaryField, null), allocator); - FixedSizeBinaryVector fixedVector = new FixedSizeBinaryVector(new Field("fixed", fixedField, null), allocator); + VarBinaryVector binaryVector = + new VarBinaryVector(new Field("binary", binaryField, null), allocator); + FixedSizeBinaryVector fixedVector = + new FixedSizeBinaryVector(new Field("fixed", fixedField, null), allocator); // Set up VSR List vectors = Arrays.asList(binaryVector, fixedVector); @@ -781,13 +811,13 @@ public void testWriteBinary() throws Exception { root.allocateNew(); // Set test data - binaryVector.setSafe(0, new byte[]{1, 2, 3}); - binaryVector.setSafe(1, new byte[]{4, 5, 6, 7}); - binaryVector.setSafe(2, new byte[]{8, 9}); + binaryVector.setSafe(0, new byte[] {1, 2, 3}); + binaryVector.setSafe(1, new byte[] {4, 5, 6, 7}); + binaryVector.setSafe(2, new byte[] {8, 9}); - fixedVector.setSafe(0, new byte[]{1, 2, 3, 4, 5}); - fixedVector.setSafe(1, new byte[]{4, 5, 6, 7, 8, 9}); - fixedVector.setSafe(2, new byte[]{8, 9, 10, 11, 12}); + fixedVector.setSafe(0, new byte[] {1, 2, 3, 4, 5}); + fixedVector.setSafe(1, new byte[] {4, 5, 6, 7, 8, 9}); + fixedVector.setSafe(2, new byte[] {8, 9, 10, 11, 12}); File dataFile = new File(TMP, "testWriteBinary.avro"); @@ -833,8 +863,10 @@ public void testWriteNullableBinary() throws Exception { // Create empty vector BufferAllocator allocator = new RootAllocator(); - VarBinaryVector binaryVector = new VarBinaryVector(new Field("binary", binaryField, null), allocator); - FixedSizeBinaryVector fixedVector = new FixedSizeBinaryVector(new Field("fixed", fixedField, null), allocator); + VarBinaryVector binaryVector = + new VarBinaryVector(new Field("binary", binaryField, null), allocator); + FixedSizeBinaryVector fixedVector = + new FixedSizeBinaryVector(new Field("fixed", fixedField, null), allocator); int rowCount = 3; @@ -848,12 +880,12 @@ 
public void testWriteNullableBinary() throws Exception { // Set test data binaryVector.setNull(0); - binaryVector.setSafe(1, new byte[]{}); - binaryVector.setSafe(2, new byte[]{10, 11, 12}); + binaryVector.setSafe(1, new byte[] {}); + binaryVector.setSafe(2, new byte[] {10, 11, 12}); fixedVector.setNull(0); - fixedVector.setSafe(1, new byte[]{0, 0, 0, 0, 0}); - fixedVector.setSafe(2, new byte[]{10, 11, 12, 13, 14}); + fixedVector.setSafe(1, new byte[] {0, 0, 0, 0, 0}); + fixedVector.setSafe(2, new byte[] {10, 11, 12, 13, 14}); File dataFile = new File(TMP, "testWriteNullableBinary.avro"); @@ -906,13 +938,18 @@ public void testWriteDecimals() throws Exception { // Create empty vectors BufferAllocator allocator = new RootAllocator(); - DecimalVector decimal128Vector1 = new DecimalVector(new Field("decimal128_1", decimal128Field1, null), allocator); - DecimalVector decimal128Vector2 = new DecimalVector(new Field("decimal128_2", decimal128Field2, null), allocator); - Decimal256Vector decimal256Vector1 = new Decimal256Vector(new Field("decimal256_1", decimal256Field1, null), allocator); - Decimal256Vector decimal256Vector2 = new Decimal256Vector(new Field("decimal256_2", decimal256Field2, null), allocator); + DecimalVector decimal128Vector1 = + new DecimalVector(new Field("decimal128_1", decimal128Field1, null), allocator); + DecimalVector decimal128Vector2 = + new DecimalVector(new Field("decimal128_2", decimal128Field2, null), allocator); + Decimal256Vector decimal256Vector1 = + new Decimal256Vector(new Field("decimal256_1", decimal256Field1, null), allocator); + Decimal256Vector decimal256Vector2 = + new Decimal256Vector(new Field("decimal256_2", decimal256Field2, null), allocator); // Set up VSR - List vectors = Arrays.asList(decimal128Vector1, decimal128Vector2, decimal256Vector1, decimal256Vector2); + List vectors = + Arrays.asList(decimal128Vector1, decimal128Vector2, decimal256Vector1, decimal256Vector2); int rowCount = 3; try (VectorSchemaRoot root = new 
VectorSchemaRoot(vectors)) { @@ -921,21 +958,43 @@ public void testWriteDecimals() throws Exception { root.allocateNew(); // Set test data - decimal128Vector1.setSafe(0, new BigDecimal("12345.67890").setScale(10, RoundingMode.UNNECESSARY)); - decimal128Vector1.setSafe(1, new BigDecimal("-98765.43210").setScale(10, RoundingMode.UNNECESSARY)); - decimal128Vector1.setSafe(2, new BigDecimal("54321.09876").setScale(10, RoundingMode.UNNECESSARY)); - - decimal128Vector2.setSafe(0, new BigDecimal("12345.67890").setScale(5, RoundingMode.UNNECESSARY)); - decimal128Vector2.setSafe(1, new BigDecimal("-98765.43210").setScale(5, RoundingMode.UNNECESSARY)); - decimal128Vector2.setSafe(2, new BigDecimal("54321.09876").setScale(5, RoundingMode.UNNECESSARY)); - - decimal256Vector1.setSafe(0, new BigDecimal("12345678901234567890.12345678901234567890").setScale(20, RoundingMode.UNNECESSARY)); - decimal256Vector1.setSafe(1, new BigDecimal("-98765432109876543210.98765432109876543210").setScale(20, RoundingMode.UNNECESSARY)); - decimal256Vector1.setSafe(2, new BigDecimal("54321098765432109876.54321098765432109876").setScale(20, RoundingMode.UNNECESSARY)); - - decimal256Vector2.setSafe(0, new BigDecimal("12345678901234567890.1234567890").setScale(10, RoundingMode.UNNECESSARY)); - decimal256Vector2.setSafe(1, new BigDecimal("-98765432109876543210.9876543210").setScale(10, RoundingMode.UNNECESSARY)); - decimal256Vector2.setSafe(2, new BigDecimal("54321098765432109876.5432109876").setScale(10, RoundingMode.UNNECESSARY)); + decimal128Vector1.setSafe( + 0, new BigDecimal("12345.67890").setScale(10, RoundingMode.UNNECESSARY)); + decimal128Vector1.setSafe( + 1, new BigDecimal("-98765.43210").setScale(10, RoundingMode.UNNECESSARY)); + decimal128Vector1.setSafe( + 2, new BigDecimal("54321.09876").setScale(10, RoundingMode.UNNECESSARY)); + + decimal128Vector2.setSafe( + 0, new BigDecimal("12345.67890").setScale(5, RoundingMode.UNNECESSARY)); + decimal128Vector2.setSafe( + 1, new 
BigDecimal("-98765.43210").setScale(5, RoundingMode.UNNECESSARY)); + decimal128Vector2.setSafe( + 2, new BigDecimal("54321.09876").setScale(5, RoundingMode.UNNECESSARY)); + + decimal256Vector1.setSafe( + 0, + new BigDecimal("12345678901234567890.12345678901234567890") + .setScale(20, RoundingMode.UNNECESSARY)); + decimal256Vector1.setSafe( + 1, + new BigDecimal("-98765432109876543210.98765432109876543210") + .setScale(20, RoundingMode.UNNECESSARY)); + decimal256Vector1.setSafe( + 2, + new BigDecimal("54321098765432109876.54321098765432109876") + .setScale(20, RoundingMode.UNNECESSARY)); + + decimal256Vector2.setSafe( + 0, + new BigDecimal("12345678901234567890.1234567890").setScale(10, RoundingMode.UNNECESSARY)); + decimal256Vector2.setSafe( + 1, + new BigDecimal("-98765432109876543210.9876543210") + .setScale(10, RoundingMode.UNNECESSARY)); + decimal256Vector2.setSafe( + 2, + new BigDecimal("54321098765432109876.5432109876").setScale(10, RoundingMode.UNNECESSARY)); File dataFile = new File(TMP, "testWriteDecimals.avro"); @@ -961,10 +1020,14 @@ public void testWriteDecimals() throws Exception { // Read and check values for (int row = 0; row < rowCount; row++) { record = datumReader.read(record, decoder); - assertEquals(decimal128Vector1.getObject(row), decodeFixedDecimal(record, "decimal128_1")); - assertEquals(decimal128Vector2.getObject(row), decodeFixedDecimal(record, "decimal128_2")); - assertEquals(decimal256Vector1.getObject(row), decodeFixedDecimal(record, "decimal256_1")); - assertEquals(decimal256Vector2.getObject(row), decodeFixedDecimal(record, "decimal256_2")); + assertEquals( + decimal128Vector1.getObject(row), decodeFixedDecimal(record, "decimal128_1")); + assertEquals( + decimal128Vector2.getObject(row), decodeFixedDecimal(record, "decimal128_2")); + assertEquals( + decimal256Vector1.getObject(row), decodeFixedDecimal(record, "decimal256_1")); + assertEquals( + decimal256Vector2.getObject(row), decodeFixedDecimal(record, "decimal256_2")); } } } @@ 
-981,15 +1044,20 @@ public void testWriteNullableDecimals() throws Exception { // Create empty vectors BufferAllocator allocator = new RootAllocator(); - DecimalVector decimal128Vector1 = new DecimalVector(new Field("decimal128_1", decimal128Field1, null), allocator); - DecimalVector decimal128Vector2 = new DecimalVector(new Field("decimal128_2", decimal128Field2, null), allocator); - Decimal256Vector decimal256Vector1 = new Decimal256Vector(new Field("decimal256_1", decimal256Field1, null), allocator); - Decimal256Vector decimal256Vector2 = new Decimal256Vector(new Field("decimal256_2", decimal256Field2, null), allocator); + DecimalVector decimal128Vector1 = + new DecimalVector(new Field("decimal128_1", decimal128Field1, null), allocator); + DecimalVector decimal128Vector2 = + new DecimalVector(new Field("decimal128_2", decimal128Field2, null), allocator); + Decimal256Vector decimal256Vector1 = + new Decimal256Vector(new Field("decimal256_1", decimal256Field1, null), allocator); + Decimal256Vector decimal256Vector2 = + new Decimal256Vector(new Field("decimal256_2", decimal256Field2, null), allocator); int rowCount = 3; // Set up VSR - List vectors = Arrays.asList(decimal128Vector1, decimal128Vector2, decimal256Vector1, decimal256Vector2); + List vectors = + Arrays.asList(decimal128Vector1, decimal128Vector2, decimal256Vector1, decimal256Vector2); try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { @@ -999,19 +1067,26 @@ public void testWriteNullableDecimals() throws Exception { // Set test data decimal128Vector1.setNull(0); decimal128Vector1.setSafe(1, BigDecimal.ZERO.setScale(10, RoundingMode.UNNECESSARY)); - decimal128Vector1.setSafe(2, new BigDecimal("12345.67890").setScale(10, RoundingMode.UNNECESSARY)); + decimal128Vector1.setSafe( + 2, new BigDecimal("12345.67890").setScale(10, RoundingMode.UNNECESSARY)); decimal128Vector2.setNull(0); decimal128Vector2.setSafe(1, BigDecimal.ZERO.setScale(5, RoundingMode.UNNECESSARY)); - decimal128Vector2.setSafe(2, 
new BigDecimal("98765.43210").setScale(5, RoundingMode.UNNECESSARY)); + decimal128Vector2.setSafe( + 2, new BigDecimal("98765.43210").setScale(5, RoundingMode.UNNECESSARY)); decimal256Vector1.setNull(0); decimal256Vector1.setSafe(1, BigDecimal.ZERO.setScale(20, RoundingMode.UNNECESSARY)); - decimal256Vector1.setSafe(2, new BigDecimal("12345678901234567890.12345678901234567890").setScale(20, RoundingMode.UNNECESSARY)); + decimal256Vector1.setSafe( + 2, + new BigDecimal("12345678901234567890.12345678901234567890") + .setScale(20, RoundingMode.UNNECESSARY)); decimal256Vector2.setNull(0); decimal256Vector2.setSafe(1, BigDecimal.ZERO.setScale(10, RoundingMode.UNNECESSARY)); - decimal256Vector2.setSafe(2, new BigDecimal("98765432109876543210.9876543210").setScale(10, RoundingMode.UNNECESSARY)); + decimal256Vector2.setSafe( + 2, + new BigDecimal("98765432109876543210.9876543210").setScale(10, RoundingMode.UNNECESSARY)); File dataFile = new File(TMP, "testWriteNullableDecimals.avro"); @@ -1042,10 +1117,14 @@ public void testWriteNullableDecimals() throws Exception { for (int row = 1; row < rowCount; row++) { record = datumReader.read(record, decoder); - assertEquals(decimal128Vector1.getObject(row), decodeFixedDecimal(record, "decimal128_1")); - assertEquals(decimal128Vector2.getObject(row), decodeFixedDecimal(record, "decimal128_2")); - assertEquals(decimal256Vector1.getObject(row), decodeFixedDecimal(record, "decimal256_1")); - assertEquals(decimal256Vector2.getObject(row), decodeFixedDecimal(record, "decimal256_2")); + assertEquals( + decimal128Vector1.getObject(row), decodeFixedDecimal(record, "decimal128_1")); + assertEquals( + decimal128Vector2.getObject(row), decodeFixedDecimal(record, "decimal128_2")); + assertEquals( + decimal256Vector1.getObject(row), decodeFixedDecimal(record, "decimal256_1")); + assertEquals( + decimal256Vector2.getObject(row), decodeFixedDecimal(record, "decimal256_2")); } } } @@ -1062,12 +1141,15 @@ public void testWriteDates() throws 
Exception { // Field definitions FieldType dateDayField = new FieldType(false, new ArrowType.Date(DateUnit.DAY), null); - FieldType dateMillisField = new FieldType(false, new ArrowType.Date(DateUnit.MILLISECOND), null); + FieldType dateMillisField = + new FieldType(false, new ArrowType.Date(DateUnit.MILLISECOND), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - DateDayVector dateDayVector = new DateDayVector(new Field("dateDay", dateDayField, null), allocator); - DateMilliVector dateMillisVector = new DateMilliVector(new Field("dateMillis", dateMillisField, null), allocator); + DateDayVector dateDayVector = + new DateDayVector(new Field("dateDay", dateDayField, null), allocator); + DateMilliVector dateMillisVector = + new DateMilliVector(new Field("dateMillis", dateMillisField, null), allocator); // Set up VSR List vectors = Arrays.asList(dateDayVector, dateMillisVector); @@ -1112,7 +1194,8 @@ public void testWriteDates() throws Exception { for (int row = 0; row < rowCount; row++) { record = datumReader.read(record, decoder); assertEquals(dateDayVector.get(row), record.get("dateDay")); - assertEquals(dateMillisVector.get(row), ((long) (Integer) record.get("dateMillis")) * 86400000L); + assertEquals( + dateMillisVector.get(row), ((long) (Integer) record.get("dateMillis")) * 86400000L); } } } @@ -1127,8 +1210,10 @@ public void testWriteNullableDates() throws Exception { // Create empty vectors BufferAllocator allocator = new RootAllocator(); - DateDayVector dateDayVector = new DateDayVector(new Field("dateDay", dateDayField, null), allocator); - DateMilliVector dateMillisVector = new DateMilliVector(new Field("dateMillis", dateMillisField, null), allocator); + DateDayVector dateDayVector = + new DateDayVector(new Field("dateDay", dateDayField, null), allocator); + DateMilliVector dateMillisVector = + new DateMilliVector(new Field("dateMillis", dateMillisField, null), allocator); int rowCount = 3; @@ -1177,7 +1262,8 @@ public void 
testWriteNullableDates() throws Exception { for (int row = 1; row < rowCount; row++) { record = datumReader.read(record, decoder); assertEquals(dateDayVector.get(row), record.get("dateDay")); - assertEquals(dateMillisVector.get(row), ((long) (Integer) record.get("dateMillis")) * 86400000L); + assertEquals( + dateMillisVector.get(row), ((long) (Integer) record.get("dateMillis")) * 86400000L); } } } @@ -1188,19 +1274,27 @@ public void testWriteTimes() throws Exception { // Field definitions FieldType timeSecField = new FieldType(false, new ArrowType.Time(TimeUnit.SECOND, 32), null); - FieldType timeMillisField = new FieldType(false, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null); - FieldType timeMicrosField = new FieldType(false, new ArrowType.Time(TimeUnit.MICROSECOND, 64), null); - FieldType timeNanosField = new FieldType(false, new ArrowType.Time(TimeUnit.NANOSECOND, 64), null); + FieldType timeMillisField = + new FieldType(false, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null); + FieldType timeMicrosField = + new FieldType(false, new ArrowType.Time(TimeUnit.MICROSECOND, 64), null); + FieldType timeNanosField = + new FieldType(false, new ArrowType.Time(TimeUnit.NANOSECOND, 64), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - TimeSecVector timeSecVector = new TimeSecVector(new Field("timeSec", timeSecField, null), allocator); - TimeMilliVector timeMillisVector = new TimeMilliVector(new Field("timeMillis", timeMillisField, null), allocator); - TimeMicroVector timeMicrosVector = new TimeMicroVector(new Field("timeMicros", timeMicrosField, null), allocator); - TimeNanoVector timeNanosVector = new TimeNanoVector(new Field("timeNanos", timeNanosField, null), allocator); + TimeSecVector timeSecVector = + new TimeSecVector(new Field("timeSec", timeSecField, null), allocator); + TimeMilliVector timeMillisVector = + new TimeMilliVector(new Field("timeMillis", timeMillisField, null), allocator); + TimeMicroVector timeMicrosVector = 
+ new TimeMicroVector(new Field("timeMicros", timeMicrosField, null), allocator); + TimeNanoVector timeNanosVector = + new TimeNanoVector(new Field("timeNanos", timeNanosField, null), allocator); // Set up VSR - List vectors = Arrays.asList(timeSecVector, timeMillisVector, timeMicrosVector, timeNanosVector); + List vectors = + Arrays.asList(timeSecVector, timeMillisVector, timeMicrosVector, timeNanosVector); int rowCount = 3; try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { @@ -1213,9 +1307,12 @@ public void testWriteTimes() throws Exception { timeSecVector.setSafe(1, ZonedDateTime.now().toLocalTime().toSecondOfDay() - 1); timeSecVector.setSafe(2, ZonedDateTime.now().toLocalTime().toSecondOfDay() - 2); - timeMillisVector.setSafe(0, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000)); - timeMillisVector.setSafe(1, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000) - 1000); - timeMillisVector.setSafe(2, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000) - 2000); + timeMillisVector.setSafe( + 0, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000)); + timeMillisVector.setSafe( + 1, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000) - 1000); + timeMillisVector.setSafe( + 2, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000) - 2000); timeMicrosVector.setSafe(0, ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000); timeMicrosVector.setSafe(1, ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000 - 1000000); @@ -1263,21 +1360,29 @@ public void testWriteNullableTimes() throws Exception { // Field definitions FieldType timeSecField = new FieldType(true, new ArrowType.Time(TimeUnit.SECOND, 32), null); - FieldType timeMillisField = new FieldType(true, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null); - FieldType timeMicrosField = new FieldType(true, new ArrowType.Time(TimeUnit.MICROSECOND, 64), null); - FieldType timeNanosField = new FieldType(true, new 
ArrowType.Time(TimeUnit.NANOSECOND, 64), null); + FieldType timeMillisField = + new FieldType(true, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null); + FieldType timeMicrosField = + new FieldType(true, new ArrowType.Time(TimeUnit.MICROSECOND, 64), null); + FieldType timeNanosField = + new FieldType(true, new ArrowType.Time(TimeUnit.NANOSECOND, 64), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - TimeSecVector timeSecVector = new TimeSecVector(new Field("timeSec", timeSecField, null), allocator); - TimeMilliVector timeMillisVector = new TimeMilliVector(new Field("timeMillis", timeMillisField, null), allocator); - TimeMicroVector timeMicrosVector = new TimeMicroVector(new Field("timeMicros", timeMicrosField, null), allocator); - TimeNanoVector timeNanosVector = new TimeNanoVector(new Field("timeNanos", timeNanosField, null), allocator); + TimeSecVector timeSecVector = + new TimeSecVector(new Field("timeSec", timeSecField, null), allocator); + TimeMilliVector timeMillisVector = + new TimeMilliVector(new Field("timeMillis", timeMillisField, null), allocator); + TimeMicroVector timeMicrosVector = + new TimeMicroVector(new Field("timeMicros", timeMicrosField, null), allocator); + TimeNanoVector timeNanosVector = + new TimeNanoVector(new Field("timeNanos", timeNanosField, null), allocator); int rowCount = 3; // Set up VSR - List vectors = Arrays.asList(timeSecVector, timeMillisVector, timeMicrosVector, timeNanosVector); + List vectors = + Arrays.asList(timeSecVector, timeMillisVector, timeMicrosVector, timeNanosVector); try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { @@ -1291,7 +1396,8 @@ public void testWriteNullableTimes() throws Exception { timeMillisVector.setNull(0); timeMillisVector.setSafe(1, 0); - timeMillisVector.setSafe(2, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000)); + timeMillisVector.setSafe( + 2, (int) (ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000000)); 
timeMicrosVector.setNull(0); timeMicrosVector.setSafe(1, 0); @@ -1343,20 +1449,33 @@ record = datumReader.read(record, decoder); public void testWriteZoneAwareTimestamps() throws Exception { // Field definitions - FieldType timestampSecField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.SECOND, "UTC"), null); - FieldType timestampMillisField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"), null); - FieldType timestampMicrosField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"), null); - FieldType timestampNanosField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC"), null); + FieldType timestampSecField = + new FieldType(false, new ArrowType.Timestamp(TimeUnit.SECOND, "UTC"), null); + FieldType timestampMillisField = + new FieldType(false, new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"), null); + FieldType timestampMicrosField = + new FieldType(false, new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"), null); + FieldType timestampNanosField = + new FieldType(false, new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC"), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - TimeStampSecTZVector timestampSecVector = new TimeStampSecTZVector(new Field("timestampSec", timestampSecField, null), allocator); - TimeStampMilliTZVector timestampMillisVector = new TimeStampMilliTZVector(new Field("timestampMillis", timestampMillisField, null), allocator); - TimeStampMicroTZVector timestampMicrosVector = new TimeStampMicroTZVector(new Field("timestampMicros", timestampMicrosField, null), allocator); - TimeStampNanoTZVector timestampNanosVector = new TimeStampNanoTZVector(new Field("timestampNanos", timestampNanosField, null), allocator); + TimeStampSecTZVector timestampSecVector = + new TimeStampSecTZVector(new Field("timestampSec", timestampSecField, null), allocator); + TimeStampMilliTZVector timestampMillisVector = + new TimeStampMilliTZVector( + new 
Field("timestampMillis", timestampMillisField, null), allocator); + TimeStampMicroTZVector timestampMicrosVector = + new TimeStampMicroTZVector( + new Field("timestampMicros", timestampMicrosField, null), allocator); + TimeStampNanoTZVector timestampNanosVector = + new TimeStampNanoTZVector( + new Field("timestampNanos", timestampNanosField, null), allocator); // Set up VSR - List vectors = Arrays.asList(timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); + List vectors = + Arrays.asList( + timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); int rowCount = 3; try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { @@ -1405,7 +1524,8 @@ public void testWriteZoneAwareTimestamps() throws Exception { // Read and check values for (int row = 0; row < rowCount; row++) { record = datumReader.read(record, decoder); - assertEquals(timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); + assertEquals( + timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); assertEquals(timestampMillisVector.get(row), (int) (long) record.get("timestampMillis")); assertEquals(timestampMicrosVector.get(row), record.get("timestampMicros")); assertEquals(timestampNanosVector.get(row), record.get("timestampNanos")); @@ -1418,22 +1538,35 @@ record = datumReader.read(record, decoder); public void testWriteNullableZoneAwareTimestamps() throws Exception { // Field definitions - FieldType timestampSecField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.SECOND, "UTC"), null); - FieldType timestampMillisField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"), null); - FieldType timestampMicrosField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"), null); - FieldType timestampNanosField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC"), null); + FieldType timestampSecField = + new FieldType(true, 
new ArrowType.Timestamp(TimeUnit.SECOND, "UTC"), null); + FieldType timestampMillisField = + new FieldType(true, new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"), null); + FieldType timestampMicrosField = + new FieldType(true, new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"), null); + FieldType timestampNanosField = + new FieldType(true, new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC"), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - TimeStampSecTZVector timestampSecVector = new TimeStampSecTZVector(new Field("timestampSec", timestampSecField, null), allocator); - TimeStampMilliTZVector timestampMillisVector = new TimeStampMilliTZVector(new Field("timestampMillis", timestampMillisField, null), allocator); - TimeStampMicroTZVector timestampMicrosVector = new TimeStampMicroTZVector(new Field("timestampMicros", timestampMicrosField, null), allocator); - TimeStampNanoTZVector timestampNanosVector = new TimeStampNanoTZVector(new Field("timestampNanos", timestampNanosField, null), allocator); + TimeStampSecTZVector timestampSecVector = + new TimeStampSecTZVector(new Field("timestampSec", timestampSecField, null), allocator); + TimeStampMilliTZVector timestampMillisVector = + new TimeStampMilliTZVector( + new Field("timestampMillis", timestampMillisField, null), allocator); + TimeStampMicroTZVector timestampMicrosVector = + new TimeStampMicroTZVector( + new Field("timestampMicros", timestampMicrosField, null), allocator); + TimeStampNanoTZVector timestampNanosVector = + new TimeStampNanoTZVector( + new Field("timestampNanos", timestampNanosField, null), allocator); int rowCount = 3; // Set up VSR - List vectors = Arrays.asList(timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); + List vectors = + Arrays.asList( + timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { @@ -1486,7 +1619,8 @@ public 
void testWriteNullableZoneAwareTimestamps() throws Exception { for (int row = 1; row < rowCount; row++) { record = datumReader.read(record, decoder); - assertEquals(timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); + assertEquals( + timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); assertEquals(timestampMillisVector.get(row), (int) (long) record.get("timestampMillis")); assertEquals(timestampMicrosVector.get(row), record.get("timestampMicros")); assertEquals(timestampNanosVector.get(row), record.get("timestampNanos")); @@ -1499,20 +1633,32 @@ record = datumReader.read(record, decoder); public void testWriteLocalTimestamps() throws Exception { // Field definitions - FieldType timestampSecField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.SECOND, null), null); - FieldType timestampMillisField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.MILLISECOND, null), null); - FieldType timestampMicrosField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.MICROSECOND, null), null); - FieldType timestampNanosField = new FieldType(false, new ArrowType.Timestamp(TimeUnit.NANOSECOND, null), null); + FieldType timestampSecField = + new FieldType(false, new ArrowType.Timestamp(TimeUnit.SECOND, null), null); + FieldType timestampMillisField = + new FieldType(false, new ArrowType.Timestamp(TimeUnit.MILLISECOND, null), null); + FieldType timestampMicrosField = + new FieldType(false, new ArrowType.Timestamp(TimeUnit.MICROSECOND, null), null); + FieldType timestampNanosField = + new FieldType(false, new ArrowType.Timestamp(TimeUnit.NANOSECOND, null), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - TimeStampSecVector timestampSecVector = new TimeStampSecVector(new Field("timestampSec", timestampSecField, null), allocator); - TimeStampMilliVector timestampMillisVector = new TimeStampMilliVector(new Field("timestampMillis", timestampMillisField, null), allocator); - 
TimeStampMicroVector timestampMicrosVector = new TimeStampMicroVector(new Field("timestampMicros", timestampMicrosField, null), allocator); - TimeStampNanoVector timestampNanosVector = new TimeStampNanoVector(new Field("timestampNanos", timestampNanosField, null), allocator); + TimeStampSecVector timestampSecVector = + new TimeStampSecVector(new Field("timestampSec", timestampSecField, null), allocator); + TimeStampMilliVector timestampMillisVector = + new TimeStampMilliVector( + new Field("timestampMillis", timestampMillisField, null), allocator); + TimeStampMicroVector timestampMicrosVector = + new TimeStampMicroVector( + new Field("timestampMicros", timestampMicrosField, null), allocator); + TimeStampNanoVector timestampNanosVector = + new TimeStampNanoVector(new Field("timestampNanos", timestampNanosField, null), allocator); // Set up VSR - List vectors = Arrays.asList(timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); + List vectors = + Arrays.asList( + timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); int rowCount = 3; try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { @@ -1561,7 +1707,8 @@ public void testWriteLocalTimestamps() throws Exception { // Read and check values for (int row = 0; row < rowCount; row++) { record = datumReader.read(record, decoder); - assertEquals(timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); + assertEquals( + timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); assertEquals(timestampMillisVector.get(row), (int) (long) record.get("timestampMillis")); assertEquals(timestampMicrosVector.get(row), record.get("timestampMicros")); assertEquals(timestampNanosVector.get(row), record.get("timestampNanos")); @@ -1574,22 +1721,34 @@ record = datumReader.read(record, decoder); public void testWriteNullableLocalTimestamps() throws Exception { // Field definitions - FieldType timestampSecField = new 
FieldType(true, new ArrowType.Timestamp(TimeUnit.SECOND, null), null); - FieldType timestampMillisField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.MILLISECOND, null), null); - FieldType timestampMicrosField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.MICROSECOND, null), null); - FieldType timestampNanosField = new FieldType(true, new ArrowType.Timestamp(TimeUnit.NANOSECOND, null), null); + FieldType timestampSecField = + new FieldType(true, new ArrowType.Timestamp(TimeUnit.SECOND, null), null); + FieldType timestampMillisField = + new FieldType(true, new ArrowType.Timestamp(TimeUnit.MILLISECOND, null), null); + FieldType timestampMicrosField = + new FieldType(true, new ArrowType.Timestamp(TimeUnit.MICROSECOND, null), null); + FieldType timestampNanosField = + new FieldType(true, new ArrowType.Timestamp(TimeUnit.NANOSECOND, null), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - TimeStampSecVector timestampSecVector = new TimeStampSecVector(new Field("timestampSec", timestampSecField, null), allocator); - TimeStampMilliVector timestampMillisVector = new TimeStampMilliVector(new Field("timestampMillis", timestampMillisField, null), allocator); - TimeStampMicroVector timestampMicrosVector = new TimeStampMicroVector(new Field("timestampMicros", timestampMicrosField, null), allocator); - TimeStampNanoVector timestampNanosVector = new TimeStampNanoVector(new Field("timestampNanos", timestampNanosField, null), allocator); + TimeStampSecVector timestampSecVector = + new TimeStampSecVector(new Field("timestampSec", timestampSecField, null), allocator); + TimeStampMilliVector timestampMillisVector = + new TimeStampMilliVector( + new Field("timestampMillis", timestampMillisField, null), allocator); + TimeStampMicroVector timestampMicrosVector = + new TimeStampMicroVector( + new Field("timestampMicros", timestampMicrosField, null), allocator); + TimeStampNanoVector timestampNanosVector = + new TimeStampNanoVector(new 
Field("timestampNanos", timestampNanosField, null), allocator); int rowCount = 3; // Set up VSR - List vectors = Arrays.asList(timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); + List vectors = + Arrays.asList( + timestampSecVector, timestampMillisVector, timestampMicrosVector, timestampNanosVector); try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { @@ -1642,7 +1801,8 @@ public void testWriteNullableLocalTimestamps() throws Exception { for (int row = 1; row < rowCount; row++) { record = datumReader.read(record, decoder); - assertEquals(timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); + assertEquals( + timestampSecVector.get(row), (int) ((long) record.get("timestampSec") / 1000)); assertEquals(timestampMillisVector.get(row), (int) (long) record.get("timestampMillis")); assertEquals(timestampMicrosVector.get(row), record.get("timestampMicros")); assertEquals(timestampNanosVector.get(row), record.get("timestampNanos")); @@ -1663,7 +1823,8 @@ public void testWriteLists() throws Exception { Field intField = new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), null); Field stringField = new Field("item", FieldType.notNullable(new ArrowType.Utf8()), null); - Field dateField = new Field("item", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); + Field dateField = + new Field("item", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); @@ -1763,11 +1924,13 @@ public void testWriteNullableLists() throws Exception { FieldType nonNullListType = new FieldType(false, new ArrowType.List(), null); Field nullFieldType = new Field("item", FieldType.nullable(new ArrowType.Int(32, true)), null); - Field nonNullFieldType = new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field nonNullFieldType = + new Field("item", FieldType.notNullable(new ArrowType.Int(32, 
true)), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - ListVector nullEntriesVector = new ListVector("nullEntriesVector", allocator, nonNullListType, null); + ListVector nullEntriesVector = + new ListVector("nullEntriesVector", allocator, nonNullListType, null); ListVector nullListVector = new ListVector("nullListVector", allocator, nullListType, null); ListVector nullBothVector = new ListVector("nullBothVector", allocator, nullListType, null); @@ -1775,7 +1938,6 @@ public void testWriteNullableLists() throws Exception { nullListVector.initializeChildrenFromFields(Arrays.asList(nonNullFieldType)); nullBothVector.initializeChildrenFromFields(Arrays.asList(nullFieldType)); - // Set up VSR List vectors = Arrays.asList(nullEntriesVector, nullListVector, nullBothVector); int rowCount = 4; @@ -1849,7 +2011,8 @@ public void testWriteNullableLists() throws Exception { // Read and check values for (int row = 0; row < rowCount; row++) { record = datumReader.read(record, decoder); - for (String list : Arrays.asList("nullEntriesVector", "nullListVector", "nullBothVector")) { + for (String list : + Arrays.asList("nullEntriesVector", "nullListVector", "nullBothVector")) { ListVector vector = (ListVector) root.getVector(list); Object recordField = record.get(list); if (vector.isNull(row)) { @@ -1873,13 +2036,17 @@ public void testWriteFixedLists() throws Exception { Field intField = new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), null); Field stringField = new Field("item", FieldType.notNullable(new ArrowType.Utf8()), null); - Field dateField = new Field("item", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); + Field dateField = + new Field("item", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - FixedSizeListVector intListVector = new FixedSizeListVector("intList", allocator, intListField, null); - FixedSizeListVector 
stringListVector = new FixedSizeListVector("stringList", allocator, stringListField, null); - FixedSizeListVector dateListVector = new FixedSizeListVector("dateList", allocator, dateListField, null); + FixedSizeListVector intListVector = + new FixedSizeListVector("intList", allocator, intListField, null); + FixedSizeListVector stringListVector = + new FixedSizeListVector("stringList", allocator, stringListField, null); + FixedSizeListVector dateListVector = + new FixedSizeListVector("dateList", allocator, dateListField, null); intListVector.initializeChildrenFromFields(Arrays.asList(intField)); stringListVector.initializeChildrenFromFields(Arrays.asList(stringField)); @@ -1970,19 +2137,22 @@ public void testWriteNullableFixedLists() throws Exception { FieldType nonNullListType = new FieldType(false, new ArrowType.FixedSizeList(1), null); Field nullFieldType = new Field("item", FieldType.nullable(new ArrowType.Int(32, true)), null); - Field nonNullFieldType = new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field nonNullFieldType = + new Field("item", FieldType.notNullable(new ArrowType.Int(32, true)), null); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - FixedSizeListVector nullEntriesVector = new FixedSizeListVector("nullEntriesVector", allocator, nonNullListType, null); - FixedSizeListVector nullListVector = new FixedSizeListVector("nullListVector", allocator, nullListType, null); - FixedSizeListVector nullBothVector = new FixedSizeListVector("nullBothVector", allocator, nullListType, null); + FixedSizeListVector nullEntriesVector = + new FixedSizeListVector("nullEntriesVector", allocator, nonNullListType, null); + FixedSizeListVector nullListVector = + new FixedSizeListVector("nullListVector", allocator, nullListType, null); + FixedSizeListVector nullBothVector = + new FixedSizeListVector("nullBothVector", allocator, nullListType, null); 
nullEntriesVector.initializeChildrenFromFields(Arrays.asList(nullFieldType)); nullListVector.initializeChildrenFromFields(Arrays.asList(nonNullFieldType)); nullBothVector.initializeChildrenFromFields(Arrays.asList(nullFieldType)); - // Set up VSR List vectors = Arrays.asList(nullEntriesVector, nullListVector, nullBothVector); int rowCount = 4; @@ -2046,7 +2216,8 @@ public void testWriteNullableFixedLists() throws Exception { // Read and check values for (int row = 0; row < rowCount; row++) { record = datumReader.read(record, decoder); - for (String list : Arrays.asList("nullEntriesVector", "nullListVector", "nullBothVector")) { + for (String list : + Arrays.asList("nullEntriesVector", "nullListVector", "nullBothVector")) { FixedSizeListVector vector = (FixedSizeListVector) root.getVector(list); Object recordField = record.get(list); if (vector.isNull(row)) { @@ -2071,11 +2242,24 @@ public void testWriteMap() throws Exception { Field keyField = new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null); Field intField = new Field("value", FieldType.notNullable(new ArrowType.Int(32, true)), null); Field stringField = new Field("value", FieldType.notNullable(new ArrowType.Utf8()), null); - Field dateField = new Field("value", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); - - Field intEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, intField)); - Field stringEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, stringField)); - Field dateEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, dateField)); + Field dateField = + new Field("value", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); + + Field intEntryField = + new Field( + "entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList(keyField, intField)); + Field stringEntryField = + new Field( + 
"entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList(keyField, stringField)); + Field dateEntryField = + new Field( + "entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList(keyField, dateField)); // Create empty vectors BufferAllocator allocator = new RootAllocator(); @@ -2179,13 +2363,23 @@ public void testWriteNullableMap() throws Exception { Field keyField = new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null); Field nullFieldType = new Field("value", FieldType.nullable(new ArrowType.Int(32, true)), null); - Field nonNullFieldType = new Field("value", FieldType.notNullable(new ArrowType.Int(32, true)), null); - Field nullEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, nullFieldType)); - Field nonNullEntryField = new Field("entries", FieldType.notNullable(new ArrowType.Struct()), Arrays.asList(keyField, nonNullFieldType)); + Field nonNullFieldType = + new Field("value", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field nullEntryField = + new Field( + "entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList(keyField, nullFieldType)); + Field nonNullEntryField = + new Field( + "entries", + FieldType.notNullable(new ArrowType.Struct()), + Arrays.asList(keyField, nonNullFieldType)); // Create empty vectors BufferAllocator allocator = new RootAllocator(); - MapVector nullEntriesVector = new MapVector("nullEntriesVector", allocator, nonNullMapType, null); + MapVector nullEntriesVector = + new MapVector("nullEntriesVector", allocator, nonNullMapType, null); MapVector nullMapVector = new MapVector("nullMapVector", allocator, nullMapType, null); MapVector nullBothVector = new MapVector("nullBothVector", allocator, nullMapType, null); @@ -2310,8 +2504,7 @@ private Map convertMap(List entryList) { private void compareMaps(Map expected, Map actual) { if (expected == null) { assertNull(actual); - } - else { + } else { 
assertEquals(expected.size(), actual.size()); for (Object key : actual.keySet()) { assertTrue(expected.containsKey(key.toString())); From 44cec9c5f54dd4a109bd7ee3275851be52b26170 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 02:47:05 +0000 Subject: [PATCH 71/89] Add test for struct type --- .../adapter/avro/ArrowToAvroDataTest.java | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 9beed2969b..22c846b434 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -73,6 +73,7 @@ import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.writer.BaseWriter; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; @@ -2517,4 +2518,75 @@ private void compareMaps(Map expected, Map actual) { } } } + + @Test + public void testWriteStruct() throws Exception { + + // Field definitions + FieldType structFieldType = new FieldType(false, new ArrowType.Struct(), null); + Field intField = new Field("intField", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field stringField = new Field("stringField", FieldType.notNullable(new ArrowType.Utf8()), null); + Field dateField = new Field("dateField", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); + Field structField = new Field("struct", structFieldType, Arrays.asList(intField, stringField, dateField)); + + // Create empty vector + BufferAllocator allocator = new RootAllocator(); + StructVector structVector = new 
StructVector("struct", allocator, structFieldType, null); + structVector.initializeChildrenFromFields(Arrays.asList(intField, stringField, dateField)); + structVector.allocateNew(); + + // Set up VSR + List vectors = Arrays.asList(structVector); + int rowCount = 3; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data + IntVector intVector = (IntVector) structVector.getChild("intField"); + VarCharVector stringVector = (VarCharVector) structVector.getChild("stringField"); + DateDayVector dateVector = (DateDayVector) structVector.getChild("dateField"); + + for (int i = 0; i < rowCount; i++) { + structVector.setIndexDefined(i); + intVector.setSafe(i, i); + stringVector.setSafe(i, ("string" + i).getBytes()); + dateVector.setSafe(i, (int) LocalDate.now().toEpochDay() + i); + } + + File dataFile = new File(TMP, "testWriteStruct.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertNotNull(record.get("struct")); + GenericRecord structRecord = (GenericRecord) record.get("struct"); + assertEquals(row, structRecord.get("intField")); + 
assertEquals("string" + row, structRecord.get("stringField").toString()); + assertEquals((int) LocalDate.now().toEpochDay() + row, structRecord.get("dateField")); + } + } + } + } } From 1d07493d2c9d49c2b2ad1c1ac4edc30db0f6b3b9 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 02:56:45 +0000 Subject: [PATCH 72/89] Add test for nullable struct type --- .../adapter/avro/ArrowToAvroDataTest.java | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 22c846b434..aa7a1e9175 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -2589,4 +2589,116 @@ record = datumReader.read(record, decoder); } } } + + @Test + public void testWriteNullableStructs() throws Exception { + + // Field definitions + FieldType structFieldType = new FieldType(false, new ArrowType.Struct(), null); + FieldType nullableStructFieldType = new FieldType(true, new ArrowType.Struct(), null); + Field intField = new Field("intField", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field nullableIntField = new Field("nullableIntField", FieldType.nullable(new ArrowType.Int(32, true)), null); + Field structField = new Field("struct", structFieldType, Arrays.asList(intField, nullableIntField)); + Field nullableStructField = new Field("nullableStruct", nullableStructFieldType, Arrays.asList(intField, nullableIntField)); + + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + StructVector structVector = new StructVector("struct", allocator, structFieldType, null); + StructVector nullableStructVector = new StructVector("nullableStruct", allocator, nullableStructFieldType, null); + 
structVector.initializeChildrenFromFields(Arrays.asList(intField, nullableIntField)); + nullableStructVector.initializeChildrenFromFields(Arrays.asList(intField, nullableIntField)); + structVector.allocateNew(); + nullableStructVector.allocateNew(); + + // Set up VSR + List vectors = Arrays.asList(structVector, nullableStructVector); + int rowCount = 4; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); + + // Set test data for structVector + IntVector intVector = (IntVector) structVector.getChild("intField"); + IntVector nullableIntVector = (IntVector) structVector.getChild("nullableIntField"); + for (int i = 0; i < rowCount; i++) { + structVector.setIndexDefined(i); + intVector.setSafe(i, i); + if (i % 2 == 0) { + nullableIntVector.setSafe(i, i * 10); + } else { + nullableIntVector.setNull(i); + } + } + + // Set test data for nullableStructVector + IntVector nullableStructIntVector = (IntVector) nullableStructVector.getChild("intField"); + IntVector nullableStructNullableIntVector = (IntVector) nullableStructVector.getChild("nullableIntField"); + for (int i = 0; i < rowCount; i++) { + if (i >= 2) { + nullableStructVector.setIndexDefined(i); + nullableStructIntVector.setSafe(i, i); + if (i % 2 == 0) { + nullableStructNullableIntVector.setSafe(i, i * 10); + } else { + nullableStructNullableIntVector.setNull(i); + } + }else { + nullableStructVector.setNull(i); + } + } + + File dataFile = new File(TMP, "testWriteNullableStructs.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = 
ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + if (row % 2 == 0) { + assertNotNull(record.get("struct")); + GenericRecord structRecord = (GenericRecord) record.get("struct"); + assertEquals(row, structRecord.get("intField")); + assertEquals(row * 10, structRecord.get("nullableIntField")); + } else { + assertNotNull(record.get("struct")); + GenericRecord structRecord = (GenericRecord) record.get("struct"); + assertEquals(row, structRecord.get("intField")); + assertNull(structRecord.get("nullableIntField")); + + } + if (row >= 2) { + assertNotNull(record.get("nullableStruct")); + GenericRecord nullableStructRecord = (GenericRecord) record.get("nullableStruct"); + assertEquals(row, nullableStructRecord.get("intField")); + if (row % 2 == 0) { + assertEquals(row * 10, nullableStructRecord.get("nullableIntField")); + } else { + assertNull(nullableStructRecord.get("nullableIntField")); + } + } else { + assertNull(record.get("nullableStruct")); + } + } + } + } + } } From 593d16011d86d4caf88b5c0e152c915f7a4bc2fe Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 17:07:38 +0000 Subject: [PATCH 73/89] Update tests to use field writers (ensure correct vector layout) --- .../adapter/avro/ArrowToAvroDataTest.java | 520 +++++++++--------- 1 file changed, 267 insertions(+), 253 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index aa7a1e9175..497799bb87 100644 --- 
a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -75,6 +75,7 @@ import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.writer.BaseWriter; +import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.TimeUnit; @@ -1846,37 +1847,35 @@ public void testWriteLists() throws Exception { root.setRowCount(rowCount); root.allocateNew(); + FieldWriter intListWriter = intListVector.getWriter(); + FieldWriter stringListWriter = stringListVector.getWriter(); + FieldWriter dateListWriter = dateListVector.getWriter(); + // Set test data for intList - for (int i = 0, offset = 0; i < rowCount; i++) { - intListVector.startNewValue(i); - IntVector indDataVector = (IntVector) intListVector.getDataVector(); + for (int i = 0; i < rowCount; i++) { + intListWriter.startList(); for (int j = 0; j < 5 - i; j++) { - indDataVector.set(offset + j, j); + intListWriter.writeInt(j); } - intListVector.endValue(i, 5 - i); - offset += 5 - i; + intListWriter.endList(); } // Set test data for stringList - for (int i = 0, offset = 0; i < rowCount; i++) { - stringListVector.startNewValue(i); - VarCharVector varCharVector = (VarCharVector) stringListVector.getDataVector(); + for (int i = 0; i < rowCount; i++) { + stringListWriter.startList(); for (int j = 0; j < 5 - i; j++) { - varCharVector.setSafe(offset + j, ("string" + j).getBytes()); + stringListWriter.writeVarChar("string" + j); } - stringListVector.endValue(i, 5 - i); - offset += 5 - i; + stringListWriter.endList(); } // Set test data for dateList - for (int i = 0, offset = 0; i < rowCount; i++) { - dateListVector.startNewValue(i); - DateDayVector dateVector = (DateDayVector) 
dateListVector.getDataVector(); + for (int i = 0; i < rowCount; i++) { + dateListWriter.startList(); for (int j = 0; j < 5 - i; j++) { - dateVector.setSafe(offset + j, (int) LocalDate.now().plusDays(j).toEpochDay()); + dateListWriter.writeDateDay((int) LocalDate.now().plusDays(j).toEpochDay()); } - dateListVector.endValue(i, 5 - i); - offset += 5 - i; + dateListWriter.endList(); } File dataFile = new File(TMP, "testWriteLists.avro"); @@ -1949,45 +1948,47 @@ public void testWriteNullableLists() throws Exception { root.allocateNew(); // Set test data for nullEntriesVector - IntVector nullEntriesData = (IntVector) nullEntriesVector.getDataVector(); - nullEntriesVector.startNewValue(0); - nullEntriesData.setNull(0); - nullEntriesVector.endValue(0, 1); - nullEntriesVector.startNewValue(1); - nullEntriesData.set(1, 0); - nullEntriesVector.endValue(1, 1); - nullEntriesVector.startNewValue(2); - nullEntriesData.set(2, 1); - nullEntriesVector.endValue(2, 1); - nullEntriesVector.startNewValue(3); - nullEntriesData.set(3, 2); - nullEntriesVector.endValue(3, 1); + FieldWriter nullEntriesWriter = nullEntriesVector.getWriter(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeNull(); + nullEntriesWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(0); + nullEntriesWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(1); + nullEntriesWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(2); + nullEntriesWriter.endList(); // Set test data for nullListVector - IntVector nullListData = (IntVector) nullListVector.getDataVector(); - nullListVector.setNull(0); - nullListVector.startNewValue(1); - nullListData.set(0, 0); - nullListVector.endValue(1, 1); - nullListVector.startNewValue(2); - nullListData.set(1, 1); - nullListVector.endValue(2, 1); - nullListVector.startNewValue(3); - nullListData.set(2, 2); - nullListVector.endValue(3, 1); + FieldWriter 
nullListWriter = nullListVector.getWriter(); + nullListWriter.writeNull(); + nullListWriter.setPosition(1); // writeNull() does not inc. idx() on list vector + nullListWriter.startList(); + nullListWriter.integer().writeInt(0); + nullListWriter.endList(); + nullListWriter.startList(); + nullListWriter.integer().writeInt(1); + nullListWriter.endList(); + nullListWriter.startList(); + nullListWriter.integer().writeInt(2); + nullListWriter.endList(); // Set test data for nullBothVector - IntVector nullBothData = (IntVector) nullBothVector.getDataVector(); - nullBothVector.setNull(0); - nullBothVector.startNewValue(1); - nullBothData.setNull(0); - nullBothVector.endValue(1, 1); - nullBothVector.startNewValue(2); - nullBothData.set(1, 0); - nullBothVector.endValue(2, 1); - nullBothVector.startNewValue(3); - nullBothData.set(2, 1); - nullBothVector.endValue(3, 1); + FieldWriter nullBothWriter = nullBothVector.getWriter(); + nullBothWriter.writeNull(); + nullBothWriter.setPosition(1); + nullBothWriter.startList(); + nullBothWriter.integer().writeNull(); + nullBothWriter.endList(); + nullBothWriter.startList(); + nullBothWriter.integer().writeInt(0); + nullBothWriter.endList(); + nullBothWriter.startList(); + nullBothWriter.integer().writeInt(1); + nullBothWriter.endList(); File dataFile = new File(TMP, "testWriteNullableLists.avro"); @@ -2062,36 +2063,36 @@ public void testWriteFixedLists() throws Exception { root.setRowCount(rowCount); root.allocateNew(); + FieldWriter intListWriter = intListVector.getWriter(); + FieldWriter stringListWriter = stringListVector.getWriter(); + FieldWriter dateListWriter = dateListVector.getWriter(); + // Set test data for intList - for (int i = 0, offset = 0; i < rowCount; i++) { - intListVector.startNewValue(i); - IntVector indDataVector = (IntVector) intListVector.getDataVector(); + for (int i = 0; i < rowCount; i++) { + intListWriter.startList(); for (int j = 0; j < 5; j++) { - indDataVector.set(offset + j, j); + 
intListWriter.writeInt(j); } - offset += 5; + intListWriter.endList(); } // Set test data for stringList - for (int i = 0, offset = 0; i < rowCount; i++) { - stringListVector.startNewValue(i); - VarCharVector varCharVector = (VarCharVector) stringListVector.getDataVector(); + for (int i = 0; i < rowCount; i++) { + stringListWriter.startList(); for (int j = 0; j < 5; j++) { - varCharVector.setSafe(offset + j, ("string" + j).getBytes()); + stringListWriter.writeVarChar("string" + j); } - offset += 5; + stringListWriter.endList(); } // Set test data for dateList - for (int i = 0, offset = 0; i < rowCount; i++) { - dateListVector.startNewValue(i); - DateDayVector dateVector = (DateDayVector) dateListVector.getDataVector(); + for (int i = 0; i < rowCount; i++) { + dateListWriter.startList(); for (int j = 0; j < 5; j++) { - dateVector.setSafe(offset + j, (int) LocalDate.now().plusDays(j).toEpochDay()); + dateListWriter.writeDateDay((int) LocalDate.now().plusDays(j).toEpochDay()); } - offset += 5; + dateListWriter.endList(); } - File dataFile = new File(TMP, "testWriteFixedLists.avro"); // Write an AVRO block using the producer classes @@ -2164,35 +2165,47 @@ public void testWriteNullableFixedLists() throws Exception { root.allocateNew(); // Set test data for nullEntriesVector - IntVector nullEntriesData = (IntVector) nullEntriesVector.getDataVector(); - nullEntriesVector.startNewValue(0); - nullEntriesData.setNull(0); - nullEntriesVector.startNewValue(1); - nullEntriesData.set(1, 0); - nullEntriesVector.startNewValue(2); - nullEntriesData.set(2, 1); - nullEntriesVector.startNewValue(3); - nullEntriesData.set(3, 2); + FieldWriter nullEntriesWriter = nullEntriesVector.getWriter(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeNull(); + nullEntriesWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(0); + nullEntriesWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(1); + 
nullEntriesWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(2); + nullEntriesWriter.endList(); // Set test data for nullListVector - IntVector nullListData = (IntVector) nullListVector.getDataVector(); - nullListVector.setNull(0); - nullListVector.startNewValue(1); - nullListData.set(1, 0); - nullListVector.startNewValue(2); - nullListData.set(2, 1); - nullListVector.startNewValue(3); - nullListData.set(3, 2); + FieldWriter nullListWriter = nullListVector.getWriter(); + nullListWriter.writeNull(); + nullListWriter.setPosition(1); // writeNull() does not inc. idx() on list vector + nullListWriter.startList(); + nullListWriter.integer().writeInt(0); + nullListWriter.endList(); + nullListWriter.startList(); + nullListWriter.integer().writeInt(1); + nullListWriter.endList(); + nullListWriter.startList(); + nullListWriter.integer().writeInt(2); + nullListWriter.endList(); // Set test data for nullBothVector - IntVector nullBothData = (IntVector) nullBothVector.getDataVector(); - nullBothVector.setNull(0); - nullBothVector.startNewValue(1); - nullBothData.setNull(1); - nullBothVector.startNewValue(2); - nullBothData.set(2, 0); - nullBothVector.startNewValue(3); - nullBothData.set(3, 1); + FieldWriter nullBothWriter = nullBothVector.getWriter(); + nullBothWriter.writeNull(); + nullBothWriter.setPosition(1); + nullBothWriter.startList(); + nullBothWriter.integer().writeNull(); + nullBothWriter.endList(); + nullBothWriter.startList(); + nullBothWriter.integer().writeInt(0); + nullBothWriter.endList(); + nullBothWriter.startList(); + nullBothWriter.integer().writeInt(1); + nullBothWriter.endList(); File dataFile = new File(TMP, "testWriteNullableFixedLists.avro"); @@ -2421,6 +2434,7 @@ public void testWriteNullableMap() throws Exception { // Set test data for stringList BaseWriter.MapWriter nullMapWriter = nullMapVector.getWriter(); nullMapWriter.writeNull(); + nullMapWriter.setPosition(1); // writeNull() does not inc. 
idx() on map (list) vector nullMapWriter.startMap(); nullMapWriter.startEntry(); nullMapWriter.key().varChar().writeVarChar("key1"); @@ -2437,6 +2451,7 @@ public void testWriteNullableMap() throws Exception { // Set test data for dateList BaseWriter.MapWriter nullBothWriter = nullBothVector.getWriter(); nullBothWriter.writeNull(); + nullBothWriter.setPosition(1); nullBothWriter.startMap(); nullBothWriter.startEntry(); nullBothWriter.key().varChar().writeVarChar("key1"); @@ -2522,183 +2537,182 @@ private void compareMaps(Map expected, Map actual) { @Test public void testWriteStruct() throws Exception { - // Field definitions - FieldType structFieldType = new FieldType(false, new ArrowType.Struct(), null); - Field intField = new Field("intField", FieldType.notNullable(new ArrowType.Int(32, true)), null); - Field stringField = new Field("stringField", FieldType.notNullable(new ArrowType.Utf8()), null); - Field dateField = new Field("dateField", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); - Field structField = new Field("struct", structFieldType, Arrays.asList(intField, stringField, dateField)); - - // Create empty vector - BufferAllocator allocator = new RootAllocator(); - StructVector structVector = new StructVector("struct", allocator, structFieldType, null); - structVector.initializeChildrenFromFields(Arrays.asList(intField, stringField, dateField)); - structVector.allocateNew(); - - // Set up VSR - List vectors = Arrays.asList(structVector); - int rowCount = 3; - - try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { - - root.setRowCount(rowCount); - root.allocateNew(); - - // Set test data - IntVector intVector = (IntVector) structVector.getChild("intField"); - VarCharVector stringVector = (VarCharVector) structVector.getChild("stringField"); - DateDayVector dateVector = (DateDayVector) structVector.getChild("dateField"); - - for (int i = 0; i < rowCount; i++) { - structVector.setIndexDefined(i); - intVector.setSafe(i, i); - 
stringVector.setSafe(i, ("string" + i).getBytes()); - dateVector.setSafe(i, (int) LocalDate.now().toEpochDay() + i); - } + // Field definitions + FieldType structFieldType = new FieldType(false, new ArrowType.Struct(), null); + Field intField = + new Field("intField", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field stringField = new Field("stringField", FieldType.notNullable(new ArrowType.Utf8()), null); + Field dateField = + new Field("dateField", FieldType.notNullable(new ArrowType.Date(DateUnit.DAY)), null); - File dataFile = new File(TMP, "testWriteStruct.avro"); + // Create empty vector + BufferAllocator allocator = new RootAllocator(); + StructVector structVector = new StructVector("struct", allocator, structFieldType, null); + structVector.initializeChildrenFromFields(Arrays.asList(intField, stringField, dateField)); - // Write an AVRO block using the producer classes - try (FileOutputStream fos = new FileOutputStream(dataFile)) { - BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); - CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); - for (int row = 0; row < rowCount; row++) { - producer.produce(encoder); - } - encoder.flush(); - } + // Set up VSR + List vectors = Arrays.asList(structVector); + int rowCount = 3; - // Set up reading the AVRO block as a GenericRecord - Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); - GenericDatumReader datumReader = new GenericDatumReader<>(schema); + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { - try (InputStream inputStream = new FileInputStream(dataFile)) { + root.setRowCount(rowCount); + root.allocateNew(); - BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); - GenericRecord record = null; + // Set test data + BaseWriter.StructWriter structWriter = structVector.getWriter(); - // Read and check values - for (int row = 0; row < rowCount; row++) { - record = 
datumReader.read(record, decoder); - assertNotNull(record.get("struct")); - GenericRecord structRecord = (GenericRecord) record.get("struct"); - assertEquals(row, structRecord.get("intField")); - assertEquals("string" + row, structRecord.get("stringField").toString()); - assertEquals((int) LocalDate.now().toEpochDay() + row, structRecord.get("dateField")); - } - } + for (int i = 0; i < rowCount; i++) { + structWriter.start(); + structWriter.integer("intField").writeInt(i); + structWriter.varChar("stringField").writeVarChar("string" + i); + structWriter.dateDay("dateField").writeDateDay((int) LocalDate.now().toEpochDay() + i); + structWriter.end(); + } + + File dataFile = new File(TMP, "testWriteStruct.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + assertNotNull(record.get("struct")); + GenericRecord structRecord = (GenericRecord) record.get("struct"); + assertEquals(row, structRecord.get("intField")); + assertEquals("string" + row, structRecord.get("stringField").toString()); + assertEquals((int) LocalDate.now().toEpochDay() + row, structRecord.get("dateField")); + } } + } } @Test public void testWriteNullableStructs() 
throws Exception { - // Field definitions - FieldType structFieldType = new FieldType(false, new ArrowType.Struct(), null); - FieldType nullableStructFieldType = new FieldType(true, new ArrowType.Struct(), null); - Field intField = new Field("intField", FieldType.notNullable(new ArrowType.Int(32, true)), null); - Field nullableIntField = new Field("nullableIntField", FieldType.nullable(new ArrowType.Int(32, true)), null); - Field structField = new Field("struct", structFieldType, Arrays.asList(intField, nullableIntField)); - Field nullableStructField = new Field("nullableStruct", nullableStructFieldType, Arrays.asList(intField, nullableIntField)); - - // Create empty vectors - BufferAllocator allocator = new RootAllocator(); - StructVector structVector = new StructVector("struct", allocator, structFieldType, null); - StructVector nullableStructVector = new StructVector("nullableStruct", allocator, nullableStructFieldType, null); - structVector.initializeChildrenFromFields(Arrays.asList(intField, nullableIntField)); - nullableStructVector.initializeChildrenFromFields(Arrays.asList(intField, nullableIntField)); - structVector.allocateNew(); - nullableStructVector.allocateNew(); - - // Set up VSR - List vectors = Arrays.asList(structVector, nullableStructVector); - int rowCount = 4; - - try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { - - root.setRowCount(rowCount); - root.allocateNew(); - - // Set test data for structVector - IntVector intVector = (IntVector) structVector.getChild("intField"); - IntVector nullableIntVector = (IntVector) structVector.getChild("nullableIntField"); - for (int i = 0; i < rowCount; i++) { - structVector.setIndexDefined(i); - intVector.setSafe(i, i); - if (i % 2 == 0) { - nullableIntVector.setSafe(i, i * 10); - } else { - nullableIntVector.setNull(i); - } - } + // Field definitions + FieldType structFieldType = new FieldType(false, new ArrowType.Struct(), null); + FieldType nullableStructFieldType = new FieldType(true, new 
ArrowType.Struct(), null); + Field intField = + new Field("intField", FieldType.notNullable(new ArrowType.Int(32, true)), null); + Field nullableIntField = + new Field("nullableIntField", FieldType.nullable(new ArrowType.Int(32, true)), null); - // Set test data for nullableStructVector - IntVector nullableStructIntVector = (IntVector) nullableStructVector.getChild("intField"); - IntVector nullableStructNullableIntVector = (IntVector) nullableStructVector.getChild("nullableIntField"); - for (int i = 0; i < rowCount; i++) { - if (i >= 2) { - nullableStructVector.setIndexDefined(i); - nullableStructIntVector.setSafe(i, i); - if (i % 2 == 0) { - nullableStructNullableIntVector.setSafe(i, i * 10); - } else { - nullableStructNullableIntVector.setNull(i); - } - }else { - nullableStructVector.setNull(i); - } - } + // Create empty vectors + BufferAllocator allocator = new RootAllocator(); + StructVector structVector = new StructVector("struct", allocator, structFieldType, null); + StructVector nullableStructVector = + new StructVector("nullableStruct", allocator, nullableStructFieldType, null); + structVector.initializeChildrenFromFields(Arrays.asList(intField, nullableIntField)); + nullableStructVector.initializeChildrenFromFields(Arrays.asList(intField, nullableIntField)); + + // Set up VSR + List vectors = Arrays.asList(structVector, nullableStructVector); + int rowCount = 4; + + try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) { + + root.setRowCount(rowCount); + root.allocateNew(); - File dataFile = new File(TMP, "testWriteNullableStructs.avro"); + // Set test data for structVector + BaseWriter.StructWriter structWriter = structVector.getWriter(); + for (int i = 0; i < rowCount; i++) { + structWriter.setPosition(i); + structWriter.start(); + structWriter.integer("intField").writeInt(i); + if (i % 2 == 0) { + structWriter.integer("nullableIntField").writeInt(i * 10); + } else { + structWriter.integer("nullableIntField").writeNull(); + } + structWriter.end(); 
+ } - // Write an AVRO block using the producer classes - try (FileOutputStream fos = new FileOutputStream(dataFile)) { - BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); - CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); - for (int row = 0; row < rowCount; row++) { - producer.produce(encoder); - } - encoder.flush(); + // Set test data for nullableStructVector + BaseWriter.StructWriter nullableStructWriter = nullableStructVector.getWriter(); + for (int i = 0; i < rowCount; i++) { + nullableStructWriter.setPosition(i); + if (i >= 2) { + nullableStructWriter.start(); + nullableStructWriter.integer("intField").writeInt(i); + if (i % 2 == 0) { + nullableStructWriter.integer("nullableIntField").writeInt(i * 10); + } else { + nullableStructWriter.integer("nullableIntField").writeNull(); } + nullableStructWriter.end(); + } else { + nullableStructWriter.writeNull(); + } + } - // Set up reading the AVRO block as a GenericRecord - Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); - GenericDatumReader datumReader = new GenericDatumReader<>(schema); - - try (InputStream inputStream = new FileInputStream(dataFile)) { - - BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); - GenericRecord record = null; - - // Read and check values - for (int row = 0; row < rowCount; row++) { - record = datumReader.read(record, decoder); - if (row % 2 == 0) { - assertNotNull(record.get("struct")); - GenericRecord structRecord = (GenericRecord) record.get("struct"); - assertEquals(row, structRecord.get("intField")); - assertEquals(row * 10, structRecord.get("nullableIntField")); - } else { - assertNotNull(record.get("struct")); - GenericRecord structRecord = (GenericRecord) record.get("struct"); - assertEquals(row, structRecord.get("intField")); - assertNull(structRecord.get("nullableIntField")); - - } - if (row >= 2) { - assertNotNull(record.get("nullableStruct")); - 
GenericRecord nullableStructRecord = (GenericRecord) record.get("nullableStruct"); - assertEquals(row, nullableStructRecord.get("intField")); - if (row % 2 == 0) { - assertEquals(row * 10, nullableStructRecord.get("nullableIntField")); - } else { - assertNull(nullableStructRecord.get("nullableIntField")); - } - } else { - assertNull(record.get("nullableStruct")); - } - } + File dataFile = new File(TMP, "testWriteNullableStructs.avro"); + + // Write an AVRO block using the producer classes + try (FileOutputStream fos = new FileOutputStream(dataFile)) { + BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); + CompositeAvroProducer producer = ArrowToAvroUtils.createCompositeProducer(vectors); + for (int row = 0; row < rowCount; row++) { + producer.produce(encoder); + } + encoder.flush(); + } + + // Set up reading the AVRO block as a GenericRecord + Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); + GenericDatumReader datumReader = new GenericDatumReader<>(schema); + + try (InputStream inputStream = new FileInputStream(dataFile)) { + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null); + GenericRecord record = null; + + // Read and check values + for (int row = 0; row < rowCount; row++) { + record = datumReader.read(record, decoder); + if (row % 2 == 0) { + assertNotNull(record.get("struct")); + GenericRecord structRecord = (GenericRecord) record.get("struct"); + assertEquals(row, structRecord.get("intField")); + assertEquals(row * 10, structRecord.get("nullableIntField")); + } else { + assertNotNull(record.get("struct")); + GenericRecord structRecord = (GenericRecord) record.get("struct"); + assertEquals(row, structRecord.get("intField")); + assertNull(structRecord.get("nullableIntField")); + } + if (row >= 2) { + assertNotNull(record.get("nullableStruct")); + GenericRecord nullableStructRecord = (GenericRecord) record.get("nullableStruct"); + assertEquals(row, 
nullableStructRecord.get("intField")); + if (row % 2 == 0) { + assertEquals(row * 10, nullableStructRecord.get("nullableIntField")); + } else { + assertNull(nullableStructRecord.get("nullableIntField")); + } + } else { + assertNull(record.get("nullableStruct")); } + } } + } } } From cf559cb72afd40e9a514330b0d35d692f89437ab Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 17:08:01 +0000 Subject: [PATCH 74/89] Fix delegate offset for map producer --- .../arrow/adapter/avro/producers/AvroMapProducer.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java index d95bbe99c9..8ba2c1808d 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java @@ -51,15 +51,12 @@ public void produce(Encoder encoder) throws IOException { currentIndex++; } - @Override - public void skipNull() { - delegate.skipNull(); - super.skipNull(); - } + // Do not override skipNull(), delegate will not have an entry if the map is null @Override public void setPosition(int index) { - delegate.setPosition(index); + int delegateOffset = vector.getOffsetBuffer().getInt(index * (long) Integer.BYTES); + delegate.setPosition(delegateOffset); super.setPosition(index); } From 361d28c667cc22e6b0b9e3100db68bb2ff24cc6c Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 17:16:35 +0000 Subject: [PATCH 75/89] Require key type = VARCHAR to encode Avro maps --- .../java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 
b36fd32f99..d80129aa35 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -691,6 +691,10 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu case MAP: MapVector mapVector = (MapVector) vector; StructVector entryVector = (StructVector) mapVector.getDataVector(); + Types.MinorType keyType = entryVector.getChildrenFromFields().get(0).getMinorType(); + if (keyType != Types.MinorType.VARCHAR) { + throw new IllegalArgumentException("MAP key type must be VARCHAR for Avro encoding"); + } VarCharVector keyVector = (VarCharVector) entryVector.getChildrenFromFields().get(0); FieldVector valueVector = entryVector.getChildrenFromFields().get(1); Producer keyProducer = new AvroStringProducer(keyVector); From 81a3ce17902fa863a6b3be36964da27d9e0a2f6b Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 17:23:34 +0000 Subject: [PATCH 76/89] Use HTML table to type mapping comment --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 65 ++++++++++--------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index d80129aa35..a5bef1c981 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -118,37 +118,40 @@ public class ArrowToAvroUtils { *

    This method currently performs following type mapping for Avro data types to corresponding * Arrow data types. * - *

      - *
    • ArrowType.Null --> NULL - *
    • ArrowType.Bool --> BOOLEAN - *
    • ArrowType.Int(64 bit, unsigned 32 bit) --> LONG - *
    • ArrowType.Int(signed 32 bit, < 32 bit) --> INT - *
    • ArrowType.FloatingPoint(double) --> DOUBLE - *
    • ArrowType.FloatingPoint(single, half) --> FLOAT - *
    • ArrowType.Utf8 --> STRING - *
    • ArrowType.LargeUtf8 --> STRING - *
    • ArrowType.Binary --> BYTES - *
    • ArrowType.LargeBinary --> BYTES - *
    • ArrowType.FixedSizeBinary --> FIXED - *
    • ArrowType.Decimal --> decimal (FIXED) - *
    • ArrowType.Date --> date (INT) - *
    • ArrowType.Time (SEC | MILLI) --> time-millis (INT) - *
    • ArrowType.Time (MICRO | NANO) --> time-micros (LONG) - *
    • ArrowType.Timestamp (NANOSECONDS, TZ != NULL) --> time-nanos (LONG) - *
    • ArrowType.Timestamp (MICROSECONDS, TZ != NULL) --> time-micros (LONG) - *
    • ArrowType.Timestamp (MILLISECONDS | SECONDS, TZ != NULL) --> time-millis (LONG) - *
    • ArrowType.Timestamp (NANOSECONDS, TZ == NULL) --> local-time-nanos (LONG) - *
    • ArrowType.Timestamp (MICROSECONDS, TZ == NULL) --> local-time-micros (LONG) - *
    • ArrowType.Timestamp (MILLISECONDS | SECONDS, TZ == NULL) --> local-time-millis (LONG) - *
    • ArrowType.Duration --> duration (FIXED) - *
    • ArrowType.Interval --> duration (FIXED) - *
    • ArrowType.Struct --> record - *
    • ArrowType.List --> array - *
    • ArrowType.LargeList --> array - *
    • ArrowType.FixedSizeList --> array - *
    • ArrowType.Map --> map - *
    • ArrowType.Union --> union - *
    Arrow type                                                --> Avro encoding
    --------------------------------------------------------------------------------
    ArrowType.Null                                            --> NULL
    ArrowType.Bool                                            --> BOOLEAN
    ArrowType.Int (64 bit, unsigned 32 bit)                   --> LONG
    ArrowType.Int (signed 32 bit, < 32 bit)                   --> INT
    ArrowType.FloatingPoint (double)                          --> DOUBLE
    ArrowType.FloatingPoint (single, half)                    --> FLOAT
    ArrowType.Utf8                                            --> STRING
    ArrowType.LargeUtf8                                       --> STRING
    ArrowType.Binary                                          --> BYTES
    ArrowType.LargeBinary                                     --> BYTES
    ArrowType.FixedSizeBinary                                 --> FIXED
    ArrowType.Decimal                                         --> decimal (FIXED)
    ArrowType.Date                                            --> date (INT)
    ArrowType.Time (SEC | MILLI)                              --> time-millis (INT)
    ArrowType.Time (MICRO | NANO)                             --> time-micros (LONG)
    ArrowType.Timestamp (NANOSECONDS, TZ != NULL)             --> time-nanos (LONG)
    ArrowType.Timestamp (MICROSECONDS, TZ != NULL)            --> time-micros (LONG)
    ArrowType.Timestamp (MILLISECONDS | SECONDS, TZ != NULL)  --> time-millis (LONG)
    ArrowType.Timestamp (NANOSECONDS, TZ == NULL)             --> local-time-nanos (LONG)
    ArrowType.Timestamp (MICROSECONDS, TZ == NULL)            --> local-time-micros (LONG)
    ArrowType.Timestamp (MILLISECONDS | SECONDS, TZ == NULL)  --> local-time-millis (LONG)
    ArrowType.Duration                                        --> duration (FIXED)
    ArrowType.Interval                                        --> duration (FIXED)
    ArrowType.Struct                                          --> record
    ArrowType.List                                            --> array
    ArrowType.LargeList                                       --> array
    ArrowType.FixedSizeList                                   --> array
    ArrowType.Map                                             --> map
    ArrowType.Union                                           --> union

    Nullable fields are represented as a union of [null | base-type]. Special treatment is given * to nullability of unions - a union is considered nullable if the union field is nullable or any From 5470a78a115c1688557ff0aedde82c1dca4052c2 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 18:06:38 +0000 Subject: [PATCH 77/89] Check bounds on index in setPosition() --- .../producers/AvroFixedSizeListProducer.java | 3 +++ .../avro/producers/AvroListProducer.java | 3 +++ .../avro/producers/AvroMapProducer.java | 3 +++ .../avro/producers/AvroNullableProducer.java | 3 +++ .../avro/producers/AvroStructProducer.java | 3 +++ .../avro/producers/BaseAvroProducer.java | 4 ++++ .../adapter/avro/ArrowToAvroDataTest.java | 21 +++++++++++++++++++ 7 files changed, 40 insertions(+) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java index fba7f441fc..2dbd60541f 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java @@ -60,6 +60,9 @@ public void skipNull() { @Override public void setPosition(int index) { + if (index < 0 || index > vector.getValueCount()) { + throw new IllegalArgumentException("Index out of bounds"); + } int delegateOffset = vector.getOffsetBuffer().getInt(index * (long) Integer.BYTES); delegate.setPosition(delegateOffset); super.setPosition(index); diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java index 38745be6ab..10cfe9549a 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java +++ 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroListProducer.java @@ -57,6 +57,9 @@ public void produce(Encoder encoder) throws IOException { @Override public void setPosition(int index) { + if (index < 0 || index > vector.getValueCount()) { + throw new IllegalArgumentException("Index out of bounds"); + } int delegateOffset = vector.getOffsetBuffer().getInt(index * (long) Integer.BYTES); delegate.setPosition(delegateOffset); super.setPosition(index); diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java index 8ba2c1808d..568d5b62e4 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroMapProducer.java @@ -55,6 +55,9 @@ public void produce(Encoder encoder) throws IOException { @Override public void setPosition(int index) { + if (index < 0 || index > vector.getValueCount()) { + throw new IllegalArgumentException("Index out of bounds"); + } int delegateOffset = vector.getOffsetBuffer().getInt(index * (long) Integer.BYTES); delegate.setPosition(delegateOffset); super.setPosition(index); diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java index 9d956772c2..f4215dbf84 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroNullableProducer.java @@ -58,6 +58,9 @@ public void skipNull() { @Override public void setPosition(int index) { + if (index < 0 || index > vector.getValueCount()) { + throw new IllegalArgumentException("Index out of bounds"); + } delegate.setPosition(index); super.setPosition(index); } diff --git 
a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java index de34284b7e..86c1949bf6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroStructProducer.java @@ -55,6 +55,9 @@ public void skipNull() { @Override public void setPosition(int index) { + if (index < 0 || index > vector.getValueCount()) { + throw new IllegalArgumentException("Index out of bounds: " + index); + } for (Producer delegate : delegates) { delegate.setPosition(index); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java index 892a02f77a..7e700a3a13 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java @@ -44,6 +44,10 @@ public void skipNull() { @Override public void setPosition(int index) { + // currentIndex == value is a valid state, no more values will be produced + if (index < 0 || index > vector.getValueCount()) { + throw new IllegalArgumentException("Index out of bounds"); + } currentIndex = index; } diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 497799bb87..675ca914a1 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -2294,6 +2294,9 @@ public void testWriteMap() throws Exception { root.setRowCount(rowCount); root.allocateNew(); + // Total number of entries that will be writen to each 
vector + int entryCount = 5 * 4 * 3; + // Set test data for intList BaseWriter.MapWriter writer = intMapVector.getWriter(); for (int i = 0; i < rowCount; i++) { @@ -2307,6 +2310,9 @@ public void testWriteMap() throws Exception { writer.endMap(); } + // Update count for data vector (map writer does not do this) + intMapVector.getDataVector().setValueCount(entryCount); + // Set test data for stringList BaseWriter.MapWriter stringWriter = stringMapVector.getWriter(); for (int i = 0; i < rowCount; i++) { @@ -2320,6 +2326,9 @@ public void testWriteMap() throws Exception { stringWriter.endMap(); } + // Update count for data vector (map writer does not do this) + stringMapVector.getDataVector().setValueCount(entryCount); + // Set test data for dateList BaseWriter.MapWriter dateWriter = dateMapVector.getWriter(); for (int i = 0; i < rowCount; i++) { @@ -2333,6 +2342,9 @@ public void testWriteMap() throws Exception { dateWriter.endMap(); } + // Update count for data vector (map writer does not do this) + dateMapVector.getDataVector().setValueCount(entryCount); + File dataFile = new File(TMP, "testWriteMap.avro"); // Write an AVRO block using the producer classes @@ -2431,6 +2443,9 @@ public void testWriteNullableMap() throws Exception { writer.endEntry(); writer.endMap(); + // Update count for data vector (map writer does not do this) + nullEntriesVector.getDataVector().setValueCount(3); + // Set test data for stringList BaseWriter.MapWriter nullMapWriter = nullMapVector.getWriter(); nullMapWriter.writeNull(); @@ -2448,6 +2463,9 @@ public void testWriteNullableMap() throws Exception { nullMapWriter.endEntry(); nullMapWriter.endMap(); + // Update count for data vector (map writer does not do this) + nullMapVector.getDataVector().setValueCount(2); + // Set test data for dateList BaseWriter.MapWriter nullBothWriter = nullBothVector.getWriter(); nullBothWriter.writeNull(); @@ -2465,6 +2483,9 @@ public void testWriteNullableMap() throws Exception { nullBothWriter.endEntry(); 
nullBothWriter.endMap(); + // Update count for data vector (map writer does not do this) + nullBothVector.getDataVector().setValueCount(2); + File dataFile = new File(TMP, "testWriteNullableMap.avro"); // Write an AVRO block using the producer classes From fa7d470d04620d752142a7f5586a7a1717661e21 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Mon, 24 Mar 2025 18:53:58 +0000 Subject: [PATCH 78/89] Fix entry count in write map test --- .../java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 675ca914a1..170218a7b8 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -2295,7 +2295,7 @@ public void testWriteMap() throws Exception { root.allocateNew(); // Total number of entries that will be writen to each vector - int entryCount = 5 * 4 * 3; + int entryCount = 5 + 4 + 3; // Set test data for intList BaseWriter.MapWriter writer = intMapVector.getWriter(); From 81065bf40140b69ce545cf322db2a89b7b0b3c7c Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 25 Mar 2025 08:57:12 +0000 Subject: [PATCH 79/89] Remove support for production of union data (pending fixes in the vector classes) --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 38 ++----- .../producers/AvroDenseUnionProducer.java | 36 ------- .../avro/producers/AvroUnionProducer.java | 35 ------- .../avro/producers/BaseUnionProducer.java | 99 ------------------- 4 files changed, 8 insertions(+), 200 deletions(-) delete mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDenseUnionProducer.java delete mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionProducer.java 
delete mode 100644 adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index a5bef1c981..3a7660c7ea 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -21,7 +21,6 @@ import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer; import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer; import org.apache.arrow.adapter.avro.producers.AvroBytesProducer; -import org.apache.arrow.adapter.avro.producers.AvroDenseUnionProducer; import org.apache.arrow.adapter.avro.producers.AvroFixedSizeBinaryProducer; import org.apache.arrow.adapter.avro.producers.AvroFixedSizeListProducer; import org.apache.arrow.adapter.avro.producers.AvroFloat2Producer; @@ -40,7 +39,6 @@ import org.apache.arrow.adapter.avro.producers.AvroUint2Producer; import org.apache.arrow.adapter.avro.producers.AvroUint4Producer; import org.apache.arrow.adapter.avro.producers.AvroUint8Producer; -import org.apache.arrow.adapter.avro.producers.AvroUnionProducer; import org.apache.arrow.adapter.avro.producers.BaseAvroProducer; import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer; import org.apache.arrow.adapter.avro.producers.Producer; @@ -94,12 +92,10 @@ import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; -import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; -import org.apache.arrow.vector.complex.UnionVector; import 
org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.Types; @@ -158,11 +154,11 @@ public class ArrowToAvroUtils { * of its child fields are nullable. The schema for a nullable union will always contain a null * type,none of the direct child types will be nullable. * - *

    List fields must contain precisely one child field, which may be nullable. Map fields must - * contain precisely two child fields, the key field and the value field. The key field must - * always be of type STRING (Utf8) and cannot be nullable. The value can be of any type and may be - * nullable. Record types must contain at least one child field and cannot contain multiple fields - * with the same name + *

    List fields must contain precisely one child field, which may be nullable. Map fields are + * represented as a list of structs, where the struct fields are "key" and "value". The key field + * must always be of type STRING (Utf8) and cannot be nullable. The value can be of any type and + * may be nullable. Record types must contain at least one child field and cannot contain multiple + * fields with the same name * * @param arrowFields The arrow fields used to generate the Avro schema * @param typeName Name of the top level Avro record type @@ -707,27 +703,9 @@ private static BaseAvroProducer createProducer(FieldVector vector, boolean nu new AvroStructProducer(entryVector, new Producer[] {keyProducer, valueProducer}); return new AvroMapProducer(mapVector, entryProducer); - case UNION: - UnionVector unionVector = (UnionVector) vector; - List unionChildVectors = unionVector.getChildrenFromFields(); - Producer[] unionChildProducers = new Producer[unionChildVectors.size()]; - for (int i = 0; i < unionChildVectors.size(); i++) { - FieldVector unionChildVector = unionChildVectors.get(i); - unionChildProducers[i] = - createProducer(unionChildVector, /* nullable = */ false); // Do not nest union types - } - return new AvroUnionProducer(unionVector, unionChildProducers); - - case DENSEUNION: - DenseUnionVector denseUnionVector = (DenseUnionVector) vector; - List denseChildVectors = denseUnionVector.getChildrenFromFields(); - Producer[] denseChildProducers = new Producer[denseChildVectors.size()]; - for (int i = 0; i < denseChildVectors.size(); i++) { - FieldVector denseChildVector = denseChildVectors.get(i); - denseChildProducers[i] = - createProducer(denseChildVector, /* nullable = */ false); // Do not nest union types - } - return new AvroDenseUnionProducer(denseUnionVector, denseChildProducers); + // Support for UNION and DENSEUNION is not currently available + // This is pending fixes in the implementation of the union vectors themselves + // 
https://github.com/apache/arrow-java/issues/108 default: // Not all Arrow types are supported for encoding (yet)! diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDenseUnionProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDenseUnionProducer.java deleted file mode 100644 index 1735c72e4e..0000000000 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroDenseUnionProducer.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.arrow.adapter.avro.producers; - -import org.apache.arrow.vector.complex.DenseUnionVector; - -/** - * Producer which produces union values from a {@link DenseUnionVector}, writes data to an avro - * encoder. - */ -public class AvroDenseUnionProducer extends BaseUnionProducer { - - /** Instantiate an AvroUnionProducer. 
*/ - public AvroDenseUnionProducer(DenseUnionVector vector, Producer[] delegates) { - super(vector, delegates); - } - - @Override - protected int getCurrentTypeIndex() { - return vector.getTypeId(currentIndex); - } -} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionProducer.java deleted file mode 100644 index dfe82d821e..0000000000 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroUnionProducer.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.arrow.adapter.avro.producers; - -import org.apache.arrow.vector.complex.UnionVector; - -/** - * Producer which produces union values from a {@link UnionVector}, writes data to an avro encoder. - */ -public class AvroUnionProducer extends BaseUnionProducer { - - /** Instantiate an AvroUnionProducer. 
*/ - public AvroUnionProducer(UnionVector vector, Producer[] delegates) { - super(vector, delegates); - } - - @Override - protected int getCurrentTypeIndex() { - return vector.getTypeValue(currentIndex); - } -} diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java deleted file mode 100644 index 5370cca052..0000000000 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseUnionProducer.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.arrow.adapter.avro.producers; - -import java.io.IOException; -import java.util.List; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.types.Types; -import org.apache.arrow.vector.types.UnionMode; -import org.apache.avro.io.Encoder; - -abstract class BaseUnionProducer extends BaseAvroProducer { - - // Logic is substantially the same for union and dense union, just dense union resolves offsets - // For methods not available on FieldVector some calls are delegate to the child class - - private final Producer[] delegates; - private final UnionMode unionMode; - private final int nullTypeIndex; - - protected abstract int getCurrentTypeIndex(); - - public BaseUnionProducer(T vector, Producer[] delegates) { - super(vector); - this.delegates = delegates; - if (vector.getMinorType() == Types.MinorType.DENSEUNION) { - this.unionMode = UnionMode.Dense; - } else { - this.unionMode = UnionMode.Sparse; - } - this.nullTypeIndex = findNullTypeIndex(vector.getChildrenFromFields()); - } - - protected int findNullTypeIndex(List childVectors) { - for (int i = 0; i < childVectors.size(); i++) { - if (childVectors.get(i).getMinorType() == Types.MinorType.NULL) { - return i; - } - } - // For nullable unions with no explicit null type, a null type is appended to the schema - return childVectors.size(); - } - - @Override - public void produce(Encoder encoder) throws IOException { - - if (vector.isNull(currentIndex)) { - encoder.writeInt(nullTypeIndex); - encoder.writeNull(); - } else { - - int typeIndex = getCurrentTypeIndex(); - int typeVectorIndex; - - if (unionMode == UnionMode.Dense) { - typeVectorIndex = vector.getOffsetBuffer().getInt(currentIndex * (long) Integer.BYTES); - } else { - typeVectorIndex = currentIndex; - } - - FieldVector typeVector = vector.getChildrenFromFields().get(typeIndex); - - if (typeVector.isNull(typeVectorIndex)) { - encoder.writeInt(nullTypeIndex); - encoder.writeNull(); - } else { - Producer delegate = 
delegates[typeIndex]; - encoder.writeInt(typeIndex); - delegate.setPosition(typeVectorIndex); - delegate.produce(encoder); - } - } - - currentIndex++; - } - - @Override - @SuppressWarnings("unchecked") - public void resetValueVector(T vector) { - for (int i = 0; i < delegates.length; i++) { - Producer delegate = (Producer) delegates[i]; - delegate.resetValueVector(vector.getChildrenFromFields().get(i)); - } - } -} From e36682ab39d7dc8ef88e9c6d838b73a026d3b743 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 25 Mar 2025 11:40:26 +0000 Subject: [PATCH 80/89] Fix nano precision test in CI for time types --- .../apache/arrow/adapter/avro/ArrowToAvroDataTest.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 170218a7b8..df9fa96764 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -1351,7 +1351,9 @@ record = datumReader.read(record, decoder); assertEquals(timeSecVector.get(row), (int) (record.get("timeSec")) / 1000); assertEquals(timeMillisVector.get(row), record.get("timeMillis")); assertEquals(timeMicrosVector.get(row), record.get("timeMicros")); - assertEquals(timeNanosVector.get(row), (long) record.get("timeNanos") * 1000); + // Avro doesn't have time-nanos (mar 2025), so expect column to be saved as micros + long nanosAsMicros = (timeNanosVector.get(row) / 1000); + assertEquals(nanosAsMicros, (long) record.get("timeNanos")); } } } @@ -1403,7 +1405,7 @@ public void testWriteNullableTimes() throws Exception { timeMicrosVector.setNull(0); timeMicrosVector.setSafe(1, 0); - timeMicrosVector.setSafe(2, ZonedDateTime.now().toLocalTime().toSecondOfDay() / 1000); + timeMicrosVector.setSafe(2, 
ZonedDateTime.now().toLocalTime().toNanoOfDay() / 1000); timeNanosVector.setNull(0); timeNanosVector.setSafe(1, 0); @@ -1441,7 +1443,9 @@ record = datumReader.read(record, decoder); assertEquals(timeSecVector.get(row), ((int) record.get("timeSec") / 1000)); assertEquals(timeMillisVector.get(row), record.get("timeMillis")); assertEquals(timeMicrosVector.get(row), record.get("timeMicros")); - assertEquals(timeNanosVector.get(row), (long) record.get("timeNanos") * 1000); + // Avro doesn't have time-nanos (mar 2025), so expect column to be saved as micros + long nanosAsMicros = (timeNanosVector.get(row) / 1000); + assertEquals(nanosAsMicros, (long) record.get("timeNanos")); } } } From 0b59da84fbb770a12d39518b2f5706c7b525cedd Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Thu, 27 Mar 2025 18:37:52 +0000 Subject: [PATCH 81/89] Improve tests for nullable lists and maps --- .../adapter/avro/ArrowToAvroDataTest.java | 139 ++++++++++++------ 1 file changed, 94 insertions(+), 45 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index df9fa96764..61ec21ea70 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -1955,15 +1955,19 @@ public void testWriteNullableLists() throws Exception { FieldWriter nullEntriesWriter = nullEntriesVector.getWriter(); nullEntriesWriter.startList(); nullEntriesWriter.integer().writeNull(); + nullEntriesWriter.integer().writeNull(); nullEntriesWriter.endList(); nullEntriesWriter.startList(); nullEntriesWriter.integer().writeInt(0); + nullEntriesWriter.integer().writeInt(0); nullEntriesWriter.endList(); nullEntriesWriter.startList(); - nullEntriesWriter.integer().writeInt(1); + nullEntriesWriter.integer().writeInt(123); + nullEntriesWriter.integer().writeInt(456); 
nullEntriesWriter.endList(); nullEntriesWriter.startList(); - nullEntriesWriter.integer().writeInt(2); + nullEntriesWriter.integer().writeInt(789); + nullEntriesWriter.integer().writeInt(789); nullEntriesWriter.endList(); // Set test data for nullListVector @@ -1972,13 +1976,16 @@ public void testWriteNullableLists() throws Exception { nullListWriter.setPosition(1); // writeNull() does not inc. idx() on list vector nullListWriter.startList(); nullListWriter.integer().writeInt(0); + nullListWriter.integer().writeInt(0); nullListWriter.endList(); - nullListWriter.startList(); - nullListWriter.integer().writeInt(1); - nullListWriter.endList(); - nullListWriter.startList(); - nullListWriter.integer().writeInt(2); - nullListWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(123); + nullEntriesWriter.integer().writeInt(456); + nullEntriesWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(789); + nullEntriesWriter.integer().writeInt(789); + nullEntriesWriter.endList(); // Set test data for nullBothVector FieldWriter nullBothWriter = nullBothVector.getWriter(); @@ -1986,13 +1993,16 @@ public void testWriteNullableLists() throws Exception { nullBothWriter.setPosition(1); nullBothWriter.startList(); nullBothWriter.integer().writeNull(); + nullBothWriter.integer().writeNull(); nullBothWriter.endList(); - nullBothWriter.startList(); - nullBothWriter.integer().writeInt(0); - nullBothWriter.endList(); - nullBothWriter.startList(); - nullBothWriter.integer().writeInt(1); - nullBothWriter.endList(); + nullListWriter.startList(); + nullListWriter.integer().writeInt(0); + nullListWriter.integer().writeInt(0); + nullListWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(123); + nullEntriesWriter.integer().writeInt(456); + nullEntriesWriter.endList(); File dataFile = new File(TMP, "testWriteNullableLists.avro"); @@ -2139,8 +2149,8 @@ record = datumReader.read(record, 
decoder); public void testWriteNullableFixedLists() throws Exception { // Field definitions - FieldType nullListType = new FieldType(true, new ArrowType.FixedSizeList(1), null); - FieldType nonNullListType = new FieldType(false, new ArrowType.FixedSizeList(1), null); + FieldType nullListType = new FieldType(true, new ArrowType.FixedSizeList(2), null); + FieldType nonNullListType = new FieldType(false, new ArrowType.FixedSizeList(2), null); Field nullFieldType = new Field("item", FieldType.nullable(new ArrowType.Int(32, true)), null); Field nonNullFieldType = @@ -2172,15 +2182,19 @@ public void testWriteNullableFixedLists() throws Exception { FieldWriter nullEntriesWriter = nullEntriesVector.getWriter(); nullEntriesWriter.startList(); nullEntriesWriter.integer().writeNull(); + nullEntriesWriter.integer().writeNull(); nullEntriesWriter.endList(); nullEntriesWriter.startList(); nullEntriesWriter.integer().writeInt(0); + nullEntriesWriter.integer().writeInt(0); nullEntriesWriter.endList(); nullEntriesWriter.startList(); - nullEntriesWriter.integer().writeInt(1); + nullEntriesWriter.integer().writeInt(123); + nullEntriesWriter.integer().writeInt(456); nullEntriesWriter.endList(); nullEntriesWriter.startList(); - nullEntriesWriter.integer().writeInt(2); + nullEntriesWriter.integer().writeInt(789); + nullEntriesWriter.integer().writeInt(789); nullEntriesWriter.endList(); // Set test data for nullListVector @@ -2188,28 +2202,34 @@ public void testWriteNullableFixedLists() throws Exception { nullListWriter.writeNull(); nullListWriter.setPosition(1); // writeNull() does not inc. 
idx() on list vector nullListWriter.startList(); - nullListWriter.integer().writeInt(0); - nullListWriter.endList(); - nullListWriter.startList(); - nullListWriter.integer().writeInt(1); - nullListWriter.endList(); - nullListWriter.startList(); - nullListWriter.integer().writeInt(2); + nullListWriter.integer().writeInt(123); + nullListWriter.integer().writeInt(456); nullListWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(789); + nullEntriesWriter.integer().writeInt(456); + nullEntriesWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(12345); + nullEntriesWriter.integer().writeInt(67891); + nullEntriesWriter.endList(); // Set test data for nullBothVector FieldWriter nullBothWriter = nullBothVector.getWriter(); nullBothWriter.writeNull(); nullBothWriter.setPosition(1); nullBothWriter.startList(); - nullBothWriter.integer().writeNull(); - nullBothWriter.endList(); - nullBothWriter.startList(); - nullBothWriter.integer().writeInt(0); - nullBothWriter.endList(); - nullBothWriter.startList(); - nullBothWriter.integer().writeInt(1); + nullListWriter.integer().writeNull(); + nullListWriter.integer().writeNull(); nullBothWriter.endList(); + nullListWriter.startList(); + nullListWriter.integer().writeInt(123); + nullListWriter.integer().writeInt(456); + nullListWriter.endList(); + nullEntriesWriter.startList(); + nullEntriesWriter.integer().writeInt(789); + nullEntriesWriter.integer().writeInt(456); + nullEntriesWriter.endList(); File dataFile = new File(TMP, "testWriteNullableFixedLists.avro"); @@ -2433,17 +2453,29 @@ public void testWriteNullableMap() throws Exception { writer.key().varChar().writeVarChar("key0"); writer.value().integer().writeNull(); writer.endEntry(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key1"); + writer.value().integer().writeNull(); + writer.endEntry(); writer.endMap(); writer.startMap(); writer.startEntry(); - 
writer.key().varChar().writeVarChar("key1"); + writer.key().varChar().writeVarChar("key2"); + writer.value().integer().writeInt(0); + writer.endEntry(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key3"); writer.value().integer().writeInt(0); writer.endEntry(); writer.endMap(); writer.startMap(); writer.startEntry(); - writer.key().varChar().writeVarChar("key2"); - writer.value().integer().writeInt(1); + writer.key().varChar().writeVarChar("key4"); + writer.value().integer().writeInt(123); + writer.endEntry(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key5"); + writer.value().integer().writeInt(456); writer.endEntry(); writer.endMap(); @@ -2456,15 +2488,24 @@ public void testWriteNullableMap() throws Exception { nullMapWriter.setPosition(1); // writeNull() does not inc. idx() on map (list) vector nullMapWriter.startMap(); nullMapWriter.startEntry(); - nullMapWriter.key().varChar().writeVarChar("key1"); + nullMapWriter.key().varChar().writeVarChar("key2"); nullMapWriter.value().integer().writeInt(0); nullMapWriter.endEntry(); + writer.startMap(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key3"); + writer.value().integer().writeInt(0); + writer.endEntry(); nullMapWriter.endMap(); nullMapWriter.startMap(); - nullMapWriter.startEntry(); - nullMapWriter.key().varChar().writeVarChar("key2"); - nullMapWriter.value().integer().writeInt(1); - nullMapWriter.endEntry(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key4"); + writer.value().integer().writeInt(123); + writer.endEntry(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key5"); + writer.value().integer().writeInt(456); + writer.endEntry(); nullMapWriter.endMap(); // Update count for data vector (map writer does not do this) @@ -2476,16 +2517,24 @@ public void testWriteNullableMap() throws Exception { nullBothWriter.setPosition(1); nullBothWriter.startMap(); nullBothWriter.startEntry(); - 
nullBothWriter.key().varChar().writeVarChar("key1"); + nullBothWriter.key().varChar().writeVarChar("key2"); nullBothWriter.value().integer().writeNull(); nullBothWriter.endEntry(); - nullBothWriter.endMap(); - nullBothWriter.startMap(); nullBothWriter.startEntry(); - nullBothWriter.key().varChar().writeVarChar("key2"); - nullBothWriter.value().integer().writeInt(0); + nullBothWriter.key().varChar().writeVarChar("key3"); + nullBothWriter.value().integer().writeNull(); nullBothWriter.endEntry(); nullBothWriter.endMap(); + nullBothWriter.startMap(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key4"); + writer.value().integer().writeInt(123); + writer.endEntry(); + writer.startEntry(); + writer.key().varChar().writeVarChar("key5"); + writer.value().integer().writeInt(456); + writer.endEntry(); + nullBothWriter.endMap(); // Update count for data vector (map writer does not do this) nullBothVector.getDataVector().setValueCount(2); From bdb5a92c853de27ae769800c69bb79c89035c67b Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Thu, 27 Mar 2025 18:49:58 +0000 Subject: [PATCH 82/89] Set value counts for all complex vectors in the tests --- .../adapter/avro/ArrowToAvroDataTest.java | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java index 61ec21ea70..2d70b45021 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroDataTest.java @@ -1882,6 +1882,11 @@ public void testWriteLists() throws Exception { dateListWriter.endList(); } + // Update count for the vectors + intListVector.setValueCount(rowCount); + stringListVector.setValueCount(rowCount); + dateListVector.setValueCount(rowCount); + File dataFile = new File(TMP, "testWriteLists.avro"); // 
Write an AVRO block using the producer classes @@ -2004,6 +2009,11 @@ public void testWriteNullableLists() throws Exception { nullEntriesWriter.integer().writeInt(456); nullEntriesWriter.endList(); + // Update count for the vectors + nullListVector.setValueCount(4); + nullEntriesVector.setValueCount(4); + nullBothVector.setValueCount(4); + File dataFile = new File(TMP, "testWriteNullableLists.avro"); // Write an AVRO block using the producer classes @@ -2109,6 +2119,11 @@ public void testWriteFixedLists() throws Exception { } File dataFile = new File(TMP, "testWriteFixedLists.avro"); + // Update count for the vectors + intListVector.setValueCount(rowCount); + stringListVector.setValueCount(rowCount); + dateListVector.setValueCount(rowCount); + // Write an AVRO block using the producer classes try (FileOutputStream fos = new FileOutputStream(dataFile)) { BinaryEncoder encoder = new EncoderFactory().directBinaryEncoder(fos, null); @@ -2231,6 +2246,11 @@ public void testWriteNullableFixedLists() throws Exception { nullEntriesWriter.integer().writeInt(456); nullEntriesWriter.endList(); + // Update count for the vectors + nullListVector.setValueCount(4); + nullEntriesVector.setValueCount(4); + nullBothVector.setValueCount(4); + File dataFile = new File(TMP, "testWriteNullableFixedLists.avro"); // Write an AVRO block using the producer classes @@ -2350,6 +2370,11 @@ public void testWriteMap() throws Exception { stringWriter.endMap(); } + // Update count for the vectors + intMapVector.setValueCount(rowCount); + stringMapVector.setValueCount(rowCount); + dateMapVector.setValueCount(rowCount); + // Update count for data vector (map writer does not do this) stringMapVector.getDataVector().setValueCount(entryCount); @@ -2479,9 +2504,6 @@ public void testWriteNullableMap() throws Exception { writer.endEntry(); writer.endMap(); - // Update count for data vector (map writer does not do this) - nullEntriesVector.getDataVector().setValueCount(3); - // Set test data for stringList 
BaseWriter.MapWriter nullMapWriter = nullMapVector.getWriter(); nullMapWriter.writeNull(); @@ -2508,9 +2530,6 @@ public void testWriteNullableMap() throws Exception { writer.endEntry(); nullMapWriter.endMap(); - // Update count for data vector (map writer does not do this) - nullMapVector.getDataVector().setValueCount(2); - // Set test data for dateList BaseWriter.MapWriter nullBothWriter = nullBothVector.getWriter(); nullBothWriter.writeNull(); @@ -2536,8 +2555,10 @@ public void testWriteNullableMap() throws Exception { writer.endEntry(); nullBothWriter.endMap(); - // Update count for data vector (map writer does not do this) - nullBothVector.getDataVector().setValueCount(2); + // Update count for the vectors + nullEntriesVector.setValueCount(3); + nullMapVector.setValueCount(3); + nullBothVector.setValueCount(3); File dataFile = new File(TMP, "testWriteNullableMap.avro"); @@ -2656,6 +2677,9 @@ public void testWriteStruct() throws Exception { encoder.flush(); } + // Update count for the vector + structVector.setValueCount(rowCount); + // Set up reading the AVRO block as a GenericRecord Schema schema = ArrowToAvroUtils.createAvroSchema(root.getSchema().getFields()); GenericDatumReader datumReader = new GenericDatumReader<>(schema); @@ -2738,6 +2762,10 @@ public void testWriteNullableStructs() throws Exception { } } + // Update count for the vector + structVector.setValueCount(rowCount); + nullableStructVector.setValueCount(rowCount); + File dataFile = new File(TMP, "testWriteNullableStructs.avro"); // Write an AVRO block using the producer classes From ccf9c3a9ca1d0f3b1f2a59b72691e63009aa1a03 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Thu, 27 Mar 2025 18:50:57 +0000 Subject: [PATCH 83/89] Fix handling of child index updates for fixed size lists --- .../avro/producers/AvroFixedSizeListProducer.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java index 2dbd60541f..acb6fb8c00 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroFixedSizeListProducer.java @@ -53,9 +53,10 @@ public void produce(Encoder encoder) throws IOException { @Override public void skipNull() { - // Keep fixed sized child in sync - delegate.skipNull(); super.skipNull(); + // Child vector contains a fixed number of elements for each entry + int childIndex = currentIndex * vector.getListSize(); + delegate.setPosition(childIndex); } @Override @@ -63,9 +64,10 @@ public void setPosition(int index) { if (index < 0 || index > vector.getValueCount()) { throw new IllegalArgumentException("Index out of bounds"); } - int delegateOffset = vector.getOffsetBuffer().getInt(index * (long) Integer.BYTES); - delegate.setPosition(delegateOffset); super.setPosition(index); + // Child vector contains a fixed number of elements for each entry + int childIndex = currentIndex * vector.getListSize(); + delegate.setPosition(childIndex); } @Override From 7f0e3f0ec9b3133cb4b8d7d0056982b8dd5d7cd0 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Thu, 27 Mar 2025 18:59:27 +0000 Subject: [PATCH 84/89] Fixes for PR comments --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 15 +++++++++------ .../adapter/avro/producers/BaseAvroProducer.java | 11 ++++++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 3a7660c7ea..11adbd9c71 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ 
b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -150,9 +150,9 @@ public class ArrowToAvroUtils { * * *

    Nullable fields are represented as a union of [null | base-type]. Special treatment is given - * to nullability of unions - a union is considered nullable if the union field is nullable or any - * of its child fields are nullable. The schema for a nullable union will always contain a null - * type,none of the direct child types will be nullable. + * to nullability of unions - a union is considered nullable if any of its child fields are nullable. + * The schema for a nullable union will always contain a null type as its first member, with none of the + * child types being nullable. * *

    List fields must contain precisely one child field, which may be nullable. Map fields are * represented as a list of structs, where the struct fields are "key" and "value". The key field @@ -326,9 +326,10 @@ private static T buildBaseTypeSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { + // Second and millisecond time types are encoded as time-millis (INT) return builder.intBuilder().prop("logicalType", "time-millis").endInt(); } else { - // All other time types (sec, micro, nano) are encoded as time-micros (LONG) + // All other time types (micro, nano) are encoded as time-micros (LONG) return builder.longBuilder().prop("logicalType", "time-micros").endLong(); } @@ -410,9 +411,10 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { + // Second and millisecond time types are encoded as time-millis (INT) return builder.intBuilder().prop("logicalType", "time-millis").endInt().noDefault(); } else { - // All other time types (sec, micro, nano) are encoded as time-micros (LONG) + // All other time types (micro, nano) are encoded as time-micros (LONG) return builder.longBuilder().prop("logicalType", "time-micros").endLong().noDefault(); } @@ -504,11 +506,12 @@ private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { + // Second and millisecond time types are encoded as time-millis (INT) return (SchemaBuilder.UnionAccumulator) builder.intBuilder().prop("logicalType", "time-millis").endInt(); } else { return (SchemaBuilder.UnionAccumulator) - // All other time types (sec, micro, nano) are encoded as 
time-micros (LONG) + // All other time types (micro, nano) are encoded as time-micros (LONG) builder.longBuilder().prop("logicalType", "time-micros").endLong(); } diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java index 7e700a3a13..7b7d0407b2 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java @@ -42,9 +42,18 @@ public void skipNull() { currentIndex++; } + /** + * Sets the current index for this producer against the underlying vector. + * + *

    For a vector of length N, the valid range is [0, N] inclusive. Setting index = N + * signifies that no further data is available for production (this is the state the + * produce will be in when production for the current vector is complete). + * + * @param index New current index for the producer + */ @Override public void setPosition(int index) { - // currentIndex == value is a valid state, no more values will be produced + // currentIndex == value count is a valid state, no more values will be produced if (index < 0 || index > vector.getValueCount()) { throw new IllegalArgumentException("Index out of bounds"); } From 98a99a573cb572e146d7eed67882a2979f13ed60 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Thu, 27 Mar 2025 18:59:57 +0000 Subject: [PATCH 85/89] Apply spotless --- .../org/apache/arrow/adapter/avro/ArrowToAvroUtils.java | 6 +++--- .../arrow/adapter/avro/producers/BaseAvroProducer.java | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 11adbd9c71..ea37c2aa34 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -150,9 +150,9 @@ public class ArrowToAvroUtils { * * *

    Nullable fields are represented as a union of [null | base-type]. Special treatment is given - * to nullability of unions - a union is considered nullable if any of its child fields are nullable. - * The schema for a nullable union will always contain a null type as its first member, with none of the - * child types being nullable. + * to nullability of unions - a union is considered nullable if any of its child fields are + * nullable. The schema for a nullable union will always contain a null type as its first member, + * with none of the child types being nullable. * *

    List fields must contain precisely one child field, which may be nullable. Map fields are * represented as a list of structs, where the struct fields are "key" and "value". The key field diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java index 7b7d0407b2..30c004bdc6 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/BaseAvroProducer.java @@ -45,9 +45,9 @@ public void skipNull() { /** * Sets the current index for this producer against the underlying vector. * - *

    For a vector of length N, the valid range is [0, N] inclusive. Setting index = N - * signifies that no further data is available for production (this is the state the - * produce will be in when production for the current vector is complete). + *

    For a vector of length N, the valid range is [0, N] inclusive. Setting index = N signifies + * that no further data is available for production (this is the state the produce will be in when + * production for the current vector is complete). * * @param index New current index for the producer */ From e1ca9b29520d03d73c83d4aeea41a538606240bd Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Thu, 27 Mar 2025 19:01:48 +0000 Subject: [PATCH 86/89] Correct comment on schemas for nullable fields --- .../java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index ea37c2aa34..9bbad34d68 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -149,7 +149,7 @@ public class ArrowToAvroUtils { * * * - *

    Nullable fields are represented as a union of [null | base-type]. Special treatment is given + *

    Nullable fields are represented as a union of [base-type | null]. Special treatment is given * to nullability of unions - a union is considered nullable if any of its child fields are * nullable. The schema for a nullable union will always contain a null type as its first member, * with none of the child types being nullable. From e39de2624f552de07860497d6cbd7b187ac6c89c Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 1 Apr 2025 22:03:09 +0100 Subject: [PATCH 87/89] Eliminate duplication in schema building methods --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 230 ++---------------- 1 file changed, 15 insertions(+), 215 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 9bbad34d68..5fb35ec6fe 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -198,26 +198,32 @@ private static T buildRecordSchema( private static SchemaBuilder.FieldAssembler buildFieldSchema( SchemaBuilder.FieldAssembler assembler, Field field, String namespace) { - SchemaBuilder.FieldTypeBuilder builder = assembler.name(field.getName()).type(); + return assembler + .name(field.getName()) + .type(buildTypeSchema(SchemaBuilder.builder(), field, namespace)) + .noDefault(); + } + + private static T buildTypeSchema( + SchemaBuilder.TypeBuilder builder, Field field, String namespace) { // Nullable unions need special handling, since union types cannot be directly nested if (field.getType().getTypeID() == ArrowType.ArrowTypeID.Union) { boolean unionNullable = field.getChildren().stream().anyMatch(Field::isNullable); if (unionNullable) { - SchemaBuilder.UnionAccumulator> union = - builder.unionOf().nullType(); - return addTypesToUnion(union, field.getChildren(), namespace).nullDefault(); + SchemaBuilder.UnionAccumulator 
union = builder.unionOf().nullType(); + return addTypesToUnion(union, field.getChildren(), namespace); } else { Field headType = field.getChildren().get(0); List tailTypes = field.getChildren().subList(1, field.getChildren().size()); - SchemaBuilder.UnionAccumulator> union = - buildUnionFieldSchema(builder.unionOf(), headType, namespace); - return addTypesToUnion(union, tailTypes, namespace).noDefault(); + SchemaBuilder.UnionAccumulator union = + buildBaseTypeSchema(builder.unionOf(), headType, namespace); + return addTypesToUnion(union, tailTypes, namespace); } } else if (field.isNullable()) { - return buildBaseFieldSchema(builder.nullable(), field, namespace); + return buildBaseTypeSchema(builder.nullable(), field, namespace); } else { - return buildBaseFieldSchema(builder, field, namespace); + return buildBaseTypeSchema(builder, field, namespace); } } @@ -248,29 +254,6 @@ private static T buildMapSchema( return buildTypeSchema(builder.values(), valueField, namespace); } - private static T buildTypeSchema( - SchemaBuilder.TypeBuilder builder, Field field, String namespace) { - - // Nullable unions need special handling, since union types cannot be directly nested - if (field.getType().getTypeID() == ArrowType.ArrowTypeID.Union) { - boolean unionNullable = field.getChildren().stream().anyMatch(Field::isNullable); - if (unionNullable) { - SchemaBuilder.UnionAccumulator union = builder.unionOf().nullType(); - return addTypesToUnion(union, field.getChildren(), namespace); - } else { - Field headType = field.getChildren().get(0); - List tailTypes = field.getChildren().subList(1, field.getChildren().size()); - SchemaBuilder.UnionAccumulator union = - buildBaseTypeSchema(builder.unionOf(), headType, namespace); - return addTypesToUnion(union, tailTypes, namespace); - } - } else if (field.isNullable()) { - return buildBaseTypeSchema(builder.nullable(), field, namespace); - } else { - return buildBaseTypeSchema(builder, field, namespace); - } - } - private static T 
buildBaseTypeSchema( SchemaBuilder.BaseTypeBuilder builder, Field field, String namespace) { @@ -357,189 +340,6 @@ private static T buildBaseTypeSchema( } } - private static SchemaBuilder.FieldAssembler buildBaseFieldSchema( - SchemaBuilder.BaseFieldTypeBuilder builder, Field field, String namespace) { - - ArrowType.ArrowTypeID typeID = field.getType().getTypeID(); - - switch (typeID) { - case Null: - return builder.nullType().noDefault(); - - case Bool: - return builder.booleanType().noDefault(); - - case Int: - ArrowType.Int intType = (ArrowType.Int) field.getType(); - if (intType.getBitWidth() > 32 || (intType.getBitWidth() == 32 && !intType.getIsSigned())) { - return builder.longType().noDefault(); - } else { - return builder.intType().noDefault(); - } - - case FloatingPoint: - ArrowType.FloatingPoint floatType = (ArrowType.FloatingPoint) field.getType(); - if (floatType.getPrecision() == FloatingPointPrecision.DOUBLE) { - return builder.doubleType().noDefault(); - } else { - return builder.floatType().noDefault(); - } - - case Utf8: - return builder.stringType().noDefault(); - - case Binary: - return builder.bytesType().noDefault(); - - case FixedSizeBinary: - ArrowType.FixedSizeBinary fixedType = (ArrowType.FixedSizeBinary) field.getType(); - return builder.fixed(field.getName()).size(fixedType.getByteWidth()).noDefault(); - - case Decimal: - ArrowType.Decimal decimalType = (ArrowType.Decimal) field.getType(); - return builder - .fixed(field.getName()) - .prop("logicalType", "decimal") - .prop("precision", decimalType.getPrecision()) - .prop("scale", decimalType.getScale()) - .size(decimalType.getBitWidth() / 8) - .noDefault(); - - case Date: - return builder.intBuilder().prop("logicalType", "date").endInt().noDefault(); - - case Time: - ArrowType.Time timeType = (ArrowType.Time) field.getType(); - if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { - // Second and millisecond time types are encoded as time-millis 
(INT) - return builder.intBuilder().prop("logicalType", "time-millis").endInt().noDefault(); - } else { - // All other time types (micro, nano) are encoded as time-micros (LONG) - return builder.longBuilder().prop("logicalType", "time-micros").endLong().noDefault(); - } - - case Timestamp: - ArrowType.Timestamp timestampType = (ArrowType.Timestamp) field.getType(); - String timestampLogicalType = timestampLogicalType(timestampType); - return builder - .longBuilder() - .prop("logicalType", timestampLogicalType) - .endLong() - .noDefault(); - - case Struct: - String childNamespace = - namespace == null ? field.getName() : namespace + "." + field.getName(); - return buildRecordSchema( - builder.record(field.getName()), field.getChildren(), childNamespace) - .noDefault(); - - case List: - case FixedSizeList: - return buildArraySchema(builder.array(), field, namespace).noDefault(); - - case Map: - return buildMapSchema(builder.map(), field, namespace).noDefault(); - - default: - throw new IllegalArgumentException( - "Field type not supported for Avro conversion: " + typeID.name()); - } - } - - @SuppressWarnings({"unchecked", "rawtypes"}) - private static - SchemaBuilder.UnionAccumulator> buildUnionFieldSchema( - SchemaBuilder.UnionFieldTypeBuilder builder, Field field, String namespace) { - - ArrowType.ArrowTypeID typeID = field.getType().getTypeID(); - - switch (typeID) { - case Null: - return (SchemaBuilder.UnionAccumulator) builder.nullType(); - - case Bool: - return (SchemaBuilder.UnionAccumulator) builder.booleanType(); - - case Int: - ArrowType.Int intType = (ArrowType.Int) field.getType(); - if (intType.getBitWidth() > 32 || (intType.getBitWidth() == 32 && !intType.getIsSigned())) { - return (SchemaBuilder.UnionAccumulator) builder.longType(); - } else { - return (SchemaBuilder.UnionAccumulator) builder.intType(); - } - - case FloatingPoint: - ArrowType.FloatingPoint floatType = (ArrowType.FloatingPoint) field.getType(); - if (floatType.getPrecision() == 
FloatingPointPrecision.DOUBLE) { - return (SchemaBuilder.UnionAccumulator) builder.doubleType(); - } else { - return (SchemaBuilder.UnionAccumulator) builder.floatType(); - } - - case Utf8: - return (SchemaBuilder.UnionAccumulator) builder.stringType(); - - case Binary: - return (SchemaBuilder.UnionAccumulator) builder.bytesType(); - - case FixedSizeBinary: - ArrowType.FixedSizeBinary fixedType = (ArrowType.FixedSizeBinary) field.getType(); - String fixedTypeName = field.getName(); - int fixedTypeWidth = fixedType.getByteWidth(); - return (SchemaBuilder.UnionAccumulator) builder.fixed(fixedTypeName).size(fixedTypeWidth); - - case Decimal: - ArrowType.Decimal decimalType = (ArrowType.Decimal) field.getType(); - return (SchemaBuilder.UnionAccumulator) - builder - .fixed(field.getName()) - .prop("logicalType", "decimal") - .prop("precision", decimalType.getPrecision()) - .prop("scale", decimalType.getScale()) - .size(decimalType.getBitWidth() / 8); - - case Date: - return (SchemaBuilder.UnionAccumulator) - builder.intBuilder().prop("logicalType", "date").endInt(); - - case Time: - ArrowType.Time timeType = (ArrowType.Time) field.getType(); - if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { - // Second and millisecond time types are encoded as time-millis (INT) - return (SchemaBuilder.UnionAccumulator) - builder.intBuilder().prop("logicalType", "time-millis").endInt(); - } else { - return (SchemaBuilder.UnionAccumulator) - // All other time types (micro, nano) are encoded as time-micros (LONG) - builder.longBuilder().prop("logicalType", "time-micros").endLong(); - } - - case Timestamp: - ArrowType.Timestamp timestampType = (ArrowType.Timestamp) field.getType(); - String timestampLogicalType = timestampLogicalType(timestampType); - return (SchemaBuilder.UnionAccumulator) - builder.longBuilder().prop("logicalType", timestampLogicalType).endLong(); - - case Struct: - String childNamespace = - namespace == null ? 
field.getName() : namespace + "." + field.getName(); - return (SchemaBuilder.UnionAccumulator) - buildRecordSchema(builder.record(field.getName()), field.getChildren(), childNamespace); - - case List: - case FixedSizeList: - return (SchemaBuilder.UnionAccumulator) buildArraySchema(builder.array(), field, namespace); - - case Map: - return (SchemaBuilder.UnionAccumulator) buildMapSchema(builder.map(), field, namespace); - - default: - throw new IllegalArgumentException( - "Union member type not supported for Avro conversion: " + typeID.name()); - } - } - private static T addTypesToUnion( SchemaBuilder.UnionAccumulator accumulator, List unionFields, String namespace) { for (var field : unionFields) { From 956e867723fe6a443940bcfd55f11de960c18e74 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 1 Apr 2025 22:03:38 +0100 Subject: [PATCH 88/89] Test logical types using built-in field (not a regular schema prop) --- .../adapter/avro/ArrowToAvroSchemaTest.java | 92 +++++++++---------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java index 7228c27996..325f54a56d 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java @@ -344,7 +344,7 @@ public void testConvertDecimalTypes() { schema.getField("nullableDecimal128").schema().getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimal128Schema.getType()); assertEquals(16, nullableDecimal128Schema.getFixedSize()); - assertEquals("decimal", nullableDecimal128Schema.getProp("logicalType")); + assertEquals("decimal", nullableDecimal128Schema.getLogicalType()); assertEquals(10, nullableDecimal128Schema.getObjectProp("precision")); assertEquals(2, nullableDecimal128Schema.getObjectProp("scale")); 
assertEquals( @@ -355,7 +355,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal1281Schema = schema.getField("nonNullableDecimal1281").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1281Schema.getType()); assertEquals(16, nonNullableDecimal1281Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal1281Schema.getProp("logicalType")); + assertEquals("decimal", nonNullableDecimal1281Schema.getLogicalType()); assertEquals(10, nonNullableDecimal1281Schema.getObjectProp("precision")); assertEquals(2, nonNullableDecimal1281Schema.getObjectProp("scale")); @@ -363,7 +363,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal1282Schema = schema.getField("nonNullableDecimal1282").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1282Schema.getType()); assertEquals(16, nonNullableDecimal1282Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal1282Schema.getProp("logicalType")); + assertEquals("decimal", nonNullableDecimal1282Schema.getLogicalType()); assertEquals(15, nonNullableDecimal1282Schema.getObjectProp("precision")); assertEquals(5, nonNullableDecimal1282Schema.getObjectProp("scale")); @@ -371,7 +371,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal1283Schema = schema.getField("nonNullableDecimal1283").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1283Schema.getType()); assertEquals(16, nonNullableDecimal1283Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal1283Schema.getProp("logicalType")); + assertEquals("decimal", nonNullableDecimal1283Schema.getLogicalType()); assertEquals(20, nonNullableDecimal1283Schema.getObjectProp("precision")); assertEquals(10, nonNullableDecimal1283Schema.getObjectProp("scale")); @@ -382,7 +382,7 @@ public void testConvertDecimalTypes() { schema.getField("nullableDecimal256").schema().getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimal256Schema.getType()); assertEquals(32, 
nullableDecimal256Schema.getFixedSize()); - assertEquals("decimal", nullableDecimal256Schema.getProp("logicalType")); + assertEquals("decimal", nullableDecimal256Schema.getLogicalType()); assertEquals(20, nullableDecimal256Schema.getObjectProp("precision")); assertEquals(4, nullableDecimal256Schema.getObjectProp("scale")); assertEquals( @@ -393,7 +393,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal2561Schema = schema.getField("nonNullableDecimal2561").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2561Schema.getType()); assertEquals(32, nonNullableDecimal2561Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal2561Schema.getProp("logicalType")); + assertEquals("decimal", nonNullableDecimal2561Schema.getLogicalType()); assertEquals(20, nonNullableDecimal2561Schema.getObjectProp("precision")); assertEquals(4, nonNullableDecimal2561Schema.getObjectProp("scale")); @@ -401,7 +401,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal2562Schema = schema.getField("nonNullableDecimal2562").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2562Schema.getType()); assertEquals(32, nonNullableDecimal2562Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal2562Schema.getProp("logicalType")); + assertEquals("decimal", nonNullableDecimal2562Schema.getLogicalType()); assertEquals(25, nonNullableDecimal2562Schema.getObjectProp("precision")); assertEquals(8, nonNullableDecimal2562Schema.getObjectProp("scale")); @@ -409,7 +409,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal2563Schema = schema.getField("nonNullableDecimal2563").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2563Schema.getType()); assertEquals(32, nonNullableDecimal2563Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal2563Schema.getProp("logicalType")); + assertEquals("decimal", nonNullableDecimal2563Schema.getLogicalType()); assertEquals(30, 
nonNullableDecimal2563Schema.getObjectProp("precision")); assertEquals(15, nonNullableDecimal2563Schema.getObjectProp("scale")); } @@ -443,14 +443,14 @@ public void testConvertDateTypes() { assertEquals(2, schema.getField("nullableDateDay").schema().getTypes().size()); Schema nullableDateDaySchema = schema.getField("nullableDateDay").schema().getTypes().get(0); assertEquals(Schema.Type.INT, nullableDateDaySchema.getType()); - assertEquals("date", nullableDateDaySchema.getProp("logicalType")); + assertEquals("date", nullableDateDaySchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableDateDay").schema().getTypes().get(1).getType()); // Assertions for nonNullableDateDay Schema nonNullableDateDaySchema = schema.getField("nonNullableDateDay").schema(); assertEquals(Schema.Type.INT, nonNullableDateDaySchema.getType()); - assertEquals("date", nonNullableDateDaySchema.getProp("logicalType")); + assertEquals("date", nonNullableDateDaySchema.getLogicalType()); // Assertions for nullableDateMilli assertEquals(Schema.Type.UNION, schema.getField("nullableDateMilli").schema().getType()); @@ -458,7 +458,7 @@ public void testConvertDateTypes() { Schema nullableDateMilliSchema = schema.getField("nullableDateMilli").schema().getTypes().get(0); assertEquals(Schema.Type.INT, nullableDateMilliSchema.getType()); - assertEquals("date", nullableDateMilliSchema.getProp("logicalType")); + assertEquals("date", nullableDateMilliSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableDateMilli").schema().getTypes().get(1).getType()); @@ -466,7 +466,7 @@ public void testConvertDateTypes() { // Assertions for nonNullableDateMilli Schema nonNullableDateMilliSchema = schema.getField("nonNullableDateMilli").schema(); assertEquals(Schema.Type.INT, nonNullableDateMilliSchema.getType()); - assertEquals("date", nonNullableDateMilliSchema.getProp("logicalType")); + assertEquals("date", nonNullableDateMilliSchema.getLogicalType()); } @Test @@ 
-516,14 +516,14 @@ public void testConvertTimeTypes() { assertEquals(2, schema.getField("nullableTimeSec").schema().getTypes().size()); Schema nullableTimeSecSchema = schema.getField("nullableTimeSec").schema().getTypes().get(0); assertEquals(Schema.Type.INT, nullableTimeSecSchema.getType()); - assertEquals("time-millis", nullableTimeSecSchema.getProp("logicalType")); + assertEquals("time-millis", nullableTimeSecSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeSec").schema().getTypes().get(1).getType()); // Assertions for nonNullableTimeSec Schema nonNullableTimeSecSchema = schema.getField("nonNullableTimeSec").schema(); assertEquals(Schema.Type.INT, nonNullableTimeSecSchema.getType()); - assertEquals("time-millis", nonNullableTimeSecSchema.getProp("logicalType")); + assertEquals("time-millis", nonNullableTimeSecSchema.getLogicalType()); // Assertions for nullableTimeMillis assertEquals(Schema.Type.UNION, schema.getField("nullableTimeMillis").schema().getType()); @@ -531,7 +531,7 @@ public void testConvertTimeTypes() { Schema nullableTimeMillisSchema = schema.getField("nullableTimeMillis").schema().getTypes().get(0); assertEquals(Schema.Type.INT, nullableTimeMillisSchema.getType()); - assertEquals("time-millis", nullableTimeMillisSchema.getProp("logicalType")); + assertEquals("time-millis", nullableTimeMillisSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeMillis").schema().getTypes().get(1).getType()); @@ -539,7 +539,7 @@ public void testConvertTimeTypes() { // Assertions for nonNullableTimeMillis Schema nonNullableTimeMillisSchema = schema.getField("nonNullableTimeMillis").schema(); assertEquals(Schema.Type.INT, nonNullableTimeMillisSchema.getType()); - assertEquals("time-millis", nonNullableTimeMillisSchema.getProp("logicalType")); + assertEquals("time-millis", nonNullableTimeMillisSchema.getLogicalType()); // Assertions for nullableTimeMicros assertEquals(Schema.Type.UNION, 
schema.getField("nullableTimeMicros").schema().getType()); @@ -547,7 +547,7 @@ public void testConvertTimeTypes() { Schema nullableTimeMicrosSchema = schema.getField("nullableTimeMicros").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimeMicrosSchema.getType()); - assertEquals("time-micros", nullableTimeMicrosSchema.getProp("logicalType")); + assertEquals("time-micros", nullableTimeMicrosSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeMicros").schema().getTypes().get(1).getType()); @@ -555,7 +555,7 @@ public void testConvertTimeTypes() { // Assertions for nonNullableTimeMicros Schema nonNullableTimeMicrosSchema = schema.getField("nonNullableTimeMicros").schema(); assertEquals(Schema.Type.LONG, nonNullableTimeMicrosSchema.getType()); - assertEquals("time-micros", nonNullableTimeMicrosSchema.getProp("logicalType")); + assertEquals("time-micros", nonNullableTimeMicrosSchema.getLogicalType()); // Assertions for nullableTimeNanos assertEquals(Schema.Type.UNION, schema.getField("nullableTimeNanos").schema().getType()); @@ -563,7 +563,7 @@ public void testConvertTimeTypes() { Schema nullableTimeNanosSchema = schema.getField("nullableTimeNanos").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimeNanosSchema.getType()); - assertEquals("time-micros", nullableTimeNanosSchema.getProp("logicalType")); + assertEquals("time-micros", nullableTimeNanosSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeNanos").schema().getTypes().get(1).getType()); @@ -571,7 +571,7 @@ public void testConvertTimeTypes() { // Assertions for nonNullableTimeNanos Schema nonNullableTimeNanosSchema = schema.getField("nonNullableTimeNanos").schema(); assertEquals(Schema.Type.LONG, nonNullableTimeNanosSchema.getType()); - assertEquals("time-micros", nonNullableTimeNanosSchema.getProp("logicalType")); + assertEquals("time-micros", nonNullableTimeNanosSchema.getLogicalType()); } @Test @@ -622,7 
+622,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nullableTimestampSecTzSchema = schema.getField("nullableTimestampSecTz").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampSecTzSchema.getType()); - assertEquals("timestamp-millis", nullableTimestampSecTzSchema.getProp("logicalType")); + assertEquals("timestamp-millis", nullableTimestampSecTzSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampSecTz").schema().getTypes().get(1).getType()); @@ -630,7 +630,7 @@ public void testConvertZoneAwareTimestampTypes() { // Assertions for nonNullableTimestampSecTz Schema nonNullableTimestampSecTzSchema = schema.getField("nonNullableTimestampSecTz").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampSecTzSchema.getType()); - assertEquals("timestamp-millis", nonNullableTimestampSecTzSchema.getProp("logicalType")); + assertEquals("timestamp-millis", nonNullableTimestampSecTzSchema.getLogicalType()); // Assertions for nullableTimestampMillisTz assertEquals( @@ -639,7 +639,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nullableTimestampMillisTzSchema = schema.getField("nullableTimestampMillisTz").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampMillisTzSchema.getType()); - assertEquals("timestamp-millis", nullableTimestampMillisTzSchema.getProp("logicalType")); + assertEquals("timestamp-millis", nullableTimestampMillisTzSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampMillisTz").schema().getTypes().get(1).getType()); @@ -648,7 +648,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nonNullableTimestampMillisTzSchema = schema.getField("nonNullableTimestampMillisTz").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampMillisTzSchema.getType()); - assertEquals("timestamp-millis", nonNullableTimestampMillisTzSchema.getProp("logicalType")); + assertEquals("timestamp-millis", 
nonNullableTimestampMillisTzSchema.getLogicalType()); // Assertions for nullableTimestampMicrosTz assertEquals( @@ -657,7 +657,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nullableTimestampMicrosTzSchema = schema.getField("nullableTimestampMicrosTz").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampMicrosTzSchema.getType()); - assertEquals("timestamp-micros", nullableTimestampMicrosTzSchema.getProp("logicalType")); + assertEquals("timestamp-micros", nullableTimestampMicrosTzSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampMicrosTz").schema().getTypes().get(1).getType()); @@ -666,7 +666,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nonNullableTimestampMicrosTzSchema = schema.getField("nonNullableTimestampMicrosTz").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampMicrosTzSchema.getType()); - assertEquals("timestamp-micros", nonNullableTimestampMicrosTzSchema.getProp("logicalType")); + assertEquals("timestamp-micros", nonNullableTimestampMicrosTzSchema.getLogicalType()); // Assertions for nullableTimestampNanosTz assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampNanosTz").schema().getType()); @@ -674,7 +674,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nullableTimestampNanosTzSchema = schema.getField("nullableTimestampNanosTz").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampNanosTzSchema.getType()); - assertEquals("timestamp-nanos", nullableTimestampNanosTzSchema.getProp("logicalType")); + assertEquals("timestamp-nanos", nullableTimestampNanosTzSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampNanosTz").schema().getTypes().get(1).getType()); @@ -683,7 +683,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nonNullableTimestampNanosTzSchema = schema.getField("nonNullableTimestampNanosTz").schema(); assertEquals(Schema.Type.LONG, 
nonNullableTimestampNanosTzSchema.getType()); - assertEquals("timestamp-nanos", nonNullableTimestampNanosTzSchema.getProp("logicalType")); + assertEquals("timestamp-nanos", nonNullableTimestampNanosTzSchema.getLogicalType()); } @Test @@ -734,7 +734,7 @@ public void testConvertLocalTimestampTypes() { Schema nullableTimestampSecSchema = schema.getField("nullableTimestampSec").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampSecSchema.getType()); - assertEquals("local-timestamp-millis", nullableTimestampSecSchema.getProp("logicalType")); + assertEquals("local-timestamp-millis", nullableTimestampSecSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampSec").schema().getTypes().get(1).getType()); @@ -742,7 +742,7 @@ public void testConvertLocalTimestampTypes() { // Assertions for nonNullableTimestampSec Schema nonNullableTimestampSecSchema = schema.getField("nonNullableTimestampSec").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampSecSchema.getType()); - assertEquals("local-timestamp-millis", nonNullableTimestampSecSchema.getProp("logicalType")); + assertEquals("local-timestamp-millis", nonNullableTimestampSecSchema.getLogicalType()); // Assertions for nullableTimestampMillis assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampMillis").schema().getType()); @@ -750,7 +750,7 @@ public void testConvertLocalTimestampTypes() { Schema nullableTimestampMillisSchema = schema.getField("nullableTimestampMillis").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampMillisSchema.getType()); - assertEquals("local-timestamp-millis", nullableTimestampMillisSchema.getProp("logicalType")); + assertEquals("local-timestamp-millis", nullableTimestampMillisSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampMillis").schema().getTypes().get(1).getType()); @@ -759,7 +759,7 @@ public void testConvertLocalTimestampTypes() { Schema 
nonNullableTimestampMillisSchema = schema.getField("nonNullableTimestampMillis").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampMillisSchema.getType()); - assertEquals("local-timestamp-millis", nonNullableTimestampMillisSchema.getProp("logicalType")); + assertEquals("local-timestamp-millis", nonNullableTimestampMillisSchema.getLogicalType()); // Assertions for nullableTimestampMicros assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampMicros").schema().getType()); @@ -767,7 +767,7 @@ public void testConvertLocalTimestampTypes() { Schema nullableTimestampMicrosSchema = schema.getField("nullableTimestampMicros").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampMicrosSchema.getType()); - assertEquals("local-timestamp-micros", nullableTimestampMicrosSchema.getProp("logicalType")); + assertEquals("local-timestamp-micros", nullableTimestampMicrosSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampMicros").schema().getTypes().get(1).getType()); @@ -776,7 +776,7 @@ public void testConvertLocalTimestampTypes() { Schema nonNullableTimestampMicrosSchema = schema.getField("nonNullableTimestampMicros").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampMicrosSchema.getType()); - assertEquals("local-timestamp-micros", nonNullableTimestampMicrosSchema.getProp("logicalType")); + assertEquals("local-timestamp-micros", nonNullableTimestampMicrosSchema.getLogicalType()); // Assertions for nullableTimestampNanos assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampNanos").schema().getType()); @@ -784,7 +784,7 @@ public void testConvertLocalTimestampTypes() { Schema nullableTimestampNanosSchema = schema.getField("nullableTimestampNanos").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampNanosSchema.getType()); - assertEquals("local-timestamp-nanos", nullableTimestampNanosSchema.getProp("logicalType")); + assertEquals("local-timestamp-nanos", 
nullableTimestampNanosSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampNanos").schema().getTypes().get(1).getType()); @@ -792,7 +792,7 @@ public void testConvertLocalTimestampTypes() { // Assertions for nonNullableTimestampNanos Schema nonNullableTimestampNanosSchema = schema.getField("nonNullableTimestampNanos").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampNanosSchema.getType()); - assertEquals("local-timestamp-nanos", nonNullableTimestampNanosSchema.getProp("logicalType")); + assertEquals("local-timestamp-nanos", nonNullableTimestampNanosSchema.getLogicalType()); } // Schema conversion for complex types, where the contents are primitive and logical types @@ -871,7 +871,7 @@ public void testConvertListTypes() { Schema nullableDecimalSchema = nonNullableDecimalListItemSchema.getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimalSchema.getType()); assertEquals(16, nullableDecimalSchema.getFixedSize()); - assertEquals("decimal", nullableDecimalSchema.getProp("logicalType")); + assertEquals("decimal", nullableDecimalSchema.getLogicalType()); assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); assertEquals(Schema.Type.NULL, nonNullableDecimalListItemSchema.getTypes().get(1).getType()); @@ -881,7 +881,7 @@ public void testConvertListTypes() { Schema nonNullableTimestampListItemSchema = schema.getField("nonNullableTimestampList").schema().getElementType(); assertEquals(Schema.Type.LONG, nonNullableTimestampListItemSchema.getType()); - assertEquals("timestamp-millis", nonNullableTimestampListItemSchema.getProp("logicalType")); + assertEquals("timestamp-millis", nonNullableTimestampListItemSchema.getLogicalType()); } @Test @@ -962,7 +962,7 @@ public void testConvertFixedSizeListTypes() { Schema nullableDecimalSchema = nonNullableFixedSizeDecimalListItemSchema.getTypes().get(0); assertEquals(Schema.Type.FIXED, 
nullableDecimalSchema.getType()); assertEquals(16, nullableDecimalSchema.getFixedSize()); - assertEquals("decimal", nullableDecimalSchema.getProp("logicalType")); + assertEquals("decimal", nullableDecimalSchema.getLogicalType()); assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); assertEquals( @@ -975,7 +975,7 @@ public void testConvertFixedSizeListTypes() { schema.getField("nonNullableFixedSizeTimestampList").schema().getElementType(); assertEquals(Schema.Type.LONG, nonNullableFixedSizeTimestampListItemSchema.getType()); assertEquals( - "timestamp-millis", nonNullableFixedSizeTimestampListItemSchema.getProp("logicalType")); + "timestamp-millis", nonNullableFixedSizeTimestampListItemSchema.getLogicalType()); } @Test @@ -1088,7 +1088,7 @@ public void testConvertMapTypes() { Schema nullableDecimalSchema = nonNullableMapWithNullableDecimalValueSchema.getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimalSchema.getType()); assertEquals(16, nullableDecimalSchema.getFixedSize()); - assertEquals("decimal", nullableDecimalSchema.getProp("logicalType")); + assertEquals("decimal", nullableDecimalSchema.getLogicalType()); assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); assertEquals( @@ -1103,7 +1103,7 @@ public void testConvertMapTypes() { assertEquals(Schema.Type.LONG, nonNullableMapWithNonNullableTimestampValueSchema.getType()); assertEquals( "timestamp-millis", - nonNullableMapWithNonNullableTimestampValueSchema.getProp("logicalType")); + nonNullableMapWithNonNullableTimestampValueSchema.getLogicalType()); } @Test @@ -1171,7 +1171,7 @@ public void testConvertRecordTypes() { 16, nullableRecordSchema.getField("field3").schema().getTypes().get(0).getFixedSize()); assertEquals( "decimal", - nullableRecordSchema.getField("field3").schema().getTypes().get(0).getProp("logicalType")); + 
nullableRecordSchema.getField("field3").schema().getTypes().get(0).getLogicalType()); assertEquals( 10, nullableRecordSchema @@ -1189,7 +1189,7 @@ public void testConvertRecordTypes() { assertEquals(Schema.Type.LONG, nullableRecordSchema.getField("field4").schema().getType()); assertEquals( "timestamp-millis", - nullableRecordSchema.getField("field4").schema().getProp("logicalType")); + nullableRecordSchema.getField("field4").schema().getLogicalType()); // Assertions for nonNullableRecord assertEquals(Schema.Type.RECORD, schema.getField("nonNullableRecord").schema().getType()); @@ -1214,7 +1214,7 @@ public void testConvertRecordTypes() { .schema() .getTypes() .get(0) - .getProp("logicalType")); + .getLogicalType()); assertEquals( 10, nonNullableRecordSchema @@ -1237,7 +1237,7 @@ public void testConvertRecordTypes() { assertEquals(Schema.Type.LONG, nonNullableRecordSchema.getField("field4").schema().getType()); assertEquals( "timestamp-millis", - nonNullableRecordSchema.getField("field4").schema().getProp("logicalType")); + nonNullableRecordSchema.getField("field4").schema().getLogicalType()); } @Test From 00bbef033a4a1755cd27c18367b4915664924e88 Mon Sep 17 00:00:00 2001 From: Martin Traverse Date: Tue, 1 Apr 2025 22:27:38 +0100 Subject: [PATCH 89/89] Use structured types instead of raw props for logical type schemas (this seems to be what the Avro framework intends) --- .../arrow/adapter/avro/ArrowToAvroUtils.java | 33 +++--- .../adapter/avro/ArrowToAvroSchemaTest.java | 110 +++++++++--------- 2 files changed, 76 insertions(+), 67 deletions(-) diff --git a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java index 5fb35ec6fe..6f0cb5cffc 100644 --- a/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java +++ b/adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java @@ -101,6 +101,8 @@ import 
org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.avro.LogicalType; +import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.apache.avro.SchemaBuilder; @@ -296,30 +298,31 @@ private static T buildBaseTypeSchema( case Decimal: ArrowType.Decimal decimalType = (ArrowType.Decimal) field.getType(); - return builder - .fixed(field.getName()) - .prop("logicalType", "decimal") - .prop("precision", decimalType.getPrecision()) - .prop("scale", decimalType.getScale()) - .size(decimalType.getBitWidth() / 8); + return builder.type( + LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale()) + .addToSchema( + Schema.createFixed( + field.getName(), namespace, "", decimalType.getBitWidth() / 8))); case Date: - return builder.intBuilder().prop("logicalType", "date").endInt(); + return builder.type(LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))); case Time: ArrowType.Time timeType = (ArrowType.Time) field.getType(); if ((timeType.getUnit() == TimeUnit.SECOND || timeType.getUnit() == TimeUnit.MILLISECOND)) { // Second and millisecond time types are encoded as time-millis (INT) - return builder.intBuilder().prop("logicalType", "time-millis").endInt(); + return builder.type( + LogicalTypes.timeMillis().addToSchema(Schema.create(Schema.Type.INT))); } else { // All other time types (micro, nano) are encoded as time-micros (LONG) - return builder.longBuilder().prop("logicalType", "time-micros").endLong(); + return builder.type( + LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG))); } case Timestamp: ArrowType.Timestamp timestampType = (ArrowType.Timestamp) field.getType(); - String timestampLogicalType = timestampLogicalType(timestampType); - return builder.longBuilder().prop("logicalType", timestampLogicalType).endLong(); + LogicalType timestampLogicalType = timestampLogicalType(timestampType); + return 
builder.type(timestampLogicalType.addToSchema(Schema.create(Schema.Type.LONG))); case Struct: String childNamespace = @@ -348,15 +351,15 @@ private static T addTypesToUnion( return accumulator.endUnion(); } - private static String timestampLogicalType(ArrowType.Timestamp timestampType) { + private static LogicalType timestampLogicalType(ArrowType.Timestamp timestampType) { boolean zoneAware = timestampType.getTimezone() != null; if (timestampType.getUnit() == TimeUnit.NANOSECOND) { - return zoneAware ? "timestamp-nanos" : "local-timestamp-nanos"; + return zoneAware ? LogicalTypes.timestampNanos() : LogicalTypes.localTimestampNanos(); } else if (timestampType.getUnit() == TimeUnit.MICROSECOND) { - return zoneAware ? "timestamp-micros" : "local-timestamp-micros"; + return zoneAware ? LogicalTypes.timestampMicros() : LogicalTypes.localTimestampMicros(); } else { // Timestamp in seconds will be cast to milliseconds, Avro does not support seconds - return zoneAware ? "timestamp-millis" : "local-timestamp-millis"; + return zoneAware ? 
LogicalTypes.timestampMillis() : LogicalTypes.localTimestampMillis(); } } diff --git a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java index 325f54a56d..a05bbc1653 100644 --- a/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java +++ b/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/ArrowToAvroSchemaTest.java @@ -27,6 +27,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.junit.jupiter.api.Test; @@ -344,7 +345,7 @@ public void testConvertDecimalTypes() { schema.getField("nullableDecimal128").schema().getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimal128Schema.getType()); assertEquals(16, nullableDecimal128Schema.getFixedSize()); - assertEquals("decimal", nullableDecimal128Schema.getLogicalType()); + assertEquals(LogicalTypes.decimal(10, 2), nullableDecimal128Schema.getLogicalType()); assertEquals(10, nullableDecimal128Schema.getObjectProp("precision")); assertEquals(2, nullableDecimal128Schema.getObjectProp("scale")); assertEquals( @@ -355,7 +356,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal1281Schema = schema.getField("nonNullableDecimal1281").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1281Schema.getType()); assertEquals(16, nonNullableDecimal1281Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal1281Schema.getLogicalType()); + assertEquals(LogicalTypes.decimal(10, 2), nonNullableDecimal1281Schema.getLogicalType()); assertEquals(10, nonNullableDecimal1281Schema.getObjectProp("precision")); assertEquals(2, nonNullableDecimal1281Schema.getObjectProp("scale")); @@ -363,7 +364,7 @@ public void testConvertDecimalTypes() { Schema 
nonNullableDecimal1282Schema = schema.getField("nonNullableDecimal1282").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1282Schema.getType()); assertEquals(16, nonNullableDecimal1282Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal1282Schema.getLogicalType()); + assertEquals(LogicalTypes.decimal(15, 5), nonNullableDecimal1282Schema.getLogicalType()); assertEquals(15, nonNullableDecimal1282Schema.getObjectProp("precision")); assertEquals(5, nonNullableDecimal1282Schema.getObjectProp("scale")); @@ -371,7 +372,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal1283Schema = schema.getField("nonNullableDecimal1283").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal1283Schema.getType()); assertEquals(16, nonNullableDecimal1283Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal1283Schema.getLogicalType()); + assertEquals(LogicalTypes.decimal(20, 10), nonNullableDecimal1283Schema.getLogicalType()); assertEquals(20, nonNullableDecimal1283Schema.getObjectProp("precision")); assertEquals(10, nonNullableDecimal1283Schema.getObjectProp("scale")); @@ -382,7 +383,7 @@ public void testConvertDecimalTypes() { schema.getField("nullableDecimal256").schema().getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimal256Schema.getType()); assertEquals(32, nullableDecimal256Schema.getFixedSize()); - assertEquals("decimal", nullableDecimal256Schema.getLogicalType()); + assertEquals(LogicalTypes.decimal(20, 4), nullableDecimal256Schema.getLogicalType()); assertEquals(20, nullableDecimal256Schema.getObjectProp("precision")); assertEquals(4, nullableDecimal256Schema.getObjectProp("scale")); assertEquals( @@ -393,7 +394,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal2561Schema = schema.getField("nonNullableDecimal2561").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2561Schema.getType()); assertEquals(32, nonNullableDecimal2561Schema.getFixedSize()); - 
assertEquals("decimal", nonNullableDecimal2561Schema.getLogicalType()); + assertEquals(LogicalTypes.decimal(20, 4), nonNullableDecimal2561Schema.getLogicalType()); assertEquals(20, nonNullableDecimal2561Schema.getObjectProp("precision")); assertEquals(4, nonNullableDecimal2561Schema.getObjectProp("scale")); @@ -401,7 +402,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal2562Schema = schema.getField("nonNullableDecimal2562").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2562Schema.getType()); assertEquals(32, nonNullableDecimal2562Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal2562Schema.getLogicalType()); + assertEquals(LogicalTypes.decimal(25, 8), nonNullableDecimal2562Schema.getLogicalType()); assertEquals(25, nonNullableDecimal2562Schema.getObjectProp("precision")); assertEquals(8, nonNullableDecimal2562Schema.getObjectProp("scale")); @@ -409,7 +410,7 @@ public void testConvertDecimalTypes() { Schema nonNullableDecimal2563Schema = schema.getField("nonNullableDecimal2563").schema(); assertEquals(Schema.Type.FIXED, nonNullableDecimal2563Schema.getType()); assertEquals(32, nonNullableDecimal2563Schema.getFixedSize()); - assertEquals("decimal", nonNullableDecimal2563Schema.getLogicalType()); + assertEquals(LogicalTypes.decimal(30, 15), nonNullableDecimal2563Schema.getLogicalType()); assertEquals(30, nonNullableDecimal2563Schema.getObjectProp("precision")); assertEquals(15, nonNullableDecimal2563Schema.getObjectProp("scale")); } @@ -443,14 +444,14 @@ public void testConvertDateTypes() { assertEquals(2, schema.getField("nullableDateDay").schema().getTypes().size()); Schema nullableDateDaySchema = schema.getField("nullableDateDay").schema().getTypes().get(0); assertEquals(Schema.Type.INT, nullableDateDaySchema.getType()); - assertEquals("date", nullableDateDaySchema.getLogicalType()); + assertEquals(LogicalTypes.date(), nullableDateDaySchema.getLogicalType()); assertEquals( Schema.Type.NULL, 
schema.getField("nullableDateDay").schema().getTypes().get(1).getType()); // Assertions for nonNullableDateDay Schema nonNullableDateDaySchema = schema.getField("nonNullableDateDay").schema(); assertEquals(Schema.Type.INT, nonNullableDateDaySchema.getType()); - assertEquals("date", nonNullableDateDaySchema.getLogicalType()); + assertEquals(LogicalTypes.date(), nonNullableDateDaySchema.getLogicalType()); // Assertions for nullableDateMilli assertEquals(Schema.Type.UNION, schema.getField("nullableDateMilli").schema().getType()); @@ -458,7 +459,7 @@ public void testConvertDateTypes() { Schema nullableDateMilliSchema = schema.getField("nullableDateMilli").schema().getTypes().get(0); assertEquals(Schema.Type.INT, nullableDateMilliSchema.getType()); - assertEquals("date", nullableDateMilliSchema.getLogicalType()); + assertEquals(LogicalTypes.date(), nullableDateMilliSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableDateMilli").schema().getTypes().get(1).getType()); @@ -466,7 +467,7 @@ public void testConvertDateTypes() { // Assertions for nonNullableDateMilli Schema nonNullableDateMilliSchema = schema.getField("nonNullableDateMilli").schema(); assertEquals(Schema.Type.INT, nonNullableDateMilliSchema.getType()); - assertEquals("date", nonNullableDateMilliSchema.getLogicalType()); + assertEquals(LogicalTypes.date(), nonNullableDateMilliSchema.getLogicalType()); } @Test @@ -516,14 +517,14 @@ public void testConvertTimeTypes() { assertEquals(2, schema.getField("nullableTimeSec").schema().getTypes().size()); Schema nullableTimeSecSchema = schema.getField("nullableTimeSec").schema().getTypes().get(0); assertEquals(Schema.Type.INT, nullableTimeSecSchema.getType()); - assertEquals("time-millis", nullableTimeSecSchema.getLogicalType()); + assertEquals(LogicalTypes.timeMillis(), nullableTimeSecSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeSec").schema().getTypes().get(1).getType()); // Assertions for 
nonNullableTimeSec Schema nonNullableTimeSecSchema = schema.getField("nonNullableTimeSec").schema(); assertEquals(Schema.Type.INT, nonNullableTimeSecSchema.getType()); - assertEquals("time-millis", nonNullableTimeSecSchema.getLogicalType()); + assertEquals(LogicalTypes.timeMillis(), nonNullableTimeSecSchema.getLogicalType()); // Assertions for nullableTimeMillis assertEquals(Schema.Type.UNION, schema.getField("nullableTimeMillis").schema().getType()); @@ -531,7 +532,7 @@ public void testConvertTimeTypes() { Schema nullableTimeMillisSchema = schema.getField("nullableTimeMillis").schema().getTypes().get(0); assertEquals(Schema.Type.INT, nullableTimeMillisSchema.getType()); - assertEquals("time-millis", nullableTimeMillisSchema.getLogicalType()); + assertEquals(LogicalTypes.timeMillis(), nullableTimeMillisSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeMillis").schema().getTypes().get(1).getType()); @@ -539,7 +540,7 @@ public void testConvertTimeTypes() { // Assertions for nonNullableTimeMillis Schema nonNullableTimeMillisSchema = schema.getField("nonNullableTimeMillis").schema(); assertEquals(Schema.Type.INT, nonNullableTimeMillisSchema.getType()); - assertEquals("time-millis", nonNullableTimeMillisSchema.getLogicalType()); + assertEquals(LogicalTypes.timeMillis(), nonNullableTimeMillisSchema.getLogicalType()); // Assertions for nullableTimeMicros assertEquals(Schema.Type.UNION, schema.getField("nullableTimeMicros").schema().getType()); @@ -547,7 +548,7 @@ public void testConvertTimeTypes() { Schema nullableTimeMicrosSchema = schema.getField("nullableTimeMicros").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimeMicrosSchema.getType()); - assertEquals("time-micros", nullableTimeMicrosSchema.getLogicalType()); + assertEquals(LogicalTypes.timeMicros(), nullableTimeMicrosSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeMicros").schema().getTypes().get(1).getType()); @@ 
-555,7 +556,7 @@ public void testConvertTimeTypes() { // Assertions for nonNullableTimeMicros Schema nonNullableTimeMicrosSchema = schema.getField("nonNullableTimeMicros").schema(); assertEquals(Schema.Type.LONG, nonNullableTimeMicrosSchema.getType()); - assertEquals("time-micros", nonNullableTimeMicrosSchema.getLogicalType()); + assertEquals(LogicalTypes.timeMicros(), nonNullableTimeMicrosSchema.getLogicalType()); // Assertions for nullableTimeNanos assertEquals(Schema.Type.UNION, schema.getField("nullableTimeNanos").schema().getType()); @@ -563,7 +564,7 @@ public void testConvertTimeTypes() { Schema nullableTimeNanosSchema = schema.getField("nullableTimeNanos").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimeNanosSchema.getType()); - assertEquals("time-micros", nullableTimeNanosSchema.getLogicalType()); + assertEquals(LogicalTypes.timeMicros(), nullableTimeNanosSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimeNanos").schema().getTypes().get(1).getType()); @@ -571,7 +572,7 @@ public void testConvertTimeTypes() { // Assertions for nonNullableTimeNanos Schema nonNullableTimeNanosSchema = schema.getField("nonNullableTimeNanos").schema(); assertEquals(Schema.Type.LONG, nonNullableTimeNanosSchema.getType()); - assertEquals("time-micros", nonNullableTimeNanosSchema.getLogicalType()); + assertEquals(LogicalTypes.timeMicros(), nonNullableTimeNanosSchema.getLogicalType()); } @Test @@ -622,7 +623,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nullableTimestampSecTzSchema = schema.getField("nullableTimestampSecTz").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampSecTzSchema.getType()); - assertEquals("timestamp-millis", nullableTimestampSecTzSchema.getLogicalType()); + assertEquals(LogicalTypes.timestampMillis(), nullableTimestampSecTzSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampSecTz").schema().getTypes().get(1).getType()); 
@@ -630,7 +631,7 @@ public void testConvertZoneAwareTimestampTypes() { // Assertions for nonNullableTimestampSecTz Schema nonNullableTimestampSecTzSchema = schema.getField("nonNullableTimestampSecTz").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampSecTzSchema.getType()); - assertEquals("timestamp-millis", nonNullableTimestampSecTzSchema.getLogicalType()); + assertEquals(LogicalTypes.timestampMillis(), nonNullableTimestampSecTzSchema.getLogicalType()); // Assertions for nullableTimestampMillisTz assertEquals( @@ -639,7 +640,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nullableTimestampMillisTzSchema = schema.getField("nullableTimestampMillisTz").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampMillisTzSchema.getType()); - assertEquals("timestamp-millis", nullableTimestampMillisTzSchema.getLogicalType()); + assertEquals(LogicalTypes.timestampMillis(), nullableTimestampMillisTzSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampMillisTz").schema().getTypes().get(1).getType()); @@ -648,7 +649,8 @@ public void testConvertZoneAwareTimestampTypes() { Schema nonNullableTimestampMillisTzSchema = schema.getField("nonNullableTimestampMillisTz").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampMillisTzSchema.getType()); - assertEquals("timestamp-millis", nonNullableTimestampMillisTzSchema.getLogicalType()); + assertEquals( + LogicalTypes.timestampMillis(), nonNullableTimestampMillisTzSchema.getLogicalType()); // Assertions for nullableTimestampMicrosTz assertEquals( @@ -657,7 +659,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nullableTimestampMicrosTzSchema = schema.getField("nullableTimestampMicrosTz").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampMicrosTzSchema.getType()); - assertEquals("timestamp-micros", nullableTimestampMicrosTzSchema.getLogicalType()); + assertEquals(LogicalTypes.timestampMicros(), 
nullableTimestampMicrosTzSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampMicrosTz").schema().getTypes().get(1).getType()); @@ -666,7 +668,8 @@ public void testConvertZoneAwareTimestampTypes() { Schema nonNullableTimestampMicrosTzSchema = schema.getField("nonNullableTimestampMicrosTz").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampMicrosTzSchema.getType()); - assertEquals("timestamp-micros", nonNullableTimestampMicrosTzSchema.getLogicalType()); + assertEquals( + LogicalTypes.timestampMicros(), nonNullableTimestampMicrosTzSchema.getLogicalType()); // Assertions for nullableTimestampNanosTz assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampNanosTz").schema().getType()); @@ -674,7 +677,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nullableTimestampNanosTzSchema = schema.getField("nullableTimestampNanosTz").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampNanosTzSchema.getType()); - assertEquals("timestamp-nanos", nullableTimestampNanosTzSchema.getLogicalType()); + assertEquals(LogicalTypes.timestampNanos(), nullableTimestampNanosTzSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampNanosTz").schema().getTypes().get(1).getType()); @@ -683,7 +686,7 @@ public void testConvertZoneAwareTimestampTypes() { Schema nonNullableTimestampNanosTzSchema = schema.getField("nonNullableTimestampNanosTz").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampNanosTzSchema.getType()); - assertEquals("timestamp-nanos", nonNullableTimestampNanosTzSchema.getLogicalType()); + assertEquals(LogicalTypes.timestampNanos(), nonNullableTimestampNanosTzSchema.getLogicalType()); } @Test @@ -734,7 +737,7 @@ public void testConvertLocalTimestampTypes() { Schema nullableTimestampSecSchema = schema.getField("nullableTimestampSec").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampSecSchema.getType()); - 
assertEquals("local-timestamp-millis", nullableTimestampSecSchema.getLogicalType()); + assertEquals(LogicalTypes.localTimestampMillis(), nullableTimestampSecSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampSec").schema().getTypes().get(1).getType()); @@ -742,7 +745,8 @@ public void testConvertLocalTimestampTypes() { // Assertions for nonNullableTimestampSec Schema nonNullableTimestampSecSchema = schema.getField("nonNullableTimestampSec").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampSecSchema.getType()); - assertEquals("local-timestamp-millis", nonNullableTimestampSecSchema.getLogicalType()); + assertEquals( + LogicalTypes.localTimestampMillis(), nonNullableTimestampSecSchema.getLogicalType()); // Assertions for nullableTimestampMillis assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampMillis").schema().getType()); @@ -750,7 +754,8 @@ public void testConvertLocalTimestampTypes() { Schema nullableTimestampMillisSchema = schema.getField("nullableTimestampMillis").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampMillisSchema.getType()); - assertEquals("local-timestamp-millis", nullableTimestampMillisSchema.getLogicalType()); + assertEquals( + LogicalTypes.localTimestampMillis(), nullableTimestampMillisSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampMillis").schema().getTypes().get(1).getType()); @@ -759,7 +764,8 @@ public void testConvertLocalTimestampTypes() { Schema nonNullableTimestampMillisSchema = schema.getField("nonNullableTimestampMillis").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampMillisSchema.getType()); - assertEquals("local-timestamp-millis", nonNullableTimestampMillisSchema.getLogicalType()); + assertEquals( + LogicalTypes.localTimestampMillis(), nonNullableTimestampMillisSchema.getLogicalType()); // Assertions for nullableTimestampMicros assertEquals(Schema.Type.UNION, 
schema.getField("nullableTimestampMicros").schema().getType()); @@ -767,7 +773,8 @@ public void testConvertLocalTimestampTypes() { Schema nullableTimestampMicrosSchema = schema.getField("nullableTimestampMicros").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampMicrosSchema.getType()); - assertEquals("local-timestamp-micros", nullableTimestampMicrosSchema.getLogicalType()); + assertEquals( + LogicalTypes.localTimestampMicros(), nullableTimestampMicrosSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampMicros").schema().getTypes().get(1).getType()); @@ -776,7 +783,8 @@ public void testConvertLocalTimestampTypes() { Schema nonNullableTimestampMicrosSchema = schema.getField("nonNullableTimestampMicros").schema(); assertEquals(Schema.Type.LONG, nonNullableTimestampMicrosSchema.getType()); - assertEquals("local-timestamp-micros", nonNullableTimestampMicrosSchema.getLogicalType()); + assertEquals( + LogicalTypes.localTimestampMicros(), nonNullableTimestampMicrosSchema.getLogicalType()); // Assertions for nullableTimestampNanos assertEquals(Schema.Type.UNION, schema.getField("nullableTimestampNanos").schema().getType()); @@ -784,7 +792,7 @@ public void testConvertLocalTimestampTypes() { Schema nullableTimestampNanosSchema = schema.getField("nullableTimestampNanos").schema().getTypes().get(0); assertEquals(Schema.Type.LONG, nullableTimestampNanosSchema.getType()); - assertEquals("local-timestamp-nanos", nullableTimestampNanosSchema.getLogicalType()); + assertEquals(LogicalTypes.localTimestampNanos(), nullableTimestampNanosSchema.getLogicalType()); assertEquals( Schema.Type.NULL, schema.getField("nullableTimestampNanos").schema().getTypes().get(1).getType()); @@ -792,7 +800,8 @@ public void testConvertLocalTimestampTypes() { // Assertions for nonNullableTimestampNanos Schema nonNullableTimestampNanosSchema = schema.getField("nonNullableTimestampNanos").schema(); assertEquals(Schema.Type.LONG, 
nonNullableTimestampNanosSchema.getType()); - assertEquals("local-timestamp-nanos", nonNullableTimestampNanosSchema.getLogicalType()); + assertEquals( + LogicalTypes.localTimestampNanos(), nonNullableTimestampNanosSchema.getLogicalType()); } // Schema conversion for complex types, where the contents are primitive and logical types @@ -871,7 +880,7 @@ public void testConvertListTypes() { Schema nullableDecimalSchema = nonNullableDecimalListItemSchema.getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimalSchema.getType()); assertEquals(16, nullableDecimalSchema.getFixedSize()); - assertEquals("decimal", nullableDecimalSchema.getLogicalType()); + assertEquals(LogicalTypes.decimal(10, 2), nullableDecimalSchema.getLogicalType()); assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); assertEquals(Schema.Type.NULL, nonNullableDecimalListItemSchema.getTypes().get(1).getType()); @@ -881,7 +890,8 @@ public void testConvertListTypes() { Schema nonNullableTimestampListItemSchema = schema.getField("nonNullableTimestampList").schema().getElementType(); assertEquals(Schema.Type.LONG, nonNullableTimestampListItemSchema.getType()); - assertEquals("timestamp-millis", nonNullableTimestampListItemSchema.getLogicalType()); + assertEquals( + LogicalTypes.timestampMillis(), nonNullableTimestampListItemSchema.getLogicalType()); } @Test @@ -962,7 +972,7 @@ public void testConvertFixedSizeListTypes() { Schema nullableDecimalSchema = nonNullableFixedSizeDecimalListItemSchema.getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimalSchema.getType()); assertEquals(16, nullableDecimalSchema.getFixedSize()); - assertEquals("decimal", nullableDecimalSchema.getLogicalType()); + assertEquals(LogicalTypes.decimal(10, 2), nullableDecimalSchema.getLogicalType()); assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); assertEquals( 
@@ -975,7 +985,8 @@ public void testConvertFixedSizeListTypes() { schema.getField("nonNullableFixedSizeTimestampList").schema().getElementType(); assertEquals(Schema.Type.LONG, nonNullableFixedSizeTimestampListItemSchema.getType()); assertEquals( - "timestamp-millis", nonNullableFixedSizeTimestampListItemSchema.getLogicalType()); + LogicalTypes.timestampMillis(), + nonNullableFixedSizeTimestampListItemSchema.getLogicalType()); } @Test @@ -1088,7 +1099,7 @@ public void testConvertMapTypes() { Schema nullableDecimalSchema = nonNullableMapWithNullableDecimalValueSchema.getTypes().get(0); assertEquals(Schema.Type.FIXED, nullableDecimalSchema.getType()); assertEquals(16, nullableDecimalSchema.getFixedSize()); - assertEquals("decimal", nullableDecimalSchema.getLogicalType()); + assertEquals(LogicalTypes.decimal(10, 2), nullableDecimalSchema.getLogicalType()); assertEquals(10, nullableDecimalSchema.getObjectProp("precision")); assertEquals(2, nullableDecimalSchema.getObjectProp("scale")); assertEquals( @@ -1102,7 +1113,7 @@ public void testConvertMapTypes() { schema.getField("nonNullableMapWithNonNullableTimestamp").schema().getValueType(); assertEquals(Schema.Type.LONG, nonNullableMapWithNonNullableTimestampValueSchema.getType()); assertEquals( - "timestamp-millis", + LogicalTypes.timestampMillis(), nonNullableMapWithNonNullableTimestampValueSchema.getLogicalType()); } @@ -1170,7 +1181,7 @@ public void testConvertRecordTypes() { assertEquals( 16, nullableRecordSchema.getField("field3").schema().getTypes().get(0).getFixedSize()); assertEquals( - "decimal", + LogicalTypes.decimal(10, 2), nullableRecordSchema.getField("field3").schema().getTypes().get(0).getLogicalType()); assertEquals( 10, @@ -1188,7 +1199,7 @@ public void testConvertRecordTypes() { nullableRecordSchema.getField("field3").schema().getTypes().get(1).getType()); assertEquals(Schema.Type.LONG, nullableRecordSchema.getField("field4").schema().getType()); assertEquals( - "timestamp-millis", + 
LogicalTypes.timestampMillis(), nullableRecordSchema.getField("field4").schema().getLogicalType()); // Assertions for nonNullableRecord @@ -1208,13 +1219,8 @@ public void testConvertRecordTypes() { assertEquals( 16, nullableRecordSchema.getField("field3").schema().getTypes().get(0).getFixedSize()); assertEquals( - "decimal", - nonNullableRecordSchema - .getField("field3") - .schema() - .getTypes() - .get(0) - .getLogicalType()); + LogicalTypes.decimal(10, 2), + nonNullableRecordSchema.getField("field3").schema().getTypes().get(0).getLogicalType()); assertEquals( 10, nonNullableRecordSchema @@ -1236,7 +1242,7 @@ public void testConvertRecordTypes() { nonNullableRecordSchema.getField("field3").schema().getTypes().get(1).getType()); assertEquals(Schema.Type.LONG, nonNullableRecordSchema.getField("field4").schema().getType()); assertEquals( - "timestamp-millis", + LogicalTypes.timestampMillis(), nonNullableRecordSchema.getField("field4").schema().getLogicalType()); }