Skip to content

Commit

Permalink
GH-37728: [Java] Add methods to get an Iterable for a ValueVector (#4…
Browse files Browse the repository at this point in the history
…1895)

### Rationale for this change

Simplify validating the values in a `ValueVector` in unit tests.

### What changes are included in this PR?

Methods for creating an `Iterable` and `Iterator` for a `ValueVector`. Also updated some unit tests to use the new methods.

### Are these changes tested?

Some unit tests were updated.

### Are there any user-facing changes?

The new methods are publicly available in the `ValueVectorUtility` class.
* GitHub Issue: #37728

Authored-by: Norman Jordan <norman.jordan@improving.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
  • Loading branch information
normanj-bitquill authored Jun 18, 2024
1 parent fef86d2 commit 4413110
Show file tree
Hide file tree
Showing 53 changed files with 1,120 additions and 88 deletions.
5 changes: 5 additions & 0 deletions java/dataset/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,11 @@ under the License.
<version>2.15.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<resources>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/
package org.apache.arrow.dataset.substrait;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
Expand All @@ -39,11 +40,14 @@
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
import org.apache.arrow.vector.ValueIterableVector;
import org.apache.arrow.vector.ipc.ArrowReader;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.Text;
import org.hamcrest.collection.IsIterableContainingInOrder;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

Expand Down Expand Up @@ -290,13 +294,15 @@ public void testRunExtendedExpressionsFilter() throws Exception {
int rowcount = 0;
while (reader.loadNextBatch()) {
rowcount += reader.getVectorSchemaRoot().getRowCount();
assertTrue(reader.getVectorSchemaRoot().getVector("id").toString().equals("[19, 1, 11]"));
assertTrue(
reader
.getVectorSchemaRoot()
.getVector("name")
.toString()
.equals("[value_19, value_1, value_11]"));
final ValueIterableVector<Integer> idVector =
(ValueIterableVector<Integer>) reader.getVectorSchemaRoot().getVector("id");
assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(19, 1, 11));
final ValueIterableVector<Text> nameVector =
(ValueIterableVector<Text>) reader.getVectorSchemaRoot().getVector("name");
assertThat(
nameVector.getValueIterable(),
IsIterableContainingInOrder.contains(
new Text("value_19"), new Text("value_1"), new Text("value_11")));
}
assertEquals(3, rowcount);
}
Expand Down Expand Up @@ -443,20 +449,22 @@ public void testRunExtendedExpressionsProjection() throws Exception {
assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields());
int rowcount = 0;
while (reader.loadNextBatch()) {
assertTrue(
reader
.getVectorSchemaRoot()
.getVector("add_two_to_column_a")
.toString()
.equals("[21, 3, 13, 23, 47]"));
assertTrue(
reader
.getVectorSchemaRoot()
.getVector("concat_column_a_and_b")
.toString()
.equals(
"[value_19 - value_19, value_1 - value_1, value_11 - value_11, "
+ "value_21 - value_21, value_45 - value_45]"));
final ValueIterableVector<Integer> sumVector =
(ValueIterableVector<Integer>)
reader.getVectorSchemaRoot().getVector("add_two_to_column_a");
assertThat(
sumVector.getValueIterable(), IsIterableContainingInOrder.contains(21, 3, 13, 23, 47));
final ValueIterableVector<Text> nameVector =
(ValueIterableVector<Text>)
reader.getVectorSchemaRoot().getVector("concat_column_a_and_b");
assertThat(
nameVector.getValueIterable(),
IsIterableContainingInOrder.contains(
new Text("value_19 - value_19"),
new Text("value_1 - value_1"),
new Text("value_11 - value_11"),
new Text("value_21 - value_21"),
new Text("value_45 - value_45")));
rowcount += reader.getVectorSchemaRoot().getRowCount();
}
assertEquals(5, rowcount);
Expand Down Expand Up @@ -507,12 +515,12 @@ public void testRunExtendedExpressionsProjectionWithFilterInsteadOfProjectionExc
assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields());
int rowcount = 0;
while (reader.loadNextBatch()) {
assertTrue(
reader
.getVectorSchemaRoot()
.getVector("filter_id_lower_than_20")
.toString()
.equals("[true, true, true, false, false]"));
final ValueIterableVector<Boolean> booleanVector =
(ValueIterableVector<Boolean>)
reader.getVectorSchemaRoot().getVector("filter_id_lower_than_20");
assertThat(
booleanVector.getValueIterable(),
IsIterableContainingInOrder.contains(true, true, true, false, false));
rowcount += reader.getVectorSchemaRoot().getRowCount();
}
assertEquals(5, rowcount);
Expand Down Expand Up @@ -618,18 +626,19 @@ public void testRunExtendedExpressionsProjectAndFilter() throws Exception {
assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields());
int rowcount = 0;
while (reader.loadNextBatch()) {
assertTrue(
reader
.getVectorSchemaRoot()
.getVector("add_two_to_column_a")
.toString()
.equals("[21, 3, 13]"));
assertTrue(
reader
.getVectorSchemaRoot()
.getVector("concat_column_a_and_b")
.toString()
.equals("[value_19 - value_19, value_1 - value_1, value_11 - value_11]"));
final ValueIterableVector<Integer> sumVector =
(ValueIterableVector<Integer>)
reader.getVectorSchemaRoot().getVector("add_two_to_column_a");
assertThat(sumVector.getValueIterable(), IsIterableContainingInOrder.contains(21, 3, 13));
final ValueIterableVector<Text> nameVector =
(ValueIterableVector<Text>)
reader.getVectorSchemaRoot().getVector("concat_column_a_and_b");
assertThat(
nameVector.getValueIterable(),
IsIterableContainingInOrder.contains(
new Text("value_19 - value_19"),
new Text("value_1 - value_1"),
new Text("value_11 - value_11")));
rowcount += reader.getVectorSchemaRoot().getRowCount();
}
assertEquals(3, rowcount);
Expand Down
5 changes: 5 additions & 0 deletions java/vector/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ under the License.
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

<build>
Expand Down
4 changes: 3 additions & 1 deletion java/vector/src/main/codegen/templates/DenseUnionVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.arrow.vector.BaseValueVector;
import org.apache.arrow.vector.BitVectorHelper;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.ValueIterableVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.complex.AbstractStructVector;
import org.apache.arrow.vector.complex.ListVector;
Expand Down Expand Up @@ -62,6 +63,7 @@
import org.apache.arrow.vector.util.CallBack;
import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
import org.apache.arrow.vector.BaseValueVector;
import org.apache.arrow.vector.ValueIterableVector;
import org.apache.arrow.vector.util.OversizedAllocationException;
import org.apache.arrow.util.Preconditions;

Expand All @@ -84,7 +86,7 @@
* each time the vector is accessed.
* Source code generated using FreeMarker template ${.template_name}
*/
public class DenseUnionVector extends AbstractContainerVector implements FieldVector {
public class DenseUnionVector extends AbstractContainerVector implements FieldVector, ValueIterableVector<Object> {
int valueCount;

NonNullableStructVector internalStruct;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
* BigIntVector implements a fixed width vector (8 bytes) of integer values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class BigIntVector extends BaseFixedWidthVector implements BaseIntVector {
public final class BigIntVector extends BaseFixedWidthVector
implements BaseIntVector, ValueIterableVector<Long> {
public static final byte TYPE_WIDTH = 8;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
* BitVector implements a fixed width (1 bit) vector of boolean values which could be null. Each
* value in the vector corresponds to a single bit in the underlying data stream backing the vector.
*/
public final class BitVector extends BaseFixedWidthVector {
public final class BitVector extends BaseFixedWidthVector implements ValueIterableVector<Boolean> {

private static final int HASH_CODE_FOR_ZERO = 17;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
* DateDayVector implements a fixed width (4 bytes) vector of date values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class DateDayVector extends BaseFixedWidthVector {
public final class DateDayVector extends BaseFixedWidthVector
implements ValueIterableVector<Integer> {

public static final byte TYPE_WIDTH = 4;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
* DateMilliVector implements a fixed width vector (8 bytes) of date values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class DateMilliVector extends BaseFixedWidthVector {
public final class DateMilliVector extends BaseFixedWidthVector
implements ValueIterableVector<LocalDateTime> {
public static final byte TYPE_WIDTH = 8;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@
* null. A validity buffer (bit vector) is maintained to track which elements in the vector are
* null.
*/
public final class Decimal256Vector extends BaseFixedWidthVector {
public final class Decimal256Vector extends BaseFixedWidthVector
implements ValueIterableVector<BigDecimal> {
public static final int MAX_PRECISION = 76;
public static final byte TYPE_WIDTH = 32;
private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
* DecimalVector implements a fixed width vector (16 bytes) of decimal values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class DecimalVector extends BaseFixedWidthVector {
public final class DecimalVector extends BaseFixedWidthVector
implements ValueIterableVector<BigDecimal> {
public static final int MAX_PRECISION = 38;
public static final byte TYPE_WIDTH = 16;
private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@
* duration values which could be null. A validity buffer (bit vector) is maintained to track which
* elements in the vector are null.
*/
public final class DurationVector extends BaseFixedWidthVector {
public final class DurationVector extends BaseFixedWidthVector
implements ValueIterableVector<Duration> {
public static final byte TYPE_WIDTH = 8;

private final TimeUnit unit;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
* FixedSizeBinaryVector implements a fixed width vector of binary values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public class FixedSizeBinaryVector extends BaseFixedWidthVector {
public class FixedSizeBinaryVector extends BaseFixedWidthVector
implements ValueIterableVector<byte[]> {
private final int byteWidth;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
* Float2Vector implements a fixed width (2 bytes) vector of short values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class Float2Vector extends BaseFixedWidthVector implements FloatingPointVector {
public final class Float2Vector extends BaseFixedWidthVector
implements FloatingPointVector, ValueIterableVector<Short> {
public static final byte TYPE_WIDTH = 2;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
* Float4Vector implements a fixed width vector (4 bytes) of float values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class Float4Vector extends BaseFixedWidthVector implements FloatingPointVector {
public final class Float4Vector extends BaseFixedWidthVector
implements FloatingPointVector, ValueIterableVector<Float> {
public static final byte TYPE_WIDTH = 4;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
* Float8Vector implements a fixed width vector (8 bytes) of double values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class Float8Vector extends BaseFixedWidthVector implements FloatingPointVector {
public final class Float8Vector extends BaseFixedWidthVector
implements FloatingPointVector, ValueIterableVector<Double> {
public static final byte TYPE_WIDTH = 8;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
* IntVector implements a fixed width (4 bytes) vector of integer values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class IntVector extends BaseFixedWidthVector implements BaseIntVector {
public final class IntVector extends BaseFixedWidthVector
implements BaseIntVector, ValueIterableVector<Integer> {
public static final byte TYPE_WIDTH = 4;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
* values which could be null. A validity buffer (bit vector) is maintained to track which elements
* in the vector are null.
*/
public final class IntervalDayVector extends BaseFixedWidthVector {
public final class IntervalDayVector extends BaseFixedWidthVector
implements ValueIterableVector<Duration> {
public static final byte TYPE_WIDTH = 8;
private static final byte MILLISECOND_OFFSET = 4;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
* <p>Month, day and nanoseconds are independent from one another and there is no specific limits
* imposed on their values.
*/
public final class IntervalMonthDayNanoVector extends BaseFixedWidthVector {
public final class IntervalMonthDayNanoVector extends BaseFixedWidthVector
implements ValueIterableVector<PeriodDuration> {
public static final byte TYPE_WIDTH = 16;
private static final byte DAY_OFFSET = 4;
private static final byte NANOSECOND_OFFSET = 8;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
* values which could be null. A validity buffer (bit vector) is maintained to track which elements
* in the vector are null.
*/
public final class IntervalYearVector extends BaseFixedWidthVector {
public final class IntervalYearVector extends BaseFixedWidthVector
implements ValueIterableVector<Period> {
public static final byte TYPE_WIDTH = 4;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
* NULL. A validity buffer (bit vector) is maintained to track which elements in the vector are
* null. The size of the underlying buffer can be over 2GB.
*/
public final class LargeVarBinaryVector extends BaseLargeVariableWidthVector {
public final class LargeVarBinaryVector extends BaseLargeVariableWidthVector
implements ValueIterableVector<byte[]> {

/**
* Instantiate a LargeVarBinaryVector. This doesn't allocate any memory for the data in vector.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
*
* <p>The offset width of this vector is 8, so the underlying buffer can be larger than 2GB.
*/
public final class LargeVarCharVector extends BaseLargeVariableWidthVector {
public final class LargeVarCharVector extends BaseLargeVariableWidthVector
implements ValueIterableVector<Text> {

/**
* Instantiate a LargeVarCharVector. This doesn't allocate any memory for the data in vector.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import org.apache.arrow.vector.util.TransferPair;

/** A null type vector. */
public class NullVector implements FieldVector {
public class NullVector implements FieldVector, ValueIterableVector<Object> {

private int valueCount;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
* SmallIntVector implements a fixed width (2 bytes) vector of short values which could be null. A
* validity buffer (bit vector) is maintained to track which elements in the vector are null.
*/
public final class SmallIntVector extends BaseFixedWidthVector implements BaseIntVector {
public final class SmallIntVector extends BaseFixedWidthVector
implements BaseIntVector, ValueIterableVector<Short> {
public static final byte TYPE_WIDTH = 2;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
* which could be null. A validity buffer (bit vector) is maintained to track which elements in the
* vector are null.
*/
public final class TimeMicroVector extends BaseFixedWidthVector {
public final class TimeMicroVector extends BaseFixedWidthVector
implements ValueIterableVector<Long> {
public static final byte TYPE_WIDTH = 8;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
* which could be null. A validity buffer (bit vector) is maintained to track which elements in the
* vector are null.
*/
public final class TimeMilliVector extends BaseFixedWidthVector {
public final class TimeMilliVector extends BaseFixedWidthVector
implements ValueIterableVector<LocalDateTime> {
public static final byte TYPE_WIDTH = 4;

/**
Expand Down
Loading

0 comments on commit 4413110

Please sign in to comment.