From c08cb66f4a233a08fd3165fb71c3c768fd5e2297 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Fri, 14 Feb 2020 21:02:16 +0800 Subject: [PATCH 1/6] [ARROW-6111][Java] Support LargeVarChar and LargeBinary types --- .../org/apache/arrow/memory/ArrowBuf.java | 2 +- .../src/main/codegen/data/ArrowTypes.tdd | 10 + .../main/codegen/data/ValueVectorTypes.tdd | 11 + .../codegen/templates/HolderReaderImpl.java | 10 + .../vector/BaseLargeVariableWidthVector.java | 1361 +++++++++++++++++ .../org/apache/arrow/vector/BufferLayout.java | 5 + .../arrow/vector/LargeVarBinaryVector.java | 306 ++++ .../arrow/vector/LargeVarCharVector.java | 332 ++++ .../org/apache/arrow/vector/TypeLayout.java | 27 + .../vector/compare/RangeEqualsVisitor.java | 6 + .../vector/compare/TypeEqualsVisitor.java | 6 + .../arrow/vector/compare/VectorVisitor.java | 3 + .../org/apache/arrow/vector/types/Types.java | 44 + .../arrow/vector/util/VectorAppender.java | 6 + .../vector/TestLargeVarBinaryVector.java | 105 ++ .../arrow/vector/TestLargeVarCharVector.java | 817 ++++++++++ .../apache/arrow/vector/TestValueVector.java | 222 ++- .../arrow/vector/TestVectorReAlloc.java | 55 +- .../apache/arrow/vector/TestVectorReset.java | 12 + .../testing/ValueVectorDataPopulator.java | 29 + 20 files changed, 3242 insertions(+), 127 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java diff --git a/java/memory/src/main/java/org/apache/arrow/memory/ArrowBuf.java b/java/memory/src/main/java/org/apache/arrow/memory/ArrowBuf.java index 58ac1b13c4d..cc2ed4d49c8 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/ArrowBuf.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/ArrowBuf.java @@ -926,7 +926,7 @@ public void setBytes(long index, ByteBuffer src, int srcIndex, int length) { * dst ArrowBuf has access to) * @param length length of data to copy */ - public void getBytes(long index, ArrowBuf dst, int dstIndex, int length) { + public void getBytes(long index, ArrowBuf dst, long dstIndex, int length) { // bound check for this ArrowBuf where the data will be copied from checkIndex(index, length); // bound check for this ArrowBuf where the data will be copied into diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 468db268d03..8361e2ac009 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -60,11 +60,21 @@ fields: [], complex: false }, + { + name: "LargeUtf8", + fields: [], + complex: false + }, { name: "Binary", fields: [], complex: false }, + { + name: "LargeBinary", + fields: [], + complex: false + }, { name: "FixedSizeBinary", fields: [{name: "byteWidth", type: int}], diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index cf8413cd47d..b9e052941ed 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -155,6 +155,17 @@ { class: "VarChar" , friendlyType: "Text" } ] }, + { + major: "VarLen", + width: 8, + javaType: "long", + boxedType: "ArrowBuf", + fields: [{name: "start", type: "long"}, {name: "end", type: "long"}, {name: "buffer", type: "ArrowBuf"}], + minor: [ + { class: "LargeVarChar" , friendlyType: "Text" } + { class: "LargeVarBinary" , friendlyType: "byte[]" } + ] + }, { major: "Bit", width: 1, diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index fcea3e882f0..fe3c66cdea9 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -102,7 +102,11 @@ public void read(Nullable${name}Holder h) { <#if type.major == "VarLen"> + <#if type.width == 4> int length = holder.end - holder.start; + <#elseif type.width == 8> + int length = (int) (holder.end - holder.start); + byte[] value = new byte [length]; holder.buffer.getBytes(holder.start, value, 0, length); <#if minor.class == "VarBinary"> @@ -111,6 +115,12 @@ public void read(Nullable${name}Holder h) { Text text = new Text(); text.set(value); return text; + <#elseif minor.class == "LargeVarChar"> + Text text = new Text(); + text.set(value); + return text; + <#elseif minor.class == "LargeVarBinary"> + return value; <#elseif minor.class == "IntervalDay"> return Duration.ofDays(holder.days).plusMillis(holder.milliseconds); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java new file mode 100644 index 00000000000..21b868a15c2 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -0,0 +1,1361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.BaseAllocator; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +import io.netty.buffer.ArrowBuf; + +/** + * BaseLargeVariableWidthVector is a base class providing functionality for large strings/large bytes types. + */ +public abstract class BaseLargeVariableWidthVector extends BaseValueVector + implements VariableWidthVector, FieldVector, VectorDefinitionSetter { + private static final int DEFAULT_RECORD_BYTE_COUNT = 12; + private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; + private int lastValueCapacity; + private long lastValueAllocationSizeInBytes; + + /* protected members */ + public static final int OFFSET_WIDTH = 8; /* 8 byte unsigned int to track offsets */ + protected static final byte[] emptyByteArray = new byte[]{}; + protected ArrowBuf validityBuffer; + protected ArrowBuf valueBuffer; + protected ArrowBuf offsetBuffer; + protected int valueCount; + protected int lastSet; + protected final Field field; + + /** + * Constructs a new instance. + * + * @param field The field materialized by this vector. + * @param allocator The allocator to use for creating/resizing buffers + */ + public BaseLargeVariableWidthVector(Field field, final BufferAllocator allocator) { + super(allocator); + this.field = field; + lastValueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + // -1 because we require one extra slot for the offset array. + lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1; + valueCount = 0; + lastSet = -1; + offsetBuffer = allocator.getEmpty(); + validityBuffer = allocator.getEmpty(); + valueBuffer = allocator.getEmpty(); + } + + @Override + public String getName() { + return field.getName(); + } + + /** + * Get buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * @return buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * @return buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return valueBuffer; + } + + /** + * buffer that stores the offsets for elements + * in the vector. This operation is not supported for fixed-width vectors. + * @return buffer + */ + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + /** + * Get the memory address of buffer that stores the offsets for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getOffsetBufferAddress() { + return offsetBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector. + * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that stores the data for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return valueBuffer.memoryAddress(); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * @param valueCount desired number of elements in the vector + */ + @Override + public void setInitialCapacity(int valueCount) { + final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT; + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + lastValueAllocationSizeInBytes = size; + lastValueCapacity = valueCount; + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * @param valueCount desired number of elements in the vector + * @param density average number of bytes per variable width element + */ + @Override + public void setInitialCapacity(int valueCount, double density) { + long size = Math.max((long)(valueCount * density), 1L); + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + lastValueAllocationSizeInBytes = size; + lastValueCapacity = valueCount; + } + + /** + * Get the density of this ListVector. + * @return density + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final long startOffset = offsetBuffer.getLong(0); + final long endOffset = offsetBuffer.getLong((long) valueCount * OFFSET_WIDTH); + final double totalListSize = endOffset - startOffset; + return totalListSize / valueCount; + } + + /** + * Get the current value capacity for the vector. + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + final long offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); + return capAtMaxInt(Math.min(offsetValueCapacity, getValidityBufferValueCapacity())); + } + + private long getValidityBufferValueCapacity() { + return validityBuffer.capacity() * 8; + } + + private long getOffsetBufferValueCapacity() { + return offsetBuffer.capacity() / OFFSET_WIDTH; + } + + /** + * zero out the vector and the data in associated buffers. + */ + public void zeroVector() { + initValidityBuffer(); + initOffsetBuffer(); + valueBuffer.setZero(0, valueBuffer.capacity()); + } + + /* zero out the validity buffer */ + private void initValidityBuffer() { + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /* zero out the offset buffer */ + private void initOffsetBuffer() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + } + + /** + * Reset the vector to initial state. Same as {@link #zeroVector()}. + * Note that this method doesn't release any memory. + */ + public void reset() { + zeroVector(); + lastSet = -1; + valueCount = 0; + } + + /** + * Close the vector and release the associated buffers. + */ + @Override + public void close() { + clear(); + } + + /** + * Same as {@link #close()}. + */ + @Override + public void clear() { + validityBuffer = releaseBuffer(validityBuffer); + valueBuffer = releaseBuffer(valueBuffer); + offsetBuffer = releaseBuffer(offsetBuffer); + lastSet = -1; + valueCount = 0; + } + + @Override + @Deprecated + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + /** + * Initialize the children in schema for this Field. This operation is a + * NO-OP for scalar types since they don't have any children. + * @param children the schema + * @throws IllegalArgumentException if children is a non-empty list for scalar types. + */ + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector can not have children"); + } + } + + /** + * Get the inner child vectors. + * @return list of child vectors for complex types, empty list for scalar vector types + */ + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. + * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + ArrowBuf dataBuffer = ownBuffers.get(2); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = offBuffer.getReferenceManager().retain(offBuffer, allocator); + valueBuffer.getReferenceManager().release(); + valueBuffer = dataBuffer.getReferenceManager().retain(dataBuffer, allocator); + + lastSet = fieldNode.getLength() - 1; + valueCount = fieldNode.getLength(); + } + + /** + * Get the buffers belonging to this vector. + * @return the inner buffers. + */ + public List getFieldBuffers() { + // before flight/IPC, we must bring the vector to a consistent state. + // this is because, it is possible that the offset buffers of some trailing values + // are not updated. this may cause some data in the data buffer being lost. + // for details, please see TestValueVector#testUnloadVariableWidthVector. + fillHoles(valueCount); + + List result = new ArrayList<>(3); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + result.add(valueBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + valueBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + valueBuffer.writerIndex(0); + } else { + final long lastDataOffset = getStartOffset(valueCount); + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH); + valueBuffer.writerIndex(lastDataOffset); + } + } + + /** + * Same as {@link #allocateNewSafe()}. + */ + @Override + public void allocateNew() { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. See {@link #allocateNew(long, int)} for allocating memory for specific + * number of elements in the vector. + * + * @return false if memory allocation fails, true otherwise. + */ + @Override + public boolean allocateNewSafe() { + try { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + return true; + } catch (Exception e) { + return false; + } + } + + /** + * Allocate memory for the vector to support storing at least the provided number of + * elements in the vector. This method must be called prior to using the ValueVector. + * + * @param totalBytes desired total memory capacity + * @param valueCount the desired number of elements in the vector + * @throws org.apache.arrow.memory.OutOfMemoryException if memory allocation fails + */ + @Override + public void allocateNew(long totalBytes, int valueCount) { + assert totalBytes >= 0; + + checkDataBufferSize(totalBytes); + computeAndCheckOffsetsBufferSize(valueCount); + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(totalBytes, valueCount); + } catch (Exception e) { + clear(); + throw e; + } + } + + @Override + public void allocateNew(int valueCount) { + allocateNew(lastValueAllocationSizeInBytes, valueCount); + } + + /* Check if the data buffer size is within bounds. */ + private void checkDataBufferSize(long size) { + if (size > MAX_ALLOCATION_SIZE || size < 0) { + throw new OversizedAllocationException("Memory required for vector " + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + } + + /* + * Compute the buffer size required for 'valueCount' offsets and validity, and check if it's + * within bounds. + */ + private long computeAndCheckOffsetsBufferSize(int valueCount) { + /* to track the end offset of last data element in vector, we need + * an additional slot in offset buffer. + */ + final long size = computeCombinedBufferSize(valueCount + 1, OFFSET_WIDTH); + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector capacity " + + valueCount + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + return size; + } + + /* allocate the inner buffers */ + private void allocateBytes(final long valueBufferSize, final int valueCount) { + /* allocate data buffer */ + long curSize = valueBufferSize; + valueBuffer = allocator.buffer(curSize); + valueBuffer.readerIndex(0); + + /* allocate offset buffer and validity buffer */ + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount + 1, OFFSET_WIDTH); + offsetBuffer = buffers.getDataBuf(); + validityBuffer = buffers.getValidityBuf(); + initOffsetBuffer(); + initValidityBuffer(); + + lastValueCapacity = getValueCapacity(); + lastValueAllocationSizeInBytes = capAtMaxInt(valueBuffer.capacity()); + } + + /* allocate offset buffer */ + private void allocateOffsetBuffer(final long size) { + offsetBuffer = allocator.buffer(size); + offsetBuffer.readerIndex(0); + initOffsetBuffer(); + } + + /* allocate validity buffer */ + private void allocateValidityBuffer(final long size) { + validityBuffer = allocator.buffer(size); + validityBuffer.readerIndex(0); + initValidityBuffer(); + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. + */ + public void reAlloc() { + reallocDataBuffer(); + reallocValidityAndOffsetBuffers(); + } + + /** + * Reallocate the data buffer. Data Buffer stores the actual data for + * LARGEVARCHAR or LARGEVARBINARY elements in the vector. The behavior is to double + * the size of buffer. + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocDataBuffer() { + final long currentBufferCapacity = valueBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (lastValueAllocationSizeInBytes > 0) { + newAllocationSize = lastValueAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_BYTE_COUNT * 2; + } + } + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + checkDataBufferSize(newAllocationSize); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, valueBuffer, 0, currentBufferCapacity); + valueBuffer.getReferenceManager().release(); + valueBuffer = newBuf; + lastValueAllocationSizeInBytes = valueBuffer.capacity(); + } + + /** + * Reallocate the validity and offset buffers for this vector. Validity + * buffer is used to track the NULL or NON-NULL nature of elements in + * the vector and offset buffer is used to store the lengths of variable + * width elements in the vector. + * + *

Note that data buffer for variable length vectors moves independent + * of the companion validity and offset buffers. This is in + * contrast to what we have for fixed width vectors. + * + *

So even though we may have setup an initial capacity of 1024 + * elements in the vector, it is quite possible + * that we need to reAlloc() the data buffer when we are setting + * the 5th element in the vector simply because previous + * variable length elements have exhausted the buffer capacity. + * However, we really don't need to reAlloc() validity and + * offset buffers until we try to set the 1025th element + * This is why we do a separate check for safe methods to + * determine which buffer needs reallocation. + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocValidityAndOffsetBuffers() { + int targetOffsetCount = capAtMaxInt((offsetBuffer.capacity() / OFFSET_WIDTH) * 2); + if (targetOffsetCount == 0) { + if (lastValueCapacity > 0) { + targetOffsetCount = (lastValueCapacity + 1); + } else { + targetOffsetCount = 2 * (INITIAL_VALUE_ALLOCATION + 1); + } + } + computeAndCheckOffsetsBufferSize(targetOffsetCount); + + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(targetOffsetCount, OFFSET_WIDTH); + final ArrowBuf newOffsetBuffer = buffers.getDataBuf(); + newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity()); + newOffsetBuffer.setZero(offsetBuffer.capacity(), newOffsetBuffer.capacity() - offsetBuffer.capacity()); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = newOffsetBuffer; + + final ArrowBuf newValidityBuffer = buffers.getValidityBuf(); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); + validityBuffer.getReferenceManager().release(); + validityBuffer = newValidityBuffer; + + lastValueCapacity = getValueCapacity(); + } + + /** + * Get the size (number of bytes) of underlying data buffer. + * @return number of bytes in the data buffer + */ + @Override + public int getByteCapacity() { + return capAtMaxInt(valueBuffer.capacity()); + } + + @Override + public int sizeOfValueBuffer() { + if (valueCount == 0) { + return 0; + } + return capAtMaxInt(offsetBuffer.getLong((long) valueCount * OFFSET_WIDTH)); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. + */ + @Override + public int getBufferSize() { + return getBufferSizeFor(this.valueCount); + } + + /** + * Get the potential buffer size for a particular number of records. + * @param valueCount desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + final long validityBufferSize = getValidityBufferSizeFromCount(valueCount); + final long offsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH; + /* get the end offset for this valueCount */ + final long dataBufferSize = offsetBuffer.getLong((long) valueCount * OFFSET_WIDTH); + return capAtMaxInt(validityBufferSize + offsetBufferSize + dataBufferSize); + } + + /** + * Get information about how this field is materialized. + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this + * vector instance. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + setReaderAndWriterIndex(); + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = new ArrowBuf[3]; + buffers[0] = validityBuffer; + buffers[1] = offsetBuffer; + buffers[2] = valueBuffer; + } + if (clear) { + for (final ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @param callBack not used + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(getName(), allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return TransferPair + */ + public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator); + + /** + * Transfer this vector'data to another vector. The memory associated + * with this vector is transferred to the allocator of target vector + * for accounting and management purposes. + * @param target destination vector for transfer + */ + public void transferTo(BaseLargeVariableWidthVector target) { + compareTypes(target, "transferTo"); + target.clear(); + target.validityBuffer = transferBuffer(validityBuffer, target.allocator); + target.valueBuffer = transferBuffer(valueBuffer, target.allocator); + target.offsetBuffer = transferBuffer(offsetBuffer, target.allocator); + target.setLastSet(this.lastSet); + if (this.valueCount > 0) { + target.setValueCount(this.valueCount); + } + clear(); + } + + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, + BaseLargeVariableWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); + compareTypes(target, "splitAndTransferTo"); + target.clear(); + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); + if (length > 0) { + target.setValueCount(length); + } + } + + /** + * Transfer the offsets along with data. Unlike the data buffer, we cannot simply + * slice the offset buffer for split and transfer. The reason is that offsets + * in the target vector have to be adjusted and made relative to the staring + * offset in source vector from the start index of split. This is why, we + * need to explicitly allocate the offset buffer and set the adjusted offsets + * in the target vector. + */ + private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseLargeVariableWidthVector target) { + final long start = offsetBuffer.getLong((long) startIndex * OFFSET_WIDTH); + final long end = offsetBuffer.getLong((long) (startIndex + length) * OFFSET_WIDTH); + final long dataLength = end - start; + target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); + for (int i = 0; i < length + 1; i++) { + final long relativeSourceOffset = offsetBuffer.getLong((long) (startIndex + i) * OFFSET_WIDTH) - start; + target.offsetBuffer.setLong((long) i * OFFSET_WIDTH, relativeSourceOffset); + } + final ArrowBuf slicedBuffer = valueBuffer.slice(start, dataLength); + target.valueBuffer = transferBuffer(slicedBuffer, target.allocator); + } + + /* + * Transfer the validity. + */ + private void splitAndTransferValidityBuffer(int startIndex, int length, + BaseLargeVariableWidthVector target) { + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.getReferenceManager().release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.getReferenceManager().retain(); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. + */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. + */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } + } + + + /*----------------------------------------------------------------* + | | + | common getters and setters | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the number of elements that are null in the vector. + * + * @return the number of null elements. + */ + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Check if the given index is within the current value capacity + * of the vector. + * + * @param index position to check + * @return true if index is within the current value capacity + */ + public boolean isSafe(int index) { + return index < getValueCapacity(); + } + + /** + * Check if element at given index is null. + * + * @param index position of element + * @return true if element at given index is null + */ + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Same as {@link #isNull(int)}. + * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the value count of vector. This will always be zero unless + * setValueCount(int) has been called prior to calling this. + * + * @return valueCount for the vector + */ + public int getValueCount() { + return valueCount; + } + + /** + * Sets the value count for the vector. + * + * @param valueCount value count + */ + public void setValueCount(int valueCount) { + assert valueCount >= 0; + this.valueCount = valueCount; + while (valueCount > getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + fillHoles(valueCount); + lastSet = valueCount - 1; + setReaderAndWriterIndex(); + } + + /** + * Create holes in the vector upto the given index (exclusive). + * Holes will be created from the current last set position in + * the vector. + * + * @param index target index + */ + public void fillEmpties(int index) { + handleSafe(index, emptyByteArray.length); + fillHoles(index); + lastSet = index - 1; + } + + /** + * Set the index of last non-null element in the vector. + * It is important to call this method with appropriate value + * before calling {@link #setValueCount(int)}. + * + * @param value desired index of last non-null element. + */ + public void setLastSet(int value) { + lastSet = value; + } + + /** + * Get the index of last non-null element in the vector. + * + * @return index of the last non-null element + */ + public int getLastSet() { + return lastSet; + } + + /** + * Mark the particular position in the vector as non-null. + * + * @param index position of the element. + */ + @Override + public void setIndexDefined(int index) { + while (index >= getValidityBufferValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + BitVectorHelper.setBit(validityBuffer, index); + } + + /** + * Sets the value length for an element. + * + * @param index position of the element to set + * @param length length of the element + */ + public void setValueLengthSafe(int index, int length) { + assert index >= 0; + handleSafe(index, length); + fillHoles(index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + lastSet = index; + } + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of element to get + * @return greater than 0 length for non-null element, 0 otherwise + */ + public int getValueLength(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return 0; + } + final long startOffset = getStartOffset(index); + final int dataLength = + (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - startOffset); + return dataLength; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. This is same as using {@link #set(int, byte[], int, int)} + * with start as 0 and length as value.length + * + * @param index position of the element to set + * @param value array of bytes to write + */ + public void set(int index, byte[] value) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[])} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value array of bytes to write + */ + public void setSafe(int index, byte[] value) { + assert index >= 0; + fillEmpties(index); + handleSafe(index, value.length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. + * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in array of bytes + * @param length length of data in array of bytes + */ + public void set(int index, byte[] value, int start, int length) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[], int, int)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in array of bytes + * @param length length of data in array of bytes + */ + public void setSafe(int index, byte[] value, int start, int length) { + assert index >= 0; + fillEmpties(index); + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the + * content in supplied ByteBuffer. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void set(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + valueBuffer.setBytes(startOffset, value, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, ByteBuffer, int, int)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void setSafe(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + fillEmpties(index); + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + valueBuffer.setBytes(startOffset, value, start, length); + lastSet = index; + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + while (index >= getValidityBufferValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int isSet, long start, long end, ArrowBuf buffer) { + assert index >= 0; + final long dataLength = end - start; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, end); + valueBuffer.setBytes(startOffset, buffer, start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, long, long, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int isSet, long start, long end, ArrowBuf buffer) { + assert index >= 0; + final long dataLength = end - start; + fillEmpties(index); + handleSafe(index, (int) dataLength); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, buffer, start, dataLength); + lastSet = index; + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, long start, int length, ArrowBuf buffer) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + final ArrowBuf bb = buffer.slice(start, length); + valueBuffer.setBytes(startOffset, bb); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, long, long, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, long start, int length, ArrowBuf buffer) { + assert index >= 0; + fillEmpties(index); + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + final ArrowBuf bb = buffer.slice(start, length); + valueBuffer.setBytes(startOffset, bb); + lastSet = index; + } + + + /*----------------------------------------------------------------* + | | + | helper methods for setters | + | | + *----------------------------------------------------------------*/ + + + protected final void fillHoles(int index) { + for (int i = lastSet + 1; i < index; i++) { + setBytes(i, emptyByteArray, 0, emptyByteArray.length); + } + lastSet = index - 1; + } + + protected final void setBytes(int index, byte[] value, int start, int length) { + /* end offset of current last element in the vector. this will + * be the start offset of new element we are trying to store. + */ + final long startOffset = getStartOffset(index); + /* set new end offset */ + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); + /* store the var length data in value buffer */ + valueBuffer.setBytes(startOffset, value, start, length); + } + + /** + * Gets the starting offset of a record, given its index. + * @param index index of the record. + * @return the starting offset of the record. + */ + protected final long getStartOffset(int index) { + return offsetBuffer.getLong((long) index * OFFSET_WIDTH); + } + + protected final void handleSafe(int index, int dataLength) { + /* + * IMPORTANT: + * value buffer for variable length vectors moves independent + * of the companion validity and offset buffers. This is in + * contrast to what we have for fixed width vectors. + * + * Here there is no concept of getValueCapacity() in the + * data stream. getValueCapacity() is applicable only to validity + * and offset buffers. + * + * So even though we may have setup an initial capacity of 1024 + * elements in the vector, it is quite possible + * that we need to reAlloc() the data buffer when we are setting + * the 5th element in the vector simply because previous + * variable length elements have exhausted the buffer capacity. + * However, we really don't need to reAlloc() validity and + * offset buffers until we try to set the 1025th element + * This is why we do a separate check for safe methods to + * determine which buffer needs reallocation. + */ + while (index >= getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + final long startOffset = getStartOffset(index); + while (valueBuffer.capacity() < (startOffset + dataLength)) { + reallocDataBuffer(); + } + } + + /** + * Method used by Json Writer to read a variable width element from + * the variable width vector and write to Json. + * + *

This method should not be used externally. + * + * @param data buffer storing the variable width vector elements + * @param offset buffer storing the offsets of variable width vector elements + * @param index position of the element in the vector + * @return array of bytes + */ + public static byte[] get(final ArrowBuf data, final ArrowBuf offset, int index) { + final long currentStartOffset = offset.getLong((long) index * OFFSET_WIDTH); + final int dataLength = + (int) (offset.getLong((long) (index + 1) * OFFSET_WIDTH) - currentStartOffset); + final byte[] result = new byte[dataLength]; + data.getBytes(currentStartOffset, result, 0, dataLength); + return result; + } + + /** + * Method used by Json Reader to explicitly set the offsets of the variable + * width vector data. The method takes care of allocating the memory for + * offsets if the caller hasn't done so. + * + *

This method should not be used externally. + * + * @param buffer ArrowBuf to store offsets for variable width elements + * @param allocator memory allocator + * @param valueCount number of elements + * @param index position of the element + * @param value offset of the element + * @return buffer holding the offsets + */ + public static ArrowBuf set(ArrowBuf buffer, BufferAllocator allocator, + int valueCount, int index, long value) { + if (buffer == null) { + buffer = allocator.buffer((long) valueCount * OFFSET_WIDTH); + } + buffer.setLong((long) index * OFFSET_WIDTH, value); + if (index == (valueCount - 1)) { + buffer.writerIndex((long) valueCount * OFFSET_WIDTH); + } + + return buffer; + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + if (from.isNull(fromIndex)) { + fillHoles(thisIndex); + BitVectorHelper.unsetBit(this.validityBuffer, thisIndex); + final long copyStart = offsetBuffer.getLong((long) thisIndex * OFFSET_WIDTH); + offsetBuffer.setLong((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart); + } else { + final long start = from.getOffsetBuffer().getLong((long) fromIndex * OFFSET_WIDTH); + final long end = from.getOffsetBuffer().getLong((long) (fromIndex + 1) * OFFSET_WIDTH); + final long length = end - start; + fillHoles(thisIndex); + BitVectorHelper.setBit(this.validityBuffer, thisIndex); + final long copyStart = offsetBuffer.getLong((long) thisIndex * OFFSET_WIDTH); + from.getDataBuffer().getBytes(start, this.valueBuffer, copyStart, (int) length); + offsetBuffer.setLong((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + } + lastSet = thisIndex; + } + + /** + * Same as {@link #copyFrom(int, int, ValueVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + if (from.isNull(fromIndex)) { + handleSafe(thisIndex, 0); + fillHoles(thisIndex); + BitVectorHelper.unsetBit(this.validityBuffer, thisIndex); + final long copyStart = offsetBuffer.getLong((long) thisIndex * OFFSET_WIDTH); + offsetBuffer.setLong((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart); + } else { + final long start = from.getOffsetBuffer().getLong((long) fromIndex * OFFSET_WIDTH); + final long end = from.getOffsetBuffer().getLong((long) (fromIndex + 1) * OFFSET_WIDTH); + final int length = (int) (end - start); + handleSafe(thisIndex, length); + fillHoles(thisIndex); + BitVectorHelper.setBit(this.validityBuffer, thisIndex); + final long copyStart = offsetBuffer.getLong((long) thisIndex * OFFSET_WIDTH); + from.getDataBuffer().getBytes(start, this.valueBuffer, copyStart, length); + offsetBuffer.setLong((long) (thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + } + lastSet = thisIndex; + } + + @Override + public ArrowBufPointer getDataPointer(int index) { + return getDataPointer(index, new ArrowBufPointer()); + } + + @Override + public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) { + if (isNull(index)) { + reuse.set(null, 0, 0); + } else { + long offset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); + int length = (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - offset); + reuse.set(valueBuffer, offset, length); + } + return reuse; + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isNull(index)) { + return ArrowBufPointer.NULL_HASH_CODE; + } + final long start = getStartOffset(index); + final long end = getStartOffset(index + 1); + return ByteFunctionHelpers.hash(hasher, this.getDataBuffer(), start, end); + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java index d50568b9877..ff2a786b2f0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -48,6 +48,7 @@ public String getName() { private static final BufferLayout VALIDITY_BUFFER = new BufferLayout(BufferType.VALIDITY, 1); private static final BufferLayout OFFSET_BUFFER = new BufferLayout(BufferType.OFFSET, 32); + private static final BufferLayout LARGE_OFFSET_BUFFER = new BufferLayout(BufferType.OFFSET, 64); private static final BufferLayout TYPE_BUFFER = new BufferLayout(BufferType.TYPE, 32); private static final BufferLayout BIT_BUFFER = new BufferLayout(BufferType.DATA, 1); private static final BufferLayout VALUES_128 = new BufferLayout(BufferType.DATA, 128); @@ -64,6 +65,10 @@ public static BufferLayout offsetBuffer() { return OFFSET_BUFFER; } + public static BufferLayout largeOffsetBuffer() { + return LARGE_OFFSET_BUFFER; + } + /** * Returns a databuffer for the given bitwidth. Only supports powers of two between 8 and 128 * inclusive. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java new file mode 100644 index 00000000000..fee4141de9a --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.LargeVarBinaryReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.LargeVarBinaryHolder; +import org.apache.arrow.vector.holders.NullableLargeVarBinaryHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * LargeVarBinaryVector implements a large variable width vector of binary + * values which could be NULL. A validity buffer (bit vector) is maintained + * to track which elements in the vector are null. + * The size of the underlying buffer can be over 2GB. + */ +public final class LargeVarBinaryVector extends BaseLargeVariableWidthVector { + private final FieldReader reader; + + /** + * Instantiate a LargeVarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public LargeVarBinaryVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.LARGEVARBINARY.getType()), allocator); + } + + /** + * Instantiate a LargeVarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public LargeVarBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a LargeVarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public LargeVarBinaryVector(Field field, BufferAllocator allocator) { + super(field, allocator); + reader = new LargeVarBinaryReaderImpl(LargeVarBinaryVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.LARGEVARBINARY; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the variable length element at specified index as byte array. + * + * @param index position of element to get + * @return array of bytes for non-null element, null otherwise + */ + public byte[] get(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return null; + } + final long startOffset = getStartOffset(index); + final int dataLength = + (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - startOffset); + final byte[] result = new byte[dataLength]; + valueBuffer.getBytes(startOffset, result, 0, dataLength); + return result; + } + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of element to get + * @return byte array for non-null element, null otherwise + */ + public byte[] getObject(int index) { + return get(index); + } + + /** + * Get the variable length element at specified index and sets the state + * in provided holder. + * + * @param index position of element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableLargeVarBinaryHolder holder) { + assert index >= 0; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.start = getStartOffset(index); + holder.end = offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH); + holder.buffer = valueBuffer; + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, LargeVarBinaryHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int dataLength = (int) (holder.end - holder.start); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, LargeVarBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, LargeVarBinaryHolder holder) { + assert index >= 0; + final int dataLength = (int) (holder.end - holder.start); + fillEmpties(index); + handleSafe(index, dataLength); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableLargeVarBinaryHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final long startOffset = getStartOffset(index); + if (holder.isSet != 0) { + final int dataLength = (int) (holder.end - holder.start); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset); + } + lastSet = index; + } + + /** + * Same as {@link #set(int, NullableLargeVarBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, NullableLargeVarBinaryHolder holder) { + assert index >= 0; + fillEmpties(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final long startOffset = getStartOffset(index); + if (holder.isSet != 0) { + final int dataLength = (int) (holder.end - holder.start); + handleSafe(index, dataLength); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + handleSafe(index, 0); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset); + } + lastSet = index; + } + + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((LargeVarBinaryVector) to); + } + + private class TransferImpl implements TransferPair { + LargeVarBinaryVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new LargeVarBinaryVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(LargeVarBinaryVector to) { + this.to = to; + } + + @Override + public LargeVarBinaryVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, LargeVarBinaryVector.this); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java new file mode 100644 index 00000000000..fc37c76d516 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java @@ -0,0 +1,332 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.LargeVarCharReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.LargeVarCharHolder; +import org.apache.arrow.vector.holders.NullableLargeVarCharHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.Text; +import org.apache.arrow.vector.util.TransferPair; + +/** + * LargeVarCharVector implements a variable width vector of VARCHAR + * values which could be NULL. A validity buffer (bit vector) is maintained + * to track which elements in the vector are null. + *

+ * The offset width of this vector is 8, so the underlying buffer can be larger than 2GB. + *

+ */ +public final class LargeVarCharVector extends BaseLargeVariableWidthVector { + private final FieldReader reader; + + /** + * Instantiate a LargeVarCharVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public LargeVarCharVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.LARGEVARCHAR.getType()), allocator); + } + + /** + * Instantiate a LargeVarCharVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public LargeVarCharVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a LargeVarCharVector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public LargeVarCharVector(Field field, BufferAllocator allocator) { + super(field, allocator); + reader = new LargeVarCharReaderImpl(LargeVarCharVector.this); + } + + /** + * Get a reader that supports reading values from this vector. + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.LARGEVARCHAR; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the variable length element at specified index as byte array. + * + * @param index position of element to get + * @return array of bytes for non-null element, null otherwise + */ + public byte[] get(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return null; + } + final long startOffset = getStartOffset(index); + final int dataLength = + (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - startOffset); + final byte[] result = new byte[dataLength]; + valueBuffer.getBytes(startOffset, result, 0, dataLength); + return result; + } + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of element to get + * @return Text object for non-null element, null otherwise + */ + public Text getObject(int index) { + byte[] b = get(index); + if (b == null) { + return null; + } else { + return new Text(b); + } + } + + /** + * Get the variable length element at specified index and sets the state + * in provided holder. + * + * @param index position of element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableLargeVarCharHolder holder) { + assert index >= 0; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.start = getStartOffset(index); + holder.end = offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH); + holder.buffer = valueBuffer; + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, LargeVarCharHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setBit(validityBuffer, index); + final int dataLength = (int) (holder.end - holder.start); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, LargeVarCharHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, LargeVarCharHolder holder) { + assert index >= 0; + final int dataLength = (int) (holder.end - holder.start); + fillEmpties(index); + handleSafe(index, dataLength); + BitVectorHelper.setBit(validityBuffer, index); + final long startOffset = getStartOffset(index); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableLargeVarCharHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final long startOffset = getStartOffset(index); + if (holder.isSet != 0) { + final int dataLength = (int) (holder.end - holder.start); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset); + } + lastSet = index; + } + + /** + * Same as {@link #set(int, NullableLargeVarCharHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, NullableLargeVarCharHolder holder) { + assert index >= 0; + fillEmpties(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final long startOffset = getStartOffset(index); + if (holder.isSet != 0) { + final int dataLength = (int) (holder.end - holder.start); + handleSafe(index, dataLength); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + } else { + handleSafe(index, 0); + offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset); + } + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the + * content in supplied Text. + * + * @param index position of the element to set + * @param text Text object with data + */ + public void set(int index, Text text) { + set(index, text.getBytes(), 0, text.getLength()); + } + + /** + * Same as {@link #set(int, NullableLargeVarCharHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set. + * @param text Text object with data + */ + public void setSafe(int index, Text text) { + setSafe(index, text.getBytes(), 0, text.getLength()); + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising of this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new LargeVarCharVector.TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new LargeVarCharVector.TransferImpl((LargeVarCharVector) to); + } + + private class TransferImpl implements TransferPair { + LargeVarCharVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new LargeVarCharVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(LargeVarCharVector to) { + this.to = to; + } + + @Override + public LargeVarCharVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, LargeVarCharVector.this); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index 7bc3de96362..a3999050c32 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -37,6 +37,8 @@ import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.Interval; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeBinary; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; import org.apache.arrow.vector.types.pojo.ArrowType.Map; import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.ArrowType.Struct; @@ -169,11 +171,26 @@ public TypeLayout visit(Utf8 type) { return newVariableWidthTypeLayout(); } + @Override + public TypeLayout visit(LargeUtf8 type) { + return newLargeVariableWidthTypeLayout(); + } + + @Override + public TypeLayout visit(LargeBinary type) { + return newLargeVariableWidthTypeLayout(); + } + private TypeLayout newVariableWidthTypeLayout() { return newPrimitiveTypeLayout(BufferLayout.validityVector(), BufferLayout.offsetBuffer(), BufferLayout.byteVector()); } + private TypeLayout newLargeVariableWidthTypeLayout() { + return newPrimitiveTypeLayout(BufferLayout.validityVector(), BufferLayout.largeOffsetBuffer(), + BufferLayout.byteVector()); + } + private TypeLayout newPrimitiveTypeLayout(BufferLayout... vectors) { return new TypeLayout(asList(vectors)); } @@ -320,6 +337,16 @@ public Integer visit(Utf8 type) { return VARIABLE_WIDTH_BUFFER_COUNT; } + @Override + public Integer visit(LargeUtf8 type) { + return VARIABLE_WIDTH_BUFFER_COUNT; + } + + @Override + public Integer visit(LargeBinary type) { + return VARIABLE_WIDTH_BUFFER_COUNT; + } + @Override public Integer visit(Null type) { return 0; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index ce9cfdeb272..fd615adc310 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.memory.util.ByteFunctionHelpers; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; @@ -146,6 +147,11 @@ public Boolean visit(BaseVariableWidthVector left, Range range) { return compareBaseVariableWidthVectors(range); } + @Override + public Boolean visit(BaseLargeVariableWidthVector left, Range value) { + throw new UnsupportedOperationException(); + } + @Override public Boolean visit(ListVector left, Range range) { if (!validate(left)) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java index 2112e3c82e6..ef115b7ba9f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -21,6 +21,7 @@ import java.util.Objects; import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; @@ -77,6 +78,11 @@ public Boolean visit(BaseVariableWidthVector left, Void value) { return compareField(left.getField(), right.getField()); } + @Override + public Boolean visit(BaseLargeVariableWidthVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + @Override public Boolean visit(ListVector left, Void value) { return compareField(left.getField(), right.getField()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java index 6a52fae04e1..a33c56a5736 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java @@ -18,6 +18,7 @@ package org.apache.arrow.vector.compare; import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.complex.DenseUnionVector; @@ -37,6 +38,8 @@ public interface VectorVisitor { OUT visit(BaseVariableWidthVector left, IN value); + OUT visit(BaseLargeVariableWidthVector left, IN value); + OUT visit(ListVector left, IN value); OUT visit(FixedSizeListVector left, IN value); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 1a070a1d4b1..40d20842d5d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -37,6 +37,8 @@ import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.IntervalDayVector; import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeMicroVector; @@ -78,6 +80,8 @@ import org.apache.arrow.vector.complex.impl.IntWriterImpl; import org.apache.arrow.vector.complex.impl.IntervalDayWriterImpl; import org.apache.arrow.vector.complex.impl.IntervalYearWriterImpl; +import org.apache.arrow.vector.complex.impl.LargeVarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.LargeVarCharWriterImpl; import org.apache.arrow.vector.complex.impl.NullableStructWriter; import org.apache.arrow.vector.complex.impl.SmallIntWriterImpl; import org.apache.arrow.vector.complex.impl.TimeMicroWriterImpl; @@ -115,6 +119,8 @@ import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.Interval; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeBinary; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; import org.apache.arrow.vector.types.pojo.ArrowType.List; import org.apache.arrow.vector.types.pojo.ArrowType.Map; import org.apache.arrow.vector.types.pojo.ArrowType.Null; @@ -464,6 +470,34 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new VarCharWriterImpl((VarCharVector) vector); } }, + LARGEVARCHAR(LargeUtf8.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new LargeVarCharVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new LargeVarCharWriterImpl((LargeVarCharVector) vector); + } + }, + LARGEVARBINARY(LargeBinary.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new LargeVarBinaryVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new LargeVarBinaryWriterImpl((LargeVarBinaryVector) vector); + } + }, VARBINARY(Binary.INSTANCE) { @Override public FieldVector getNewVector( @@ -818,11 +852,21 @@ public MinorType visit(Utf8 type) { return MinorType.VARCHAR; } + @Override + public Types.MinorType visit(LargeUtf8 type) { + return MinorType.LARGEVARCHAR; + } + @Override public MinorType visit(Binary type) { return MinorType.VARBINARY; } + @Override + public MinorType visit(LargeBinary type) { + return MinorType.LARGEVARBINARY; + } + @Override public MinorType visit(Bool type) { return MinorType.BIT; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index 54d9db9fd68..968caa57d68 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -21,6 +21,7 @@ import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.NullVector; @@ -131,6 +132,11 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { return targetVector; } + @Override + public ValueVector visit(BaseLargeVariableWidthVector left, Void value) { + throw new UnsupportedOperationException(); + } + @Override public ValueVector visit(ListVector deltaVector, Void value) { Preconditions.checkArgument(typeVisitor.equals(deltaVector), diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java new file mode 100644 index 00000000000..7c19dc1cf92 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.holders.NullableLargeVarBinaryHolder; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import io.netty.buffer.ArrowBuf; + +public class TestLargeVarBinaryVector { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testSetNullableLargeVarBinaryHolder() { + try (LargeVarBinaryVector vector = new LargeVarBinaryVector("", allocator)) { + vector.allocateNew(100, 10); + + NullableLargeVarBinaryHolder nullHolder = new NullableLargeVarBinaryHolder(); + nullHolder.isSet = 0; + + NullableLargeVarBinaryHolder binHolder = new NullableLargeVarBinaryHolder(); + binHolder.isSet = 1; + + String str = "hello"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + binHolder.start = 0; + binHolder.end = str.length(); + binHolder.buffer = buf; + + vector.set(0, nullHolder); + vector.set(1, binHolder); + + // verify results + assertTrue(vector.isNull(0)); + assertEquals(str, new String(vector.get(1))); + + buf.close(); + } + } + + @Test + public void testSetNullableLargeVarBinaryHolderSafe() { + try (LargeVarBinaryVector vector = new LargeVarBinaryVector("", allocator)) { + vector.allocateNew(5, 1); + + NullableLargeVarBinaryHolder nullHolder = new NullableLargeVarBinaryHolder(); + nullHolder.isSet = 0; + + NullableLargeVarBinaryHolder binHolder = new NullableLargeVarBinaryHolder(); + binHolder.isSet = 1; + + String str = "hello world"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + binHolder.start = 0; + binHolder.end = str.length(); + binHolder.buffer = buf; + + vector.setSafe(0, binHolder); + vector.setSafe(1, nullHolder); + + // verify results + assertEquals(str, new String(vector.get(0))); + assertTrue(vector.isNull(1)); + + buf.close(); + } + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java new file mode 100644 index 00000000000..530fac71f07 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java @@ -0,0 +1,817 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.holders.NullableLargeVarCharHolder; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.jupiter.api.Assertions; + +import io.netty.buffer.ArrowBuf; + +public class TestLargeVarCharVector { + + private static final byte[] STR1 = "AAAAA1".getBytes(); + private static final byte[] STR2 = "BBBBBBBBB2".getBytes(); + private static final byte[] STR3 = "CCCC3".getBytes(); + private static final byte[] STR4 = "DDDDDDDD4".getBytes(); + private static final byte[] STR5 = "EEE5".getBytes(); + private static final byte[] STR6 = "FFFFF6".getBytes(); + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testTransfer() { + try (BufferAllocator childAllocator1 = allocator.newChildAllocator("child1", 1000000, 1000000); + BufferAllocator childAllocator2 = allocator.newChildAllocator("child2", 1000000, 1000000); + LargeVarCharVector v1 = new LargeVarCharVector("v1", childAllocator1); + LargeVarCharVector v2 = new LargeVarCharVector("v2", childAllocator2);) { + v1.allocateNew(); + v1.setSafe(4094, "hello world".getBytes(), 0, 11); + v1.setValueCount(4001); + + long memoryBeforeTransfer = childAllocator1.getAllocatedMemory(); + + v1.makeTransferPair(v2).transfer(); + + assertEquals(0, childAllocator1.getAllocatedMemory()); + assertEquals(memoryBeforeTransfer, childAllocator2.getAllocatedMemory()); + } + } + + @Test + public void testCopyValueSafe() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator); + final LargeVarCharVector newLargeVarCharVector = new LargeVarCharVector("newvector", allocator)) { + largeVarCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.makeTransferPair(newLargeVarCharVector); + + // new vector memory is not pre-allocated, we expect copyValueSafe work fine. + for (int i = 0; i < valueCount; i++) { + tp.copyValueSafe(i, i); + } + newLargeVarCharVector.setValueCount(valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + assertFalse(largeVarCharVector.isNull(i)); + assertFalse(newLargeVarCharVector.isNull(i)); + assertArrayEquals(largeVarCharVector.get(i), newLargeVarCharVector.get(i)); + } else { + assertTrue(newLargeVarCharVector.isNull(i)); + } + } + } + } + + @Test + public void testSplitAndTransferNon() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator)) { + + largeVarCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.getTransferPair(allocator); + try (LargeVarCharVector newLargeVarCharVector = (LargeVarCharVector) tp.getTo()) { + + tp.splitAndTransfer(0, 0); + assertEquals(0, newLargeVarCharVector.getValueCount()); + } + } + } + + @Test + public void testSplitAndTransferAll() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator)) { + + largeVarCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.getTransferPair(allocator); + try (LargeVarCharVector newLargeVarCharVector = (LargeVarCharVector) tp.getTo()) { + + tp.splitAndTransfer(0, valueCount); + assertEquals(valueCount, newLargeVarCharVector.getValueCount()); + } + } + } + + @Test + public void testInvalidStartIndex() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator); + final LargeVarCharVector newLargeVarCharVector = new LargeVarCharVector("newvector", allocator)) { + + largeVarCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.makeTransferPair(newLargeVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(valueCount, 10)); + + assertEquals("Invalid startIndex: 500", e.getMessage()); + } + } + + @Test + public void testInvalidLength() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator); + final LargeVarCharVector newLargeVarCharVector = new LargeVarCharVector("newvector", allocator)) { + + largeVarCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateLargeVarcharVector(largeVarCharVector, valueCount, null); + + final TransferPair tp = largeVarCharVector.makeTransferPair(newLargeVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(0, valueCount * 2)); + + assertEquals("Invalid length: 1000", e.getMessage()); + } + } + + @Test /* LargeVarCharVector */ + public void testSizeOfValueBuffer() { + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + int valueCount = 100; + int currentSize = 0; + vector.setInitialCapacity(valueCount); + vector.allocateNew(); + vector.setValueCount(valueCount); + for (int i = 0; i < valueCount; i++) { + currentSize += i; + vector.setSafe(i, new byte[i]); + } + + assertEquals(currentSize, vector.sizeOfValueBuffer()); + } + } + + @Test + public void testSetLastSetUsage() { + final byte[] STR1 = "AAAAA1".getBytes(); + final byte[] STR2 = "BBBBBBBBB2".getBytes(); + final byte[] STR3 = "CCCC3".getBytes(); + final byte[] STR4 = "DDDDDDDD4".getBytes(); + final byte[] STR5 = "EEE5".getBytes(); + final byte[] STR6 = "FFFFF6".getBytes(); + + try (final LargeVarCharVector vector = new LargeVarCharVector("myvector", allocator)) { + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + setBytes(4, STR5, vector); + setBytes(5, STR6, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + /* + * If we don't do setLastSe(5) before setValueCount(), then the latter will corrupt + * the value vector by filling in all positions [0,valuecount-1] will empty byte arrays. + * Run the test by commenting out next line and we should see incorrect vector output. + */ + vector.setLastSet(5); + vector.setValueCount(20); + + /* Check current lastSet */ + assertEquals(19, vector.getLastSet()); + + /* Check the vector output again */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertEquals(0, vector.getValueLength(10)); + assertEquals(0, vector.getValueLength(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + assertEquals(0, vector.getValueLength(15)); + assertEquals(0, vector.getValueLength(16)); + assertEquals(0, vector.getValueLength(17)); + assertEquals(0, vector.getValueLength(18)); + assertEquals(0, vector.getValueLength(19)); + + /* Check offsets */ + assertEquals(0, vector.offsetBuffer.getLong(0 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(6, vector.offsetBuffer.getLong(1 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(16, vector.offsetBuffer.getLong(2 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(21, vector.offsetBuffer.getLong(3 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(30, vector.offsetBuffer.getLong(4 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(34, vector.offsetBuffer.getLong(5 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(6 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(7 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(8 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(9 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(10 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(11 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(12 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(13 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(14 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(15 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(16 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(17 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(18 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getLong(19 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + vector.set(19, STR6); + assertArrayEquals(STR6, vector.get(19)); + assertEquals(40, vector.offsetBuffer.getLong(19 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(46, vector.offsetBuffer.getLong(20 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + } + } + + @Test(expected = OutOfMemoryException.class) + public void testVectorAllocateNew() { + try (RootAllocator smallAllocator = new RootAllocator(200); + LargeVarCharVector vector = new LargeVarCharVector("vec", smallAllocator)) { + vector.allocateNew(); + } + } + + @Test(expected = OversizedAllocationException.class) + public void testLargeVariableVectorReallocation() { + final LargeVarCharVector vector = new LargeVarCharVector("vector", allocator); + // edge case 1: value count = MAX_VALUE_ALLOCATION + final long expectedAllocationInBytes = BaseValueVector.MAX_ALLOCATION_SIZE; + final int expectedOffsetSize = 10; + try { + vector.allocateNew(expectedAllocationInBytes, 10); + assertTrue(expectedOffsetSize <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes <= vector.getDataBuffer().capacity()); + vector.reAlloc(); + assertTrue(expectedOffsetSize * 2 <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes * 2 <= vector.getDataBuffer().capacity()); + } finally { + vector.close(); + } + + // common: value count < MAX_VALUE_ALLOCATION + try { + vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 2, 0); + vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION + vector.reAlloc(); // this tests if it overflows + } finally { + vector.close(); + } + } + + @Test + public void testSplitAndTransfer() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("myvector", allocator)) { + largeVarCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + final String[] compareArray = new String[valueCount]; + + populateLargeVarcharVector(largeVarCharVector, valueCount, compareArray); + + final TransferPair tp = largeVarCharVector.getTransferPair(allocator); + try (final LargeVarCharVector newLargeVarCharVector = (LargeVarCharVector) tp.getTo()) { + final int[][] startLengths = {{0, 201}, {201, 0}, {201, 200}, {401, 99}}; + + for (final int[] startLength : startLengths) { + final int start = startLength[0]; + final int length = startLength[1]; + tp.splitAndTransfer(start, length); + for (int i = 0; i < length; i++) { + final boolean expectedSet = ((start + i) % 3) == 0; + if (expectedSet) { + final byte[] expectedValue = compareArray[start + i].getBytes(); + assertFalse(newLargeVarCharVector.isNull(i)); + assertArrayEquals(expectedValue, newLargeVarCharVector.get(i)); + } else { + assertTrue(newLargeVarCharVector.isNull(i)); + } + } + } + } + } + } + + @Test + public void testReallocAfterVectorTransfer() { + final byte[] STR1 = "AAAAA1".getBytes(); + final byte[] STR2 = "BBBBBBBBB2".getBytes(); + + try (final LargeVarCharVector vector = new LargeVarCharVector("vector", allocator)) { + /* 4096 values with 10 byte per record */ + vector.allocateNew(4096 * 10, 4096); + int valueCapacity = vector.getValueCapacity(); + assertTrue(valueCapacity >= 4096); + + /* populate the vector */ + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* trigger first realloc */ + vector.setSafe(valueCapacity, STR2, 0, STR2.length); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* trigger second realloc */ + vector.setSafe(valueCapacity + 10, STR2, 0, STR2.length); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 1) { + vector.set(i, STR1); + } else { + vector.set(i, STR2); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, vector.get(i)); + } else { + assertArrayEquals(STR2, vector.get(i)); + } + } + + /* we are potentially working with 4x the size of vector buffer + * that we initially started with. Now let's transfer the vector. + */ + + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + try (LargeVarCharVector toVector = (LargeVarCharVector)transferPair.getTo()) { + valueCapacity = toVector.getValueCapacity(); + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, toVector.get(i)); + } else { + assertArrayEquals(STR2, toVector.get(i)); + } + } + } + } + } + + @Test + public void testVectorLoadUnload() { + try (final LargeVarCharVector vector1 = new LargeVarCharVector("myvector", allocator)) { + + ValueVectorDataPopulator.setVector(vector1, STR1, STR2, STR3, STR4, STR5, STR6); + + assertEquals(5, vector1.getLastSet()); + vector1.setValueCount(15); + assertEquals(14, vector1.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector1.get(0)); + assertArrayEquals(STR2, vector1.get(1)); + assertArrayEquals(STR3, vector1.get(2)); + assertArrayEquals(STR4, vector1.get(3)); + assertArrayEquals(STR5, vector1.get(4)); + assertArrayEquals(STR6, vector1.get(5)); + + Field field = vector1.getField(); + String fieldName = field.getName(); + + List fields = new ArrayList<>(); + List fieldVectors = new ArrayList<>(); + + fields.add(field); + fieldVectors.add(vector1); + + Schema schema = new Schema(fields); + + VectorSchemaRoot schemaRoot1 = new VectorSchemaRoot(schema, fieldVectors, vector1.getValueCount()); + VectorUnloader vectorUnloader = new VectorUnloader(schemaRoot1); + + try ( + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + VectorSchemaRoot schemaRoot2 = VectorSchemaRoot.create(schema, allocator); + ) { + + VectorLoader vectorLoader = new VectorLoader(schemaRoot2); + vectorLoader.load(recordBatch); + + LargeVarCharVector vector2 = (LargeVarCharVector) schemaRoot2.getVector(fieldName); + /* + * lastSet would have internally been set by VectorLoader.load() when it invokes + * loadFieldBuffers. + */ + assertEquals(14, vector2.getLastSet()); + vector2.setValueCount(25); + assertEquals(24, vector2.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector2.get(0)); + assertArrayEquals(STR2, vector2.get(1)); + assertArrayEquals(STR3, vector2.get(2)); + assertArrayEquals(STR4, vector2.get(3)); + assertArrayEquals(STR5, vector2.get(4)); + assertArrayEquals(STR6, vector2.get(5)); + } + } + } + + @Test + public void testFillEmptiesUsage() { + try (final LargeVarCharVector vector = new LargeVarCharVector("myvector", allocator)) { + + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + setBytes(4, STR5, vector); + setBytes(5, STR6, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + vector.setLastSet(5); + /* fill empty byte arrays from index [6, 9] */ + vector.fillEmpties(10); + + /* Check current lastSet */ + assertEquals(9, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + + setBytes(10, STR1, vector); + setBytes(11, STR2, vector); + + vector.setLastSet(11); + /* fill empty byte arrays from index [12, 14] */ + vector.setValueCount(15); + + /* Check current lastSet */ + assertEquals(14, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertArrayEquals(STR1, vector.get(10)); + assertArrayEquals(STR2, vector.get(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + + /* Check offsets */ + assertEquals(0, + vector.offsetBuffer.getLong(0 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(6, + vector.offsetBuffer.getLong(1 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(16, + vector.offsetBuffer.getLong(2 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(21, + vector.offsetBuffer.getLong(3 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(30, + vector.offsetBuffer.getLong(4 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(34, + vector.offsetBuffer.getLong(5 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(40, + vector.offsetBuffer.getLong(6 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getLong(7 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getLong(8 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getLong(9 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getLong(10 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(46, + vector.offsetBuffer.getLong(11 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getLong(12 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(56, + vector.offsetBuffer.getLong(13 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getLong(14 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getLong(15 * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + } + } + + @Test + public void testGetBufferAddress1() { + try (final LargeVarCharVector vector = new LargeVarCharVector("myvector", allocator)) { + + ValueVectorDataPopulator.setVector(vector, STR1, STR2, STR3, STR4, STR5, STR6); + vector.setValueCount(15); + + /* check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + List buffers = vector.getFieldBuffers(); + long bitAddress = vector.getValidityBufferAddress(); + long offsetAddress = vector.getOffsetBufferAddress(); + long dataAddress = vector.getDataBufferAddress(); + + assertEquals(3, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(offsetAddress, buffers.get(1).memoryAddress()); + assertEquals(dataAddress, buffers.get(2).memoryAddress()); + } + } + + @Test + public void testSetNullableLargeVarCharHolder() { + try (LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + vector.allocateNew(100, 10); + + NullableLargeVarCharHolder nullHolder = new NullableLargeVarCharHolder(); + nullHolder.isSet = 0; + + NullableLargeVarCharHolder stringHolder = new NullableLargeVarCharHolder(); + stringHolder.isSet = 1; + + String str = "hello"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + stringHolder.start = 0; + stringHolder.end = str.length(); + stringHolder.buffer = buf; + + vector.set(0, nullHolder); + vector.set(1, stringHolder); + + // verify results + assertTrue(vector.isNull(0)); + assertEquals(str, new String(vector.get(1))); + + buf.close(); + } + } + + @Test + public void testSetNullableLargeVarCharHolderSafe() { + try (LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + vector.allocateNew(5, 1); + + NullableLargeVarCharHolder nullHolder = new NullableLargeVarCharHolder(); + nullHolder.isSet = 0; + + NullableLargeVarCharHolder stringHolder = new NullableLargeVarCharHolder(); + stringHolder.isSet = 1; + + String str = "hello world"; + ArrowBuf buf = allocator.buffer(16); + buf.setBytes(0, str.getBytes()); + + stringHolder.start = 0; + stringHolder.end = str.length(); + stringHolder.buffer = buf; + + vector.setSafe(0, stringHolder); + vector.setSafe(1, nullHolder); + + // verify results + assertEquals(str, new String(vector.get(0))); + assertTrue(vector.isNull(1)); + + buf.close(); + } + } + + @Test + public void testGetNullFromLargeVariableWidthVector() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("largevarcharvec", allocator); + final LargeVarBinaryVector largeVarBinaryVector = new LargeVarBinaryVector("largevarbinary", allocator)) { + largeVarCharVector.allocateNew(10, 1); + largeVarBinaryVector.allocateNew(10, 1); + + largeVarCharVector.setNull(0); + largeVarBinaryVector.setNull(0); + + assertNull(largeVarCharVector.get(0)); + assertNull(largeVarBinaryVector.get(0)); + } + } + + @Test + public void testLargeVariableWidthVectorNullHashCode() { + try (LargeVarCharVector largeVarChVec = new LargeVarCharVector("large var char vector", allocator)) { + largeVarChVec.allocateNew(100, 1); + largeVarChVec.setValueCount(1); + + largeVarChVec.set(0, "abc".getBytes()); + largeVarChVec.setNull(0); + + assertEquals(0, largeVarChVec.hashCode(0)); + } + } + + @Test + public void testUnloadLargeVariableWidthVector() { + try (final LargeVarCharVector largeVarCharVector = new LargeVarCharVector("var char", allocator)) { + largeVarCharVector.allocateNew(5, 2); + largeVarCharVector.setValueCount(2); + + largeVarCharVector.set(0, "abcd".getBytes()); + + List bufs = largeVarCharVector.getFieldBuffers(); + assertEquals(3, bufs.size()); + + ArrowBuf offsetBuf = bufs.get(1); + ArrowBuf dataBuf = bufs.get(2); + + assertEquals(24, offsetBuf.writerIndex()); + assertEquals(4, offsetBuf.getLong(8)); + assertEquals(4, offsetBuf.getLong(16)); + + assertEquals(4, dataBuf.writerIndex()); + } + } + + @Test + public void testNullableType() { + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + vector.setInitialCapacity(512); + vector.allocateNew(); + + assertTrue(vector.getValueCapacity() >= 512); + int initialCapacity = vector.getValueCapacity(); + + try { + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); + Assert.fail("Expected out of bounds exception"); + } catch (Exception e) { + // ok + } + + vector.reAlloc(); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); + + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); + assertEquals("foo", new String(vector.get(initialCapacity), StandardCharsets.UTF_8)); + } + } + + private void populateLargeVarcharVector(final LargeVarCharVector vector, int valueCount, String[] values) { + for (int i = 0; i < valueCount; i += 3) { + final String s = String.format("%010d", i); + vector.set(i, s.getBytes()); + if (values != null) { + values[i] = s; + } + } + vector.setValueCount(valueCount); + } + + public static void setBytes(int index, byte[] bytes, LargeVarCharVector vector) { + final long currentOffset = vector.offsetBuffer.getLong((long) index * BaseLargeVariableWidthVector.OFFSET_WIDTH); + + BitVectorHelper.setBit(vector.validityBuffer, index); + vector.offsetBuffer.setLong( + (long) (index + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH, currentOffset + bytes.length); + vector.valueBuffer.setBytes(currentOffset, bytes, 0, bytes.length); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index a5152c89462..c9075f00ac9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -1598,7 +1598,7 @@ public void testSetLastSetUsage() { setBytes(5, STR6, vector); /* Check current lastSet */ - assertEquals(Integer.toString(-1), Integer.toString(vector.getLastSet())); + assertEquals(-1, vector.getLastSet()); /* Check the vector output */ assertArrayEquals(STR1, vector.get(0)); @@ -1617,7 +1617,7 @@ public void testSetLastSetUsage() { vector.setValueCount(20); /* Check current lastSet */ - assertEquals(Integer.toString(19), Integer.toString(vector.getLastSet())); + assertEquals(19, vector.getLastSet()); /* Check the vector output again */ assertArrayEquals(STR1, vector.get(0)); @@ -1626,74 +1626,48 @@ public void testSetLastSetUsage() { assertArrayEquals(STR4, vector.get(3)); assertArrayEquals(STR5, vector.get(4)); assertArrayEquals(STR6, vector.get(5)); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(6))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(7))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(8))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(9))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(10))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(11))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(12))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(13))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(14))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(15))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(16))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(17))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(18))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(19))); - /* Check offsets */ - assertEquals(Integer.toString(0), - Integer.toString(vector.offsetBuffer.getInt(0 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(6), - Integer.toString(vector.offsetBuffer.getInt(1 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(16), - Integer.toString(vector.offsetBuffer.getInt(2 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(21), - Integer.toString(vector.offsetBuffer.getInt(3 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(30), - Integer.toString(vector.offsetBuffer.getInt(4 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(34), - Integer.toString(vector.offsetBuffer.getInt(5 * vector.OFFSET_WIDTH))); - - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(6 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(7 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(8 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(9 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(10 * vector.OFFSET_WIDTH))); - - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(11 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(12 * vector.OFFSET_WIDTH))); - - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(13 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(14 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(15 * vector.OFFSET_WIDTH))); - - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(16 * vector.OFFSET_WIDTH))); - - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(17 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(18 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(19 * vector.OFFSET_WIDTH))); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertEquals(0, vector.getValueLength(10)); + assertEquals(0, vector.getValueLength(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + assertEquals(0, vector.getValueLength(15)); + assertEquals(0, vector.getValueLength(16)); + assertEquals(0, vector.getValueLength(17)); + assertEquals(0, vector.getValueLength(18)); + assertEquals(0, vector.getValueLength(19)); + /* Check offsets */ + assertEquals(0, vector.offsetBuffer.getInt(0 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(6, vector.offsetBuffer.getInt(1 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(16, vector.offsetBuffer.getInt(2 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(21, vector.offsetBuffer.getInt(3 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(30, vector.offsetBuffer.getInt(4 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(34, vector.offsetBuffer.getInt(5 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(6 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(7 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(8 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(9 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(10 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(11 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(12 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(13 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(14 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(15 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(16 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(17 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(18 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, vector.offsetBuffer.getInt(19 * BaseVariableWidthVector.OFFSET_WIDTH)); + vector.set(19, STR6); assertArrayEquals(STR6, vector.get(19)); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(19 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(46), - Integer.toString(vector.offsetBuffer.getInt(20 * vector.OFFSET_WIDTH))); + assertEquals(40, vector.offsetBuffer.getInt(19 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(46, vector.offsetBuffer.getInt(20 * BaseVariableWidthVector.OFFSET_WIDTH)); } } @@ -1704,9 +1678,9 @@ public void testVectorLoadUnload() { setVector(vector1, STR1, STR2, STR3, STR4, STR5, STR6); - assertEquals(Integer.toString(5), Integer.toString(vector1.getLastSet())); + assertEquals(5, vector1.getLastSet()); vector1.setValueCount(15); - assertEquals(Integer.toString(14), Integer.toString(vector1.getLastSet())); + assertEquals(14, vector1.getLastSet()); /* Check the vector output */ assertArrayEquals(STR1, vector1.get(0)); @@ -1719,8 +1693,8 @@ public void testVectorLoadUnload() { Field field = vector1.getField(); String fieldName = field.getName(); - List fields = new ArrayList(); - List fieldVectors = new ArrayList(); + List fields = new ArrayList<>(); + List fieldVectors = new ArrayList<>(); fields.add(field); fieldVectors.add(vector1); @@ -1744,9 +1718,9 @@ public void testVectorLoadUnload() { * lastSet would have internally been set by VectorLoader.load() when it invokes * loadFieldBuffers. */ - assertEquals(Integer.toString(14), Integer.toString(vector2.getLastSet())); + assertEquals(14, vector2.getLastSet()); vector2.setValueCount(25); - assertEquals(Integer.toString(24), Integer.toString(vector2.getLastSet())); + assertEquals(24, vector2.getLastSet()); /* Check the vector output */ assertArrayEquals(STR1, vector2.get(0)); @@ -1773,7 +1747,7 @@ public void testFillEmptiesUsage() { setBytes(5, STR6, vector); /* Check current lastSet */ - assertEquals(Integer.toString(-1), Integer.toString(vector.getLastSet())); + assertEquals(-1, vector.getLastSet()); /* Check the vector output */ assertArrayEquals(STR1, vector.get(0)); @@ -1788,7 +1762,7 @@ public void testFillEmptiesUsage() { vector.fillEmpties(10); /* Check current lastSet */ - assertEquals(Integer.toString(9), Integer.toString(vector.getLastSet())); + assertEquals(9, vector.getLastSet()); /* Check the vector output */ assertArrayEquals(STR1, vector.get(0)); @@ -1797,10 +1771,10 @@ public void testFillEmptiesUsage() { assertArrayEquals(STR4, vector.get(3)); assertArrayEquals(STR5, vector.get(4)); assertArrayEquals(STR6, vector.get(5)); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(6))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(7))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(8))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(9))); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); setBytes(10, STR1, vector); setBytes(11, STR2, vector); @@ -1810,7 +1784,7 @@ public void testFillEmptiesUsage() { vector.setValueCount(15); /* Check current lastSet */ - assertEquals(Integer.toString(14), Integer.toString(vector.getLastSet())); + assertEquals(14, vector.getLastSet()); /* Check the vector output */ assertArrayEquals(STR1, vector.get(0)); @@ -1819,52 +1793,52 @@ public void testFillEmptiesUsage() { assertArrayEquals(STR4, vector.get(3)); assertArrayEquals(STR5, vector.get(4)); assertArrayEquals(STR6, vector.get(5)); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(6))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(7))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(8))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(9))); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); assertArrayEquals(STR1, vector.get(10)); assertArrayEquals(STR2, vector.get(11)); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(12))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(13))); - assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(14))); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); /* Check offsets */ - assertEquals(Integer.toString(0), - Integer.toString(vector.offsetBuffer.getInt(0 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(6), - Integer.toString(vector.offsetBuffer.getInt(1 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(16), - Integer.toString(vector.offsetBuffer.getInt(2 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(21), - Integer.toString(vector.offsetBuffer.getInt(3 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(30), - Integer.toString(vector.offsetBuffer.getInt(4 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(34), - Integer.toString(vector.offsetBuffer.getInt(5 * vector.OFFSET_WIDTH))); - - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(6 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(7 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(8 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(9 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(40), - Integer.toString(vector.offsetBuffer.getInt(10 * vector.OFFSET_WIDTH))); - - assertEquals(Integer.toString(46), - Integer.toString(vector.offsetBuffer.getInt(11 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(56), - Integer.toString(vector.offsetBuffer.getInt(12 * vector.OFFSET_WIDTH))); - - assertEquals(Integer.toString(56), - Integer.toString(vector.offsetBuffer.getInt(13 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(56), - Integer.toString(vector.offsetBuffer.getInt(14 * vector.OFFSET_WIDTH))); - assertEquals(Integer.toString(56), - Integer.toString(vector.offsetBuffer.getInt(15 * vector.OFFSET_WIDTH))); + assertEquals(0, + vector.offsetBuffer.getInt(0 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(6, + vector.offsetBuffer.getInt(1 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(16, + vector.offsetBuffer.getInt(2 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(21, + vector.offsetBuffer.getInt(3 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(30, + vector.offsetBuffer.getInt(4 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(34, + vector.offsetBuffer.getInt(5 * BaseVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(40, + vector.offsetBuffer.getInt(6 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getInt(7 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getInt(8 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getInt(9 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(40, + vector.offsetBuffer.getInt(10 * BaseVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(46, + vector.offsetBuffer.getInt(11 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getInt(12 * BaseVariableWidthVector.OFFSET_WIDTH)); + + assertEquals(56, + vector.offsetBuffer.getInt(13 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getInt(14 * BaseVariableWidthVector.OFFSET_WIDTH)); + assertEquals(56, + vector.offsetBuffer.getInt(15 * BaseVariableWidthVector.OFFSET_WIDTH)); } } @@ -1945,10 +1919,10 @@ public void testMultipleClose() { * in a way that lastSet is not set automatically. */ public static void setBytes(int index, byte[] bytes, VarCharVector vector) { - final int currentOffset = vector.offsetBuffer.getInt(index * vector.OFFSET_WIDTH); + final int currentOffset = vector.offsetBuffer.getInt(index * BaseVariableWidthVector.OFFSET_WIDTH); BitVectorHelper.setBit(vector.validityBuffer, index); - vector.offsetBuffer.setInt((index + 1) * vector.OFFSET_WIDTH, currentOffset + bytes.length); + vector.offsetBuffer.setInt((index + 1) * BaseVariableWidthVector.OFFSET_WIDTH, currentOffset + bytes.length); vector.valueBuffer.setBytes(currentOffset, bytes, 0, bytes.length); } @@ -1987,7 +1961,7 @@ public void testSetInitialCapacity() { @Test public void testDefaultAllocNewAll() { - int defaultCapacity = BaseFixedWidthVector.INITIAL_VALUE_ALLOCATION; + int defaultCapacity = BaseValueVector.INITIAL_VALUE_ALLOCATION; int expectedSize; long beforeSize; try (BufferAllocator childAllocator = allocator.newChildAllocator("defaultAllocs", 0, Long.MAX_VALUE); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index 43d710e29dc..e8a51f9d7f2 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -199,6 +199,31 @@ public void testVariableAllocateAfterReAlloc() throws Exception { } } + @Test + public void testLargeVariableAllocateAfterReAlloc() throws Exception { + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + /* + * Allocate the default size, and then, reAlloc. This should double the allocation. + */ + vector.allocateNewSafe(); // Initial allocation + vector.reAlloc(); // Double the allocation size. + int savedValueCapacity = vector.getValueCapacity(); + long savedValueBufferSize = vector.valueBuffer.capacity(); + + /* + * Clear and allocate again. + */ + vector.clear(); + vector.allocateNewSafe(); + + /* + * Verify that the buffer sizes haven't changed. + */ + Assert.assertEquals(vector.getValueCapacity(), savedValueCapacity); + Assert.assertEquals(vector.valueBuffer.capacity(), savedValueBufferSize); + } + } + @Test public void testVarCharAllocateNew() throws Exception { final int count = 6000; @@ -208,7 +233,20 @@ public void testVarCharAllocateNew() throws Exception { // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); - Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * VarCharVector.OFFSET_WIDTH); + Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseVariableWidthVector.OFFSET_WIDTH); + } + } + + @Test + public void testLargeVarCharAllocateNew() throws Exception { + final int count = 6000; + + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + vector.allocateNew(count); + + // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); + Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH); } } @@ -221,7 +259,20 @@ public void testVarCharAllocateNewUsingHelper() throws Exception { // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); - Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * VarCharVector.OFFSET_WIDTH); + Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseVariableWidthVector.OFFSET_WIDTH); + } + } + + @Test + public void testLargeVarCharAllocateNewUsingHelper() throws Exception { + final int count = 6000; + + try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { + AllocationHelper.allocateNew(vector, count); + + // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); + Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java index 5cfbf51bfe9..adb51960ecd 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java @@ -93,6 +93,18 @@ public void testVariableTypeReset() { } } + @Test + public void testLargeVariableTypeReset() { + try (final LargeVarCharVector vector = new LargeVarCharVector("LargeVarChar", allocator)) { + vector.allocateNewSafe(); + vector.set(0, "a".getBytes(StandardCharsets.UTF_8)); + vector.setLastSet(0); + vector.setValueCount(1); + resetVectorAndVerify(vector, vector.getBuffers(false)); + assertEquals(-1, vector.getLastSet()); + } + } + @Test public void testListTypeReset() { try (final ListVector variableList = diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index 051230bbc8f..9975682605f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -35,6 +35,7 @@ import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.IntervalDayVector; import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.LargeVarCharVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeMilliVector; @@ -534,6 +535,20 @@ public static void setVector(VarCharVector vector, byte[]... values) { vector.setValueCount(length); } + /** + * Populate values for LargeVarCharVector. + */ + public static void setVector(LargeVarCharVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + /** * Populate values for VarCharVector. */ @@ -548,6 +563,20 @@ public static void setVector(VarCharVector vector, String... values) { vector.setValueCount(length); } + /** + * Populate values for LargeVarCharVector. + */ + public static void setVector(LargeVarCharVector vector, String... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8)); + } + } + vector.setValueCount(length); + } + /** * Populate values for {@link ListVector}. */ From b5ff33a403363499f999ea9198c6feda0683f092 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Mon, 9 Mar 2020 21:35:06 +0800 Subject: [PATCH 2/6] [ARROW-6111][Java] Apply changes for the fix of ARROW-7837 --- .../arrow/vector/BaseLargeVariableWidthVector.java | 12 ++++++------ .../apache/arrow/vector/LargeVarBinaryVector.java | 11 +++++------ .../org/apache/arrow/vector/LargeVarCharVector.java | 11 +++++------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 21b868a15c2..26f99c63739 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -968,8 +968,8 @@ public void set(int index, byte[] value) { */ public void setSafe(int index, byte[] value) { assert index >= 0; - fillEmpties(index); handleSafe(index, value.length); + fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); setBytes(index, value, 0, value.length); lastSet = index; @@ -1004,8 +1004,8 @@ public void set(int index, byte[] value, int start, int length) { */ public void setSafe(int index, byte[] value, int start, int length) { assert index >= 0; - fillEmpties(index); handleSafe(index, length); + fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); setBytes(index, value, start, length); lastSet = index; @@ -1042,8 +1042,8 @@ public void set(int index, ByteBuffer value, int start, int length) { */ public void setSafe(int index, ByteBuffer value, int start, int length) { assert index >= 0; - fillEmpties(index); handleSafe(index, length); + fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); final long startOffset = getStartOffset(index); offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); @@ -1098,8 +1098,8 @@ public void set(int index, int isSet, long start, long end, ArrowBuf buffer) { public void setSafe(int index, int isSet, long start, long end, ArrowBuf buffer) { assert index >= 0; final long dataLength = end - start; - fillEmpties(index); handleSafe(index, (int) dataLength); + fillHoles(index); BitVectorHelper.setValidityBit(validityBuffer, index, isSet); final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); @@ -1139,8 +1139,8 @@ public void set(int index, long start, int length, ArrowBuf buffer) { */ public void setSafe(int index, long start, int length, ArrowBuf buffer) { assert index >= 0; - fillEmpties(index); handleSafe(index, length); + fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); final long startOffset = offsetBuffer.getLong((long) index * OFFSET_WIDTH); offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + length); @@ -1208,7 +1208,7 @@ protected final void handleSafe(int index, int dataLength) { while (index >= getValueCapacity()) { reallocValidityAndOffsetBuffers(); } - final long startOffset = getStartOffset(index); + final long startOffset = lastSet < 0 ? 0L : getStartOffset(lastSet + 1); while (valueBuffer.capacity() < (startOffset + dataLength)) { reallocDataBuffer(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java index fee4141de9a..e9d60b38e7a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java @@ -185,8 +185,8 @@ public void set(int index, LargeVarBinaryHolder holder) { public void setSafe(int index, LargeVarBinaryHolder holder) { assert index >= 0; final int dataLength = (int) (holder.end - holder.start); - fillEmpties(index); handleSafe(index, dataLength); + fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); final long startOffset = getStartOffset(index); offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); @@ -226,18 +226,17 @@ public void set(int index, NullableLargeVarBinaryHolder holder) { */ public void setSafe(int index, NullableLargeVarBinaryHolder holder) { assert index >= 0; - fillEmpties(index); - BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); - final long startOffset = getStartOffset(index); if (holder.isSet != 0) { final int dataLength = (int) (holder.end - holder.start); handleSafe(index, dataLength); + fillHoles(index); + final long startOffset = getStartOffset(index); offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); } else { - handleSafe(index, 0); - offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset); + fillEmpties(index + 1); } + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); lastSet = index; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java index fc37c76d516..fd205726011 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java @@ -189,8 +189,8 @@ public void set(int index, LargeVarCharHolder holder) { public void setSafe(int index, LargeVarCharHolder holder) { assert index >= 0; final int dataLength = (int) (holder.end - holder.start); - fillEmpties(index); handleSafe(index, dataLength); + fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); final long startOffset = getStartOffset(index); offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); @@ -230,18 +230,17 @@ public void set(int index, NullableLargeVarCharHolder holder) { */ public void setSafe(int index, NullableLargeVarCharHolder holder) { assert index >= 0; - fillEmpties(index); - BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); - final long startOffset = getStartOffset(index); if (holder.isSet != 0) { final int dataLength = (int) (holder.end - holder.start); handleSafe(index, dataLength); + fillHoles(index); + final long startOffset = getStartOffset(index); offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset + dataLength); valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); } else { - handleSafe(index, 0); - offsetBuffer.setLong((long) (index + 1) * OFFSET_WIDTH, startOffset); + fillHoles(index + 1); } + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); lastSet = index; } From 40c6a59568c8d28cd3ead0e9f8216ded382e0eaa Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Thu, 16 Apr 2020 14:55:54 +0800 Subject: [PATCH 3/6] [ARROW-6111][Java] Add comments for future todo work --- .../src/main/codegen/templates/HolderReaderImpl.java | 10 ++-------- .../arrow/vector/compare/RangeEqualsVisitor.java | 1 + .../org/apache/arrow/vector/util/VectorAppender.java | 1 + 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index fe3c66cdea9..fa7e83a9f8b 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -109,18 +109,12 @@ public void read(Nullable${name}Holder h) { byte[] value = new byte [length]; holder.buffer.getBytes(holder.start, value, 0, length); - <#if minor.class == "VarBinary"> + <#if minor.class == "VarBinary" || minor.class == "LargeVarBinary"> return value; - <#elseif minor.class == "VarChar"> + <#elseif minor.class == "VarChar" || minor.class == "LargeVarChar"> Text text = new Text(); text.set(value); return text; - <#elseif minor.class == "LargeVarChar"> - Text text = new Text(); - text.set(value); - return text; - <#elseif minor.class == "LargeVarBinary"> - return value; <#elseif minor.class == "IntervalDay"> return Duration.ofDays(holder.days).plusMillis(holder.milliseconds); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index fd615adc310..0861b43d577 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -149,6 +149,7 @@ public Boolean visit(BaseVariableWidthVector left, Range range) { @Override public Boolean visit(BaseLargeVariableWidthVector left, Range value) { + // TODO: support range comparison for BaseLargeVariableWidthVector. throw new UnsupportedOperationException(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index 968caa57d68..e0fd13ea55b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -134,6 +134,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { @Override public ValueVector visit(BaseLargeVariableWidthVector left, Void value) { + // TODO: support vector appending for BaseLargeVariableWidthVector throw new UnsupportedOperationException(); } From 6d9f67a74eaf8aee74f53845e5cf34a56ff5d64a Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Thu, 30 Apr 2020 21:11:20 +0800 Subject: [PATCH 4/6] [ARROW-6111][Java] Add integration for large varchar vector --- .../vector/BaseLargeVariableWidthVector.java | 7 ++-- .../validate/ValidateVectorVisitor.java | 6 +++ .../arrow/vector/ITTestLargeVector.java | 40 ++++++++++++++++++- .../vector/TestLargeVarBinaryVector.java | 3 +- .../arrow/vector/TestLargeVarCharVector.java | 5 +-- 5 files changed, 51 insertions(+), 10 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 26f99c63739..bf1dcad9da0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -24,6 +24,7 @@ import java.util.Collections; import java.util.List; +import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; @@ -38,8 +39,6 @@ import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.TransferPair; -import io.netty.buffer.ArrowBuf; - /** * BaseLargeVariableWidthVector is a base class providing functionality for large strings/large bytes types. */ @@ -166,7 +165,7 @@ public void setInitialCapacity(int valueCount) { */ @Override public void setInitialCapacity(int valueCount, double density) { - long size = Math.max((long)(valueCount * density), 1L); + long size = Math.max((long) (valueCount * density), 1L); checkDataBufferSize(size); computeAndCheckOffsetsBufferSize(valueCount); lastValueAllocationSizeInBytes = size; @@ -411,7 +410,7 @@ private void checkDataBufferSize(long size) { } } - /* + /** * Compute the buffer size required for 'valueCount' offsets and validity, and check if it's * within bounds. */ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java index a6d10c25d3b..756995ad76f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java @@ -21,6 +21,7 @@ import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullVector; @@ -90,6 +91,11 @@ public Void visit(BaseVariableWidthVector vector, Void value) { return null; } + @Override + public Void visit(BaseLargeVariableWidthVector left, Void value) { + return null; + } + @Override public Void visit(ListVector vector, Void value) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/ITTestLargeVector.java b/java/vector/src/test/java/org/apache/arrow/vector/ITTestLargeVector.java index 2736d25a3c9..28461a04336 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/ITTestLargeVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ITTestLargeVector.java @@ -19,6 +19,7 @@ import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; @@ -27,7 +28,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; - /** * Integration test for a vector with a large (more than 2GB) {@link org.apache.arrow.memory.ArrowBuf} as * the data buffer. @@ -181,4 +181,42 @@ public void testLargeFixedSizeBinaryVector() { logger.trace("Successfully released the large vector."); } + @Test + public void testLargeLargeVarCharVector() { + logger.trace("Testing large large var char vector."); + + final long bufSize = 4 * 1024 * 1024 * 1024L; + final int vecLength = (int) (bufSize / BaseLargeVariableWidthVector.OFFSET_WIDTH); + final String strElement = "9876543210"; + + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + LargeVarCharVector largeVec = new LargeVarCharVector("vec", allocator)) { + largeVec.allocateNew(vecLength); + + logger.trace("Successfully allocated a vector with capacity " + vecLength); + + for (int i = 0; i < vecLength; i++) { + largeVec.setSafe(i, strElement.getBytes()); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully written " + (i + 1) + " values"); + } + } + largeVec.setValueCount(vecLength); + assertTrue(largeVec.getOffsetBuffer().readableBytes() > Integer.MAX_VALUE); + assertTrue(largeVec.getDataBuffer().readableBytes() > Integer.MAX_VALUE); + logger.trace("Successfully written " + vecLength + " values"); + + for (int i = 0; i < vecLength; i++) { + byte[] val = largeVec.get(i); + assertEquals(strElement, new String(val)); + + if ((i + 1) % 10000 == 0) { + logger.trace("Successfully read " + (i + 1) + " values"); + } + } + logger.trace("Successfully read " + vecLength + " values"); + } + logger.trace("Successfully released the large vector."); + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java index 7c19dc1cf92..644827ce995 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.holders.NullableLargeVarBinaryHolder; @@ -27,8 +28,6 @@ import org.junit.Before; import org.junit.Test; -import io.netty.buffer.ArrowBuf; - public class TestLargeVarBinaryVector { private BufferAllocator allocator; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java index 530fac71f07..1b81c6b209f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java @@ -27,6 +27,7 @@ import java.util.ArrayList; import java.util.List; +import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.memory.RootAllocator; @@ -43,8 +44,6 @@ import org.junit.Test; import org.junit.jupiter.api.Assertions; -import io.netty.buffer.ArrowBuf; - public class TestLargeVarCharVector { private static final byte[] STR1 = "AAAAA1".getBytes(); @@ -451,7 +450,7 @@ public void testReallocAfterVectorTransfer() { TransferPair transferPair = vector.getTransferPair(allocator); transferPair.transfer(); - try (LargeVarCharVector toVector = (LargeVarCharVector)transferPair.getTo()) { + try (LargeVarCharVector toVector = (LargeVarCharVector) transferPair.getTo()) { valueCapacity = toVector.getValueCapacity(); for (int i = 0; i < valueCapacity; i++) { From 6e12363f2a40ce9167606ad63bf015b2da87b380 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Wed, 6 May 2020 10:45:09 +0800 Subject: [PATCH 5/6] [ARROW-6111][Java] Revise integration tests --- dev/archery/archery/integration/datagen.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index b86dad0e3c5..e8e1779e24f 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1477,7 +1477,6 @@ def _temp_path(): generate_primitive_large_offsets_case([17, 20]) .skip_category('Go') - .skip_category('Java') # TODO(ARROW-6110) .skip_category('JS'), generate_null_case([10, 0]) From b2bffe0f1d49cb172f8215750db514537972b93e Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Tue, 26 May 2020 11:09:04 +0800 Subject: [PATCH 6/6] [ARROW-6111][Java] Fix integration test problems --- .../arrow/vector/ipc/JsonFileReader.java | 66 ++++++++++++++++++- .../apache/arrow/vector/util/Validator.java | 3 +- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index 4809a26a324..c08c6e829c4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -17,9 +17,15 @@ package org.apache.arrow.vector.ipc; -import static com.fasterxml.jackson.core.JsonToken.*; +import static com.fasterxml.jackson.core.JsonToken.END_ARRAY; +import static com.fasterxml.jackson.core.JsonToken.END_OBJECT; +import static com.fasterxml.jackson.core.JsonToken.START_ARRAY; +import static com.fasterxml.jackson.core.JsonToken.START_OBJECT; import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.arrow.vector.BufferLayout.BufferType.*; +import static org.apache.arrow.vector.BufferLayout.BufferType.DATA; +import static org.apache.arrow.vector.BufferLayout.BufferType.OFFSET; +import static org.apache.arrow.vector.BufferLayout.BufferType.TYPE; +import static org.apache.arrow.vector.BufferLayout.BufferType.VALIDITY; import java.io.File; import java.io.IOException; @@ -477,6 +483,28 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException } }; + BufferReader LARGEVARCHAR = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + ArrayList values = new ArrayList<>(); + long bufferSize = 0L; + for (int i = 0; i < count; i++) { + parser.nextToken(); + final byte[] value = parser.getValueAsString().getBytes(UTF_8); + values.add(value); + bufferSize += value.length; + } + + ArrowBuf buf = allocator.buffer(bufferSize); + + for (byte[] value : values) { + buf.writeBytes(value); + } + + return buf; + } + }; + BufferReader VARBINARY = new BufferReader() { @Override protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { @@ -499,6 +527,28 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException return buf; } }; + + BufferReader LARGEVARBINARY = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + ArrayList values = new ArrayList<>(); + long bufferSize = 0L; + for (int i = 0; i < count; i++) { + parser.nextToken(); + final byte[] value = decodeHexSafe(parser.readValueAs(String.class)); + values.add(value); + bufferSize += value.length; + } + + ArrowBuf buf = allocator.buffer(bufferSize); + + for (byte[] value : values) { + buf.writeBytes(value); + } + + return buf; + } + }; } private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType, @@ -512,7 +562,11 @@ private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType if (bufferType.equals(VALIDITY)) { reader = helper.BIT; } else if (bufferType.equals(OFFSET)) { - reader = helper.INT4; + if (type == Types.MinorType.LARGEVARCHAR || type == Types.MinorType.LARGEVARBINARY) { + reader = helper.INT8; + } else { + reader = helper.INT4; + } } else if (bufferType.equals(TYPE)) { reader = helper.INT1; } else if (bufferType.equals(DATA)) { @@ -559,9 +613,15 @@ private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType case VARCHAR: reader = helper.VARCHAR; break; + case LARGEVARCHAR: + reader = helper.LARGEVARCHAR; + break; case VARBINARY: reader = helper.VARBINARY; break; + case LARGEVARBINARY: + reader = helper.LARGEVARBINARY; + break; case DATEDAY: reader = helper.INT4; break; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index 241f569c990..277f793d13f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -148,7 +148,8 @@ static boolean equals(ArrowType type, final Object o1, final Object o2) { default: throw new UnsupportedOperationException("unsupported precision: " + fpType); } - } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.FixedSizeBinary) { + } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.LargeBinary || + type instanceof ArrowType.FixedSizeBinary) { return Arrays.equals((byte[]) o1, (byte[]) o2); }