Skip to content

Commit

Permalink
ARROW-398: Java file format requires bitmaps of all 1's to be written…
Browse files Browse the repository at this point in the history
… when there are no nulls
  • Loading branch information
julienledem committed Dec 1, 2016
1 parent 3b946b8 commit 4e87d88
Show file tree
Hide file tree
Showing 11 changed files with 96 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public List<FieldVector> getChildrenFromFields() {

/**
 * Loads this vector's own buffers for one field node.
 *
 * Delegates to BaseDataValueVector.load with the vectors returned by
 * getFieldInnerVectors(), then sets bits.valueCount to fieldNode.getLength().
 *
 * NOTE(review): this is a diff rendering; the two consecutive load(...) calls
 * look like the removed call followed by its replacement, which additionally
 * passes fieldNode. Confirm against the real file.
 *
 * @param fieldNode  node describing this field; its length becomes the value count
 * @param ownBuffers buffers belonging to this vector, handed to the inner vectors
 */
@Override
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
org.apache.arrow.vector.BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
org.apache.arrow.vector.BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
// The value count of the bits vector comes from the field node, not from the buffers.
bits.valueCount = fieldNode.getLength();
}

Expand Down
2 changes: 1 addition & 1 deletion java/vector/src/main/codegen/templates/UnionVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ public List<FieldVector> getChildrenFromFields() {

/**
 * Loads this vector's own buffers for one field node.
 *
 * Delegates to BaseDataValueVector.load with the vectors returned by
 * getFieldInnerVectors(), then sets this.valueCount to fieldNode.getLength().
 *
 * NOTE(review): this is a diff rendering; the two consecutive load(...) calls
 * look like the removed call followed by its replacement, which additionally
 * passes fieldNode. Confirm against the real file.
 *
 * @param fieldNode  node describing this field; its length becomes the value count
 * @param ownBuffers buffers belonging to this vector, handed to the inner vectors
 */
@Override
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
// The value count comes from the field node, not from the buffers.
this.valueCount = fieldNode.getLength();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.List;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.schema.ArrowFieldNode;

import io.netty.buffer.ArrowBuf;

Expand All @@ -29,13 +30,13 @@ public abstract class BaseDataValueVector extends BaseValueVector implements Buf

protected final static byte[] emptyByteArray = new byte[]{}; // Nullable vectors use this

/**
 * Loads each vector from the buffer at the same index.
 *
 * The number of buffers must equal the number of vectors; vectors.get(i) is
 * loaded from buffers.get(i).
 *
 * NOTE(review): this is a diff rendering; the two signature lines and the two
 * load(...) calls in the loop look like the removed version followed by its
 * replacement, which additionally takes and forwards fieldNode. Confirm against
 * the real file.
 *
 * @param fieldNode node forwarded unchanged to every vector's load call
 * @param vectors   the buffer-backed vectors to load
 * @param buffers   one buffer per vector, in the same order
 * @throws IllegalArgumentException if buffers.size() differs from vectors.size()
 */
public static void load(List<BufferBacked> vectors, List<ArrowBuf> buffers) {
public static void load(ArrowFieldNode fieldNode, List<BufferBacked> vectors, List<ArrowBuf> buffers) {
int expectedSize = vectors.size();
// Reject a mismatched buffer count before touching any vector.
if (buffers.size() != expectedSize) {
throw new IllegalArgumentException("Illegal buffer count, expected " + expectedSize + ", got: " + buffers.size());
}
for (int i = 0; i < expectedSize; i++) {
vectors.get(i).load(buffers.get(i));
vectors.get(i).load(fieldNode, buffers.get(i));
}
}

Expand Down Expand Up @@ -106,7 +107,7 @@ public ArrowBuf getBuffer() {
}

/**
 * Replaces the buffer backing this vector with the given one.
 *
 * The currently held buffer is released, then the field is set to the result
 * of data.retain(allocator). The fieldNode argument is not read here.
 *
 * NOTE(review): this is a diff rendering; the two signature lines look like the
 * removed version followed by its replacement, which adds the fieldNode
 * parameter. Confirm against the real file.
 *
 * @param fieldNode node describing the field (unused in this implementation)
 * @param data      the buffer to take as this vector's new content
 */
@Override
public void load(ArrowBuf data) {
public void load(ArrowFieldNode fieldNode, ArrowBuf data) {
// Drop the reference to the old buffer before taking the new one.
this.data.release();
// presumably retain(allocator) yields a reference accounted to this vector's allocator - TODO confirm
this.data = data.retain(allocator);
}
Expand Down
29 changes: 29 additions & 0 deletions java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.holders.BitHolder;
import org.apache.arrow.vector.holders.NullableBitHolder;
import org.apache.arrow.vector.schema.ArrowFieldNode;
import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.util.OversizedAllocationException;
Expand All @@ -48,6 +49,34 @@ public BitVector(String name, BufferAllocator allocator) {
super(name, allocator);
}

/**
 * Loads the content of this bit vector from the given buffer.
 *
 * <p>When {@code data} has no readable bytes but the field node reports a non-zero
 * length, the content is considered omitted and is synthesized instead: every byte
 * is set to {@code 0xFF} if the node has no nulls, or to {@code 0x00} if every
 * value is null. In every other case the call is delegated to the superclass.
 *
 * @param fieldNode node describing the field; supplies the length and null count
 * @param data      the buffer to load from; may be empty in the two cases above
 * @throws IllegalArgumentException if the buffer is empty, the length is non-zero,
 *         and the null count is neither zero nor equal to the length
 */
@Override
public void load(ArrowFieldNode fieldNode, ArrowBuf data) {
  final int length = fieldNode.getLength();

  // When the vector is all nulls or all defined, the content of the buffer can be omitted
  if (data.readableBytes() != 0 || length == 0) {
    super.load(fieldNode, data);
    return;
  }

  // Decide the fill pattern (and reject invalid input) before allocating, so a bad
  // field node does not discard whatever this vector currently holds.
  final int nullCount = fieldNode.getNullCount();
  final int fill;
  if (nullCount == 0) {
    // all defined: create an all 1s buffer
    fill = 0xFF;
  } else if (nullCount == length) {
    // all null: create an all 0s buffer
    fill = 0x00;
  } else {
    throw new IllegalArgumentException("The buffer can be empty only if there's no data or it's all null or all defined");
  }

  // The incoming buffer is intentionally NOT released here: the superclass load only
  // retains the buffer it is given, so ownership stays with the caller. Releasing it
  // would drop a reference this vector never acquired.
  // NOTE(review): allocateNew is assumed to dispose of any previously held buffer - confirm.
  allocateNew(length);
  final int byteCount = getSizeFromCount(length);
  for (int i = 0; i < byteCount; ++i) {
    this.data.setByte(i, fill);
  }
  // setByte does not move the writer index, so mark the filled bytes as readable.
  this.data.writerIndex(byteCount);
}

@Override
public Field getField() {
throw new UnsupportedOperationException("internal vector");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@
*/
package org.apache.arrow.vector;

import org.apache.arrow.vector.schema.ArrowFieldNode;

import io.netty.buffer.ArrowBuf;

/**
* Content is backed by a buffer and can be loaded/unloaded
*/
public interface BufferBacked {

void load(ArrowBuf data);
void load(ArrowFieldNode fieldNode, ArrowBuf data);

ArrowBuf unLoad();

Expand Down
17 changes: 0 additions & 17 deletions java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,6 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
*/
FieldReader getReader();

/**
* Get the metadata for this field. Used in serialization
*
* @return FieldMetadata for this field.
*/
// SerializedField getMetadata();

/**
* Returns the number of bytes that is used by this vector instance.
*/
Expand Down Expand Up @@ -166,16 +159,6 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
*/
ArrowBuf[] getBuffers(boolean clear);

/**
* Load the data provided in the buffer. Typically used when deserializing from the wire.
*
* @param metadata
* Metadata used to decode the incoming buffer.
* @param buffer
* The buffer that contains the ValueVector.
*/
// void load(SerializedField metadata, DrillBuf buffer);

/**
* An abstraction that is used to read from this vector instance.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ private void loadBuffers(FieldVector vector, Field field, Iterator<ArrowBuf> buf
try {
vector.loadFieldBuffers(fieldNode, ownBuffers);
} catch (RuntimeException e) {
e.printStackTrace();
throw new IllegalArgumentException("Could not load buffers for field " +
field + " error message" + e.getMessage(), e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ public List<FieldVector> getChildrenFromFields() {

/**
 * Loads this vector's own buffers for one field node.
 *
 * Delegates to BaseDataValueVector.load with the vectors returned by
 * getFieldInnerVectors(); nothing else is updated here.
 *
 * NOTE(review): this is a diff rendering; the two consecutive load(...) calls
 * look like the removed call followed by its replacement, which additionally
 * passes fieldNode. Confirm against the real file.
 *
 * @param fieldNode  node describing this field, forwarded to the inner vectors
 * @param ownBuffers buffers belonging to this vector, handed to the inner vectors
 */
@Override
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public NullableMapVector(String name, BufferAllocator allocator, CallBack callBa

/**
 * Loads this vector's own buffers for one field node.
 *
 * Delegates to BaseDataValueVector.load with the vectors returned by
 * getFieldInnerVectors(), then sets this.valueCount to fieldNode.getLength().
 *
 * NOTE(review): this is a diff rendering; the two consecutive load(...) calls
 * look like the removed call followed by its replacement, which additionally
 * passes fieldNode. Confirm against the real file.
 *
 * @param fieldNode  node describing this field; its length becomes the value count
 * @param ownBuffers buffers belonging to this vector, handed to the inner vectors
 */
@Override
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers);
BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers);
// The value count comes from the field node, not from the buffers.
this.valueCount = fieldNode.getLength();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@
*/
package org.apache.arrow.vector;

import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.arrow.memory.BufferAllocator;
Expand All @@ -29,12 +35,17 @@
import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter;
import org.apache.arrow.vector.complex.writer.BigIntWriter;
import org.apache.arrow.vector.complex.writer.IntWriter;
import org.apache.arrow.vector.schema.ArrowFieldNode;
import org.apache.arrow.vector.schema.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Test;

import io.netty.buffer.ArrowBuf;

public class TestVectorUnloadLoad {

static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
Expand Down Expand Up @@ -88,6 +99,51 @@ public void testUnloadLoad() throws IOException {
}
}

/**
 * Checks that a record batch can be loaded when its validity buffers are empty.
 * Two nullable int fields share the same buffers:
 * - "intDefined" is described with a null count of 0, so every value must read back
 * - "intNull" is described with a null count equal to the length, so every read is null
 * @throws IOException
 */
@Test
public void testLoadEmptyValidityBuffer() throws IOException {
  Field definedField =
      new Field("intDefined", true, new ArrowType.Int(32, true), Collections.<Field>emptyList());
  Field nullField =
      new Field("intNull", true, new ArrowType.Int(32, true), Collections.<Field>emptyList());
  Schema schema = new Schema(asList(definedField, nullField));

  final int rowCount = 10;
  final int intWidth = 4; // bytes per 32-bit integer
  ArrowBuf emptyValidity = allocator.getEmpty();
  ArrowBuf intValues = allocator.buffer(rowCount * intWidth);
  for (int row = 0; row < rowCount; row++) {
    intValues.setInt(row * intWidth, row);
  }

  // First node: nothing is null. Second node: everything is null.
  List<ArrowFieldNode> nodes =
      asList(new ArrowFieldNode(rowCount, 0), new ArrowFieldNode(rowCount, rowCount));
  // Both fields reuse the same (validity, values) pair.
  List<ArrowBuf> buffers = asList(emptyValidity, intValues, emptyValidity, intValues);

  try (
      ArrowRecordBatch recordBatch = new ArrowRecordBatch(rowCount, nodes, buffers);
      BufferAllocator finalVectorsAllocator =
          allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE);
      VectorSchemaRoot newRoot = new VectorSchemaRoot(schema, finalVectorsAllocator)
  ) {
    // load it
    new VectorLoader(newRoot).load(recordBatch);

    FieldReader definedReader = newRoot.getVector("intDefined").getReader();
    FieldReader nullReader = newRoot.getVector("intNull").getReader();
    for (int row = 0; row < rowCount; row++) {
      String label = "#" + row;
      definedReader.setPosition(row);
      nullReader.setPosition(row);

      Integer definedValue = definedReader.readInteger();
      assertNotNull(label, definedValue);
      assertEquals(label, row, definedValue.intValue());

      Integer nullValue = nullReader.readInteger();
      assertNull(label, nullValue);
    }
  } finally {
    intValues.release();
  }
}

public static VectorUnloader newVectorUnloader(FieldVector root) {
Schema schema = new Schema(root.getField().getChildren());
int valueCount = root.getAccessor().getValueCount();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.file.ArrowBlock;
import org.apache.arrow.vector.file.ArrowFooter;
import org.apache.arrow.vector.file.ArrowReader;
import org.apache.arrow.vector.file.ArrowWriter;
import org.apache.arrow.vector.schema.ArrowFieldNode;
import org.apache.arrow.vector.schema.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.ArrowType;
Expand Down

0 comments on commit 4e87d88

Please sign in to comment.